Code example #1
0
def supported_vm_sizes(ws):
    """
    Get the VM sizes available for the workspace's region.

    :param ws: azureml Workspace instance
    :return: list of dicts, one per supported VM size
    """
    # list() instead of a pass-through comprehension (ruff PERF402).
    return list(AmlCompute.supported_vmsizes(workspace=ws))
Code example #2
0
    def __check_compute_target(self, compute_target, use_gpu: bool):
        """Validate that the compute target offers the requested processors.

        :param compute_target: an AmlCompute instance, or the name (str) of a
            compute target registered in the workspace.
        :param use_gpu: True when GPU compute was requested, False for CPU.
        :raises errors.TrainingComputeException: if the resolved VM size lacks
            the requested processor type.
        :raises IndexError: if the VM size is not in the workspace's supported
            size list (lookup behaviour preserved from the original).
        """
        vm_size = ''
        if isinstance(compute_target, AmlCompute):
            vm_size = compute_target.vm_size
        elif isinstance(compute_target, str):
            # Resolve the named target within the workspace to get its size.
            compute = ComputeTarget(workspace=self.__workspace,
                                    name=compute_target)
            vm_size = compute.vm_size

        # Cache the service call that lists supported sizes; it is reused on
        # every subsequent check.
        if self.__vm_size_list is None:
            self.__vm_size_list = AmlCompute.supported_vmsizes(
                self.__workspace)

        # Case-insensitive match (Azure reports mixed-case size names); the
        # normalisation of the wanted size is hoisted out of the scan.
        wanted = vm_size.upper()
        vm_description = [
            size for size in self.__vm_size_list
            if size['name'].upper() == wanted
        ][0]
        if use_gpu and vm_description['gpus'] == 0:
            raise errors.TrainingComputeException(
                f'gpu_compute was specified, but the target does not have GPUs: {vm_description} '
            )
        if not use_gpu and vm_description['vCPUs'] == 0:
            raise errors.TrainingComputeException(
                f'cpu_compute was specified, but the target does not have CPUs: {vm_description} '
            )
Code example #3
0
    def get_aml_ws_sizes(self, aml_ws_name):
        """Return the VM sizes supported by the named AML workspace.

        :param aml_ws_name: workspace name as known to self.config
        :return: list of dicts, one per supported VM size
        """
        ws = get_aml_ws(self.config, aml_ws_name)

        # TODO: make this an xt cmd: xt list sizes
        # Imported lazily so azureml is only required when this path runs;
        # the unused ComputeTarget import and dead debug code were removed.
        from azureml.core.compute import AmlCompute
        return AmlCompute.supported_vmsizes(workspace=ws)
Code example #4
0
def show_available_vm_sizes(ctx):
    """Print the VM sizes offered in the Azure region of the workspace."""
    workspace = Workspace.from_config()

    # Pretty-print the full size list returned by the service.
    sizes = AmlCompute.supported_vmsizes(workspace=workspace)
    pprint(sizes)

    print(
        "\n>>> For VM prices, see https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/ <<<\n"
    )
Code example #5
0
    def __init__(
        self,
        workspace,
        compute_target=None,
        environment_definition=None,
        experiment_name=None,
        initial_node_count=None,
        jupyter=None,
        jupyter_port=None,
        dashboard_port=None,
        scheduler_port=None,
        scheduler_idle_timeout=None,
        worker_death_timeout=None,
        additional_ports=None,
        admin_username=None,
        admin_ssh_key=None,
        datastores=None,
        code_store=None,
        vnet_resource_group=None,
        vnet=None,
        subnet=None,
        show_output=False,
        telemetry_opt_out=None,
        asynchronous=False,
        **kwargs,
    ):
        """Initialise the Dask-on-AzureML cluster wrapper.

        Stores the configuration, provisions a compute target when none is
        given, derives GPU availability from the workspace's supported VM
        sizes, selects a curated Dask environment when none is supplied,
        validates ``additional_ports``, then starts the event loop and (in
        synchronous mode) creates the cluster.

        :param workspace: azureml Workspace the cluster runs in (required).
        :param compute_target: existing AmlCompute target; created via
            ``__create_compute_target`` when None.
        :param environment_definition: azureml Environment; defaults to the
            curated "AzureML-Dask-GPU"/"AzureML-Dask-CPU" environment based
            on the target's GPU count.
        :param additional_ports: optional list of (int, int) tuples of ports
            to forward.
        :raises TypeError: if additional_ports is not a list of int pairs.
        """
        ### REQUIRED PARAMETERS
        self.workspace = workspace
        self.compute_target = compute_target

        ### ENVIRONMENT
        self.environment_definition = environment_definition

        ### EXPERIMENT DEFINITION
        self.experiment_name = experiment_name
        self.tags = {"tag": "azureml-dask"}

        ### ENVIRONMENT AND VARIABLES
        self.initial_node_count = initial_node_count

        ### SEND TELEMETRY
        self.telemetry_opt_out = telemetry_opt_out
        self.telemetry_set = False

        ### FUTURE EXTENSIONS
        self.kwargs = kwargs
        self.show_output = show_output

        ## CREATE COMPUTE TARGET
        self.admin_username = admin_username
        self.admin_ssh_key = admin_ssh_key
        self.vnet_resource_group = vnet_resource_group
        self.vnet = vnet
        self.subnet = subnet
        # NOTE(review): compute_target_set starts True and is flipped to
        # False only when we provision the target ourselves — the flag name
        # reads inverted; confirm intended semantics before relying on it.
        self.compute_target_set = True
        self.pub_key_file = ""
        self.pri_key_file = ""
        if self.compute_target is None:
            try:
                self.compute_target = self.__create_compute_target()
                self.compute_target_set = False
            except Exception as e:
                # NOTE(review): broad catch swallows provisioning failures
                # and returns a half-initialised object.
                logger.exception(e)
                return
        elif self.compute_target.admin_user_ssh_key is not None and (
            self.admin_ssh_key is None or self.admin_username is None
        ):
            logger.exception(
                "Please provide private key and admin username to access compute target {}".format(
                    self.compute_target.name
                )
            )
            return

        ### GPU RUN INFO
        # Map of lower-cased VM size name -> GPU count for this workspace's
        # region.
        self.workspace_vm_sizes = AmlCompute.supported_vmsizes(self.workspace)
        self.workspace_vm_sizes = [
            (e["name"].lower(), e["gpus"]) for e in self.workspace_vm_sizes
        ]
        self.workspace_vm_sizes = dict(self.workspace_vm_sizes)

        # Size of the target as reported by the service; the dict lookup
        # below raises KeyError if that size is absent from the mapping.
        self.compute_target_vm_size = self.compute_target.serialize()["properties"][
            "status"
        ]["vmSize"].lower()
        self.n_gpus_per_node = self.workspace_vm_sizes[self.compute_target_vm_size]
        self.use_gpu = True if self.n_gpus_per_node > 0 else False
        if self.environment_definition is None:
            if self.use_gpu:
                self.environment_definition = self.workspace.environments[
                    "AzureML-Dask-GPU"
                ]
            else:
                self.environment_definition = self.workspace.environments[
                    "AzureML-Dask-CPU"
                ]

        ### JUPYTER AND PORT FORWARDING
        self.jupyter = jupyter
        self.jupyter_port = jupyter_port
        self.dashboard_port = dashboard_port
        self.scheduler_port = scheduler_port
        self.scheduler_idle_timeout = scheduler_idle_timeout
        self.portforward_proc = None
        self.worker_death_timeout = worker_death_timeout
        self.end_logging = False  # FLAG FOR STOPPING THE port_forward_logger THREAD

        # Validate additional_ports: must be a list of 2-tuples of ints.
        if additional_ports is not None:
            if type(additional_ports) != list:
                error_message = (
                    f"The additional_ports parameter is of {type(additional_ports)}"
                    " type but needs to be a list of int tuples."
                    " Check the documentation."
                )
                logger.exception(error_message)
                raise TypeError(error_message)

            if len(additional_ports) > 0:
                if type(additional_ports[0]) != tuple:
                    error_message = (
                        f"The additional_ports elements are of {type(additional_ports[0])}"
                        " type but needs to be a list of int tuples."
                        " Check the documentation."
                    )
                    raise TypeError(error_message)

                ### check if all elements are tuples of length two and int type
                all_correct = True
                for el in additional_ports:
                    if type(el) != tuple or len(el) != 2:
                        all_correct = False
                        break

                    if (type(el[0]), type(el[1])) != (int, int):
                        all_correct = False
                        break

                if not all_correct:
                    error_message = (
                        "At least one of the elements of the additional_ports parameter"
                        " is wrong. Make sure it is a list of int tuples."
                        " Check the documentation."
                    )
                    raise TypeError(error_message)

        self.additional_ports = additional_ports
        self.scheduler_ip_port = (
            None  ### INIT FOR HOLDING THE ADDRESS FOR THE SCHEDULER
        )

        ### DATASTORES
        self.datastores = datastores

        ### RUNNING IN MATRIX OR LOCAL
        self.same_vnet = None
        self.is_in_ci = False

        ### GET RUNNING LOOP
        self._loop_runner = LoopRunner(loop=None, asynchronous=asynchronous)
        self.loop = self._loop_runner.loop

        self.abs_path = pathlib.Path(__file__).parent.absolute()

        ### INITIALIZE CLUSTER
        super().__init__(asynchronous=asynchronous)

        # Synchronous mode: start the loop, apply defaults, send telemetry
        # unless opted out, then create the cluster.
        if not self.asynchronous:
            self._loop_runner.start()
            self.sync(self.__get_defaults)

            if not self.telemetry_opt_out:
                self.__append_telemetry()

            self.sync(self.__create_cluster)
Code example #6
0
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

print("SDK version:", azureml.core.VERSION)

# Authenticate via the Azure CLI and load the workspace definition from the
# config.json located next to this script.
ws = Workspace.from_config(auth=AzureCliAuthentication(),
                           path=os.path.join(
                               os.path.dirname(os.path.realpath(__file__)),
                               'config.json'))

print(f'Workspace: {ws.name}')

experiment_name = 'train-bert-ner-on-amlcompute'
experiment = Experiment(workspace=ws, name=experiment_name)

# VM sizes available in the workspace's region; only used for the optional
# debug print below.
supported_vms = AmlCompute.supported_vmsizes(workspace=ws)
# print(supported_vms)

project_folder = './ner'

# Training environment: pinned Python version plus the pip dependencies the
# BERT NER training script needs.
bert_env = Environment("bert_aml_env")

conda_dep = CondaDependencies()
conda_dep.set_python_version('3.7.3')
conda_dep.add_pip_package("torch")
conda_dep.add_pip_package("adal")
conda_dep.add_pip_package("cloudpickle")
conda_dep.add_pip_package("docker")
conda_dep.add_pip_package("numpy")
conda_dep.add_pip_package("scipy")
conda_dep.add_pip_package("tokenizers")
Code example #7
0
    def __init__(
        self,
        workspace,
        compute_target,
        environment_definition,
        experiment_name=None,
        initial_node_count=None,
        jupyter=None,
        jupyter_port=None,
        dashboard_port=None,
        scheduler_port=None,
        scheduler_idle_timeout=None,
        worker_death_timeout=None,
        additional_ports=None,
        admin_username=None,
        admin_ssh_key=None,
        datastores=None,
        code_store=None,
        asynchronous=False,
        **kwargs,
    ):
        """Initialise the cluster wrapper for an existing compute target.

        Unlike the variant with optional parameters, workspace,
        compute_target and environment_definition are all required here.
        Derives GPU availability from the workspace's supported VM sizes,
        validates ``additional_ports``, then starts the event loop and (in
        synchronous mode) creates the cluster.

        :param workspace: azureml Workspace the cluster runs in.
        :param compute_target: existing AmlCompute target to use.
        :param environment_definition: azureml Environment for the run.
        :param additional_ports: optional list of (int, int) tuples of ports
            to forward.
        :raises TypeError: if additional_ports is not a list of int pairs.
        """
        ### REQUIRED PARAMETERS
        self.workspace = workspace
        self.compute_target = compute_target
        self.environment_definition = environment_definition

        ### EXPERIMENT DEFINITION
        self.experiment_name = experiment_name

        ### ENVIRONMENT AND VARIABLES
        self.initial_node_count = initial_node_count

        ### GPU RUN INFO
        # Map of lower-cased VM size name -> GPU count for this workspace's
        # region.
        self.workspace_vm_sizes = AmlCompute.supported_vmsizes(self.workspace)
        self.workspace_vm_sizes = [
            (e["name"].lower(), e["gpus"]) for e in self.workspace_vm_sizes
        ]
        self.workspace_vm_sizes = dict(self.workspace_vm_sizes)

        # Size of the target as reported by the service; the dict lookup
        # below raises KeyError if that size is absent from the mapping.
        self.compute_target_vm_size = self.compute_target.serialize()["properties"][
            "status"
        ]["vmSize"].lower()
        self.n_gpus_per_node = self.workspace_vm_sizes[self.compute_target_vm_size]
        self.use_gpu = True if self.n_gpus_per_node > 0 else False

        ### JUPYTER AND PORT FORWARDING
        self.jupyter = jupyter
        self.jupyter_port = jupyter_port
        self.dashboard_port = dashboard_port
        self.scheduler_port = scheduler_port
        self.scheduler_idle_timeout = scheduler_idle_timeout
        self.worker_death_timeout = worker_death_timeout

        # Validate additional_ports: must be a list of 2-tuples of ints.
        if additional_ports is not None:
            if type(additional_ports) != list:
                error_message = (
                    f"The additional_ports parameter is of {type(additional_ports)}"
                    " type but needs to be a list of int tuples."
                    " Check the documentation."
                )
                logger.exception(error_message)
                raise TypeError(error_message)

            if len(additional_ports) > 0:
                if type(additional_ports[0]) != tuple:
                    error_message = (
                        f"The additional_ports elements are of {type(additional_ports[0])}"
                        " type but needs to be a list of int tuples."
                        " Check the documentation."
                    )
                    raise TypeError(error_message)

                ### check if all elements are tuples of length two and int type
                all_correct = True
                for el in additional_ports:
                    if type(el) != tuple or len(el) != 2:
                        all_correct = False
                        break

                    if (type(el[0]), type(el[1])) != (int, int):
                        all_correct = False
                        break

                if not all_correct:
                    # NOTE(review): the f-prefix below is unnecessary — the
                    # string has no placeholders.
                    error_message = (
                        f"At least one of the elements of the additional_ports parameter"
                        " is wrong. Make sure it is a list of int tuples."
                        " Check the documentation."
                    )
                    raise TypeError(error_message)

        self.additional_ports = additional_ports

        self.admin_username = admin_username
        self.admin_ssh_key = admin_ssh_key
        self.scheduler_ip_port = (
            None  ### INIT FOR HOLDING THE ADDRESS FOR THE SCHEDULER
        )

        ### DATASTORES
        self.datastores = datastores

        ### FUTURE EXTENSIONS
        self.kwargs = kwargs

        ### RUNNING IN MATRIX OR LOCAL
        self.same_vnet = None

        ### GET RUNNING LOOP
        self._loop_runner = LoopRunner(loop=None, asynchronous=asynchronous)
        self.loop = self._loop_runner.loop

        self.abs_path = pathlib.Path(__file__).parent.absolute()

        ### INITIALIZE CLUSTER
        super().__init__(asynchronous=asynchronous)

        # Synchronous mode: start the loop, apply defaults, then create the
        # cluster.
        if not self.asynchronous:
            self._loop_runner.start()
            self.sync(self.__get_defaults)
            self.sync(self.__create_cluster)
Code example #8
0
    def __init__(
        self,
        workspace,
        compute_target,
        environment_definition,
        experiment_name=None,
        initial_node_count=None,
        jupyter=None,
        jupyter_port=None,
        additional_ports=None,
        admin_username=None,
        admin_ssh_key=None,
        telemetry_opt_out=None,
        **kwargs,
    ):
        """Initialise the azureml-ngc-tools cluster and create it eagerly.

        Stores configuration, derives GPU availability from the workspace's
        supported VM sizes, validates ``additional_ports``, records script
        parameters, sends telemetry unless opted out, and immediately calls
        ``__create_cluster`` (no event loop in this variant).

        :param workspace: azureml Workspace the cluster runs in.
        :param compute_target: existing AmlCompute target to use.
        :param environment_definition: azureml Environment for the run.
        :param additional_ports: optional list of (int, int) tuples of ports
            to forward; normalised to [] when None.
        :raises TypeError: if additional_ports is not a list of int pairs.
        """
        ### REQUIRED PARAMETERS
        self.workspace = workspace
        self.compute_target = compute_target
        self.environment_definition = environment_definition

        ### EXPERIMENT DEFINITION
        self.experiment_name = experiment_name
        self.tags = {"tag": "azureml-ngc-tools"}

        ### ENVIRONMENT AND VARIABLES
        self.initial_node_count = initial_node_count

        ### SEND TELEMETRY
        self.telemetry_opt_out = telemetry_opt_out
        self.telemetry_set = False

        ### GPU RUN INFO
        # Map of lower-cased VM size name -> GPU count for this workspace's
        # region.
        self.workspace_vm_sizes = AmlCompute.supported_vmsizes(self.workspace)
        self.workspace_vm_sizes = [(e["name"].lower(), e["gpus"])
                                   for e in self.workspace_vm_sizes]
        self.workspace_vm_sizes = dict(self.workspace_vm_sizes)

        # Size of the target as reported by the service; the dict lookup
        # below raises KeyError if that size is absent from the mapping.
        self.compute_target_vm_size = self.compute_target.serialize(
        )["properties"]["status"]["vmSize"].lower()
        self.n_gpus_per_node = self.workspace_vm_sizes[
            self.compute_target_vm_size]
        self.use_gpu = True if self.n_gpus_per_node > 0 else False

        ### JUPYTER AND PORT FORWARDING
        self.jupyter = jupyter
        self.jupyter_port = jupyter_port
        self.portforward_proc = None
        self.end_logging = False  # FLAG FOR STOPPING THE port_forward_logger THREAD

        # Validate additional_ports: must be a list of 2-tuples of ints.
        if additional_ports is not None:
            if type(additional_ports) != list:
                error_message = (
                    f"The additional_ports parameter is of {type(additional_ports)}"
                    " type but needs to be a list of int tuples."
                    " Check the documentation.")
                logger.exception(error_message)
                raise TypeError(error_message)

            if len(additional_ports) > 0:
                if type(additional_ports[0]) != tuple:
                    error_message = (
                        f"The additional_ports elements are of {type(additional_ports[0])}"
                        " type but needs to be a list of int tuples."
                        " Check the documentation.")
                    raise TypeError(error_message)

                ### check if all elements are tuples of length two and int type
                all_correct = True
                for el in additional_ports:
                    if type(el) != tuple or len(el) != 2:
                        all_correct = False
                        break

                    if (type(el[0]), type(el[1])) != (int, int):
                        all_correct = False
                        break

                if not all_correct:
                    # NOTE(review): the f-prefix below is unnecessary — the
                    # string has no placeholders.
                    error_message = (
                        f"At least one of the elements of the additional_ports parameter"
                        " is wrong. Make sure it is a list of int tuples."
                        " Check the documentation.")
                    raise TypeError(error_message)

        self.additional_ports = [] if additional_ports is None else additional_ports

        self.admin_username = admin_username
        self.admin_ssh_key = admin_ssh_key

        ### FUTURE EXTENSIONS
        self.kwargs = kwargs

        ### ABSOLUTE PATH
        self.abs_path = pathlib.Path(__file__).parent.absolute()

        # ### close the cluster handler
        # signal.signal(signal.SIGINT, self.__signal_handler)

        ### define script parameters
        self.script_params = {}
        self.script_params["--use_gpu"] = self.use_gpu
        self.script_params["--n_gpus_per_node"] = self.n_gpus_per_node

        ### headnode info
        self.headnode_info = {}

        if not self.telemetry_opt_out:
            self.__append_telemetry()

        # Cluster creation happens eagerly inside the constructor.
        self.__create_cluster()
        self.__print_message("Cluster created...")
Code example #9
0
from azureml.core import ScriptRunConfig
from azureml.core.conda_dependencies import CondaDependencies

# Placeholder credentials — replace with your real subscription details.
subscription_id = "0000-00000000-00000000-0000"
resource_group = "AdvanceAnalytics.ML"
workspace_name = "aa-ml-aml-workspace"
workspace_region = 'eastus'
computetarget_vm = 'Standard_NC6'

# Create (or reuse, via exist_ok=True) the workspace.
ws = Workspace.create(name=workspace_name,
                      subscription_id=subscription_id,
                      resource_group=resource_group,
                      location=workspace_region,
                      exist_ok=True)

# NOTE(review): the result of this call is discarded — presumably kept for
# interactive inspection; remove or print if not needed.
AmlCompute.supported_vmsizes(ws)

# Create a new runconfig object
run_config = RunConfiguration()
run_config.target = "amlcompute"
run_config.amlcompute.vm_size = computetarget_vm
run_config.framework = 'python'
# GPU-enabled TensorFlow base image plus conda dependencies for training.
run_config.environment.docker.base_image = 'tensorflow/tensorflow:1.6.0-gpu'
run_config.environment.docker.gpu_support = True
run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=[
        'opencv==3.4.1', 'urllib3', 'tqdm', 'scikit-learn', 'pandas',
        'tensorflow-gpu', 'keras-gpu'
    ])

# Raw string because of the Windows-style backslash path separator.
working_dir = r'.\keras'
Code example #10
0
File: rutils.py  Project: tdard/computer-vision
def get_available_vm_sizes_from_config(location="francecentral"):
    """Return the VM sizes available to the workspace loaded from config.

    :param location: Azure region to query; defaults to "francecentral",
        the previously hard-coded region, for backward compatibility.
    :return: list of dicts, one per supported VM size
    """
    ws = Workspace.from_config()
    return AmlCompute.supported_vmsizes(workspace=ws, location=location)
Code example #11
0
def start(login, app):
    """Provision an AzureML GPU compute target and launch a cluster from it.

    Loads the login and application configs, resolves (or creates) the
    workspace and compute target, verifies the VM size is a supported
    NC-series v2/v3 GPU size, uploads any additional content, then starts
    either a single-node AzureMLComputeCluster or a multi-node Dask
    AzureMLCluster and blocks until Ctrl+C.

    :param login: config identifier understood by ngccontent.get_config
        (workspace, compute and user settings).
    :param app: config identifier for the application (environment and
        additional content).
    :raises Exception: when the workspace does not exist or the configured
        VM size is not a supported GPU size.
    """
    login_config = ngccontent.get_config(login)
    app_config = ngccontent.get_config(app)

    ### WORKSPACE
    subscription_id = login_config["azureml_user"]["subscription_id"]
    resource_group = login_config["azureml_user"]["resource_group"]
    workspace_name = login_config["azureml_user"]["workspace_name"]

    try:
        ws = Workspace(
            workspace_name=workspace_name,
            subscription_id=subscription_id,
            resource_group=resource_group,
        )
    except ProjectSystemException:
        # Missing workspace: build a detailed message, log it and abort.
        msg = f'\n\nThe workspace "{workspace_name}" does not exist. '
        msg += f"Go to \n\n  "
        msg += f"-->> https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace <<--\n\n"
        msg += f"and create the workspace first.\n\n\n"
        msg += f"Your current configuration: \n\n"
        msg += f"Workspace name: {workspace_name} \n"
        msg += f"Subscription id: {subscription_id} \n"
        msg += f"Resource group: {resource_group}\n\n"

        logger.exception(msg)
        raise Exception(msg)

    verify = f"""
    Subscription ID: {subscription_id}
    Resource Group: {resource_group}
    Workspace: {workspace_name}"""
    logger.info(verify)

    ### experiment name
    exp_name = login_config["aml_compute"]["exp_name"]

    ### azure ml names
    ct_name = login_config["aml_compute"]["ct_name"]
    vm_name = login_config["aml_compute"]["vm_name"].lower()
    vm_priority = login_config["aml_compute"]["vm_priority"]

    ### trust but verify
    verify = f"""
    Experiment name: {exp_name}"""
    logger.info(verify)

    ### GPU RUN INFO
    workspace_vm_sizes = AmlCompute.supported_vmsizes(ws)
    # NOTE(review): "pascal_volta_pattern" is an unused alias of "pattern".
    # Also, the character class [2,3] literally contains a comma, so the
    # regex also matches "..._v," — presumably [23] was intended; confirm.
    pascal_volta_pattern = pattern = re.compile(
        r"[a-z]+_nc[0-9]+[s]?_v[2,3]"
    )  ### matches NC-series v2 and v3
    # Keep only NC v2/v3 sizes as (name, gpu-count) pairs, then index by name.
    workspace_vm_sizes = [
        (e["name"].lower(), e["gpus"])
        for e in workspace_vm_sizes
        if pattern.match(e["name"].lower())
    ]
    workspace_vm_sizes = dict(workspace_vm_sizes)

    ### GET COMPUTE TARGET
    if vm_name in workspace_vm_sizes:
        gpus_per_node = workspace_vm_sizes[vm_name]

        verify = f"""
    Compute target: {ct_name}
    VM Size: {vm_name}
    No of GPUs: {gpus_per_node}
    Priority: {vm_priority}
        """
        logger.info(verify)

        ### get SSH keys
        ssh_key_pub, pri_key_file = get_ssh_keys()

        if ct_name not in ws.compute_targets:
            logger.warning(f"Compute target {ct_name} does not exist...")
            ct = createOrGetComputeTarget(
                ws, ct_name, vm_name, vm_priority, ssh_key_pub, login_config
            )
        else:
            ct = ws.compute_targets[ct_name]

            # A previously failed target is deleted and re-provisioned once.
            if ct.provisioning_state == "Failed":
                logger.warning(
                    f"Compute target {ct_name} found but provisioning_state is showing as 'failed'..."
                )
                logger.warning(f"Deleting {ct_name} target and will attempt again...")
                logger.warning(
                    f"If this fails again check that you have enough resources in your subscription..."
                )

                ct.delete()
                time.sleep(5)
                ct = createOrGetComputeTarget(
                    ws, ct_name, vm_name, vm_priority, ssh_key_pub, login_config
                )
            else:
                logger.info(f"    Using pre-existing compute target {ct_name}")
    else:
        logger.exception("Unsupported vm_size {vm_size}".format(vm_size=vm_name))
        logger.exception("The specified vm size must be one of ...")

        for azure_gpu_vm_size in workspace_vm_sizes.keys():
            logger.exception("... " + azure_gpu_vm_size)
        raise Exception(
            "{vm_size} does not have Pascal or above GPU Family".format(vm_size=vm_name)
        )

    env = createOrGetEnvironment(ws, login_config, app_config)

    ### UPLOAD ADDITIONAL CONTENT IF NOT EXISTS
    for additional_content in app_config["additional_content"]["list"]:
        url = additional_content["url"]
        targetfile = additional_content["filename"]
        src_path = additional_content["localdirectory"]
        dest_path = additional_content["computedirectory"]

        if (
            "source" in additional_content.keys()
            and additional_content["source"] == "github"
        ):
            ngccontent.clone_github_repo(url,"additional_content",src_path)
        else:
            if app_config["additional_content"]["download_content"]:
                ngccontent.download(url, "additional_content", targetfile)
            if (
                app_config["additional_content"]["unzip_content"]
                and additional_content["zipped"]
            ):
                ngccontent.unzipFile(targetfile, "additional_content", src_path)

        if app_config["additional_content"]["upload_content"]:
            ngccontent.upload_data(
                ws,
                ws.get_default_datastore(),
                "additional_content/" + src_path,
                dest_path,
            )

    # Single node -> lightweight compute cluster; multiple nodes -> Dask.
    if (login_config["aml_compute"]["max_nodes"]==1):
        amlcluster = AzureMLComputeCluster(
            workspace=ws,
            compute_target=ct,
            initial_node_count=1,
            experiment_name=login_config["aml_compute"]["exp_name"],
            environment_definition=env,
            jupyter_port=login_config["aml_compute"]["jupyter_port"],
            telemetry_opt_out=login_config["azureml_user"]["telemetry_opt_out"],
            admin_username=login_config["aml_compute"]["admin_name"],
            admin_ssh_key=pri_key_file,
        )
    else:
        logger.info("Creating a Dask Cluster with {} nodes".format(login_config["aml_compute"]["max_nodes"]))
        amlcluster = AzureMLCluster(
            workspace=ws,
            compute_target=ct,
            initial_node_count=login_config["aml_compute"]["max_nodes"],
            experiment_name=login_config["aml_compute"]["exp_name"],
            environment_definition=env,
            jupyter_port=login_config["aml_compute"]["jupyter_port"],
            telemetry_opt_out=login_config["azureml_user"]["telemetry_opt_out"],
            admin_username=login_config["aml_compute"]["admin_name"],
            admin_ssh_key=pri_key_file,
        )

    logger.info(f"\n    Go to: {amlcluster.jupyter_link}")
    logger.info("    Press Ctrl+C to stop the cluster.")

    # Block until the user interrupts, then shut the cluster down.
    # NOTE(review): this busy-wait spins a CPU core; time.sleep in the loop
    # would be gentler.
    try:
        while True:
            pass
    except KeyboardInterrupt:
        amlcluster.close()