Example #1

from azureml.core import Dataset, Workspace
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.exceptions import ComputeTargetException, WorkspaceException

# Assumes module-level settings (workspace_prefix, location, subscription_id,
# resource_group, compute_name, vm_size, min_nodes, max_nodes) are defined elsewhere.

def setup(num):
    workspace_name = '%s-%s-%02d' % (workspace_prefix, location, num)

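    # Reuse the workspace if it already exists; otherwise create a new one in the configured location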
    try:
        ws = Workspace.get(
            name=workspace_name,
            subscription_id=subscription_id,
            resource_group=resource_group)
        print('Found existing workspace %s' % workspace_name)
    except WorkspaceException:
        print('Creating new workspace %s...' % workspace_name)

        ws = Workspace.create(
            name=workspace_name,
            subscription_id=subscription_id,
            resource_group=resource_group,
            location=location)

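    # Reuse and resize the compute target if it exists; otherwise provision a new one and wait for it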
    try:
        compute_target = AmlCompute(ws, compute_name)
        print('Found existing compute %s' % compute_name)

        compute_target.update(min_nodes=min_nodes, max_nodes=max_nodes)
    except ComputeTargetException:
        print('Creating new compute target %s...' % compute_name)

        compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size, min_nodes=min_nodes, max_nodes=max_nodes)
        compute_target = ComputeTarget.create(ws, compute_name, compute_config)
        compute_target.wait_for_completion(show_output=True, timeout_in_minutes=20)

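    # Upload the sample data to the workspace's default datastore and register it as a file dataset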
    ds = ws.get_default_datastore()
    ds.upload("testdata")

    dataset_name = 'sample_dataset'

    if dataset_name not in ws.datasets:
        data = Dataset.File.from_files(path=[(ds, 'testdata.txt')])

        data.register(
            workspace=ws,
            name=dataset_name,
            description='Sample data for load test')

        print('Dataset successfully registered')
    else:
        print('Dataset already exists')
Example #2

import logging
import os
from datetime import datetime
from tempfile import NamedTemporaryFile
from time import sleep

from azureml.core.compute import AmlCompute
from azureml.exceptions import ComputeTargetException
from gevent import joinall
from pssh.clients import ParallelSSHClient, SSHClient
from pssh.config import HostConfig
from pssh.utils import enable_host_logger
from termcolor import colored, cprint


class ClusterConnector:
    def __init__(
        self,
        workspace,
        cluster_name,
        ssh_key,
        vm_type,
        admin_username="******",
    ):
        """Thin wrapper class around azureml.core.compute.AmlCluster

        Provides parallel SSH clients and helpers for running commands and copying
        files on the master node or on all nodes.

        Usage:
        >>> cc = ClusterConnector(workspace, "MyCluster", sshkey, "Standard_ND40rs_v2")
        >>> cc.initialise(min_nodes=0, max_nodes=4, idle_timeout_secs=30)
        >>> cluster = cc.cluster
        >>> [print(node['name']) for node in cc.cluster.list_nodes()]
        """

        self.cluster_name = cluster_name
        self.workspace = workspace
        self.ssh_key = ssh_key
        self.vm_type = vm_type
        self.admin_username = admin_username

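        # Send pssh per-host output to a log file under clusterlogs/ instead of the console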
        enable_host_logger()
        hlog = logging.getLogger("pssh.host_logger")
        tstr = datetime.now().isoformat(timespec="minutes")
        for handler in list(hlog.handlers):
            if isinstance(handler, logging.StreamHandler):
                hlog.removeHandler(handler)
        os.makedirs("clusterlogs", exist_ok=True)
        self.logfile = "clusterlogs/{}_{}.log".format(self.workspace.name,
                                                      tstr)
        hlog.addHandler(logging.FileHandler(self.logfile))

        self.cluster = None
        self._master_scp = None
        self._master_ssh = None
        self._all_ssh = None

    def initialise(self, min_nodes=0, max_nodes=0, idle_timeout_secs=1800):
        """Initialise underlying AmlCompute cluster instance"""
        self._create_or_update_cluster(min_nodes, max_nodes, idle_timeout_secs)

    def _check_logs_emessage(self, host, port):
        msg = "Remote command failed on {}:{}. For details see {}".format(
            host, port, self.logfile)
        return msg

    def terminate(self):

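        # Scale the cluster down to zero nodes and confirm that none are left running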
        print('Attempting to terminate cluster "{}"'.format(
            colored(self.cluster_name, "green")))
        try:
            self.cluster.update(min_nodes=0,
                                max_nodes=0,
                                idle_seconds_before_scaledown=10)
            self.cluster.wait_for_completion()
        except ComputeTargetException as err:
            raise RuntimeError(
                "Failed to terminate cluster nodes ({})".format(err))

        if len(self.cluster.list_nodes()):
            raise RuntimeError(
                "Failed to terminate cluster nodes (nodes still running)")

    @property
    def cluster_nodes(self):
        self.cluster.refresh_state()
        return sorted(self.cluster.list_nodes(), key=lambda n: n["port"])

    def _create_or_update_cluster(self, min_nodes, max_nodes,
                                  idle_timeout_secs):

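        # Attach to an existing cluster and resize it; fall back to provisioning a new one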
        try:
            self.cluster = AmlCompute(workspace=self.workspace,
                                      name=self.cluster_name)
            print('Updating existing cluster "{}"'.format(
                colored(self.cluster_name, "green")))
            self.cluster.update(
                min_nodes=min_nodes,
                max_nodes=max_nodes,
                idle_seconds_before_scaledown=idle_timeout_secs,
            )
        except ComputeTargetException:
            print('Creating new cluster "{}"'.format(
                colored(self.cluster_name, "green")))
            cluster_config = AmlCompute.provisioning_configuration(
                vm_size=self.vm_type,
                min_nodes=min_nodes,
                max_nodes=max_nodes,
                idle_seconds_before_scaledown=idle_timeout_secs,
                admin_username=self.admin_username,
                admin_user_ssh_key=self.ssh_key,
                remote_login_port_public_access="Enabled",
            )
            self.cluster = AmlCompute.create(self.workspace, self.cluster_name,
                                             cluster_config)

        self.cluster.wait_for_completion()

        if len(self.cluster_nodes) < min_nodes:
            sleep(30)
            if len(self.cluster_nodes) < min_nodes:
                raise RuntimeError("Failed to provision sufficient nodes")

    def _copy_nodefile_to_nodes(self):

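        # Collect each node's InfiniBand (ib0) address and distribute a nodefile, with the master listed first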
        if len(self.cluster_nodes) == 1:
            cprint("Single node cluster -- skipping IB config", "yellow")
            return

        print("Collecting cluster IB info")

        outputs = self._all_ssh.run_command(
            r'ifconfig ib0 | grep -oe "inet[^6][adr: ]*[0-9.]*" | cut -d" " -f2',
            shell="bash -c",
        )
        self._all_ssh.join(outputs)

        ibaddrs = []
        for output in outputs:
            host = output.host
            port = output.client.port
            if output.exit_code != 0:
                print(list(output.stdout))
                print(list(output.stderr))
                raise RuntimeError("Failed to get IB ip for {}:{}".format(
                    host, port))
            try:
                ibaddr = list(output.stdout)[0].split()[0]
            except IndexError:
                raise RuntimeError("Failed to get IB ip for {}:{} - "
                                   "No ib interface found!".format(host, port))
            print("Mapping {}:{} -> {}".format(host, port, ibaddr))
            if port == self._master_scp.port:
                cprint("IB Master: {}".format(ibaddr), "green")
                ibaddrs = [ibaddr] + ibaddrs
            else:
                ibaddrs.append(ibaddr)

        with NamedTemporaryFile(delete=False, mode="wt") as nfh:
            self.nodefile = nfh.name
            for addr in ibaddrs:
                nfh.write("{}\n".format(addr))

        self.ibaddrs = ibaddrs
        self.copy_to_all_nodes(self.nodefile, "./nodefile")

    def _create_cluster_ssh_conns(self):

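        # One parallel SSH client for all nodes, one for the master only, plus an SCP client for file transfers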
        hostips = [n["publicIpAddress"] for n in self.cluster_nodes]
        hostconfigs = [HostConfig(port=n["port"]) for n in self.cluster_nodes]

        self._all_ssh = ParallelSSHClient(hostips,
                                          host_config=hostconfigs,
                                          user=self.admin_username)

        self._master_ssh = ParallelSSHClient(hostips[:1],
                                             host_config=hostconfigs[:1],
                                             user=self.admin_username)

        self._master_scp = SSHClient(hostips[0],
                                     port=hostconfigs[0].port,
                                     user=self.admin_username)

    def copy_to_all_nodes(self, source, dest):

        copy_jobs = self._all_ssh.copy_file(source, dest)
        joinall(copy_jobs, raise_error=True)

    def copy_to_master_node(self, source, dest):

        self._master_scp.copy_file(source, dest)

    def copy_from_master_node(self, source, dest):

        self._master_scp.copy_remote_file(source, dest)

    def run_on_all_nodes(self, command):

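        # Run the command on every node and raise if any node exits with a non-zero status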
        outputs = self._all_ssh.run_command(command, shell="bash -c")
        self._all_ssh.join(outputs, consume_output=True)

        for output in outputs:
            if int(output.exit_code) != 0:
                host = output.host
                port = output.client.port
                raise RuntimeError(self._check_logs_emessage(host, port))

    def run_on_master_node(self, command):

        outputs = self._master_ssh.run_command(command, shell="bash -c")
        self._master_ssh.join(outputs)

        for output in outputs:
            if int(output.exit_code) != 0:
                host = output.host
                port = output.client.port
                raise RuntimeError(self._check_logs_emessage(host, port))

    def attempt_termination(self):
        try:
            self.terminate()
        except RuntimeError as err:
            print(colored("ERROR: {}\n\n", "red", attrs=["bold"]).format(err))
            self.warn_unterminated()

    def warn_unterminated(self):
        print(
            colored("WARNING: {}", "red", attrs=["bold"]).format(
                colored(
                    "Cluster {} is still running - terminate manually to avoid "
                    "additional compute costs".format(
                        colored(self.cluster_name, "green")),
                    "red",
                )))
Example #4

import json
import os

from adal.adal_error import AdalError
from azureml.core import Workspace
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.exceptions import (AuthenticationException, ComputeTargetException,
                                ProjectSystemException)
from msrest.exceptions import AuthenticationError

def main():
    # Loading input values
    print("::debug::Loading input values")
    parameters_file = os.environ.get("INPUT_PARAMETERSFILE",
                                     default="workspace.json")
    azure_credentials = os.environ.get("INPUT_AZURECREDENTIALS", default="{}")
    azure_credentials = json.loads(azure_credentials)

    # Loading parameters file
    print("::debug::Loading parameters file")
    parameters_file_path = os.path.join(".aml", parameters_file)
    try:
        with open(parameters_file_path) as f:
            parameters = json.load(f)
    except FileNotFoundError:
        print(
            f"::error::Could not find parameter file in {parameters_file_path}. Please provide a parameter file in your repository (e.g. .aml/workspace.json)."
        )
        return

    # Loading Workspace
    sp_auth = ServicePrincipalAuthentication(
        tenant_id=azure_credentials.get("tenantId", ""),
        service_principal_id=azure_credentials.get("clientId", ""),
        service_principal_password=azure_credentials.get("clientSecret", ""))
    try:
        print("::debug::Loading existing Workspace")
        ws = Workspace.get(
            name=parameters.get("name", None),
            subscription_id=azure_credentials.get("subscriptionId", ""),
            resource_group=parameters.get("resourceGroup", None),
            auth=sp_auth)
        print("::debug::Successfully loaded existing Workspace")
    except AuthenticationException as exception:
        print(
            f"::error::Could not retrieve user token. Please paste output of `az ad sp create-for-rbac --name <your-sp-name> --role contributor --scopes /subscriptions/<your-subscriptionId>/resourceGroups/<your-rg> --sdk-auth` as value of secret variable: AZURE_CREDENTIALS: {exception}"
        )
        return
    except AuthenticationError as exception:
        print(f"::error::Microsoft REST Authentication Error: {exception}")
        return
    except AdalError as exception:
        print(
            f"::error::Active Directory Authentication Library Error: {exception}"
        )
        return
    except ProjectSystemException as exception:
        print(f"::error::Workspace authorizationfailed: {exception}")
        return

    # Load the AML Compute cluster, updating it if it exists or creating a new one otherwise.
    try:
        # Loading AMLCompute
        print("::debug::Loading existing AML Compute")
        cluster = AmlCompute(workspace=ws, name=parameters["name"])

        # Check settings and redeploy if required settings have changed
        print("::debug::Found existing cluster")
        if (cluster.vm_size.lower() != parameters["vm_size"].lower() or
                cluster.vm_priority.lower() != parameters["vm_priority"].lower()):
            cluster.delete()
            cluster.wait_for_completion(show_output=True)
            raise ComputeTargetException(
                "Cluster is of incorrect size or has incorrect priority. Deleting cluster and provisioning a new one."
            )

        # Update AMLCompute
        #if cluster.provisioning_configuration.min_nodes != aml_settings["min_nodes"] or cluster.provisioning_configuration.max_nodes != aml_settings["max_nodes"] or cluster.provisioning_configuration.idle_seconds_before_scaledown != aml_settings["idle_seconds_before_scaledown"]:
        print("::debug::Updating settings of Cluster")
        cluster.update(min_nodes=parameters["min_nodes"],
                       max_nodes=parameters["max_nodes"],
                       idle_seconds_before_scaledown=parameters[
                           "idle_seconds_before_scaledown"])

        # Wait until the operation has completed
        cluster.wait_for_completion(show_output=True)

        print("::debug::Successfully updated Cluster definition")
    except ComputeTargetException:
        print("::debug::Loading failed")
        print("::debug::Creating new AML Compute resource")
        compute_config = AmlCompute.provisioning_configuration(
            vm_size=parameters["vm_size"],
            vm_priority=parameters["vm_priority"],
            min_nodes=parameters["min_nodes"],
            max_nodes=parameters["max_nodes"],
            idle_seconds_before_scaledown=parameters[
                "idle_seconds_before_scaledown"],
            tags=parameters["tags"],
            description=parameters["description"])

        # Deploy to VNET if provided
        if parameters["vnet_resource_group_name"] and parameters[
                "vnet_name"] and parameters["subnet_name"]:
            compute_config.vnet_resourcegroup_name = parameters[
                "vnet_resource_group_name"]
            compute_config.vnet_name = parameters["vnet_name"]
            compute_config.subnet_name = parameters["subnet_name"]

        # Set Credentials if provided
        if parameters["admin_username"] and parameters["admin_user_password"]:
            compute_config.admin_username = parameters["admin_username"]
            compute_config.admin_user_password = parameters[
                "admin_user_password"]
        elif parameters["admin_username"] and parameters["admin_user_ssh_key"]:
            compute_config.admin_username = parameters["admin_username"]
            compute_config.admin_user_ssh_key = parameters[
                "admin_user_ssh_key"]

        # Create Compute Target
        cluster = ComputeTarget.create(
            workspace=ws,
            name=parameters["name"],
            provisioning_configuration=compute_config)

        # Wait until the cluster is attached
        cluster.wait_for_completion(show_output=True)

    # Checking status of AMLCompute Cluster
    print("::debug::Checking status of AMLCompute Cluster")
    if cluster.provisioning_state == "Failed":
        cluster.delete()
        raise Exception(
            "::error::Deployment of AMLCompute Cluster failed with the following status: {} and logs: \n{}"
            .format(cluster.provisioning_state, cluster.provisioning_errors))

    print(parameters)
    print(
        "::debug::Successfully finished Azure Machine Learning Compute Action")