Python AmlCompute.wait_for_completion Beispiele, azureml.core.compute.AmlCompute.wait_for_completion Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: main_azure.py Projekt: Anishtalukdar/introact-repo

def ComputeCompute():
    """Return an existing AML compute cluster or provision a new one.

    All settings are read from the JSON body of the current ``request``:
    ``subscription_id``, ``resource_group``, ``workspace_name``,
    ``location``, ``cluster_name``, ``vm_size``, ``min_nodes`` and
    ``max_nodes``.

    Returns:
        str: ``"Found existing AML compute context."`` when the cluster could
        be loaded, otherwise ``"Compute successfully created"`` once
        ``wait_for_completion`` has returned for the new cluster.
    """
    payload = request.json
    subscription_id = payload['subscription_id']
    resource_group = payload['resource_group']
    workspace_name = payload['workspace_name']
    # Not used below; the lookup is kept so that a payload without
    # 'location' is still rejected exactly as before.
    location = payload['location']
    cluster_name = payload['cluster_name']
    vm_size = payload['vm_size']
    min_nodes = payload['min_nodes']
    max_nodes = payload['max_nodes']
    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)

    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    try:
        aml_compute = AmlCompute(ws, cluster_name)
        print('Found existing AML compute context.')
        return "Found existing AML compute context."
    except Exception:
        # Was a bare ``except:``, which also intercepted KeyboardInterrupt and
        # SystemExit. Any ordinary failure to load the cluster still falls
        # through to creation, as before.
        print('need to create new Compute.')
        print('Creating new AML compute context.')
        aml_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                           min_nodes=min_nodes,
                                                           max_nodes=max_nodes)
        aml_compute = AmlCompute.create(ws,
                                        name=cluster_name,
                                        provisioning_configuration=aml_config)
        # Block until provisioning has finished, echoing progress.
        aml_compute.wait_for_completion(show_output=True)
        return "Compute successfully created"

Beispiel #2

0

Datei anzeigen

def get_aml_compute(workspace):
    """Load the compute target from ``workspace``, creating it if missing.

    Args:
        workspace: workspace object passed straight to ``AmlCompute`` and
            ``ComputeTarget.create``.

    Returns:
        The existing or newly provisioned compute target.
    """
    # TODO: Set desired name for compute target
    target_name = "example_vm_name"
    try:
        cluster = AmlCompute(workspace, target_name)
    except ComputeTargetException:
        print("creating new compute target")

        # TODO: Configure desired VM, see: https://docs.microsoft.com/nl-nl/azure/virtual-machines/sizes-general
        vm_options = dict(
            vm_size="STANDARD_D1_V2",
            min_nodes=0,
            max_nodes=1,
            vnet_resourcegroup_name="",
            vnet_name="-vn",
            subnet_name="default",
            idle_seconds_before_scaledown=1800,
            vm_priority='lowpriority',
        )
        cluster_config = AmlCompute.provisioning_configuration(**vm_options)
        cluster = ComputeTarget.create(workspace, target_name, cluster_config)
        # Wait (up to 20 minutes) for provisioning, echoing progress.
        cluster.wait_for_completion(
            show_output=True,
            min_node_count=None,
            timeout_in_minutes=20,
        )
    else:
        print("found existing compute target.")

    print("Azure Machine Learning Compute attached")
    return cluster

Beispiel #3

0

Datei anzeigen

def choose_compute_target(workspace, name):
    """Return the compute target called ``name``, provisioning it if needed.

    Args:
        workspace: workspace object passed to ``AmlCompute`` and
            ``ComputeTarget.create``.
        name: name of the compute target to load or create.

    Returns:
        The existing or newly created compute target.
    """
    try:
        aml_compute = AmlCompute(workspace, name)
        print("Found existing compute target: {}".format(name))
    except Exception:
        # Was a bare ``except:``, which also intercepted KeyboardInterrupt and
        # SystemExit; ordinary lookup failures still trigger creation.
        print("Creating new compute target: {}".format(name))

        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size="STANDARD_D2_V2",
            min_nodes=1,
            max_nodes=4)
        aml_compute = ComputeTarget.create(workspace, name, provisioning_config)
        # Block until provisioning has finished, echoing progress.
        aml_compute.wait_for_completion(show_output=True)
    print(aml_compute)
    return aml_compute

Beispiel #4

0

Datei anzeigen

Datei: setup.py Projekt: akshay-0/broccoli

def setup(num):
    """Prepare workspace number ``num``: workspace, compute and sample dataset.

    The workspace name is built from the module-level ``workspace_prefix`` and
    ``location`` plus ``num`` (zero-padded to two digits). The workspace and
    the compute target are fetched if present and created otherwise; the
    ``testdata`` upload and the dataset registration follow.
    """
    workspace_name = '%s-%s-%02d' % (workspace_prefix, location, num)
    ws_identity = dict(
        name=workspace_name,
        subscription_id=subscription_id,
        resource_group=resource_group,
    )

    try:
        ws = Workspace.get(**ws_identity)
        print('Found existing workspace %s' % workspace_name)
    except WorkspaceException:
        print('Creating new workspace %s...' % workspace_name)
        ws = Workspace.create(location=location, **ws_identity)

    try:
        compute_target = AmlCompute(ws, compute_name)
        print('Found existing compute %s' % compute_name)

        # Kept inside the try: a ComputeTargetException raised by the update
        # also leads to the creation branch below.
        compute_target.update(min_nodes=min_nodes, max_nodes=max_nodes)
    except ComputeTargetException:
        print('Creating new compute target %s...' % compute_name)

        compute_config = AmlCompute.provisioning_configuration(
            vm_size=vm_size,
            min_nodes=min_nodes,
            max_nodes=max_nodes,
        )
        compute_target = ComputeTarget.create(ws, compute_name, compute_config)
        compute_target.wait_for_completion(show_output=True, timeout_in_minutes=20)

    datastore = ws.get_default_datastore()
    datastore.upload("testdata")

    dataset_name = 'sample_dataset'

    # Nothing to register when the workspace already lists the dataset.
    if dataset_name in ws.datasets:
        print('Dataset already exists')
        return

    sample_files = Dataset.File.from_files(path=[(datastore, 'testdata.txt')])
    sample_files.register(
        workspace=ws,
        name=dataset_name,
        description='Sample data for load test')

    print('Dataset successfully registered')

Beispiel #5

0

Datei anzeigen

from azureml.core import Workspace
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Look up the workspace that owns (or will own) the cluster.
ws = Workspace.get(
    name='akws',
    subscription_id='8b3748c0-bb0b-4913-ab5b-c462062118fe',
    resource_group='akrg',
)

cpu_cluster_name = 'tdsp-cluster'

# Reuse the cluster when it can be loaded; otherwise provision a new one.
try:
    cpu_cluster = AmlCompute(ws, cpu_cluster_name)
    print('Cluster already exists.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_NC6',
        max_nodes=4,
    )
    cpu_cluster = AmlCompute.create(
        ws,
        name=cpu_cluster_name,
        provisioning_configuration=compute_config,
    )

# Runs for both the existing and the freshly created cluster.
cpu_cluster.wait_for_completion(show_output=True)

Beispiel #6

0

Datei anzeigen

Datei: create_aml_cluster.py Projekt: himanshuahlawat31/mlops-quickstart

)
parser.add_argument(
    "--path",
    dest="path",
    type=str,
    required=True,
    help="path",
)
args = parser.parse_args()

# Echo the parsed command-line arguments.
print("Argument 1: %s" % args.aml_compute_target)
print("Argument 2: %s" % args.path)

# Build the authentication object handed to the workspace lookup.
print("creating AzureCliAuthentication...")
cli_auth = AzureCliAuthentication()
print("done creating AzureCliAuthentication!")

# Load the workspace from the config found under --path.
print("get workspace...")
ws = Workspace.from_config(auth=cli_auth, path=args.path)
print("done getting workspace!")

# Reuse the named compute target, or provision it when the lookup fails.
try:
    aml_compute = AmlCompute(workspace=ws, name=args.aml_compute_target)
    print("found existing compute target.")
except ComputeTargetException:
    print("creating new compute target")

    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_DS4_V2",
        min_nodes=1,
        max_nodes=8,
    )
    aml_compute = ComputeTarget.create(
        workspace=ws,
        name=args.aml_compute_target,
        provisioning_configuration=provisioning_config,
    )
    aml_compute.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )

print("Aml Compute attached")

Beispiel #7

0

Datei anzeigen

# Name of the CPU cluster comes from the configuration mapping.
cpu_compute_name = config['cpu_compute']
try:
    cpu_compute_target = AmlCompute(ws, cpu_compute_name)
    print("found existing compute target: %s" % cpu_compute_name)
except Exception:
    # Was a bare ``except:``, which also intercepted KeyboardInterrupt and
    # SystemExit; ordinary lookup failures still trigger creation.
    print("creating new compute target")

    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
        idle_seconds_before_scaledown=1800)
    cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name,
                                              provisioning_config)
    # Wait (up to 20 minutes) for provisioning, echoing progress.
    cpu_compute_target.wait_for_completion(show_output=True,
                                           min_node_count=None,
                                           timeout_in_minutes=20)

# use get_status() to get a detailed status for the current cluster.
print(cpu_compute_target.get_status().serialize())

# choose a name for your cluster
gpu_compute_name = config['gpu_compute']

try:
    gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
    print("found existing compute target: %s" % gpu_compute_name)
except:
    print('Creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',

Beispiel #8

0

Datei anzeigen

Datei: hyperparameter_tuning.py Projekt: soniaang/ACE_Azure_ML

# Name of the cluster to reuse or create.
cluster_name = "gpucluster"

try:
    compute_target = AmlCompute(ws, cluster_name)
    print('Found existing compute target.')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=4,
        idle_seconds_before_scaledown=1800,
    )

    # Provision the cluster and block until it is ready.
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cluster_name,
        provisioning_configuration=compute_config,
    )
    compute_target.wait_for_completion(show_output=True)

# get_status() gives a detailed status for the current cluster.
print(compute_target.get_status().serialize())


# Create a project directory

# Create a directory that will contain all the necessary code from your 
# local machine that you will need access to on the remote resource. This 
# includes the training script and any additional files your training script 
# depends on.

import os

project_folder = './pytorch-hymenoptera'

Beispiel #9

0

Datei anzeigen

def build_pipeline(dataset, ws, config):
    """Build, validate, publish and schedule the prednet pipeline for ``dataset``.

    Steps performed, in order: copy the pipeline scripts into ``./scripts``,
    load or create the CPU and GPU compute targets, define the four pipeline
    steps (video decoding, data preparation, HyperDrive training, model
    registration), validate and publish the pipeline, then create a
    datastore-triggered schedule for it.

    Args:
        dataset: dataset name; used in datastore paths, as a training
            argument and as the suffix of the pipeline name.
        ws: workspace the compute targets, pipeline and schedule belong to.
        config: mapping read for the keys ``cpu_compute``, ``gpu_compute``,
            ``acr_address``, ``acr_username`` and ``acr_password``.

    Returns:
        str: the published pipeline's name, ``'prednet_' + dataset``.
    """
    print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))

    base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    # Same files, same order as the original one-call-per-file sequence.
    script_files = (
        'video_decoding.py',
        'pipelines_submit.py',
        'pipelines_create.py',
        'train.py',
        'data_utils.py',
        'prednet.py',
        'keras_utils.py',
        'data_preparation.py',
        'model_registration.py',
        'config.json',
    )
    for script_file in script_files:
        shutil.copy(os.path.join(base_dir, script_file), script_folder)

    cpu_compute_name = config['cpu_compute']
    try:
        cpu_compute_target = AmlCompute(ws, cpu_compute_name)
        print("found existing compute target: %s" % cpu_compute_name)
    except Exception:
        # Was a bare ``except:``, which also intercepted KeyboardInterrupt and
        # SystemExit; ordinary lookup failures still trigger creation.
        print("creating new compute target")

        provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                                    max_nodes=4,
                                                                    idle_seconds_before_scaledown=1800)
        cpu_compute_target = ComputeTarget.create(ws, cpu_compute_name, provisioning_config)
        cpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    print(cpu_compute_target.get_status().serialize())

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']

    try:
        gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
        print("found existing compute target: %s" % gpu_compute_name)
    except Exception:
        # Narrowed from a bare ``except:`` for the same reason as above.
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',
                                                                    max_nodes=10,
                                                                    idle_seconds_before_scaledown=1800)

        # create the cluster
        gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name, provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        gpu_compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # use get_status() to get a detailed status for the current cluster.
    # Best effort: a failing status query is reported but does not abort the
    # build. Narrowed from BaseException so Ctrl-C / SystemExit still propagate.
    try:
        print(gpu_compute_target.get_status().serialize())
    except Exception as e:
        print("Could not get status of compute target.")
        print(e)

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(conda_packages=["py-opencv=3.4.2"], pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D', pip_packages=["azure-storage-blob==1.5.0", "hickle==3.4.3", "requests==2.21.0", "sklearn", "pandas==0.24.2", "azureml-sdk", "numpy==1.16.2", "pillow==6.0.0"])

    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    print("PipelineData object created")

    # DataReference to where video data is stored.
    video_data = DataReference(
        datastore=def_blob_store,
        data_reference_name="video_data",
        path_on_datastore=os.path.join("prednet", "data", "video", dataset))
    print("DataReference object created")

    # Intermediate data handed from one pipeline step to the next.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames", datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    print("video_decode step created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(
        name='prepare_data',
        script_name="data_preparation.py",
        arguments=["--input_data", raw_data, "--output_data", preprocessed_data],
        inputs=[raw_data],
        outputs=[preprocessed_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.']
    )
    data_prep.run_after(video_decoding)

    print("data_prep step created")

    # configure access to ACR for pulling our custom docker image
    acr = ContainerRegistry()
    acr.address = config['acr_address']
    acr.username = config['acr_username']
    acr.password = config['acr_password']

    est = Estimator(source_directory=script_folder,
                    compute_target=gpu_compute_target,
                    entry_script='train.py',
                    use_gpu=True,
                    node_count=1,
                    custom_docker_image="wopauli_1.8-gpu:1",
                    image_registry_details=acr,
                    user_managed=True
                    )

    # Random search space for the HyperDrive run.
    ps = RandomParameterSampling(
        {
            '--batch_size': choice(1, 2, 4, 8),
            '--filter_sizes': choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
            '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),
            '--learning_rate': loguniform(-6, -1),
            '--lr_decay': loguniform(-9, -1),
            '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
            '--transfer_learning': choice("True", "False")
        }
    )

    policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=10)

    hdc = HyperDriveConfig(estimator=est,
                           hyperparameter_sampling=ps,
                           policy=policy,
                           primary_metric_name='val_loss',
                           primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                           max_total_runs=10,
                           max_concurrent_runs=5,
                           max_duration_minutes=60*6
                           )

    hd_step = HyperDriveStep(
        name="train_w_hyperdrive",
        hyperdrive_run_config=hdc,
        estimator_entry_script_arguments=[
            '--data-folder', preprocessed_data,
            '--remote_execution',
            '--dataset', dataset
            ],
        inputs=[preprocessed_data],
        metrics_output=data_metrics,
        allow_reuse=True
    )
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=cpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.']
    )
    registration_step.run_after(hd_step)

    pipeline = Pipeline(workspace=ws, steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    published_pipeline = pipeline.publish(name=pipeline_name)

    # Schedule bound to the dataset's 'Train' folder on the default datastore.
    schedule = Schedule.create(workspace=ws, name=pipeline_name + "_sch",
                               pipeline_id=published_pipeline.id,
                               experiment_name=pipeline_name,
                               datastore=def_blob_store,
                               wait_for_provisioning=True,
                               description="Datastore scheduler for Pipeline" + pipeline_name,
                               path_on_datastore=os.path.join('prednet/data/video', dataset, 'Train'),
                               polling_interval=1
                               )

    return pipeline_name

Beispiel #10

0

Datei anzeigen

outputDf.T

from azureml.core.compute import AmlCompute

# Name of the cluster to reuse or create.
aml_name = 'cpu-cluster'
try:
    aml_compute = AmlCompute(ws, aml_name)
    print('Found existing AML compute context.')
except Exception:
    # Was a bare ``except:``, which also intercepted KeyboardInterrupt and
    # SystemExit; ordinary lookup failures still trigger creation.
    print('Creating new AML compute context.')
    aml_config = AmlCompute.provisioning_configuration(
        vm_size="Standard_D2_v2", min_nodes=1, max_nodes=4)
    aml_compute = AmlCompute.create(ws,
                                    name=aml_name,
                                    provisioning_configuration=aml_config)
    # Block until provisioning has finished, echoing progress.
    aml_compute.wait_for_completion(show_output=True)

#writefile get_data.py

from sklearn import datasets
from sklearn.model_selection import train_test_split
from scipy import sparse
import numpy as np

#def get_data():
# Load the training table from the local CSV file.
boston = pd.read_csv(
    'C:\\Users\\datacore\\OneDrive\\Desktop\\Capstone Project\\train_values_wJZrCmI.csv'
)
# Features: every column except the target.
X = boston.drop('poverty_probability', axis=1)
# Target column as a NumPy array.
y = boston['poverty_probability'].to_numpy()

Beispiel #11

0

Datei anzeigen

# Reuse the named cluster, or provision it when the lookup fails.
try:
    batch_ai_compute = AmlCompute(workspace=ws, name=compute_target_name)
    print('found existing Azure Batch AI cluster:', batch_ai_compute.name)
except ComputeTargetException:
    print('creating new Azure Batch AI cluster...')
    batch_ai_config = AmlCompute.provisioning_configuration(
        vm_size="Standard_NC6",
        vm_priority="dedicated",
        min_nodes=0,
        max_nodes=4,
        idle_seconds_before_scaledown=300)
    batch_ai_compute = AmlCompute.create(
        ws,
        name=compute_target_name,
        provisioning_configuration=batch_ai_config)
    # Block until provisioning has finished, echoing progress.
    batch_ai_compute.wait_for_completion(show_output=True)

project_folder = './tmp/automl-remote-batchai'
# exist_ok=True replaces the separate os.path.exists() check, which left a
# window in which the directory could appear between the check and the create.
os.makedirs(project_folder, exist_ok=True)

shutil.copy('./scripts/get_data.py', project_folder)

print("Training the model...")
# configure Auto ML
automl_config = AutoMLConfig(task='classification',
                             debug_log='automl_errors.log',
                             primary_metric='AUC_weighted',
                             iteration_timeout_minutes=2,
                             iterations=20,
                             n_cross_validations=5,

Beispiel #12

0

Datei anzeigen

Datei: clusterconnector.py Projekt: numericalalgorithmsgroup/AzureML_Best_Practice

class ClusterConnector:
    """Thin wrapper class around azureml.core.compute.AmlCompute

    Provides parallel ssh objects and helper for master node and all node commands
    and file copies.

    Usage:
    >>> cc = ClusterConnector(workspace, "MyCluster", sshkey, "Standard_ND40rs_v2")
    >>> cc.initialise(min_nodes=0, max_nodes=4, idle_timeout_secs=30)
    >>> cluster = cc.cluster
    >>> for node in cc.cluster.list_nodes():
    ...     print(node['name'])
    """

    def __init__(
        self,
        workspace,
        cluster_name,
        ssh_key,
        vm_type,
        admin_username="******",
    ):
        """Store the cluster settings and route pssh host logs to a file.

        Args:
            workspace: workspace the cluster lives in; its ``name`` is used
                in the log file name.
            cluster_name: name of the AmlCompute cluster to update or create.
            ssh_key: passed as ``admin_user_ssh_key`` when creating a cluster.
            vm_type: passed as ``vm_size`` when creating a cluster.
            admin_username: admin user for cluster creation and ssh logins.
        """

        self.cluster_name = cluster_name
        self.workspace = workspace
        self.ssh_key = ssh_key
        self.vm_type = vm_type
        self.admin_username = admin_username

        enable_host_logger()
        hlog = logging.getLogger("pssh.host_logger")
        tstr = datetime.now().isoformat(timespec="minutes")
        # Replace the host logger's stream handlers with a per-run log file.
        self._remove_stream_handlers(hlog)
        os.makedirs("clusterlogs", exist_ok=True)
        self.logfile = "clusterlogs/{}_{}.log".format(self.workspace.name,
                                                      tstr)
        hlog.addHandler(logging.FileHandler(self.logfile))

        # Populated by initialise() / _create_cluster_ssh_conns().
        self.cluster = None
        self._master_scp = None
        self._master_ssh = None
        self._all_ssh = None

    @staticmethod
    def _remove_stream_handlers(logger):
        """Detach every ``logging.StreamHandler`` (subclasses included)."""
        # Iterate over a snapshot: removeHandler() mutates logger.handlers, and
        # removing while iterating the live list skips the following handler.
        for handler in list(logger.handlers):
            if isinstance(handler, logging.StreamHandler):
                logger.removeHandler(handler)

    def initialise(self, min_nodes=0, max_nodes=0, idle_timeout_secs=1800):
        """Initialise underlying AmlCompute cluster instance"""
        self._create_or_update_cluster(min_nodes, max_nodes, idle_timeout_secs)

    def _check_logs_emessage(self, host, port):
        """Return the error text pointing the user at the host log file."""
        msg = "Remote command failed on {}:{}. For details see {}".format(
            host, port, self.logfile)
        return msg

    def terminate(self):
        """Scale the cluster down to zero nodes.

        Raises:
            RuntimeError: if the update fails or nodes are still listed
                after ``wait_for_completion`` returns.
        """

        print('Attempting to terminate cluster "{}"'.format(
            colored(self.cluster_name, "green")))
        try:
            self.cluster.update(min_nodes=0,
                                max_nodes=0,
                                idle_seconds_before_scaledown=10)
            self.cluster.wait_for_completion()
        except ComputeTargetException as err:
            raise RuntimeError(
                "Failed to terminate cluster nodes ({})".format(err)) from err

        if self.cluster.list_nodes():
            raise RuntimeError(
                "Failed to terminate cluster nodes (nodes still running)")

    @property
    def cluster_nodes(self):
        """Current node list, refreshed and sorted by each node's ``port``."""
        self.cluster.refresh_state()
        return sorted(self.cluster.list_nodes(), key=lambda n: n["port"])

    def _create_or_update_cluster(self, min_nodes, max_nodes,
                                  idle_timeout_secs):
        """Update the named cluster if it exists, otherwise create it.

        Raises:
            RuntimeError: if fewer than ``min_nodes`` nodes are listed after
                waiting for completion and one 30-second retry.
        """

        try:
            self.cluster = AmlCompute(workspace=self.workspace,
                                      name=self.cluster_name)
            print('Updating existing cluster "{}"'.format(
                colored(self.cluster_name, "green")))
            self.cluster.update(
                min_nodes=min_nodes,
                max_nodes=max_nodes,
                idle_seconds_before_scaledown=idle_timeout_secs,
            )
        except ComputeTargetException:
            print('Creating new cluster "{}"'.format(
                colored(self.cluster_name, "green")))
            cluster_config = AmlCompute.provisioning_configuration(
                vm_size=self.vm_type,
                min_nodes=min_nodes,
                max_nodes=max_nodes,
                idle_seconds_before_scaledown=idle_timeout_secs,
                admin_username=self.admin_username,
                admin_user_ssh_key=self.ssh_key,
                remote_login_port_public_access="Enabled",
            )
            self.cluster = AmlCompute.create(self.workspace, self.cluster_name,
                                             cluster_config)

        self.cluster.wait_for_completion()

        # Give the node list one extra 30 s grace period before giving up.
        if len(self.cluster_nodes) < min_nodes:
            sleep(30)
            if len(self.cluster_nodes) < min_nodes:
                raise RuntimeError("Failed to provision sufficient nodes")

    def _copy_nodefile_to_nodes(self):
        """Collect each node's ib0 address and copy the list to all nodes.

        The addresses are written one per line to a temporary file (master
        node first) which is copied to ``./nodefile`` on every node. Skipped
        for single-node clusters.

        Raises:
            RuntimeError: if the address query fails on a node or returns
                no output.
        """

        if len(self.cluster_nodes) == 1:
            cprint("Single node cluster -- skipping IB config", "yellow")
            return

        print("Collecting cluster IB info")

        outputs = self._all_ssh.run_command(
            r'ifconfig ib0 | grep -oe "inet[^6][adr: ]*[0-9.]*" | cut -d" " -f2',
            shell="bash -c",
        )
        self._all_ssh.join(outputs)

        ibaddrs = []
        for output in outputs:
            host = output.host
            port = output.client.port
            if output.exit_code != 0:
                print(list(output.stdout))
                print(list(output.stderr))
                raise RuntimeError("Failed to get IB ip for {}:{}".format(
                    host, port))
            try:
                ibaddr = list(output.stdout)[0].split()[0]
            except IndexError as err:
                raise RuntimeError("Failed to get IB ip for {}:{} - "
                                   "No ib interface found!".format(
                                       host, port)) from err
            print("Mapping {}:{} -> {}".format(host, port, ibaddr))
            if port == self._master_scp.port:
                # The master's address always goes first in the node file.
                cprint("IB Master: {}".format(ibaddr), "green")
                ibaddrs = [ibaddr] + ibaddrs
            else:
                ibaddrs.append(ibaddr)

        # delete=False: the file must outlive this block so it can be copied.
        with NamedTemporaryFile(delete=False, mode="wt") as nfh:
            self.nodefile = nfh.name
            for addr in ibaddrs:
                nfh.write("{}\n".format(addr))

        self.ibaddrs = ibaddrs
        self.copy_to_all_nodes(self.nodefile, "./nodefile")

    def _create_cluster_ssh_conns(self):
        """Create the all-node and master-node ssh/scp clients.

        The first node in ``cluster_nodes`` (lowest port) is the master.
        """

        hostips = [n["publicIpAddress"] for n in self.cluster_nodes]
        hostconfigs = [HostConfig(port=n["port"]) for n in self.cluster_nodes]

        self._all_ssh = ParallelSSHClient(hostips,
                                          host_config=hostconfigs,
                                          user=self.admin_username)

        self._master_ssh = ParallelSSHClient(hostips[:1],
                                             host_config=hostconfigs[:1],
                                             user=self.admin_username)

        self._master_scp = SSHClient(hostips[0],
                                     port=hostconfigs[0].port,
                                     user=self.admin_username)

    def copy_to_all_nodes(self, source, dest):
        """Copy local ``source`` to ``dest`` on every node, raising on error."""

        copy_jobs = self._all_ssh.copy_file(source, dest)
        joinall(copy_jobs, raise_error=True)

    def copy_to_master_node(self, source, dest):
        """Copy local ``source`` to ``dest`` on the master node."""

        self._master_scp.copy_file(source, dest)

    def copy_from_master_node(self, source, dest):
        """Copy ``source`` on the master node to local ``dest``."""

        self._master_scp.copy_remote_file(source, dest)

    def run_on_all_nodes(self, command):
        """Run ``command`` via ``bash -c`` on every node.

        Raises:
            RuntimeError: if any node reports a non-zero exit code.
        """

        outputs = self._all_ssh.run_command(command, shell="bash -c")
        self._all_ssh.join(outputs, consume_output=True)

        for output in outputs:
            if int(output.exit_code) != 0:
                host = output.host
                port = output.client.port
                raise RuntimeError(self._check_logs_emessage(host, port))

    def run_on_master_node(self, command):
        """Run ``command`` via ``bash -c`` on the master node only.

        Raises:
            RuntimeError: if the command reports a non-zero exit code.
        """

        outputs = self._master_ssh.run_command(command, shell="bash -c")
        self._master_ssh.join(outputs)

        for output in outputs:
            if int(output.exit_code) != 0:
                host = output.host
                port = output.client.port
                raise RuntimeError(self._check_logs_emessage(host, port))

    def attempt_termination(self):
        """Call terminate(); on RuntimeError print it and warn instead of raising."""
        try:
            self.terminate()
        except RuntimeError as err:
            print(colored("ERROR: {}\n\n", "red", attrs=["bold"]).format(err))
            self.warn_unterminated()

    def warn_unterminated(self):
        """Print a warning that the cluster is still running and costing money."""
        print(
            colored("WARNING: {}", "red", attrs=["bold"]).format(
                colored(
                    "Cluster {} is still running - terminate manually to avoid "
                    "additional compute costs".format(
                        colored(self.cluster_name, "green")),
                    "red",
                )))

Beispiel #13

0

Datei anzeigen

Datei: pipelines_build.py Projekt: clauren42/mlsolutions

def _get_or_create_compute(ws, name, vm_size, max_nodes, create_message):
    """Return the AmlCompute target ``name`` in ``ws``, provisioning it if the lookup fails.

    Args:
        ws: workspace passed to ``AmlCompute`` / ``ComputeTarget.create``.
        name: name of the compute target to look up or create.
        vm_size: VM SKU used only when a new cluster has to be provisioned.
        max_nodes: upper node bound used only when provisioning.
        create_message: text printed just before provisioning starts.

    Returns:
        The existing or newly created compute target.
    """
    try:
        target = AmlCompute(workspace=ws, name=name)
    except ComputeTargetException:
        print(create_message)
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size=vm_size,
            max_nodes=max_nodes,
            idle_seconds_before_scaledown=1800)
        target = ComputeTarget.create(ws, name, provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it uses the scale settings for the cluster
        target.wait_for_completion(show_output=True,
                                   min_node_count=None,
                                   timeout_in_minutes=20)
    else:
        print("found existing compute target: %s" % name)

    # use get_status() to get a detailed status for the current cluster.
    print(target.get_status().serialize())
    return target


def build_pipeline(dataset, ws, config):
    """Build, validate and publish the prednet pipeline for ``dataset``.

    The pipeline has four steps that run in sequence: video decoding, data
    preparation, a HyperDrive training sweep and model registration.

    Args:
        dataset: dataset name; used as the last path component of the video
            data on the default datastore and as the pipeline-name suffix.
        ws: workspace that owns the datastore, compute targets and pipeline.
        config: mapping providing the ``'cpu_compute'`` and ``'gpu_compute'``
            compute target names.

    Returns:
        The name under which the pipeline was published
        (``'prednet_' + dataset``).
    """
    print("building pipeline for dataset %s in workspace %s" %
          (dataset, ws.name))

    # The source files are expected next to the current working directory.
    base_dir = '.'

    def_blob_store = ws.get_default_datastore()

    # folder for scripts that need to be uploaded to Aml compute target
    script_folder = './scripts'
    os.makedirs(script_folder, exist_ok=True)

    files_to_upload = (
        'video_decoding.py',
        'pipelines_submit.py',
        'pipelines_build.py',
        'train.py',
        'data_utils.py',
        'prednet.py',
        'keras_utils.py',
        'data_preparation.py',
        'model_registration.py',
        'config.json',
    )
    for file_name in files_to_upload:
        shutil.copy(os.path.join(base_dir, file_name), script_folder)

    cpu_compute_name = config['cpu_compute']
    cpu_compute_target = _get_or_create_compute(
        ws,
        cpu_compute_name,
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
        create_message="creating new compute target")

    # choose a name for your cluster
    gpu_compute_name = config['gpu_compute']
    gpu_compute_target = _get_or_create_compute(
        ws,
        gpu_compute_name,
        vm_size='STANDARD_NC6',
        max_nodes=5,
        create_message='Creating a new compute target...')

    # conda dependencies for compute targets
    cpu_cd = CondaDependencies.create(conda_packages=["py-opencv=3.4.2"],
                                      pip_packages=[
                                          "azure-storage-blob==1.5.0",
                                          "hickle==3.4.3", "requests==2.21.0",
                                          "sklearn", "pandas==0.24.2",
                                          "azureml-sdk==1.0.21",
                                          "numpy==1.16.2", "pillow==6.0.0"
                                      ])
    gpu_cd = CondaDependencies.create(pip_packages=[
        "keras==2.0.8", "theano==1.0.4", "tensorflow==1.8.0",
        "tensorflow-gpu==1.8.0", "hickle==3.4.3", "matplotlib==3.0.3",
        "seaborn==0.9.0", "requests==2.21.0", "bs4==0.0.1", "imageio==2.5.0",
        "sklearn", "pandas==0.24.2", "azureml-sdk==1.0.21", "numpy==1.16.2"
    ])

    # Runconfigs
    cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
    cpu_compute_run_config.environment.docker.enabled = True
    cpu_compute_run_config.environment.docker.gpu_support = False
    cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
    cpu_compute_run_config.environment.spark.precache_packages = False

    gpu_compute_run_config = RunConfiguration(conda_dependencies=gpu_cd)
    gpu_compute_run_config.environment.docker.enabled = True
    gpu_compute_run_config.environment.docker.gpu_support = True
    gpu_compute_run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE
    gpu_compute_run_config.environment.spark.precache_packages = False

    # Input: the raw videos for this dataset on the default datastore.
    video_data = DataReference(datastore=def_blob_store,
                               data_reference_name="video_data",
                               path_on_datastore=os.path.join(
                                   "prednet", "data", "video", dataset))
    print("DataReference object created")

    # Intermediate and final outputs handed from one step to the next.
    # NOTE(review): "raw_video_fames" looks like a typo for "frames"; it is
    # left unchanged because the string is the runtime name of the output.
    raw_data = PipelineData("raw_video_fames", datastore=def_blob_store)
    preprocessed_data = PipelineData("preprocessed_video_frames",
                                     datastore=def_blob_store)
    data_metrics = PipelineData("data_metrics", datastore=def_blob_store)
    data_output = PipelineData("output_data", datastore=def_blob_store)
    print("PipelineData object created")

    # prepare dataset for training/testing prednet
    video_decoding = PythonScriptStep(
        name='decode_videos',
        script_name="video_decoding.py",
        arguments=["--input_data", video_data, "--output_data", raw_data],
        inputs=[video_data],
        outputs=[raw_data],
        compute_target=cpu_compute_target,
        source_directory=script_folder,
        runconfig=cpu_compute_run_config,
        allow_reuse=True,
        hash_paths=['.'])
    print("video_decode created")

    # prepare dataset for training/testing recurrent neural network
    data_prep = PythonScriptStep(name='prepare_data',
                                 script_name="data_preparation.py",
                                 arguments=[
                                     "--input_data", raw_data, "--output_data",
                                     preprocessed_data
                                 ],
                                 inputs=[raw_data],
                                 outputs=[preprocessed_data],
                                 compute_target=cpu_compute_target,
                                 source_directory=script_folder,
                                 runconfig=cpu_compute_run_config,
                                 allow_reuse=True,
                                 hash_paths=['.'])
    data_prep.run_after(video_decoding)

    print("data_prep created")

    est = TensorFlow(source_directory=script_folder,
                     compute_target=gpu_compute_target,
                     pip_packages=[
                         'keras==2.0.8', 'theano', 'tensorflow==1.8.0',
                         'tensorflow-gpu==1.8.0', 'matplotlib', 'horovod',
                         'hickle'
                     ],
                     entry_script='train.py',
                     use_gpu=True,
                     node_count=1)

    # Search space for the HyperDrive sweep; list-valued options are passed
    # to train.py as comma-separated strings.
    ps = RandomParameterSampling({
        '--batch_size':
        choice(2, 4, 8, 16),
        '--filter_sizes':
        choice("3, 3, 3", "4, 4, 4", "5, 5, 5"),
        '--stack_sizes':
        choice("48, 96, 192", "36, 72, 144", "12, 24, 48"),  #, "48, 96"),
        '--learning_rate':
        loguniform(-6, -1),
        '--lr_decay':
        loguniform(-9, -1),
        '--freeze_layers':
        choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "1", "2",
               "3"),
        '--transfer_learning':
        choice("True", "False")
    })

    policy = BanditPolicy(evaluation_interval=2,
                          slack_factor=0.1,
                          delay_evaluation=20)

    hdc = HyperDriveRunConfig(
        estimator=est,
        hyperparameter_sampling=ps,
        policy=policy,
        primary_metric_name='val_loss',
        primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
        max_total_runs=5,  #100,
        max_concurrent_runs=5,  #10,
        max_duration_minutes=60 * 6)

    hd_step = HyperDriveStep(name="train_w_hyperdrive",
                             hyperdrive_run_config=hdc,
                             estimator_entry_script_arguments=[
                                 '--data-folder', preprocessed_data,
                                 '--remote_execution'
                             ],
                             inputs=[preprocessed_data],
                             metrics_output=data_metrics,
                             allow_reuse=True)
    hd_step.run_after(data_prep)

    registration_step = PythonScriptStep(
        name='register_model',
        script_name='model_registration.py',
        arguments=['--input_dir', data_metrics, '--output_dir', data_output],
        compute_target=gpu_compute_target,
        inputs=[data_metrics],
        outputs=[data_output],
        source_directory=script_folder,
        allow_reuse=True,
        hash_paths=['.'])
    registration_step.run_after(hd_step)

    pipeline = Pipeline(
        workspace=ws,
        steps=[video_decoding, data_prep, hd_step, registration_step])
    print("Pipeline is built")

    pipeline.validate()
    print("Simple validation complete")

    pipeline_name = 'prednet_' + dataset
    pipeline.publish(name=pipeline_name)

    return pipeline_name

Beispiel #14

0

Datei anzeigen

Datei: train_model_remote.py Projekt: olufemig/AutoMLExamples

# Imported here so this script section stays runnable on its own; a bare
# ``except:`` would also treat authentication errors or Ctrl-C as
# "cluster not found" and try to provision a new cluster.
from azureml.exceptions import ComputeTargetException

# Maximum cluster size; also used as the AutoML concurrency limit below.
nodes = 4

dsvm_name = 'dsvmaml'
try:
    dsvm_compute = AmlCompute(ws, dsvm_name)
    print('found existing dsvm.')
except ComputeTargetException:
    print('creating new dsvm.')
    # Provision with the Standard_NC6 SKU and allow scaling down to zero nodes.
    # You can check Azure virtual machines documentation for additional SKUs of VMs.
    dsvm_config = AmlCompute.provisioning_configuration(vm_size="Standard_NC6",
                                                        max_nodes=nodes,
                                                        min_nodes=0)
    dsvm_compute = AmlCompute.create(ws,
                                     name=dsvm_name,
                                     provisioning_configuration=dsvm_config)
    dsvm_compute.wait_for_completion(show_output=True)

# Settings for the AutoML experiment; the name is made unique with the
# current timestamp.
automl_settings = {
    "name": "AutoML_Demo_Experiment_{0}".format(time.time()),
    "iteration_timeout_minutes": 60,
    "iterations": 20,
    "n_cross_validations": 5,
    "primary_metric": 'accuracy',
    "preprocess": True,
    "verbosity": logging.INFO,
    "max_concurrent_iterations": nodes
}

## note here that the project folder gets uploaded to our DSVM.
## therefore we must have any extra classes/files in there as well i.e. app_helper.py and app_config.conf
automated_ml_config = AutoMLConfig(

Beispiel #15

0

Datei anzeigen

Datei: main.py Projekt: marvinbuss/AMLCompute

def _redact_secrets(parameters):
    """Return a copy of ``parameters`` with non-empty credential values masked for logging."""
    secret_keys = ("admin_user_password", "admin_user_ssh_key")
    return {
        key: ("***" if key in secret_keys and value else value)
        for key, value in parameters.items()
    }


def _build_compute_config(parameters):
    """Build the AmlCompute provisioning configuration described by ``parameters``.

    The sizing keys (``vm_size``, ``vm_priority``, ``min_nodes``,
    ``max_nodes``, ``idle_seconds_before_scaledown``) are required. Tags,
    description, VNET settings and admin credentials are applied only when
    present and non-empty.
    """
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=parameters["vm_size"],
        vm_priority=parameters["vm_priority"],
        min_nodes=parameters["min_nodes"],
        max_nodes=parameters["max_nodes"],
        idle_seconds_before_scaledown=parameters[
            "idle_seconds_before_scaledown"],
        tags=parameters.get("tags"),
        description=parameters.get("description"))

    # Deploy to VNET if provided (all three values are needed).
    vnet_resource_group_name = parameters.get("vnet_resource_group_name")
    vnet_name = parameters.get("vnet_name")
    subnet_name = parameters.get("subnet_name")
    if vnet_resource_group_name and vnet_name and subnet_name:
        compute_config.vnet_resourcegroup_name = vnet_resource_group_name
        compute_config.vnet_name = vnet_name
        compute_config.subnet_name = subnet_name

    # Set Credentials if provided; a password takes precedence over an SSH key.
    admin_username = parameters.get("admin_username")
    admin_user_password = parameters.get("admin_user_password")
    admin_user_ssh_key = parameters.get("admin_user_ssh_key")
    if admin_username and admin_user_password:
        compute_config.admin_username = admin_username
        compute_config.admin_user_password = admin_user_password
    elif admin_username and admin_user_ssh_key:
        compute_config.admin_username = admin_username
        compute_config.admin_user_ssh_key = admin_user_ssh_key

    return compute_config


def main():
    """Create or update the AML compute cluster described by the parameters file.

    Inputs are read from the environment: ``INPUT_PARAMETERSFILE`` (file name
    inside ``.aml``, default ``workspace.json``) and ``INPUT_AZURECREDENTIALS``
    (JSON with service principal credentials, default ``{}``).

    Input, parameter-file and workspace-loading problems are reported with an
    ``::error::`` line and the function returns ``None`` without raising.

    Raises:
        Exception: if the cluster ends up in the ``"Failed"`` provisioning
            state (the cluster is deleted first).
    """
    # Loading input values
    print("::debug::Loading input values")
    parameters_file = os.environ.get("INPUT_PARAMETERSFILE",
                                     default="workspace.json")
    try:
        azure_credentials = json.loads(
            os.environ.get("INPUT_AZURECREDENTIALS", default="{}"))
    except json.JSONDecodeError as exception:
        print(
            f"::error::Value of AZURE_CREDENTIALS is not valid JSON: {exception}"
        )
        return

    # Loading parameters file
    print("::debug::Loading parameters file")
    parameters_file_path = os.path.join(".aml", parameters_file)
    try:
        with open(parameters_file_path) as f:
            parameters = json.load(f)
    except FileNotFoundError:
        print(
            f"::error::Could not find parameter file in {parameters_file_path}. Please provide a parameter file in your repository (e.g. .aml/workspace.json)."
        )
        return
    except json.JSONDecodeError as exception:
        print(
            f"::error::Parameter file {parameters_file_path} is not valid JSON: {exception}"
        )
        return

    # Loading Workspace
    sp_auth = ServicePrincipalAuthentication(
        tenant_id=azure_credentials.get("tenantId", ""),
        service_principal_id=azure_credentials.get("clientId", ""),
        service_principal_password=azure_credentials.get("clientSecret", ""))
    try:
        print("::debug::Loading existing Workspace")
        ws = Workspace.get(
            name=parameters.get("name", None),
            subscription_id=azure_credentials.get("subscriptionId", ""),
            resource_group=parameters.get("resourceGroup", None),
            auth=sp_auth)
        print("::debug::Successfully loaded existing Workspace")
    except AuthenticationException as exception:
        print(
            f"::error::Could not retrieve user token. Please paste output of `az ad sp create-for-rbac --name <your-sp-name> --role contributor --scopes /subscriptions/<your-subscriptionId>/resourceGroups/<your-rg> --sdk-auth` as value of secret variable: AZURE_CREDENTIALS: {exception}"
        )
        return
    except AuthenticationError as exception:
        print(f"::error::Microsoft REST Authentication Error: {exception}")
        return
    except AdalError as exception:
        print(
            f"::error::Active Directory Authentication Library Error: {exception}"
        )
        return
    except ProjectSystemException as exception:
        print(f"::error::Workspace authorizationfailed: {exception}")
        return

    try:
        # Loading AMLCompute
        print("::debug::Loading existing AML Compute")
        cluster = AmlCompute(workspace=ws, name=parameters["name"])

        # Check settings and redeploy if required settings have changed
        print("::debug::Found existing cluster")
        size_changed = (
            cluster.vm_size.lower() != parameters["vm_size"].lower())
        if size_changed or (cluster.vm_priority.lower() !=
                            parameters["vm_priority"].lower()):
            cluster.delete()
            cluster.wait_for_completion(show_output=True)
            # Raising here hands control to the except branch below, which
            # provisions the replacement cluster.
            raise ComputeTargetException(
                "Cluster is of incorrect size or has incorrect priority. Deleting cluster and provisioning a new one."
            )

        # Update AMLCompute
        print("::debug::Updating settings of Cluster")
        cluster.update(min_nodes=parameters["min_nodes"],
                       max_nodes=parameters["max_nodes"],
                       idle_seconds_before_scaledown=parameters[
                           "idle_seconds_before_scaledown"])

        # Wait until the operation has completed
        cluster.wait_for_completion(show_output=True)

        print("::debug::Successfully updated Cluster definition")
    except ComputeTargetException:
        print("::debug::Loading failed")
        print("::debug::Creating new AML Compute resource")
        compute_config = _build_compute_config(parameters)

        # Create Compute Target
        cluster = ComputeTarget.create(
            workspace=ws,
            name=parameters["name"],
            provisioning_configuration=compute_config)

        # Wait until the cluster is attached
        cluster.wait_for_completion(show_output=True)

    # Checking status of AMLCompute Cluster
    print("::debug::Checking status of AMLCompute Cluster")
    if cluster.provisioning_state == "Failed":
        cluster.delete()
        raise Exception(
            "::debug::Deployment of AMLCompute Cluster failed with the following status: {} and logs: \n{}"
            .format(cluster.provisioning_state, cluster.provisioning_errors))

    # Never echo the admin password / SSH key into the action log.
    print(_redact_secrets(parameters))
    print(
        "::debug::Successfully finished Azure Machine Learning Compute Action")