Code example #1
Score: 0
File: custom.py  Project: t-0-m-1-3/azure-cli
def create_job(client,
               resource_group,
               job_name,
               json_file,
               location=None,
               cluster_name=None,
               cluster_resource_group=None,
               raw=False):
    """Create a Batch AI job defined by a json configuration file.

    :param client: Batch AI management client (provides ``jobs`` and
        ``clusters`` operations).
    :param str resource_group: resource group to create the job in.
    :param str job_name: name of the job.
    :param str json_file: path to a json file with JobCreateParameters.
    :param str or None location: job location; overrides the file's value.
    :param str or None cluster_name: name of the cluster to run the job on.
    :param str or None cluster_resource_group: resource group of the cluster;
        defaults to the job's resource group.
    :param bool raw: whether to return the raw client response.
    :return: result of the jobs.create call.
    :raise CLIError: if no location or no cluster information is available.
    """
    # The file is only needed to read the configuration; close it before the
    # (potentially slow) service calls.
    with open(json_file) as f:
        json_obj = json.load(f)
    params = _get_deserializer()('JobCreateParameters', json_obj)
    if location:
        params.location = location
    if not params.location:
        raise CLIError('Please provide location for job creation.')
    # If cluster name is specified, find the cluster and use its resource id for the new job.
    if cluster_name is not None:
        if cluster_resource_group is None:  # The job must be created in the cluster's resource group.
            cluster_resource_group = resource_group
        cluster = client.clusters.get(cluster_resource_group, cluster_name)
        # Bug fix: ResourceId is an msrest model with keyword-only arguments;
        # the positional form raises TypeError (other call sites use id=...).
        params.cluster = models.ResourceId(id=cluster.id)
    if params.cluster is None:
        raise CLIError(
            'Please provide cluster information via command line or configuration file.'
        )
    return client.jobs.create(resource_group, job_name, params, raw=raw)
Code example #2
Score: 0
File: custom.py  Project: zackliu/azure-cli
def create_cluster(cmd, client,  # pylint: disable=too-many-locals
                   resource_group, cluster_name, json_file=None, location=None, user_name=None,
                   ssh_key=None, password=None, generate_ssh_keys=None, image=None, custom_image=None,
                   use_auto_storage=False, vm_size=None, vm_priority='dedicated', target=None, min_nodes=None,
                   max_nodes=None, subnet=None, nfs_name=None, nfs_resource_group=None, nfs_mount_path='nfs',
                   azure_file_share=None, afs_mount_path='afs', container_name=None, container_mount_path='bfs',
                   account_name=None, account_key=None, setup_task=None, setup_task_output=None):
    """Create a Batch AI cluster.

    Configuration may come from a json file (``json_file``) and/or the
    individual arguments, which are applied on top of the file's values.
    Optionally mounts an NFS file server, an Azure file share, an Azure blob
    container and/or auto-storage file systems on the cluster nodes, and
    attaches a node setup task.

    :raise CLIError: via validation helpers, e.g. if the cluster already
        exists or the subnet configuration is inconsistent.
    """
    if generate_ssh_keys:
        _generate_ssh_keys()
        # Fall back to the freshly generated key when none was supplied.
        if ssh_key is None:
            ssh_key = _get_default_ssh_public_key_location()
    _ensure_resource_not_exist(client.clusters, resource_group, cluster_name)
    _verify_subnet(client, subnet, nfs_name, nfs_resource_group or resource_group)
    if json_file:
        with open(json_file) as f:
            json_obj = json.load(f)
            params = _get_deserializer()('ClusterCreateParameters', json_obj)
    else:
        # noinspection PyTypeChecker
        params = models.ClusterCreateParameters()
    if params.node_setup:
        # Fill in storage credentials for mount volumes declared in the config file.
        params.node_setup.mount_volumes = _patch_mount_volumes(
            cmd.cli_ctx, params.node_setup.mount_volumes, account_name, account_key)
    params = _update_user_account_settings(params, user_name, ssh_key, password)
    params.location = location or _get_resource_group_location(cmd.cli_ctx, resource_group)
    params = _update_nodes_information(params, image, custom_image, vm_size, vm_priority, target, min_nodes, max_nodes)
    # File systems requested on the command line are merged into the mount volumes.
    if nfs_name or azure_file_share or container_name:
        params.node_setup = params.node_setup or models.NodeSetup()
    mount_volumes = params.node_setup.mount_volumes if params.node_setup else None
    if nfs_name:
        file_server = client.file_servers.get(nfs_resource_group or resource_group, nfs_name)
        mount_volumes = _add_nfs_to_mount_volumes(mount_volumes, file_server.id, nfs_mount_path)
    if azure_file_share:
        mount_volumes = _add_azure_file_share_to_mount_volumes(cmd.cli_ctx, mount_volumes, azure_file_share,
                                                               afs_mount_path, account_name, account_key)
    if container_name:
        mount_volumes = _add_azure_container_to_mount_volumes(cmd.cli_ctx, mount_volumes, container_name,
                                                              container_mount_path, account_name, account_key)
    if use_auto_storage:
        # Auto-storage: mount a well-known share and container from an
        # automatically configured storage account.
        auto_storage_account, auto_storage_key = _configure_auto_storage(cmd.cli_ctx, params.location)
        mount_volumes = _add_azure_file_share_to_mount_volumes(
            cmd.cli_ctx, mount_volumes, AUTO_STORAGE_SHARE_NAME, AUTO_STORAGE_SHARE_PATH,
            auto_storage_account, auto_storage_key)
        mount_volumes = _add_azure_container_to_mount_volumes(
            cmd.cli_ctx, mount_volumes, AUTO_STORAGE_CONTAINER_NAME, AUTO_STORAGE_CONTAINER_PATH,
            auto_storage_account, auto_storage_key)
    if mount_volumes:
        if params.node_setup is None:
            params.node_setup = models.NodeSetup()
        params.node_setup.mount_volumes = mount_volumes
    if subnet:
        params.subnet = models.ResourceId(id=subnet)
    if setup_task:
        params = _add_setup_task(setup_task, setup_task_output, params)
    return client.clusters.create(resource_group, cluster_name, params)
Code example #3
Score: 0
 def test_experiments_isolation(self, resource_group, location):
     """Tests that deleting an experiment does not affect clusters, other
     experiments, or jobs in other experiments in either workspace.
     """
     self.client.workspaces.create(resource_group.name, 'first',
                                   location).result()
     self.client.workspaces.create(resource_group.name, 'second',
                                   location).result()
     # Create a cluster, two experiments and a job in each experiment
     for workspace in ['first', 'second']:
         cluster = self.client.clusters.create(
             resource_group.name,
             workspace,
             'cluster',
             parameters=models.ClusterCreateParameters(
                 vm_size='STANDARD_D1',
                 scale_settings=models.ScaleSettings(
                     manual=models.ManualScaleSettings(
                         target_node_count=0)),
                 user_account_settings=models.UserAccountSettings(
                     admin_user_name=helpers.ADMIN_USER_NAME,
                     admin_user_password=helpers.ADMIN_USER_PASSWORD),
                 vm_priority='lowpriority')).result()
         for experiment in ['exp1', 'exp2']:
             self.client.experiments.create(resource_group.name, workspace,
                                            experiment).result()
             self.client.jobs.create(
                 resource_group.name,
                 workspace,
                 experiment,
                 'job',
                 parameters=models.JobCreateParameters(
                     cluster=models.ResourceId(id=cluster.id),
                     node_count=1,
                     std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT',
                     custom_toolkit_settings=models.CustomToolkitSettings(
                         command_line='true'))).result()
     # Delete exp1 in the first workspace
     self.client.experiments.delete(resource_group.name, 'first',
                                    'exp1').result()
     # Ensure the experiment was actually deleted
     self.assertRaises(
         CloudError, lambda: self.client.experiments.get(
             resource_group.name, 'first', 'exp1'))
     for workspace in ['first', 'second']:
         # Ensure the clusters are not affected
         self.client.clusters.get(resource_group.name, workspace, 'cluster')
         # Ensure the other experiments are not affected
         for experiment in ['exp1', 'exp2']:
             if workspace == 'first' and experiment == 'exp1':
                 continue
             self.client.experiments.get(resource_group.name, workspace,
                                         experiment)
             job = self.client.jobs.get(resource_group.name, workspace,
                                        experiment, 'job')
             # And check the job are not terminated
             self.assertEqual(job.execution_state,
                              models.ExecutionState.queued)
Code example #4
Score: 0
    def create_custom_job(client,
                          resource_group,
                          cluster_id,
                          job_name,
                          nodes,
                          cmd,
                          job_preparation_cmd=None,
                          container=None):
        """Submit a custom toolkit job into the default workspace/experiment.

        :param BatchAIManagementClient client: client instance.
        :param str resource_group: resource group name.
        :param str cluster_id: resource Id of the cluster.
        :param str job_name: job name.
        :param int nodes: number of nodes to execute the job.
        :param str cmd: command line to run.
        :param str or None job_preparation_cmd: job preparation command line.
        :param models.ContainerSettings or None container: container settings
            to run the job.
        :return models.Job: the created job.
        """
        prep = (models.JobPreparation(command_line=job_preparation_cmd)
                if job_preparation_cmd else None)
        # The job is submitted into the default workspace/experiment pair;
        # make sure the experiment exists first.
        client.experiments.create(resource_group,
                                  Helpers.DEFAULT_WORKSPACE_NAME,
                                  Helpers.DEFAULT_EXPERIMENT_NAME).result()
        mount_root = '$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
            Helpers.AZURE_FILES_MOUNTING_PATH)
        job_settings = models.JobCreateParameters(
            cluster=models.ResourceId(id=cluster_id),
            node_count=nodes,
            std_out_err_path_prefix=mount_root,
            output_directories=[
                models.OutputDirectory(
                    id=Helpers.JOB_OUTPUT_DIRECTORY_ID,
                    path_prefix=Helpers.JOB_OUTPUT_DIRECTORY_PATH,
                    path_suffix="files")
            ],
            input_directories=[
                models.InputDirectory(id='INPUT',
                                      path=mount_root + '/input')
            ],
            container_settings=container,
            job_preparation=prep,
            custom_toolkit_settings=models.CustomToolkitSettings(
                command_line=cmd))
        return client.jobs.create(resource_group,
                                  Helpers.DEFAULT_WORKSPACE_NAME,
                                  Helpers.DEFAULT_EXPERIMENT_NAME,
                                  job_name,
                                  parameters=job_settings).result()
Code example #5
Score: 0
 def test_job_environment_variables_and_secrets(self, resource_group,
                                                location, cluster):
     """Tests that environment variables and secrets are passed to the job
     and its preparation task, and that secret values are reported back by
     the server with their values redacted.
     """
     job_name = 'job'
     job = self.client.jobs.create(
         resource_group.name,
         helpers.DEFAULT_WORKSPACE_NAME,
         helpers.DEFAULT_EXPERIMENT_NAME,
         job_name,
         parameters=models.JobCreateParameters(
             cluster=models.ResourceId(id=cluster.id),
             node_count=1,
             std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                 helpers.AZURE_FILES_MOUNTING_PATH),
             environment_variables=[
                 models.EnvironmentVariable(name='VARIABLE', value='VALUE')
             ],
             secrets=[
                 models.EnvironmentVariableWithSecretValue(
                     name='SECRET_VARIABLE', value='SECRET')
             ],
             # Check that the job preparation has access to env variables and secrets.
             job_preparation=models.JobPreparation(
                 command_line='echo $VARIABLE $SECRET_VARIABLE'),
             # Check that the job has access to env variables and secrets.
             custom_toolkit_settings=models.CustomToolkitSettings(
                 command_line='echo $VARIABLE $SECRET_VARIABLE'))).result(
                 )  # type: models.Job
     self.assertEqual(
         helpers.wait_for_job_completion(self.is_live, self.client,
                                         resource_group.name, job.name,
                                         helpers.MINUTE),
         models.ExecutionState.succeeded)
     # Check that environment variables are reported by the server.
     self.assertEqual(len(job.environment_variables), 1)
     self.assertEqual(job.environment_variables[0].name, 'VARIABLE')
     self.assertEqual(job.environment_variables[0].value, 'VALUE')
     # Check that secrets are reported back by server, but value is not reported.
     self.assertEqual(len(job.secrets), 1)
     self.assertEqual(job.secrets[0].name, 'SECRET_VARIABLE')
     self.assertIsNone(job.secrets[0].value)
     # Check that job and job prep had access to the env variables and secrets.
     helpers.assert_job_files_are(
         self, self.client, resource_group.name, job.name,
         helpers.STANDARD_OUTPUT_DIRECTORY_ID, {
             u'stdout.txt': u'VALUE SECRET\n',
             u'stderr.txt': u'',
             u'stdout-job_prep.txt': u'VALUE SECRET\n',
             u'stderr-job_prep.txt': u''
         })
Code example #6
Score: 0
def create_job(config,
               cluster_id,
               workspace,
               experiment,
               job_name,
               image_name,
               command,
               number_of_vms=1):
    ''' Creates a containerized custom toolkit job.

    :param config: configuration object providing fileshare_mount_point,
        location, group_name and whatever client_from() needs.
    :param str cluster_id: resource id of the cluster to run the job on.
    :param str workspace: workspace name.
    :param str experiment: experiment name.
    :param str job_name: name of the job; also used as the name of the
        script input directory on the file share.
    :param str image_name: docker image to run the job in.
    :param str command: command line to execute.
    :param int number_of_vms: number of nodes to run the job on.
    :return: the result of jobs.create, so callers can wait for or inspect
        the created job.
    '''
    # Scripts and the dataset are read from the mounted file share.
    input_directories = [
        models.InputDirectory(id='SCRIPT',
                              path='$AZ_BATCHAI_MOUNT_ROOT/{0}/{1}'.format(
                                  config.fileshare_mount_point, job_name)),
        models.InputDirectory(id='DATASET',
                              path='$AZ_BATCHAI_MOUNT_ROOT/{0}/{1}'.format(
                                  config.fileshare_mount_point, 'data'))
    ]

    std_output_path_prefix = "$AZ_BATCHAI_MOUNT_ROOT/{0}".format(
        config.fileshare_mount_point)

    # Models and notebooks produced by the job are written back to the share.
    output_directories = [
        models.OutputDirectory(id='MODEL',
                               path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                                   config.fileshare_mount_point),
                               path_suffix="models"),
        models.OutputDirectory(id='NOTEBOOKS',
                               path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                                   config.fileshare_mount_point),
                               path_suffix="notebooks")
    ]

    parameters = models.JobCreateParameters(
        location=config.location,
        cluster=models.ResourceId(id=cluster_id),
        node_count=number_of_vms,
        input_directories=input_directories,
        std_out_err_path_prefix=std_output_path_prefix,
        output_directories=output_directories,
        container_settings=models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(
                image=image_name)),
        custom_toolkit_settings=models.CustomToolkitSettings(
            command_line=command))

    client = client_from(config)
    # Bug fix: the original discarded the create() result ('_ = ...'), so
    # callers could not wait for or inspect the created job. Returning it is
    # backward compatible with callers that ignore the return value.
    return client.jobs.create(config.group_name, workspace, experiment,
                              job_name, parameters)
Code example #7
Score: 0
 def create_resource(self, name, **kwargs):
     """Provision a Batch AI cluster for a live test run, or a fake cluster
     object when replaying a recorded session.
     """
     if not self.is_live:
         # Playback mode: recorded sessions need no real resources.
         self.resource = models.Cluster()
         self.resource.id = models.ResourceId(id='fake')
         return {self.parameter_name: self.resource}
     self.client = create_batchai_client(self)
     group = self._get_resource_group(**kwargs)
     self.resource = create_cluster(
         self.client, self.location, group.name, name, self.vm_size,
         self.target_nodes,
         self._get_storage_account(**kwargs).name,
         self._get_storage_account_key(**kwargs))
     if self.wait:
         # Block until the requested number of nodes is up.
         wait_for_nodes(self.is_live, self.client, group.name, name,
                        self.target_nodes, NODE_STARTUP_TIMEOUT_SEC)
     return {self.parameter_name: self.resource}
Code example #8
Score: 0
File: custom.py  Project: zackliu/azure-cli
def create_file_server(cmd, client, resource_group, file_server_name, json_file=None, vm_size=None, location=None,
                       user_name=None, ssh_key=None, password=None, generate_ssh_keys=None,
                       disk_count=None, disk_size=None, caching_type=None, storage_sku=None, subnet=None,
                       raw=False):
    """Create a Batch AI file server.

    Configuration can be loaded from a json file and then selectively
    overridden by the explicit arguments; missing mandatory values raise a
    CLIError.
    """
    if generate_ssh_keys:
        _generate_ssh_keys()
        if ssh_key is None:
            # Fall back to the key pair we just generated.
            ssh_key = _get_default_ssh_public_key_location()
    _ensure_resource_not_exist(client, resource_group, file_server_name)
    if json_file:
        with open(json_file) as config_file:
            params = _get_deserializer()('FileServerCreateParameters',
                                         json.load(config_file))
    else:
        # noinspection PyTypeChecker
        params = models.FileServerCreateParameters()
    params = _update_user_account_settings(params, user_name, ssh_key, password)
    params.location = location or _get_resource_group_location(cmd.cli_ctx, resource_group)
    if not params.data_disks:
        # noinspection PyTypeChecker
        params.data_disks = models.DataDisks()
    disks = params.data_disks
    if disk_size:
        disks.disk_size_in_gb = disk_size
    if not disks.disk_size_in_gb:
        raise CLIError('Please provide disk size in Gb.')
    if disk_count:
        disks.disk_count = disk_count
    if not disks.disk_count:
        raise CLIError('Please provide number of data disks (at least one disk is required).')
    if caching_type:
        disks.caching_type = caching_type
    if storage_sku:
        disks.storage_account_type = storage_sku
    if not disks.storage_account_type:
        raise CLIError('Please provide storage account type (storage sku).')
    if vm_size:
        params.vm_size = vm_size
    if not params.vm_size:
        raise CLIError('Please provide VM size.')
    if subnet:
        if not is_valid_resource_id(subnet):
            raise CLIError('Ill-formed subnet resource id')
        params.subnet = models.ResourceId(id=subnet)

    return client.create(resource_group, file_server_name, params, raw=raw)
Code example #9
Score: 0
File: custom.py  Project: zackliu/azure-cli
def _add_nfs_to_mount_volumes(volumes, file_server_id, mount_path):
    """Return a copy of the mount volumes with an NFS file server added.

    :param models.MountVolumes or None volumes: existing mount volumes.
    :param str file_server_id: resource id of the file server.
    :param str mount_path: relative mount path for the file server.
    :return models.MountVolumes: updated copy of the mount volumes.
    :raise CLIError: if the relative mount path is empty.
    """
    if not mount_path:
        raise CLIError('File server relative mount path cannot be empty.')
    # Work on a deep copy so the caller's volumes stay untouched.
    updated = copy.deepcopy(volumes) if volumes else models.MountVolumes()
    reference = models.FileServerReference(
        relative_mount_path=mount_path,
        file_server=models.ResourceId(id=file_server_id),
        mount_options="rw")
    if updated.file_servers is None:
        updated.file_servers = []
    updated.file_servers.append(reference)
    return updated
Code example #10
Score: 0
File: custom.py  Project: t-0-m-1-3/azure-cli
def add_nfs_to_cluster_create_parameters(params, file_server_id, mount_path):
    """Adds NFS to the cluster create parameters (modifies params in place).

    :param model.ClusterCreateParameters params: cluster create parameters.
    :param str file_server_id: resource id of the file server.
    :param str mount_path: relative mount path for the file server.
    :raise CLIError: if the relative mount path is empty.
    """
    if not mount_path:
        raise CLIError('File server relative mount path cannot be empty.')
    # Lazily create the nested node_setup/mount_volumes/file_servers chain.
    if params.node_setup is None:
        params.node_setup = models.NodeSetup()
    if params.node_setup.mount_volumes is None:
        params.node_setup.mount_volumes = models.MountVolumes()
    if params.node_setup.mount_volumes.file_servers is None:
        params.node_setup.mount_volumes.file_servers = []
    params.node_setup.mount_volumes.file_servers.append(
        models.FileServerReference(
            relative_mount_path=mount_path,
            # Bug fix: ResourceId is an msrest model with keyword-only
            # arguments; the positional form raises TypeError (the sibling
            # helper _add_nfs_to_mount_volumes already uses id=...).
            file_server=models.ResourceId(id=file_server_id),
            mount_options="rw"))
Code example #11
Score: 0
 def create_resource(self, name, **kwargs):
     """Provision a Batch AI cluster and the default experiment for a live
     test run, or a fake cluster object when replaying a recorded session.
     """
     if not self.is_live:
         # Playback mode: recorded sessions need no real resources.
         self.resource = models.Cluster()
         self.resource.id = models.ResourceId(id='fake')
         return {self.parameter_name: self.resource}
     self.client = Helpers.create_batchai_client(self)
     group = self._get_resource_group(**kwargs)
     self.resource = Helpers.create_cluster(
         self.client, self.location, group.name, name, self.vm_size,
         self.target_nodes,
         self._get_storage_account(**kwargs).name,
         self._get_storage_account_key(**kwargs))
     self.client.experiments.create(
         group.name, Helpers.DEFAULT_WORKSPACE_NAME,
         Helpers.DEFAULT_EXPERIMENT_NAME).result()
     if self.wait:
         # Block until the requested number of nodes is up.
         Helpers.wait_for_nodes(self.is_live, self.client, group.name,
                                name, self.target_nodes,
                                Helpers.NODE_STARTUP_TIMEOUT_SEC)
     return {self.parameter_name: self.resource}
Code example #12
Score: 0
File: custom.py  Project: zackliu/azure-cli
def create_job(cmd, client, resource_group, job_name, json_file, location=None, cluster_name=None,
               cluster_resource_group=None, nfs_name=None, nfs_resource_group=None, nfs_mount_path='nfs',
               azure_file_share=None, afs_mount_path='afs', container_name=None, container_mount_path='bfs',
               account_name=None, account_key=None):
    """Create a Batch AI job described by a json configuration file.

    Command line arguments augment the file configuration: the cluster may be
    given by name, and an NFS file server, an Azure file share and an Azure
    blob container can be added to the job's mount volumes.

    :raise CLIError: if the job already exists or no cluster information is
        available from either source.
    """
    _ensure_resource_not_exist(client.jobs, resource_group, job_name)
    with open(json_file) as f:
        json_obj = json.load(f)
        params = _get_deserializer()('JobCreateParameters', json_obj)  # type: models.JobCreateParameters
        params.location = location or _get_resource_group_location(cmd.cli_ctx, resource_group)
        # If cluster name is specified, find the cluster and use its resource id for the new job.
        if cluster_name is not None:
            if cluster_resource_group is None:  # The job must be created in the cluster's resource group.
                cluster_resource_group = resource_group
            cluster = client.clusters.get(cluster_resource_group, cluster_name)
            params.cluster = models.ResourceId(id=cluster.id)
        if params.cluster is None:
            raise CLIError('Please provide cluster information via command line or configuration file.')
        if params.mount_volumes:
            # Fill in storage credentials for volumes declared in the config file.
            params.mount_volumes = _patch_mount_volumes(
                cmd.cli_ctx, params.mount_volumes, account_name, account_key)
        # Add file systems specified via command line into mount volumes
        if nfs_name or azure_file_share or container_name:
            params.mount_volumes = params.mount_volumes or models.MountVolumes()
        mount_volumes = params.mount_volumes
        if nfs_name:
            file_server = client.file_servers.get(nfs_resource_group or resource_group, nfs_name)
            mount_volumes = _add_nfs_to_mount_volumes(mount_volumes, file_server.id, nfs_mount_path)
        if azure_file_share:
            mount_volumes = _add_azure_file_share_to_mount_volumes(cmd.cli_ctx, mount_volumes, azure_file_share,
                                                                   afs_mount_path, account_name, account_key)
        if container_name:
            mount_volumes = _add_azure_container_to_mount_volumes(cmd.cli_ctx, mount_volumes, container_name,
                                                                  container_mount_path, account_name, account_key)
        if mount_volumes:
            params.mount_volumes = mount_volumes
        return client.jobs.create(resource_group, job_name, params)
Code example #13
Score: 0
def submit_job(config, pretrained_model_type, retraining_type,
               output_model_name, num_epochs):
    ''' Defines and submits a retraining job. Does not check for completion.

    :param config: BAIT configuration (resource group, cluster name, region,
        VMs per job).
    :param str pretrained_model_type: 'alexnet' or 'resnet18'.
    :param str retraining_type: retraining mode, forwarded to the script.
    :param str output_model_name: unused here; kept for caller compatibility.
    :param num_epochs: number of training epochs, forwarded to the script.
    :return str: the generated, timestamp-based job name.
    '''
    client = get_client(config)
    # Job names must be unique; derive one from the current UTC time.
    job_name = 'job{}'.format(
        datetime.datetime.utcnow().strftime('%m_%d_%H_%M_%S'))
    cluster = client.clusters.get(config.bait_resource_group_name,
                                  config.bait_cluster_name)

    # Define the command line arguments to the retraining script
    command_line_args = (
        '--input_dir $AZ_BATCHAI_INPUT_TRAININGDATA '
        '--validation_dir $AZ_BATCHAI_INPUT_VALIDATIONDATA '
        '--output_dir $AZ_BATCHAI_OUTPUT_MODEL '
        + '--num_epochs {} '.format(num_epochs)
        + '--retraining_type {} '.format(retraining_type)
        + '--model_type {} '.format(pretrained_model_type)
        + '--model_filename $AZ_BATCHAI_INPUT_PRETRAINEDMODELS/')
    if pretrained_model_type == 'alexnet':
        command_line_args += 'AlexNet.model'
    elif pretrained_model_type == 'resnet18':
        command_line_args += 'ResNet_18.model'
    # NOTE(review): any other model type leaves the filename incomplete --
    # presumably only the two types above are supported; confirm with callers.

    # Define the job
    cntk_settings = tm.CNTKsettings(
        language_type='python',
        python_script_file_path='$AZ_BATCHAI_INPUT_SCRIPT/' +
        'retrain_model_distributed.py',
        command_line_args=command_line_args,
        process_count=config.bait_vms_per_job)  # NC6s -- one GPU per VM

    job_create_params = tm.job_create_parameters.JobCreateParameters(
        location=config.bait_region,
        # Bug fix: ResourceId is an msrest model with keyword-only arguments;
        # the positional form raises TypeError.
        cluster=tm.ResourceId(id=cluster.id),
        node_count=config.bait_vms_per_job,
        std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/afs',
        output_directories=[
            tm.OutputDirectory(id='MODEL',
                               path_prefix='$AZ_BATCHAI_MOUNT_ROOT/afs')
        ],
        input_directories=[
            tm.InputDirectory(id='SCRIPT',
                              path='$AZ_BATCHAI_MOUNT_ROOT/afs/scripts'),
            tm.InputDirectory(
                id='PRETRAINEDMODELS',
                path='$AZ_BATCHAI_MOUNT_ROOT/afs/pretrainedmodels'),
            tm.InputDirectory(
                id='TRAININGDATA',
                path='$AZ_BATCHAI_MOUNT_ROOT/nfs/training_images'),
            tm.InputDirectory(
                id='VALIDATIONDATA',
                path='$AZ_BATCHAI_MOUNT_ROOT/nfs/validation_images')
        ],
        cntk_settings=cntk_settings)

    # Submit the job
    client.jobs.create(
        resource_group_name=config.bait_resource_group_name,
        job_name=job_name,
        parameters=job_create_params)

    return job_name
Code example #14
Score: 0
# Read the requested time range from the command line and the job description
# from the previously loaded configuration object `j`.
ts_from = sys.argv[1]
ts_to = sys.argv[2]
device_ids = j['device_ids']
tags = j['tags']
job_name_template = j['job_name']

credentials = ServicePrincipalCredentials(client_id=CLIENT,
                                          secret=KEY,
                                          tenant=TENANT_ID)

batchai_client = batchai.BatchAIManagementClient(
    credentials=credentials, subscription_id=subscription_id)
cluster = batchai_client.clusters.get(resource_group_name, cluster_name)

# run an async job for each sensor
for device_id in device_ids:
    for tag in tags:
        job_name = job_name_template.format(device_id, tag)
        custom_settings = baimodels.CustomToolkitSettings(
            command_line=command_line.format(device_id, tag, ts_from, ts_to,
                                             config_file_path))
        print('command line: ' + custom_settings.command_line)
        params = baimodels.job_create_parameters.JobCreateParameters(
            location=location,
            # Bug fix: ResourceId is an msrest model with keyword-only
            # arguments; the positional form raises TypeError.
            cluster=baimodels.ResourceId(id=cluster.id),
            node_count=node_count,
            std_out_err_path_prefix=std_out_err_path_prefix,
            custom_toolkit_settings=custom_settings)

        batchai_client.jobs.create(resource_group_name, job_name, params)
Code example #15
Score: 0
# define grid of tuned hyperparameters
# Each DiscreteParameter enumerates the candidate values for one
# hyperparameter; ParameterSweep generates the combinations to try.
param_specs = [
    DiscreteParameter(parameter_name="LATENT_DIM", values=[5, 10, 15]),
    DiscreteParameter(parameter_name="HIDDEN_LAYERS", values=[1, 2, 3]),
    DiscreteParameter(parameter_name="BATCH_SIZE", values=[8, 16, 32]),
    DiscreteParameter(parameter_name="T", values=[72, 168, 336]),
    DiscreteParameter(parameter_name="LEARNING_RATE",
                      values=[0.01, 0.001, 0.0001]),
    DiscreteParameter(parameter_name="ALPHA", values=[0.1, 0.001, 0])
]

parameters = ParameterSweep(param_specs)

# create a template for Batch AI job
jcp = models.JobCreateParameters(
    cluster=models.ResourceId(id=cluster.id),
    node_count=1,
    std_out_err_path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/logs',
    output_directories=[
        models.OutputDirectory(id='ALL',
                               path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/output')
    ],
    custom_toolkit_settings=models.CustomToolkitSettings(
        command_line=
        'python $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/scripts/FF_multi_step_multivariate.py \
        --scriptdir $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/scripts \
        --datadir $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/data \
        --outdir $AZ_BATCHAI_OUTPUT_ALL \
        -l {0} -n {1} -b {2} -T {3} -r {4} -a {5}'.format(
            parameters['LATENT_DIM'], parameters['HIDDEN_LAYERS'],
            parameters['BATCH_SIZE'], parameters['T'],
Code example #16
Score: 0
    def test_job_level_mounting(self, resource_group, location, cluster,
                                storage_account, storage_account_key):
        """Tests if it's possible to mount external file systems for a job."""
        job_name = 'job'

        # Create file share and container to mount on the job level
        if storage_account.name != FAKE_STORAGE.name:
            files = FileService(storage_account.name, storage_account_key)
            files.create_share('jobshare', fail_on_exist=False)
            blobs = BlockBlobService(storage_account.name, storage_account_key)
            blobs.create_container('jobcontainer', fail_on_exist=False)

        job = self.client.jobs.create(
            resource_group.name,
            job_name,
            parameters=models.JobCreateParameters(
                location=location,
                cluster=models.ResourceId(id=cluster.id),
                node_count=1,
                mount_volumes=models.
                MountVolumes(azure_file_shares=[
                    models.AzureFileShareReference(
                        account_name=storage_account.name,
                        azure_file_url='https://{0}.file.core.windows.net/{1}'.
                        format(storage_account.name, 'jobshare'),
                        relative_mount_path='job_afs',
                        credentials=models.AzureStorageCredentialsInfo(
                            account_key=storage_account_key),
                    )
                ],
                             azure_blob_file_systems=[
                                 models.AzureBlobFileSystemReference(
                                     account_name=storage_account.name,
                                     container_name='jobcontainer',
                                     relative_mount_path='job_bfs',
                                     credentials=models.
                                     AzureStorageCredentialsInfo(
                                         account_key=storage_account_key),
                                 )
                             ]),
                # Put standard output on cluster level AFS to check that the job has access to it.
                std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                    AZURE_FILES_MOUNTING_PATH),
                # Create two output directories on job level AFS and blobfuse.
                output_directories=[
                    models.OutputDirectory(
                        id='OUTPUT1',
                        path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/job_afs'),
                    models.OutputDirectory(
                        id='OUTPUT2',
                        path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/job_bfs')
                ],
                # Check that the job preparation has access to job level file systems.
                job_preparation=models.JobPreparation(
                    command_line=
                    'echo afs > $AZ_BATCHAI_OUTPUT_OUTPUT1/prep_afs.txt; '
                    'echo bfs > $AZ_BATCHAI_OUTPUT_OUTPUT2/prep_bfs.txt; '
                    'echo done'),
                # Check that the job has access to job
                custom_toolkit_settings=models.CustomToolkitSettings(
                    command_line=
                    'echo afs > $AZ_BATCHAI_OUTPUT_OUTPUT1/job_afs.txt; '
                    'echo bfs > $AZ_BATCHAI_OUTPUT_OUTPUT2/job_bfs.txt; '
                    'mkdir $AZ_BATCHAI_OUTPUT_OUTPUT1/afs; '
                    'echo afs > $AZ_BATCHAI_OUTPUT_OUTPUT1/afs/job_afs.txt; '
                    'mkdir $AZ_BATCHAI_OUTPUT_OUTPUT2/bfs; '
                    'echo bfs > $AZ_BATCHAI_OUTPUT_OUTPUT2/bfs/job_bfs.txt; '
                    'echo done'))).result()
        self.assertEqual(
            wait_for_job_completion(self.is_live, self.client,
                                    resource_group.name, job.name, MINUTE),
            models.ExecutionState.succeeded)

        job = self.client.jobs.get(resource_group.name, job.name)
        # Assert job and job prep standard output is populated on cluster level filesystem
        assert_job_files_are(
            self, self.client, resource_group.name, job.name,
            STANDARD_OUTPUT_DIRECTORY_ID, {
                u'stdout.txt': u'done\n',
                u'stderr.txt': u'',
                u'stdout-job_prep.txt': u'done\n',
                u'stderr-job_prep.txt': u''
            })
        # Assert files are generated on job level AFS
        assert_job_files_are(self, self.client, resource_group.name, job.name,
                             'OUTPUT1', {
                                 u'job_afs.txt': u'afs\n',
                                 u'prep_afs.txt': u'afs\n',
                                 u'afs': None
                             })
        # Assert files are generated on job level blobfuse
        assert_job_files_are(self, self.client, resource_group.name, job.name,
                             'OUTPUT2', {
                                 u'job_bfs.txt': u'bfs\n',
                                 u'prep_bfs.txt': u'bfs\n',
                                 u'bfs': None
                             })
        # Assert subfolders are available via API
        assert_job_files_in_path_are(self, self.client, resource_group.name,
                                     job.name, 'OUTPUT1', 'afs',
                                     {u'job_afs.txt': u'afs\n'})
        assert_job_files_in_path_are(self, self.client, resource_group.name,
                                     job.name, 'OUTPUT2', 'bfs',
                                     {u'job_bfs.txt': u'bfs\n'})

        # Assert that we can access the output files created on job level mount volumes directly in storage using path
        # segment returned by the server.
        if storage_account.name != FAKE_STORAGE.name:
            files = FileService(storage_account.name, storage_account_key)
            self.assertTrue(
                files.exists(
                    'jobshare', job.job_output_directory_path_segment + '/' +
                    OUTPUT_DIRECTORIES_FOLDER_NAME, 'job_afs.txt'))
            blobs = BlockBlobService(storage_account.name, storage_account_key)
            self.assertTrue(
                blobs.exists(
                    'jobcontainer', job.job_output_directory_path_segment +
                    '/' + OUTPUT_DIRECTORIES_FOLDER_NAME + '/job_bfs.txt'))
        # After the job is done the filesystems should be unmounted automatically, check this by submitting a new job.
        checker = self.client.jobs.create(
            resource_group.name,
            'checker',
            parameters=models.JobCreateParameters(
                location=location,
                cluster=models.ResourceId(id=cluster.id),
                node_count=1,
                std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                    AZURE_FILES_MOUNTING_PATH),
                custom_toolkit_settings=models.CustomToolkitSettings(
                    command_line='echo job; df | grep -E "job_bfs|job_afs"'))
        ).result()
        # Check the job failed because there are not job level mount volumes anymore
        self.assertEqual(
            wait_for_job_completion(self.is_live, self.client,
                                    resource_group.name, checker.name, MINUTE),
            models.ExecutionState.failed)
        # Check that the cluster level AFS was still mounted
        assert_job_files_are(self, self.client, resource_group.name,
                             checker.name, STANDARD_OUTPUT_DIRECTORY_ID, {
                                 u'stdout.txt': u'job\n',
                                 u'stderr.txt': u''
                             })
Code example #17
0
    def test_file_server(self, resource_group, location, storage_account,
                         storage_account_key):
        """Tests file server functionality.

        1. Create file server
        2. Create two clusters with this file server
        3. Check that the file server is mounted:
            a. submit tasks (one from host and another from container) on the
               first cluster to write data to nfs
            b. submit a task on the second cluster to read the data from nfs
        """
        server = create_file_server(
            self.client, location, resource_group.name,
            self.file_server_name)  # type: models.FileServer

        def _nfs_reference():
            # Build a fresh reference per cluster so the two clusters do not
            # share one mutable model instance. Both mount the server under
            # $AZ_BATCHAI_MOUNT_ROOT/nfs with read-write access.
            return [
                models.FileServerReference(
                    file_server=models.ResourceId(id=server.id),
                    relative_mount_path='nfs',
                    mount_options="rw")
            ]

        # Two identical single-node clusters, both mounting the same NFS.
        cluster1 = create_cluster(
            self.client, location, resource_group.name, 'cluster1',
            'STANDARD_D1', 1, storage_account.name, storage_account_key,
            file_servers=_nfs_reference())
        cluster2 = create_cluster(
            self.client, location, resource_group.name, 'cluster2',
            'STANDARD_D1', 1, storage_account.name, storage_account_key,
            file_servers=_nfs_reference())

        # Verify the file server is reported.
        assert_existing_file_servers_are(self, self.client,
                                         resource_group.name,
                                         [self.file_server_name])

        # Verify the file server becomes available in a reasonable time.
        self.assertTrue(
            wait_for_file_server(self.is_live, self.client,
                                 resource_group.name, self.file_server_name,
                                 _FILE_SERVER_CREATION_TIMEOUT_SEC))

        # Verify the remote login information and private ip are reported.
        server = self.client.file_servers.get(
            resource_group.name,
            self.file_server_name)  # type: models.FileServer
        # assertRegexpMatches is a deprecated alias (removed in Python 3.12);
        # assertRegex is the supported spelling with identical behavior.
        # NOTE(review): RE_ID_ADDRESS looks like it is meant to be an
        # IP-address pattern — confirm the constant's name/definition.
        self.assertRegex(server.mount_settings.file_server_public_ip,
                         RE_ID_ADDRESS)
        self.assertRegex(server.mount_settings.file_server_internal_ip,
                         RE_ID_ADDRESS)

        # Verify both clusters allocated their single node successfully.
        for cluster_name in ('cluster1', 'cluster2'):
            self.assertEqual(
                wait_for_nodes(self.is_live, self.client, resource_group.name,
                               cluster_name, 1, NODE_STARTUP_TIMEOUT_SEC), 1)

        # Execute publishing tasks on the first cluster: one writes from the
        # host, the other from inside an ubuntu container.
        job1 = create_custom_job(
            self.client, resource_group.name, location, cluster1.id,
            'host_publisher', 1,
            'echo hi from host > $AZ_BATCHAI_MOUNT_ROOT/nfs/host.txt')
        self.assertEqual(
            wait_for_job_completion(self.is_live, self.client,
                                    resource_group.name, job1.name, MINUTE),
            models.ExecutionState.succeeded)
        job2 = create_custom_job(
            self.client,
            resource_group.name,
            location,
            cluster1.id,
            'container_publisher',
            1,
            'echo hi from container >> $AZ_BATCHAI_MOUNT_ROOT/nfs/container.txt',
            container=models.ContainerSettings(
                image_source_registry=models.ImageSourceRegistry(
                    image="ubuntu")))
        self.assertEqual(
            wait_for_job_completion(self.is_live, self.client,
                                    resource_group.name, job2.name, MINUTE),
            models.ExecutionState.succeeded)

        # Execute consumer task on the second cluster; it must see the data
        # written by the first cluster through the shared NFS mount.
        job3 = create_custom_job(
            self.client, resource_group.name, location, cluster2.id,
            'consumer', 1, 'cat $AZ_BATCHAI_MOUNT_ROOT/nfs/host.txt; '
            'cat $AZ_BATCHAI_MOUNT_ROOT/nfs/container.txt')
        self.assertEqual(
            wait_for_job_completion(self.is_live, self.client,
                                    resource_group.name, job3.name, MINUTE),
            models.ExecutionState.succeeded)

        # Verify the data published by cluster1 was readable on cluster2.
        assert_job_files_are(
            self, self.client, resource_group.name, job3.name,
            STANDARD_OUTPUT_DIRECTORY_ID, {
                u'stdout.txt': u'hi from host\nhi from container\n',
                u'stderr.txt': ''
            })

        # Delete clusters.
        self.client.clusters.delete(resource_group.name, 'cluster1').result()
        self.client.clusters.delete(resource_group.name, 'cluster2').result()

        # Test deletion.
        self.client.file_servers.delete(resource_group.name,
                                        self.file_server_name).result()
        assert_existing_file_servers_are(self, self.client,
                                         resource_group.name, [])