Example #1
def create_job(config,
               cluster_id,
               workspace,
               experiment,
               job_name,
               image_name,
               command,
               number_of_vms=1):
    ''' Creates a Batch AI job.
    '''
    input_directories = [
        models.InputDirectory(id='SCRIPT',
                              path='$AZ_BATCHAI_MOUNT_ROOT/{0}/{1}'.format(
                                  config.fileshare_mount_point, job_name)),
        models.InputDirectory(id='DATASET',
                              path='$AZ_BATCHAI_MOUNT_ROOT/{0}/{1}'.format(
                                  config.fileshare_mount_point, 'data'))
    ]

    std_output_path_prefix = "$AZ_BATCHAI_MOUNT_ROOT/{0}".format(
        config.fileshare_mount_point)

    output_directories = [
        models.OutputDirectory(id='MODEL',
                               path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                                   config.fileshare_mount_point),
                               path_suffix="models"),
        models.OutputDirectory(id='NOTEBOOKS',
                               path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                                   config.fileshare_mount_point),
                               path_suffix="notebooks")
    ]

    parameters = models.JobCreateParameters(
        location=config.location,
        cluster=models.ResourceId(id=cluster_id),
        node_count=number_of_vms,
        input_directories=input_directories,
        std_out_err_path_prefix=std_output_path_prefix,
        output_directories=output_directories,
        container_settings=models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(
                image=image_name)),
        custom_toolkit_settings=models.CustomToolkitSettings(
            command_line=command))

    client = client_from(config)
    _ = client.jobs.create(config.group_name, workspace, experiment, job_name,
                           parameters)
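
A minimal invocation sketch (not part of the original example): the config object is assumed to expose the fields used above (location, group_name, fileshare_mount_point and whatever client_from needs), and the cluster id, workspace, experiment, image and command line are hypothetical placeholders.

# Hypothetical call; every value below is a placeholder.
create_job(config,
           cluster_id=cluster.id,          # resource id of an existing Batch AI cluster
           workspace='my_workspace',
           experiment='my_experiment',
           job_name='train_run_01',
           image_name='tensorflow/tensorflow:1.12.0-gpu-py3',
           command='python $AZ_BATCHAI_INPUT_SCRIPT/train.py '
                   '--data_dir $AZ_BATCHAI_INPUT_DATASET '
                   '--output_dir $AZ_BATCHAI_OUTPUT_MODEL',
           number_of_vms=2)
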
Example #2
    def create_custom_job(client,
                          resource_group,
                          cluster_id,
                          job_name,
                          nodes,
                          cmd,
                          job_preparation_cmd=None,
                          container=None):
        """Creates custom toolkit job

        :param BatchAIManagementClient client: client instance.
        :param str resource_group: resource group name.
        :param str cluster_id: resource Id of the cluster.
        :param str job_name: job name.
        :param int nodes: number of nodes to execute the job.
        :param str cmd: command line to run.
        :param str or None job_preparation_cmd: Job preparation command line.
        :param models.ContainerSettings or None container: container settings to run the job.
        :return models.Job: the created job.
        """
        job_preparation = None
        if job_preparation_cmd:
            job_preparation = models.JobPreparation(
                command_line=job_preparation_cmd)
        client.experiments.create(resource_group,
                                  Helpers.DEFAULT_WORKSPACE_NAME,
                                  Helpers.DEFAULT_EXPERIMENT_NAME).result()
        return client.jobs.create(
            resource_group,
            Helpers.DEFAULT_WORKSPACE_NAME,
            Helpers.DEFAULT_EXPERIMENT_NAME,
            job_name,
            parameters=models.JobCreateParameters(
                cluster=models.ResourceId(id=cluster_id),
                node_count=nodes,
                std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                    Helpers.AZURE_FILES_MOUNTING_PATH),
                output_directories=[
                    models.OutputDirectory(
                        id=Helpers.JOB_OUTPUT_DIRECTORY_ID,
                        path_prefix=Helpers.JOB_OUTPUT_DIRECTORY_PATH,
                        path_suffix="files")
                ],
                input_directories=[
                    models.InputDirectory(
                        id='INPUT',
                        path='$AZ_BATCHAI_MOUNT_ROOT/{0}/input'.format(
                            Helpers.AZURE_FILES_MOUNTING_PATH))
                ],
                container_settings=container,
                job_preparation=job_preparation,
                custom_toolkit_settings=models.CustomToolkitSettings(
                    command_line=cmd))).result()
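
A rough usage sketch, assuming create_custom_job is exposed as a static method on the same Helpers class it references; the resource group, cluster id, command lines and container image below are placeholders.

# Hypothetical call; values are placeholders.
job = Helpers.create_custom_job(
    client,
    resource_group='my_resource_group',
    cluster_id=cluster.id,
    job_name='custom_job_01',
    nodes=1,
    cmd='echo hello from $AZ_BATCHAI_INPUT_INPUT',
    job_preparation_cmd='echo preparing node',
    container=models.ContainerSettings(
        image_source_registry=models.ImageSourceRegistry(image='ubuntu:16.04')))
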
Example #3
def submit_job(config, pretrained_model_type, retraining_type,
               output_model_name, num_epochs):
    ''' Defines and submits a job. Does not check for completion. '''
    client = get_client(config)
    job_name = 'job{}'.format(
        datetime.datetime.utcnow().strftime('%m_%d_%H_%M_%S'))
    cluster = client.clusters.get(config.bait_resource_group_name,
                                  config.bait_cluster_name)

    # Define the command line arguments to the retraining script
    command_line_args = '--input_dir $AZ_BATCHAI_INPUT_TRAININGDATA ' + \
     '--validation_dir $AZ_BATCHAI_INPUT_VALIDATIONDATA ' + \
     '--output_dir $AZ_BATCHAI_OUTPUT_MODEL ' + \
     '--num_epochs {} '.format(num_epochs) + \
     '--retraining_type {} '.format(retraining_type) + \
     '--model_type {} '.format(pretrained_model_type) + \
     '--model_filename $AZ_BATCHAI_INPUT_PRETRAINEDMODELS/'
    if pretrained_model_type == 'alexnet':
        command_line_args += 'AlexNet.model'
    elif pretrained_model_type == 'resnet18':
        command_line_args += 'ResNet_18.model'

    # Define the job
    cntk_settings = tm.CNTKsettings(
        language_type='python',
        python_script_file_path='$AZ_BATCHAI_INPUT_SCRIPT/' +
        'retrain_model_distributed.py',
        command_line_args=command_line_args,
        process_count=config.bait_vms_per_job)  # NC6s -- one GPU per VM

    job_create_params = tm.job_create_parameters.JobCreateParameters(
        location=config.bait_region,
        cluster=tm.ResourceId(id=cluster.id),
        node_count=config.bait_vms_per_job,
        std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/afs',
        output_directories=[
            tm.OutputDirectory(id='MODEL',
                               path_prefix='$AZ_BATCHAI_MOUNT_ROOT/afs')
        ],
        input_directories=[
            tm.InputDirectory(id='SCRIPT',
                              path='$AZ_BATCHAI_MOUNT_ROOT/afs/scripts'),
            tm.InputDirectory(
                id='PRETRAINEDMODELS',
                path='$AZ_BATCHAI_MOUNT_ROOT/afs/pretrainedmodels'),
            tm.InputDirectory(
                id='TRAININGDATA',
                path='$AZ_BATCHAI_MOUNT_ROOT/nfs/training_images'),
            tm.InputDirectory(
                id='VALIDATIONDATA',
                path='$AZ_BATCHAI_MOUNT_ROOT/nfs/validation_images')
        ],
        cntk_settings=cntk_settings)

    # Submit the job
    job = client.jobs.create(
        resource_group_name=config.bait_resource_group_name,
        job_name=job_name,
        parameters=job_create_params)

    return job_name
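
A hypothetical invocation of submit_job (not from the original source); config is assumed to carry the bait_* fields referenced above, and the retraining_type value is a placeholder for whatever the retraining script accepts.

# Hypothetical call; argument values are placeholders.
job_name = submit_job(config,
                      pretrained_model_type='resnet18',   # or 'alexnet', per the branch above
                      retraining_type='last_layer_only',  # placeholder value
                      output_model_name='retrained.model',
                      num_epochs=10)
print('Submitted job: {}'.format(job_name))
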
    DiscreteParameter(parameter_name="BATCH_SIZE", values=[8, 16, 32]),
    DiscreteParameter(parameter_name="T", values=[72, 168, 336]),
    DiscreteParameter(parameter_name="LEARNING_RATE",
                      values=[0.01, 0.001, 0.0001]),
    DiscreteParameter(parameter_name="ALPHA", values=[0.1, 0.001, 0])
]

parameters = ParameterSweep(param_specs)

# create a template for Batch AI job
jcp = models.JobCreateParameters(
    cluster=models.ResourceId(id=cluster.id),
    node_count=1,
    std_out_err_path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/logs',
    output_directories=[
        models.OutputDirectory(id='ALL',
                               path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/output')
    ],
    custom_toolkit_settings=models.CustomToolkitSettings(
        command_line=
        'python $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/scripts/FF_multi_step_multivariate.py \
        --scriptdir $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/scripts \
        --datadir $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/data \
        --outdir $AZ_BATCHAI_OUTPUT_ALL \
        -l {0} -n {1} -b {2} -T {3} -r {4} -a {5}'.format(
            parameters['LATENT_DIM'], parameters['HIDDEN_LAYERS'],
            parameters['BATCH_SIZE'], parameters['T'],
            parameters['LEARNING_RATE'], parameters['ALPHA'])),
    container_settings=models.ContainerSettings(
        image_source_registry=models.ImageSourceRegistry(
            image=cfg['docker_image'])),
    mount_volumes=models.MountVolumes(azure_file_shares=[
Example #5
parameters = models.JobCreateParameters(
     location='northeurope',
     cluster=models.ResourceId(id=cluster.id),
     # The number of VMs in the cluster to use
     node_count=1,

     # Override the path where the std out and std err files will be written to.
     # In this case we will write these out to an Azure Files share
     std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(relative_mount_point),

     input_directories=[models.InputDirectory(
         id='SAMPLE',
         path='$AZ_BATCHAI_MOUNT_ROOT/{0}/data'.format(relative_mount_point))],

     # Specify directories where files will get written to
     output_directories=[models.OutputDirectory(
        id='MODEL',
        path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(relative_mount_point),
        path_suffix="Models")],

     # Container configuration
     container_settings=models.ContainerSettings(
         image_source_registry=models.ImageSourceRegistry(image='microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0')),

     # Toolkit specific settings
     cntk_settings=models.CNTKsettings(
        python_script_file_path='$AZ_BATCHAI_INPUT_SAMPLE/ConvNet_MNIST.py',
        command_line_args='$AZ_BATCHAI_INPUT_SAMPLE $AZ_BATCHAI_OUTPUT_MODEL')
 )

# Create the job
client.jobs.create(resource_group_name, job_name, parameters).result()
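
The create call only submits the job; it then runs asynchronously on the cluster. A rough polling sketch (an assumption, not part of the original sample) that reuses the same client, the Job model's execution_state field, and the ExecutionState enum visible in Example #6 to wait for a terminal state:

import time

# Poll until the job reaches a terminal state (succeeded or failed).
while True:
    job = client.jobs.get(resource_group_name, job_name)
    if job.execution_state in (models.ExecutionState.succeeded,
                               models.ExecutionState.failed):
        break
    time.sleep(15)
print('Job finished with state: {0}'.format(job.execution_state))
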
Example #6
    def test_job_level_mounting(self, resource_group, location, cluster,
                                storage_account, storage_account_key):
        """Tests if it's possible to mount external file systems for a job."""
        job_name = 'job'

        # Create file share and container to mount on the job level
        if storage_account.name != FAKE_STORAGE.name:
            files = FileService(storage_account.name, storage_account_key)
            files.create_share('jobshare', fail_on_exist=False)
            blobs = BlockBlobService(storage_account.name, storage_account_key)
            blobs.create_container('jobcontainer', fail_on_exist=False)

        job = self.client.jobs.create(
            resource_group.name,
            job_name,
            parameters=models.JobCreateParameters(
                location=location,
                cluster=models.ResourceId(id=cluster.id),
                node_count=1,
                mount_volumes=models.MountVolumes(
                    azure_file_shares=[
                        models.AzureFileShareReference(
                            account_name=storage_account.name,
                            azure_file_url='https://{0}.file.core.windows.net/{1}'.format(
                                storage_account.name, 'jobshare'),
                            relative_mount_path='job_afs',
                            credentials=models.AzureStorageCredentialsInfo(
                                account_key=storage_account_key))
                    ],
                    azure_blob_file_systems=[
                        models.AzureBlobFileSystemReference(
                            account_name=storage_account.name,
                            container_name='jobcontainer',
                            relative_mount_path='job_bfs',
                            credentials=models.AzureStorageCredentialsInfo(
                                account_key=storage_account_key))
                    ]),
                # Put standard output on cluster level AFS to check that the job has access to it.
                std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                    AZURE_FILES_MOUNTING_PATH),
                # Create two output directories on job level AFS and blobfuse.
                output_directories=[
                    models.OutputDirectory(
                        id='OUTPUT1',
                        path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/job_afs'),
                    models.OutputDirectory(
                        id='OUTPUT2',
                        path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/job_bfs')
                ],
                # Check that the job preparation has access to job level file systems.
                job_preparation=models.JobPreparation(
                    command_line=
                    'echo afs > $AZ_BATCHAI_OUTPUT_OUTPUT1/prep_afs.txt; '
                    'echo bfs > $AZ_BATCHAI_OUTPUT_OUTPUT2/prep_bfs.txt; '
                    'echo done'),
                # Check that the job has access to job level file systems.
                custom_toolkit_settings=models.CustomToolkitSettings(
                    command_line=
                    'echo afs > $AZ_BATCHAI_OUTPUT_OUTPUT1/job_afs.txt; '
                    'echo bfs > $AZ_BATCHAI_OUTPUT_OUTPUT2/job_bfs.txt; '
                    'mkdir $AZ_BATCHAI_OUTPUT_OUTPUT1/afs; '
                    'echo afs > $AZ_BATCHAI_OUTPUT_OUTPUT1/afs/job_afs.txt; '
                    'mkdir $AZ_BATCHAI_OUTPUT_OUTPUT2/bfs; '
                    'echo bfs > $AZ_BATCHAI_OUTPUT_OUTPUT2/bfs/job_bfs.txt; '
                    'echo done'))).result()
        self.assertEqual(
            wait_for_job_completion(self.is_live, self.client,
                                    resource_group.name, job.name, MINUTE),
            models.ExecutionState.succeeded)

        job = self.client.jobs.get(resource_group.name, job.name)
        # Assert job and job prep standard output is populated on cluster level filesystem
        assert_job_files_are(
            self, self.client, resource_group.name, job.name,
            STANDARD_OUTPUT_DIRECTORY_ID, {
                u'stdout.txt': u'done\n',
                u'stderr.txt': u'',
                u'stdout-job_prep.txt': u'done\n',
                u'stderr-job_prep.txt': u''
            })
        # Assert files are generated on job level AFS
        assert_job_files_are(self, self.client, resource_group.name, job.name,
                             'OUTPUT1', {
                                 u'job_afs.txt': u'afs\n',
                                 u'prep_afs.txt': u'afs\n',
                                 u'afs': None
                             })
        # Assert files are generated on job level blobfuse
        assert_job_files_are(self, self.client, resource_group.name, job.name,
                             'OUTPUT2', {
                                 u'job_bfs.txt': u'bfs\n',
                                 u'prep_bfs.txt': u'bfs\n',
                                 u'bfs': None
                             })
        # Assert subfolders are available via API
        assert_job_files_in_path_are(self, self.client, resource_group.name,
                                     job.name, 'OUTPUT1', 'afs',
                                     {u'job_afs.txt': u'afs\n'})
        assert_job_files_in_path_are(self, self.client, resource_group.name,
                                     job.name, 'OUTPUT2', 'bfs',
                                     {u'job_bfs.txt': u'bfs\n'})

        # Assert that we can access the output files created on job level mount volumes directly in storage using path
        # segment returned by the server.
        if storage_account.name != FAKE_STORAGE.name:
            files = FileService(storage_account.name, storage_account_key)
            self.assertTrue(
                files.exists(
                    'jobshare', job.job_output_directory_path_segment + '/' +
                    OUTPUT_DIRECTORIES_FOLDER_NAME, 'job_afs.txt'))
            blobs = BlockBlobService(storage_account.name, storage_account_key)
            self.assertTrue(
                blobs.exists(
                    'jobcontainer', job.job_output_directory_path_segment +
                    '/' + OUTPUT_DIRECTORIES_FOLDER_NAME + '/job_bfs.txt'))
        # After the job is done, the filesystems should be unmounted automatically; check this by submitting a new job.
        checker = self.client.jobs.create(
            resource_group.name,
            'checker',
            parameters=models.JobCreateParameters(
                location=location,
                cluster=models.ResourceId(id=cluster.id),
                node_count=1,
                std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
                    AZURE_FILES_MOUNTING_PATH),
                custom_toolkit_settings=models.CustomToolkitSettings(
                    command_line='echo job; df | grep -E "job_bfs|job_afs"'))
        ).result()
        # Check that the job failed because there are no job level mount volumes anymore.
        self.assertEqual(
            wait_for_job_completion(self.is_live, self.client,
                                    resource_group.name, checker.name, MINUTE),
            models.ExecutionState.failed)
        # Check that the cluster level AFS was still mounted
        assert_job_files_are(self, self.client, resource_group.name,
                             checker.name, STANDARD_OUTPUT_DIRECTORY_ID, {
                                 u'stdout.txt': u'job\n',
                                 u'stderr.txt': u''
                             })
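
The assert_job_files_are / assert_job_files_in_path_are helpers used above are not shown. As a rough sketch (an assumption, not the test's actual helper), the same client can enumerate a job's output files directly; the JobsListOutputFilesOptions usage below follows the older resource-group-level jobs API this test relies on.

# Hypothetical helper: list the files a job wrote to one of its output directories.
def print_job_output_files(client, resource_group_name, job_name, directory_id):
    options = models.JobsListOutputFilesOptions(outputdirectoryid=directory_id)
    for f in client.jobs.list_output_files(resource_group_name, job_name, options):
        print(f.name, f.download_url)

# e.g. print_job_output_files(client, resource_group.name, job.name, 'OUTPUT1')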