Example #1
0
    def test_password_less_ssh_in_container(self, resource_group, location,
                                            cluster):
        """Tests if password-less ssh is configured in containers.

        Submits a two-node containerized job whose command ssh-es to a
        sibling node twice; it can only succeed when password-less ssh is
        configured inside the container.
        """
        container = models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(image='ubuntu'))
        job = create_custom_job(
            self.client, resource_group.name, location, cluster.id, 'job', 2,
            'ssh 10.0.0.5 echo done && ssh 10.0.0.5 echo done',
            container=container)
        final_state = wait_for_job_completion(self.is_live, self.client,
                                              resource_group.name, job.name,
                                              MINUTE)
        self.assertEqual(final_state, models.ExecutionState.succeeded)

        job = self.client.jobs.get(resource_group.name, job.name)
        # Each ssh invocation echoes 'done' to stdout; ssh reports the
        # host-key addition on stderr.
        expected_files = {
            u'stdout.txt': u'done\ndone\n',
            u'stderr.txt': re.compile('Permanently added.*')
        }
        assert_job_files_are(self, self.client, resource_group.name, job.name,
                             STANDARD_OUTPUT_DIRECTORY_ID, expected_files)
        # A deleted job must no longer be retrievable.
        self.client.jobs.delete(resource_group.name, job.name).result()
        self.assertRaises(
            CloudError,
            lambda: self.client.jobs.get(resource_group.name, job.name))
Example #2
0
    def test_job_container_preparation_failure_reporting(
            self, resource_group, location, cluster):
        """Tests if job preparation failure is reported correctly.

        The preparation command is 'false', so preparation fails and the
        job must be reported as failed with exit code 1 and a single
        JobPreparationFailed error.
        """
        container = models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(image='ubuntu'))
        # The main command ('true') would succeed; only the preparation
        # command ('false') fails.
        job = create_custom_job(self.client, resource_group.name, location,
                                cluster.id, 'job', 1, 'true', 'false',
                                container=container)
        final_state = wait_for_job_completion(self.is_live, self.client,
                                              resource_group.name, job.name,
                                              MINUTE)
        self.assertEqual(final_state, models.ExecutionState.failed)

        job = self.client.jobs.get(resource_group.name, job.name)
        self.assertEqual(job.execution_info.exit_code, 1)
        self.assertEqual(len(job.execution_info.errors), 1)
        self.assertEqual(job.execution_info.errors[0].code,
                         'JobPreparationFailed')
        # A deleted job must no longer be retrievable.
        self.client.jobs.delete(resource_group.name, job.name).result()
        self.assertRaises(
            CloudError,
            lambda: self.client.jobs.get(resource_group.name, job.name))
Example #3
0
    def test_job_preparation_container(self, resource_group, location,
                                       cluster):
        """Tests job preparation execution for a job running in a container.

        The preparation command writes 'hello' into
        $AZ_BATCHAI_INPUT_INPUT/hi.txt and the main command cats that file,
        so both the preparation and the main task outputs must show
        'hello'.
        """
        container = models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(image='ubuntu'))
        job = create_custom_job(
            self.client, resource_group.name, location, cluster.id, 'job', 1,
            'cat $AZ_BATCHAI_INPUT_INPUT/hi.txt',
            'mkdir -p $AZ_BATCHAI_INPUT_INPUT && echo hello | tee $AZ_BATCHAI_INPUT_INPUT/hi.txt',
            container=container)
        final_state = wait_for_job_completion(self.is_live, self.client,
                                              resource_group.name, job.name,
                                              MINUTE)
        self.assertEqual(final_state, models.ExecutionState.succeeded)

        # Both the main task and the preparation task echo 'hello' and
        # write nothing to stderr.
        expected_files = {
            u'stdout.txt': u'hello\n',
            u'stderr.txt': u'',
            u'stdout-job_prep.txt': u'hello\n',
            u'stderr-job_prep.txt': u''
        }
        assert_job_files_are(self, self.client, resource_group.name, job.name,
                             STANDARD_OUTPUT_DIRECTORY_ID, expected_files)
        # A deleted job must no longer be retrievable.
        self.client.jobs.delete(resource_group.name, job.name).result()
        self.assertRaises(
            CloudError,
            lambda: self.client.jobs.get(resource_group.name, job.name))
 def test_job_creation_and_deletion(self, resource_group, location, cluster, storage_account, storage_account_key):
     """Tests simple scenario for a job - submit, check results, delete.

     Submits a single-node containerized job that echoes 'hi' to stdout and
     into the job output directory, checks both outputs through the service
     API, checks direct storage access via the path segment returned by the
     server, then deletes the job and verifies it is gone.
     """
     # Single-node job in an ubuntu container; 'tee' duplicates the echoed
     # text into the job's output directory.
     job = Helpers.create_custom_job(self.client, resource_group.name, cluster.id, 'job', 1,
                                     'echo hi | tee {0}/hi.txt'.format(Helpers.JOB_OUTPUT_DIRECTORY_PATH_ENV),
                                     container=models.ContainerSettings(
                                         image_source_registry=models.ImageSourceRegistry(image='ubuntu'))
                                     )  # type: models.Job
     self.assertEqual(
         Helpers.wait_for_job_completion(self.is_live, self.client, resource_group.name, job.name, Helpers.MINUTE),
         models.ExecutionState.succeeded)
     # Check standard job output
     Helpers.assert_job_files_are(self, self.client, resource_group.name, job.name,
                                  Helpers.STANDARD_OUTPUT_DIRECTORY_ID,
                                  {u'stdout.txt': u'hi\n', u'stderr.txt': u''})
     # Check job's output
     Helpers.assert_job_files_are(self, self.client, resource_group.name, job.name,
                                  Helpers.JOB_OUTPUT_DIRECTORY_ID,
                                  {u'hi.txt': u'hi\n'})
     # Check that we can access the output files directly in storage using path segment returned by the server
     Helpers.assert_file_in_file_share(self, storage_account.name, storage_account_key,
                                       job.job_output_directory_path_segment + '/' + Helpers.STDOUTERR_FOLDER_NAME,
                                       'stdout.txt', u'hi\n')
     # Deletion is asynchronous; .result() blocks until it completes, so
     # the follow-up get must raise.
     self.client.jobs.delete(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME, Helpers.DEFAULT_EXPERIMENT_NAME,
                             job.name).result()
     self.assertRaises(CloudError, lambda: self.client.jobs.get(resource_group.name, Helpers.DEFAULT_WORKSPACE_NAME,
                                                                Helpers.DEFAULT_EXPERIMENT_NAME, job.name))
 def assertCanRunJobInContainer(self,
                                resource_group,
                                location,
                                cluster_id,
                                timeout_sec=helpers.MINUTE):
     """Asserts a containerized job runs successfully on the cluster.

     Delegates to assertCanRunJob with ubuntu container settings.
     """
     # NOTE(review): ContainerSettings receives its registry positionally
     # here, unlike the keyword usage elsewhere in this file — presumably
     # image_source_registry is its first parameter; confirm against the
     # models package.
     container_settings = models.ContainerSettings(
         models.ImageSourceRegistry(image="ubuntu"))
     self.assertCanRunJob(resource_group, location, cluster_id,
                          'container_job', container_settings, timeout_sec)
Example #6
0
def create_job(config,
               cluster_id,
               workspace,
               experiment,
               job_name,
               image_name,
               command,
               number_of_vms=1):
    '''Creates a custom-toolkit Batch AI job running in a container.

    The job reads scripts from <share>/<job_name> and data from
    <share>/data on the mounted file share, and writes models/notebooks
    back to the share.

    :param config: configuration object with location, group_name and
        fileshare_mount_point attributes (and whatever client_from needs).
    :param cluster_id: ARM resource id of the cluster to run on.
    :param workspace: workspace name.
    :param experiment: experiment name.
    :param job_name: name for the new job; also the script subfolder name.
    :param image_name: docker image to run the command in.
    :param command: command line for the custom toolkit.
    :param number_of_vms: number of nodes to run on (default 1).
    :return: the value returned by ``client.jobs.create`` (previously
        discarded) so callers can wait on the operation; callers that
        ignore the return value are unaffected.
    '''
    # Hoist the repeated mount-root prefix instead of re-formatting it in
    # four places.
    mount_root = '$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(
        config.fileshare_mount_point)

    input_directories = [
        # Scripts are expected under <share>/<job_name>.
        models.InputDirectory(id='SCRIPT',
                              path='{0}/{1}'.format(mount_root, job_name)),
        # Shared dataset lives under <share>/data.
        models.InputDirectory(id='DATASET',
                              path='{0}/{1}'.format(mount_root, 'data')),
    ]

    output_directories = [
        models.OutputDirectory(id='MODEL',
                               path_prefix=mount_root,
                               path_suffix="models"),
        models.OutputDirectory(id='NOTEBOOKS',
                               path_prefix=mount_root,
                               path_suffix="notebooks"),
    ]

    parameters = models.JobCreateParameters(
        location=config.location,
        cluster=models.ResourceId(id=cluster_id),
        node_count=number_of_vms,
        input_directories=input_directories,
        # stdout/stderr go to the file share root as well.
        std_out_err_path_prefix=mount_root,
        output_directories=output_directories,
        container_settings=models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(
                image=image_name)),
        custom_toolkit_settings=models.CustomToolkitSettings(
            command_line=command))

    client = client_from(config)
    # Return the creation result instead of discarding it so callers can
    # poll/wait for the job (presumably an LRO poller — TODO confirm SDK
    # version).
    return client.jobs.create(config.group_name, workspace, experiment,
                              job_name, parameters)
Example #7
0
def setup_cluster(config):
    """Creates the Batch AI cluster described by *config*.

    Builds container settings for each configured image, mounts the
    configured Azure file share, and submits the cluster creation request.

    :param config: configuration object providing image_names,
        storage_account, fileshare_name, fileshare_mount_point, group_name
        and cluster_name.
    :return: the value returned by ``client.clusters.create`` (previously
        discarded) so callers can wait on the operation.
    """
    client = client_from(config)
    # PEP 8 (E731): build the list with a comprehension instead of
    # assigning a lambda to a name.
    container_settings = [
        models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(img))
        for img in config.image_names
    ]

    volumes = create_volume(config.storage_account['name'],
                            config.storage_account['key'],
                            config.fileshare_name,
                            config.fileshare_mount_point)
    parameters = cluster_parameters_for(config, container_settings, volumes)
    # Return the creation result instead of discarding it.
    return client.clusters.create(config.group_name, config.cluster_name,
                                  parameters)
Example #8
0
    def test_file_server(self, resource_group, location, storage_account,
                         storage_account_key):
        """Tests file server functionality

        1. Create file server
        2. Create two clusters with this file server
        3. Check that the file server is mounted:
            a. submit tasks (one from host and another from container) on the first cluster to write data to nfs
            b. submit a task on the second cluster to read the data from nfs
        """
        server = create_file_server(
            self.client, location, resource_group.name,
            self.file_server_name)  # type: models.FileServer

        # Both clusters mount the same server read-write under
        # $AZ_BATCHAI_MOUNT_ROOT/nfs so data written from one cluster is
        # visible from the other.
        cluster1 = create_cluster(
            self.client,
            location,
            resource_group.name,
            'cluster1',
            'STANDARD_D1',
            1,
            storage_account.name,
            storage_account_key,
            file_servers=[
                models.FileServerReference(
                    file_server=models.ResourceId(id=server.id),
                    relative_mount_path='nfs',
                    mount_options="rw")
            ])
        cluster2 = create_cluster(
            self.client,
            location,
            resource_group.name,
            'cluster2',
            'STANDARD_D1',
            1,
            storage_account.name,
            storage_account_key,
            file_servers=[
                models.FileServerReference(
                    file_server=models.ResourceId(id=server.id),
                    relative_mount_path='nfs',
                    mount_options="rw")
            ])
        # Verify the file server is reported.
        assert_existing_file_servers_are(self, self.client,
                                         resource_group.name,
                                         [self.file_server_name])

        # Verify the file server become available in a reasonable time
        self.assertTrue(
            wait_for_file_server(self.is_live, self.client,
                                 resource_group.name, self.file_server_name,
                                 _FILE_SERVER_CREATION_TIMEOUT_SEC))

        # Verify the remote login information and private ip are reported
        server = self.client.file_servers.get(
            resource_group.name,
            self.file_server_name)  # type: models.FileServer
        # NOTE(review): RE_ID_ADDRESS is presumably an IP-address pattern
        # (likely a typo for RE_IP_ADDRESS) — confirm the constant's
        # definition. assertRegexpMatches is the py2-era alias of
        # assertRegex.
        self.assertRegexpMatches(server.mount_settings.file_server_public_ip,
                                 RE_ID_ADDRESS)
        self.assertRegexpMatches(server.mount_settings.file_server_internal_ip,
                                 RE_ID_ADDRESS)

        # Verify the clusters allocated nodes successfully
        self.assertEqual(
            wait_for_nodes(self.is_live, self.client, resource_group.name,
                           'cluster1', 1, NODE_STARTUP_TIMEOUT_SEC), 1)
        self.assertEqual(
            wait_for_nodes(self.is_live, self.client, resource_group.name,
                           'cluster2', 1, NODE_STARTUP_TIMEOUT_SEC), 1)

        # Execute publishing tasks on the first cluster: one writes from
        # the host, the other from inside an ubuntu container.
        job1 = create_custom_job(
            self.client, resource_group.name, location, cluster1.id,
            'host_publisher', 1,
            'echo hi from host > $AZ_BATCHAI_MOUNT_ROOT/nfs/host.txt')
        self.assertEqual(
            wait_for_job_completion(self.is_live, self.client,
                                    resource_group.name, job1.name, MINUTE),
            models.ExecutionState.succeeded)
        job2 = create_custom_job(
            self.client,
            resource_group.name,
            location,
            cluster1.id,
            'container_publisher',
            1,
            'echo hi from container >> $AZ_BATCHAI_MOUNT_ROOT/nfs/container.txt',
            container=models.ContainerSettings(
                image_source_registry=models.ImageSourceRegistry(
                    image="ubuntu")))
        self.assertEqual(
            wait_for_job_completion(self.is_live, self.client,
                                    resource_group.name, job2.name, MINUTE),
            models.ExecutionState.succeeded)

        # Execute consumer task on the second cluster; it reads both files
        # written from cluster1, proving the share is shared across
        # clusters.
        job3 = create_custom_job(
            self.client, resource_group.name, location, cluster2.id,
            'consumer', 1, 'cat $AZ_BATCHAI_MOUNT_ROOT/nfs/host.txt; '
            'cat $AZ_BATCHAI_MOUNT_ROOT/nfs/container.txt')
        self.assertEqual(
            wait_for_job_completion(self.is_live, self.client,
                                    resource_group.name, job3.name, MINUTE),
            models.ExecutionState.succeeded)

        # Verify the data: both publishers' lines appear in the consumer's
        # stdout.
        assert_job_files_are(
            self, self.client, resource_group.name, job3.name,
            STANDARD_OUTPUT_DIRECTORY_ID, {
                u'stdout.txt': u'hi from host\nhi from container\n',
                u'stderr.txt': ''
            })

        # Delete clusters
        self.client.clusters.delete(resource_group.name, 'cluster1').result()
        self.client.clusters.delete(resource_group.name, 'cluster2').result()

        # Test deletion
        self.client.file_servers.delete(resource_group.name,
                                        self.file_server_name).result()
        assert_existing_file_servers_are(self, self.client,
                                         resource_group.name, [])
 output_directories=[
     models.OutputDirectory(id='ALL',
                            path_prefix='$AZ_BATCHAI_JOB_MOUNT_ROOT/output')
 ],
 custom_toolkit_settings=models.CustomToolkitSettings(
     command_line=
     'python $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/scripts/FF_multi_step_multivariate.py \
     --scriptdir $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/scripts \
     --datadir $AZ_BATCHAI_JOB_MOUNT_ROOT/resources/data \
     --outdir $AZ_BATCHAI_OUTPUT_ALL \
     -l {0} -n {1} -b {2} -T {3} -r {4} -a {5}'.format(
         parameters['LATENT_DIM'], parameters['HIDDEN_LAYERS'],
         parameters['BATCH_SIZE'], parameters['T'],
         parameters['LEARNING_RATE'], parameters['ALPHA'])),
 container_settings=models.ContainerSettings(
     image_source_registry=models.ImageSourceRegistry(
         image=cfg['docker_image'])),
 mount_volumes=models.MountVolumes(azure_file_shares=[
     models.AzureFileShareReference(
         account_name=cfg['storage_account']['name'],
         credentials=models.AzureStorageCredentialsInfo(
             account_key=cfg['storage_account']['key']),
         azure_file_url='https://' + cfg['storage_account']['name'] +
         '.file.core.windows.net/logs',
         relative_mount_path='logs'),
     models.AzureFileShareReference(
         account_name=cfg['storage_account']['name'],
         credentials=models.AzureStorageCredentialsInfo(
             account_key=cfg['storage_account']['key']),
         azure_file_url='https://' + cfg['storage_account']['name'] +
         '.file.core.windows.net/resources',
         relative_mount_path='resources'),
Example #10
0
     # In this case we will write these out to an Azure Files share
     std_out_err_path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(relative_mount_point),

     input_directories=[models.InputDirectory(
         id='SAMPLE',
         path='$AZ_BATCHAI_MOUNT_ROOT/{0}/data'.format(relative_mount_point))],

     # Specify directories where files will get written to
     output_directories=[models.OutputDirectory(
        id='MODEL',
        path_prefix='$AZ_BATCHAI_MOUNT_ROOT/{0}'.format(relative_mount_point),
        path_suffix="Models")],

     # Container configuration
     container_settings=models.ContainerSettings(
         image_source_registry=models.ImageSourceRegistry(image='microsoft/cntk:2.1-gpu-python3.5-cuda8.0-cudnn6.0')),

     # Toolkit specific settings
     cntk_settings = models.CNTKsettings(
        python_script_file_path='$AZ_BATCHAI_INPUT_SAMPLE/ConvNet_MNIST.py',
        command_line_args='$AZ_BATCHAI_INPUT_SAMPLE $AZ_BATCHAI_OUTPUT_MODEL')
 )

# Create the job; .result() blocks until the create call completes
# (presumably an LRO poller — confirm against the SDK version in use).
client.jobs.create(resource_group_name, job_name, parameters).result()


## MONITOR JOB
# Fetch the job's current state from the service and report it.
job = client.jobs.get(resource_group_name, job_name)

print('Job state: {0} '.format(job.execution_state.name))