def assertCanRunJob(self, resource_group, location, cluster_id, job_name,
                        container_settings, timeout_sec):
        """Submit a small echo job and verify its final state and its files.

        The job runs ``echo hello`` piped through ``tee`` into ``hi.txt``
        under ``$AZ_BATCHAI_OUTPUT_OUTPUTS``. The helper then asserts that
        the job ends in the ``succeeded`` state and that the ``OUTPUTS`` and
        standard-output directories hold the expected content.

        :param resource_group: object whose ``name`` identifies the resource
            group used for every call.
        :param location: forwarded unchanged to ``create_custom_job``.
        :param cluster_id: forwarded unchanged to ``create_custom_job``.
        :param job_name: name used to create, wait for and inspect the job.
        :param container_settings: passed as ``container=`` to
            ``create_custom_job``.
        :param timeout_sec: last argument given to
            ``wait_for_job_completion`` (presumably the maximum wait in
            seconds; confirm against the helper).
        """
        rg_name = resource_group.name
        command = 'echo hello | tee $AZ_BATCHAI_OUTPUT_OUTPUTS/hi.txt'
        # The literal 1 is presumably the node count; confirm against
        # create_custom_job.
        create_custom_job(self.client, rg_name, location, cluster_id,
                          job_name, 1, command,
                          container=container_settings)

        # The job is expected to finish successfully within the wait.
        final_state = wait_for_job_completion(
            self.is_live, self.client, rg_name, job_name, timeout_sec)
        self.assertEqual(final_state, models.ExecutionState.succeeded)

        # File written by the job itself through tee.
        expected_outputs = {u'hi.txt': u'hello\n'}
        assert_job_files_are(self, self.client, rg_name, job_name,
                             'OUTPUTS', expected_outputs)

        # Captured stdout carries the greeting; stderr must be empty.
        expected_std_files = {u'stdout.txt': u'hello\n', u'stderr.txt': ''}
        assert_job_files_are(self, self.client, rg_name, job_name,
                             STANDARD_OUTPUT_DIRECTORY_ID,
                             expected_std_files)
Exemple #2
0
    def test_job_container_preparation_failure_reporting(
            self, resource_group, location, cluster):
        """Check how a failing job preparation step is reported.

        A containerized job is created with ``'true'`` as its command and
        ``'false'`` as the following positional argument (the job
        preparation command, going by the original author's note). The job
        must end in ``failed`` with exit code 1 and exactly one error whose
        code is ``JobPreparationFailed``. Finally the job is deleted and a
        subsequent ``get`` must raise ``CloudError``.
        """
        rg_name = resource_group.name
        ubuntu_container = models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(image='ubuntu'))

        # 'false' as the preparation command makes the preparation fail.
        job = create_custom_job(self.client, rg_name, location, cluster.id,
                                'job', 1, 'true', 'false',
                                container=ubuntu_container)
        final_state = wait_for_job_completion(
            self.is_live, self.client, rg_name, job.name, MINUTE)
        self.assertEqual(final_state, models.ExecutionState.failed)

        # Re-read the job so the execution info reflects the final state.
        job = self.client.jobs.get(rg_name, job.name)
        execution_info = job.execution_info
        self.assertEqual(execution_info.exit_code, 1)
        self.assertEqual(len(execution_info.errors), 1)
        self.assertEqual(execution_info.errors[0].code,
                         'JobPreparationFailed')

        # Once deleted, the job must no longer be retrievable.
        self.client.jobs.delete(rg_name, job.name).result()
        with self.assertRaises(CloudError):
            self.client.jobs.get(rg_name, job.name)
Exemple #3
0
    def test_password_less_ssh_in_container(self, resource_group, location,
                                            cluster):
        """Check that ssh works without a password inside containers.

        The job (created with 2 as the numeric argument, presumably the node
        count) runs ``ssh 10.0.0.5 echo done`` twice. It must succeed, print
        ``done`` twice on stdout, and produce stderr matching
        ``Permanently added.*``. The job is then deleted and a later ``get``
        must raise ``CloudError``.
        """
        rg_name = resource_group.name
        ubuntu_container = models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(image='ubuntu'))
        ssh_command = 'ssh 10.0.0.5 echo done && ssh 10.0.0.5 echo done'

        job = create_custom_job(self.client, rg_name, location, cluster.id,
                                'job', 2, ssh_command,
                                container=ubuntu_container)
        final_state = wait_for_job_completion(
            self.is_live, self.client, rg_name, job.name, MINUTE)
        self.assertEqual(final_state, models.ExecutionState.succeeded)

        job = self.client.jobs.get(rg_name, job.name)
        # stderr is checked against a pattern rather than an exact string.
        expected_std_files = {
            u'stdout.txt': u'done\ndone\n',
            u'stderr.txt': re.compile('Permanently added.*'),
        }
        assert_job_files_are(self, self.client, rg_name, job.name,
                             STANDARD_OUTPUT_DIRECTORY_ID,
                             expected_std_files)

        # Once deleted, the job must no longer be retrievable.
        self.client.jobs.delete(rg_name, job.name).result()
        with self.assertRaises(CloudError):
            self.client.jobs.get(rg_name, job.name)
Exemple #4
0
    def test_job_preparation_container(self, resource_group, location,
                                       cluster):
        """Check job preparation for a job that runs in a container.

        The preparation command writes ``hello`` into
        ``$AZ_BATCHAI_INPUT_INPUT/hi.txt`` and the job command prints that
        file. Both the job and the preparation step are expected to leave
        ``hello`` on stdout and nothing on stderr. The job is then deleted
        and a later ``get`` must raise ``CloudError``.
        """
        rg_name = resource_group.name
        ubuntu_container = models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(image='ubuntu'))
        job_command = 'cat $AZ_BATCHAI_INPUT_INPUT/hi.txt'
        # Populates the input file that the job command reads.
        preparation_command = (
            'mkdir -p $AZ_BATCHAI_INPUT_INPUT && '
            'echo hello | tee $AZ_BATCHAI_INPUT_INPUT/hi.txt')

        job = create_custom_job(self.client, rg_name, location, cluster.id,
                                'job', 1, job_command, preparation_command,
                                container=ubuntu_container)
        final_state = wait_for_job_completion(
            self.is_live, self.client, rg_name, job.name, MINUTE)
        self.assertEqual(final_state, models.ExecutionState.succeeded)

        expected_std_files = {
            u'stdout.txt': u'hello\n',
            u'stderr.txt': u'',
            u'stdout-job_prep.txt': u'hello\n',
            u'stderr-job_prep.txt': u'',
        }
        assert_job_files_are(self, self.client, rg_name, job.name,
                             STANDARD_OUTPUT_DIRECTORY_ID,
                             expected_std_files)

        # Once deleted, the job must no longer be retrievable.
        self.client.jobs.delete(rg_name, job.name).result()
        with self.assertRaises(CloudError):
            self.client.jobs.get(rg_name, job.name)
Exemple #5
0
    def test_running_job_deletion(self, resource_group, location, cluster):
        """Check that a job can be deleted while it is still running.

        A ``sleep 600`` job is created, the test waits until it is reported
        as ``running``, deletes it, and then expects ``get`` to raise
        ``CloudError``.
        """
        rg_name = resource_group.name
        job = create_custom_job(self.client, rg_name, location, cluster.id,
                                'job', 1, 'sleep 600')

        state = wait_for_job_start_running(
            self.is_live, self.client, rg_name, job.name, MINUTE)
        self.assertEqual(state, models.ExecutionState.running)

        # Delete while running; afterwards the job must not be retrievable.
        self.client.jobs.delete(rg_name, job.name).result()
        with self.assertRaises(CloudError):
            self.client.jobs.get(rg_name, job.name)
Exemple #6
0
    def test_running_job_termination(self, resource_group, location, cluster):
        """Check that terminating a running job leaves it in ``failed``.

        A ``sleep 600`` job named ``longrunning`` is created; once it is
        reported as ``running`` it is terminated, and the final execution
        state is expected to be ``failed``.
        """
        rg_name = resource_group.name
        job = create_custom_job(self.client, rg_name, location, cluster.id,
                                'longrunning', 1, 'sleep 600')

        state = wait_for_job_start_running(
            self.is_live, self.client, rg_name, job.name, MINUTE)
        self.assertEqual(state, models.ExecutionState.running)

        # A terminated running job is expected to end up as failed.
        self.client.jobs.terminate(rg_name, job.name).result()
        final_state = wait_for_job_completion(
            self.is_live, self.client, rg_name, job.name, MINUTE)
        self.assertEqual(final_state, models.ExecutionState.failed)
Exemple #7
0
    def test_queued_job_termination(self, resource_group, location, cluster):
        """Check that a queued job can be terminated and then deleted.

        The job is terminated right after creation, without waiting for it
        to start. It must end in ``failed``; it is then deleted and a later
        ``get`` must raise ``CloudError``.
        """
        rg_name = resource_group.name
        # Per the original author's note the cluster has no compute nodes,
        # so the job stays queued (the cluster fixture is not visible here).
        job = create_custom_job(self.client, rg_name, location, cluster.id,
                                'job', 1, 'true')

        self.client.jobs.terminate(rg_name, job.name).result()
        final_state = wait_for_job_completion(
            self.is_live, self.client, rg_name, job.name, MINUTE)
        self.assertEqual(final_state, models.ExecutionState.failed)

        # Once deleted, the job must no longer be retrievable.
        self.client.jobs.delete(rg_name, job.name).result()
        with self.assertRaises(CloudError):
            self.client.jobs.get(rg_name, job.name)
Exemple #8
0
    def test_failed_job_reporting(self, resource_group, location, cluster):
        """Check how a failing job command is reported.

        The job runs ``false``. It must end in ``failed`` with exit code 1
        and exactly one error whose code is ``JobFailed``. The job is then
        deleted and a later ``get`` must raise ``CloudError``.
        """
        rg_name = resource_group.name
        job = create_custom_job(self.client, rg_name, location, cluster.id,
                                'job', 1, 'false')
        final_state = wait_for_job_completion(
            self.is_live, self.client, rg_name, job.name, MINUTE)
        self.assertEqual(final_state, models.ExecutionState.failed)

        # Re-read the job so the execution info reflects the final state.
        job = self.client.jobs.get(rg_name, job.name)
        execution_info = job.execution_info
        self.assertEqual(execution_info.exit_code, 1)
        self.assertEqual(len(execution_info.errors), 1)
        self.assertEqual(execution_info.errors[0].code, 'JobFailed')

        # Once deleted, the job must no longer be retrievable.
        self.client.jobs.delete(rg_name, job.name).result()
        with self.assertRaises(CloudError):
            self.client.jobs.get(rg_name, job.name)
Exemple #9
0
    def test_completed_job_termination(self, resource_group, location,
                                       cluster):
        """Check that terminating a completed job does not change its state.

        The job runs ``true`` and must end in ``succeeded``. After a
        terminate request the state is checked again and must still be
        ``succeeded``. The job is then deleted and a later ``get`` must
        raise ``CloudError``.
        """
        rg_name = resource_group.name
        job = create_custom_job(self.client, rg_name, location, cluster.id,
                                'job', 1, 'true')
        state_before = wait_for_job_completion(
            self.is_live, self.client, rg_name, job.name, MINUTE)
        self.assertEqual(state_before, models.ExecutionState.succeeded)

        # Terminating a finished job is a no-op: the state must be kept.
        self.client.jobs.terminate(rg_name, job.name).result()
        state_after = wait_for_job_completion(
            self.is_live, self.client, rg_name, job.name, MINUTE)
        self.assertEqual(state_after, models.ExecutionState.succeeded)

        # Once deleted, the job must no longer be retrievable.
        self.client.jobs.delete(rg_name, job.name).result()
        with self.assertRaises(CloudError):
            self.client.jobs.get(rg_name, job.name)
Exemple #10
0
 def test_job_creation_and_deletion(self, resource_group, location, cluster,
                                    storage_account, storage_account_key):
     """Exercise the basic job life cycle: submit, check results, delete.

     A containerized job echoes ``hi`` through ``tee`` into ``hi.txt`` under
     the job output directory. The test checks the standard output files,
     the job output file, and direct access to ``stdout.txt`` in storage via
     the path segment reported on the job. The job is then deleted and a
     later ``get`` must raise ``CloudError``.
     """
     rg_name = resource_group.name
     ubuntu_container = models.ContainerSettings(
         image_source_registry=models.ImageSourceRegistry(image='ubuntu'))
     command = 'echo hi | tee {0}/hi.txt'.format(
         JOB_OUTPUT_DIRECTORY_PATH_ENV)

     job = create_custom_job(
         self.client, rg_name, location, cluster.id, 'job', 1, command,
         container=ubuntu_container)  # type: models.Job
     final_state = wait_for_job_completion(
         self.is_live, self.client, rg_name, job.name, MINUTE)
     self.assertEqual(final_state, models.ExecutionState.succeeded)

     # Standard output captured for the job.
     expected_std_files = {u'stdout.txt': u'hi\n', u'stderr.txt': u''}
     assert_job_files_are(self, self.client, rg_name, job.name,
                          STANDARD_OUTPUT_DIRECTORY_ID, expected_std_files)

     # File produced by the job in its output directory.
     expected_outputs = {u'hi.txt': u'hi\n'}
     assert_job_files_are(self, self.client, rg_name, job.name,
                          JOB_OUTPUT_DIRECTORY_ID, expected_outputs)

     # The same stdout file must be reachable directly in storage through
     # the path segment returned by the server.
     stdouterr_path = (job.job_output_directory_path_segment + '/' +
                       STDOUTERR_FOLDER_NAME)
     assert_file_in_file_share(self, storage_account.name,
                               storage_account_key, stdouterr_path,
                               'stdout.txt', u'hi\n')

     # Once deleted, the job must no longer be retrievable.
     self.client.jobs.delete(rg_name, job.name).result()
     with self.assertRaises(CloudError):
         self.client.jobs.get(rg_name, job.name)
Exemple #11
0
    def test_file_server(self, resource_group, location, storage_account,
                         storage_account_key):
        """Exercise a file server shared by two clusters.

        Steps:

        1. Create the file server.
        2. Create two clusters that reference it under the relative mount
           path ``nfs``.
        3. Check that the file server is usable from both clusters:
            a. on the first cluster, one job on the host and one in a
               container each write a file under the ``nfs`` mount;
            b. on the second cluster, a job reads both files back.
        4. Delete the clusters and then the file server, and check that no
           file servers remain listed.
        """
        rg_name = resource_group.name
        fs_name = self.file_server_name
        cluster_names = ('cluster1', 'cluster2')

        server = create_file_server(
            self.client, location, rg_name,
            fs_name)  # type: models.FileServer

        # Both clusters are created the same way; each gets its own
        # FileServerReference pointing at the server created above.
        clusters = {}
        for cluster_name in cluster_names:
            nfs_reference = models.FileServerReference(
                file_server=models.ResourceId(id=server.id),
                relative_mount_path='nfs',
                mount_options="rw")
            clusters[cluster_name] = create_cluster(
                self.client, location, rg_name, cluster_name,
                'STANDARD_D1', 1, storage_account.name,
                storage_account_key, file_servers=[nfs_reference])

        # The file server must be listed.
        assert_existing_file_servers_are(self, self.client, rg_name,
                                         [fs_name])

        # The file server must become available in a reasonable time.
        became_available = wait_for_file_server(
            self.is_live, self.client, rg_name, fs_name,
            _FILE_SERVER_CREATION_TIMEOUT_SEC)
        self.assertTrue(became_available)

        # Both the public and the internal IP must be reported.
        server = self.client.file_servers.get(
            rg_name, fs_name)  # type: models.FileServer
        mount_settings = server.mount_settings
        self.assertRegexpMatches(mount_settings.file_server_public_ip,
                                 RE_ID_ADDRESS)
        self.assertRegexpMatches(mount_settings.file_server_internal_ip,
                                 RE_ID_ADDRESS)

        # Each cluster must report its single node.
        for cluster_name in cluster_names:
            node_count = wait_for_nodes(
                self.is_live, self.client, rg_name, cluster_name, 1,
                NODE_STARTUP_TIMEOUT_SEC)
            self.assertEqual(node_count, 1)

        publisher_cluster_id = clusters['cluster1'].id
        consumer_cluster_id = clusters['cluster2'].id

        # Publisher running directly on the host of the first cluster.
        host_job = create_custom_job(
            self.client, rg_name, location, publisher_cluster_id,
            'host_publisher', 1,
            'echo hi from host > $AZ_BATCHAI_MOUNT_ROOT/nfs/host.txt')
        host_state = wait_for_job_completion(
            self.is_live, self.client, rg_name, host_job.name, MINUTE)
        self.assertEqual(host_state, models.ExecutionState.succeeded)

        # Publisher running in a container on the first cluster.
        ubuntu_container = models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(
                image="ubuntu"))
        container_job = create_custom_job(
            self.client, rg_name, location, publisher_cluster_id,
            'container_publisher', 1,
            'echo hi from container >> $AZ_BATCHAI_MOUNT_ROOT/nfs/container.txt',
            container=ubuntu_container)
        container_state = wait_for_job_completion(
            self.is_live, self.client, rg_name, container_job.name, MINUTE)
        self.assertEqual(container_state, models.ExecutionState.succeeded)

        # Consumer on the second cluster reads both published files.
        consumer_command = ('cat $AZ_BATCHAI_MOUNT_ROOT/nfs/host.txt; '
                            'cat $AZ_BATCHAI_MOUNT_ROOT/nfs/container.txt')
        consumer_job = create_custom_job(
            self.client, rg_name, location, consumer_cluster_id,
            'consumer', 1, consumer_command)
        consumer_state = wait_for_job_completion(
            self.is_live, self.client, rg_name, consumer_job.name, MINUTE)
        self.assertEqual(consumer_state, models.ExecutionState.succeeded)

        # The consumer's stdout must contain both published lines.
        expected_std_files = {
            u'stdout.txt': u'hi from host\nhi from container\n',
            u'stderr.txt': '',
        }
        assert_job_files_are(self, self.client, rg_name, consumer_job.name,
                             STANDARD_OUTPUT_DIRECTORY_ID,
                             expected_std_files)

        # Delete the clusters.
        for cluster_name in cluster_names:
            self.client.clusters.delete(rg_name, cluster_name).result()

        # Delete the file server and check that none remain listed.
        self.client.file_servers.delete(rg_name, fs_name).result()
        assert_existing_file_servers_are(self, self.client, rg_name, [])