def assertCanRunJob(self, resource_group, location, cluster_id, job_name, container_settings, timeout_sec):
    """Submit a trivial greeting job and verify that it succeeds with the expected files.

    The job echoes ``hello`` to standard output and tees it into ``hi.txt`` under the
    ``$AZ_BATCHAI_OUTPUT_OUTPUTS`` directory.

    :param resource_group: resource group object; only its ``name`` is read.
    :param location: not used by this helper; kept so existing callers keep working.
    :param cluster_id: cluster identifier forwarded to ``Helpers.create_custom_job``.
    :param job_name: name of the job to create and inspect.
    :param container_settings: forwarded as the ``container`` argument of job creation.
    :param timeout_sec: timeout forwarded to ``Helpers.wait_for_job_completion``.
    """
    rg_name = resource_group.name
    greeting_command = 'echo hello | tee $AZ_BATCHAI_OUTPUT_OUTPUTS/hi.txt'
    Helpers.create_custom_job(
        self.client, rg_name, cluster_id, job_name, 1, greeting_command,
        container=container_settings)

    # The job must finish within the caller-supplied timeout.
    final_state = Helpers.wait_for_job_completion(
        self.is_live, self.client, rg_name, job_name, timeout_sec)
    self.assertEqual(final_state, models.ExecutionState.succeeded)

    # Both the job output directory and the standard output directory must hold the greeting.
    expected_outputs = {u'hi.txt': u'hello\n'}
    Helpers.assert_job_files_are(self, self.client, rg_name, job_name, 'OUTPUTS', expected_outputs)
    expected_std_streams = {u'stdout.txt': u'hello\n', u'stderr.txt': ''}
    Helpers.assert_job_files_are(
        self, self.client, rg_name, job_name,
        Helpers.STANDARD_OUTPUT_DIRECTORY_ID, expected_std_streams)
def test_job_creation_and_deletion(self, resource_group, location, cluster, storage_account, storage_account_key):
    """Tests the basic job life cycle: submit, inspect the results, delete."""
    rg_name = resource_group.name
    workspace = Helpers.DEFAULT_WORKSPACE_NAME
    experiment = Helpers.DEFAULT_EXPERIMENT_NAME

    command = 'echo hi | tee {0}/hi.txt'.format(Helpers.JOB_OUTPUT_DIRECTORY_PATH_ENV)
    ubuntu_container = models.ContainerSettings(
        image_source_registry=models.ImageSourceRegistry(image='ubuntu'))
    job = Helpers.create_custom_job(
        self.client, rg_name, cluster.id, 'job', 1, command,
        container=ubuntu_container)  # type: models.Job

    final_state = Helpers.wait_for_job_completion(
        self.is_live, self.client, rg_name, job.name, Helpers.MINUTE)
    self.assertEqual(final_state, models.ExecutionState.succeeded)

    # Standard output and error streams of the job.
    Helpers.assert_job_files_are(
        self, self.client, rg_name, job.name, Helpers.STANDARD_OUTPUT_DIRECTORY_ID,
        {u'stdout.txt': u'hi\n', u'stderr.txt': u''})

    # File written by the job into its output directory.
    Helpers.assert_job_files_are(
        self, self.client, rg_name, job.name, Helpers.JOB_OUTPUT_DIRECTORY_ID,
        {u'hi.txt': u'hi\n'})

    # The output must also be reachable directly in storage through the path segment
    # returned by the server.
    stdout_folder = job.job_output_directory_path_segment + '/' + Helpers.STDOUTERR_FOLDER_NAME
    Helpers.assert_file_in_file_share(
        self, storage_account.name, storage_account_key, stdout_folder, 'stdout.txt', u'hi\n')

    # After deletion the job must no longer be retrievable.
    self.client.jobs.delete(rg_name, workspace, experiment, job.name).result()
    with self.assertRaises(CloudError):
        self.client.jobs.get(rg_name, workspace, experiment, job.name)
def test_running_job_deletion(self, resource_group, location, cluster):
    """Tests that a job can be deleted while it is still running."""
    rg_name = resource_group.name
    job = Helpers.create_custom_job(self.client, rg_name, cluster.id, 'job', 1, 'sleep 600')

    # Only attempt the deletion once the job is reported as running.
    state = Helpers.wait_for_job_start_running(
        self.is_live, self.client, rg_name, job.name, Helpers.MINUTE)
    self.assertEqual(state, models.ExecutionState.running)

    job_address = (rg_name, Helpers.DEFAULT_WORKSPACE_NAME, Helpers.DEFAULT_EXPERIMENT_NAME, job.name)
    self.client.jobs.delete(*job_address).result()
    # A deleted job must not be retrievable any more.
    with self.assertRaises(CloudError):
        self.client.jobs.get(*job_address)
def test_running_job_termination(self, resource_group, location, cluster):
    """Tests that terminating a running job leaves it in the failed state."""
    rg_name = resource_group.name
    job = Helpers.create_custom_job(self.client, rg_name, cluster.id, 'longrunning', 1, 'sleep 600')

    # Wait for the job to actually start before terminating it.
    running_state = Helpers.wait_for_job_start_running(
        self.is_live, self.client, rg_name, job.name, Helpers.MINUTE)
    self.assertEqual(running_state, models.ExecutionState.running)

    termination = self.client.jobs.terminate(
        rg_name, Helpers.DEFAULT_WORKSPACE_NAME, Helpers.DEFAULT_EXPERIMENT_NAME, job.name)
    termination.result()

    # A terminated job is expected to complete as failed.
    final_state = Helpers.wait_for_job_completion(
        self.is_live, self.client, rg_name, job.name, Helpers.MINUTE)
    self.assertEqual(final_state, models.ExecutionState.failed)
def test_queued_job_termination(self, resource_group, location, cluster):
    """Tests that a job can be terminated while it is still queued."""
    rg_name = resource_group.name
    workspace = Helpers.DEFAULT_WORKSPACE_NAME
    experiment = Helpers.DEFAULT_EXPERIMENT_NAME

    # The job stays in the queued state because the cluster has no compute nodes.
    job = Helpers.create_custom_job(self.client, rg_name, cluster.id, 'job', 1, 'true')

    self.client.jobs.terminate(rg_name, workspace, experiment, job.name).result()
    final_state = Helpers.wait_for_job_completion(
        self.is_live, self.client, rg_name, job.name, Helpers.MINUTE)
    self.assertEqual(final_state, models.ExecutionState.failed)

    # Clean up and make sure the job is really gone.
    self.client.jobs.delete(rg_name, workspace, experiment, job.name).result()
    with self.assertRaises(CloudError):
        self.client.jobs.get(rg_name, workspace, experiment, job.name)
def test_failed_job_reporting(self, resource_group, location, cluster):
    """Tests that a failing job reports its exit code and a ``JobFailed`` error."""
    rg_name = resource_group.name
    experiment_scope = (rg_name, Helpers.DEFAULT_WORKSPACE_NAME, Helpers.DEFAULT_EXPERIMENT_NAME)

    # 'false' exits with a non-zero code, so the job is expected to fail.
    job = Helpers.create_custom_job(self.client, rg_name, cluster.id, 'job', 1, 'false')
    final_state = Helpers.wait_for_job_completion(
        self.is_live, self.client, rg_name, job.name, Helpers.MINUTE)
    self.assertEqual(final_state, models.ExecutionState.failed)

    # Re-read the job to inspect the execution information reported by the service.
    job = self.client.jobs.get(*(experiment_scope + (job.name,)))
    execution_info = job.execution_info
    self.assertEqual(execution_info.exit_code, 1)
    self.assertEqual(len(job.execution_info.errors), 1)
    self.assertEqual(job.execution_info.errors[0].code, 'JobFailed')

    job_address = experiment_scope + (job.name,)
    self.client.jobs.delete(*job_address).result()
    with self.assertRaises(CloudError):
        self.client.jobs.get(*job_address)
def test_completed_job_termination(self, resource_group, location, cluster):
    """Tests that terminating an already completed job does not change its state."""
    rg_name = resource_group.name
    workspace = Helpers.DEFAULT_WORKSPACE_NAME
    experiment = Helpers.DEFAULT_EXPERIMENT_NAME

    job = Helpers.create_custom_job(self.client, rg_name, cluster.id, 'job', 1, 'true')
    state_before = Helpers.wait_for_job_completion(
        self.is_live, self.client, rg_name, job.name, Helpers.MINUTE)
    self.assertEqual(state_before, models.ExecutionState.succeeded)

    # Terminating a completed job is a no-op: the execution state must stay 'succeeded'.
    self.client.jobs.terminate(rg_name, workspace, experiment, job.name).result()
    state_after = Helpers.wait_for_job_completion(
        self.is_live, self.client, rg_name, job.name, Helpers.MINUTE)
    self.assertEqual(state_after, models.ExecutionState.succeeded)

    # Clean up and make sure the job is really gone.
    self.client.jobs.delete(rg_name, workspace, experiment, job.name).result()
    with self.assertRaises(CloudError):
        self.client.jobs.get(rg_name, workspace, experiment, job.name)
def test_password_less_ssh(self, resource_group, location, cluster):
    """Tests that hosts can reach each other over ssh without a password."""
    rg_name = resource_group.name
    workspace = Helpers.DEFAULT_WORKSPACE_NAME
    experiment = Helpers.DEFAULT_EXPERIMENT_NAME

    # The command connects to two fixed addresses in turn; each connection prints 'done'.
    ssh_command = 'ssh 10.0.0.4 echo done && ssh 10.0.0.5 echo done'
    job = Helpers.create_custom_job(self.client, rg_name, cluster.id, 'job', 2, ssh_command)
    final_state = Helpers.wait_for_job_completion(
        self.is_live, self.client, rg_name, job.name, Helpers.MINUTE)
    self.assertEqual(final_state, models.ExecutionState.succeeded)

    job = self.client.jobs.get(rg_name, workspace, experiment, job.name)
    # stderr is matched against a pattern rather than compared with a fixed string.
    expected_streams = {
        u'stdout.txt': u'done\ndone\n',
        u'stderr.txt': re.compile('Permanently added.*'),
    }
    Helpers.assert_job_files_are(
        self, self.client, rg_name, job.name,
        Helpers.STANDARD_OUTPUT_DIRECTORY_ID, expected_streams)

    # Clean up and make sure the job is really gone.
    self.client.jobs.delete(rg_name, workspace, experiment, job.name).result()
    with self.assertRaises(CloudError):
        self.client.jobs.get(rg_name, workspace, experiment, job.name)
def test_job_container_preparation_failure_reporting(self, resource_group, location, cluster):
    """Tests that a failing job preparation step is reported as ``JobPreparationFailed``."""
    rg_name = resource_group.name
    experiment_scope = (rg_name, Helpers.DEFAULT_WORKSPACE_NAME, Helpers.DEFAULT_EXPERIMENT_NAME)

    # Submit a job whose preparation fails ('false'), running in an ubuntu container.
    ubuntu_container = models.ContainerSettings(
        image_source_registry=models.ImageSourceRegistry(image='ubuntu'))
    job = Helpers.create_custom_job(
        self.client, rg_name, cluster.id, 'job', 1, 'true', 'false',
        container=ubuntu_container)
    final_state = Helpers.wait_for_job_completion(
        self.is_live, self.client, rg_name, job.name, Helpers.MINUTE)
    self.assertEqual(final_state, models.ExecutionState.failed)

    # Re-read the job to inspect the execution information reported by the service.
    job = self.client.jobs.get(*(experiment_scope + (job.name,)))
    self.assertEqual(job.execution_info.exit_code, 1)
    self.assertEqual(len(job.execution_info.errors), 1)
    first_error = job.execution_info.errors[0]
    self.assertEqual(first_error.code, 'JobPreparationFailed')

    job_address = experiment_scope + (job.name,)
    self.client.jobs.delete(*job_address).result()
    with self.assertRaises(CloudError):
        self.client.jobs.get(*job_address)
def test_job_preparation_host(self, resource_group, location, cluster):
    """Tests that job preparation runs before the job when executing on a host."""
    rg_name = resource_group.name
    workspace = Helpers.DEFAULT_WORKSPACE_NAME
    experiment = Helpers.DEFAULT_EXPERIMENT_NAME

    # Job preparation populates $AZ_BATCHAI_INPUT_INPUT/hi.txt; the job itself prints that file.
    main_command = 'cat $AZ_BATCHAI_INPUT_INPUT/hi.txt'
    preparation_command = (
        'mkdir -p $AZ_BATCHAI_INPUT_INPUT && echo hello | tee $AZ_BATCHAI_INPUT_INPUT/hi.txt')
    job = Helpers.create_custom_job(
        self.client, rg_name, cluster.id, 'job', 1, main_command, preparation_command)

    final_state = Helpers.wait_for_job_completion(
        self.is_live, self.client, rg_name, job.name, Helpers.MINUTE)
    self.assertEqual(final_state, models.ExecutionState.succeeded)

    # Both the job and its preparation step must have produced the greeting and no errors.
    expected_streams = {
        u'stdout.txt': u'hello\n',
        u'stderr.txt': u'',
        u'stdout-job_prep.txt': u'hello\n',
        u'stderr-job_prep.txt': u'',
    }
    Helpers.assert_job_files_are(
        self, self.client, rg_name, job.name,
        Helpers.STANDARD_OUTPUT_DIRECTORY_ID, expected_streams)

    # Clean up and make sure the job is really gone.
    self.client.jobs.delete(rg_name, workspace, experiment, job.name).result()
    with self.assertRaises(CloudError):
        self.client.jobs.get(rg_name, workspace, experiment, job.name)
def test_file_server(self, resource_group, location, storage_account, storage_account_key):
    """Tests file server functionality end to end.

    1. Create a file server.
    2. Create two clusters which mount this file server.
    3. Check that the file server is mounted:
        a. submit tasks (one from host and another from container) on the first cluster
           to write data to nfs;
        b. submit a task on the second cluster to read the data back from nfs.
    4. Delete both clusters, then the file server, and check that no file server is left.
    """
    rg_name = resource_group.name
    workspace = Helpers.DEFAULT_WORKSPACE_NAME
    cluster_names = ('cluster1', 'cluster2')

    # ``assertRegexpMatches`` is a deprecated alias that was removed in Python 3.12, while
    # ``assertRegex`` does not exist on Python 2.7. Prefer the modern name when available.
    assert_regex = getattr(self, 'assertRegex', None) or self.assertRegexpMatches

    def nfs_mount(file_server_id):
        # Build a fresh mount description for every cluster so no object is shared.
        return [
            models.FileServerReference(
                file_server=models.ResourceId(id=file_server_id),
                relative_mount_path='nfs',
                mount_options="rw")
        ]

    def run_to_success(cluster_id, job_name, command, **job_kwargs):
        # Submit a single job and require that it completes successfully within a minute.
        job = Helpers.create_custom_job(
            self.client, rg_name, cluster_id, job_name, 1, command, **job_kwargs)
        self.assertEqual(
            Helpers.wait_for_job_completion(self.is_live, self.client, rg_name, job.name, Helpers.MINUTE),
            models.ExecutionState.succeeded)
        return job

    server = Helpers.create_file_server(
        self.client, location, rg_name, self.file_server_name)  # type: models.FileServer
    # Clusters are created in order, each mounting the same file server under 'nfs'.
    cluster1, cluster2 = [
        Helpers.create_cluster(
            self.client, location, rg_name, name, 'STANDARD_D1', 1,
            storage_account.name, storage_account_key,
            file_servers=nfs_mount(server.id))
        for name in cluster_names
    ]

    # Verify the file server is reported.
    Helpers.assert_existing_file_servers_are(self, self.client, rg_name, [self.file_server_name])

    # Verify the file server becomes available in a reasonable time.
    self.assertTrue(
        Helpers.wait_for_file_server(self.is_live, self.client, rg_name, self.file_server_name,
                                     _FILE_SERVER_CREATION_TIMEOUT_SEC))

    # Verify the remote login information and private ip are reported.
    server = self.client.file_servers.get(rg_name, workspace, self.file_server_name)
    assert_regex(server.mount_settings.file_server_public_ip, Helpers.RE_ID_ADDRESS)
    assert_regex(server.mount_settings.file_server_internal_ip, Helpers.RE_ID_ADDRESS)

    # Verify the clusters allocated nodes successfully.
    for name in cluster_names:
        self.assertEqual(
            Helpers.wait_for_nodes(self.is_live, self.client, rg_name, name, 1,
                                   Helpers.NODE_STARTUP_TIMEOUT_SEC),
            1)

    # Execute publishing tasks on the first cluster: one on the host, one in a container.
    run_to_success(
        cluster1.id, 'host_publisher',
        'echo hi from host > $AZ_BATCHAI_MOUNT_ROOT/nfs/host.txt')
    run_to_success(
        cluster1.id, 'container_publisher',
        'echo hi from container >> $AZ_BATCHAI_MOUNT_ROOT/nfs/container.txt',
        container=models.ContainerSettings(
            image_source_registry=models.ImageSourceRegistry(image="ubuntu")))

    # Execute the consumer task on the second cluster.
    consumer = run_to_success(
        cluster2.id, 'consumer',
        'cat $AZ_BATCHAI_MOUNT_ROOT/nfs/host.txt; '
        'cat $AZ_BATCHAI_MOUNT_ROOT/nfs/container.txt')

    # Verify the data written through the first cluster is visible from the second one.
    Helpers.assert_job_files_are(
        self, self.client, rg_name, consumer.name, Helpers.STANDARD_OUTPUT_DIRECTORY_ID,
        {
            u'stdout.txt': u'hi from host\nhi from container\n',
            u'stderr.txt': ''
        })

    # Delete the clusters before the file server they mount.
    for name in cluster_names:
        self.client.clusters.delete(rg_name, workspace, name).result()

    # Test deletion of the file server itself.
    self.client.file_servers.delete(rg_name, workspace, self.file_server_name).result()
    Helpers.assert_existing_file_servers_are(self, self.client, rg_name, [])