def create_job(batch_service_client, job_id, resource_files, pool_id):
    """
    Creates a job with the specified ID, associated with the specified pool.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param list resource_files: Resource files to stage for the job preparation task.
    :param str pool_id: The ID for the pool.
    """
    print('Creating job [{}]...'.format(job_id))

    job = batch.models.JobAddParameter(
        id=job_id,
        pool_info=batch.models.PoolInformation(pool_id=pool_id),
        job_preparation_task=batchmodels.JobPreparationTask(
            command_line="/bin/bash -c \"\
                sudo apt-get install -y --reinstall make; \
                sudo ./installrpy.sh; \
                tar -xf cmdstan.tar.gz; \
                cd cmdstan; \
                make build; \
                cd -; \
                if sudo test -d ../../cmdstan; then echo stanexist; exit 0; fi; \
                sudo mv * ../../; \
                \"",
            resource_files=resource_files,
            wait_for_success=True,
            user_identity=batchmodels.UserIdentity(user_name="admin"),
        ))

    # batch_service_client.job.add(job)

def create_job(batch_service_client, job_id, resource_files, pool_id):
    """
    Creates a job with the specified ID, associated with the specified pool.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchServiceClient`
    :param str job_id: The ID for the job.
    :param list resource_files: Resource files to stage for the job preparation task.
    :param str pool_id: The ID for the pool.
    """
    print('Creating job [{}]...'.format(job_id))

    job = batch.models.JobAddParameter(
        id=job_id,
        pool_info=batch.models.PoolInformation(pool_id=pool_id),
        job_preparation_task=batchmodels.JobPreparationTask(
            command_line="/bin/bash -c \"sudo apt-get -y update; \
                sudo chmod o+w /usr/local/lib/R/site-library; \
                ./installrpy.sh; \
                tar -xf cmdstan.tar.gz; \
                cd cmdstan; \
                make build; \
                cd -; \
                \"",
            wait_for_success=True,
            resource_files=resource_files,
            user_identity=batchmodels.UserIdentity(user_name="admin"),
        ))

    batch_service_client.job.add(job)

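# Hedged usage sketch for create_job above. The account name, key, endpoint URL,
# and blob SAS URLs are placeholders; the client constructor keyword differs
# between azure-batch versions (batch_url in newer releases, base_url in older
# ones), so adjust to the installed SDK.
import azure.batch as batch
import azure.batch.batch_auth as batch_auth
import azure.batch.models as batchmodels


def example_submit_cmdstan_job():
    credentials = batch_auth.SharedKeyCredentials(
        account_name="mybatchaccount",              # placeholder
        key="REDACTED_BATCH_ACCOUNT_KEY")           # placeholder
    batch_service_client = batch.BatchServiceClient(
        credentials,
        batch_url="https://mybatchaccount.westus.batch.azure.com")  # placeholder

    # Files the job preparation task expects to find in its working directory.
    resource_files = [
        batchmodels.ResourceFile(
            blob_source="https://mystorage.blob.core.windows.net/input/installrpy.sh?<SAS>",
            file_path="installrpy.sh"),
        batchmodels.ResourceFile(
            blob_source="https://mystorage.blob.core.windows.net/input/cmdstan.tar.gz?<SAS>",
            file_path="cmdstan.tar.gz"),
    ]
    create_job(batch_service_client, "cmdstan-job-001", resource_files, "cmdstan-pool")
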
def add_docker_batch_task(batch_client, block_blob_client, job_id, pool_id):
    """Submits a docker task via Batch scheduler

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param block_blob_client: The storage block blob client to use.
    :type block_blob_client: `azure.storage.blob.BlockBlobService`
    :param str job_id: The id of the job to use.
    :param str pool_id: The id of the pool to use.
    :rtype: list
    :return: a list of task_id of the task added.
    """
    task_resource_sas_url = common.helpers.upload_blob_and_create_sas(
        block_blob_client,
        _CONTAINER_NAME,
        _TASK_RESOURCE_FILE,
        _TASK_RESOURCE_FILE_PATH,
        datetime.datetime.utcnow() + datetime.timedelta(hours=1))

    output_container_sas_key = common.helpers.create_container_and_create_sas(
        block_blob_client=block_blob_client,
        container_name=_OUTPUT_CONTAINER_NAME,
        permission=azureblob.ContainerPermissions.WRITE |
        azureblob.ContainerPermissions.LIST,
        expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=1))

    # The start task pulls docker image yidingz/ffmpeg:v3
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id),
        job_preparation_task=batchmodels.JobPreparationTask(
            command_line=_JOB_STARTTASK_CLI,
            run_elevated=True))
    batch_client.job.add(job)

    task_id_list = []
    index = 0
    for url in _INPUT_FILE_URLS:
        filename = urllib.parse.urlsplit(url).path.split('/')[-1]
        parameters = "'{0}' '{1}' '{2}' '{3}'".format(
            url,
            filename,
            output_container_sas_key,
            block_blob_client.account_name)
        # Each task will download a video from channel9,
        # transcode, and upload to specified output container
        task = batchmodels.TaskAddParameter(
            id=str(index).zfill(4) + '_' + filename.split('.')[0],
            command_line=_TASK_CLI.format(_TASK_RESOURCE_FILE,
                                          _FFMPEG_IMAGE,
                                          parameters),
            run_elevated=True,
            resource_files=[
                batchmodels.ResourceFile(
                    file_path=_TASK_RESOURCE_FILE,
                    blob_source=task_resource_sas_url)
            ])
        task_id_list.append(task.id)
        batch_client.task.add(job_id=job_id, task=task)
        index += 1
    return task_id_list

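# Hedged follow-up sketch: after add_docker_batch_task returns its task ids, one
# way to wait for them is to poll task state through the azure-batch SDK. Only
# batch_client.task.list and TaskState are assumed from the SDK; the timeout and
# polling interval are arbitrary illustrative choices.
import time
import azure.batch.models as batchmodels


def wait_for_tasks(batch_client, job_id, task_ids, timeout_minutes=60):
    deadline = time.time() + timeout_minutes * 60
    while time.time() < deadline:
        # Index the job's tasks by id and check whether the ones we submitted finished.
        tasks = {t.id: t for t in batch_client.task.list(job_id)}
        if all(tasks[tid].state == batchmodels.TaskState.completed
               for tid in task_ids if tid in tasks):
            return True
        time.sleep(15)  # poll every 15 seconds
    return False
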
def create_job(batch_client, name_job, name_pool, cmd_prep_task=None):
    """Creates a Batch job on the given pool, optionally with an elevated
    job preparation task."""
    user = models.UserIdentity(auto_user=models.AutoUserSpecification(
        elevation_level=models.ElevationLevel.admin,
        scope=models.AutoUserScope.task))
    # Only attach a preparation task when a command line was actually supplied;
    # a JobPreparationTask without a command line is rejected by the service.
    prepare_task = None
    if cmd_prep_task is not None:
        prepare_task = models.JobPreparationTask(command_line=cmd_prep_task,
                                                 id=None,
                                                 user_identity=user)
    job = models.JobAddParameter(
        id=name_job,
        pool_info=models.PoolInformation(pool_id=name_pool),
        job_preparation_task=prepare_task)
    batch_client.job.add(job)

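# Hedged usage sketch for create_job above; the pool and job names and the
# apt-get preparation command are placeholders chosen only for illustration,
# and batch_client is assumed to be an already-authenticated BatchServiceClient.
def submit_training_job(batch_client):
    prep_cmd = "/bin/bash -c 'apt-get -y update && apt-get -y install build-essential'"
    create_job(batch_client, name_job="train-job-001", name_pool="gpu-pool",
               cmd_prep_task=prep_cmd)
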
def create_job(self, job_preparation_commands=None):
    """Creates a job with the specified ID, associated with the specified pool.

    Args:
        job_preparation_commands: commands as list of strings to run before the
            job starts

    Returns:
        success: True if job could be created successfully, False otherwise.
    """
    if job_preparation_commands is None:
        job = batch_models.JobAddParameter(
            id=self.job_id,
            pool_info=batch_models.PoolInformation(pool_id=self.pool_id))
    else:
        job_prep_task = batch_models.JobPreparationTask(
            command_line=job_preparation_commands,
            wait_for_success=True,
            rerun_on_node_reboot_after_success=True)
        job = batch_models.JobAddParameter(
            id=self.job_id,
            pool_info=batch_models.PoolInformation(pool_id=self.pool_id),
            job_preparation_task=job_prep_task)

    try:
        logging.info('Attempting to create job [{}]...'.format(self.job_id))
        self.batch_client.job.add(job)
        logging.info('Job [{}] created successfully...'.format(self.job_id))
        return True
    except batch_models.batch_error.BatchErrorException as err:
        if err.error.code == "JobExists":
            logging.info("Job [{}] already exists".format(self.job_id))
            return False
        else:
            logging.exception(
                "Unknown error occurred while trying to create job [{}]".format(
                    self.job_id))
            raise

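# Hedged usage sketch for the create_job method above. JobPreparationTask's
# command_line is a single string, so a list of shell commands is collapsed into
# one bash invocation here; `submitter` stands in for whatever object owns this
# method (its class is not shown above), so treat it as a placeholder.
import logging


def submit_with_prep(submitter, prep_commands):
    command_line = "/bin/bash -c '{}'".format(" && ".join(prep_commands))
    created = submitter.create_job(job_preparation_commands=command_line)
    if not created:
        logging.info("Job already existed; reusing it.")
    return created
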
def test_batch_jobs(self, **kwargs):
    client = self.create_sharedkey_client(**kwargs)
    # Test Create Job
    auto_pool = models.AutoPoolSpecification(
        pool_lifetime_option=models.PoolLifetimeOption.job,
        pool=models.PoolSpecification(
            vm_size='small',
            cloud_service_configuration=models.CloudServiceConfiguration(
                os_family='5'
            )
        )
    )
    job_prep = models.JobPreparationTask(command_line="cmd /c \"echo hello world\"")
    job_release = models.JobReleaseTask(command_line="cmd /c \"echo goodbye world\"")
    job_param = models.JobAddParameter(
        id=self.get_resource_name('batch_job1_'),
        pool_info=models.PoolInformation(
            auto_pool_specification=auto_pool
        ),
        job_preparation_task=job_prep,
        job_release_task=job_release
    )
    response = client.job.add(job_param)
    self.assertIsNone(response)

    # Test Update Job
    constraints = models.JobConstraints(max_task_retry_count=3)
    options = models.JobUpdateParameter(
        priority=500,
        constraints=constraints,
        pool_info=models.PoolInformation(
            auto_pool_specification=auto_pool
        )
    )
    response = client.job.update(job_param.id, options)
    self.assertIsNone(response)

    # Test Patch Job
    options = models.JobPatchParameter(priority=900)
    response = client.job.patch(job_param.id, options)
    self.assertIsNone(response)

    job = client.job.get(job_param.id)
    self.assertIsInstance(job, models.CloudJob)
    self.assertEqual(job.id, job_param.id)
    self.assertEqual(job.constraints.max_task_retry_count, 3)
    self.assertEqual(job.priority, 900)

    # Test Create Job with Auto Complete
    job_auto_param = models.JobAddParameter(
        id=self.get_resource_name('batch_job2_'),
        on_all_tasks_complete=models.OnAllTasksComplete.terminate_job,
        on_task_failure=models.OnTaskFailure.perform_exit_options_job_action,
        pool_info=models.PoolInformation(
            auto_pool_specification=auto_pool
        )
    )
    response = client.job.add(job_auto_param)
    self.assertIsNone(response)
    job = client.job.get(job_auto_param.id)
    self.assertIsInstance(job, models.CloudJob)
    self.assertEqual(job.on_all_tasks_complete, models.OnAllTasksComplete.terminate_job)
    self.assertEqual(job.on_task_failure, models.OnTaskFailure.perform_exit_options_job_action)

    # Test List Jobs
    jobs = client.job.list()
    self.assertIsInstance(jobs, models.CloudJobPaged)
    self.assertEqual(len(list(jobs)), 2)

    # Test Disable Job
    response = client.job.disable(job_param.id, models.DisableJobOption.requeue)
    self.assertIsNone(response)

    # Test Enable Job
    response = client.job.enable(job_param.id)
    self.assertIsNone(response)

    # Prep and release task status
    task_status = client.job.list_preparation_and_release_task_status(job_param.id)
    self.assertIsInstance(task_status, models.JobPreparationAndReleaseTaskExecutionInformationPaged)
    self.assertEqual(list(task_status), [])

    # Test Terminate Job
    response = client.job.terminate(job_param.id)
    self.assertIsNone(response)

    # Test Delete Job
    response = client.job.delete(job_auto_param.id)
    self.assertIsNone(response)

    # Test Job Lifetime Statistics
    stats = client.job.get_all_lifetime_statistics()
    self.assertIsInstance(stats, models.JobStatistics)
    self.assertEqual(stats.num_succeeded_tasks, 0)
    self.assertEqual(stats.num_failed_tasks, 0)

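# Hedged sketch of the update-vs-patch distinction exercised in the test above:
# job.update replaces the job's updatable properties as a set (so pool_info has
# to be resupplied), while job.patch changes only the fields that are passed.
# Only client.job.patch and JobPatchParameter (both used in the test) are assumed;
# the function name is illustrative.
import azure.batch.models as models


def bump_priority(client, job_id, new_priority):
    # Patch leaves constraints and pool information untouched.
    client.job.patch(job_id, models.JobPatchParameter(priority=new_priority))
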
def _setup_job(self, distributable_list, pool_id, name, log_writer=None):
    '''
    This is the main method for submitting to AzureBatch.
    '''
    job_id = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S") + "-" + name.replace("_","-").replace("/","-").replace(".","-").replace("+","-").replace("(","").replace(")","")
    job_id_etc_list = []

    if True: # Pickle the things-to-run - put them in a local directory under the current directory called "runs/[jobid]" where the jobid is based on the date.
        if log_writer is not None: log_writer("{0}: Pickle the thing to run".format(name))
        run_dir_rel = os.path.join("runs",job_id)
        pstutil.create_directory_if_necessary(run_dir_rel, isfile=False)
        for index, distributable in enumerate(distributable_list):
            distributablep_filename = os.path.join(run_dir_rel, "distributable{0}.p".format(index))
            with open(distributablep_filename, mode='wb') as f:
                pickle.dump(distributable, f, pickle.HIGHEST_PROTOCOL)

    if True: # Copy (update) any (small) input files to the blob
        if log_writer is not None: log_writer("{0}: Upload small input files".format(name))
        data_blob_fn = "{0}-data-v{1}".format(self.container,self.data_version)
        inputOutputCopier = AzureBatchCopier(data_blob_fn, self.storage_key, self.storage_account_name)
        script_list = ["",""] #These will be scripts for copying to and from AzureStorage and the cluster nodes.
        inputOutputCopier2 = AzureBatchCopierNodeLocal(data_blob_fn, self.container, self.data_version, self.storage_key, self.storage_account_name, script_list)
        for index, distributable in enumerate(distributable_list):
            inputOutputCopier.input(distributable)
            inputOutputCopier2.input(distributable)
            inputOutputCopier2.output(distributable)
            output_blobfn = "{0}/output{1}".format(run_dir_rel.replace("\\","/"),index) #The name of the directory of return values in Azure Storage.
            job_id_etc_list.append((job_id, inputOutputCopier, output_blobfn, run_dir_rel))

    if True: # Create the jobprep program -- sets the python path and downloads the pythonpath code. Also create node-local folder for return values.
        if log_writer is not None: log_writer("{0}: Create jobprep.bat script".format(name))
        localpythonpath = os.environ.get("PYTHONPATH") #!!should it be able to work without pythonpath being set (e.g. if there was just one file)? Also, is None really the return or is it an exception.
        jobprep_filename = os.path.join(run_dir_rel, "jobprep.bat")
        # It only copies down files that are needed, but with some probability (about 1 in 50, say) fails, so we repeat three times.
        with open(jobprep_filename, mode='w') as f2:
            f2.write(r"""set
set path=%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2;%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\scripts\;%path%
for /l %%t in (0,1,3) do FOR /L %%i IN (0,1,{7}) DO python.exe %AZ_BATCH_TASK_WORKING_DIR%\blobxfer.py --skipskip --delete --storageaccountkey {2} --download {3} {4}-pp-v{5}-%%i %AZ_BATCH_NODE_SHARED_DIR%\{4}\pp\v{5}\%%i --remoteresource .
{6}
mkdir %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{8}
exit /b 0
"""
            .format(
                None,                               #0 - not used
                None,                               #1 - not used
                self.storage_key,                   #2
                self.storage_account_name,          #3
                self.container,                     #4
                self.pp_version,                    #5
                script_list[0],                     #6
                len(localpythonpath.split(';'))-1,  #7
                index,                              #8
            ))

    if True: #Split the taskcount roughly evenly among the distributables
        subtaskcount_list = deal(len(distributable_list),self.taskcount)

    if True: # Create the map.bat and reduce.bat programs to run.
        if log_writer is not None: log_writer("{0}: Create map.bat and reduce.bat script".format(name))
        pythonpath_string = "set pythonpath=" + ";".join(r"%AZ_BATCH_NODE_SHARED_DIR%\{0}\pp\v{1}\{2}".format(self.container,self.pp_version,i) for i in range(len(localpythonpath.split(';'))))
        for index in range(len(distributable_list)):
            subtaskcount = subtaskcount_list[index]
            output_blobfn = job_id_etc_list[index][2]
            for i, bat_filename in enumerate(["map{0}.bat".format(index),"reduce{0}.bat".format(index)]):
                bat_filename = os.path.join(run_dir_rel, bat_filename)
                with open(bat_filename, mode='w') as f1:
                    #note that it's getting distributable.py from site-packages and never from the pythonpath
                    f1.write(r"""set path=%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2;%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\scripts\;%path%
mkdir %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{6}cd %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{6}FOR /L %%i IN (0,1,{11}) DO python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --download {3} {8}/{10} . --remoteresource %%i.{0}.p
cd %AZ_BATCH_NODE_SHARED_DIR%\{8}\data\v{9}
{13}
python.exe %AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\Lib\site-packages\fastlmm\util\distributable.py %AZ_BATCH_JOB_PREP_WORKING_DIR%\distributable{14}.p LocalInParts(%1,{0},result_file=r\"{4}/result.p\",mkl_num_threads={1},temp_dir=r\"{4}\")
IF %ERRORLEVEL% NEQ 0 (EXIT /B %ERRORLEVEL%)
{6}{7}
cd %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{5}for /l %%t in (0,1,3) do python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --upload {3} {8} %1.{0}.p --remoteresource {10}/%1.{0}.p
{6}for /l %%t in (0,1,3) do python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --upload {3} {8} result.p --remoteresource {10}/result.p
"""
                    .format(
                        subtaskcount,               #0
                        self.mkl_num_threads,       #1
                        self.storage_key,           #2
                        self.storage_account_name,  #3
                        "%AZ_BATCH_TASK_WORKING_DIR%/../../output{0}".format(index), #4
                        "" if i==0 else "@rem ",    #5
                        "" if i==1 else "@rem ",    #6
                        script_list[1],             #7
                        self.container,             #8
                        self.data_version,          #9
                        output_blobfn,              #10
                        subtaskcount-1,             #11
                        self.pp_version,            #12
                        pythonpath_string,          #13
                        index,                      #14
                    ))

    if True: # Upload the thing-to-run to a blob and the blobxfer program
        if log_writer is not None: log_writer("{0}: Upload the thing to run".format(name))
        block_blob_client = azureblob.BlockBlobService(account_name=self.storage_account_name,account_key=self.storage_key)
        block_blob_client.create_container(self.container, fail_on_exist=False)

        blobxfer_blobfn = "utils/v{}/blobxfer.py".format(self.utils_version)
        blobxfer_url = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, blobxfer_blobfn, os.path.join(os.path.dirname(__file__),"blobxfer.py"), datetime.datetime.utcnow() + datetime.timedelta(days=30))

        jobprep_blobfn = "{}/jobprep.bat".format(run_dir_rel.replace("\\","/"))
        jobprepbat_url = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, jobprep_blobfn, os.path.join(run_dir_rel, "jobprep.bat"), datetime.datetime.utcnow() + datetime.timedelta(days=30))

        map_reduce_url_list = []
        for index in range(len(distributable_list)):
            distributablep_blobfn = "{0}/distributable{1}.p".format(run_dir_rel.replace("\\","/"),index)
            distributablep_filename = os.path.join(run_dir_rel, "distributable{0}.p".format(index))
            distributablep_url = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, distributablep_blobfn, distributablep_filename, datetime.datetime.utcnow() + datetime.timedelta(days=30)) #!!!should there be an expiry?

            map_blobfn = "{0}/map{1}.bat".format(run_dir_rel.replace("\\","/"),index)
            map_url = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, map_blobfn, os.path.join(run_dir_rel, "map{0}.bat".format(index)), datetime.datetime.utcnow() + datetime.timedelta(days=30))

            reduce_blobfn = "{0}/reduce{1}.bat".format(run_dir_rel.replace("\\","/"),index)
            reduce_url = commonhelpers.upload_blob_and_create_sas(block_blob_client, self.container, reduce_blobfn, os.path.join(run_dir_rel, "reduce{0}.bat".format(index)), datetime.datetime.utcnow() + datetime.timedelta(days=30))
            map_reduce_url_list.append((map_url,reduce_url,distributablep_url))

    if True: # Copy everything on PYTHONPATH to a blob
        if log_writer is not None: log_writer("{0}: Upload items on pythonpath as requested".format(name))
        if self.update_python_path == 'every_time':
            self._update_python_path_function()

    if True: # Create a job with a job prep task
        if log_writer is not None: log_writer("{0}: Create jobprep.bat".format(name))
        resource_files=[
            batchmodels.ResourceFile(blob_source=blobxfer_url, file_path="blobxfer.py"),
            batchmodels.ResourceFile(blob_source=jobprepbat_url, file_path="jobprep.bat")]
        for index in range(len(distributable_list)):
            _, _, distributablep_url = map_reduce_url_list[index]
            resource_files.append(batchmodels.ResourceFile(blob_source=distributablep_url, file_path="distributable{0}.p".format(index)))

        job_preparation_task = batchmodels.JobPreparationTask(
            id="jobprep",
            #run_elevated=True,
            user_identity=batchmodels.UserIdentity(auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
            resource_files=resource_files,
            command_line="jobprep.bat",
            )

        job = batchmodels.JobAddParameter(
            id=job_id,
            job_preparation_task=job_preparation_task,
            pool_info=batch.models.PoolInformation(pool_id=pool_id),
            uses_task_dependencies=True,
            on_task_failure='performExitOptionsJobAction',
            )
        try:
            self.batch_client.job.add(job)
        except batchmodels.BatchErrorException as e:
            if e.inner_exception.values is not None:
                raise Exception(e.inner_exception.values[-1].value)
            else:
                raise Exception(e.inner_exception)

    if True: # Add regular tasks to the job
        if log_writer is not None: log_writer("{0}: Add tasks to job".format(name))
        task_factor = int(10**math.ceil(math.log(max(subtaskcount_list),10))) #When we have multiple distributables, this helps us number them e.g. 0,1,2,10,11,12,20,21,22
        task_list = []
        for index in range(len(distributable_list)):
            start = len(task_list)
            map_url, reduce_url, _ = map_reduce_url_list[index]
            subtaskcount = subtaskcount_list[index]
            for taskindex in range(subtaskcount):
                map_task = batchmodels.TaskAddParameter(
                    id=index * task_factor + taskindex,
                    #run_elevated=True,
                    user_identity=batchmodels.UserIdentity(auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
                    #!!! seems to exit without needing a failure
                    exit_conditions = batchmodels.ExitConditions(default=batchmodels.ExitOptions(job_action='terminate')),
                    resource_files=[batchmodels.ResourceFile(blob_source=map_url, file_path="map{0}.bat".format(index))],
                    command_line=r"map{0}.bat {1}".format(index, taskindex),
                )
                task_list.append(map_task)
            end = len(task_list)-1
            reduce_task = batchmodels.TaskAddParameter(
                id="reduce{0}".format(index),
                #run_elevated=True,
                user_identity=batchmodels.UserIdentity(auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
                resource_files=[batchmodels.ResourceFile(blob_source=reduce_url, file_path="reduce{0}.bat".format(index))],
                command_line=r"reduce{0}.bat {1}".format(index, subtaskcount),
                depends_on = batchmodels.TaskDependencies(task_id_ranges=[batchmodels.TaskIdRange(task_list[start].id,task_list[end].id)]),
                )
            task_list.append(reduce_task)

        try:
            for i in range(0,len(task_list),100): #The Python API only lets us add 100 at a time.
                self.batch_client.task.add_collection(job_id, task_list[i:i+100])
        except Exception as exception:
            print(exception)
            raise exception

    return job_id_etc_list

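# Hedged follow-up sketch: checking whether the jobprep.bat preparation task set
# up by _setup_job succeeded on each node before relying on its map/reduce tasks.
# Only list_preparation_and_release_task_status (also exercised in the test
# earlier) is assumed from the azure-batch SDK; job_id comes from the tuples
# returned by _setup_job, and the function name is illustrative.
def jobprep_failures(batch_client, job_id_etc_list):
    job_id = job_id_etc_list[0][0]   # (job_id, copier, output_blobfn, run_dir_rel)
    failures = []
    for status in batch_client.job.list_preparation_and_release_task_status(job_id):
        info = status.job_preparation_task_execution_info
        # A non-zero exit code means the preparation task failed on that node.
        if info is not None and info.exit_code not in (None, 0):
            failures.append(status.node_id)
    return failures
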