def create_post_processing_task(container_settings, job_id, tasks):
    post_processing_task = batchmodels.TaskAddParameter(
        id="postprocessing",
        command_line=f'/bin/sh -c "cat {job_id}/output/*.csv"',
        # Oddly, using storage_container_url doesn't work here,
        # but the storage container name does.
        resource_files=[
            batchmodels.ResourceFile(
                auto_storage_container_name=_CONTAINER_NAME,
                blob_prefix=f"{job_id}/output/",
                file_path="")
        ],
        container_settings=container_settings,
        depends_on=batchmodels.TaskDependencies(
            task_ids=[task.id for task in tasks]))
    return post_processing_task
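# Minimal usage sketch for create_post_processing_task (illustrative only):
# `batch_client`, `container_settings`, `job_id`, and the previously submitted
# `tasks` list are assumed to exist in the calling code, and the job must have
# been created with uses_task_dependencies=True for the dependency to apply.
post_task = create_post_processing_task(container_settings, job_id, tasks)
batch_client.task.add(job_id=job_id, task=post_task)  # runs only after all listed tasks complete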
def create_merge_task(frame, task_id, job_id, depend_start, depend_end):
    """
    Azure Batch task that executes the ImageMagick `convert` command line to
    combine all of the output tiles into a single output image. This task uses
    the task dependency model to make sure it doesn't execute before its
    dependent tasks have completed. This way we know all of the output image
    tiles will exist.

    :param frame: Frame number of the scene that this merge task is processing.
    :type frame: int
    :param task_id: Identifier of the task.
    :type task_id: str
    :param job_id: Unique identifier of the job. Job identifiers are unique
     within a single Azure Batch account.
    :type job_id: str
    :param depend_start: First task id of the dependency sequence. If each
     frame is split into 16 tiles, then every 17th task will be a merge task
     and that merge task will be dependent on the preceding 16 tasks.
     Tile tasks 1 - 16, then merge, then tiles 18 - 33, then merge, etc.
    :type depend_start: int
    :param depend_end: Final task id of the dependency sequence. The
     explanation for param `depend_start` applies here as well.
    :type depend_end: int
    """
    x_tiles = int(os.environ["X_TILES"])
    y_tiles = int(os.environ["Y_TILES"])
    output_sas = os.environ["OUTPUT_CONTAINER_SAS"]
    working_dir = os.environ["AZ_BATCH_TASK_WORKING_DIR"]
    output_format = os.environ["OUTPUT_FORMAT"]
    print("working_dir: {}".format(working_dir))

    # "Crop to border" means we need to use montage to tile the images. False
    # means we can use convert -flatten to layer the images with transparent
    # backgrounds. convert is faster but needs RGBA.
    crop = os.environ["CROP_TO_BORDER"].lower()
    if crop == "true":
        command_line = montage_command(frame, x_tiles, y_tiles, output_format)
    else:
        command_line = convert_command(frame, output_format)

    print("merge task command line: {}".format(command_line))
    return models.TaskAddParameter(
        id=pad_number(task_id, PAD_LEN_ID),
        display_name="frame: {} - merge task".format(frame),
        command_line=os_specific_command_line(command_line),
        constraints=models.TaskConstraints(max_task_retry_count=2),
        environment_settings=[
            models.EnvironmentSetting(name="X_TILES", value=str(x_tiles)),
            models.EnvironmentSetting(name="Y_TILES", value=str(y_tiles))
        ],
        depends_on=models.TaskDependencies(
            task_ids=get_dependent_tasks(depend_start, depend_end)),
        resource_files=get_resource_files(x_tiles, y_tiles, frame),
        output_files=[
            models.OutputFile(
                file_pattern="../stdout.txt",
                destination=models.OutputFileDestination(
                    container=models.OutputFileBlobContainerDestination(
                        container_url=output_sas,
                        path="{}/logs/frame-{}/merge.stdout.log".format(
                            job_id, pad_number(frame, PAD_LEN_FRAME)))),
                upload_options=models.OutputFileUploadOptions(
                    upload_condition=models.OutputFileUploadCondition.task_completion)),
            models.OutputFile(
                file_pattern="../stderr.txt",
                destination=models.OutputFileDestination(
                    container=models.OutputFileBlobContainerDestination(
                        container_url=output_sas,
                        path="{}/logs/frame-{}/merge.stderr.log".format(
                            job_id, pad_number(frame, PAD_LEN_FRAME)))),
                upload_options=models.OutputFileUploadOptions(
                    upload_condition=models.OutputFileUploadCondition.task_completion)),
            models.OutputFile(
                file_pattern="frame_*",
                destination=models.OutputFileDestination(
                    container=models.OutputFileBlobContainerDestination(
                        container_url=output_sas,
                        path="{}/outputs/final".format(job_id))),
                upload_options=models.OutputFileUploadOptions(
                    upload_condition=models.OutputFileUploadCondition.task_success))
        ])
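# Illustrative only: one way the merge task might be interleaved with the
# per-tile render tasks. `create_tile_task`, `frame_count`, `x_tiles`,
# `y_tiles`, `job_id`, and `batch_client` are hypothetical/placeholder names,
# not part of the snippet above.
tasks = []
for frame in range(1, frame_count + 1):
    depend_start = len(tasks) + 1
    for tile in range(x_tiles * y_tiles):
        tasks.append(create_tile_task(frame, task_id=len(tasks) + 1, job_id=job_id, tile=tile))
    depend_end = len(tasks)
    tasks.append(create_merge_task(frame, len(tasks) + 1, job_id, depend_start, depend_end))
for i in range(0, len(tasks), 100):  # the service accepts at most 100 tasks per add_collection call
    batch_client.task.add_collection(job_id, tasks[i:i + 100])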
def _setup_job(self, distributable_list, pool_id, name, log_writer=None):
    '''
    This is the main method for submitting to AzureBatch.
    '''
    job_id = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S") + "-" + name.replace("_", "-").replace(
        "/", "-").replace(".", "-").replace("+", "-").replace("(", "").replace(")", "")
    job_id_etc_list = []

    if True:  # Pickle the things-to-run - put them in a local directory under the current directory called "runs/[jobid]" where the jobid is based on the date.
        if log_writer is not None:
            log_writer("{0}: Pickle the thing to run".format(name))
        run_dir_rel = os.path.join("runs", job_id)
        pstutil.create_directory_if_necessary(run_dir_rel, isfile=False)
        for index, distributable in enumerate(distributable_list):
            distributablep_filename = os.path.join(run_dir_rel, "distributable{0}.p".format(index))
            with open(distributablep_filename, mode='wb') as f:
                pickle.dump(distributable, f, pickle.HIGHEST_PROTOCOL)

    if True:  # Copy (update) any (small) input files to the blob
        if log_writer is not None:
            log_writer("{0}: Upload small input files".format(name))
        data_blob_fn = "{0}-data-v{1}".format(self.container, self.data_version)
        inputOutputCopier = AzureBatchCopier(data_blob_fn, self.storage_key, self.storage_account_name)
        script_list = ["", ""]  # These will be scripts for copying to and from AzureStorage and the cluster nodes.
        inputOutputCopier2 = AzureBatchCopierNodeLocal(data_blob_fn, self.container, self.data_version,
                                                       self.storage_key, self.storage_account_name, script_list)
        for index, distributable in enumerate(distributable_list):
            inputOutputCopier.input(distributable)
            inputOutputCopier2.input(distributable)
            inputOutputCopier2.output(distributable)
            output_blobfn = "{0}/output{1}".format(run_dir_rel.replace("\\", "/"), index)  # The name of the directory of return values in Azure Storage.
            job_id_etc_list.append((job_id, inputOutputCopier, output_blobfn, run_dir_rel))

    if True:  # Create the jobprep program -- sets the python path and downloads the pythonpath code. Also create node-local folder for return values.
        if log_writer is not None:
            log_writer("{0}: Create jobprep.bat script".format(name))
        localpythonpath = os.environ.get("PYTHONPATH")  # !!should it be able to work without pythonpath being set (e.g. if there was just one file)? Also, is None really the return or is it an exception.
        jobprep_filename = os.path.join(run_dir_rel, "jobprep.bat")
        # It only copies down files that are needed, but with some probability (about 1 in 50, say) fails, so we repeat three times.
        with open(jobprep_filename, mode='w') as f2:
            f2.write(r"""set
set path=%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2;%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\scripts\;%path%
for /l %%t in (0,1,3) do FOR /L %%i IN (0,1,{7}) DO python.exe %AZ_BATCH_TASK_WORKING_DIR%\blobxfer.py --skipskip --delete --storageaccountkey {2} --download {3} {4}-pp-v{5}-%%i %AZ_BATCH_NODE_SHARED_DIR%\{4}\pp\v{5}\%%i --remoteresource .
{6}
mkdir %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{8}
exit /b 0
""".format(
                None,                                 # 0 - not used
                None,                                 # 1 - not used
                self.storage_key,                     # 2
                self.storage_account_name,            # 3
                self.container,                       # 4
                self.pp_version,                      # 5
                script_list[0],                       # 6
                len(localpythonpath.split(';')) - 1,  # 7
                index,                                # 8
            ))

    if True:  # Split the taskcount roughly evenly among the distributables
        subtaskcount_list = deal(len(distributable_list), self.taskcount)

    if True:  # Create the map.bat and reduce.bat programs to run.
        if log_writer is not None:
            log_writer("{0}: Create map.bat and reduce.bat script".format(name))
        pythonpath_string = "set pythonpath=" + ";".join(
            r"%AZ_BATCH_NODE_SHARED_DIR%\{0}\pp\v{1}\{2}".format(self.container, self.pp_version, i)
            for i in range(len(localpythonpath.split(';'))))
        for index in range(len(distributable_list)):
            subtaskcount = subtaskcount_list[index]
            output_blobfn = job_id_etc_list[index][2]
            for i, bat_filename in enumerate(["map{0}.bat".format(index), "reduce{0}.bat".format(index)]):
                bat_filename = os.path.join(run_dir_rel, bat_filename)
                with open(bat_filename, mode='w') as f1:
                    # Note that it's getting distributable.py from site-packages and never from the pythonpath.
                    f1.write(r"""set path=%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2;%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\scripts\;%path%
mkdir %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{6}cd %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{6}FOR /L %%i IN (0,1,{11}) DO python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --download {3} {8}/{10} . --remoteresource %%i.{0}.p
cd %AZ_BATCH_NODE_SHARED_DIR%\{8}\data\v{9}
{13}
python.exe %AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\Lib\site-packages\fastlmm\util\distributable.py %AZ_BATCH_JOB_PREP_WORKING_DIR%\distributable{14}.p LocalInParts(%1,{0},result_file=r\"{4}/result.p\",mkl_num_threads={1},temp_dir=r\"{4}\")
IF %ERRORLEVEL% NEQ 0 (EXIT /B %ERRORLEVEL%)
{6}{7}
cd %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{5}for /l %%t in (0,1,3) do python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --upload {3} {8} %1.{0}.p --remoteresource {10}/%1.{0}.p
{6}for /l %%t in (0,1,3) do python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --upload {3} {8} result.p --remoteresource {10}/result.p
""".format(
                        subtaskcount,                # 0
                        self.mkl_num_threads,        # 1
                        self.storage_key,            # 2
                        self.storage_account_name,   # 3
                        "%AZ_BATCH_TASK_WORKING_DIR%/../../output{0}".format(index),  # 4
                        "" if i == 0 else "@rem ",   # 5
                        "" if i == 1 else "@rem ",   # 6
                        script_list[1],              # 7
                        self.container,              # 8
                        self.data_version,           # 9
                        output_blobfn,               # 10
                        subtaskcount - 1,            # 11
                        self.pp_version,             # 12
                        pythonpath_string,           # 13
                        index,                       # 14
                    ))

    if True:  # Upload the thing-to-run to a blob and the blobxfer program
        if log_writer is not None:
            log_writer("{0}: Upload the thing to run".format(name))
        block_blob_client = azureblob.BlockBlobService(account_name=self.storage_account_name,
                                                       account_key=self.storage_key)
        block_blob_client.create_container(self.container, fail_on_exist=False)

        blobxfer_blobfn = "utils/v{}/blobxfer.py".format(self.utils_version)
        blobxfer_url = commonhelpers.upload_blob_and_create_sas(
            block_blob_client, self.container, blobxfer_blobfn,
            os.path.join(os.path.dirname(__file__), "blobxfer.py"),
            datetime.datetime.utcnow() + datetime.timedelta(days=30))

        jobprep_blobfn = "{}/jobprep.bat".format(run_dir_rel.replace("\\", "/"))
        jobprepbat_url = commonhelpers.upload_blob_and_create_sas(
            block_blob_client, self.container, jobprep_blobfn,
            os.path.join(run_dir_rel, "jobprep.bat"),
            datetime.datetime.utcnow() + datetime.timedelta(days=30))

        map_reduce_url_list = []
        for index in range(len(distributable_list)):
            distributablep_blobfn = "{0}/distributable{1}.p".format(run_dir_rel.replace("\\", "/"), index)
            distributablep_filename = os.path.join(run_dir_rel, "distributable{0}.p".format(index))
            distributablep_url = commonhelpers.upload_blob_and_create_sas(
                block_blob_client, self.container, distributablep_blobfn, distributablep_filename,
                datetime.datetime.utcnow() + datetime.timedelta(days=30))  # !!!should there be an expiry?

            map_blobfn = "{0}/map{1}.bat".format(run_dir_rel.replace("\\", "/"), index)
            map_url = commonhelpers.upload_blob_and_create_sas(
                block_blob_client, self.container, map_blobfn,
                os.path.join(run_dir_rel, "map{0}.bat".format(index)),
                datetime.datetime.utcnow() + datetime.timedelta(days=30))

            reduce_blobfn = "{0}/reduce{1}.bat".format(run_dir_rel.replace("\\", "/"), index)
            reduce_url = commonhelpers.upload_blob_and_create_sas(
                block_blob_client, self.container, reduce_blobfn,
                os.path.join(run_dir_rel, "reduce{0}.bat".format(index)),
                datetime.datetime.utcnow() + datetime.timedelta(days=30))

            map_reduce_url_list.append((map_url, reduce_url, distributablep_url))

    if True:  # Copy everything on PYTHONPATH to a blob
        if log_writer is not None:
            log_writer("{0}: Upload items on pythonpath as requested".format(name))
        if self.update_python_path == 'every_time':
            self._update_python_path_function()

    if True:  # Create a job with a job prep task
        if log_writer is not None:
            log_writer("{0}: Create jobprep.bat".format(name))
        resource_files = [
            batchmodels.ResourceFile(blob_source=blobxfer_url, file_path="blobxfer.py"),
            batchmodels.ResourceFile(blob_source=jobprepbat_url, file_path="jobprep.bat")]
        for index in range(len(distributable_list)):
            _, _, distributablep_url = map_reduce_url_list[index]
            resource_files.append(
                batchmodels.ResourceFile(blob_source=distributablep_url,
                                         file_path="distributable{0}.p".format(index)))

        job_preparation_task = batchmodels.JobPreparationTask(
            id="jobprep",
            # run_elevated=True,
            user_identity=batchmodels.UserIdentity(
                auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
            resource_files=resource_files,
            command_line="jobprep.bat",
        )

        job = batchmodels.JobAddParameter(
            id=job_id,
            job_preparation_task=job_preparation_task,
            pool_info=batch.models.PoolInformation(pool_id=pool_id),
            uses_task_dependencies=True,
            on_task_failure='performExitOptionsJobAction',
        )
        try:
            self.batch_client.job.add(job)
        except batchmodels.BatchErrorException as e:
            if e.inner_exception.values is not None:
                raise Exception(e.inner_exception.values[-1].value)
            else:
                raise Exception(e.inner_exception)

    if True:  # Add regular tasks to the job
        if log_writer is not None:
            log_writer("{0}: Add tasks to job".format(name))
        task_factor = int(10**math.ceil(math.log(max(subtaskcount_list), 10)))  # When we have multiple distributables, this helps us number them e.g. 0,1,2,10,11,12,20,21,22
        task_list = []
        for index in range(len(distributable_list)):
            start = len(task_list)
            map_url, reduce_url, _ = map_reduce_url_list[index]
            subtaskcount = subtaskcount_list[index]
            for taskindex in range(subtaskcount):
                map_task = batchmodels.TaskAddParameter(
                    id=index * task_factor + taskindex,
                    # run_elevated=True,
                    user_identity=batchmodels.UserIdentity(
                        auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
                    # !!! seems to exit without needing a failure
                    exit_conditions=batchmodels.ExitConditions(
                        default=batchmodels.ExitOptions(job_action='terminate')),
                    resource_files=[
                        batchmodels.ResourceFile(blob_source=map_url,
                                                 file_path="map{0}.bat".format(index))],
                    command_line=r"map{0}.bat {1}".format(index, taskindex),
                )
                task_list.append(map_task)
            end = len(task_list) - 1

            reduce_task = batchmodels.TaskAddParameter(
                id="reduce{0}".format(index),
                # run_elevated=True,
                user_identity=batchmodels.UserIdentity(
                    auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
                resource_files=[
                    batchmodels.ResourceFile(blob_source=reduce_url,
                                             file_path="reduce{0}.bat".format(index))],
                command_line=r"reduce{0}.bat {1}".format(index, subtaskcount),
                depends_on=batchmodels.TaskDependencies(
                    task_id_ranges=[batchmodels.TaskIdRange(task_list[start].id, task_list[end].id)])
            )
            task_list.append(reduce_task)

        try:
            for i in range(0, len(task_list), 100):  # The Python API only lets us add 100 at a time.
                self.batch_client.task.add_collection(job_id, task_list[i:i + 100])
        except Exception as exception:
            print(exception)
            raise exception

    return job_id_etc_list
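# The `deal` helper called above (to split self.taskcount across the
# distributables) is not shown in this snippet. Below is a hypothetical sketch
# of the behavior the call site relies on: a list of len(distributable_list)
# integer shares that sum to the total task count.
def deal(group_count, total):
    """E.g. deal(3, 10) -> [4, 3, 3]; shares differ by at most one."""
    base, remainder = divmod(total, group_count)
    return [base + (1 if i < remainder else 0) for i in range(group_count)]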
class AzureBatch:  # implements Irunner
    '''
    A class that implements the Irunner interface that map_reduce uses. It lets one run map_reduce
    work on an Azure Batch account.

    **Constructor:**
        :Parameters: * **task_count** (*integer*) -- The number of tasks in the AzureBatch job.
                     * **pool_id_list** (*list of strings*) -- A list of names of the AzureBatch pool(s) in which to run.
                     * **with_one_pool** (*bool*) -- (default True) Run two-level map_reduceX as a single AzureBatch job,
                       otherwise run each top-level value as its own job.
                     * **tree_scale_list** (*list of pairs*) -- (default None) If given, this is a list the same size as
                       pool_id_list. For each pool_id, it gives a pair: an :class:`AzureP2P` and a string file name.
                       While the job is running, a monitor program will watch the AzureP2P and scale the number of nodes
                       to no more than three times the number of peer-to-peer copies of the file. As the tasks in the
                       job finish, the monitor program will scale the number of nodes down to the number of remaining
                       tasks.
                     * **max_node_count_list** (*list of integers*) -- (default None) If given, limits the maximum number
                       of nodes in each pool.
                     * **mkl_num_threads** (*integer*) -- (default: the number of processors on the node) Limit the
                       number of MKL threads used on a node.
                     * **update_python_path** ('once' [default], 'no', 'every_time') -- How often to transfer the code on
                       the python_path to the nodes.
                     * **max_stderr_count** (*integer*) -- If some tasks fail, the maximum number of stderr files to
                       display. Defaults to 5.
                     * **storage_credential** (*StorageCredential*) -- AzureBatch and AzureStorage credentials. If not
                       given, created from ~/azurebatch/cred.txt.
                     * **storage_account_name** (*string*) -- Name of the Azure storage account used to store run
                       information. Defaults to the first account listed in the cred.txt file.
                     * **show_log_diffs** (*bool*) -- (default True) If True, an in-place log message will do a carriage
                       return when the message changes.
    '''

    def __init__(self,
                 task_count,
                 pool_id_list,
                 with_one_pool=True,
                 tree_scale_list=None,
                 max_node_count_list=None,
                 mkl_num_threads=None,
                 update_python_path="once",
                 max_stderr_count=5,
                 storage_credential=None,
                 storage_account_name=None,
                 show_log_diffs=True,
                 logging_handler=logging.StreamHandler(sys.stdout)):
        logger = logging.getLogger()  # !!! similar code elsewhere
        if not logger.handlers:
            logger.setLevel(logging.INFO)
        for h in list(logger.handlers):
            logger.removeHandler(h)
        if logger.level == logging.NOTSET:
            logger.setLevel(logging.INFO)
        logger.addHandler(logging_handler)

        self.taskcount = task_count
        self.with_one_pool = with_one_pool
        self.tree_scale_list = tree_scale_list
        self.mkl_num_threads = mkl_num_threads
        self.pool_id_list = pool_id_list
        self.update_python_path = update_python_path
        self.show_log_diffs = show_log_diffs
        self.container = "mapreduce3"  # !!!make this an option
        self.utils_version = 3  # !!!make this an option
        self.pp_version = 3  # !!!make this an option
        self.data_version = 3  # !!!make this an option

        if storage_credential is None or isinstance(storage_credential, str):
            from onemil.azure_copy import StorageCredential
            storage_credential = StorageCredential(storage_credential)
        self.storage_credential = storage_credential
        self.storage_account_name = storage_account_name or self.storage_credential.storage_account_name_list[0]
        self.storage_key = storage_credential._account_name_to_key[self.storage_account_name]
        self.batch_client = storage_credential.batch_client()

        from onemil.monitor import Real
        self.world = Real(pool_id_list, tree_scale_list, max_node_count_list, self.batch_client,
                          max_stderr_count=max_stderr_count)

        choices = ['once', 'every_time', 'no']
        assert update_python_path in choices, "Expect update_python_path to be {0}".format(
            ",".join(["'{0}'".format(item) for item in choices]))
        if update_python_path == 'once':
            self._update_python_path_function()

    def _update_python_path_function(self):
        localpythonpath = os.environ.get("PYTHONPATH")  # !!should it be able to work without pythonpath being set (e.g. if there was just one file)? Also, is None really the return or is it an exception.
        if localpythonpath is None:
            raise Exception("Expect local machine to have 'pythonpath' set")
        for i, localpathpart in enumerate(localpythonpath.split(';')):
            logging.info("Updating code on pythonpath as needed: {0}".format(localpathpart))
            blobxfer(
                r"blobxfer.py --skipskip --delete --storageaccountkey {0} --upload {1} {2}-pp-v{3}-{4} .".format(
                    self.storage_key,           # 0
                    self.storage_account_name,  # 1
                    self.container,             # 2
                    self.pp_version,            # 3
                    i,                          # 4
                ),
                wd=localpathpart)

    def _setup_job(self, distributable_list, pool_id, name, log_writer=None):
        '''
        This is the main method for submitting to AzureBatch.
        '''
        job_id = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S") + "-" + name.replace("_", "-").replace(
            "/", "-").replace(".", "-").replace("+", "-").replace("(", "").replace(")", "")
        job_id_etc_list = []

        if True:  # Pickle the things-to-run - put them in a local directory under the current directory called "runs/[jobid]" where the jobid is based on the date.
            if log_writer is not None:
                log_writer("{0}: Pickle the thing to run".format(name))
            run_dir_rel = os.path.join("runs", job_id)
            pstutil.create_directory_if_necessary(run_dir_rel, isfile=False)
            for index, distributable in enumerate(distributable_list):
                distributablep_filename = os.path.join(run_dir_rel, "distributable{0}.p".format(index))
                with open(distributablep_filename, mode='wb') as f:
                    pickle.dump(distributable, f, pickle.HIGHEST_PROTOCOL)

        if True:  # Copy (update) any (small) input files to the blob
            if log_writer is not None:
                log_writer("{0}: Upload small input files".format(name))
            data_blob_fn = "{0}-data-v{1}".format(self.container, self.data_version)
            inputOutputCopier = AzureBatchCopier(data_blob_fn, self.storage_key, self.storage_account_name)
            script_list = ["", ""]  # These will be scripts for copying to and from AzureStorage and the cluster nodes.
            inputOutputCopier2 = AzureBatchCopierNodeLocal(data_blob_fn, self.container, self.data_version,
                                                           self.storage_key, self.storage_account_name, script_list)
            for index, distributable in enumerate(distributable_list):
                inputOutputCopier.input(distributable)
                inputOutputCopier2.input(distributable)
                inputOutputCopier2.output(distributable)
                output_blobfn = "{0}/output{1}".format(run_dir_rel.replace("\\", "/"), index)  # The name of the directory of return values in Azure Storage.
                job_id_etc_list.append((job_id, inputOutputCopier, output_blobfn, run_dir_rel))

        if True:  # Create the jobprep program -- sets the python path and downloads the pythonpath code. Also create node-local folder for return values.
            if log_writer is not None:
                log_writer("{0}: Create jobprep.bat script".format(name))
            localpythonpath = os.environ.get("PYTHONPATH")  # !!should it be able to work without pythonpath being set (e.g. if there was just one file)? Also, is None really the return or is it an exception.
            jobprep_filename = os.path.join(run_dir_rel, "jobprep.bat")
            # It only copies down files that are needed, but with some probability (about 1 in 50, say) fails, so we repeat three times.
            with open(jobprep_filename, mode='w') as f2:
                f2.write(r"""set
set path=%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2;%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\scripts\;%path%
for /l %%t in (0,1,3) do FOR /L %%i IN (0,1,{7}) DO python.exe %AZ_BATCH_TASK_WORKING_DIR%\blobxfer.py --skipskip --delete --storageaccountkey {2} --download {3} {4}-pp-v{5}-%%i %AZ_BATCH_NODE_SHARED_DIR%\{4}\pp\v{5}\%%i --remoteresource .
{6}
mkdir %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{8}
exit /b 0
""".format(
                    None,                                 # 0 - not used
                    None,                                 # 1 - not used
                    self.storage_key,                     # 2
                    self.storage_account_name,            # 3
                    self.container,                       # 4
                    self.pp_version,                      # 5
                    script_list[0],                       # 6
                    len(localpythonpath.split(';')) - 1,  # 7
                    index,                                # 8
                ))

        if True:  # Split the taskcount roughly evenly among the distributables
            subtaskcount_list = deal(len(distributable_list), self.taskcount)

        if True:  # Create the map.bat and reduce.bat programs to run.
            if log_writer is not None:
                log_writer("{0}: Create map.bat and reduce.bat script".format(name))
            pythonpath_string = "set pythonpath=" + ";".join(
                r"%AZ_BATCH_NODE_SHARED_DIR%\{0}\pp\v{1}\{2}".format(self.container, self.pp_version, i)
                for i in range(len(localpythonpath.split(';'))))
            for index in range(len(distributable_list)):
                subtaskcount = subtaskcount_list[index]
                output_blobfn = job_id_etc_list[index][2]
                for i, bat_filename in enumerate(["map{0}.bat".format(index), "reduce{0}.bat".format(index)]):
                    bat_filename = os.path.join(run_dir_rel, bat_filename)
                    with open(bat_filename, mode='w') as f1:
                        # Note that it's getting distributable.py from site-packages and never from the pythonpath.
                        f1.write(r"""set path=%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2;%AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\scripts\;%path%
mkdir %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{6}cd %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{6}FOR /L %%i IN (0,1,{11}) DO python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --download {3} {8}/{10} . --remoteresource %%i.{0}.p
cd %AZ_BATCH_NODE_SHARED_DIR%\{8}\data\v{9}
{13}
python.exe %AZ_BATCH_APP_PACKAGE_ANACONDA2%\Anaconda2\Lib\site-packages\fastlmm\util\distributable.py %AZ_BATCH_JOB_PREP_WORKING_DIR%\distributable{14}.p LocalInParts(%1,{0},result_file=r\"{4}/result.p\",mkl_num_threads={1},temp_dir=r\"{4}\")
IF %ERRORLEVEL% NEQ 0 (EXIT /B %ERRORLEVEL%)
{6}{7}
cd %AZ_BATCH_TASK_WORKING_DIR%\..\..\output{14}
{5}for /l %%t in (0,1,3) do python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --upload {3} {8} %1.{0}.p --remoteresource {10}/%1.{0}.p
{6}for /l %%t in (0,1,3) do python.exe %AZ_BATCH_JOB_PREP_WORKING_DIR%\blobxfer.py --storageaccountkey {2} --upload {3} {8} result.p --remoteresource {10}/result.p
""".format(
                            subtaskcount,               # 0
                            self.mkl_num_threads,       # 1
                            self.storage_key,           # 2
                            self.storage_account_name,  # 3
                            "%AZ_BATCH_TASK_WORKING_DIR%/../../output{0}".format(index),  # 4
                            "" if i == 0 else "@rem ",  # 5
                            "" if i == 1 else "@rem ",  # 6
                            script_list[1],             # 7
                            self.container,             # 8
                            self.data_version,          # 9
                            output_blobfn,              # 10
                            subtaskcount - 1,           # 11
                            self.pp_version,            # 12
                            pythonpath_string,          # 13
                            index,                      # 14
                        ))

        if True:  # Upload the thing-to-run to a blob and the blobxfer program
            if log_writer is not None:
                log_writer("{0}: Upload the thing to run".format(name))
            block_blob_client = azureblob.BlockBlobService(account_name=self.storage_account_name,
                                                           account_key=self.storage_key)
            block_blob_client.create_container(self.container, fail_on_exist=False)

            blobxfer_blobfn = "utils/v{}/blobxfer.py".format(self.utils_version)
            blobxfer_url = commonhelpers.upload_blob_and_create_sas(
                block_blob_client, self.container, blobxfer_blobfn,
                os.path.join(os.path.dirname(__file__), "blobxfer.py"),
                datetime.datetime.utcnow() + datetime.timedelta(days=30))

            jobprep_blobfn = "{}/jobprep.bat".format(run_dir_rel.replace("\\", "/"))
            jobprepbat_url = commonhelpers.upload_blob_and_create_sas(
                block_blob_client, self.container, jobprep_blobfn,
                os.path.join(run_dir_rel, "jobprep.bat"),
                datetime.datetime.utcnow() + datetime.timedelta(days=30))

            map_reduce_url_list = []
            for index in range(len(distributable_list)):
                distributablep_blobfn = "{0}/distributable{1}.p".format(run_dir_rel.replace("\\", "/"), index)
                distributablep_filename = os.path.join(run_dir_rel, "distributable{0}.p".format(index))
                distributablep_url = commonhelpers.upload_blob_and_create_sas(
                    block_blob_client, self.container, distributablep_blobfn, distributablep_filename,
                    datetime.datetime.utcnow() + datetime.timedelta(days=30))  # !!!should there be an expiry?

                map_blobfn = "{0}/map{1}.bat".format(run_dir_rel.replace("\\", "/"), index)
                map_url = commonhelpers.upload_blob_and_create_sas(
                    block_blob_client, self.container, map_blobfn,
                    os.path.join(run_dir_rel, "map{0}.bat".format(index)),
                    datetime.datetime.utcnow() + datetime.timedelta(days=30))

                reduce_blobfn = "{0}/reduce{1}.bat".format(run_dir_rel.replace("\\", "/"), index)
                reduce_url = commonhelpers.upload_blob_and_create_sas(
                    block_blob_client, self.container, reduce_blobfn,
                    os.path.join(run_dir_rel, "reduce{0}.bat".format(index)),
                    datetime.datetime.utcnow() + datetime.timedelta(days=30))

                map_reduce_url_list.append((map_url, reduce_url, distributablep_url))

        if True:  # Copy everything on PYTHONPATH to a blob
            if log_writer is not None:
                log_writer("{0}: Upload items on pythonpath as requested".format(name))
            if self.update_python_path == 'every_time':
                self._update_python_path_function()

        if True:  # Create a job with a job prep task
            if log_writer is not None:
                log_writer("{0}: Create jobprep.bat".format(name))
            resource_files = [
                batchmodels.ResourceFile(blob_source=blobxfer_url, file_path="blobxfer.py"),
                batchmodels.ResourceFile(blob_source=jobprepbat_url, file_path="jobprep.bat")]
            for index in range(len(distributable_list)):
                _, _, distributablep_url = map_reduce_url_list[index]
                resource_files.append(
                    batchmodels.ResourceFile(blob_source=distributablep_url,
                                             file_path="distributable{0}.p".format(index)))

            job_preparation_task = batchmodels.JobPreparationTask(
                id="jobprep",
                # run_elevated=True,
                user_identity=batchmodels.UserIdentity(
                    auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
                resource_files=resource_files,
                command_line="jobprep.bat",
            )

            job = batchmodels.JobAddParameter(
                id=job_id,
                job_preparation_task=job_preparation_task,
                pool_info=batch.models.PoolInformation(pool_id=pool_id),
                uses_task_dependencies=True,
                on_task_failure='performExitOptionsJobAction',
            )
            try:
                self.batch_client.job.add(job)
            except batchmodels.BatchErrorException as e:
                if e.inner_exception.values is not None:
                    raise Exception(e.inner_exception.values[-1].value)
                else:
                    raise Exception(e.inner_exception)

        if True:  # Add regular tasks to the job
            if log_writer is not None:
                log_writer("{0}: Add tasks to job".format(name))
            task_factor = int(10**math.ceil(math.log(max(subtaskcount_list), 10)))  # When we have multiple distributables, this helps us number them e.g. 0,1,2,10,11,12,20,21,22
            task_list = []
            for index in range(len(distributable_list)):
                start = len(task_list)
                map_url, reduce_url, _ = map_reduce_url_list[index]
                subtaskcount = subtaskcount_list[index]
                for taskindex in range(subtaskcount):
                    map_task = batchmodels.TaskAddParameter(
                        id=index * task_factor + taskindex,
                        # run_elevated=True,
                        user_identity=batchmodels.UserIdentity(
                            auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
                        # !!! seems to exit without needing a failure
                        exit_conditions=batchmodels.ExitConditions(
                            default=batchmodels.ExitOptions(job_action='terminate')),
                        resource_files=[
                            batchmodels.ResourceFile(blob_source=map_url,
                                                     file_path="map{0}.bat".format(index))],
                        command_line=r"map{0}.bat {1}".format(index, taskindex),
                    )
                    task_list.append(map_task)
                end = len(task_list) - 1

                reduce_task = batchmodels.TaskAddParameter(
                    id="reduce{0}".format(index),
                    # run_elevated=True,
                    user_identity=batchmodels.UserIdentity(
                        auto_user=batchmodels.AutoUserSpecification(elevation_level='admin')),
                    resource_files=[
                        batchmodels.ResourceFile(blob_source=reduce_url,
                                                 file_path="reduce{0}.bat".format(index))],
                    command_line=r"reduce{0}.bat {1}".format(index, subtaskcount),
                    depends_on=batchmodels.TaskDependencies(
                        task_id_ranges=[batchmodels.TaskIdRange(task_list[start].id, task_list[end].id)])
                )
                task_list.append(reduce_task)

            try:
                for i in range(0, len(task_list), 100):  # The Python API only lets us add 100 at a time.
                    self.batch_client.task.add_collection(job_id, task_list[i:i + 100])
            except Exception as exception:
                print(exception)
                raise exception

        return job_id_etc_list
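# Illustrative only: how the runner above might be constructed. The pool name
# and parameter values are placeholders; credentials are read from
# ~/azurebatch/cred.txt by default (see the constructor docstring).
runner = AzureBatch(task_count=100,
                    pool_id_list=["mypool"],
                    mkl_num_threads=4,
                    update_python_path="once")
# A caller would then hand `runner` to map_reduce(..., runner=runner), which in
# turn drives _setup_job() for each list of distributables to submit.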
def create_task(dataset, command, dependencies, max_wall_clock, production):
    if production:
        container = RESULTS_CONTAINER
    else:
        container = TEST_RESULTS_CONTAINER + "/" + generate_task_name(dataset.name)
    output_files = [
        # Upload results
        batch_models.OutputFile(
            file_pattern="$AZ_BATCH_TASK_WORKING_DIR/results/**/*",
            upload_options=batch_models.OutputFileUploadOptions(
                upload_condition=batch_models.OutputFileUploadCondition.task_success),
            destination=batch_models.OutputFileDestination(
                container=batch_models.OutputFileBlobContainerDestination(
                    path=dataset.data_dir,
                    container_url=container + SAS_TOKEN))),
        batch_models.OutputFile(
            file_pattern=f"$AZ_BATCH_NODE_ROOT_DIR/fsmounts/{FILE_SHARE_NAME}/*.csv",
            upload_options=batch_models.OutputFileUploadOptions(
                upload_condition=batch_models.OutputFileUploadCondition.task_success),
            destination=batch_models.OutputFileDestination(
                container=batch_models.OutputFileBlobContainerDestination(
                    container_url=container + SAS_TOKEN))),
        batch_models.OutputFile(
            file_pattern=f"$AZ_BATCH_NODE_ROOT_DIR/fsmounts/{FILE_SHARE_NAME}/last-update/*",
            upload_options=batch_models.OutputFileUploadOptions(
                upload_condition=batch_models.OutputFileUploadCondition.task_success),
            destination=batch_models.OutputFileDestination(
                container=batch_models.OutputFileBlobContainerDestination(
                    path="last-update",
                    container_url=container + SAS_TOKEN))),
        # Upload stderr and stdout
        batch_models.OutputFile(
            file_pattern="$AZ_BATCH_TASK_DIR/std*.txt",
            upload_options=batch_models.OutputFileUploadOptions(
                upload_condition=batch_models.OutputFileUploadCondition.task_completion),
            destination=batch_models.OutputFileDestination(
                container=batch_models.OutputFileBlobContainerDestination(
                    path=DATETIME_NOWISH + "/" + generate_task_name(dataset.name),
                    container_url=PROCESS_LOG_CONTAINER + "/" + SAS_TOKEN)))
    ]
    return batch_models.TaskAddParameter(
        id=generate_task_name(dataset.name),
        display_name=(dataset.name + "_python_script_job"),
        command_line=command,
        resource_files=[
            batch_models.ResourceFile(
                storage_container_url=CONFIG_CONTAINER + SAS_TOKEN,
                blob_prefix=dataset.name + CONFIG_FILE)
        ],
        depends_on=batch_models.TaskDependencies(task_ids=dependencies),
        user_identity=batch_models.UserIdentity(
            auto_user=batch_models.AutoUserSpecification(
                scope='pool', elevation_level='admin')),
        container_settings=batch_models.TaskContainerSettings(
            image_name=DOCKER_CONTAINER_URL,
            container_run_options='-w /home/rstudio/covid-rt-estimates'),
        constraints=batch_models.TaskConstraints(
            max_wall_clock_time=datetime.timedelta(minutes=max_wall_clock)),
        output_files=output_files)
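# Illustrative only: submitting a simple dependency chain with create_task.
# `datasets`, `build_command`, `batch_client`, and `job_id` are placeholders
# supplied by the surrounding application, not defined in this snippet.
previous_ids = []
for dataset in datasets:
    task = create_task(dataset,
                       command=build_command(dataset),
                       dependencies=previous_ids,
                       max_wall_clock=120,
                       production=False)
    batch_client.task.add(job_id=job_id, task=task)  # the job needs uses_task_dependencies=True
    previous_ids = [task.id]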