def run(task_id, task_args):
    """
    Performs a Run.

    task_id: The tracking ID for this task.
    task_args: The input arguments for this task:
        bundle_id, execution_time_limit, container_name, reply_to,
        and optionally predict (truthy for the prediction step).
    """
    run_id = task_args['bundle_id']
    execution_time_limit = task_args['execution_time_limit']
    container = task_args['container_name']
    reply_to_queue_name = task_args['reply_to']
    is_predict_step = task_args.get("predict", False)
    # Status updates for this run go back on the caller-specified queue.
    queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                 config.getAzureServiceBusKey(),
                                 config.getAzureServiceBusIssuer(),
                                 config.get_service_bus_shared_access_key_name(),
                                 config.get_service_bus_shared_access_key_value(),
                                 reply_to_queue_name)
    root_dir = None
    current_dir = os.getcwd()
    temp_dir = config.getLocalRoot()
    # Probe for processes still holding files under the local root.
    # NOTE(review): `fuser` exits non-zero when nothing holds the files,
    # so an empty result is expected and not an error.
    try:
        running_processes = subprocess.check_output(["fuser", temp_dir])
    except subprocess.CalledProcessError:
        # fix: was `except subprocess.CalledProcessError, e:` — Python-2-only
        # syntax with an unused binding; drop the binding.
        running_processes = ''
def run(task_id, task_args):
    """
    Performs a Run.

    task_id: The tracking ID for this task.
    task_args: The input arguments for this task:
        bundle_id, execution_time_limit, container_name, reply_to,
        and optionally predict (truthy for the prediction step).
    """
    run_id = task_args['bundle_id']
    execution_time_limit = task_args['execution_time_limit']
    container = task_args['container_name']
    reply_to_queue_name = task_args['reply_to']
    is_predict_step = task_args.get("predict", False)
    # Status updates for this run go back on the caller-specified queue.
    queue = AzureServiceBusQueue(
        config.getAzureServiceBusNamespace(),
        config.getAzureServiceBusKey(),
        config.getAzureServiceBusIssuer(),
        config.get_service_bus_shared_access_key_name(),
        config.get_service_bus_shared_access_key_value(),
        reply_to_queue_name)
    root_dir = None
    current_dir = os.getcwd()
    temp_dir = config.getLocalRoot()
    # Probe for processes still holding files under the local root.
    # NOTE(review): `fuser` exits non-zero when nothing holds the files,
    # so an empty result is expected and not an error.
    try:
        running_processes = subprocess.check_output(["fuser", temp_dir])
    except subprocess.CalledProcessError:
        # fix: was `except subprocess.CalledProcessError, e:` — Python-2-only
        # syntax with an unused binding; drop the binding.
        running_processes = ''
def main():
    """ Setup the worker and start it. """
    config = WorkerConfig()
    logging.config.dictConfig(config.getLoggerDictConfig())

    # Queue on which notifications of tasks to perform arrive.
    task_queue = AzureServiceBusQueue(
        config.getAzureServiceBusNamespace(),
        config.getAzureServiceBusKey(),
        config.getAzureServiceBusIssuer(),
        config.getAzureServiceBusQueue())

    # Dispatch table: task type -> handler function.
    handlers = {'run': get_run_func(config)}

    # Build the worker, announce, and hand over control.
    worker = BaseWorker(task_queue, handlers, logger)
    logger.info("Starting compute worker.")
    worker.start()
def main():
    """ Setup the worker and start it. """
    worker_config = WorkerConfig()
    logging.config.dictConfig(worker_config.getLoggerDictConfig())

    # Queue to listen to for notifications of tasks to perform.
    notification_queue = AzureServiceBusQueue(
        worker_config.getAzureServiceBusNamespace(),
        worker_config.getAzureServiceBusKey(),
        worker_config.getAzureServiceBusIssuer(),
        worker_config.getAzureServiceBusQueue())

    # Map each task type onto the function that accomplishes it.
    vtable = dict(run=get_run_func(worker_config))

    # Create and start the worker.
    worker = BaseWorker(notification_queue, vtable, logger)
    logger.info("Starting compute worker.")
    worker.start()
def run(task_id, task_args):
    """
    Performs a Run: stages the competition bundles, executes the
    submission's program under a wall-clock time limit, and uploads the
    logs, metadata and zipped output to Blob storage.

    task_id: The tracking ID for this task.
    task_args: The input arguments for this task:
        bundle_id, execution_time_limit, container_name, reply_to.
    """
    run_id = task_args['bundle_id']
    execution_time_limit = task_args['execution_time_limit']
    container = task_args['container_name']
    reply_to_queue_name = task_args['reply_to']
    # Status updates for this run go back on the caller-specified queue.
    queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                 config.getAzureServiceBusKey(),
                                 config.getAzureServiceBusIssuer(),
                                 reply_to_queue_name)
    root_dir = None
    current_dir = os.getcwd()
    try:
        _send_update(queue, task_id, 'running')
        # Create temporary directory for the run
        root_dir = tempfile.mkdtemp(dir=config.getLocalRoot())
        # Fetch and stage the bundles
        blob_service = BlobService(config.getAzureStorageAccountName(),
                                   config.getAzureStorageAccountKey())
        bundles = getBundle(root_dir, blob_service, container, run_id, 'run')
        # Verify we have an input folder: create one if it's not in the bundle.
        input_rel_path = join('run', 'input')
        if input_rel_path not in bundles:
            input_dir = join(root_dir, 'run', 'input')
            if not os.path.exists(input_dir):  # fix: was `== False`
                os.mkdir(input_dir)
        # Verify we have a program
        prog_rel_path = join('run', 'program')
        if prog_rel_path not in bundles:
            raise Exception("Program bundle is not available.")
        prog_info = bundles[prog_rel_path]
        if prog_info is None:
            raise Exception("Program metadata is not available.")
        prog_cmd = ""
        if 'command' in prog_info:
            prog_cmd = prog_info['command'].strip()
        if not prog_cmd:  # fix: was `len(prog_cmd) <= 0`
            raise Exception("Program command is not specified.")
        # Create output folder
        output_dir = join(root_dir, 'run', 'output')
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        # Create temp folder
        temp_dir = join(root_dir, 'run', 'temp')
        if not os.path.exists(temp_dir):
            os.mkdir(temp_dir)
        # Invoke custom evaluation program
        run_dir = join(root_dir, 'run')
        os.chdir(run_dir)
        logger.debug("Execution directory: %s", run_dir)
        # Update command-line with the real paths
        logger.debug("CMD: %s", prog_cmd)
        prog_cmd = prog_cmd.replace("$program", join('.', 'program')) \
                           .replace("$input", join('.', 'input')) \
                           .replace("$output", join('.', 'output')) \
                           .replace("$tmp", join('.', 'temp')) \
                           .replace("/", os.path.sep) \
                           .replace("\\", os.path.sep)
        logger.debug("Invoking program: %s", prog_cmd)
        stdout_file = join(run_dir, 'stdout.txt')
        stderr_file = join(run_dir, 'stderr.txt')
        startTime = time.time()
        exit_code = None
        timed_out = False
        with open(stdout_file, "wb") as out, open(stderr_file, "wb") as err:
            evaluator_process = Popen(prog_cmd.split(' '), stdout=out, stderr=err)
            # Poll until the program exits or exceeds the time limit.
            while exit_code is None:
                exit_code = evaluator_process.poll()
                # time in seconds
                if exit_code is None and time.time() - startTime > execution_time_limit:
                    evaluator_process.kill()
                    exit_code = -1
                    logger.info("Killed process for running too long!")
                    err.write("Execution time limit exceeded!")
                    timed_out = True
                    break
                else:
                    time.sleep(.1)
        logger.debug("Exit Code: %d", exit_code)
        endTime = time.time()
        elapsedTime = endTime - startTime
        prog_status = {'exitCode': exit_code, 'elapsedTime': elapsedTime}
        with open(join(output_dir, 'metadata'), 'w') as f:
            f.write(yaml.dump(prog_status, default_flow_style=False))
        # Upload stdout and stderr files
        stdout_id = "%s/stdout.txt" % (os.path.splitext(run_id)[0])
        _upload(blob_service, container, stdout_id, stdout_file)
        stderr_id = "%s/stderr.txt" % (os.path.splitext(run_id)[0])
        _upload(blob_service, container, stderr_id, stderr_file)
        # check if timed out AFTER output files are written! If we exit
        # sooner, no output is written
        if timed_out:
            raise Exception("Execution time limit exceeded!")
        private_dir = join(output_dir, 'private')
        if os.path.exists(private_dir):
            logger.debug("Packing private results...")
            private_output_file = join(root_dir, 'run', 'private_output.zip')
            shutil.make_archive(os.path.splitext(private_output_file)[0], 'zip', output_dir)
            private_output_id = "%s/private_output.zip" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, private_output_id, private_output_file)
            # Keep private data out of the public output bundle below.
            shutil.rmtree(private_dir)
        # Pack results and send them to Blob storage
        logger.debug("Packing results...")
        output_file = join(root_dir, 'run', 'output.zip')
        shutil.make_archive(os.path.splitext(output_file)[0], 'zip', output_dir)
        output_id = "%s/output.zip" % (os.path.splitext(run_id)[0])
        _upload(blob_service, container, output_id, output_file)
        _send_update(queue, task_id, 'finished')
    except Exception:
        logger.exception("Run task failed (task_id=%s).", task_id)
        _send_update(queue, task_id, 'failed')
    # comment out for dev and viewing of raw folder outputs.
    if root_dir is not None:
        # Try cleaning-up temporary directory
        try:
            os.chdir(current_dir)
            shutil.rmtree(root_dir)
        except Exception:  # fix: was a bare `except:` (swallowed SystemExit too)
            logger.exception("Unable to clean-up local folder %s (task_id=%s)",
                             root_dir, task_id)
def run(task_id, task_args):
    """
    Performs a Run: stages the bundles, executes each of the submission's
    commands under a SIGALRM-based wall-clock limit, uploads logs and
    zipped output to Blob storage and reports final status on the queue.

    task_id: The tracking ID for this task.
    task_args: The input arguments for this task:
        bundle_id, execution_time_limit, container_name, reply_to,
        and optionally predict (truthy for the prediction step).
    """
    run_id = task_args['bundle_id']
    execution_time_limit = task_args['execution_time_limit']
    container = task_args['container_name']
    reply_to_queue_name = task_args['reply_to']
    is_predict_step = task_args.get("predict", False)
    queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                 config.getAzureServiceBusKey(),
                                 config.getAzureServiceBusIssuer(),
                                 reply_to_queue_name)
    root_dir = None
    current_dir = os.getcwd()
    try:
        _send_update(queue, task_id, 'running')
        # Create temporary directory for the run
        root_dir = tempfile.mkdtemp(dir=config.getLocalRoot())
        # Fetch and stage the bundles
        blob_service = BlobService(config.getAzureStorageAccountName(),
                                   config.getAzureStorageAccountKey())
        bundles = getBundle(root_dir, blob_service, container, run_id, 'run')
        # Verify we have an input folder: create one if it's not in the bundle.
        input_rel_path = join('run', 'input')
        if input_rel_path not in bundles:
            input_dir = join(root_dir, 'run', 'input')
            if not os.path.exists(input_dir):  # fix: was `== False`
                os.mkdir(input_dir)
        # Verify we have a program
        prog_rel_path = join('run', 'program')
        if prog_rel_path not in bundles:
            raise Exception("Program bundle is not available.")
        prog_info = bundles[prog_rel_path]
        if prog_info is None:
            raise Exception("Program metadata is not available.")
        # The bundle 'command' may be a single string or a list of commands.
        prog_cmd_list = []
        if 'command' in prog_info:
            if isinstance(prog_info['command'], list):
                prog_cmd_list = [_.strip() for _ in prog_info['command']]
            else:
                prog_cmd_list = [prog_info['command'].strip()]
        if len(prog_cmd_list) <= 0:
            raise Exception("Program command is not specified.")
        # Create output folder
        output_dir = join(root_dir, 'run', 'output')
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        # Create temp folder
        temp_dir = join(root_dir, 'run', 'temp')
        if not os.path.exists(temp_dir):
            os.mkdir(temp_dir)
        # Invoke custom evaluation program
        run_dir = join(root_dir, 'run')
        os.chdir(run_dir)
        # Make the staged program's executables reachable by bare name.
        os.environ["PATH"] += os.pathsep + run_dir + "/program"
        logger.debug("Execution directory: %s", run_dir)
        if is_predict_step:
            stdout_file_name = 'prediction_stdout_file.txt'
            stderr_file_name = 'prediction_stderr_file.txt'
        else:
            stdout_file_name = 'stdout.txt'
            stderr_file_name = 'stderr.txt'
        stdout_file = join(run_dir, stdout_file_name)
        stderr_file = join(run_dir, stderr_file_name)
        # Append mode so output from successive commands accumulates.
        stdout = open(stdout_file, "a+")
        stderr = open(stderr_file, "a+")
        prog_status = []
        try:  # fix: close stdout/stderr even when a command raises
            for prog_cmd_counter, prog_cmd in enumerate(prog_cmd_list):
                # Update command-line with the real paths
                logger.debug("CMD: %s", prog_cmd)
                prog_cmd = prog_cmd.replace("$program", join('.', 'program')) \
                                   .replace("$input", join('.', 'input')) \
                                   .replace("$output", join('.', 'output')) \
                                   .replace("$tmp", join('.', 'temp')) \
                                   .replace("/", os.path.sep) \
                                   .replace("\\", os.path.sep)
                logger.debug("Invoking program: %s", prog_cmd)
                startTime = time.time()
                exit_code = None
                timed_out = False
                evaluator_process = Popen(prog_cmd.split(' '),
                                          stdout=stdout,
                                          stderr=stderr,
                                          env=os.environ)
                logger.debug("Started process, pid=%s", evaluator_process.pid)
                # Arm an alarm for whatever remains of the time budget; the
                # handler raises ExecutionTimeLimitExceeded (caught below).
                time_difference = time.time() - startTime
                signal.signal(signal.SIGALRM, alarm_handler)
                signal.alarm(int(math.fabs(math.ceil(execution_time_limit - time_difference))))
                logger.debug("Checking process, exit_code = %s", exit_code)
                try:
                    while exit_code is None:  # fix: was `== None`
                        time.sleep(1)
                        exit_code = evaluator_process.poll()
                except (ValueError, OSError):
                    pass  # tried to communicate with dead process
                except ExecutionTimeLimitExceeded:
                    exit_code = -1
                    logger.info("Killed process for running too long!")
                    stderr.write("Execution time limit exceeded!")
                    evaluator_process.kill()
                    timed_out = True
                signal.alarm(0)  # disarm
                logger.debug("Exit Code: %d", exit_code)
                endTime = time.time()
                elapsedTime = endTime - startTime
                if len(prog_cmd_list) == 1:
                    # Overwrite prog_status array with dict
                    prog_status = {'exitCode': exit_code,
                                   'elapsedTime': elapsedTime}
                else:
                    # otherwise we're doing multi-track and processing
                    # multiple commands so append to the array
                    prog_status.append({'exitCode': exit_code,
                                        'elapsedTime': elapsedTime})
                with open(join(output_dir, 'metadata'), 'w') as f:
                    f.write(yaml.dump(prog_status, default_flow_style=False))
        finally:
            stdout.close()
            stderr.close()
        logger.debug("Saving output files")
        stdout_id = "%s/%s" % (os.path.splitext(run_id)[0], stdout_file_name)
        _upload(blob_service, container, stdout_id, stdout_file)
        stderr_id = "%s/%s" % (os.path.splitext(run_id)[0], stderr_file_name)
        _upload(blob_service, container, stderr_id, stderr_file)
        private_dir = join(output_dir, 'private')
        if os.path.exists(private_dir):
            logger.debug("Packing private results...")
            private_output_file = join(root_dir, 'run', 'private_output.zip')
            shutil.make_archive(os.path.splitext(private_output_file)[0], 'zip', output_dir)
            private_output_id = "%s/private_output.zip" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, private_output_id, private_output_file)
            # Keep private data out of the public output bundle below.
            shutil.rmtree(private_dir)
        # Pack results and send them to Blob storage
        logger.debug("Packing results...")
        output_file = join(root_dir, 'run', 'output.zip')
        shutil.make_archive(os.path.splitext(output_file)[0], 'zip', output_dir)
        output_id = "%s/output.zip" % (os.path.splitext(run_id)[0])
        _upload(blob_service, container, output_id, output_file)
        # If the output folder contains an html file, upload the first one
        # found as detailed_results.html.
        html_found = False
        for root, dirs, files in os.walk(output_dir):
            if html_found:
                break  # fix: stop walking once uploaded (was a no-op guard)
            for file in files:
                file_to_upload = os.path.join(root, file)
                file_ext = os.path.splitext(file_to_upload)[1]
                if file_ext.lower() == ".html":
                    html_file_id = "%s/html/%s" % (os.path.splitext(run_id)[0],
                                                   "detailed_results.html")
                    _upload(blob_service, container, html_file_id, file_to_upload, "html")
                    html_found = True
        # check if timed out AFTER output files are written! If we exit
        # sooner, no output is written
        if timed_out:
            # fix: logger.error — logger.exception outside an except block
            # logs a useless 'None' traceback
            logger.error("Run task timed out (task_id=%s).", task_id)
            _send_update(queue, task_id, 'failed')
        elif exit_code != 0:
            logger.error("Run task exit code non-zero (task_id=%s).", task_id)
            with open(stderr_file) as stderr_handle:  # fix: close the handle
                _send_update(queue, task_id, 'failed',
                             extra={'traceback': stderr_handle.read()})
        else:
            _send_update(queue, task_id, 'finished')
    except Exception:
        logger.exception("Run task failed (task_id=%s).", task_id)
        _send_update(queue, task_id, 'failed',
                     extra={'traceback': traceback.format_exc()})
    # comment out for dev and viewing of raw folder outputs.
    if root_dir is not None:
        # Try cleaning-up temporary directory
        try:
            os.chdir(current_dir)
            shutil.rmtree(root_dir)
        except Exception:  # fix: was a bare `except:`
            logger.exception("Unable to clean-up local folder %s (task_id=%s)",
                             root_dir, task_id)
    # Cleanup any stuck processes or old files
    temp_dir = config.getLocalRoot()
    # Cleanup dir
    for the_file in os.listdir(temp_dir):
        file_path = os.path.join(temp_dir, the_file)
        if os.path.isfile(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)
    # Kill running processes in the temp dir
    call(["fuser", "-f", temp_dir])
def run(task_id, task_args):
    """
    Performs a Run: stages the bundles, executes the submission's program
    under a polled wall-clock time limit, and uploads logs, metadata,
    zipped output and any detailed-results HTML to Blob storage.

    task_id: The tracking ID for this task.
    task_args: The input arguments for this task:
        bundle_id, execution_time_limit, container_name, reply_to.
    """
    run_id = task_args['bundle_id']
    execution_time_limit = task_args['execution_time_limit']
    container = task_args['container_name']
    reply_to_queue_name = task_args['reply_to']
    queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                 config.getAzureServiceBusKey(),
                                 config.getAzureServiceBusIssuer(),
                                 reply_to_queue_name)
    root_dir = None
    current_dir = os.getcwd()
    try:
        _send_update(queue, task_id, 'running')
        # Create temporary directory for the run
        root_dir = tempfile.mkdtemp(dir=config.getLocalRoot())
        # Fetch and stage the bundles
        blob_service = BlobService(config.getAzureStorageAccountName(),
                                   config.getAzureStorageAccountKey())
        bundles = getBundle(root_dir, blob_service, container, run_id, 'run')
        # Verify we have an input folder: create one if it's not in the bundle.
        input_rel_path = join('run', 'input')
        if input_rel_path not in bundles:
            input_dir = join(root_dir, 'run', 'input')
            if not os.path.exists(input_dir):  # fix: was `== False`
                os.mkdir(input_dir)
        # Verify we have a program
        prog_rel_path = join('run', 'program')
        if prog_rel_path not in bundles:
            raise Exception("Program bundle is not available.")
        prog_info = bundles[prog_rel_path]
        if prog_info is None:
            raise Exception("Program metadata is not available.")
        prog_cmd = ""
        if 'command' in prog_info:
            prog_cmd = prog_info['command'].strip()
        if not prog_cmd:  # fix: was `len(prog_cmd) <= 0`
            raise Exception("Program command is not specified.")
        # Create output folder
        output_dir = join(root_dir, 'run', 'output')
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        # Create temp folder
        temp_dir = join(root_dir, 'run', 'temp')
        if not os.path.exists(temp_dir):
            os.mkdir(temp_dir)
        # Invoke custom evaluation program
        run_dir = join(root_dir, 'run')
        os.chdir(run_dir)
        # Make the staged program's executables reachable by bare name.
        os.environ["PATH"] += os.pathsep + run_dir + "/program"
        logger.debug("Execution directory: %s", run_dir)
        # Update command-line with the real paths
        logger.debug("CMD: %s", prog_cmd)
        prog_cmd = prog_cmd.replace("$program", join('.', 'program')) \
                           .replace("$input", join('.', 'input')) \
                           .replace("$output", join('.', 'output')) \
                           .replace("$tmp", join('.', 'temp')) \
                           .replace("/", os.path.sep) \
                           .replace("\\", os.path.sep)
        logger.debug("Invoking program: %s", prog_cmd)
        stdout_file = join(run_dir, 'stdout.txt')
        stderr_file = join(run_dir, 'stderr.txt')
        startTime = time.time()
        exit_code = None
        timed_out = False
        with open(stdout_file, "wb") as out, open(stderr_file, "wb") as err:
            evaluator_process = Popen(prog_cmd.split(' '), stdout=out, stderr=err)
            # Poll until the program exits or exceeds the time limit.
            while exit_code is None:
                exit_code = evaluator_process.poll()
                # time in seconds
                if exit_code is None and time.time() - startTime > execution_time_limit:
                    exit_code = -1
                    logger.info("Killed process for running too long!")
                    err.write("Execution time limit exceeded!")
                    evaluator_process.kill()
                    timed_out = True
                    break
                else:
                    time.sleep(.1)
        logger.debug("Exit Code: %d", exit_code)
        endTime = time.time()
        elapsedTime = endTime - startTime
        prog_status = {'exitCode': exit_code, 'elapsedTime': elapsedTime}
        with open(join(output_dir, 'metadata'), 'w') as f:
            f.write(yaml.dump(prog_status, default_flow_style=False))
        # Upload stdout and stderr files
        stdout_id = "%s/stdout.txt" % (os.path.splitext(run_id)[0])
        _upload(blob_service, container, stdout_id, stdout_file)
        stderr_id = "%s/stderr.txt" % (os.path.splitext(run_id)[0])
        _upload(blob_service, container, stderr_id, stderr_file)
        private_dir = join(output_dir, 'private')
        if os.path.exists(private_dir):
            logger.debug("Packing private results...")
            private_output_file = join(root_dir, 'run', 'private_output.zip')
            shutil.make_archive(os.path.splitext(private_output_file)[0], 'zip', output_dir)
            private_output_id = "%s/private_output.zip" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, private_output_id, private_output_file)
            # Keep private data out of the public output bundle below.
            shutil.rmtree(private_dir)
        # Pack results and send them to Blob storage
        logger.debug("Packing results...")
        output_file = join(root_dir, 'run', 'output.zip')
        shutil.make_archive(os.path.splitext(output_file)[0], 'zip', output_dir)
        output_id = "%s/output.zip" % (os.path.splitext(run_id)[0])
        _upload(blob_service, container, output_id, output_file)
        # If the output folder contains an html file, upload the first one
        # found as detailed_results.html.
        html_found = False
        for root, dirs, files in os.walk(output_dir):
            if html_found:
                break  # fix: stop walking once uploaded (was a no-op guard)
            for file in files:
                file_to_upload = os.path.join(root, file)
                file_ext = os.path.splitext(file_to_upload)[1]
                if file_ext.lower() == ".html":
                    html_file_id = "%s/html/%s" % (os.path.splitext(run_id)[0],
                                                   "detailed_results.html")
                    # fix: was a bare Py2 `print` statement; use the logger
                    logger.debug("file_to_upload:%s", file_to_upload)
                    _upload(blob_service, container, html_file_id, file_to_upload, "html")
                    html_found = True
        # check if timed out AFTER output files are written! If we exit
        # sooner, no output is written
        if timed_out:
            # fix: logger.error — logger.exception outside an except block
            # logs a useless 'None' traceback
            logger.error("Run task failed (task_id=%s).", task_id)
            _send_update(queue, task_id, 'failed')
        else:
            _send_update(queue, task_id, 'finished')
    except Exception:
        logger.exception("Run task failed (task_id=%s).", task_id)
        _send_update(queue, task_id, 'failed')
    # comment out for dev and viewing of raw folder outputs.
    if root_dir is not None:
        # Try cleaning-up temporary directory
        try:
            os.chdir(current_dir)
            shutil.rmtree(root_dir)
        except Exception:  # fix: was a bare `except:`
            logger.exception("Unable to clean-up local folder %s (task_id=%s)",
                             root_dir, task_id)
def run(task_id, task_args):
    """
    Performs a Run: stages the bundles, executes the submission's program
    via the shell (no time limit in this variant), and uploads logs,
    metadata and zipped output to Blob storage.

    task_id: The tracking ID for this task.
    task_args: The input arguments for this task:
        bundle_id, container_name, reply_to.
    """
    run_id = task_args['bundle_id']
    container = task_args['container_name']
    reply_to_queue_name = task_args['reply_to']
    queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                 config.getAzureServiceBusKey(),
                                 config.getAzureServiceBusIssuer(),
                                 reply_to_queue_name)
    root_dir = None
    current_dir = os.getcwd()
    try:
        _send_update(queue, task_id, 'running')
        # Create temporary directory for the run
        root_dir = tempfile.mkdtemp(dir=config.getLocalRoot())
        # Fetch and stage the bundles
        blob_service = BlobService(config.getAzureStorageAccountName(),
                                   config.getAzureStorageAccountKey())
        bundles = getBundle(root_dir, blob_service, container, run_id, 'run')
        # Verify we have an input folder: create one if it's not in the bundle.
        input_rel_path = join('run', 'input')
        if input_rel_path not in bundles:
            input_dir = join(root_dir, 'run', 'input')
            if not os.path.exists(input_dir):  # fix: was `== False`
                os.mkdir(input_dir)
        # Verify we have a program
        prog_rel_path = join('run', 'program')
        if prog_rel_path not in bundles:
            raise Exception("Program bundle is not available.")
        prog_info = bundles[prog_rel_path]
        if prog_info is None:
            raise Exception("Program metadata is not available.")
        prog_cmd = ""
        if 'command' in prog_info:
            prog_cmd = prog_info['command'].strip()
        if not prog_cmd:  # fix: was `len(prog_cmd) <= 0`
            raise Exception("Program command is not specified.")
        # Create output folder
        output_dir = join(root_dir, 'run', 'output')
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        # Create temp folder
        temp_dir = join(root_dir, 'run', 'temp')
        if not os.path.exists(temp_dir):
            os.mkdir(temp_dir)
        # Invoke custom evaluation program
        run_dir = join(root_dir, 'run')
        os.chdir(run_dir)
        logger.debug("Execution directory: %s", run_dir)
        # Update command-line with the real paths
        logger.debug("CMD: %s", prog_cmd)
        prog_cmd = prog_cmd.replace("$program", join('.', 'program')) \
                           .replace("$input", join('.', 'input')) \
                           .replace("$output", join('.', 'output')) \
                           .replace("$tmp", join('.', 'temp')) \
                           .replace("/", os.path.sep) \
                           .replace("\\", os.path.sep)
        logger.debug("Invoking program: %s", prog_cmd)
        stdout_file = join(run_dir, 'stdout.txt')
        stderr_file = join(run_dir, 'stderr.txt')
        startTime = time.time()
        # NOTE(security/review): os.system runs the bundle-provided command
        # through the shell, so the bundle contents must be trusted; consider
        # migrating to subprocess with a list argv as other variants do.
        exitCode = os.system(prog_cmd + ' >' + stdout_file + ' 2>' + stderr_file)  # Run it!
        logger.debug("Exit Code: %d", exitCode)
        endTime = time.time()
        elapsedTime = endTime - startTime
        prog_status = {'exitCode': exitCode, 'elapsedTime': elapsedTime}
        with open(join(output_dir, 'metadata'), 'w') as f:
            f.write(yaml.dump(prog_status, default_flow_style=False))
        # Upload stdout and stderr files
        stdout_id = "%s/stdout.txt" % (os.path.splitext(run_id)[0])
        _upload(blob_service, container, stdout_id, stdout_file)
        stderr_id = "%s/stderr.txt" % (os.path.splitext(run_id)[0])
        _upload(blob_service, container, stderr_id, stderr_file)
        # Pack results and send them to Blob storage
        logger.debug("Packing results...")
        output_file = join(root_dir, 'run', 'output.zip')
        shutil.make_archive(os.path.splitext(output_file)[0], 'zip', output_dir)
        output_id = "%s/output.zip" % (os.path.splitext(run_id)[0])
        _upload(blob_service, container, output_id, output_file)
        _send_update(queue, task_id, 'finished')
    except Exception:
        logger.exception("Run task failed (task_id=%s).", task_id)
        _send_update(queue, task_id, 'failed')
    if root_dir is not None:
        # Try cleaning-up temporary directory
        try:
            os.chdir(current_dir)
            shutil.rmtree(root_dir)
        except Exception:  # fix: was a bare `except:`
            logger.exception("Unable to clean-up local folder %s (task_id=%s)",
                             root_dir, task_id)
def run(task_id, task_args):
    """
    Performs a Run: stages the bundles, executes each of the submission's
    commands under a SIGALRM-based wall-clock limit, uploads logs and
    zipped output to Blob storage and reports final status on the queue.

    task_id: The tracking ID for this task.
    task_args: The input arguments for this task:
        bundle_id, execution_time_limit, container_name, reply_to,
        and optionally predict (truthy for the prediction step).
    """
    run_id = task_args['bundle_id']
    execution_time_limit = task_args['execution_time_limit']
    container = task_args['container_name']
    reply_to_queue_name = task_args['reply_to']
    is_predict_step = task_args.get("predict", False)
    queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                 config.getAzureServiceBusKey(),
                                 config.getAzureServiceBusIssuer(),
                                 reply_to_queue_name)
    root_dir = None
    current_dir = os.getcwd()
    try:
        _send_update(queue, task_id, 'running')
        # Create temporary directory for the run
        root_dir = tempfile.mkdtemp(dir=config.getLocalRoot())
        # Fetch and stage the bundles
        blob_service = BlobService(config.getAzureStorageAccountName(),
                                   config.getAzureStorageAccountKey())
        bundles = getBundle(root_dir, blob_service, container, run_id, 'run')
        # Verify we have an input folder: create one if it's not in the bundle.
        input_rel_path = join('run', 'input')
        if input_rel_path not in bundles:
            input_dir = join(root_dir, 'run', 'input')
            if not os.path.exists(input_dir):  # fix: was `== False`
                os.mkdir(input_dir)
        # Verify we have a program
        prog_rel_path = join('run', 'program')
        if prog_rel_path not in bundles:
            raise Exception("Program bundle is not available.")
        prog_info = bundles[prog_rel_path]
        if prog_info is None:
            raise Exception("Program metadata is not available.")
        # The bundle 'command' may be a single string or a list of commands.
        prog_cmd_list = []
        if 'command' in prog_info:
            if isinstance(prog_info['command'], list):
                prog_cmd_list = [_.strip() for _ in prog_info['command']]
            else:
                prog_cmd_list = [prog_info['command'].strip()]
        if len(prog_cmd_list) <= 0:
            raise Exception("Program command is not specified.")
        # Create output folder
        output_dir = join(root_dir, 'run', 'output')
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        # Create temp folder
        temp_dir = join(root_dir, 'run', 'temp')
        if not os.path.exists(temp_dir):
            os.mkdir(temp_dir)
        # Invoke custom evaluation program
        run_dir = join(root_dir, 'run')
        os.chdir(run_dir)
        # Make the staged program's executables reachable by bare name.
        os.environ["PATH"] += os.pathsep + run_dir + "/program"
        logger.debug("Execution directory: %s", run_dir)
        if is_predict_step:
            stdout_file_name = 'prediction_stdout_file.txt'
            stderr_file_name = 'prediction_stderr_file.txt'
        else:
            stdout_file_name = 'stdout.txt'
            stderr_file_name = 'stderr.txt'
        stdout_file = join(run_dir, stdout_file_name)
        stderr_file = join(run_dir, stderr_file_name)
        # Append mode so output from successive commands accumulates.
        stdout = open(stdout_file, "a+")
        stderr = open(stderr_file, "a+")
        prog_status = []
        try:  # fix: close stdout/stderr even when a command raises
            for prog_cmd_counter, prog_cmd in enumerate(prog_cmd_list):
                # Update command-line with the real paths
                logger.debug("CMD: %s", prog_cmd)
                prog_cmd = prog_cmd.replace("$program", join('.', 'program')) \
                                   .replace("$input", join('.', 'input')) \
                                   .replace("$output", join('.', 'output')) \
                                   .replace("$tmp", join('.', 'temp')) \
                                   .replace("/", os.path.sep) \
                                   .replace("\\", os.path.sep)
                logger.debug("Invoking program: %s", prog_cmd)
                startTime = time.time()
                exit_code = None
                timed_out = False
                evaluator_process = Popen(prog_cmd.split(' '),
                                          stdout=stdout,
                                          stderr=stderr,
                                          env=os.environ)
                logger.debug("Started process, pid=%s", evaluator_process.pid)
                # Arm an alarm for whatever remains of the time budget; the
                # handler raises ExecutionTimeLimitExceeded (caught below).
                time_difference = time.time() - startTime
                signal.signal(signal.SIGALRM, alarm_handler)
                signal.alarm(int(math.fabs(math.ceil(execution_time_limit - time_difference))))
                logger.debug("Checking process, exit_code = %s", exit_code)
                try:
                    while exit_code is None:  # fix: was `== None`
                        time.sleep(1)
                        exit_code = evaluator_process.poll()
                except (ValueError, OSError):
                    pass  # tried to communicate with dead process
                except ExecutionTimeLimitExceeded:
                    exit_code = -1
                    logger.info("Killed process for running too long!")
                    stderr.write("Execution time limit exceeded!")
                    evaluator_process.kill()
                    timed_out = True
                signal.alarm(0)  # disarm
                logger.debug("Exit Code: %d", exit_code)
                endTime = time.time()
                elapsedTime = endTime - startTime
                if len(prog_cmd_list) == 1:
                    # Overwrite prog_status array with dict
                    prog_status = {'exitCode': exit_code,
                                   'elapsedTime': elapsedTime}
                else:
                    # otherwise we're doing multi-track and processing
                    # multiple commands so append to the array
                    prog_status.append({'exitCode': exit_code,
                                        'elapsedTime': elapsedTime})
                with open(join(output_dir, 'metadata'), 'w') as f:
                    f.write(yaml.dump(prog_status, default_flow_style=False))
        finally:
            stdout.close()
            stderr.close()
        logger.debug("Saving output files")
        stdout_id = "%s/%s" % (os.path.splitext(run_id)[0], stdout_file_name)
        _upload(blob_service, container, stdout_id, stdout_file)
        stderr_id = "%s/%s" % (os.path.splitext(run_id)[0], stderr_file_name)
        _upload(blob_service, container, stderr_id, stderr_file)
        private_dir = join(output_dir, 'private')
        if os.path.exists(private_dir):
            logger.debug("Packing private results...")
            private_output_file = join(root_dir, 'run', 'private_output.zip')
            shutil.make_archive(os.path.splitext(private_output_file)[0], 'zip', output_dir)
            private_output_id = "%s/private_output.zip" % (os.path.splitext(run_id)[0])
            _upload(blob_service, container, private_output_id, private_output_file)
            # Keep private data out of the public output bundle below.
            shutil.rmtree(private_dir)
        # Pack results and send them to Blob storage
        logger.debug("Packing results...")
        output_file = join(root_dir, 'run', 'output.zip')
        shutil.make_archive(os.path.splitext(output_file)[0], 'zip', output_dir)
        output_id = "%s/output.zip" % (os.path.splitext(run_id)[0])
        _upload(blob_service, container, output_id, output_file)
        # If the output folder contains an html file, upload the first one
        # found as detailed_results.html.
        html_found = False
        for root, dirs, files in os.walk(output_dir):
            if html_found:
                break  # fix: stop walking once uploaded (was a no-op guard)
            for file in files:
                file_to_upload = os.path.join(root, file)
                file_ext = os.path.splitext(file_to_upload)[1]
                if file_ext.lower() == ".html":
                    html_file_id = "%s/html/%s" % (os.path.splitext(run_id)[0],
                                                   "detailed_results.html")
                    _upload(blob_service, container, html_file_id, file_to_upload, "html")
                    html_found = True
        # check if timed out AFTER output files are written! If we exit
        # sooner, no output is written
        if timed_out:
            # fix: logger.error — logger.exception outside an except block
            # logs a useless 'None' traceback
            logger.error("Run task timed out (task_id=%s).", task_id)
            _send_update(queue, task_id, 'failed')
        elif exit_code != 0:
            logger.error("Run task exit code non-zero (task_id=%s).", task_id)
            with open(stderr_file) as stderr_handle:  # fix: close the handle
                _send_update(queue, task_id, 'failed',
                             extra={'traceback': stderr_handle.read()})
        else:
            _send_update(queue, task_id, 'finished')
    except Exception:
        logger.exception("Run task failed (task_id=%s).", task_id)
        _send_update(queue, task_id, 'failed',
                     extra={'traceback': traceback.format_exc()})
    # comment out for dev and viewing of raw folder outputs.
    if root_dir is not None:
        # Try cleaning-up temporary directory
        try:
            os.chdir(current_dir)
            shutil.rmtree(root_dir)
        except Exception:  # fix: was a bare `except:`
            logger.exception("Unable to clean-up local folder %s (task_id=%s)",
                             root_dir, task_id)
    # Cleanup any stuck processes or old files
    temp_dir = config.getLocalRoot()
    # Cleanup dir
    for the_file in os.listdir(temp_dir):
        file_path = os.path.join(temp_dir, the_file)
        if os.path.isfile(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)
    # Kill running processes in the temp dir
    call(["fuser", "-f", temp_dir])
def run(task_id, task_args):
    """
    Performs a Run: stages the bundles, executes the submission's program
    via the shell (no time limit in this variant), and uploads logs,
    metadata and zipped output to Blob storage.

    task_id: The tracking ID for this task.
    task_args: The input arguments for this task:
        bundle_id, container_name, reply_to.
    """
    run_id = task_args['bundle_id']
    container = task_args['container_name']
    reply_to_queue_name = task_args['reply_to']
    queue = AzureServiceBusQueue(config.getAzureServiceBusNamespace(),
                                 config.getAzureServiceBusKey(),
                                 config.getAzureServiceBusIssuer(),
                                 reply_to_queue_name)
    root_dir = None
    current_dir = os.getcwd()
    try:
        _send_update(queue, task_id, 'running')
        # Create temporary directory for the run
        root_dir = tempfile.mkdtemp(dir=config.getLocalRoot())
        # Fetch and stage the bundles
        blob_service = BlobService(config.getAzureStorageAccountName(),
                                   config.getAzureStorageAccountKey())
        bundles = getBundle(root_dir, blob_service, container, run_id, 'run')
        # Verify we have an input folder: create one if it's not in the bundle.
        input_rel_path = join('run', 'input')
        if input_rel_path not in bundles:
            input_dir = join(root_dir, 'run', 'input')
            if not os.path.exists(input_dir):  # fix: was `== False`
                os.mkdir(input_dir)
        # Verify we have a program
        prog_rel_path = join('run', 'program')
        if prog_rel_path not in bundles:
            raise Exception("Program bundle is not available.")
        prog_info = bundles[prog_rel_path]
        if prog_info is None:
            raise Exception("Program metadata is not available.")
        prog_cmd = ""
        if 'command' in prog_info:
            prog_cmd = prog_info['command'].strip()
        if not prog_cmd:  # fix: was `len(prog_cmd) <= 0`
            raise Exception("Program command is not specified.")
        # Create output folder
        output_dir = join(root_dir, 'run', 'output')
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        # Create temp folder
        temp_dir = join(root_dir, 'run', 'temp')
        if not os.path.exists(temp_dir):
            os.mkdir(temp_dir)
        # Invoke custom evaluation program
        run_dir = join(root_dir, 'run')
        os.chdir(run_dir)
        # fix: lazy %-args for logging instead of eager string formatting,
        # consistent with the rest of the file
        logger.debug("Execution directory: %s", run_dir)
        # Update command-line with the real paths
        logger.debug("CMD: %s", prog_cmd)
        prog_cmd = prog_cmd.replace("$program", join('.', 'program')) \
                           .replace("$input", join('.', 'input')) \
                           .replace("$output", join('.', 'output')) \
                           .replace("$tmp", join('.', 'temp')) \
                           .replace("/", os.path.sep) \
                           .replace("\\", os.path.sep)
        logger.debug("Invoking program: %s", prog_cmd)
        stdout_file = join(run_dir, 'stdout.txt')
        stderr_file = join(run_dir, 'stderr.txt')
        startTime = time.time()
        # NOTE(security/review): os.system runs the bundle-provided command
        # through the shell, so the bundle contents must be trusted; consider
        # migrating to subprocess with a list argv as other variants do.
        exitCode = os.system(prog_cmd + ' >' + stdout_file + ' 2>' + stderr_file)  # Run it!
        logger.debug("Exit Code: %d", exitCode)
        endTime = time.time()
        elapsedTime = endTime - startTime
        prog_status = {'exitCode': exitCode, 'elapsedTime': elapsedTime}
        with open(join(output_dir, 'metadata'), 'w') as f:
            f.write(yaml.dump(prog_status, default_flow_style=False))
        # Upload stdout and stderr files
        stdout_id = "%s/stdout.txt" % (os.path.splitext(run_id)[0])
        _upload(blob_service, container, stdout_id, stdout_file)
        stderr_id = "%s/stderr.txt" % (os.path.splitext(run_id)[0])
        _upload(blob_service, container, stderr_id, stderr_file)
        # Pack results and send them to Blob storage
        logger.debug("Packing results...")
        output_file = join(root_dir, 'run', 'output.zip')
        shutil.make_archive(os.path.splitext(output_file)[0], 'zip', output_dir)
        output_id = "%s/output.zip" % (os.path.splitext(run_id)[0])
        _upload(blob_service, container, output_id, output_file)
        _send_update(queue, task_id, 'finished')
    except Exception:
        logger.exception("Run task failed (task_id=%s).", task_id)
        _send_update(queue, task_id, 'failed')
    if root_dir is not None:
        # Try cleaning-up temporary directory
        try:
            os.chdir(current_dir)
            shutil.rmtree(root_dir)
        except Exception:  # fix: was a bare `except:`
            logger.exception("Unable to clean-up local folder %s (task_id=%s)",
                             root_dir, task_id)