class AbstractRunSubcommand(object): def __init__(self, args=None, silent=False): self._validate_args(args) self.args = args self.silent = silent verify_has_connection_settings() server_url = get_server_url() verify_server_is_running(url=server_url) token = get_token() self.connection = Connection(server_url, token=token) try: self.storage_settings = self.connection.get_storage_settings() self.import_manager = ImportManager( self.connection, storage_settings=self.storage_settings) self.export_manager = ExportManager( self.connection, storage_settings=self.storage_settings) except LoomengineUtilsError as e: raise SystemExit("ERROR! Failed to initialize client: '%s'" % e) @classmethod def _validate_args(cls, args): pass def _print(self, text): if not self.silent: print text
class TaskMonitor(object): DOCKER_SOCKET = 'unix://var/run/docker.sock' LOOM_RUN_SCRIPT_NAME = '.loom_run_script' def __init__(self, args=None, mock_connection=None, mock_import_manager=None, mock_export_manager=None): if args is None: args = self._get_args() self.settings = { 'TASK_ATTEMPT_ID': args.task_attempt_id, 'SERVER_URL': args.server_url, 'LOG_LEVEL': args.log_level, } self.is_failed = False self.logger = get_stdout_logger( __name__, self.settings['LOG_LEVEL']) if mock_connection is not None: self.connection = mock_connection else: try: self.connection = Connection(self.settings['SERVER_URL'], token=args.token) except Exception as e: error = self._get_error_text(e) self.logger.error( 'TaskMonitor for attempt %s failed to initialize server ' 'connection. %s' % (self.settings.get('TASK_ATTEMPT_ID'), error)) raise self._event('Initializing TaskMonitor') self._init_task_attempt() # From here on errors can be reported to Loom if mock_import_manager is not None: self.import_manager = mock_import_manager else: try: self.storage_settings = self.connection.get_storage_settings() self.import_manager = ImportManager( self.connection, storage_settings=self.storage_settings) self.export_manager = ExportManager( self.connection, storage_settings=self.storage_settings) self.settings.update(self._get_settings()) self._init_docker_client() self._init_working_dir() except Exception as e: error = self._get_error_text(e) self._report_system_error( detail='Initializing TaskMonitor failed. %s' % error) raise def _init_task_attempt(self): self.task_attempt = self.connection.get_task_attempt( self.settings['TASK_ATTEMPT_ID']) if self.task_attempt is None: raise Exception( 'TaskAttempt ID "%s" not found' % self.settings['TASK_ATTEMPT_ID']) def _get_settings(self): settings = self.connection.get_task_attempt_settings( self.settings['TASK_ATTEMPT_ID']) if settings is None: raise Exception('Worker settings not found') return settings def _init_docker_client(self): self.docker_client = docker.Client(base_url=self.DOCKER_SOCKET) self._verify_docker() def _verify_docker(self): try: self.docker_client.info() except requests.exceptions.ConnectionError: raise Exception('Failed to connect to Docker daemon') def _init_working_dir(self): init_directory(self.settings['WORKING_DIR_ROOT'], new=True) self.working_dir = os.path.join( self.settings['WORKING_DIR_ROOT'], 'work') self.log_dir = os.path.join(self.settings['WORKING_DIR_ROOT'], 'logs') init_directory(self.working_dir, new=True) init_directory(self.log_dir, new=True) def _delete_working_dir(self): # Skip delete if blank or root! if self.settings['WORKING_DIR_ROOT'].strip('/'): shutil.rmtree(self.settings['WORKING_DIR_ROOT']) def run_with_heartbeats(self, function): heartbeat_interval = int(self.settings['HEARTBEAT_INTERVAL_SECONDS']) polling_interval = 1 t = threading.Thread(target=function) t.start() last_heartbeat = self._send_heartbeat() while t.is_alive(): time.sleep(polling_interval) if (datetime.utcnow().replace(tzinfo=pytz.utc) - last_heartbeat)\ .total_seconds() > \ (heartbeat_interval - polling_interval): last_heartbeat = self._send_heartbeat() def run(self): try: self._copy_inputs() self._create_run_script() self._pull_image() self._create_container() self._run_container() self._stream_docker_logs() self._get_returncode() self._save_process_logs() if not self.is_failed: self._save_outputs() self._finish() finally: self._delete_working_dir() self._delete_container() def _copy_inputs(self): self._event('Copying inputs') if self.task_attempt.get('inputs') is None: return try: for input in self.task_attempt['inputs']: TaskAttemptInput(input, self).copy() except FileAlreadyExistsError as e: error = self._get_error_text(e) self._report_system_error( detail='Copying inputs failed because file already exists. ' 'Are there multiple inputs with the same name? %s' % error) raise except Exception as e: error = self._get_error_text(e) self._report_system_error( detail='Copying inputs failed. %s' % error) raise def _create_run_script(self): try: user_command = self.task_attempt['command'] with open(os.path.join( self.working_dir, self.LOOM_RUN_SCRIPT_NAME), 'w') as f: f.write(user_command.encode('utf-8') + '\n') except Exception as e: error = self._get_error_text(e) self._report_system_error( detail='Creating run script failed. %s' % error) raise def _pull_image(self): self._event('Pulling image') try: docker_image = self._get_docker_image() raw_pull_data = execute_with_retries( lambda: self.docker_client.pull(docker_image), (docker.errors.NotFound,), self.logger, "Pull docker image") pull_data = self._parse_docker_output(raw_pull_data) if pull_data[-1].get('errorDetail'): raise Exception('Failed to pull docker image: "%s"' % pull_data[-1].get('errorDetail')) container_info = self.docker_client.inspect_image( self._get_docker_image()) self._save_environment_info(container_info) except Exception as e: error = self._get_error_text(e) self._report_system_error( detail='Pulling Docker image failed with "%s" error: ' '"%s"' % (e.__class__, str(e))) raise def _get_docker_image(self): docker_image = self.task_attempt['environment']['docker_image'] # If no registry is specified, set to default. # If the first term contains "." or ends in ":", it is a registry. part1 = docker_image.split('/')[0] if '.' not in part1 and not part1.endswith(':'): default_registry = self.settings.get( 'DEFAULT_DOCKER_REGISTRY', None) # Don't add default_registry without the owner. # Default ower is library if len(docker_image.split('/')) == 1: docker_image = 'library/' + docker_image if default_registry: docker_image = '%s/%s' % (default_registry, docker_image) # Tag is required. Otherwise docker-py pull will download all tags. if '@' not in docker_image and ':' not in docker_image: docker_image += ':latest' return docker_image def _parse_docker_output(self, data): return [json.loads(line) for line in data.strip().split('\r\n')] def _create_container(self): self._event('Creating container') try: docker_image = self._get_docker_image() interpreter = self.task_attempt['interpreter'] host_dir = self.working_dir container_dir = '/loom_workspace' command = interpreter.split(' ') command.append(self.LOOM_RUN_SCRIPT_NAME) self.container = self.docker_client.create_container( image=docker_image, command=command, volumes=[container_dir], host_config=self.docker_client.create_host_config( binds={host_dir: { 'bind': container_dir, 'mode': 'rw', }}), working_dir=container_dir, name=self.settings['PROCESS_CONTAINER_NAME'], ) self._set_container_id(self.container['Id']) except Exception as e: error = self._get_error_text(e) self._report_system_error( detail='Creating container failed. %s' % error) raise def _run_container(self): self._event('Starting analysis') try: self.docker_client.start(self.container) self._verify_container_started_running() except Exception as e: error = self._get_error_text(e) self._report_system_error( detail='Starting analysis failed. %s' % error) raise def _verify_container_started_running(self): status = self.docker_client.inspect_container( self.container)['State'].get('Status') if status == 'running' or status == 'exited': return else: raise Exception('Unexpected container status "%s"' % status) def _stream_docker_logs(self): """Stream stdout and stderr from the task container to this process's stdout and stderr, respectively. """ thread = threading.Thread(target=self._stderr_stream_worker) thread.start() for line in self.docker_client.logs(self.container, stdout=True, stderr=False, stream=True): sys.stdout.write(line) thread.join() def _stderr_stream_worker(self): for line in self.docker_client.logs(self.container, stdout=False, stderr=True, stream=True): sys.stderr.write(line) def _get_returncode(self): self._event('Running analysis') try: returncode = self._poll_for_returncode() if returncode == 0: return else: # bad returncode self._report_analysis_error( 'Analysis finished with returncode %s. ' 'Check stderr/stdout logs for errors.' % returncode) # Do not raise error. Attempt to save log files. except Exception as e: error = self._get_error_text(e) self._report_system_error('Failed to run analysis. %s' % error) # Do not raise error. Attempt to save log files. def _poll_for_returncode(self, poll_interval_seconds=1): while True: try: container_data = self.docker_client.inspect_container( self.container) except Exception as e: raise Exception( 'Unable to inspect Docker container: "%s"' % str(e)) if not container_data.get('State'): raise Exception( 'Could not parse container info from Docker: "%s"' % container_data) if container_data['State'].get('Status') == 'exited': # Success return container_data['State'].get('ExitCode') elif container_data['State'].get('Status') == 'running': time.sleep(poll_interval_seconds) else: # Error -- process did not complete message = 'Docker container has unexpected status "%s"' % \ container_data['State'].get('Status') raise Exception(message) def _save_process_logs(self): self._event('Saving logfiles') try: stdout_file = os.path.join(self.log_dir, 'stdout.log') stderr_file = os.path.join(self.log_dir, 'stderr.log') init_directory( os.path.dirname(self.log_dir)) with open(stdout_file, 'w') as stdoutlog: stdoutlog.write(self._get_stdout()) with open(stderr_file, 'w') as stderrlog: stderrlog.write(self._get_stderr()) self._import_log_file(stderr_file, retry=True) self._import_log_file(stdout_file, retry=True) except Exception as e: error = self._get_error_text(e) self._report_system_error( detail='Saving log files failed. %s' % error) raise def _get_stdout(self): return self.docker_client.logs( self.container, stderr=False, stdout=True) def _get_stderr(self): return self.docker_client.logs( self.container, stderr=True, stdout=False) def _import_log_file(self, filepath, retry=True): try: self.import_manager.import_log_file( self.task_attempt, filepath, retry=retry, ) except IOError: message = 'Failed to upload log file %s' % filepath raise Exception(message) def _save_outputs(self): self._event('Saving outputs') try: for output in self.task_attempt['outputs']: TaskAttemptOutput(output, self).save() except Exception as e: error = self._get_error_text(e) self._report_system_error( detail='Saving outputs failed. %s' % error) raise def _finish(self): try: self._finish() except Exception as e: error = self._get_error_text(e) self._report_system_error( detail='Setting finished status failed. %s' % error) raise # Updates to TaskAttempt def _send_heartbeat(self): task_attempt = self.connection.update_task_attempt( self.settings['TASK_ATTEMPT_ID'], {} ) return parse(task_attempt.get('last_heartbeat')) def _set_container_id(self, container_id): self.connection.update_task_attempt( self.settings['TASK_ATTEMPT_ID'], {'container_id': container_id} ) def _save_environment_info(self, container_info): self.connection.update_task_attempt( self.settings['TASK_ATTEMPT_ID'], {'environment_info': container_info} ) def _event(self, event, detail='', is_error=False): if is_error: self.logger.error("%s. %s" % (event, detail)) else: self.logger.info("%s. %s" % (event, detail)) self.connection.post_task_attempt_event( self.settings['TASK_ATTEMPT_ID'], { 'event': event, 'detail': detail, 'is_error': is_error }) def _report_system_error(self, detail=''): self.is_failed = True try: self._event( "TaskAttempt execution failed.", detail=detail, is_error=True) self.connection.post_task_attempt_system_error( self.settings['TASK_ATTEMPT_ID']) except Exception: # If there is an error reporting failure, don't raise it # because it will mask the root cause of failure pass def _report_analysis_error(self, detail=''): self.is_failed = True try: self._event( "TaskAttempt execution failed.", detail=detail, is_error=True) self.connection.post_task_attempt_analysis_error( self.settings['TASK_ATTEMPT_ID']) except Exception: # If there is an error reporting failure, don't raise it # because it will mask the root cause of failure pass def _finish(self): self.connection.finish_task_attempt(self.settings['TASK_ATTEMPT_ID']) def _delete_container(self): try: if not self.container: return except AttributeError: return if self.settings.get('PRESERVE_ALL'): return if self.is_failed and self.settings.get('PRESERVE_ON_FAILURE'): return self.docker_client.stop(self.container) self.docker_client.remove_container(self.container) def _get_error_text(self, e): if hasattr(self, 'settings') and self.settings.get('DEBUG'): return traceback.format_exc() else: return "%s.%s: %s" % ( e.__class__.__module__, e.__class__.__name__, str(e)) # Parser def _get_args(self): parser = self.get_parser() return parser.parse_args() @classmethod def get_parser(self): parser = argparse.ArgumentParser(__file__) parser.add_argument('-i', '--task_attempt_id', required=True, help='ID of TaskAttempt to be processed') parser.add_argument('-u', '--server_url', required=True, help='URL of the Loom server') parser.add_argument('-l', '--log_level', required=False, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], default='WARNING', help='Log level') parser.add_argument('-t', '--token', required=False, default=None, help='Authentication token') return parser