def process(job):
    """Run a job end to end: fetch inputs, run its containers, collect output.

    Sets ``job.status = Job.COMPLETED`` on success. The working directory
    and every container created here are always cleaned up, even on error.

    :param job: job object whose ``input`` holds the JSON descriptor.
    :raises Exception: re-raises whatever the pipeline raised, after
        reporting it and printing the traceback.
    """
    util.descriptor_correct(job)
    job_dir, in_dir, out_dir = logic.create_workdir(job)
    mounted_ids = []
    container_id = None
    try:
        logic.get_input_files(job, in_dir)
        # Container creation is serialized host-wide via the lock file.
        with LockFile(config.LOCK_FILE):
            mounted_ids, container_id = logic.create_containers(
                job, in_dir, out_dir)
        # Poll until the main container exits.
        while harbor.is_running(container_id):
            logger.debug("Container is running. Sleeping for {} sec.".format(
                config.CONTAINER_CHECK_INTERVAL))
            time.sleep(config.CONTAINER_CHECK_INTERVAL)
        logic.write_std_output(container_id, out_dir)
        logic.handle_output(job, out_dir)
        logger.debug("Setting job.status='completed'")
        job.status = Job.COMPLETED
    except Exception:
        capture_exception()
        traceback.print_exc()
        raise  # bare raise preserves the original traceback
    finally:
        logic.cleanup_dir(job_dir)
        # Copy the list: the previous `cnt_to_remove = mounted_ids` followed
        # by `+=` mutated mounted_ids in place through the alias.
        cnt_to_remove = list(mounted_ids)
        if container_id:
            cnt_to_remove.append(container_id)
        logic.cleanup_containers(cnt_to_remove)
def create_containers(job, in_dir, out_dir):
    """Create the job's helper containers and its main environment container.

    Helper ("needed") containers are created but not started; they are
    named ``JOB-<id>-CNT-<i>`` and attached to the main container via
    ``volumes_from``. The main container is created and started.

    :param job: job whose ``input`` is the JSON descriptor.
    :param in_dir: host directory with the downloaded input files.
    :param out_dir: host directory that receives the output.
    :return: tuple ``(mounted_ids, main_id)`` — helper container ids and
        the started main container's id.
    """
    logger.debug("Creating containers")
    descriptor = json.loads(job.input)
    env = descriptor['container']  # hoisted: was looked up repeatedly

    # Create (but do not start) the volume-provider containers.
    mounted_ids = []
    mounted_names = []
    for i, needed in enumerate(env.get('needed_containers', [])):
        image, volumes = needed['name'], needed['volumes']
        assert isinstance(volumes, list)
        if not config.ONLY_LOCAL_IMAGES:
            harbor.pull_image(image)
        tag = "JOB-{}-CNT-{}".format(job.id, i)
        mounted_names.append(tag)
        mounted_ids.append(harbor.create_container(
            image,
            volumes=volumes,
            detach=True,
            name=tag,
            mem_limit="{}m".format(descriptor['max_memoryMB']),
        ))

    # Create and start the main environment container.
    if not config.ONLY_LOCAL_IMAGES:
        harbor.pull_image(env['name'])
    command = util.build_command(job)
    logger.debug('Command to execute: {}'.format(command))
    # NOTE(review): the original also read env.get('extra_flags', []) but
    # never used it — dropped here; confirm no caller relied on it.
    volumes_list = util.obtain_volumes(in_dir, out_dir,
                                       env.get('volumes', []))
    main_id = harbor.create_container(
        env['name'],
        working_dir=env['workdir'],
        command=command,
        entrypoint=env.get('entrypoint', ''),
        volumes=volumes_list,
        detach=True,
    )
    harbor.start_container(
        main_id,
        volumes_from=mounted_names,
        binds=volumes_list
    )
    return mounted_ids, main_id
def REMOVE_ALL_CONTAINERS(max_attempts=20):
    """Kill and force-remove every container on this host. Use with caution.

    :param max_attempts: how many times to retry removing each container
        (defaults to the previous hard-coded 20).
    """
    logger.debug("Killing and removing all containers!")
    all_ids = [c['Id'] for c in client.containers(all=True)]
    for container_id in all_ids:
        for _ in range(max_attempts):
            try:
                client.remove_container(container_id, force=True)
                break
            # `except Exception` instead of bare `except`: a bare except
            # also swallowed KeyboardInterrupt/SystemExit.
            except Exception:
                capture_exception()
def do_docker_job(job, stub):
    """Run *job* via process(), reporting status transitions through *stub*.

    Marks the job RUNNING before execution and COMPLETED after. On any
    failure the job is marked FAILED (unless already COMPLETED), the error
    is reported and logged, and the exception is re-raised.

    :param job: job object to execute.
    :param stub: RPC stub exposing ModifyJob for status updates.
    """
    logger.debug("Got descriptor: {}".format(job.input))
    try:
        job.status = Job.RUNNING
        stub.ModifyJob(job)
        process(job)
        job.status = Job.COMPLETED
        stub.ModifyJob(job)
        logger.debug("Finished")
    except BaseException as e:
        capture_exception()
        if job.status != Job.COMPLETED:
            job.status = Job.FAILED
        if config.DEBUG:
            logger.debug({
                "hostname": socket.gethostname(),
                "exception": str(e),
                "traceback": traceback.format_exc()
            })
        logger.error(str(e))
        logger.error(traceback.format_exc())
        # bare raise keeps the original traceback (`raise e` resets it
        # on Python 2).
        raise
def do_docker_job(job, completion_event):
    """Execute *job* in Docker, reporting status with freshly built clients.

    A new client stub is built for each ModifyJob call and discarded before
    process() runs — presumably so no channel is held open across the long
    job execution (TODO confirm). *completion_event* is set on success and
    on failure so a waiter can stop blocking either way.

    :param job: job object to execute; on failure its ``metadata`` is set
        to a JSON blob with hostname, exception text and traceback.
    :param completion_event: threading.Event-like object; ``.set()`` is
        called when the job finishes or fails.
    """
    logger.debug("Got descriptor: {}".format(job.input))
    try:
        job.status = Job.RUNNING
        stub = new_client()
        stub.ModifyJob(job, timeout=5)
        del stub  # drop the client before the (potentially long) run
        process(job)
        stub = new_client()
        job.status = Job.COMPLETED
        stub.ModifyJob(job, timeout=5)
        completion_event.set()
        logger.debug("Finished")
    except BaseException as e:
        capture_exception()
        if job.status != Job.COMPLETED:
            job.status = Job.FAILED
        debug_info = {
            "hostname": socket.gethostname(),
            "exception": str(e),
            "traceback": traceback.format_exc()
        }
        job.metadata = json.dumps(debug_info)
        stub = new_client()
        stub.ModifyJob(job, timeout=5)
        logger.error(str(e))
        logger.error(traceback.format_exc())
        completion_event.set()
        # bare raise keeps the original traceback (`raise e` resets it
        # on Python 2).
        raise
def start_container(container_id, **kwargs):
    """Start a container, retrying up to config.DOCKER_START_ATTEMPTS times.

    :param container_id: id of the container to start.
    :param kwargs: passed straight through to ``client.start``.
    :return: True once the container has started.
    :raises Exception: if every attempt fails.
    """
    for _ in range(config.DOCKER_START_ATTEMPTS):
        logger.debug("Trying to start container id={}".format(container_id))
        try:
            client.start(container_id, **kwargs)
        except Exception as e:
            capture_exception()
            logger.debug("Failed to start container id={}, error: {}".format(
                container_id, e))
        else:
            logger.debug("Started container id={}".format(container_id))
            return True
    raise Exception('Failed to start container id={}'.format(container_id))
def get_input_files(job, in_dir):
    """Download every file listed under 'input' in the job descriptor.

    :param job: job whose ``input`` attribute is the JSON descriptor.
    :param in_dir: destination directory for the downloaded files.
    """
    for source in json.loads(job.input)['input']:
        logger.debug("Download input {}".format(source))
        config.backend.copy_from_backend(source, in_dir)
def cleanup_dir(job_dir):
    """Run the configured pre-remove hook, then delete the job directory.

    :param job_dir: root working directory of the job; removed recursively.
    """
    logger.debug("Cleaning up directories")
    # Hook runs first so it can still see the directory's contents.
    pre_remove_hook()
    shutil.rmtree(job_dir)
def cleanup_containers(cnt_ids):
    """Force-remove each listed container together with its volumes.

    :param cnt_ids: iterable of container ids to remove.
    """
    logger.debug("Cleaning up containers")
    for cid in cnt_ids:
        # v=True also removes the container's anonymous volumes.
        harbor.remove(cid, v=True, force=True)
def pre_remove_hook():
    """Execute the shell command configured as the pre-remove hook."""
    hook = config.PRE_REMOVE_HOOK
    logger.debug("Executing pre-remove hook: `{}`".format(hook))
    # NOTE(review): os.system runs this through the shell — acceptable only
    # because the hook comes from local config, not from job input.
    os.system(hook)
def upload_output_files(out_dir, upload_uri):
    """Upload the output directory to the backend and list what landed.

    :param out_dir: local directory whose contents are uploaded.
    :param upload_uri: backend destination URI.
    :return: whatever ``config.backend.list_uploaded`` reports for the URI.
    """
    logger.debug("Upload output directory `{}` to `{}`".format(
        out_dir, upload_uri))
    backend = config.backend
    backend.copy_to_backend(out_dir, upload_uri)
    return backend.list_uploaded(upload_uri)
def create_container(image, **kwargs):
    """Create a container from *image* and return its id.

    :param image: image name to instantiate.
    :param kwargs: forwarded verbatim to ``client.create_container``.
    :return: the new container's id string.
    """
    logger.debug("Creating container for image {} with arguments: {}".format(
        image, kwargs))
    return client.create_container(image, **kwargs)['Id']
def pull_image(image, *args, **kwargs):
    """Pull *image* from the registry, forwarding extra args to the client.

    :param image: image name (optionally with tag) to pull.
    """
    logger.debug("Pulling image {}".format(image))
    client.pull(image, *args, **kwargs)