def task_put(self, task):
    """Upload *task* (including its file blobs) to the server.

    Client-side limits from ``self.config`` are applied on top of the
    task's own limits before upload.  Every task file must either already
    reference a server-side blob or have a local path to upload from.

    :param task: KolejkaTask to submit
    :return: the server-side KolejkaTask on HTTP 200; otherwise the
        response is printed and ``None`` is returned
    :raises RuntimeError: if a file has neither a valid blob reference
        nor a local path
    """
    limits = KolejkaLimits()
    limits.cpus = self.config.cpus
    limits.memory = self.config.memory
    limits.pids = self.config.pids
    limits.storage = self.config.storage
    limits.time = self.config.time
    limits.network = self.config.network
    task.limits.update(limits)
    if not self.instance_session:
        self.login()
    # Ensure every task file is backed by a valid server-side blob.
    for f in task.files.values():
        if not f.reference or not self.blob_check(blob_reference=f.reference):
            f.reference = None
            if f.path:
                f.reference = self.blob_put(os.path.join(task.path, f.path))['key']
            else:
                # The original code used a bare `raise` here, which fails
                # with "No active exception to re-raise"; raise an explicit,
                # descriptive error instead (still a RuntimeError).
                raise RuntimeError('Task file has neither a valid blob reference nor a local path')
    info = self.post('/task/task/', data=json.dumps(task.dump()))
    if info.status_code == 200:
        task = KolejkaTask(None)
        task.load(info.json()['task'])
        return task
    else:
        print(info)
        print(info.text)
def task_put(self, task):
    """Upload *task* (including its file blobs) to the server.

    Client-side limits from ``self.config`` are applied on top of the
    task's own limits before upload.  Files whose blob reference is
    missing or no longer valid are (re-)uploaded from their local path.

    :param task: KolejkaTask to submit
    :return: the server-side KolejkaTask parsed from the response
    :raises ValueError: if a file has neither a valid blob reference nor
        a local path to upload from
    """
    limits = KolejkaLimits()
    limits.cpus = self.config.cpus
    limits.memory = self.config.memory
    limits.swap = self.config.swap
    limits.pids = self.config.pids
    limits.storage = self.config.storage
    limits.image = self.config.image
    limits.workspace = self.config.workspace
    limits.time = self.config.time
    limits.network = self.config.network
    limits.gpus = self.config.gpus
    task.limits.update(limits)
    if not self.instance_session:
        self.login()
    # Ensure every task file is backed by a valid server-side blob.
    for f in task.files.values():
        if not f.reference or not self.blob_check(blob_reference=f.reference):
            # `assert f.path` would be stripped under `python -O`;
            # validate explicitly instead.
            if not f.path:
                raise ValueError('Task file has neither a valid blob reference nor a local path')
            f.reference = self.blob_put(os.path.join(task.path, f.path))['key']
    response = self.post('/task/task/', data=json.dumps(task.dump()))
    task = KolejkaTask(None)
    task.load(response.json()['task'])
    return task
def foreman():
    """Foreman main loop: dequeue tasks and run them concurrently.

    Repeatedly asks the server for tasks that fit the configured limits
    and starts one ``foreman_single`` thread per admitted task, doing
    simple resource accounting (cpus/memory/pids/storage) so that the
    concurrently running tasks never exceed the configured totals.
    Exclusive tasks run alone.  Runs forever; errors back off for
    ``config.interval`` seconds.
    """
    config = foreman_config()
    limits = KolejkaLimits()
    limits.cpus = config.cpus
    limits.memory = config.memory
    limits.pids = config.pids
    limits.storage = config.storage
    limits.time = config.time
    limits.network = config.network
    client = KolejkaClient()
    while True:
        try:
            tasks = client.dequeue(config.concurency, limits, config.tags)
            if len(tasks) == 0:
                time.sleep(config.interval)
            else:
                while len(tasks) > 0:
                    # Fresh resource pool for each batch of concurrent tasks.
                    resources = KolejkaLimits()
                    resources.update(limits)
                    processes = list()
                    cpus_offset = 0
                    for task in tasks:
                        if len(processes) >= config.concurency:
                            break
                        # An exclusive task must be the only one running.
                        if task.exclusive and len(processes) > 0:
                            break
                        task.limits.update(limits)
                        task.limits.cpus_offset = cpus_offset
                        # Admit the task only if it fits in the remaining pool.
                        ok = True
                        if resources.cpus is not None and task.limits.cpus > resources.cpus:
                            ok = False
                        if resources.memory is not None and task.limits.memory > resources.memory:
                            ok = False
                        if resources.pids is not None and task.limits.pids > resources.pids:
                            ok = False
                        if resources.storage is not None and task.limits.storage > resources.storage:
                            ok = False
                        if ok:
                            proc = Thread(target=foreman_single, args=(config.temp_path, client, task))
                            proc.start()
                            processes.append(proc)
                            # Charge the admitted task against the pool.
                            cpus_offset += task.limits.cpus
                            if resources.cpus is not None:
                                resources.cpus -= task.limits.cpus
                            if resources.memory is not None:
                                resources.memory -= task.limits.memory
                            if resources.pids is not None:
                                resources.pids -= task.limits.pids
                            if resources.storage is not None:
                                resources.storage -= task.limits.storage
                            tasks = tasks[1:]
                            if task.exclusive:
                                break
                        else:
                            break
                    for proc in processes:
                        proc.join()
        except KeyboardInterrupt:
            # The original bare `except:` swallowed Ctrl-C as well, making
            # the daemon uninterruptible; let it propagate.
            raise
        except Exception:
            time.sleep(config.interval)
def stage0(task_path, result_path, temp_path=None, consume_task_folder=False):
    """Run a task inside a Docker container (worker entry stage).

    Loads the task from *task_path*, applies worker-config limits, builds a
    jailed copy of the task in a temporary directory, starts a container
    running stage1.sh (which in turn runs stage2.py), polls the container
    while collecting usage statistics, and finally writes a KolejkaResult
    to *result_path*.

    :param task_path: directory containing the task spec and its files
    :param result_path: directory where the result is written (wiped first)
    :param temp_path: base directory for the temporary jail (None = system default)
    :param consume_task_folder: if True, task files are moved into the jail
        and the original task folder is removed, instead of copied
    """
    config = worker_config()
    cgs = ControlGroupSystem()
    task = KolejkaTask(task_path)
    if not task.id:
        # Tasks without an id get a fresh random one.
        task.id = uuid.uuid4().hex
        logging.warning('Assigned id {} to the task'.format(task.id))
    # Sanity checks: a runnable task needs an image, args, and local files.
    if not task.image:
        logging.error('Task does not define system image')
        sys.exit(1)
    if not task.args:
        logging.error('Task does not define args')
        sys.exit(1)
    if not task.files.is_local:
        logging.error('Task contains non-local files')
        sys.exit(1)
    # Worker-wide limits from the config cap the task's own limits.
    limits = KolejkaLimits()
    limits.cpus = config.cpus
    limits.memory = config.memory
    limits.swap = config.swap
    limits.pids = config.pids
    limits.storage = config.storage
    limits.image = config.image
    limits.workspace = config.workspace
    limits.time = config.time
    limits.network = config.network
    limits.gpus = config.gpus
    task.limits.update(limits)
    docker_task = 'kolejka_worker_{}'.format(task.id)
    # Commands used to remove the container both before starting (stale
    # leftovers) and after finishing.
    docker_cleanup = [
        ['docker', 'kill', docker_task],
        ['docker', 'rm', docker_task],
    ]
    with tempfile.TemporaryDirectory(dir=temp_path) as jailed_path:
        #TODO jailed_path size remains unlimited?
        logging.debug('Using {} as temporary directory'.format(jailed_path))
        jailed_task_path = os.path.join(jailed_path, 'task')
        os.makedirs(jailed_task_path, exist_ok=True)
        jailed_result_path = os.path.join(jailed_path, 'result')
        os.makedirs(jailed_result_path, exist_ok=True)
        # The jailed task is a copy of the original task spec with its file
        # list rebuilt from the files actually placed in the jail.
        jailed = KolejkaTask(os.path.join(jailed_path, 'task'))
        jailed.load(task.dump())
        jailed.files.clear()
        volumes = list()
        check_python_volume()
        if os.path.exists(OBSERVER_SOCKET):
            # Mount the observer socket so the containerized task can talk
            # to the observer daemon.
            volumes.append((OBSERVER_SOCKET, OBSERVER_SOCKET, 'rw'))
        else:
            logging.warning('Observer is not running.')
        volumes.append((jailed_result_path, os.path.join(WORKER_DIRECTORY, 'result'), 'rw'))
        # Copy (or move) every task file except the spec into the jail.
        for key, val in task.files.items():
            if key != TASK_SPEC:
                src_path = os.path.join(task.path, val.path)
                dst_path = os.path.join(jailed_path, 'task', key)
                os.makedirs(os.path.dirname(dst_path), exist_ok=True)
                if consume_task_folder:
                    shutil.move(src_path, dst_path)
                else:
                    shutil.copy(src_path, dst_path)
                jailed.files.add(key)
        jailed.files.add(TASK_SPEC)
        #jailed.limits = KolejkaLimits()
        #TODO: Task is limited by docker, no need to limit it again?
        jailed.commit()
        volumes.append((jailed.path, os.path.join(WORKER_DIRECTORY, 'task'), 'rw'))
        if consume_task_folder:
            try:
                shutil.rmtree(task_path)
            except:
                # Best-effort cleanup of the consumed task folder.
                logging.warning('Failed to remove {}'.format(task_path))
                pass
        # Locate the stage1/stage2 helper scripts next to this module and
        # mount them read-only into the container.
        for spath in [os.path.dirname(__file__)]:
            stage1 = os.path.join(spath, 'stage1.sh')
            if os.path.isfile(stage1):
                volumes.append((stage1, os.path.join(WORKER_DIRECTORY, 'stage1.sh'), 'ro'))
                break
        for spath in [os.path.dirname(__file__)]:
            stage2 = os.path.join(spath, 'stage2.py')
            if os.path.isfile(stage2):
                volumes.append((stage2, os.path.join(WORKER_DIRECTORY, 'stage2.py'), 'ro'))
                break
        # Build the `docker run` command line.
        docker_call = ['docker', 'run']
        docker_call += ['--detach']
        docker_call += ['--name', docker_task]
        docker_call += ['--entrypoint', os.path.join(WORKER_DIRECTORY, 'stage1.sh')]
        for key, val in task.environment.items():
            docker_call += ['--env', '{}={}'.format(key, val)]
        docker_call += ['--hostname', WORKER_HOSTNAME]
        docker_call += ['--init']
        if task.limits.cpus is not None:
            # Pin the container to a cpu set of the requested size, shifted
            # by cpus_offset so concurrent tasks use disjoint cpus.
            docker_call += ['--cpuset-cpus', ','.join([str(c) for c in cgs.limited_cpuset(cgs.full_cpuset(), task.limits.cpus, task.limits.cpus_offset)])]
        if task.limits.gpus is not None and task.limits.gpus > 0:
            check_gpu_runtime_availability()
            gpus = ','.join(map(str, limited_gpuset(full_gpuset(), task.limits.gpus, task.limits.gpus_offset)))
            docker_call += ['--runtime=nvidia', '--shm-size=1g', '--gpus', f'"device={gpus}"']
        if task.limits.memory is not None:
            docker_call += ['--memory', str(task.limits.memory)]
        if task.limits.swap is not None:
            # Docker's --memory-swap is memory+swap combined.
            docker_call += ['--memory-swap', str(task.limits.memory + task.limits.swap)]
        if task.limits.storage is not None:
            # --storage-opt size= is only honored for overlay2 on xfs
            # (with pquota); detect the driver/backing fs and warn otherwise.
            docker_info_run = subprocess.run(['docker', 'system', 'info', '--format', '{{json .Driver}}'], stdout=subprocess.PIPE, check=True)
            storage_driver = str(json.loads(str(docker_info_run.stdout, 'utf-8')))
            if storage_driver == 'overlay2':
                docker_info_run = subprocess.run(['docker', 'system', 'info', '--format', '{{json .DriverStatus}}'], stdout=subprocess.PIPE, check=True)
                storage_fs = dict(json.loads(str(docker_info_run.stdout, 'utf-8')))['Backing Filesystem']
                if storage_fs in ['xfs']:
                    storage_limit = task.limits.storage
                    docker_call += ['--storage-opt', 'size=' + str(storage_limit)]
                else:
                    logging.warning("Storage limit on {} ({}) is not supported".format(storage_driver, storage_fs))
            else:
                logging.warning("Storage limit on {} is not supported".format(storage_driver))
        if task.limits.network is not None:
            # network limit is a boolean: False disables networking entirely.
            if not task.limits.network:
                docker_call += ['--network=none']
        docker_call += ['--cap-add', 'SYS_NICE']
        if task.limits.pids is not None:
            docker_call += ['--pids-limit', str(task.limits.pids)]
        if task.limits.time is not None:
            docker_call += ['--stop-timeout', str(int(math.ceil(task.limits.time.total_seconds())))]
        # Mount the shared python3 volume plus all collected volumes.
        docker_call += ['--volume', '{}:{}:{}'.format(WORKER_PYTHON_VOLUME, os.path.join(WORKER_DIRECTORY, 'python3'), 'ro')]
        for v in volumes:
            docker_call += ['--volume', '{}:{}:{}'.format(os.path.realpath(v[0]), v[1], v[2])]
        docker_call += ['--workdir', WORKER_DIRECTORY]
        docker_image = task.image
        docker_call += [docker_image]
        # Everything after the image name is passed as args to stage1.sh.
        docker_call += ['--consume']
        if config.debug:
            docker_call += ['--debug']
        if config.verbose:
            docker_call += ['--verbose']
        docker_call += [os.path.join(WORKER_DIRECTORY, 'task')]
        docker_call += [os.path.join(WORKER_DIRECTORY, 'result')]
        logging.debug('Docker call : {}'.format(docker_call))
        # Pull the image if configured to, or if it is missing locally.
        pull_image = config.pull
        if not pull_image:
            docker_inspect_run = subprocess.run(['docker', 'image', 'inspect', docker_image], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
            if docker_inspect_run.returncode != 0:
                pull_image = True
        if pull_image:
            subprocess.run(['docker', 'pull', docker_image], check=True)
        # Remove any stale container with the same name.
        for docker_clean in docker_cleanup:
            silent_call(docker_clean)
        # Start with a clean result directory.
        if os.path.exists(result_path):
            shutil.rmtree(result_path)
        os.makedirs(result_path, exist_ok=True)
        result = KolejkaResult(result_path)
        result.id = task.id
        result.limits = task.limits
        result.stdout = task.stdout
        result.stderr = task.stderr
        start_time = datetime.datetime.now()
        docker_run = subprocess.run(docker_call, stdout=subprocess.PIPE)
        cid = str(docker_run.stdout, 'utf-8').strip()
        logging.info('Started container {}'.format(cid))
        try:
            # Initial GPU stats snapshot; best-effort.
            if task.limits.gpus is not None and task.limits.gpus > 0:
                result.stats.update(gpu_stats(gpus=limited_gpuset(full_gpuset(), task.limits.gpus, task.limits.gpus_offset)))
        except:
            pass
        time.sleep(0.1)
        # Poll the container state, accumulating cgroup/GPU stats, until it
        # stops (or inspect fails, e.g. the container disappeared).
        while True:
            try:
                docker_state_run = subprocess.run(['docker', 'inspect', '--format', '{{json .State}}', cid], stdout=subprocess.PIPE)
                state = json.loads(str(docker_state_run.stdout, 'utf-8'))
            except:
                break
            try:
                result.stats.update(cgs.name_stats(cid))
                if task.limits.gpus is not None and task.limits.gpus > 0:
                    result.stats.update(gpu_stats(gpus=limited_gpuset(full_gpuset(), task.limits.gpus, task.limits.gpus_offset)))
            except:
                pass
            time.sleep(0.1)
            if not state['Running']:
                result.result = state['ExitCode']
                try:
                    result.stats.time = dateutil.parser.parse(state['FinishedAt']) - dateutil.parser.parse(state['StartedAt'])
                except:
                    result.stats.time = None
                break
            # Hard wall-clock timeout (2s grace over the task time limit).
            if task.limits.time is not None and datetime.datetime.now() - start_time > task.limits.time + datetime.timedelta(seconds=2):
                docker_kill_run = subprocess.run(['docker', 'kill', docker_task])
        subprocess.run(['docker', 'logs', cid], stdout=subprocess.PIPE)
        try:
            # Merge stats reported by the in-container stage2, if present.
            summary = KolejkaResult(jailed_result_path)
            result.stats.update(summary.stats)
        except:
            pass
        stop_time = datetime.datetime.now()
        if result.stats.time is None:
            result.stats.time = stop_time - start_time
        # Current-usage gauges are meaningless after the container stopped.
        result.stats.pids.usage = None
        result.stats.memory.usage = None
        result.stats.memory.swap = None
        # Move produced files (except the result spec) from the jail into
        # the result directory, skipping paths that escape the jail via
        # symlinks.
        for dirpath, dirnames, filenames in os.walk(jailed_result_path):
            for filename in filenames:
                abspath = os.path.join(dirpath, filename)
                realpath = os.path.realpath(abspath)
                if realpath.startswith(os.path.realpath(jailed_result_path) + '/'):
                    relpath = abspath[len(jailed_result_path) + 1:]
                    if relpath != RESULT_SPEC:
                        destpath = os.path.join(result.path, relpath)
                        os.makedirs(os.path.dirname(destpath), exist_ok=True)
                        shutil.move(realpath, destpath)
                        os.chmod(destpath, 0o640)
                        result.files.add(relpath)
        result.commit()
        os.chmod(result.spec_path, 0o640)
        # Final container cleanup.
        for docker_clean in docker_cleanup:
            silent_call(docker_clean)
def foreman():
    """Foreman main loop: dequeue tasks and run them concurrently.

    Repeatedly asks the server for tasks that fit the configured limits
    and starts one ``foreman_single`` process per admitted task, doing
    resource accounting (cpus/gpus/memory/swap/pids/storage/image/
    workspace) so the concurrently running batch never exceeds the
    configured totals.  Exclusive tasks run alone.  Runs forever; errors
    back off for ``config.interval`` seconds.
    """
    config = foreman_config()
    limits = KolejkaLimits()
    limits.cpus = config.cpus
    limits.memory = config.memory
    limits.swap = config.swap
    limits.pids = config.pids
    limits.storage = config.storage
    limits.image = config.image
    limits.workspace = config.workspace
    limits.time = config.time
    limits.network = config.network
    limits.gpus = config.gpus
    client = KolejkaClient()
    while True:
        try:
            tasks = client.dequeue(config.concurency, limits, config.tags)
            if len(tasks) == 0:
                time.sleep(config.interval)
            else:
                check_python_volume()
                while len(tasks) > 0:
                    # Fresh resource pool for each batch of concurrent tasks.
                    resources = KolejkaLimits()
                    resources.update(limits)
                    # image_usage tracks the peak image-size allowance per
                    # image across the batch (images are shared, not summed).
                    image_usage = dict()
                    processes = list()
                    cpus_offset = 0
                    gpus_offset = 0
                    for task in tasks:
                        if len(processes) >= config.concurency:
                            break
                        # An exclusive task must be the only one running.
                        if task.exclusive and len(processes) > 0:
                            break
                        task.limits.update(limits)
                        task.limits.cpus_offset = cpus_offset
                        task.limits.gpus_offset = gpus_offset
                        # Admit the task only if it fits in the remaining pool.
                        ok = True
                        if resources.cpus is not None and task.limits.cpus > resources.cpus:
                            ok = False
                        if task.limits.gpus is not None and task.limits.gpus > 0:
                            if resources.gpus is None or task.limits.gpus > resources.gpus:
                                ok = False
                        # NOTE: the original code repeated the gpus check here
                        # unguarded (TypeError when task.limits.gpus is None);
                        # the duplicate was removed.
                        if resources.memory is not None and task.limits.memory > resources.memory:
                            ok = False
                        if resources.swap is not None and task.limits.swap > resources.swap:
                            ok = False
                        if resources.pids is not None and task.limits.pids > resources.pids:
                            ok = False
                        if resources.storage is not None and task.limits.storage > resources.storage:
                            ok = False
                        if resources.image is not None:
                            # Only the increase of the per-image peak counts
                            # against the pool.
                            image_usage_add = max(image_usage.get(task.image, 0), task.limits.image) - image_usage.get(task.image, 0)
                            if image_usage_add > resources.image:
                                ok = False
                        if resources.workspace is not None and task.limits.workspace > resources.workspace:
                            ok = False
                        if ok:
                            proc = Process(target=foreman_single, args=(config.temp_path, task))
                            processes.append(proc)
                            # Charge the admitted task against the pool.
                            cpus_offset += task.limits.cpus
                            if resources.cpus is not None:
                                resources.cpus -= task.limits.cpus
                            if task.limits.gpus is not None:
                                # Guarded: gpus may be None (no GPU request).
                                gpus_offset += task.limits.gpus
                                if resources.gpus is not None:
                                    resources.gpus -= task.limits.gpus
                            if resources.memory is not None:
                                resources.memory -= task.limits.memory
                            if resources.swap is not None:
                                resources.swap -= task.limits.swap
                            if resources.pids is not None:
                                resources.pids -= task.limits.pids
                            if resources.storage is not None:
                                resources.storage -= task.limits.storage
                            if resources.image is not None:
                                resources.image -= image_usage_add
                                image_usage[task.image] = max(image_usage.get(task.image, 0), task.limits.image)
                            if resources.workspace is not None:
                                resources.workspace -= task.limits.workspace
                            tasks = tasks[1:]
                            if task.exclusive:
                                break
                        else:
                            break
                    if config.image is not None:
                        # Prune/pull images to fit the batch plus the queue.
                        manage_images(config.pull, config.image, image_usage, [task.image for task in tasks])
                    for proc in processes:
                        proc.start()
                    for proc in processes:
                        proc.join()
        except KeyboardInterrupt:
            raise
        except Exception:
            traceback.print_exc()
            time.sleep(config.interval)