def execute(config: str, debug: bool, params):
    check_statuses()
    _create_computer()
    _create_docker()

    # Fail all InProgress tasks
    logger = create_logger(_session, __name__)
    provider = TaskProvider(_session)
    step_provider = StepProvider(_session)

    for t in provider.by_status(TaskStatus.InProgress,
                                worker_index=WORKER_INDEX):
        step = step_provider.last_for_task(t.id)
        logger.error(
            f'Task Id = {t.id} was in InProgress state '
            'when another task arrived to the same worker',
            ComponentType.Worker, t.computer_assigned, t.id, step)
        provider.change_status(t, TaskStatus.Failed)

    # Create dags
    dags = _dag(config, debug, params=params)
    for dag in dags:
        for ids in dag.values():
            for id in ids:
                task = provider.by_id(id)
                task.gpu_assigned = ','.join(
                    [str(i) for i in range(torch.cuda.device_count())])
                provider.commit()

                execute_by_id(id, exit=False)
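# NOTE: the nested loop above assumes `_dag` returns a list of dicts, each
# mapping some DAG-level key to a list of task ids, e.g. (hypothetical
# values, inferred from the iteration shape rather than documented):
#
#     [{'train': [1, 2], 'predict': [3]}]
#
# Each task id is assigned every visible GPU and then run in-process via
# `execute_by_id`.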
def start(daemon: bool, debug: bool, workers: int, log_level: str):
    """
    Start both server and worker on the same machine.

    It starts: redis-server, site, worker_supervisor, workers
    """
    # creating supervisord config
    supervisor_command = 'mlcomp-worker worker-supervisor'
    worker_command = 'mlcomp-worker worker'
    server_command = 'mlcomp-server start-site'

    if debug:
        supervisor_command = 'python mlcomp/worker/__main__.py ' \
                             'worker-supervisor'
        worker_command = 'python mlcomp/worker/__main__.py worker'
        server_command = 'python mlcomp/server/__main__.py start-site'

    folder = os.path.dirname(os.path.dirname(__file__))
    redis_path = os.path.join(folder, 'bin/redis-server')

    daemon_text = 'false' if daemon else 'true'
    text = [
        '[supervisord]',
        f'nodaemon={daemon_text}',
        '',
        '[program:redis]',
        f'command={redis_path} --port {REDIS_PORT}'
        f' --requirepass {REDIS_PASSWORD}',
    ]
    conf = os.path.join(CONFIG_FOLDER, 'supervisord-redis.conf')
    with open(conf, 'w') as f:
        f.writelines('\n'.join(text))

    Popen(['supervisord', f'--configuration={conf}',
           f'--loglevel={log_level}'])

    sleep(5)

    check_statuses()

    daemon_text = 'false' if daemon else 'true'
    text = [
        '[supervisord]',
        f'nodaemon={daemon_text}',
        '',
        '[program:supervisor]',
        f'command={supervisor_command}',
        'autostart=true',
        'autorestart=true',
        '',
        '[program:server]',
        f'command={server_command}',
        'autostart=true',
        'autorestart=true',
        ''
    ]
    for p in range(workers):
        text.append(f'[program:worker{p}]')
        text.append(f'command={worker_command} {p}')
        text.append('autostart=true')
        text.append('autorestart=true')
        text.append('')

    conf = os.path.join(CONFIG_FOLDER, 'supervisord.conf')
    with open(conf, 'w') as f:
        f.writelines('\n'.join(text))

    os.system(f'supervisord -c {conf} -e {log_level}')
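# For reference, a sketch of the second supervisord.conf generated above,
# assuming daemon=False, workers=2, and the non-debug commands (values are
# illustrative, traced directly from the `text` list):
#
#     [supervisord]
#     nodaemon=true
#
#     [program:supervisor]
#     command=mlcomp-worker worker-supervisor
#     autostart=true
#     autorestart=true
#
#     [program:server]
#     command=mlcomp-server start-site
#     autostart=true
#     autorestart=true
#
#     [program:worker0]
#     command=mlcomp-worker worker 0
#     autostart=true
#     autorestart=true
#
#     [program:worker1]
#     command=mlcomp-worker worker 1
#     autostart=true
#     autorestart=true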
def stop():
    """
    Stop the supervisord started by the start command
    """
    check_statuses()
    lines = os.popen('ps -ef | grep supervisord').readlines()
    for line in lines:
        if 'mlcomp/configs/supervisord.conf' not in line:
            continue

        pid = int(line.split()[1])
        kill_child_processes(pid)
def sync(project: str, computer: str, only_from: bool, only_to: bool,
         online: bool):
    """
    Sync the specified project on this computer with other computers
    """
    check_statuses()
    _create_computer()
    _create_docker()

    computer = computer or socket.gethostname()
    provider = ComputerProvider(_session)
    project_provider = ProjectProvider(_session)

    computer = provider.by_name(computer)
    computers = provider.all_with_last_activtiy()
    p = project_provider.by_name(project)
    assert p, f'Project={project} is not found'

    sync_folders = yaml_load(p.sync_folders)
    ignore_folders = yaml_load(p.ignore_folders)

    sync_folders = correct_folders(sync_folders, p.name)
    ignore_folders = correct_folders(ignore_folders, p.name)

    if not isinstance(sync_folders, list):
        sync_folders = []
    if not isinstance(ignore_folders, list):
        ignore_folders = []

    folders = [[s, ignore_folders] for s in sync_folders]

    for c in computers:
        if c.name != computer.name:
            if online and (now() - c.last_activity).total_seconds() > 100:
                continue

            if not only_from:
                sync_directed(_session, computer, c, folders)
            if not only_to:
                sync_directed(_session, c, computer, folders)
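# The project's `sync_folders` and `ignore_folders` fields are expected to
# hold YAML lists (anything else degrades to an empty list above). A
# hypothetical project value, for illustration only:
#
#     sync_folders:
#       - data
#       - models
#     ignore_folders:
#       - data/tmp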
def start(daemon: bool, debug: bool, workers: int, log_level: str):
    """
    Start worker_supervisor and workers
    """
    check_statuses()

    # creating supervisord config
    supervisor_command = 'mlcomp-worker worker-supervisor'
    worker_command = 'mlcomp-worker worker'
    if debug:
        supervisor_command = 'python mlcomp/worker/__main__.py ' \
                             'worker-supervisor'
        worker_command = 'python mlcomp/worker/__main__.py worker'

    daemon_text = 'false' if daemon else 'true'
    text = [
        '[supervisord]',
        f'nodaemon={daemon_text}',
        '',
        '[program:supervisor]',
        f'command={supervisor_command} --workers {workers}',
        'autostart=true',
        'autorestart=true',
        ''
    ]
    for p in range(workers):
        text.append(f'[program:worker{p}]')
        text.append(f'command={worker_command} {p}')
        text.append('autostart=true')
        text.append('autorestart=true')
        text.append('')

    conf = os.path.join(CONFIG_FOLDER, 'supervisord.conf')
    with open(conf, 'w') as f:
        f.writelines('\n'.join(text))

    os.system(f'supervisord -c {conf} -e {log_level}')
def worker_supervisor(workers: int):
    """
    Start the worker supervisor. This process controls the workers running
    on the same machine. It also writes resource-consumption metrics.
    """
    check_statuses()

    host = socket.gethostname()
    logger = create_logger(_session, 'worker_supervisor')
    logger.info('worker_supervisor start',
                ComponentType.WorkerSupervisor, host)

    _create_computer(workers)
    _create_docker()
    start_schedule([(stop_processes_not_exist, 10)])

    if DOCKER_MAIN:
        syncer = FileSync()
        start_schedule([(worker_usage, 0)])
        start_schedule([(syncer.sync, 0)])

    name = f'{host}_{DOCKER_IMG}_supervisor'
    argv = [
        'worker', '--loglevel=INFO', '-P=solo', f'-n={name}', '-O fair',
        '-c=1', '--prefetch-multiplier=1', '-Q', f'{name}'
    ]

    logger.info('worker_supervisor run celery',
                ComponentType.WorkerSupervisor, host)

    app.worker_main(argv)
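# The argv passed to `app.worker_main` is roughly the programmatic
# equivalent of this CLI invocation (assuming `app` is the project's Celery
# application; the command below is a sketch, not taken from the source):
#
#     celery worker --loglevel=INFO -P solo -n <host>_<img>_supervisor \
#         -O fair -c 1 --prefetch-multiplier=1 -Q <host>_<img>_supervisor
#
# i.e. a single-process, concurrency-1 worker bound to its own queue, so
# each supervisor consumes only the tasks addressed to its machine.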
def dag(config: str, control_reqs: bool, params):
    check_statuses()
    _dag(config, control_reqs=control_reqs, params=params)
def stop_site():
    """
    Stop site
    """
    check_statuses()
    _stop_server()
def start_site():
    """
    Start only the site
    """
    check_statuses()
    _start_server()