def sync(project: str, computer: str, only_from: bool, only_to: bool):
    """Sync a project's data/models folders between this computer and all
    other registered computers.

    only_from: skip pushing from this computer; only_to: skip pulling to it.
    """
    _create_computer()

    host = computer or socket.gethostname()
    provider = ComputerProvider(_session)
    project_provider = ProjectProvider(_session)

    computer = provider.by_name(host)
    computers = provider.all()

    p = project_provider.by_name(project)
    assert p, f'Project={project} is not found'

    # each entry pairs a folder to sync with its list of ignored subfolders
    excluded = [str(f) for f in yaml_load(p.ignore_folders)]
    folders_excluded = [
        [join('data', p.name), excluded],
        [join('models', p.name), []],
    ]

    for other in computers:
        if other.name == computer.name:
            continue
        if not only_from:
            sync_directed(_session, computer, other, folders_excluded)
        if not only_to:
            sync_directed(_session, other, computer, folders_excluded)
def computers():
    """Return a paginated list of computers, sorted by name."""
    payload = request_data()
    options = PaginatorOptions(**payload['paginator'])
    options.sort_column = 'name'
    return ComputerProvider(_read_session).get(payload, options)
def sync_manual(self, computer: Computer, provider: ComputerProvider):
    """Perform the sync requested by a manual button click.

    Pulls the project's models folder from every online docker host
    (except this computer), then clears the manual_sync request.
    """
    if not computer.meta:
        return

    meta = yaml_load(computer.meta)
    if 'manual_sync' not in meta:
        return

    manual_sync = meta['manual_sync']
    project_provider = ProjectProvider(self.session)
    docker_provider = DockerProvider(self.session)

    dockers = docker_provider.get_online()
    project = project_provider.by_id(manual_sync['project'])

    for docker in dockers:
        # never sync a computer with itself
        if docker.computer == computer.name:
            continue

        source = provider.by_name(docker.computer)
        sync_directed(
            self.session,
            target=computer,
            source=source,
            ignore_folders=[[join('models', project.name), []]]
        )

    # request handled; remove it so it is not repeated
    del meta['manual_sync']
    computer.meta = yaml_dump(meta)
    provider.update()
def __init__(
        self,
        args: Args,
        report: ReportLayoutInfo,
        distr_info: dict,
        resume: dict,
        grid_config: dict,
        trace: str,
        params: dict,
        **kwargs
):
    """Store executor configuration, create providers, reset run state."""
    super().__init__(**kwargs)

    self.order = 0

    # configuration handed in from the executor config
    self.args = args
    self.report = report
    self.distr_info = distr_info
    self.resume = resume
    self.grid_config = grid_config
    self.trace = trace
    self.params = params

    # db providers
    self.series_provider = ReportSeriesProvider(self.session)
    self.computer_provider = ComputerProvider(self.session)

    # runtime state, filled in during the run
    self.experiment = None
    self.runner = None
    self.master = True
    self.checkpoint_resume = False
    self.checkpoint_stage_epoch = 0
    self.last_batch_logged = None
    self.loader_started_time = None
    self.parent = None
    self.loader_step_start = 0
def copy_remote(session: Session, computer_from: str, path_from: str,
                path_to: str):
    """Copy a file from `computer_from` to the local `path_to`.

    Uses scp when the source is a different computer, a plain cp when the
    source is this very machine. Returns True iff the destination file
    exists afterwards.
    """
    provider = ComputerProvider(session)
    src = provider.by_name(computer_from)
    host = socket.gethostname()

    if host != computer_from:
        c = f'scp -P {src.port} {src.user}@{src.ip}:{path_from} {path_to}'
    else:
        # BUG FIX: the original built this command string but never assigned
        # it, so the local-copy branch raised NameError on `c` below
        c = f'cp {path_from} {path_to}'

    subprocess.check_output(c, shell=True)
    return os.path.exists(path_to)
def computer_sync_end():
    """Mark computers for a manual project sync.

    Writes a 'manual_sync' request into the meta of every computer, or
    only the one named in the request payload.
    """
    data = request_data()
    provider = ComputerProvider(_write_session)

    requested = data.get('computer')
    for computer in provider.all():
        # when a specific computer was requested, skip all others
        if requested and requested != computer.name:
            continue

        meta = yaml_load(computer.meta)
        meta['manual_sync'] = {
            'project': data['id'],
            'ignore_folders': yaml_load(data['ignore_folders'])
        }
        computer.meta = yaml_dump(meta)
        provider.update()
def sync_manual(self, computer: Computer, provider: ComputerProvider):
    """Perform the sync requested by a manual button click.

    Pulls the project's configured sync folders from every online docker
    host (except this computer), then clears the manual_sync request.
    """
    if not computer.meta:
        return

    meta = yaml_load(computer.meta)
    if 'manual_sync' not in meta:
        return

    manual_sync = meta['manual_sync']
    project_provider = ProjectProvider(self.session)
    docker_provider = DockerProvider(self.session)

    dockers = docker_provider.get_online()
    project = project_provider.by_id(manual_sync['project'])

    sync_folders = correct_folders(manual_sync['sync_folders'],
                                   project.name)
    ignore_folders = correct_folders(manual_sync['ignore_folders'],
                                     project.name)

    # tolerate malformed folder settings by falling back to empty lists
    if not isinstance(sync_folders, list):
        sync_folders = []
    if not isinstance(ignore_folders, list):
        ignore_folders = []

    for docker in dockers:
        # never sync a computer with itself
        if docker.computer == computer.name:
            continue

        source = provider.by_name(docker.computer)
        folders = [[s, ignore_folders] for s in sync_folders]

        computer.syncing_computer = source.name
        provider.update()
        try:
            sync_directed(
                self.session,
                target=computer,
                source=source,
                folders=folders
            )
        except Exception as e:
            self.process_error(e)

    # request handled; remove it so it is not repeated
    del meta['manual_sync']
    computer.meta = yaml_dump(meta)
    provider.update()
def create_base(self):
    """Commit the session, recreate all providers and the live queue list."""
    self.session.commit()

    self.provider = TaskProvider(self.session)
    self.computer_provider = ComputerProvider(self.session)
    self.docker_provider = DockerProvider(self.session)
    self.auxiliary_provider = AuxiliaryProvider(self.session)
    self.dag_provider = DagProvider(self.session)

    # queues served by dockers seen alive within the last 15 seconds
    self.queues = []
    for d in self.docker_provider.all():
        if d.last_activity >= now() - datetime.timedelta(seconds=15):
            self.queues.append(f'{d.computer}_{d.name}')

    self.auxiliary['queues'] = self.queues
def sync(self):
    """One file-sync iteration: pull task files from other online computers
    and record which tasks have been synced."""
    hostname = socket.gethostname()
    try:
        provider = ComputerProvider(self.session)
        task_synced_provider = TaskSyncedProvider(self.session)

        computer = provider.by_name(hostname)
        sync_start = now()

        if FILE_SYNC_INTERVAL == 0:
            # syncing disabled via config; just idle briefly
            time.sleep(1)
        else:
            computers = provider.all_with_last_activtiy()
            # keep only computers that reported activity in the last 10 s
            computers = [
                c for c in computers
                if (now() - c.last_activity).total_seconds() < 10
            ]
            computers_names = {c.name for c in computers}

            for c, project, tasks in task_synced_provider.for_computer(
                    computer.name):
                if c.name not in computers_names:
                    self.logger.info(
                        f'Computer = {c.name} '
                        f'is offline. Can not sync',
                        ComponentType.WorkerSupervisor, hostname)
                    continue
                # skip sources that are currently busy syncing elsewhere
                if c.syncing_computer:
                    continue

                # folders to sync, each paired with its ignore list
                excluded = list(map(str, yaml_load(project.ignore_folders)))
                folders_excluded = [[join('data', project.name), excluded],
                                    [join('models', project.name), []]]

                # publish who we are syncing from before the slow copy
                computer.syncing_computer = c.name
                provider.update()

                sync_directed(self.session, c, computer, folders_excluded)

                # remember these tasks as synced for this computer
                for t in tasks:
                    task_synced_provider.add(
                        TaskSynced(computer=computer.name, task=t.id))

            time.sleep(FILE_SYNC_INTERVAL)

        computer.last_synced = sync_start
        computer.syncing_computer = None
        provider.update()
    except Exception as e:
        # on a broken sqlalchemy session, rebuild session and logger
        if Session.sqlalchemy_error(e):
            Session.cleanup('FileSync')
            self.session = Session.create_session(key='FileSync')
            self.logger = create_logger(self.session, 'FileSync')
        self.logger.error(traceback.format_exc(),
                          ComponentType.WorkerSupervisor, hostname)
def describe_resources(computer: str, axis):
    """Plot a computer's mean resource-usage history onto a matplotlib axis.

    Disk usage is skipped; remaining metrics are plotted as percentages.
    """
    provider = ComputerProvider()
    rows = provider.get({})['data']
    row = [r for r in rows if r['name'] == computer][0]
    usage = row['usage_history']

    times = [
        datetime.datetime.strptime(t, provider.datetime_format)
        for t in usage['time']
    ]
    for item in usage['mean']:
        if item['name'] == 'disk':
            continue
        axis.plot(times, item['value'], label=item['name'])

    axis.set_title('Resources')
    axis.set_ylabel('%')
    axis.legend(loc='lower left')
def sync(project: str, computer: str, only_from: bool, only_to: bool,
         online: bool):
    """Sync a project's configured folders between this computer and the
    other registered computers.

    only_from: skip pushing from this computer; only_to: skip pulling to
    it; online: sync only with recently active computers.
    """
    check_statuses()

    _create_computer()
    _create_docker()

    host = computer or socket.gethostname()
    provider = ComputerProvider(_session)
    project_provider = ProjectProvider(_session)

    computer = provider.by_name(host)
    computers = provider.all_with_last_activtiy()

    p = project_provider.by_name(project)
    assert p, f'Project={project} is not found'

    sync_folders = correct_folders(yaml_load(p.sync_folders), p.name)
    ignore_folders = correct_folders(yaml_load(p.ignore_folders), p.name)

    # tolerate malformed folder settings by falling back to empty lists
    if not isinstance(sync_folders, list):
        sync_folders = []
    if not isinstance(ignore_folders, list):
        ignore_folders = []

    folders = [[s, ignore_folders] for s in sync_folders]

    for other in computers:
        if other.name == computer.name:
            continue
        # in online mode, skip computers silent for more than 100 seconds
        if online and (now() - other.last_activity).total_seconds() > 100:
            continue
        if not only_from:
            sync_directed(_session, computer, other, folders)
        if not only_to:
            sync_directed(_session, other, computer, folders)
def worker_usage(session: Session, logger):
    """Sample this machine's resource usage and store the mean in the db.

    Also refreshes the docker record's last_activity as a heartbeat.
    """
    provider = ComputerProvider(session)
    docker_provider = DockerProvider(session)

    computer = socket.gethostname()
    docker = docker_provider.get(computer, DOCKER_IMG)

    samples = 1 if MODE_ECONOMIC else 10
    usages = []

    for _ in range(samples):
        # noinspection PyProtectedMember
        memory = dict(psutil.virtual_memory()._asdict())
        usage = {
            'cpu': psutil.cpu_percent(),
            'disk': disk(ROOT_FOLDER)[1],
            'memory': memory['percent'],
            'gpu': [{
                'memory': g.memoryUtil * 100,
                'load': g.load * 100
            } for g in GPUtil.getGPUs()]
        }
        provider.current_usage(computer, usage)
        usages.append(usage)

        # heartbeat
        docker.last_activity = now()
        docker_provider.update()

        time.sleep(10 if MODE_ECONOMIC else 1)

    usage = json.dumps({'mean': dict_func(usages, np.mean)})
    provider.add(ComputerUsage(computer=computer, usage=usage, time=now()))
def _create_computer():
    """Register this machine (or refresh its record) in the database."""
    total_memory, _, _ = memory()
    total_disk, _, _ = disk(ROOT_FOLDER)

    record = Computer(
        name=socket.gethostname(),
        gpu=len(GPUtil.getGPUs()),
        cpu=cpu_count(),
        memory=total_memory,
        ip=IP,
        port=PORT,
        user=get_username(),
        disk=total_disk,
        root_folder=ROOT_FOLDER
    )
    ComputerProvider(_session).create_or_update(record, 'name')
def _create_computer():
    """Register this machine (or refresh its record) in the database,
    including its sync/processing capability flags."""
    total_memory, _, _ = memory()
    total_disk, _, _ = disk(ROOT_FOLDER)

    record = Computer(
        name=socket.gethostname(),
        gpu=torch.cuda.device_count(),
        cpu=cpu_count(),
        memory=total_memory,
        ip=IP,
        port=PORT,
        user=get_username(),
        disk=total_disk,
        root_folder=ROOT_FOLDER,
        sync_with_this_computer=SYNC_WITH_THIS_COMPUTER,
        can_process_tasks=CAN_PROCESS_TASKS
    )
    ComputerProvider(_session).create_or_update(record, 'name')
def worker_usage(session: Session, logger):
    """Sample this machine's resource usage for ~10 seconds and store the
    mean in the database.

    Also refreshes the docker record's last_activity as a heartbeat.
    Tolerates hosts where GPU enumeration fails.
    """
    provider = ComputerProvider(session)
    docker_provider = DockerProvider(session)

    computer = socket.gethostname()
    docker = docker_provider.get(computer, DOCKER_IMG)

    # enough samples to cover roughly 10 seconds, at least one
    samples = max(1, int(10 / WORKER_USAGE_INTERVAL))
    usages = []

    for _ in range(samples):
        # noinspection PyProtectedMember
        memory = dict(psutil.virtual_memory()._asdict())

        try:
            gpus = GPUtil.getGPUs()
        except ValueError as err:
            logger.info(f"Active GPUs not found: {err}")
            gpus = []

        usage = {
            'cpu': psutil.cpu_percent(),
            'disk': disk(ROOT_FOLDER)[1],
            'memory': memory['percent'],
            'gpu': [{
                'memory': g.memoryUtil * 100,
                'load': g.load * 100
            } for g in gpus]
        }
        provider.current_usage(computer, usage)
        usages.append(usage)

        # heartbeat
        docker.last_activity = now()
        docker_provider.update()

        time.sleep(WORKER_USAGE_INTERVAL)

    usage = json.dumps({'mean': dict_func(usages, np.mean)})
    provider.add(ComputerUsage(computer=computer, usage=usage, time=now()))
class SupervisorBuilder:
    """Plans one scheduling iteration: assigns pending tasks to computers,
    propagates parent-task statuses, handles stop/restart requests, and
    records auxiliary diagnostics."""

    def __init__(self):
        self.session = Session.create_session(key='SupervisorBuilder')
        self.logger = create_logger(self.session, 'SupervisorBuilder')
        self.provider = None
        self.computer_provider = None
        self.docker_provider = None
        self.auxiliary_provider = None
        self.dag_provider = None
        self.queues = None
        self.not_ran_tasks = None
        self.dep_status = None
        self.computers = None
        self.auxiliary = {}
        self.tasks = []
        self.tasks_stop = []
        self.dags_start = []
        self.sent_tasks = 0

    def create_base(self):
        """Commit the session and recreate providers and the queue list."""
        self.session.commit()

        self.provider = TaskProvider(self.session)
        self.computer_provider = ComputerProvider(self.session)
        self.docker_provider = DockerProvider(self.session)
        self.auxiliary_provider = AuxiliaryProvider(self.session)
        self.dag_provider = DagProvider(self.session)

        # queues served by dockers seen alive within the last 15 seconds
        self.queues = [
            f'{d.computer}_{d.name}' for d in self.docker_provider.all()
            if d.last_activity >= now() - datetime.timedelta(seconds=15)
        ]
        self.auxiliary['queues'] = self.queues

    def load_tasks(self):
        """Load pending tasks and their dependency statuses."""
        self.tasks = self.provider.by_status(TaskStatus.NotRan,
                                             TaskStatus.InProgress,
                                             TaskStatus.Queued)
        not_ran_tasks = [t for t in self.tasks
                         if t.status == TaskStatus.NotRan.value]
        self.not_ran_tasks = [task for task in not_ran_tasks
                              if not task.debug]
        # schedule gpu-hungry tasks first
        self.not_ran_tasks = sorted(self.not_ran_tasks,
                                    key=lambda x: x.gpu or 0,
                                    reverse=True)
        self.logger.debug(f'Found {len(not_ran_tasks)} not ran tasks',
                          ComponentType.Supervisor)

        self.dep_status = self.provider.dependency_status(self.not_ran_tasks)
        self.auxiliary['not_ran_tasks'] = [
            {
                'id': t.id,
                'name': t.name,
                'dep_status': [
                    TaskStatus(s).name
                    for s in self.dep_status.get(t.id, set())
                ]
            } for t in not_ran_tasks[:5]
        ]

    def load_computers(self):
        """Build per-computer free-resource snapshots, subtracting the
        resources already booked by in-progress/queued tasks."""
        computers = self.computer_provider.computers()
        for computer in computers.values():
            # gpu becomes a slot list: 0 = free, task id = taken
            computer['gpu'] = [0] * computer['gpu']
            computer['ports'] = set()
            computer['cpu_total'] = computer['cpu']
            computer['memory_total'] = computer['memory']
            computer['gpu_total'] = len(computer['gpu'])
            # NOTE: the original had a no-op self-assignment of
            # can_process_tasks here; the key comes from the provider as-is

        tasks = [
            t for t in self.tasks
            if t.status in [TaskStatus.InProgress.value,
                            TaskStatus.Queued.value]
        ]

        for task in tasks:
            if task.computer_assigned is None:
                continue
            assigned = task.computer_assigned
            comp_assigned = computers[assigned]
            comp_assigned['cpu'] -= task.cpu

            if task.gpu_assigned is not None:
                for g in task.gpu_assigned.split(','):
                    comp_assigned['gpu'][int(g)] = task.id
            comp_assigned['memory'] -= task.memory * 1024

            # a rank-0 distributed task holds its master port
            info = yaml_load(task.additional_info)
            if 'distr_info' in info:
                dist_info = info['distr_info']
                if dist_info['rank'] == 0:
                    comp_assigned['ports'].add(dist_info['master_port'])

        self.computers = [
            {
                **value,
                'name': name
            } for name, value in computers.items()
        ]
        self.auxiliary['computers'] = self.computers

    def process_to_celery(self, task: Task, queue: str, computer: dict):
        """Send a task to celery and book its resources on the computer."""
        r = execute.apply_async((task.id,), queue=queue, retry=False)
        task.status = TaskStatus.Queued.value
        task.computer_assigned = computer['name']
        task.celery_id = r.id

        if task.computer_assigned is not None:
            if task.gpu_assigned:
                for g in map(int, task.gpu_assigned.split(',')):
                    computer['gpu'][g] = task.id
            computer['cpu'] -= task.cpu
            computer['memory'] -= task.memory * 1024

        self.logger.info(
            f'Sent task={task.id} to celery. Queue = {queue} '
            f'Task status = {task.status} Celery_id = {r.id}',
            ComponentType.Supervisor)

        self.provider.update()

    def create_service_task(
            self,
            task: Task,
            gpu_assigned=None,
            distr_info: dict = None,
            resume: dict = None
    ):
        """Create a child Service task copying most fields of the parent."""
        new_task = Task(
            name=task.name,
            computer=task.computer,
            executor=task.executor,
            status=TaskStatus.NotRan.value,
            type=TaskType.Service.value,
            gpu_assigned=gpu_assigned,
            parent=task.id,
            report=task.report,
            dag=task.dag
        )
        new_task.additional_info = task.additional_info

        if distr_info:
            additional_info = yaml_load(new_task.additional_info)
            additional_info['distr_info'] = distr_info
            new_task.additional_info = yaml_dump(additional_info)
        if resume:
            additional_info = yaml_load(new_task.additional_info)
            additional_info['resume'] = resume
            new_task.additional_info = yaml_dump(additional_info)

        return self.provider.add(new_task)

    def find_port(self, c: dict, docker_name: str):
        """Return a free port on the computer within the docker's range."""
        docker = self.docker_provider.get(c['name'], docker_name)
        ports = list(map(int, docker.ports.split('-')))
        for p in range(ports[0], ports[1] + 1):
            if p not in c['ports']:
                return p
        raise Exception(f'All ports in {c["name"]} are taken')

    def _process_task_valid_computer(self, task: Task, c: dict,
                                     single_node: bool):
        """Return an error string if the task cannot run on the computer,
        otherwise None."""
        if not c['can_process_tasks']:
            return 'this computer can not process tasks'
        if task.computer is not None and task.computer != c['name']:
            return 'name set in the config!= name of this computer'
        if task.cpu > c['cpu']:
            return f'task cpu = {task.cpu} > computer' \
                   f' free cpu = {c["cpu"]}'
        if task.memory > c['memory']:
            # BUG FIX: the original message reported the task's cpu here
            # instead of its memory requirement
            return f'task memory = {task.memory} > computer ' \
                   f'free memory = {c["memory"]}'

        queue = f'{c["name"]}_' \
                f'{task.dag_rel.docker_img or "default"}'
        if queue not in self.queues:
            return f'required queue = {queue} not in queues'

        if task.gpu > 0 and not any(g == 0 for g in c['gpu']):
            return f'task requires gpu, but there is not any free'

        free_gpu = sum(g == 0 for g in c['gpu'])
        if single_node and task.gpu > free_gpu:
            return f'task requires {task.gpu} ' \
                   f'but there are only {free_gpu} free'

    def _process_task_get_computers(
            self, executor: dict, task: Task, auxiliary: dict
    ):
        """Select computers able to run the task; record each rejection
        reason into the auxiliary diagnostics."""
        single_node = executor.get('single_node', True)

        computers = []
        for c in self.computers:
            error = self._process_task_valid_computer(task, c, single_node)
            auxiliary['computers'].append({'name': c['name'],
                                           'error': error})
            if not error:
                computers.append(c)

        if task.gpu > 0 and single_node and len(computers) > 0:
            # pick the single computer with the most free gpus.
            # BUG FIX: the original sort key read the leaked loop variable
            # `c` instead of the sorted element `x`, making the key constant
            computers = sorted(
                computers,
                key=lambda x: sum(g == 0 for g in x['gpu']),
                reverse=True
            )[:1]

        free_gpu = sum(sum(g == 0 for g in c['gpu']) for c in computers)
        if task.gpu > free_gpu:
            auxiliary['not_valid'] = f'gpu required by the ' \
                                     f'task = {task.gpu},' \
                                     f' but there are only {free_gpu} ' \
                                     f'free gpus'
            return []
        return computers

    def _process_task_to_send(
            self, executor: dict, task: Task, computers: List[dict]
    ):
        """Distribute the task over computers/gpus.

        Returns the [computer, queue, gpu index] triples for a distributed
        run; non-distributed tasks are sent to celery directly.
        """
        distr = executor.get('distr', True)
        to_send = []

        for computer in computers:
            queue = f'{computer["name"]}_' \
                    f'{task.dag_rel.docker_img or "default"}'

            if task.gpu_max > 1 and distr:
                # one worker per free gpu, up to gpu_max workers
                for index, task_taken_gpu in enumerate(computer['gpu']):
                    if task_taken_gpu:
                        continue
                    to_send.append([computer, queue, index])

                    if len(to_send) >= task.gpu_max:
                        break
                if len(to_send) >= task.gpu_max:
                    break
            elif task.gpu_max > 0:
                # single worker holding up to gpu_max gpus
                cuda_devices = []
                for index, task_taken_gpu in enumerate(computer['gpu']):
                    if task_taken_gpu:
                        continue
                    cuda_devices.append(index)
                    if len(cuda_devices) >= task.gpu_max:
                        break

                task.gpu_assigned = ','.join(map(str, cuda_devices))
                self.process_to_celery(task, queue, computer)
            else:
                # cpu-only task
                self.process_to_celery(task, queue, computer)
                break
        return to_send

    def process_task(self, task: Task):
        """Assign a single task: pick computers, send it to celery, and for
        distributed runs create rank-stamped service sub-tasks."""
        auxiliary = self.auxiliary['process_tasks'][-1]
        auxiliary['computers'] = []

        config = yaml_load(task.dag_rel.config)
        executor = config['executors'][task.executor]

        computers = self._process_task_get_computers(executor, task,
                                                     auxiliary)
        if len(computers) == 0:
            return

        to_send = self._process_task_to_send(executor, task, computers)
        auxiliary['to_send'] = to_send[:5]

        additional_info = yaml_load(task.additional_info)
        rank = 0
        master_port = None
        if len(to_send) > 0:
            master_port = self.find_port(
                to_send[0][0], to_send[0][1].split('_')[1]
            )
            computer_names = {c['name'] for c, _, __ in to_send}
            if len(computer_names) == 1:
                task.computer_assigned = list(computer_names)[0]

        for computer, queue, gpu_assigned in to_send:
            main_cmp = to_send[0][0]
            # noinspection PyTypeChecker
            ip = 'localhost' if computer['name'] == main_cmp['name'] \
                else main_cmp['ip']
            distr_info = {
                'master_addr': ip,
                'rank': rank,
                'local_rank': gpu_assigned,
                'master_port': master_port,
                'world_size': len(to_send),
                'master_computer': main_cmp['name']
            }
            service_task = self.create_service_task(
                task,
                distr_info=distr_info,
                gpu_assigned=gpu_assigned,
                resume=additional_info.get('resume')
            )
            self.process_to_celery(service_task, queue, computer)
            rank += 1
            main_cmp['ports'].add(master_port)

        if len(to_send) > 0:
            task.status = TaskStatus.Queued.value
            self.sent_tasks += len(to_send)

    def process_tasks(self):
        """Try to schedule every pending task whose dependencies allow it."""
        self.auxiliary['process_tasks'] = []

        for task in self.not_ran_tasks:
            auxiliary = {'id': task.id, 'name': task.name}
            self.auxiliary['process_tasks'].append(auxiliary)

            if task.dag_rel is None:
                task.dag_rel = self.dag_provider.by_id(task.dag)

            if TaskStatus.Stopped.value in self.dep_status[task.id] \
                    or TaskStatus.Failed.value in self.dep_status[task.id] \
                    or TaskStatus.Skipped.value in self.dep_status[task.id]:
                auxiliary['not_valid'] = 'stopped or failed in dep_status'
                self.provider.change_status(task, TaskStatus.Skipped)
                continue

            if len(self.dep_status[task.id]) != 0 \
                    and self.dep_status[task.id] != \
                    {TaskStatus.Success.value}:
                auxiliary['not_valid'] = 'not all dep tasks are finished'
                continue

            self.process_task(task)

        self.auxiliary['process_tasks'] = \
            self.auxiliary['process_tasks'][:5]

    def _stop_child_tasks(self, task: Task):
        """Stop all children of the given task via celery."""
        self.provider.commit()

        children = self.provider.children(task.id, [Task.dag_rel])
        dags = [c.dag_rel for c in children]
        for c, d in zip(children, dags):
            celery_tasks.stop(self.logger, self.session, c, d)

    def process_parent_tasks(self):
        """Derive each parent task's status from its children's statuses."""
        tasks = self.provider.parent_tasks_stats()

        was_change = False
        for task, started, finished, statuses in tasks:
            # priority order: Failed > Skipped > Queued > InProgress > Success
            status = task.status
            if statuses[TaskStatus.Failed] > 0:
                status = TaskStatus.Failed.value
            elif statuses[TaskStatus.Skipped] > 0:
                status = TaskStatus.Skipped.value
            elif statuses[TaskStatus.Queued] > 0:
                status = TaskStatus.Queued.value
            elif statuses[TaskStatus.InProgress] > 0:
                status = TaskStatus.InProgress.value
            elif statuses[TaskStatus.Success] > 0:
                status = TaskStatus.Success.value

            if status != task.status:
                if status == TaskStatus.InProgress.value:
                    task.started = started
                elif status >= TaskStatus.Failed.value:
                    # terminal state: record times and stop children
                    task.started = started
                    task.finished = finished
                    self._stop_child_tasks(task)
                was_change = True
                task.status = status

        if was_change:
            self.provider.commit()

        self.auxiliary['parent_tasks_stats'] = [
            {
                'name': task.name,
                'id': task.id,
                'started': task.started,
                'finished': finished,
                'statuses': [
                    {
                        'name': k.name,
                        'count': v
                    } for k, v in statuses.items()
                ],
            } for task, started, finished, statuses in tasks[:5]
        ]

    def write_auxiliary(self):
        """Persist the auxiliary diagnostics, unless oversized."""
        self.auxiliary['duration'] = \
            (now() - self.auxiliary['time']).total_seconds()
        auxiliary = Auxiliary(name='supervisor',
                              data=yaml_dump(self.auxiliary))
        if len(auxiliary.data) > 16000:
            return

        self.auxiliary_provider.create_or_update(auxiliary, 'name')

    def stop_tasks(self, tasks: List[Task]):
        """Queue tasks to be stopped on the next iteration."""
        self.tasks_stop.extend([t.id for t in tasks])

    def process_stop_tasks(self):
        """Skip not-yet-running tasks and kill processes of running ones."""
        # Stop not running tasks
        if len(self.tasks_stop) == 0:
            return

        tasks = self.provider.by_ids(self.tasks_stop)
        tasks_not_ran = [t.id for t in tasks
                         if t.status in [TaskStatus.NotRan.value,
                                         TaskStatus.Queued.value]]
        tasks_started = [t for t in tasks
                         if t.status in [TaskStatus.InProgress.value]]
        tasks_started_ids = [t.id for t in tasks_started]

        self.provider.change_status_all(tasks=tasks_not_ran,
                                        status=TaskStatus.Skipped)

        # collect (computer, pid) pairs of everything that must be killed
        pids = []
        for task in tasks_started:
            if task.pid:
                pids.append((task.computer_assigned, task.pid))
            additional_info = yaml_load(task.additional_info)
            for p in additional_info.get('child_processes', []):
                pids.append((task.computer_assigned, p))

        for computer, queue in self.docker_provider.queues_online():
            pids_computer = [p for c, p in pids if c == computer]
            if len(pids_computer) > 0:
                celery_tasks.kill_all.apply_async((pids_computer,),
                                                  queue=queue, retry=False)

        self.provider.change_status_all(tasks=tasks_started_ids,
                                        status=TaskStatus.Stopped)
        self.tasks_stop = []

    def fast_check(self):
        """Return True if nothing relevant changed since last iteration,
        so the full build can be skipped."""
        if self.provider is None or self.computer_provider is None:
            return False
        if self.not_ran_tasks is None or self.queues is None:
            return False
        if len(self.tasks_stop) > 0:
            return False
        if len(self.dags_start) > 0:
            return False
        if len(self.auxiliary.get('to_send', [])) > 0:
            return False

        queues = set([
            f'{d.computer}_{d.name}' for d in self.docker_provider.all()
            if d.last_activity >= now() - datetime.timedelta(seconds=15)
        ])
        queues_set = set(queues)
        queues_set2 = set(self.queues)
        if queues_set != queues_set2:
            return False

        tasks = self.provider.by_status(TaskStatus.NotRan,
                                        TaskStatus.Queued,
                                        TaskStatus.InProgress)
        tasks_set = {t.id for t in tasks
                     if t.status == TaskStatus.NotRan.value
                     and not t.debug}
        tasks_set2 = {t.id for t in self.tasks
                      if t.status == TaskStatus.NotRan.value}
        if tasks_set != tasks_set2:
            return False

        tasks_set = {t.id for t in tasks
                     if t.status == TaskStatus.InProgress.value}
        tasks_set2 = {t.id for t in self.tasks
                      if t.status == TaskStatus.InProgress.value}
        if tasks_set != tasks_set2:
            return False

        tasks_set = {t.id for t in tasks
                     if t.status == TaskStatus.Queued.value}
        tasks_set2 = {t.id for t in self.tasks
                      if t.status == TaskStatus.Queued.value}
        if tasks_set != tasks_set2:
            return False
        return True

    def start_dag(self, id: int):
        """Queue a dag to be restarted on the next iteration."""
        self.dags_start.append(id)

    def process_start_dags(self):
        """Restart failed/skipped/stopped root tasks of the queued dags,
        attaching resume info to trainable tasks."""
        if len(self.dags_start) == 0:
            return

        for id in self.dags_start:
            can_start_statuses = [
                TaskStatus.Failed.value, TaskStatus.Skipped.value,
                TaskStatus.Stopped.value
            ]
            tasks = self.provider.by_dag(id)
            children_all = self.provider.children([t.id for t in tasks])

            def find_resume(task):
                # find the rank-0 child (the master) to resume from;
                # fall back to the task itself when it has no children
                children = [c for c in children_all
                            if c.parent == task.id]
                children = sorted(children, key=lambda x: x.id,
                                  reverse=True)
                if len(children) > 0:
                    for c in children:
                        if c.parent != task.id:
                            continue
                        info = yaml_load(c.additional_info)
                        if 'distr_info' not in info:
                            continue
                        if info['distr_info']['rank'] == 0:
                            return {
                                'master_computer': c.computer_assigned,
                                'master_task_id': c.id,
                                'load_last': True
                            }
                    raise Exception('Master task not found')
                else:
                    return {
                        'master_computer': task.computer_assigned,
                        'master_task_id': task.id,
                        'load_last': True
                    }

            for t in tasks:
                if t.status not in can_start_statuses:
                    continue
                if t.parent:
                    continue

                if t.type == TaskType.Train.value:
                    info = yaml_load(t.additional_info)
                    info['resume'] = find_resume(t)
                    t.additional_info = yaml_dump(info)

                # reset all per-run fields back to a clean NotRan state
                t.status = TaskStatus.NotRan.value
                t.pid = None
                t.started = None
                t.finished = None
                t.computer_assigned = None
                t.celery_id = None
                t.worker_index = None
                t.docker_assigned = None

        self.provider.commit()
        self.dags_start = []

    def build(self):
        """Run one full supervisor iteration, recovering from db errors."""
        try:
            # if self.fast_check():
            #     return

            self.auxiliary = {'time': now()}

            self.create_base()
            self.process_stop_tasks()
            self.process_start_dags()
            self.process_parent_tasks()
            self.load_tasks()
            self.load_computers()
            self.process_tasks()
            self.write_auxiliary()
        except ObjectDeletedError:
            # a task vanished mid-iteration; retry on the next tick
            pass
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(key='SupervisorBuilder')
                self.session = Session.create_session(
                    key='SupervisorBuilder')
                self.logger = create_logger(self.session,
                                            'SupervisorBuilder')
            self.logger.error(traceback.format_exc(),
                              ComponentType.Supervisor)
def sync(self):
    """One file-sync iteration: serve manual sync requests, then pull
    task files from other online computers that allow syncing."""
    hostname = socket.gethostname()
    try:
        provider = ComputerProvider(self.session)
        task_synced_provider = TaskSyncedProvider(self.session)

        computer = provider.by_name(hostname)
        sync_start = now()

        if FILE_SYNC_INTERVAL == 0:
            # syncing disabled via config; just idle briefly
            time.sleep(1)
        else:
            # handle a button-triggered sync request first
            self.sync_manual(computer, provider)

            computers = provider.all_with_last_activtiy()
            # keep only computers that reported activity in the last 10 s
            computers = [
                c for c in computers
                if (now() - c.last_activity).total_seconds() < 10
            ]
            computers_names = {c.name for c in computers}

            for c, project, tasks in task_synced_provider.for_computer(
                    computer.name):
                if c.sync_with_this_computer:
                    if c.name not in computers_names:
                        self.logger.info(f'Computer = {c.name} '
                                         f'is offline. Can not sync',
                                         ComponentType.WorkerSupervisor,
                                         hostname)
                        continue

                    # skip sources currently busy syncing elsewhere
                    if c.syncing_computer:
                        continue

                    sync_folders = yaml_load(project.sync_folders)
                    ignore_folders = yaml_load(project.ignore_folders)

                    sync_folders = correct_folders(sync_folders,
                                                   project.name)
                    ignore_folders = correct_folders(ignore_folders,
                                                     project.name)

                    # tolerate malformed folder settings
                    if not isinstance(sync_folders, list):
                        sync_folders = []
                    if not isinstance(ignore_folders, list):
                        ignore_folders = []

                    folders = [[s, ignore_folders] for s in sync_folders]

                    # publish who we are syncing from before the slow copy
                    computer.syncing_computer = c.name
                    provider.update()

                    sync_directed(self.session, c, computer, folders)

                    # remember these tasks as synced for this computer
                    for t in tasks:
                        task_synced_provider.add(
                            TaskSynced(computer=computer.name, task=t.id)
                        )

            time.sleep(FILE_SYNC_INTERVAL)

        computer.last_synced = sync_start
        computer.syncing_computer = None
        provider.update()
    except Exception as e:
        self.process_error(e)
class Catalyst(Executor, Callback):
    """Executor that runs a catalyst experiment, reports per-epoch series,
    and supports distributed runs and checkpoint resume."""

    def __init__(self, args: Args, report: ReportLayoutInfo,
                 distr_info: dict, resume: dict, grid_config: dict,
                 trace: str, params: dict):
        super().__init__(order=0)

        self.resume = resume
        self.distr_info = distr_info
        self.args = args
        self.report = report
        self.experiment = None
        self.runner = None
        self.series_provider = ReportSeriesProvider(self.session)
        self.computer_provider = ComputerProvider(self.session)
        self.grid_config = grid_config
        self.master = True
        self.checkpoint_resume = False
        self.checkpoint_stage_epoch = 0
        self.trace = trace
        self.params = params

    def callbacks(self):
        """Register this object as a catalyst callback on the master only."""
        result = OrderedDict()
        if self.master:
            result['catalyst'] = self
        return result

    def on_epoch_start(self, state: RunnerState):
        """Advance epoch counters (shifted when resuming) and start steps."""
        if self.checkpoint_resume and state.stage_epoch == 0:
            state.epoch += 1

        # offset epoch numbering by the checkpointed stage epoch
        state.stage_epoch = state.stage_epoch + self.checkpoint_stage_epoch
        state.checkpoint_data = {'stage_epoch': state.stage_epoch}

        if self.master:
            if state.stage_epoch == 0:
                self.step.start(1, name=state.stage)

            self.step.start(2, name=f'epoch {state.stage_epoch}',
                            index=state.stage_epoch)

    def on_epoch_end(self, state: RunnerState):
        """Report train/valid series and update the task's best score."""
        self.step.end(2)

        for s in self.report.series:
            train = state.metrics.epoch_values['train'][s.key]
            val = state.metrics.epoch_values['valid'][s.key]

            # series are attached to the parent task when one exists
            task_id = self.task.parent or self.task.id
            train = ReportSeries(part='train', name=s.key,
                                 epoch=state.epoch, task=task_id,
                                 value=train, time=now(),
                                 stage=state.stage)
            val = ReportSeries(part='valid', name=s.key,
                               epoch=state.epoch, task=task_id,
                               value=val, time=now(),
                               stage=state.stage)

            self.series_provider.add(train)
            self.series_provider.add(val)

            if s.key == self.report.metric.name:
                best = False
                task = self.task
                if task.parent:
                    task = self.task_provider.by_id(task.parent)
                # minimize/maximize depending on the report metric
                if self.report.metric.minimize:
                    if task.score is None or val.value < task.score:
                        best = True
                else:
                    if task.score is None or val.value > task.score:
                        best = True
                if best:
                    task.score = val.value
                    self.task_provider.update()

    def on_stage_start(self, state: RunnerState):
        """Install console/raise loggers for the stage."""
        state.loggers = {
            'console': VerboseLogger(),
            'raise': RaiseExceptionLogger()
        }

    def on_stage_end(self, state: RunnerState):
        """Reset resume bookkeeping and close the stage step."""
        self.checkpoint_resume = False
        self.checkpoint_stage_epoch = 0
        self.step.end(1)

    @classmethod
    def _from_config(cls, executor: dict, config: Config,
                     additional_info: dict):
        """Build a Catalyst executor from the dag config entry."""
        args = Args()
        for k, v in executor['args'].items():
            # executor args arrive stringly-typed; coerce bools and ints
            v = str(v)
            if v in ['False', 'True']:
                v = v == 'True'
            elif v.isnumeric():
                v = int(v)
            setattr(args, k, v)

        assert 'report_config' in additional_info, 'layout was not filled'
        report_config = additional_info['report_config']
        grid_cell = additional_info.get('grid_cell')
        report = ReportLayoutInfo(report_config)
        if len(args.configs) == 0:
            args.configs = [args.config]

        grid_config = {}
        if grid_cell is not None:
            grid_config = grid_cells(executor['grid'])[grid_cell][0]

        distr_info = additional_info.get('distr_info', {})
        resume = additional_info.get('resume')
        params = executor.get('params', {})

        return cls(args=args, report=report, grid_config=grid_config,
                   distr_info=distr_info, resume=resume,
                   trace=executor.get('trace'), params=params)

    def set_dist_env(self, config):
        """Export torch.distributed env variables for this worker."""
        info = self.distr_info
        os.environ['MASTER_ADDR'] = info['master_addr']
        os.environ['MASTER_PORT'] = str(info['master_port'])
        os.environ['WORLD_SIZE'] = str(info['world_size'])
        os.environ['RANK'] = str(info['rank'])

        distributed_params = config.get('distributed_params', {})
        distributed_params['rank'] = 0
        config['distributed_params'] = distributed_params

        # only rank 0 acts as the reporting master
        if info['rank'] > 0:
            self.master = False

    def parse_args_uargs(self):
        """Parse catalyst args/config and merge grid/param overrides."""
        args, config = parse_args_uargs(self.args, [])
        config = merge_dicts_smart(config, self.grid_config)
        config = merge_dicts_smart(config, self.params)

        if self.distr_info:
            self.set_dist_env(config)
        return args, config

    def _checkpoint_fix_config(self, experiment):
        """Fetch the resume checkpoint (copying across computers/tasks if
        needed) and rewrite the stages config to continue from it."""
        resume = self.resume
        if not resume:
            return

        checkpoint_dir = join(experiment.logdir, 'checkpoints')
        os.makedirs(checkpoint_dir, exist_ok=True)

        file = 'last_full.pth' if resume.get('load_last') \
            else 'best_full.pth'
        path = join(checkpoint_dir, file)
        computer = socket.gethostname()
        if computer != resume['master_computer']:
            # checkpoint lives on another computer; copy it here
            master_computer = self.computer_provider.by_name(
                resume['master_computer'])
            path_from = join(master_computer.root_folder,
                             str(resume['master_task_id']), 'log',
                             'checkpoints', file)
            self.info(f'copying checkpoint from: computer = '
                      f'{resume["master_computer"]} path_from={path_from} '
                      f'path_to={path}')

            success = copy_remote(session=self.session,
                                  computer_from=resume['master_computer'],
                                  path_from=path_from, path_to=path)
            if not success:
                self.error(f'copying from '
                           f'{resume["master_computer"]}/'
                           f'{path_from} failed')
            else:
                self.info('checkpoint copied successfully')

        elif self.task.id != resume['master_task_id']:
            # checkpoint belongs to another task on this computer
            path = join(TASK_FOLDER, str(resume['master_task_id']), 'log',
                        'checkpoints', file)
            self.info(f'master_task_id!=task.id, using checkpoint'
                      f' from task_id = {resume["master_task_id"]}')

        if not os.path.exists(path):
            self.info(f'no checkpoint at {path}')
            return

        ckpt = load_checkpoint(path)
        stages_config = experiment.stages_config
        # drop stages finished before the checkpointed one; shrink the
        # checkpointed stage by the epochs already completed
        for k, v in list(stages_config.items()):
            if k == ckpt['stage']:
                stage_epoch = ckpt['checkpoint_data']['stage_epoch'] + 1

                # if it is the last epoch in the stage
                if stage_epoch == v['state_params']['num_epochs'] \
                        or resume.get('load_best'):
                    del stages_config[k]
                    break

                self.checkpoint_stage_epoch = stage_epoch
                v['state_params']['num_epochs'] -= stage_epoch
                break
            del stages_config[k]

        stage = experiment.stages_config[experiment.stages[0]]
        for k, v in stage['callbacks_params'].items():
            if v.get('callback') == 'CheckpointCallback':
                v['resume'] = path
                self.info(f'found checkpoint at {path}')

    def _checkpoint_fix_callback(self, callbacks: dict):
        """Record resume mode and silence checkpoint saving on
        non-master workers."""
        def mock(state):
            pass

        for k, c in callbacks.items():
            if not isinstance(c, CheckpointCallback):
                continue

            if c.resume:
                self.checkpoint_resume = True
            if not self.master:
                c.on_epoch_end = mock
                c.on_stage_end = mock

    def work(self):
        """Run the experiment and optionally trace the model on the
        master; returns the last stage name and the full stage list."""
        args, config = self.parse_args_uargs()
        set_global_seed(args.seed)

        Experiment, R = import_experiment_and_runner(Path(args.expdir))

        runner_params = config.pop('runner_params', {})

        experiment = Experiment(config)
        runner: Runner = R(**runner_params)

        register()

        self.experiment = experiment
        self.runner = runner

        stages = experiment.stages[:]

        if self.master:
            task = self.task if not self.task.parent \
                else self.task_provider.by_id(self.task.parent)
            task.steps = len(stages)
            self.task_provider.commit()

        self._checkpoint_fix_config(experiment)

        # wrap catalyst's callback factory to inject ourselves and to
        # patch checkpoint callbacks per worker role
        _get_callbacks = experiment.get_callbacks

        def get_callbacks(stage):
            res = self.callbacks()
            for k, v in _get_callbacks(stage).items():
                res[k] = v

            self._checkpoint_fix_callback(res)
            return res

        experiment.get_callbacks = get_callbacks

        if experiment.logdir is not None:
            dump_environment(config, experiment.logdir, args.configs)

        if self.distr_info:
            # record resume info pointing at the rank-0 master task
            info = yaml_load(self.task.additional_info)
            info['resume'] = {
                'master_computer': self.distr_info['master_computer'],
                'master_task_id': self.task.id - self.distr_info['rank'],
                'load_best': True
            }
            self.task.additional_info = yaml_dump(info)
            self.task_provider.commit()

            # distributed workers run only the first remaining stage
            experiment.stages_config = {
                k: v for k, v in experiment.stages_config.items()
                if k == experiment.stages[0]
            }

        runner.run_experiment(experiment, check=args.check)

        if self.master and self.trace:
            traced = trace_model_from_checkpoint(self.experiment.logdir,
                                                 self)
            torch.jit.save(traced, self.trace)
        return {'stage': experiment.stages[-1], 'stages': stages}
class Catalyst(Executor, Callback):
    """Executor that runs a Catalyst experiment and reports progress back to
    the task database.

    The object plays two roles: it is an ``Executor`` (entry point ``work``)
    and simultaneously a Catalyst ``Callback`` (the ``on_*`` hooks), inserting
    itself into every stage's callback dict via ``callbacks``.
    """

    def __init__(self,
                 args: Args,
                 report: ReportLayoutInfo,
                 distr_info: dict,
                 resume: dict,
                 grid_config: dict,
                 trace: str,
                 params: dict,
                 **kwargs):
        super().__init__(**kwargs)

        # All providers share the executor's DB session.
        self.series_provider = ReportSeriesProvider(self.session)
        self.computer_provider = ComputerProvider(self.session)
        self.memory_provider = MemoryProvider(self.session)

        self.order = 0
        self.resume = resume
        self.distr_info = distr_info
        self.args = args
        self.report = report
        self.experiment = None  # assigned in work()
        self.runner = None  # assigned in work()
        self.grid_config = grid_config
        # Flipped to False for non-zero ranks in set_dist_env().
        self.master = True
        self.trace = trace
        self.params = params
        # Throttle state for on_batch_start: at most one DB update per ~10 s.
        self.last_batch_logged = None
        self.loader_started_time = None
        # Parent task (for distributed children); resolved in work().
        self.parent = None
        self.node = CallbackNode.All
        # NOTE(review): unlike the other Catalyst.__init__ earlier in this
        # file, this one does not initialize checkpoint_resume,
        # checkpoint_stage_epoch or loader_step_start; they are only assigned
        # later by _checkpoint_fix_callback / _checkpoint_fix_config — confirm
        # nothing reads them before that.

    def get_parent_task(self):
        """Return the parent task when this is a distributed child, else the
        task itself."""
        if self.parent:
            return self.parent
        return self.task

    def callbacks(self):
        """Base callback dict for a stage; only the master process registers
        this object as a callback."""
        result = OrderedDict()
        if self.master:
            result['catalyst'] = self
        return result

    def on_loader_start(self, state: State):
        # Remember when the loader started so batch hooks can compute
        # epoch duration / time remaining.
        self.loader_started_time = now()

    def on_epoch_start(self, state: State):
        # Report progress on two step levels: 1 = stage, 2 = epoch.
        stage_index = self.experiment.stages.index(state.stage_name)
        self.step.start(1, name=state.stage_name, index=stage_index)
        self.step.start(2, name=f'epoch {state.epoch}', index=state.epoch - 1)

    def on_batch_start(self, state: State):
        """Push batch-level progress (index, duration, ETA, current loss) to
        the task row, throttled to one update per ~10 seconds except for the
        final batch of a loader."""
        if self.last_batch_logged and state.loader_step != state.loader_len:
            if (now() - self.last_batch_logged).total_seconds() < 10:
                return

        task = self.get_parent_task()
        task.batch_index = state.loader_step
        task.batch_total = state.loader_len
        task.loader_name = state.loader_name

        duration = int((now() - self.loader_started_time).total_seconds())
        task.epoch_duration = duration
        # Linear extrapolation of remaining time from progress so far.
        # NOTE(review): divides by task.batch_index — assumes loader_step
        # starts at 1, not 0; confirm against the Catalyst version in use.
        task.epoch_time_remaining = int(
            duration *
            (task.batch_total / task.batch_index)) - task.epoch_duration

        # valid_loss (checked second) wins over train_loss when both exist.
        if state.epoch_metrics.get('train_loss') is not None:
            task.loss = float(state.epoch_metrics['train_loss'])
        if state.epoch_metrics.get('valid_loss') is not None:
            task.loss = float(state.epoch_metrics['valid_loss'])
        self.task_provider.update()
        self.last_batch_logged = now()

    def on_epoch_end(self, state: State):
        """Persist every epoch metric as a ReportSeries row and update the
        task's best score for the report's target metric."""
        self.step.end(2)
        values = state.epoch_metrics
        for k, v in values.items():
            # Split 'train_loss' -> part='train', name='loss' by matching
            # loader-name prefixes.
            part = ''
            name = k
            for loader in state.loaders:
                if k.startswith(loader):
                    part = loader
                    name = k.replace(loader, '')
            if name.startswith('_'):
                name = name[1:]

            # Series are attached to the parent task in distributed runs.
            task_id = self.task.parent or self.task.id
            series = ReportSeries(part=part,
                                  name=name,
                                  epoch=state.epoch - 1,
                                  task=task_id,
                                  value=v,
                                  time=now(),
                                  stage=state.stage_name)
            self.series_provider.add(series)

            if name == self.report.metric.name:
                best = False
                task = self.task
                if task.parent:
                    task = self.task_provider.by_id(task.parent)
                # Direction of "better" depends on the report's metric.
                if self.report.metric.minimize:
                    if task.score is None or v < task.score:
                        best = True
                else:
                    if task.score is None or v > task.score:
                        best = True
                if best:
                    task.score = v
                    self.task_provider.update()

    def on_stage_end(self, state: State):
        self.step.end(1)

    @classmethod
    def _from_config(cls, executor: dict, config: Config,
                     additional_info: dict):
        """Build a Catalyst executor from the serialized executor dict plus
        task additional_info (report layout, distributed info, resume)."""
        args = Args()
        # Executor args arrive stringly-typed; coerce bools and non-negative
        # integers, leave everything else as str.
        for k, v in executor['args'].items():
            v = str(v)
            if v in ['False', 'True']:
                v = v == 'True'
            elif v.isnumeric():
                v = int(v)
            setattr(args, k, v)

        assert 'report_config' in additional_info, 'layout was not filled'
        report_config = additional_info['report_config']
        report = ReportLayoutInfo(report_config)
        if len(args.configs) == 0:
            args.configs = [args.config]

        distr_info = additional_info.get('distr_info', {})
        resume = additional_info.get('resume')
        params = executor.get('params', {})
        params.update(additional_info.get('params', {}))

        # Everything except 'args' is treated as grid-search overrides.
        grid_config = executor.copy()
        grid_config.pop('args', '')

        return cls(args=args,
                   report=report,
                   grid_config=grid_config,
                   distr_info=distr_info,
                   resume=resume,
                   trace=executor.get('trace'),
                   params=params)

    def set_dist_env(self, config):
        """Export torch.distributed env vars from distr_info and join the
        NCCL process group; marks non-zero ranks as workers."""
        info = self.distr_info
        os.environ['MASTER_ADDR'] = info['master_addr']
        os.environ['MASTER_PORT'] = str(info['master_port'])
        os.environ['WORLD_SIZE'] = str(info['world_size'])
        os.environ['RANK'] = str(info['rank'])
        # One visible GPU per process: local rank is always 0.
        os.environ['LOCAL_RANK'] = "0"

        distributed_params = config.get('distributed_params', {})
        distributed_params['rank'] = info['rank']
        config['distributed_params'] = distributed_params

        torch.cuda.set_device(0)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        if info['rank'] > 0:
            self.master = False
            self.node = CallbackNode.Worker
        else:
            self.node = CallbackNode.Master

    def parse_args_uargs(self):
        """Parse CLI-style args, then layer grid-search and extra params onto
        the config; set up the distributed env when distr_info is present."""
        args, config = parse_args_uargs(self.args, [])
        config = merge_dicts_smart(config, self.grid_config)
        config = merge_dicts_smart(config, self.params)

        if self.distr_info:
            self.set_dist_env(config)
        return args, config

    def _fix_memory(self, experiment):
        """Per stage, pick the largest recorded batch_size whose measured
        memory fits on the current GPU (based on past Memory rows)."""
        if not torch.cuda.is_available():
            return
        # Total device memory in GiB.
        max_memory = torch.cuda.get_device_properties(0).total_memory / (2**30)
        stages_config = experiment.stages_config
        for k, v in list(stages_config.items()):
            # Query key = model params + this stage's data params.
            query = {}
            # noinspection PyProtectedMember
            for kk, vv in experiment._config['model_params'].items():
                query[kk] = vv
            for kk, vv in v['data_params'].items():
                query[kk] = vv

            variants = self.memory_provider.find(query)
            # NOTE: the comprehension's v shadows the stage config only
            # inside the comprehension (Py3 scoping); outer v is unchanged.
            variants = [v for v in variants if v.memory < max_memory]
            if len(variants) == 0:
                continue
            variant = max(variants, key=lambda x: x.memory)
            v['data_params']['batch_size'] = variant.batch_size

    def _checkpoint_fix_config(self, experiment):
        """Prepare the experiment to resume from a checkpoint.

        Copies the checkpoint from the master computer when needed, drops
        already-finished stages from stages_config, shortens the resumed
        stage by the epochs already run, and wires the checkpoint path into
        CheckpointCallback params.
        """
        resume = self.resume
        if not resume:
            return

        if experiment.logdir is None:
            return

        checkpoint_dir = join(experiment.logdir, 'checkpoints')
        os.makedirs(checkpoint_dir, exist_ok=True)

        file = 'last_full.pth' if resume.get('load_last') else 'best_full.pth'
        path = join(checkpoint_dir, file)
        computer = socket.gethostname()

        if computer != resume['master_computer']:
            # Checkpoint lives on another machine: pull it over.
            master_computer = self.computer_provider.by_name(
                resume['master_computer'])
            path_from = join(master_computer.root_folder,
                             str(resume['master_task_id']),
                             experiment.logdir, 'checkpoints', file)
            self.info(f'copying checkpoint from: computer = '
                      f'{resume["master_computer"]} path_from={path_from} '
                      f'path_to={path}')

            success = copy_remote(session=self.session,
                                  computer_from=resume['master_computer'],
                                  path_from=path_from,
                                  path_to=path)
            if not success:
                self.error(f'copying from '
                           f'{resume["master_computer"]}/'
                           f'{path_from} failed')
            else:
                self.info('checkpoint copied successfully')
        elif self.task.id != resume['master_task_id']:
            # Same machine, different task: read the master task's checkpoint
            # in place.
            path = join(TASK_FOLDER, str(resume['master_task_id']),
                        experiment.logdir, 'checkpoints', file)
            self.info(f'master_task_id!=task.id, using checkpoint'
                      f' from task_id = {resume["master_task_id"]}')

        if not os.path.exists(path):
            self.info(f'no checkpoint at {path}')
            return

        ckpt = load_checkpoint(path)
        stages_config = experiment.stages_config
        # Walk stages in order, removing those before the checkpointed stage.
        for k, v in list(stages_config.items()):
            if k == ckpt['stage']:
                stage_epoch = ckpt['checkpoint_data']['epoch'] + 1
                # if it is the last epoch in the stage
                if stage_epoch >= v['state_params']['num_epochs'] \
                        or resume.get('load_best'):
                    del stages_config[k]
                    break

                self.checkpoint_stage_epoch = stage_epoch
                v['state_params']['num_epochs'] -= stage_epoch
                break
            del stages_config[k]

        # Point every CheckpointCallback of the first remaining stage at the
        # resume file.
        stage = experiment.stages_config[experiment.stages[0]]
        for k, v in stage['callbacks_params'].items():
            if v.get('callback') == 'CheckpointCallback':
                v['resume'] = path

        self.info(f'found checkpoint at {path}')

    def _checkpoint_fix_callback(self, callbacks: dict):
        """Mark the run as resuming and neuter checkpoint writing on
        non-master processes by replacing the hooks with a no-op."""

        def mock(state):
            pass

        for k, c in callbacks.items():
            if not isinstance(c, CheckpointCallback):
                continue

            if c.resume:
                self.checkpoint_resume = True

            if not self.master:
                c.on_epoch_end = mock
                c.on_stage_end = mock
                c.on_batch_start = mock

    def work(self):
        """Run the experiment: build Experiment/Runner from config, apply
        checkpoint-resume and memory fixes, inject this object's callbacks,
        execute only the first stage, and return stage bookkeeping."""
        args, config = self.parse_args_uargs()
        set_global_seed(args.seed)

        Experiment, R = import_experiment_and_runner(Path(args.expdir))

        runner_params = config.pop('runner_params', {})
        experiment = Experiment(config)
        runner: Runner = R(**runner_params)

        self.experiment = experiment
        self.runner = runner

        stages = experiment.stages[:]

        if self.task.parent:
            self.parent = self.task_provider.by_id(self.task.parent)

        if self.master:
            task = self.get_parent_task()
            task.steps = len(stages)
            self.task_provider.commit()

        self._checkpoint_fix_config(experiment)
        self._fix_memory(experiment)

        # Wrap the experiment's callback factory so our callbacks come first
        # and checkpoint callbacks get patched per process role.
        _get_callbacks = experiment.get_callbacks

        def get_callbacks(stage):
            res = self.callbacks()
            for k, v in _get_callbacks(stage).items():
                res[k] = v

            self._checkpoint_fix_callback(res)
            return res

        experiment.get_callbacks = get_callbacks

        if experiment.logdir is not None:
            dump_environment(config, experiment.logdir, args.configs)

        if self.distr_info:
            # Record resume info so distributed children can locate the
            # master's checkpoint (master_task_id = own id minus rank).
            info = yaml_load(self.task.additional_info)
            info['resume'] = {
                'master_computer': self.distr_info['master_computer'],
                'master_task_id': self.task.id - self.distr_info['rank'],
                'load_best': True
            }
            self.task.additional_info = yaml_dump(info)
            self.task_provider.commit()

        # Only the first (possibly shortened) stage is executed per work()
        # call; remaining stages are returned to the caller.
        experiment.stages_config = {
            k: v
            for k, v in experiment.stages_config.items()
            if k == experiment.stages[0]
        }

        runner.run_experiment(experiment)

        if runner.state.exception:
            raise runner.state.exception

        if self.master and self.trace:
            traced = trace_model_from_checkpoint(self.experiment.logdir, self)
            torch.jit.save(traced, self.trace)

        return {'stage': experiment.stages[-1], 'stages': stages}
def computer_sync_start():
    """Kick off computer synchronization via the read-only session provider."""
    computer_provider = ComputerProvider(_read_session)
    return computer_provider.sync_start()