def __init__(self,
             session: Session,
             task: Task,
             layout: str,
             part: str = 'valid',
             name: str = 'img_classify',
             max_img_size: Tuple[int, int] = None,
             main_metric: str = 'accuracy',
             plot_count: int = 0):
    self.session = session
    self.task = task
    self.layout = layout
    self.part = part
    self.name = name or 'img_classify'
    self.max_img_size = max_img_size
    self.main_metric = main_metric
    self.plot_count = plot_count

    self.dag_provider = DagProvider(session)
    self.report_provider = ReportProvider(session)
    self.layout_provider = ReportLayoutProvider(session)
    self.task_provider = TaskProvider(session)
    self.report_img_provider = ReportImgProvider(session)
    self.report_task_provider = ReportTasksProvider(session)
    self.report_series_provider = ReportSeriesProvider(session)

    self.project = self.task_provider.project(task.id).id
    self.layout = self.layout_provider.by_name(layout)
    self.layout_dict = yaml_load(self.layout.content)
def dag_pipe(session: Session, config: dict, config_text: str = None):
    assert 'interfaces' in config, 'interfaces missing'
    assert 'pipes' in config, 'pipes missing'

    info = config['info']
    storage = Storage(session)
    dag_provider = DagProvider(session)

    folder = os.getcwd()
    project = ProjectProvider(session).by_name(info['project']).id
    dag = dag_provider.add(
        Dag(
            config=config_text,
            project=project,
            name=info['name'],
            docker_img=info.get('docker_img'),
            type=DagType.Pipe.value
        )
    )
    storage.upload(folder, dag)

    # Re-point model dags that have the same name to the new dag
    ModelProvider(session).change_dag(project=project,
                                      name=info['name'],
                                      to=dag.id)
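# Illustrative sketch of the config shape dag_pipe expects (the field values
# are assumptions, not from the source): 'interfaces' and 'pipes' are required
# by the asserts above, and 'info' must carry the project and dag name.
example_pipe_config = {
    'info': {
        'name': 'my_pipe',            # dag name (assumed value)
        'project': 'my_project',      # must be resolvable by ProjectProvider
        'docker_img': None            # optional docker image
    },
    'interfaces': {},                 # interface definitions go here
    'pipes': {}                       # pipe definitions go here
}
# dag_pipe(session, example_pipe_config,
#          config_text=yaml_dump(example_pipe_config))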
def dag_stop():
    data = request_data()
    provider = DagProvider(_write_session)
    id = int(data['id'])
    dag = provider.by_id(id, joined_load=['tasks'])
    for t in dag.tasks:
        celery_tasks.stop(logger, _write_session, t, dag)
    return {'dag': provider.get({'id': id})['data'][0]}
def dag_model_start(session: Session, data: dict):
    provider = ModelProvider(session)
    model = provider.by_id(data['model_id'])
    dag_provider = DagProvider(session)
    dag = dag_provider.by_id(data['dag'], joined_load=[Dag.project_rel])
    project = dag.project_rel
    src_config = Config.from_yaml(dag.config)
    pipe = src_config['pipes'][data['pipe']['name']]
    equations = yaml_load(model.equations)

    versions = data['pipe']['versions']
    if len(versions) > 0:
        version = data['pipe']['version']
        pipe_equations = yaml_load(version['equations'])
        found_version = versions[0]
        for v in versions:
            if v['name'] == version['name']:
                found_version = v
                break
        found_version['used'] = now()

        if len(pipe) == 1:
            pipe[list(pipe)[0]].update(pipe_equations)
        else:
            pipe.update(pipe_equations)

    equations[data['pipe']['name']] = versions
    model.equations = yaml_dump(equations)

    for v in pipe.values():
        v['model_id'] = model.id
        v['model_name'] = model.name

    config = {
        'info': {
            'name': data['pipe']['name'],
            'project': project.name
        },
        'executors': pipe
    }

    if model.dag:
        old_dag = dag_provider.by_id(model.dag)
        if old_dag.name != dag.name:
            model.dag = dag.id
    else:
        model.dag = dag.id

    provider.commit()

    dag_standard(
        session=session,
        config=config,
        debug=False,
        upload_files=False,
        copy_files_from=data['dag']
    )
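# Shape of the `data` payload consumed by dag_model_start above, inferred
# from the code; all ids and names are illustrative assumptions:
#
# data = {
#     'model_id': 1,
#     'dag': 2,
#     'pipe': {
#         'name': 'infer',
#         'versions': [{'name': 'v1', 'equations': '...'}],
#         'version': {'name': 'v1', 'equations': '...'}
#     }
# }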
def create_providers(self):
    self.provider = TaskProvider(self.session)
    self.report_provider = ReportProvider(self.session)
    self.report_tasks_provider = ReportTasksProvider(self.session)
    self.report_layout_provider = ReportLayoutProvider(self.session)
    self.project_provider = ProjectProvider(self.session)
    self.storage = Storage(self.session)
    self.dag_provider = DagProvider(self.session)
def dag_stop():
    data = request_data()
    provider = TaskProvider(_write_session)
    id = int(data['id'])
    tasks = provider.by_dag(id)
    supervisor.stop_tasks(tasks)
    dag_provider = DagProvider(_write_session)
    return {'dag': dag_provider.get({'id': id})['data'][0]}
def describe_dag(dag, axis):
    provider = DagProvider()
    graph = provider.graph(dag)
    status_colors = {
        'not_ran': '#808080',
        'queued': '#add8e6',
        'in_progress': '#bfff00',
        'failed': '#e83217',
        'stopped': '#cb88ea',
        'skipped': '#ffa500',
        'success': '#006400'
    }

    node_color = []
    edge_color = []

    G = nx.DiGraph()
    labels = dict()
    for n in graph['nodes']:
        G.add_node(n['id'])
        labels[n['id']] = n['id']
        node_color.append(status_colors[n['status']])

    edges = []
    for e in graph['edges']:
        G.add_edge(e['from'], e['to'])
        edges.append((e['from'], e['to']))
        edge_color.append(status_colors[e['status']])

    pos = nx.spring_layout(G, seed=0)
    nx.draw_networkx_nodes(G, pos, node_color=node_color, ax=axis,
                           node_size=2000)
    # draw_networkx_labels does not take with_labels; the labels dict is enough
    nx.draw_networkx_labels(G, pos, labels, ax=axis, font_color='orange',
                            font_weight='bold', font_size=18)
    nx.draw_networkx_edges(G, pos, edgelist=edges, edge_color=edge_color,
                           arrows=True, arrowsize=80, ax=axis)

    axis.set_xticks([])
    axis.axis('off')
    axis.set_title('Graph')
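# Usage sketch (assumed, not from the source): describe_dag draws onto any
# matplotlib axis, so a figure can be produced like this. The dag id 1 is an
# illustrative value.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 8))
describe_dag(1, ax)
plt.show()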
def work(self):
    project = ProjectProvider(self.session).by_id(self.project)
    self.info(f'Task = {self.train_task} child_task: {self.child_task}')

    model = Model(
        created=now(),
        name=self.name,
        project=self.project,
        equations='',
        fold=self.fold
    )

    provider = ModelProvider(self.session)
    if self.train_task:
        task_provider = TaskProvider(self.session)
        dag_provider = DagProvider(self.session)
        task = task_provider.by_id(self.train_task)
        dag = dag_provider.by_id(task.dag)

        task_dir = join(TASK_FOLDER, str(self.child_task or task.id))

        # recover the catalyst log directory from the dag config
        config = yaml_load(dag.config)
        executor_config = config['executors'][task.executor]
        catalyst_config_file = executor_config['args']['config']
        catalyst_config_file = join(task_dir, catalyst_config_file)
        catalyst_config = yaml_load(file=catalyst_config_file)
        catalyst_logdir = catalyst_config['args']['logdir']

        model.score_local = task.score

        src_log = f'{task_dir}/{catalyst_logdir}'
        models_dir = join(MODEL_FOLDER, project.name)
        os.makedirs(models_dir, exist_ok=True)

        model_path_tmp = f'{src_log}/traced.pth'
        traced = trace_model_from_checkpoint(src_log, self, file=self.file)

        model_path = f'{models_dir}/{model.name}.pth'
        model_weight_path = f'{models_dir}/{model.name}_weight.pth'
        torch.jit.save(traced, model_path_tmp)
        shutil.copy(model_path_tmp, model_path)

        file = self.file = 'best_full'
        shutil.copy(f'{src_log}/checkpoints/{file}.pth', model_weight_path)

    provider.add(model)
def code_download():
    id = int(request.args['id'])
    storage = Storage(_read_session)
    dag = DagProvider(_read_session).by_id(id)
    folder = os.path.join(TMP_FOLDER, f'{dag.id}({dag.name})')
    try:
        storage.download_dag(id, folder)
        file_name = f'{dag.id}({dag.name}).zip'
        dst = os.path.join(TMP_FOLDER, file_name)
        zip_folder(folder, dst)
        res = send_from_directory(TMP_FOLDER, file_name)
        os.remove(dst)
        return res
    finally:
        shutil.rmtree(folder, ignore_errors=True)
def dag_model_start(session: Session, data: dict):
    provider = ModelProvider(session)
    model = provider.by_id(data['model_id'])
    dag = DagProvider(session).by_id(data['dag'],
                                     joined_load=[Dag.project_rel])

    project = dag.project_rel
    src_config = Config.from_yaml(dag.config)
    pipe = src_config['pipes'][data['pipe']]
    for k, v in pipe.items():
        if v.get('slot') != data['slot']:
            continue
        params = yaml_load(data['interface_params'])
        slot = {
            'interface': data['interface'],
            'interface_params': params,
            'slot': k,
            'name': model.name,
            'id': data['model_id']
        }
        v['slot'] = slot

    config = {
        'info': {
            'name': data['pipe'],
            'project': project.name
        },
        'executors': pipe
    }

    dag_standard(
        session=session,
        config=config,
        debug=False,
        upload_files=False,
        copy_files_from=data['dag']
    )

    model.dag = data['dag']
    model.interface = data['interface']
    model.interface_params = data['interface_params']
    model.slot = data['slot']
    provider.commit()
def work(self):
    task_provider = TaskProvider(self.session)
    task = task_provider.by_id(self.train_task)
    dag = DagProvider(self.session).by_id(self.dag_pipe,
                                          joined_load=[Dag.project_rel])
    task_dir = join(TASK_FOLDER, str(self.child_task or task.id))
    src_log = f'{task_dir}/log'
    models_dir = join(MODEL_FOLDER, dag.project_rel.name)
    os.makedirs(models_dir, exist_ok=True)

    self.info(f'Task = {self.task} child_task: {self.child_task}')

    model_path_tmp = f'{src_log}/traced.pth'
    traced = trace_model_from_checkpoint(src_log, self)

    model = Model(
        dag=self.dag_pipe,
        interface=self.interface,
        slot=self.slot,
        score_local=task.score,
        created=now(),
        name=self.name,
        project=dag.project,
        interface_params=yaml_dump(self.interface_params)
    )
    provider = ModelProvider(self.session)
    provider.add(model, commit=False)

    try:
        model_path = f'{models_dir}/{model.name}.pth'
        model_weight_path = f'{models_dir}/{model.name}_weight.pth'
        torch.jit.save(traced, model_path_tmp)
        shutil.copy(model_path_tmp, model_path)
        shutil.copy(f'{src_log}/checkpoints/best.pth', model_weight_path)

        interface_params = yaml_load(model.interface_params)
        interface_params['file'] = join('models', model.name + '.pth')
        model.interface_params = yaml_dump(interface_params)
        provider.update()
    except Exception as e:
        provider.rollback()
        raise e
class DagCopyBuilder:
    def __init__(self,
                 session: Session,
                 dag: int,
                 file_changes: str = '',
                 dag_suffix: str = '',
                 logger=None,
                 component: ComponentType = None):
        self.dag = dag
        self.file_changes = file_changes
        self.session = session
        self.logger = logger
        self.component = component
        self.dag_suffix = dag_suffix

        self.dag_db = None
        self.dag_provider = None
        self.task_provider = None
        self.file_provider = None
        self.dag_storage_provider = None

    def log_info(self, message: str):
        if self.logger:
            self.logger.info(message, self.component)

    def create_providers(self):
        self.log_info('create_providers')
        self.dag_provider = DagProvider(self.session)
        self.task_provider = TaskProvider(self.session)
        self.file_provider = FileProvider(self.session)
        self.dag_storage_provider = DagStorageProvider(self.session)

    def create_dag(self):
        dag = self.dag_provider.by_id(self.dag)
        name = dag.name
        if self.dag_suffix:
            name += ' ' + self.dag_suffix
        dag_new = Dag(name=name, created=now(), config=dag.config,
                      project=dag.project, docker_img=dag.docker_img,
                      img_size=0, file_size=0, type=dag.type)
        self.dag_provider.add(dag_new)
        self.dag_db = dag_new

    def find_replace(self, changes: dict, path: str):
        for k, v in changes.items():
            if not re.match(k, path):
                continue
            return v

    def create_tasks(self):
        tasks = self.task_provider.by_dag(self.dag)
        tasks_new = []
        tasks_old = []

        for t in tasks:
            if t.parent:
                continue

            task = Task(
                name=t.name,
                status=TaskStatus.NotRan.value,
                computer=t.computer,
                gpu=t.gpu,
                gpu_max=t.gpu_max,
                cpu=t.cpu,
                executor=t.executor,
                memory=t.memory,
                steps=t.steps,
                dag=self.dag_db.id,
                debug=t.debug,
                type=t.type,
            )
            task.additional_info = t.additional_info
            tasks_new.append(task)
            tasks_old.append(t)

        self.task_provider.bulk_save_objects(tasks_new, return_defaults=True)
        old2new = {
            t_old.id: t_new.id
            for t_new, t_old in zip(tasks_new, tasks_old)
        }
        dependencies = self.task_provider.get_dependencies(self.dag)
        dependencies_new = []
        for d in dependencies:
            d_new = TaskDependence(task_id=old2new[d.task_id],
                                   depend_id=old2new[d.depend_id])
            dependencies_new.append(d_new)
        self.task_provider.bulk_save_objects(dependencies_new,
                                             return_defaults=False)

        changes = yaml_load(self.file_changes)
        storages = self.dag_storage_provider.by_dag(self.dag)
        storages_new = []

        for s, f in storages:
            if not isinstance(changes, dict):
                continue

            replace = self.find_replace(changes, s.path)
            if replace is not None and f:
                content = f.content.decode('utf-8')
                if s.path.endswith('.yml'):
                    data = yaml_load(content)
                    data = merge_dicts_smart(data, replace)
                    content = yaml_dump(data)
                else:
                    # replace is a mapping of substring -> replacement
                    for k, v in replace.items():
                        if k not in content:
                            raise Exception(f'{k} is not in the content')
                        content = content.replace(k, v)
                content = content.encode('utf-8')

                md5 = hashlib.md5(content).hexdigest()
                f = self.file_provider.by_md5(md5)
                if not f:
                    f = File(content=content, created=now(),
                             project=self.dag_db.project, md5=md5,
                             dag=self.dag_db.id)
                    self.file_provider.add(f)

            s_new = DagStorage(dag=self.dag_db.id, file=f.id, path=s.path,
                               is_dir=s.is_dir)
            storages_new.append(s_new)

        self.dag_storage_provider.bulk_save_objects(storages_new,
                                                    return_defaults=False)

    def build(self):
        self.create_providers()
        self.create_dag()
        self.create_tasks()
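# Usage sketch (illustrative, with assumed values): copy dag 1 into a new dag
# while patching matched yml files via merge_dicts_smart. `session` is assumed
# to be an open Session.
builder = DagCopyBuilder(
    session=session,
    dag=1,
    file_changes=yaml_dump({'.*config\\.yml': {'args': {'lr': 0.001}}}),
    dag_suffix='copy'
)
builder.build()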
def dag_remove():
    id = request_data()['id']
    celery_tasks.remove_dag(_write_session, id)
    DagProvider(_write_session).remove(id)
def dag_start():
    data = request_data()
    provider = DagProvider(_write_session)
    task_provider = TaskProvider(_write_session)
    id = int(data['id'])
    dag = provider.by_id(id, joined_load=['tasks'])

    can_start_statuses = [
        TaskStatus.Failed.value,
        TaskStatus.Skipped.value,
        TaskStatus.Stopped.value
    ]

    tasks = list(dag.tasks)

    def find_resume(task):
        children = task_provider.children(task.id)
        children = sorted(children, key=lambda x: x.id, reverse=True)
        if len(children) > 0:
            for c in children:
                if c.parent != task.id:
                    continue
                info = yaml_load(c.additional_info)
                if 'distr_info' not in info:
                    continue
                if info['distr_info']['rank'] == 0:
                    return {
                        'master_computer': c.computer_assigned,
                        'master_task_id': c.id,
                        'load_last': True
                    }
            raise Exception('Master task not found')
        else:
            return {
                'master_computer': task.computer_assigned,
                'master_task_id': task.id,
                'load_last': True
            }

    for t in tasks:
        if t.status not in can_start_statuses:
            continue
        if t.parent:
            continue

        info = yaml_load(t.additional_info)
        info['resume'] = find_resume(t)
        t.additional_info = yaml_dump(info)

        t.status = TaskStatus.NotRan.value
        t.pid = None
        t.started = None
        t.finished = None
        t.computer_assigned = None
        t.celery_id = None
        t.worker_index = None
        t.docker_assigned = None

    provider.commit()
def dag_tag_add():
    data = request_data()
    provider = DagProvider(_write_session)
    tag = DagTag(dag=data['dag'], tag=data['tag'])
    provider.add(tag)
def graph():
    id = request_data()
    res = DagProvider(_read_session).graph(id)
    return res
def config():
    id = request_data()
    res = DagProvider(_read_session).config(id)
    return {'data': res}
class DagStandardBuilder:
    def __init__(self,
                 session: Session,
                 config: dict,
                 debug: bool,
                 config_text: str = None,
                 upload_files: bool = True,
                 copy_files_from: int = None,
                 config_path: str = None,
                 control_reqs: bool = True,
                 logger=None,
                 component: ComponentType = None):
        self.session = session
        self.config = config
        self.debug = debug
        self.config_text = config_text
        self.upload_files = upload_files
        self.copy_files_from = copy_files_from
        self.config_path = config_path
        self.control_reqs = control_reqs

        self.info = config['info']
        self.layout_name = self.info.get('layout')

        self.provider = None
        self.report_provider = None
        self.report_tasks_provider = None
        self.report_layout_provider = None
        self.storage = None
        self.dag_provider = None
        self.logger = logger
        self.component = component

        self.project = None
        self.layouts = None
        self.dag = None
        self.dag_report_id = None
        self.created = None
        self.project_provider = None

    def log_info(self, message: str):
        if self.logger:
            self.logger.info(message, self.component)

    def create_providers(self):
        self.log_info('create_providers')
        self.provider = TaskProvider(self.session)
        self.report_provider = ReportProvider(self.session)
        self.report_tasks_provider = ReportTasksProvider(self.session)
        self.report_layout_provider = ReportLayoutProvider(self.session)
        self.project_provider = ProjectProvider(self.session)
        self.storage = Storage(self.session, logger=self.logger,
                               component=self.component)
        self.dag_provider = DagProvider(self.session)

    def load_base(self):
        self.log_info('load_base')
        project = self.project_provider.by_name(self.info['project'])
        if project is None:
            project = self.project_provider.add_project(self.info['project'])
        self.project = project.id
        self.layouts = self.report_layout_provider.all()

    def create_report(self):
        self.log_info('create_report')
        self.dag_report_id = None
        layout_name = self.layout_name
        if layout_name:
            if layout_name not in self.layouts:
                raise Exception(f'Unknown layout = {layout_name}')
            report = Report(config=yaml_dump(self.layouts[layout_name]),
                            name=self.info['name'],
                            project=self.project,
                            layout=layout_name)
            self.report_provider.add(report)
            self.dag_report_id = report.id

    def create_dag(self):
        self.log_info('create_dag')
        dag = Dag(config=self.config_text or yaml_dump(self.config),
                  project=self.project,
                  name=self.info['name'],
                  docker_img=self.info.get('docker_img'),
                  type=DagType.Standard.value,
                  created=now(),
                  report=self.dag_report_id)
        self.dag = self.dag_provider.add(dag)

    def upload(self):
        self.log_info('upload')
        if self.upload_files:
            folder = os.path.dirname(os.path.abspath(self.config_path))
            if 'expdir' in self.config['info']:
                path = os.path.dirname(os.path.abspath(self.config_path))
                folder = os.path.abspath(
                    os.path.join(path, self.config['info']['expdir']))
            self.storage.upload(folder, self.dag,
                                control_reqs=self.control_reqs)
        elif self.copy_files_from:
            self.storage.copy_from(self.copy_files_from, self.dag)

    def create_task(self, k: str, v: dict, name: str, info: dict):
        task_type = TaskType.User.value
        if v.get('task_type') == 'train' or \
                Executor.is_trainable(v['type']):
            task_type = TaskType.Train.value

        gpu = str(v.get('gpu', '0'))
        if '-' not in gpu:
            gpu = int(gpu)
            gpu_max = gpu
        else:
            gpu, gpu_max = map(int, gpu.split('-'))

        if gpu == 0 and gpu_max > 0:
            raise Exception(
                f'Executor = {k}: gpu_max can\'t be > 0 when gpu = 0')

        task = Task(
            name=name,
            executor=k,
            computer=self.info.get('computer') or v.get('computer'),
            gpu=gpu,
            gpu_max=gpu_max,
            cpu=v.get('cpu', 1),
            memory=v.get('memory', 0.1),
            dag=self.dag.id,
            debug=self.debug,
            steps=int(v.get('steps', '1')),
            type=task_type
        )
        task.additional_info = yaml_dump(info)

        report = None
        if self.layout_name and task_type == TaskType.Train.value:
            if self.layout_name not in self.layouts:
                raise Exception(f'Unknown layout = {self.layout_name}')

            report_config = self.layouts[self.layout_name]
            info['report_config'] = report_config
            task.additional_info = yaml_dump(info)
            report = Report(config=yaml_dump(report_config),
                            name=task.name,
                            project=self.project,
                            layout=self.layout_name)
        return task, report

    def create_tasks(self):
        self.log_info('create_tasks')
        created = OrderedDict()
        executors = self.config['executors']

        tasks = []
        dependencies = []
        reports = []

        while len(created) < len(executors):
            for k, v in executors.items():
                valid = True
                if 'depends' in v:
                    depends = v['depends']
                    if not isinstance(depends, list):
                        depends = [depends]

                    for d in depends:
                        if d == k:
                            raise Exception(
                                f'Executor {k} depends on itself')
                        if d not in executors:
                            raise Exception(f'Executor {k} depends on {d} '
                                            f'which does not exist')
                        valid = valid and d in created

                if valid:
                    names = []
                    infos = []
                    if 'grid' in v:
                        grid = v['grid']
                        cells = grid_cells(grid)
                        for i, (cell, cell_name) in enumerate(cells):
                            names.append(cell_name)
                            infos.append({'grid_cell': i})
                    else:
                        names.append(v.get('name', k))
                        infos.append({})

                    k_tasks = []
                    for name, info in zip(names, infos):
                        task, report = self.create_task(k, v, name=name,
                                                        info=info)
                        tasks.append(task)
                        k_tasks.append(task)
                        reports.append(report)

                        if 'depends' in v:
                            depends = v['depends']
                            if not isinstance(depends, list):
                                depends = [depends]

                            for d in depends:
                                for dd in created[d]:
                                    dependencies.append((task, dd))
                    created[k] = k_tasks

        not_empty_reports = [r for r in reports if r is not None]
        if len(not_empty_reports) > 0:
            self.provider.bulk_save_objects(not_empty_reports,
                                            return_defaults=True)
            for report, task in zip(reports, tasks):
                if report is not None:
                    task.report = report.id

        self.provider.bulk_save_objects(tasks, return_defaults=True)

        if len(not_empty_reports) > 0:
            report_tasks = []
            for report, task in zip(reports, tasks):
                if report is not None:
                    report_tasks.append(
                        ReportTasks(report=report.id, task=task.id))
            self.report_tasks_provider.bulk_save_objects(report_tasks)

        dependencies = [
            TaskDependence(task_id=task.id, depend_id=dd.id)
            for task, dd in dependencies
        ]
        self.provider.bulk_save_objects(dependencies)

        for k, v in created.items():
            created[k] = [vv.id for vv in v]
        self.created = created

    def build(self):
        self.create_providers()
        self.load_base()
        self.create_report()
        self.create_dag()
        self.upload()
        self.create_tasks()

        self.log_info('Done')
        return self.created
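# Usage sketch (illustrative; the project name and executor entry are
# assumptions): build a standard dag from an in-memory config without
# uploading files.
config = {
    'info': {'name': 'train', 'project': 'my_project'},
    'executors': {
        'train': {'type': 'catalyst', 'gpu': '1', 'cpu': 4}
    }
}
builder = DagStandardBuilder(session=session, config=config, debug=False,
                             upload_files=False)
created = builder.build()  # OrderedDict: executor name -> list of task ids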
class SupervisorBuilder:
    def __init__(self):
        self.session = Session.create_session(key='SupervisorBuilder')
        self.logger = create_logger(self.session, 'SupervisorBuilder')
        self.provider = None
        self.computer_provider = None
        self.docker_provider = None
        self.auxiliary_provider = None
        self.dag_provider = None
        self.queues = None
        self.not_ran_tasks = None
        self.dep_status = None
        self.computers = None
        self.auxiliary = {}
        self.tasks = []
        self.tasks_stop = []
        self.dags_start = []
        self.sent_tasks = 0

    def create_base(self):
        self.session.commit()
        self.provider = TaskProvider(self.session)
        self.computer_provider = ComputerProvider(self.session)
        self.docker_provider = DockerProvider(self.session)
        self.auxiliary_provider = AuxiliaryProvider(self.session)
        self.dag_provider = DagProvider(self.session)

        self.queues = [
            f'{d.computer}_{d.name}' for d in self.docker_provider.all()
            if d.last_activity >= now() - datetime.timedelta(seconds=15)
        ]
        self.auxiliary['queues'] = self.queues

    def load_tasks(self):
        self.tasks = self.provider.by_status(TaskStatus.NotRan,
                                             TaskStatus.InProgress,
                                             TaskStatus.Queued)
        not_ran_tasks = [t for t in self.tasks
                         if t.status == TaskStatus.NotRan.value]
        self.not_ran_tasks = [task for task in not_ran_tasks
                              if not task.debug]
        self.not_ran_tasks = sorted(self.not_ran_tasks,
                                    key=lambda x: x.gpu or 0,
                                    reverse=True)
        self.logger.debug(f'Found {len(not_ran_tasks)} not ran tasks',
                          ComponentType.Supervisor)

        self.dep_status = self.provider.dependency_status(self.not_ran_tasks)
        self.auxiliary['not_ran_tasks'] = [
            {
                'id': t.id,
                'name': t.name,
                'dep_status': [
                    TaskStatus(s).name
                    for s in self.dep_status.get(t.id, set())
                ]
            } for t in not_ran_tasks[:5]
        ]

    def load_computers(self):
        computers = self.computer_provider.computers()
        for computer in computers.values():
            computer['gpu'] = [0] * computer['gpu']
            computer['ports'] = set()
            computer['cpu_total'] = computer['cpu']
            computer['memory_total'] = computer['memory']
            computer['gpu_total'] = len(computer['gpu'])
            computer['can_process_tasks'] = computer['can_process_tasks']

        tasks = [
            t for t in self.tasks
            if t.status in [TaskStatus.InProgress.value,
                            TaskStatus.Queued.value]
        ]

        for task in tasks:
            if task.computer_assigned is None:
                continue
            assigned = task.computer_assigned
            comp_assigned = computers[assigned]
            comp_assigned['cpu'] -= task.cpu
            if task.gpu_assigned is not None:
                for g in task.gpu_assigned.split(','):
                    comp_assigned['gpu'][int(g)] = task.id
            comp_assigned['memory'] -= task.memory * 1024

            info = yaml_load(task.additional_info)
            if 'distr_info' in info:
                dist_info = info['distr_info']
                if dist_info['rank'] == 0:
                    comp_assigned['ports'].add(dist_info['master_port'])

        self.computers = [
            {
                **value,
                'name': name
            } for name, value in computers.items()
        ]
        self.auxiliary['computers'] = self.computers

    def process_to_celery(self, task: Task, queue: str, computer: dict):
        r = execute.apply_async((task.id,), queue=queue, retry=False)
        task.status = TaskStatus.Queued.value
        task.computer_assigned = computer['name']
        task.celery_id = r.id

        if task.computer_assigned is not None:
            if task.gpu_assigned:
                for g in map(int, task.gpu_assigned.split(',')):
                    computer['gpu'][g] = task.id
            computer['cpu'] -= task.cpu
            computer['memory'] -= task.memory * 1024

        self.logger.info(
            f'Sent task={task.id} to celery. Queue = {queue} '
            f'Task status = {task.status} Celery_id = {r.id}',
            ComponentType.Supervisor)

        self.provider.update()

    def create_service_task(self, task: Task, gpu_assigned=None,
                            distr_info: dict = None, resume: dict = None):
        new_task = Task(
            name=task.name,
            computer=task.computer,
            executor=task.executor,
            status=TaskStatus.NotRan.value,
            type=TaskType.Service.value,
            gpu_assigned=gpu_assigned,
            parent=task.id,
            report=task.report,
            dag=task.dag
        )
        new_task.additional_info = task.additional_info

        if distr_info:
            additional_info = yaml_load(new_task.additional_info)
            additional_info['distr_info'] = distr_info
            new_task.additional_info = yaml_dump(additional_info)
        if resume:
            additional_info = yaml_load(new_task.additional_info)
            additional_info['resume'] = resume
            new_task.additional_info = yaml_dump(additional_info)

        return self.provider.add(new_task)

    def find_port(self, c: dict, docker_name: str):
        docker = self.docker_provider.get(c['name'], docker_name)
        ports = list(map(int, docker.ports.split('-')))
        for p in range(ports[0], ports[1] + 1):
            if p not in c['ports']:
                return p
        raise Exception(f'All ports in {c["name"]} are taken')

    def _process_task_valid_computer(self, task: Task, c: dict,
                                     single_node: bool):
        if not c['can_process_tasks']:
            return 'this computer can not process tasks'
        if task.computer is not None and task.computer != c['name']:
            return 'name set in the config != name of this computer'
        if task.cpu > c['cpu']:
            return f'task cpu = {task.cpu} > computer' \
                   f' free cpu = {c["cpu"]}'
        if task.memory > c['memory']:
            return f'task memory = {task.memory} > computer ' \
                   f'free memory = {c["memory"]}'

        queue = f'{c["name"]}_' \
                f'{task.dag_rel.docker_img or "default"}'
        if queue not in self.queues:
            return f'required queue = {queue} not in queues'

        if task.gpu > 0 and not any(g == 0 for g in c['gpu']):
            return 'task requires gpu, but no gpu is free'

        free_gpu = sum(g == 0 for g in c['gpu'])
        if single_node and task.gpu > free_gpu:
            return f'task requires {task.gpu} ' \
                   f'but there are only {free_gpu} free'

    def _process_task_get_computers(self, executor: dict, task: Task,
                                    auxiliary: dict):
        single_node = executor.get('single_node', True)

        computers = []
        for c in self.computers:
            error = self._process_task_valid_computer(task, c, single_node)
            auxiliary['computers'].append({'name': c['name'],
                                           'error': error})
            if not error:
                computers.append(c)

        if task.gpu > 0 and single_node and len(computers) > 0:
            computers = sorted(
                computers,
                key=lambda x: sum(g == 0 for g in x['gpu']),
                reverse=True
            )[:1]

        free_gpu = sum(sum(g == 0 for g in c['gpu']) for c in computers)
        if task.gpu > free_gpu:
            auxiliary['not_valid'] = f'gpu required by the ' \
                                     f'task = {task.gpu},' \
                                     f' but there are only {free_gpu} ' \
                                     f'free gpus'
            return []
        return computers

    def _process_task_to_send(self, executor: dict, task: Task,
                              computers: List[dict]):
        distr = executor.get('distr', True)
        to_send = []

        for computer in computers:
            queue = f'{computer["name"]}_' \
                    f'{task.dag_rel.docker_img or "default"}'

            if task.gpu_max > 1 and distr:
                for index, task_taken_gpu in enumerate(computer['gpu']):
                    if task_taken_gpu:
                        continue
                    to_send.append([computer, queue, index])

                    if len(to_send) >= task.gpu_max:
                        break

                if len(to_send) >= task.gpu_max:
                    break
            elif task.gpu_max > 0:
                cuda_devices = []
                for index, task_taken_gpu in enumerate(computer['gpu']):
                    if task_taken_gpu:
                        continue
                    cuda_devices.append(index)
                    if len(cuda_devices) >= task.gpu_max:
                        break
                task.gpu_assigned = ','.join(map(str, cuda_devices))
                self.process_to_celery(task, queue, computer)
            else:
                self.process_to_celery(task, queue, computer)
                break
        return to_send

    def process_task(self, task: Task):
        auxiliary = self.auxiliary['process_tasks'][-1]
        auxiliary['computers'] = []

        config = yaml_load(task.dag_rel.config)
        executor = config['executors'][task.executor]

        computers = self._process_task_get_computers(executor, task,
                                                     auxiliary)
        if len(computers) == 0:
            return

        to_send = self._process_task_to_send(executor, task, computers)
        auxiliary['to_send'] = to_send[:5]

        additional_info = yaml_load(task.additional_info)

        rank = 0
        master_port = None
        if len(to_send) > 0:
            master_port = self.find_port(to_send[0][0],
                                         to_send[0][1].split('_')[1])

        computer_names = {c['name'] for c, _, __ in to_send}
        if len(computer_names) == 1:
            task.computer_assigned = list(computer_names)[0]

        for computer, queue, gpu_assigned in to_send:
            main_cmp = to_send[0][0]
            # noinspection PyTypeChecker
            ip = 'localhost' if computer['name'] == main_cmp['name'] \
                else main_cmp['ip']
            distr_info = {
                'master_addr': ip,
                'rank': rank,
                'local_rank': gpu_assigned,
                'master_port': master_port,
                'world_size': len(to_send),
                'master_computer': main_cmp['name']
            }
            service_task = self.create_service_task(
                task,
                distr_info=distr_info,
                gpu_assigned=gpu_assigned,
                resume=additional_info.get('resume')
            )
            self.process_to_celery(service_task, queue, computer)
            rank += 1
            main_cmp['ports'].add(master_port)

        if len(to_send) > 0:
            task.status = TaskStatus.Queued.value

        self.sent_tasks += len(to_send)

    def process_tasks(self):
        self.auxiliary['process_tasks'] = []
        for task in self.not_ran_tasks:
            auxiliary = {'id': task.id, 'name': task.name}
            self.auxiliary['process_tasks'].append(auxiliary)

            if task.dag_rel is None:
                task.dag_rel = self.dag_provider.by_id(task.dag)

            if TaskStatus.Stopped.value in self.dep_status[task.id] \
                    or TaskStatus.Failed.value in self.dep_status[task.id] \
                    or TaskStatus.Skipped.value in self.dep_status[task.id]:
                auxiliary['not_valid'] = 'stopped or failed in dep_status'
                self.provider.change_status(task, TaskStatus.Skipped)
                continue

            if len(self.dep_status[task.id]) != 0 \
                    and self.dep_status[task.id] != \
                    {TaskStatus.Success.value}:
                auxiliary['not_valid'] = 'not all dep tasks are finished'
                continue

            self.process_task(task)

        self.auxiliary['process_tasks'] = \
            self.auxiliary['process_tasks'][:5]

    def _stop_child_tasks(self, task: Task):
        self.provider.commit()
        children = self.provider.children(task.id, [Task.dag_rel])
        dags = [c.dag_rel for c in children]
        for c, d in zip(children, dags):
            celery_tasks.stop(self.logger, self.session, c, d)

    def process_parent_tasks(self):
        tasks = self.provider.parent_tasks_stats()

        was_change = False
        for task, started, finished, statuses in tasks:
            status = task.status
            if statuses[TaskStatus.Failed] > 0:
                status = TaskStatus.Failed.value
            elif statuses[TaskStatus.Skipped] > 0:
                status = TaskStatus.Skipped.value
            elif statuses[TaskStatus.Queued] > 0:
                status = TaskStatus.Queued.value
            elif statuses[TaskStatus.InProgress] > 0:
                status = TaskStatus.InProgress.value
            elif statuses[TaskStatus.Success] > 0:
                status = TaskStatus.Success.value

            if status != task.status:
                if status == TaskStatus.InProgress.value:
                    task.started = started
                elif status >= TaskStatus.Failed.value:
                    task.started = started
                    task.finished = finished
                    self._stop_child_tasks(task)
                was_change = True
                task.status = status

        if was_change:
            self.provider.commit()

        self.auxiliary['parent_tasks_stats'] = [
            {
                'name': task.name,
                'id': task.id,
                'started': task.started,
                'finished': finished,
                'statuses': [
                    {
                        'name': k.name,
                        'count': v
                    } for k, v in statuses.items()
                ],
            } for task, started, finished, statuses in tasks[:5]
        ]

    def write_auxiliary(self):
        self.auxiliary['duration'] = \
            (now() - self.auxiliary['time']).total_seconds()
        auxiliary = Auxiliary(name='supervisor',
                              data=yaml_dump(self.auxiliary))
        if len(auxiliary.data) > 16000:
            return

        self.auxiliary_provider.create_or_update(auxiliary, 'name')

    def stop_tasks(self, tasks: List[Task]):
        self.tasks_stop.extend([t.id for t in tasks])

    def process_stop_tasks(self):
        # Skip tasks that have not started yet and kill the running ones
        if len(self.tasks_stop) == 0:
            return

        tasks = self.provider.by_ids(self.tasks_stop)
        tasks_not_ran = [t.id for t in tasks
                         if t.status in [TaskStatus.NotRan.value,
                                         TaskStatus.Queued.value]]
        tasks_started = [t for t in tasks
                         if t.status in [TaskStatus.InProgress.value]]
        tasks_started_ids = [t.id for t in tasks_started]

        self.provider.change_status_all(tasks=tasks_not_ran,
                                        status=TaskStatus.Skipped)

        pids = []
        for task in tasks_started:
            if task.pid:
                pids.append((task.computer_assigned, task.pid))
            additional_info = yaml_load(task.additional_info)
            for p in additional_info.get('child_processes', []):
                pids.append((task.computer_assigned, p))

        for computer, queue in self.docker_provider.queues_online():
            pids_computer = [p for c, p in pids if c == computer]
            if len(pids_computer) > 0:
                celery_tasks.kill_all.apply_async((pids_computer,),
                                                  queue=queue, retry=False)

        self.provider.change_status_all(tasks=tasks_started_ids,
                                        status=TaskStatus.Stopped)
        self.tasks_stop = []

    def fast_check(self):
        if self.provider is None or self.computer_provider is None:
            return False
        if self.not_ran_tasks is None or self.queues is None:
            return False
        if len(self.tasks_stop) > 0:
            return False
        if len(self.dags_start) > 0:
            return False
        if len(self.auxiliary.get('to_send', [])) > 0:
            return False

        queues = set([
            f'{d.computer}_{d.name}' for d in self.docker_provider.all()
            if d.last_activity >= now() - datetime.timedelta(seconds=15)
        ])
        queues_set = set(queues)
        queues_set2 = set(self.queues)
        if queues_set != queues_set2:
            return False

        tasks = self.provider.by_status(TaskStatus.NotRan,
                                        TaskStatus.Queued,
                                        TaskStatus.InProgress)
        tasks_set = {t.id for t in tasks
                     if t.status == TaskStatus.NotRan.value and not t.debug}
        tasks_set2 = {t.id for t in self.tasks
                      if t.status == TaskStatus.NotRan.value}
        if tasks_set != tasks_set2:
            return False

        tasks_set = {t.id for t in tasks
                     if t.status == TaskStatus.InProgress.value}
        tasks_set2 = {t.id for t in self.tasks
                      if t.status == TaskStatus.InProgress.value}
        if tasks_set != tasks_set2:
            return False

        tasks_set = {t.id for t in tasks
                     if t.status == TaskStatus.Queued.value}
        tasks_set2 = {t.id for t in self.tasks
                      if t.status == TaskStatus.Queued.value}
        if tasks_set != tasks_set2:
            return False

        return True

    def start_dag(self, id: int):
        self.dags_start.append(id)

    def process_start_dags(self):
        if len(self.dags_start) == 0:
            return

        for id in self.dags_start:
            can_start_statuses = [
                TaskStatus.Failed.value,
                TaskStatus.Skipped.value,
                TaskStatus.Stopped.value
            ]
            tasks = self.provider.by_dag(id)
            children_all = self.provider.children([t.id for t in tasks])

            def find_resume(task):
                children = [c for c in children_all if c.parent == task.id]
                children = sorted(children, key=lambda x: x.id,
                                  reverse=True)
                if len(children) > 0:
                    for c in children:
                        if c.parent != task.id:
                            continue
                        info = yaml_load(c.additional_info)
                        if 'distr_info' not in info:
                            continue
                        if info['distr_info']['rank'] == 0:
                            return {
                                'master_computer': c.computer_assigned,
                                'master_task_id': c.id,
                                'load_last': True
                            }
                    raise Exception('Master task not found')
                else:
                    return {
                        'master_computer': task.computer_assigned,
                        'master_task_id': task.id,
                        'load_last': True
                    }

            for t in tasks:
                if t.status not in can_start_statuses:
                    continue
                if t.parent:
                    continue

                if t.type == TaskType.Train.value:
                    info = yaml_load(t.additional_info)
                    info['resume'] = find_resume(t)
                    t.additional_info = yaml_dump(info)

                t.status = TaskStatus.NotRan.value
                t.pid = None
                t.started = None
                t.finished = None
                t.computer_assigned = None
                t.celery_id = None
                t.worker_index = None
                t.docker_assigned = None

            self.provider.commit()

        self.dags_start = []

    def build(self):
        try:
            # if self.fast_check():
            #     return

            self.auxiliary = {'time': now()}

            self.create_base()
            self.process_stop_tasks()
            self.process_start_dags()
            self.process_parent_tasks()
            self.load_tasks()
            self.load_computers()
            self.process_tasks()
            self.write_auxiliary()
        except ObjectDeletedError:
            pass
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(key='SupervisorBuilder')
                self.session = Session.create_session(
                    key='SupervisorBuilder')
                self.logger = create_logger(self.session,
                                            'SupervisorBuilder')
            self.logger.error(traceback.format_exc(),
                              ComponentType.Supervisor)
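# Usage sketch (assumed driver loop, not from the source): the supervisor
# creates its own session, so one scheduling iteration is a single call to
# build(); the 1-second polling interval is an assumption.
import time

supervisor = SupervisorBuilder()
while True:
    supervisor.build()
    time.sleep(1)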
class Storage:
    def __init__(self, session: Session, logger=None,
                 component: ComponentType = None,
                 max_file_size: int = 10 ** 5, max_count=10 ** 3):
        self.file_provider = FileProvider(session)
        self.provider = DagStorageProvider(session)
        self.task_provider = TaskProvider(session)
        self.library_provider = DagLibraryProvider(session)
        self.dag_provider = DagProvider(session)
        self.logger = logger
        self.component = component
        self.max_file_size = max_file_size
        self.max_count = max_count

    def log_info(self, message: str):
        if self.logger:
            self.logger.info(message, self.component)

    def copy_from(self, src: int, dag: Dag):
        storages = self.provider.query(DagStorage). \
            filter(DagStorage.dag == src). \
            all()
        libraries = self.library_provider.query(DagLibrary). \
            filter(DagLibrary.dag == src). \
            all()

        s_news = []
        for s in storages:
            s_new = DagStorage(dag=dag.id, file=s.file, path=s.path,
                               is_dir=s.is_dir)
            s_news.append(s_new)

        l_news = []
        for l in libraries:
            l_new = DagLibrary(dag=dag.id, library=l.library,
                               version=l.version)
            l_news.append(l_new)

        self.provider.add_all(s_news)
        self.library_provider.add_all(l_news)

    def _build_spec(self, folder: str):
        ignore_file = os.path.join(folder, 'file.ignore.txt')
        if not os.path.exists(ignore_file):
            ignore_patterns = []
        else:
            ignore_patterns = read_lines(ignore_file)
        ignore_patterns.extend(
            ['log', '/data', '/models', '__pycache__', '*.ipynb'])

        return pathspec.PathSpec.from_lines(
            pathspec.patterns.GitWildMatchPattern, ignore_patterns)

    def upload(self, folder: str, dag: Dag, control_reqs: bool = True):
        self.log_info('upload started')
        hashs = self.file_provider.hashs(dag.project)
        self.log_info('hashes are retrieved')

        all_files = []
        spec = self._build_spec(folder)

        files = glob(os.path.join(folder, '**'))
        for file in files[:]:
            path = os.path.relpath(file, folder)
            if spec.match_file(path) or path == '.':
                continue

            if os.path.isdir(file):
                child_files = glob(os.path.join(folder, file, '**'),
                                   recursive=True)
                files.extend(child_files)

        if self.max_count and len(files) > self.max_count:
            raise Exception(f'files count = {len(files)} '
                            f'But max count = {self.max_count}')

        self.log_info('list of files formed')

        folders_to_add = []
        files_to_add = []
        files_storage_to_add = []
        total_size_added = 0

        for o in files:
            path = os.path.relpath(o, folder)
            if spec.match_file(path) or path == '.':
                continue

            if isdir(o):
                folder_to_add = DagStorage(dag=dag.id, path=path,
                                           is_dir=True)
                folders_to_add.append(folder_to_add)
                continue

            with open(o, 'rb') as f:
                content = f.read()
            size = sys.getsizeof(content)
            if self.max_file_size and size > self.max_file_size:
                raise Exception(f'file = {o} has size {size}.'
                                f' But max size is set to '
                                f'{self.max_file_size}')

            md5 = hashlib.md5(content).hexdigest()
            all_files.append(o)

            if md5 not in hashs:
                file = File(md5=md5, content=content, project=dag.project,
                            dag=dag.id, created=now())
                hashs[md5] = file
                files_to_add.append(file)
                total_size_added += size

            file_storage = DagStorage(dag=dag.id, path=path,
                                      file=hashs[md5], is_dir=False)
            files_storage_to_add.append(file_storage)

        self.log_info('inserting DagStorage folders')

        if len(folders_to_add) > 0:
            self.provider.bulk_save_objects(folders_to_add)

        self.log_info('inserting Files')

        if len(files_to_add) > 0:
            self.file_provider.bulk_save_objects(files_to_add,
                                                 return_defaults=True)

        self.log_info('inserting DagStorage Files')

        if len(files_storage_to_add) > 0:
            for file_storage in files_storage_to_add:
                if isinstance(file_storage.file, File):
                    # noinspection PyUnresolvedReferences
                    file_storage.file = file_storage.file.id
            self.provider.bulk_save_objects(files_storage_to_add)

        dag.file_size += total_size_added
        self.dag_provider.update()

        if INSTALL_DEPENDENCIES and control_reqs:
            reqs = control_requirements(folder, files=all_files)
            for name, rel, version in reqs:
                self.library_provider.add(
                    DagLibrary(dag=dag.id, library=name, version=version))

    def download_dag(self, dag: int, folder: str):
        os.makedirs(folder, exist_ok=True)

        items = self.provider.by_dag(dag)
        items = sorted(items, key=lambda x: x[1] is not None)

        for item, file in items:
            path = os.path.join(folder, item.path)
            if item.is_dir:
                os.makedirs(path, exist_ok=True)
            else:
                with open(path, 'wb') as f:
                    f.write(file.content)

    def download(self, task: int):
        task = self.task_provider.by_id(
            task, joinedload(Task.dag_rel, innerjoin=True))
        folder = join(TASK_FOLDER, str(task.id))
        self.download_dag(task.dag, folder)

        config = Config.from_yaml(task.dag_rel.config)
        info = config['info']

        try:
            data_folder = os.path.join(DATA_FOLDER, info['project'])
            os.makedirs(data_folder, exist_ok=True)
            os.symlink(data_folder, os.path.join(folder, 'data'),
                       target_is_directory=True)
        except FileExistsError:
            pass

        try:
            model_folder = os.path.join(MODEL_FOLDER, info['project'])
            os.makedirs(model_folder, exist_ok=True)
            os.symlink(model_folder, os.path.join(folder, 'models'),
                       target_is_directory=True)
        except FileExistsError:
            pass

        sys.path.insert(0, folder)
        return folder

    def import_executor(self, folder: str, base_folder: str, executor: str,
                        libraries: List[Tuple] = None):
        sys.path.insert(0, base_folder)

        spec = self._build_spec(folder)
        was_installation = False

        folders = [
            p for p in glob(f'{folder}/*', recursive=True)
            if os.path.isdir(p) and not spec.match_file(p)
        ]
        folders += [folder]

        library_names = set(n for n, v in (libraries or []))
        library_versions = {n: v for n, v in (libraries or [])}

        for n in library_names:
            try:
                version = pkg_resources.get_distribution(n).version
                need_install = library_versions[n] != version
            except Exception:
                need_install = True

            if INSTALL_DEPENDENCIES and need_install:
                os.system(f'pip install {n}=={library_versions[n]}')
                was_installation = True

        def is_valid_class(cls: pyclbr.Class):
            return cls.name == executor or \
                   cls.name.lower() == executor or \
                   to_snake(cls.name) == executor

        def relative_name(path: str):
            rel = os.path.relpath(path, base_folder)
            parts = [str(p).split('.')[0] for p in rel.split(os.sep)]
            return '.'.join(parts)

        for (module_loader, module_name, ispkg) in \
                pkgutil.iter_modules(folders):
            module = module_loader.find_module(module_name)
            rel_path = os.path.relpath(
                os.path.splitext(module.path)[0],
                base_folder).replace('/', '.')
            try:
                classes = pyclbr.readmodule(rel_path, path=[base_folder])
            except Exception:
                continue

            for k, v in classes.items():
                if is_valid_class(v):
                    importlib.import_module(relative_name(module.path))
                    return True, was_installation

        return False, was_installation
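# Usage sketch (illustrative; `session`, `dag`, the folder path, and the task
# id are assumptions): upload a local experiment folder as the code of a dag,
# then materialize a task's code into its task folder.
storage = Storage(session)
storage.upload('/path/to/experiment', dag)  # dag is a Dag row added earlier
task_folder = storage.download(task=42)     # hypothetical task id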
def dag_tag_remove():
    data = request_data()
    provider = DagProvider(_write_session)
    provider.remove_tag(dag=data['dag'], tag=data['tag'])
class SegmentationReportBuilder:
    def __init__(self,
                 session: Session,
                 task: Task,
                 layout: str,
                 part: str = 'valid',
                 name: str = 'img_segment',
                 max_img_size: Tuple[int, int] = None,
                 stack_type: str = 'vertical',
                 main_metric: str = 'dice',
                 plot_count: int = 0,
                 colors: List[Tuple] = None):
        self.session = session
        self.task = task
        self.layout = layout
        self.part = part
        self.name = name or 'img_segment'
        self.max_img_size = max_img_size
        self.stack_type = stack_type
        self.main_metric = main_metric
        self.colors = colors
        self.plot_count = plot_count

        self.dag_provider = DagProvider(session)
        self.report_provider = ReportProvider(session)
        self.layout_provider = ReportLayoutProvider(session)
        self.task_provider = TaskProvider(session)
        self.report_img_provider = ReportImgProvider(session)
        self.report_task_provider = ReportTasksProvider(session)
        self.report_series_provider = ReportSeriesProvider(session)

        self.project = self.task_provider.project(task.id).id
        self.layout = self.layout_provider.by_name(layout)
        self.layout_dict = yaml_load(self.layout.content)

        self.create_base()

    def create_base(self):
        report = Report(config=yaml_dump(self.layout_dict),
                        time=now(),
                        layout=self.layout.name,
                        project=self.project,
                        name=self.name)
        self.report_provider.add(report)
        self.report_task_provider.add(
            ReportTasks(report=report.id, task=self.task.id))

        self.task.report = report.id
        self.task_provider.update()

    def encode_pred(self, mask: np.array):
        res = np.zeros((*mask.shape[1:], 3), dtype=np.uint8)
        for i, c in enumerate(mask):
            c = np.repeat(c[:, :, None], 3, axis=2)
            color = self.colors[i] if self.colors is not None \
                else (255, 255, 255)
            res += (c * color).astype(np.uint8)
        return res

    def plot_mask(self, img: np.array, mask: np.array):
        if len(img.shape) == 2:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

        img = img.astype(np.uint8)
        mask = mask.astype(np.uint8)

        for i, c in enumerate(mask):
            contours, _ = cv2.findContours(c, cv2.RETR_LIST,
                                           cv2.CHAIN_APPROX_NONE)
            color = self.colors[i] if self.colors else (0, 255, 0)
            # j avoids shadowing the channel index i
            for j in range(0, len(contours)):
                cv2.polylines(img, contours[j], True, color, 2)
        return img

    def process_scores(self, scores):
        for key, item in self.layout_dict['items'].items():
            item['name'] = key
            if item['type'] == 'series' and item['key'] in scores:
                series = ReportSeries(name=item['name'],
                                      value=scores[item['key']],
                                      epoch=0,
                                      time=now(),
                                      task=self.task.id,
                                      part='valid',
                                      stage='stage1')
                self.report_series_provider.add(series)

    def process_pred(self, imgs: np.array, preds: dict,
                     targets: np.array = None, attrs=None, scores=None):
        for key, item in self.layout_dict['items'].items():
            item['name'] = key
            if item['type'] != 'img_segment':
                continue

            report_imgs = []
            dag = self.dag_provider.by_id(self.task.dag)

            for i in range(len(imgs)):
                if self.plot_count <= 0:
                    break
                if targets is not None:
                    img = self.plot_mask(imgs[i], targets[i])
                else:
                    img = imgs[i]

                imgs_add = [img]
                # pred_key avoids shadowing the layout item key above
                for pred_key, value in preds.items():
                    imgs_add.append(self.encode_pred(value[i]))

                for j in range(len(imgs_add)):
                    imgs_add[j] = resize_saving_ratio(imgs_add[j],
                                                      self.max_img_size)

                if self.stack_type == 'horizontal':
                    img = np.hstack(imgs_add)
                else:
                    img = np.vstack(imgs_add)

                attr = attrs[i] if attrs else {}
                score = None
                if targets is not None:
                    score = scores[self.main_metric][i]

                retval, buffer = cv2.imencode('.jpg', img)
                report_img = ReportImg(group=item['name'],
                                       epoch=0,
                                       task=self.task.id,
                                       img=buffer,
                                       dag=self.task.dag,
                                       part=self.part,
                                       project=self.project,
                                       score=score,
                                       **attr)
                self.plot_count -= 1
                report_imgs.append(report_img)
                dag.img_size += report_img.size

            self.dag_provider.commit()
            self.report_img_provider.bulk_save_objects(report_imgs)
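# Usage sketch (illustrative; `session`, `task`, and the arrays are
# assumptions): build a report for a validation batch of a segmentation task.
builder = SegmentationReportBuilder(session=session, task=task,
                                    layout='img_segment', plot_count=8,
                                    colors=[(255, 0, 0)])
builder.process_scores({'dice': 0.85})
builder.process_pred(imgs=imgs, preds={'prediction': pred_masks},
                     targets=target_masks,
                     scores={'dice': per_sample_dice})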
def dags():
    data = request_data()
    options = PaginatorOptions(**data['paginator'])
    provider = DagProvider(_read_session)
    res = provider.get(data, options)
    return res
def dag_tags():
    data = request_data()
    provider = DagProvider(_write_session)
    return provider.tags(data['name'])
def remove_all_dags():
    data = request_data()
    provider = DagProvider(_write_session)
    dags = provider.by_project(data['project'])
    provider.remove_all([d.id for d in dags])