def create_dag(self):
    """Persist a standard DAG built from the parsed config."""
    # Prefer the raw config text when available; otherwise serialize the dict.
    serialized = self.config_text or yaml_dump(self.config)
    record = Dag(
        config=serialized,
        project=self.project,
        name=self.info['name'],
        docker_img=self.info.get('docker_img'),
        type=DagType.Standard.value,
        created=now(),
        report=self.dag_report_id,
    )
    self.dag = self.dag_provider.add(record)
def report_add_end():
    """Create a new Report from the request payload using a known layout."""
    payload = request_data()
    report_provider = ReportProvider(_write_session)
    all_layouts = ReportLayoutProvider(_write_session).all()
    chosen = all_layouts[payload['layout']]
    report_provider.add(
        Report(
            name=payload['name'],
            project=payload['project'],
            config=yaml_dump(chosen),
        )
    )
def create_task(self, k: str, v: dict, name: str, info: dict):
    """Create a Task row for executor ``k`` described by config dict ``v``.

    Returns a ``(task, report)`` tuple; ``report`` is None unless this is
    a train task and a layout is configured.
    """
    task_type = TaskType.User.value
    # Train tasks are either declared explicitly or inferred from executor type.
    if v.get('task_type') == 'train' or Executor.is_trainable(v['type']):
        task_type = TaskType.Train.value

    # gpu may be a single count ('2') or an elastic range ('1-4').
    gpu = str(v.get('gpu', '0'))
    if '-' not in gpu:
        gpu = int(gpu)
        gpu_max = gpu
    else:
        gpu, gpu_max = map(int, gpu.split('-'))

    if gpu == 0 and gpu_max > 0:
        raise Exception(f"Executor = {k} Gpu_max can't be>0 when gpu=0")

    task = Task(
        name=name,
        executor=k,
        computer=self.info.get('computer') or v.get('computer'),
        gpu=gpu,
        gpu_max=gpu_max,
        cpu=v.get('cpu', 1),
        memory=v.get('memory', 0.1),
        dag=self.dag.id,
        debug=self.debug,
        steps=int(v.get('steps', '1')),
        type=task_type,
    )
    task.additional_info = yaml_dump(info)

    report = None
    if self.layout_name and task_type == TaskType.Train.value:
        if self.layout_name not in self.layouts:
            # Report the layout that was actually looked up. The previous
            # message interpolated v['report'], which may not even exist
            # (KeyError) and names the wrong value.
            raise Exception(f'Unknown layout = {self.layout_name}')

        report_config = self.layouts[self.layout_name]
        info['report_config'] = report_config
        # re-serialize: info now carries the report config as well
        task.additional_info = yaml_dump(info)
        report = Report(
            config=yaml_dump(report_config),
            name=task.name,
            project=self.project,
            layout=self.layout_name,
        )
    return task, report
def add_project(self, name: str, class_names: dict = None,
                ignore_folders: List[str] = None):
    """Create a Project row and ensure its data/model folders exist on disk.

    :param name: project name; also used as the on-disk folder name.
    :param class_names: optional mapping serialized into the row as YAML.
    :param ignore_folders: optional list serialized into the row as YAML.
    :return: whatever ``self.session.add`` returns for the new project.
    """
    class_names = class_names or {}
    ignore_folders = ignore_folders or []

    # isinstance is the idiomatic type check and matches the assert below
    # (the original used `type(...) == dict` for the first one only).
    assert isinstance(class_names, dict), 'class_names type must be dict'
    assert isinstance(ignore_folders, list), \
        'ignore_folders type must be list'

    project = Project(name=name,
                      class_names=yaml_dump(class_names),
                      ignore_folders=yaml_dump(ignore_folders))
    project = self.session.add(project)

    # Per-project storage folders; exist_ok makes this idempotent.
    os.makedirs(os.path.join(DATA_FOLDER, name), exist_ok=True)
    os.makedirs(os.path.join(MODEL_FOLDER, name), exist_ok=True)
    return project
def write_auxiliary(self):
    """Upsert the supervisor's auxiliary record, skipping oversized payloads."""
    elapsed = (now() - self.auxiliary['time']).total_seconds()
    self.auxiliary['duration'] = elapsed

    record = Auxiliary(name='supervisor', data=yaml_dump(self.auxiliary))
    # Guard against writing an overly large blob to the table.
    if len(record.data) > 16000:
        return
    self.auxiliary_provider.create_or_update(record, 'name')
def create_base(self):
    """Create the report row, link it to the task, and store the back-ref."""
    report = Report(
        config=yaml_dump(self.layout_dict),
        time=now(),
        layout=self.layout.name,
        project=self.project,
        name=self.name,
    )
    self.report_provider.add(report)

    link = ReportTasks(report=report.id, task=self.task.id)
    self.report_task_provider.add(link)

    self.task.report = report.id
    self.task_provider.update()
def work(self):
    """Trace a trained model from its checkpoint and register it as a Model.

    Loads the train task and its DAG, traces the checkpoint found in the
    task's log folder, then copies the traced module and best weights into
    the project's model folder. The DB row is added with commit=False so a
    failed file copy rolls the insert back.
    """
    task_provider = TaskProvider(self.session)
    task = task_provider.by_id(self.train_task)
    dag = DagProvider(self.session).by_id(self.dag_pipe,
                                          joined_load=[Dag.project_rel])

    # Artifacts live under the child task's folder when one exists.
    task_dir = join(TASK_FOLDER, str(self.child_task or task.id))
    src_log = f'{task_dir}/log'
    models_dir = join(MODEL_FOLDER, dag.project_rel.name)
    os.makedirs(models_dir, exist_ok=True)

    self.info(f'Task = {self.task} child_task: {self.child_task}')

    model_path_tmp = f'{src_log}/traced.pth'
    traced = trace_model_from_checkpoint(src_log, self)

    model = Model(
        dag=self.dag_pipe,
        interface=self.interface,
        slot=self.slot,
        score_local=task.score,
        created=now(),
        name=self.name,
        project=dag.project,
        interface_params=yaml_dump(self.interface_params)
    )
    provider = ModelProvider(self.session)
    # commit deferred until the artifact copies below succeed
    provider.add(model, commit=False)

    try:
        model_path = f'{models_dir}/{model.name}.pth'
        model_weight_path = f'{models_dir}/{model.name}_weight.pth'
        torch.jit.save(traced, model_path_tmp)
        shutil.copy(model_path_tmp, model_path)
        # publish the best checkpoint weights alongside the traced model
        shutil.copy(f'{src_log}/checkpoints/best.pth', model_weight_path)

        # record where the traced file ended up inside interface_params
        interface_params = yaml_load(model.interface_params)
        interface_params['file'] = join('models', model.name + '.pth')
        model.interface_params = yaml_dump(interface_params)
        provider.update()
    except Exception as e:
        provider.rollback()
        raise e
def dag_model_start(session: Session, data: dict):
    """Start a model's pipe as a new standard DAG.

    Looks up the model and the source DAG, extracts the requested pipe from
    the DAG config, applies the newest equation version (if any) to the
    pipe, stamps the model id into each executor, and launches the
    resulting config via ``dag_standard``.
    """
    provider = ModelProvider(session)
    model = provider.by_id(data['model_id'])
    dag_provider = DagProvider(session)
    dag = dag_provider.by_id(data['dag'], joined_load=[Dag.project_rel])
    project = dag.project_rel
    src_config = Config.from_yaml(dag.config)
    pipe = src_config['pipes'][data['pipe']['name']]

    equations = yaml_load(model.equations)
    versions = data['pipe']['versions']

    if len(versions) > 0:
        # Newest version first; mark it as used and merge its equations into
        # the pipe (into the single executor when there is exactly one).
        pipe_equations = yaml_load(versions[0]['equations'])
        versions[0]['used'] = now()

        if len(pipe) == 1:
            pipe[list(pipe)[0]].update(pipe_equations)
        else:
            pipe.update(pipe_equations)

    equations[data['pipe']['name']] = versions
    model.equations = yaml_dump(equations)

    # every executor in the pipe operates on this model
    for v in pipe.values():
        v['model_id'] = model.id

    config = {
        'info': {
            'name': data['pipe']['name'],
            'project': project.name
        },
        'executors': pipe
    }

    if model.dag:
        old_dag = dag_provider.by_id(model.dag)
        # re-point the model only when the dag name actually changed
        if old_dag.name != dag.name:
            model.dag = dag.id
    else:
        model.dag = dag.id

    provider.commit()

    dag_standard(
        session=session,
        config=config,
        debug=False,
        upload_files=False,
        copy_files_from=data['dag']
    )
def computer_sync_end():
    """Request a manual sync by writing into each computer's meta blob."""
    payload = request_data()
    provider = ComputerProvider(_write_session)
    target = payload.get('computer')

    for computer in provider.all():
        # When a specific computer was requested, skip all the others.
        if target and target != computer.name:
            continue

        meta = yaml_load(computer.meta)
        meta['manual_sync'] = {
            'project': payload['id'],
            'ignore_folders': yaml_load(payload['ignore_folders'])
        }
        computer.meta = yaml_dump(meta)
        provider.update()
def create_report(self):
    """Create a DAG-level report when a layout is configured."""
    self.dag_report_id = None
    layout_name = self.layout_name
    if not layout_name:
        return

    if layout_name not in self.layouts:
        raise Exception(f'Unknown layout = {layout_name}')

    report = Report(
        config=yaml_dump(self.layouts[layout_name]),
        name=self.info['name'],
        project=self.project,
        layout=layout_name,
    )
    self.report_provider.add(report)
    self.dag_report_id = report.id
def sync_manual(self, computer: Computer, provider: ComputerProvider):
    """
    button sync was clicked manually

    Reads the 'manual_sync' request stored in the computer's meta, syncs
    the project folders from every other online docker's computer to this
    one, and finally clears the request (it is one-shot).
    """
    if not computer.meta:
        return
    meta = yaml_load(computer.meta)
    if 'manual_sync' not in meta:
        return

    manual_sync = meta['manual_sync']
    project_provider = ProjectProvider(self.session)
    docker_provider = DockerProvider(self.session)

    dockers = docker_provider.get_online()
    project = project_provider.by_id(manual_sync['project'])

    # NOTE(review): assumes the stored manual_sync dict contains
    # 'sync_folders' — confirm the writer side always includes it.
    sync_folders = manual_sync['sync_folders']
    ignore_folders = manual_sync['ignore_folders']

    sync_folders = correct_folders(sync_folders, project.name)
    ignore_folders = correct_folders(ignore_folders, project.name)

    # fall back to empty lists if correction produced something else
    if not isinstance(sync_folders, list):
        sync_folders = []
    if not isinstance(ignore_folders, list):
        ignore_folders = []

    for docker in dockers:
        # never sync a computer from itself
        if docker.computer == computer.name:
            continue

        source = provider.by_name(docker.computer)
        folders = [[s, ignore_folders] for s in sync_folders]

        computer.syncing_computer = source.name
        provider.update()

        try:
            sync_directed(
                self.session,
                target=computer,
                source=source,
                folders=folders
            )
        except Exception as e:
            self.process_error(e)

    # clear the one-shot request after processing
    del meta['manual_sync']
    computer.meta = yaml_dump(meta)
    provider.update()
def stop_all_dags():
    """Mark every running/pending task in a project as stopped."""
    payload = request_data()
    provider = TaskProvider(_write_session)
    tasks = provider.by_status(
        TaskStatus.InProgress,
        TaskStatus.Queued,
        TaskStatus.NotRan,
        project=payload['project'],
    )

    for task in tasks:
        # Persist the stop flag inside the task's additional_info blob.
        extra = yaml_load(task.additional_info)
        extra['stopped'] = True
        task.additional_info = yaml_dump(extra)

    provider.update()
    supervisor.stop_tasks(tasks)
def create_dag(self):
    """Persist the DAG row, appending the grid-cell label when present."""
    self.log_info('create_dag')

    dag_name = self.info['name']
    if self.grid_cell:
        dag_name = f'{dag_name} {self.grid_cell[1]}'

    record = Dag(
        config=self.config_text or yaml_dump(self.config),
        project=self.project,
        name=dag_name,
        docker_img=self.info.get('docker_img'),
        type=DagType.Standard.value,
        created=now(),
        report=self.dag_report_id,
    )
    self.dag = self.dag_provider.add(record)
def _dag(config: str, debug: bool = False, control_reqs=True,
         params: Tuple[str] = ()):
    """Parse a DAG config file and launch it (one DAG per grid cell).

    ``params`` are "key=value" overrides merged into the parsed config.
    The current git commit (when available) is appended to the DAG name.
    Returns the list of created dags.
    """
    logger = create_logger(_session, name='_dag')
    logger.info('started', ComponentType.Client)

    # context manager so the config file handle is always closed
    # (the original `open(...).read()` leaked the handle)
    with open(config, 'r') as f:
        config_text = f.read()
    config_parsed = yaml_load(config_text)
    params = dict_from_list_str(params)
    config_parsed = merge_dicts_smart(config_parsed, params)
    config_text = yaml_dump(config_parsed)

    logger.info('config parsed', ComponentType.Client)

    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip()
        config_parsed['info']['name'] += f'_{commit.decode("utf-8")[:6]}'
    except Exception:
        # not a git checkout (or git unavailable) — keep the plain name
        logger.info('commit not parsed')

    type_name = config_parsed['info'].get('type', 'standard')
    if type_name == DagType.Standard.name.lower():
        # one dag per grid cell; a single None cell when no grid is defined
        cells = grid_cells(
            config_parsed['grid']) if 'grid' in config_parsed else [None]
        dags = []
        for cell in cells:
            dag = dag_standard(
                session=_session,
                config=config_parsed,
                debug=debug,
                config_text=config_text,
                config_path=config,
                control_reqs=control_reqs,
                logger=logger,
                component=ComponentType.Client,
                grid_cell=cell
            )
            dags.append(dag)
        return dags

    return [
        dag_pipe(
            session=_session,
            config=config_parsed,
            config_text=config_text
        )
    ]
def _dag(config: str, debug: bool = False, control_reqs=True,
         params: Tuple[str] = ()):
    """Parse a DAG config file and launch it (one DAG per grid cell).

    ``params`` are "key=value" overrides merged into the parsed config.
    Returns the list of created dags.
    """
    logger = create_logger(_session, name='_dag')
    logger.info('started', ComponentType.Client)

    # context manager so the config file handle is always closed
    # (the original `open(...).read()` leaked the handle)
    with open(config, 'r') as f:
        config_text = f.read()
    config_parsed = yaml_load(config_text)
    params = dict_from_list_str(params)
    config_parsed = merge_dicts_smart(config_parsed, params)
    config_text = yaml_dump(config_parsed)

    logger.info('config parsed', ComponentType.Client)

    type_name = config_parsed['info'].get('type', 'standard')
    if type_name == DagType.Standard.name.lower():
        # one dag per grid cell; a single None cell when no grid is defined
        cells = grid_cells(
            config_parsed['grid']) if 'grid' in config_parsed else [None]
        dags = []
        for cell in cells:
            dag = dag_standard(
                session=_session,
                config=config_parsed,
                debug=debug,
                config_text=config_text,
                config_path=config,
                control_reqs=control_reqs,
                logger=logger,
                component=ComponentType.Client,
                grid_cell=cell
            )
            dags.append(dag)
        return dags

    return [
        dag_pipe(
            session=_session,
            config=config_parsed,
            config_text=config_text
        )
    ]
def execute(self):
    """Run the task's executor and handle multi-stage re-queueing.

    When the executor returns a 'stage'/'stages' pair and more stages
    remain, the task is re-queued on its personal queue to continue with
    the next stage; otherwise the step is finished and the task is marked
    Success.
    """
    self.info('execute start')
    res = self.executor(task=self.task,
                        task_provider=self.provider,
                        dag=self.dag)
    self.info('execute executor finished')
    res = res or {}
    self.task.result = yaml_dump(res)
    self.provider.commit()

    if 'stage' in res and 'stages' in res:
        index = res['stages'].index(res['stage'])
        if index < len(res['stages']) - 1:
            self.executor.info(f'stage = {res["stage"]} done. '
                               f'Go to the stage = '
                               f'{res["stages"][index + 1]}')

            # brief pause before handing the task back to the queue
            time.sleep(3)

            self.executor.info(f'sending {(self.id, self.repeat_count)} '
                               f'to {self.queue_personal}')

            self.task.status = TaskStatus.Queued.value
            self.provider.commit()
            # re-enqueue this same task to run the next stage
            execute.apply_async((self.id, self.repeat_count),
                                queue=self.queue_personal,
                                retry=False)
            return

    self.executor.step.finish()
    self.provider.change_status(self.task, TaskStatus.Success)
    self.info('execute end')
def _dag(config: str, debug: bool = False, control_reqs=True,
         params: Tuple[str] = ()):
    """Parse a config file, apply param overrides, and start the DAG.

    Runs DB migrations first, then dispatches to ``dag_standard`` or
    ``dag_pipe`` based on the config's ``info.type``.
    """
    migrate()

    # context manager so the config file handle is always closed
    # (the original `open(...).read()` leaked the handle)
    with open(config, 'r') as f:
        config_text = f.read()
    config_parsed = yaml_load(config_text)
    params = dict_from_list_str(params)
    config_parsed = merge_dicts_smart(config_parsed, params)
    config_text = yaml_dump(config_parsed)

    type_name = config_parsed['info'].get('type', 'standard')
    if type_name == DagType.Standard.name.lower():
        return dag_standard(session=_session,
                            config=config_parsed,
                            debug=debug,
                            config_text=config_text,
                            config_path=config,
                            control_reqs=control_reqs)

    return dag_pipe(session=_session,
                    config=config_parsed,
                    config_text=config_text)
def update_layout_end(self, id: int, layout: str, layouts: dict):
    """Replace a report's layout name and serialized config, then commit."""
    serialized = yaml_dump(layouts[layout])
    report = self.by_id(id)
    report.config = serialized
    report.layout = layout
    self.commit()
def work(self):
    """Run a catalyst experiment for this task.

    Builds the experiment/runner pair from the user's expdir, patches
    callback creation to inject checkpoint-fixing callbacks, records
    distributed resume info when needed, restricts execution to the first
    stage, and (on the master with tracing enabled) traces the resulting
    model. Returns the stage bookkeeping for the caller.
    """
    args, config = self.parse_args_uargs()
    set_global_seed(args.seed)

    Experiment, R = import_experiment_and_runner(Path(args.expdir))

    runner_params = config.pop('runner_params', {})
    experiment = Experiment(config)
    runner: Runner = R(**runner_params)
    register()

    self.experiment = experiment
    self.runner = runner

    # copy: stages_config is mutated below, but we report the full list
    stages = experiment.stages[:]

    if self.master:
        # step count is tracked on the parent task when one exists
        task = self.task if not self.task.parent \
            else self.task_provider.by_id(self.task.parent)
        task.steps = len(stages)
        self.task_provider.commit()

    self._checkpoint_fix_config(experiment)

    # wrap get_callbacks so our checkpoint-fix callbacks are always applied
    _get_callbacks = experiment.get_callbacks

    def get_callbacks(stage):
        res = self.callbacks()
        for k, v in _get_callbacks(stage).items():
            res[k] = v

        self._checkpoint_fix_callback(res)
        return res

    experiment.get_callbacks = get_callbacks

    if experiment.logdir is not None:
        dump_environment(config, experiment.logdir, args.configs)

    if self.distr_info:
        # record how a restarted task can find its distributed master
        # (master task id is derived by subtracting this worker's rank)
        info = yaml_load(self.task.additional_info)
        info['resume'] = {
            'master_computer': self.distr_info['master_computer'],
            'master_task_id': self.task.id - self.distr_info['rank'],
            'load_best': True
        }
        self.task.additional_info = yaml_dump(info)
        self.task_provider.commit()

    # only the first stage is executed in this run
    experiment.stages_config = {
        k: v
        for k, v in experiment.stages_config.items()
        if k == experiment.stages[0]
    }

    runner.run_experiment(experiment, check=args.check)

    if self.master and self.trace:
        traced = trace_model_from_checkpoint(self.experiment.logdir, self)
        torch.jit.save(traced, self.trace)

    return {'stage': experiment.stages[-1], 'stages': stages}
def add_item(self, k: str, v: dict):
    """Store one report layout under name ``k``."""
    layout = ReportLayout(content=yaml_dump(v),
                          name=k,
                          last_modified=now())
    self.add(layout)
def add_child_process(self, pid: int):
    """Append ``pid`` to the task's recorded child processes and persist."""
    extra = yaml_load(self.task.additional_info)
    existing = extra.get('child_processes', [])
    extra['child_processes'] = existing + [pid]
    self.task.additional_info = yaml_dump(extra)
    self.task_provider.update()
def dag_start():
    """Restart every failed/skipped/stopped root task of a DAG.

    For each restartable task, stores 'resume' info pointing at the
    distributed master (rank 0) child of the latest attempt when one
    exists, then resets the task's runtime fields so it can be scheduled
    again.
    """
    data = request_data()
    provider = DagProvider(_write_session)
    task_provider = TaskProvider(_write_session)
    id = int(data['id'])
    dag = provider.by_id(id, joined_load=['tasks'])

    can_start_statuses = [
        TaskStatus.Failed.value, TaskStatus.Skipped.value,
        TaskStatus.Stopped.value
    ]
    tasks = list(dag.tasks)

    def find_resume(task):
        children = task_provider.children(task.id)
        # newest children first: resume from the latest attempt
        children = sorted(children, key=lambda x: x.id, reverse=True)
        if len(children) > 0:
            for c in children:
                if c.parent != task.id:
                    continue

                info = yaml_load(c.additional_info)
                if 'distr_info' not in info:
                    continue

                # rank 0 is the distributed master
                if info['distr_info']['rank'] == 0:
                    return {
                        'master_computer': c.computer_assigned,
                        'master_task_id': c.id,
                        'load_last': True
                    }
            raise Exception('Master task not found')
        else:
            # no children: the task itself is its own resume point
            return {
                'master_computer': task.computer_assigned,
                'master_task_id': task.id,
                'load_last': True
            }

    for t in tasks:
        if t.status not in can_start_statuses:
            continue
        # child tasks are restarted via their parents
        if t.parent:
            continue

        info = yaml_load(t.additional_info)
        info['resume'] = find_resume(t)
        t.additional_info = yaml_dump(info)

        # reset all scheduling state so the supervisor picks it up afresh
        t.status = TaskStatus.NotRan.value
        t.pid = None
        t.started = None
        t.finished = None
        t.computer_assigned = None
        t.celery_id = None
        t.worker_index = None
        t.docker_assigned = None

    provider.commit()
def process_start_dags(self):
    """Restart the queued DAG ids collected in ``self.dags_start``.

    Mirrors the server-side dag_start logic: every failed/skipped/stopped
    root task is reset to NotRan; train tasks additionally get 'resume'
    info pointing at the rank-0 (master) child of the latest attempt.
    """
    if len(self.dags_start) == 0:
        return

    for id in self.dags_start:
        can_start_statuses = [
            TaskStatus.Failed.value, TaskStatus.Skipped.value,
            TaskStatus.Stopped.value
        ]
        tasks = self.provider.by_dag(id)
        # fetch all children in one query; filtered per task below
        children_all = self.provider.children([t.id for t in tasks])

        def find_resume(task):
            children = [c for c in children_all if c.parent == task.id]
            # newest first: resume from the latest attempt
            children = sorted(children, key=lambda x: x.id, reverse=True)
            if len(children) > 0:
                for c in children:
                    if c.parent != task.id:
                        continue

                    info = yaml_load(c.additional_info)
                    if 'distr_info' not in info:
                        continue

                    # rank 0 is the distributed master
                    if info['distr_info']['rank'] == 0:
                        return {
                            'master_computer': c.computer_assigned,
                            'master_task_id': c.id,
                            'load_last': True
                        }
                raise Exception('Master task not found')
            else:
                # no children: the task itself is its own resume point
                return {
                    'master_computer': task.computer_assigned,
                    'master_task_id': task.id,
                    'load_last': True
                }

        for t in tasks:
            if t.status not in can_start_statuses:
                continue
            # child tasks are restarted via their parents
            if t.parent:
                continue

            # only train tasks carry resume info
            if t.type == TaskType.Train.value:
                info = yaml_load(t.additional_info)
                info['resume'] = find_resume(t)
                t.additional_info = yaml_dump(info)

            # reset all scheduling state for a fresh run
            t.status = TaskStatus.NotRan.value
            t.pid = None
            t.started = None
            t.finished = None
            t.computer_assigned = None
            t.celery_id = None
            t.worker_index = None
            t.docker_assigned = None

    self.provider.commit()
    self.dags_start = []
def get(self, filter: dict, options: PaginatorOptions):
    """Paginated task listing plus sidebar data.

    Returns a dict with the filtered tasks ('data'), their 'total' count,
    the 20 most recent projects and dags, and the pipe-type dags with
    their slots/interfaces ('dags_model').
    """
    query = self.query(Task, Project.name).\
        join(Dag, Dag.id == Task.dag).\
        join(Project, Project.id == Dag.project).\
        options(joinedload(Task.dag_rel, innerjoin=True))

    query = self._get_filter(query, filter)

    total = query.count()
    paginator = self.paginator(query, options)
    res = []

    for p, project_name in paginator.all():
        if p.dag_rel is None:
            continue

        # additional_info can be large — excluded from serialization
        item = {**self.to_dict(p, rules=('-additional_info', ))}
        item['status'] = to_snake(TaskStatus(item['status']).name)
        item['type'] = to_snake(TaskType(item['type']).name)
        # expand the project FK into {id, name}
        item['dag_rel']['project'] = {
            'id': item['dag_rel']['project'],
            'name': project_name
        }
        # duration: 0 if never started; live delta while in progress
        if p.started is None:
            delta = 0
        elif p.status == TaskStatus.InProgress.value:
            delta = (now() - p.started).total_seconds()
        else:
            finish = (p.finished or p.last_activity)
            delta = (finish - p.started).total_seconds()
        item['duration'] = duration_format(delta)
        if p.dag_rel is not None:
            res.append(item)

    if filter.get('report'):
        # flag tasks already attached to the requested report
        tasks_within_report = self.query(
            ReportTasks.task
        ).filter(ReportTasks.report == int(filter['report']))
        tasks_within_report = {t[0] for t in tasks_within_report}
        for r in res:
            r['report_full'] = r['id'] in tasks_within_report

    projects = self.query(Project.name, Project.id). \
        order_by(Project.id.desc()). \
        limit(20). \
        all()
    dags = self.query(Dag.name, Dag.id). \
        order_by(Dag.id.desc()). \
        limit(20). \
        all()
    projects = [{'name': name, 'id': id} for name, id in projects]
    dags = [{'name': name, 'id': id} for name, id in dags]

    dags_model = self.query(Dag.name, Dag.id, Dag.config). \
        filter(Dag.type == DagType.Pipe.value). \
        order_by(Dag.id.desc()). \
        all()

    dags_model_dict = []
    used_dag_names = set()

    for name, id, config in dags_model:
        # keep only the newest dag per name (list is id-descending)
        if name in used_dag_names:
            continue

        config = Config.from_yaml(config)
        slots = []
        for pipe in config['pipes'].values():
            for k, v in pipe.items():
                if 'slot' in v:
                    slots.append(v['slot'])
                elif 'slots' in v:
                    slots.extend(v['slots'])

        dag = {
            'name': name,
            'id': id,
            'slots': slots,
            'interfaces': [
                {
                    'name': k,
                    'params': yaml_dump(v)
                } for k, v in config['interfaces'].items()
            ]
        }
        dags_model_dict.append(dag)
        used_dag_names.add(name)

    return {
        'total': total,
        'data': res,
        'projects': projects,
        'dags': dags,
        'dags_model': dags_model_dict
    }
def create_tasks(self):
    """Clone the source DAG's tasks, dependencies and storage into the copy.

    Root tasks of ``self.dag`` are duplicated into the new dag
    (``self.dag_db``); dependency edges are remapped via an old->new id
    map. Stored files whose paths match ``self.file_changes`` get their
    content rewritten and re-registered by md5.
    """
    tasks = self.task_provider.by_dag(self.dag)
    tasks_new = []
    tasks_old = []

    for t in tasks:
        # child tasks are runtime artifacts; only roots are cloned
        if t.parent:
            continue

        task = Task(
            name=t.name,
            status=TaskStatus.NotRan.value,
            computer=t.computer,
            gpu=t.gpu,
            gpu_max=t.gpu_max,
            cpu=t.cpu,
            executor=t.executor,
            memory=t.memory,
            steps=t.steps,
            dag=self.dag_db.id,
            debug=t.debug,
            type=t.type,
        )
        task.additional_info = t.additional_info
        tasks_new.append(task)
        tasks_old.append(t)

    # return_defaults=True populates the new ids needed for the map below
    self.task_provider.bulk_save_objects(tasks_new, return_defaults=True)
    old2new = {
        t_old.id: t_new.id
        for t_new, t_old in zip(tasks_new, tasks_old)
    }

    dependencies = self.task_provider.get_dependencies(self.dag)
    dependencies_new = []
    for d in dependencies:
        d_new = TaskDependence(task_id=old2new[d.task_id],
                               depend_id=old2new[d.depend_id])
        dependencies_new.append(d_new)
    self.task_provider.bulk_save_objects(dependencies_new,
                                         return_defaults=False)

    changes = yaml_load(self.file_changes)
    storages = self.dag_storage_provider.by_dag(self.dag)
    storages_new = []

    for s, f in storages:
        # no replacement mapping at all -> nothing is copied
        if not isinstance(changes, dict):
            continue

        replace = self.find_replace(changes, s.path)
        if replace is not None and f:
            content = f.content.decode('utf-8')
            if s.path.endswith('.yml'):
                # YAML files are merged structurally
                data = yaml_load(content)
                data = merge_dicts_smart(data, replace)
                content = yaml_dump(data)
            else:
                # NOTE(review): assumes `replace` yields (old, new) pairs
                # here — confirm find_replace's return type; iterating a
                # plain dict this way would unpack keys, not pairs.
                for k, v in replace:
                    if k not in content:
                        raise Exception(f'{k} is not in the content')
                    content = content.replace(k, v)

            content = content.encode('utf-8')
            # dedupe rewritten content by md5 before inserting a new File
            md5 = hashlib.md5(content).hexdigest()
            f = self.file_provider.by_md5(md5)
            if not f:
                f = File(content=content,
                         created=now(),
                         project=self.dag_db.project,
                         md5=md5,
                         dag=self.dag_db.id)
                self.file_provider.add(f)

        s_new = DagStorage(dag=self.dag_db.id,
                           file=f.id,
                           path=s.path,
                           is_dir=s.is_dir)
        storages_new.append(s_new)

    self.dag_storage_provider.bulk_save_objects(storages_new,
                                                return_defaults=False)