def work(self):
    project = ProjectProvider(self.session).by_id(self.project)
    self.info(f'Task = {self.train_task} child_task: {self.child_task}')

    model = Model(created=now(),
                  name=self.name,
                  project=self.project,
                  equations='',
                  fold=self.fold)
    provider = ModelProvider(self.session)
    if self.train_task:
        task_provider = TaskProvider(self.session)
        task = task_provider.by_id(self.train_task)
        model.score_local = task.score

        task_dir = join(TASK_FOLDER, str(self.child_task or task.id))
        src_log = f'{task_dir}/log'
        models_dir = join(MODEL_FOLDER, project.name)
        os.makedirs(models_dir, exist_ok=True)

        model_path_tmp = f'{src_log}/traced.pth'
        traced = trace_model_from_checkpoint(src_log, self, file=self.file)

        model_path = f'{models_dir}/{model.name}.pth'
        model_weight_path = f'{models_dir}/{model.name}_weight.pth'
        torch.jit.save(traced, model_path_tmp)
        shutil.copy(model_path_tmp, model_path)

        file = self.file = 'best_full'
        shutil.copy(f'{src_log}/checkpoints/{file}.pth', model_weight_path)

    provider.add(model)
def task_stop():
    data = request_data()
    provider = TaskProvider(_write_session)
    task = provider.by_id(data['id'],
                          joinedload(Task.dag_rel, innerjoin=True))
    tasks = [task] + provider.children(task.id)
    supervisor.stop_tasks(tasks)
def stop(logger, session: Session, task: Task, dag: Dag):
    provider = TaskProvider(session)
    if task.status > TaskStatus.InProgress.value:
        return task.status

    status = TaskStatus.Stopped
    try:
        if task.status != TaskStatus.NotRan.value:
            app.control.revoke(task.celery_id, terminate=True)
        else:
            status = TaskStatus.Skipped
    except Exception as e:
        if Session.sqlalchemy_error(e):
            # logging may itself fail on a broken session;
            # re-raise regardless
            try:
                logger.error(traceback.format_exc(), ComponentType.API)
            except Exception:
                pass
            raise
        logger.error(traceback.format_exc(), ComponentType.API)
    finally:
        # kill the worker process and its children via the supervisor queue
        if task.pid:
            queue = f'{task.computer_assigned}_' \
                    f'{dag.docker_img or "default"}_supervisor'
            kill.apply_async((task.pid, ), queue=queue, retry=False)

            additional_info = yaml_load(task.additional_info)
            for p in additional_info.get('child_processes', []):
                kill.apply_async((p, ), queue=queue, retry=False)
        provider.change_status(task, status)

    return task.status
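# A minimal sketch of the 'kill' Celery task dispatched above. Its real
# implementation lives in mlcomp's worker module; the body below is an
# assumption, consistent with how stop_processes_not_exist kills processes.
@app.task
def kill(pid: int):
    os.system(f'kill -9 {pid}')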
def create_providers(self):
    self.log_info('create_providers')

    self.dag_provider = DagProvider(self.session)
    self.task_provider = TaskProvider(self.session)
    self.file_provider = FileProvider(self.session)
    self.dag_storage_provider = DagStorageProvider(self.session)
def __init__(self,
             session: Session,
             task: Task,
             layout: str,
             part: str = 'valid',
             name: str = 'img_classify',
             max_img_size: Tuple[int, int] = None,
             main_metric: str = 'accuracy',
             plot_count: int = 0):
    self.session = session
    self.task = task
    self.layout = layout
    self.part = part
    self.name = name or 'img_classify'
    self.max_img_size = max_img_size
    self.main_metric = main_metric
    self.plot_count = plot_count

    self.dag_provider = DagProvider(session)
    self.report_provider = ReportProvider(session)
    self.layout_provider = ReportLayoutProvider(session)
    self.task_provider = TaskProvider(session)
    self.report_img_provider = ReportImgProvider(session)
    self.report_task_provider = ReportTasksProvider(session)
    self.report_series_provider = ReportSeriesProvider(session)

    self.project = self.task_provider.project(task.id).id
    self.layout = self.layout_provider.by_name(layout)
    self.layout_dict = yaml_load(self.layout.content)
def stop_processes_not_exist(session: Session, logger):
    provider = TaskProvider(session)
    hostname = socket.gethostname()
    tasks = provider.by_status(TaskStatus.InProgress,
                               task_docker_assigned=DOCKER_IMG,
                               computer_assigned=hostname)
    for t in tasks:
        if not psutil.pid_exists(t.pid):
            # tasks can retry the execution
            if (now() - t.last_activity).total_seconds() < 30:
                continue

            os.system(f'kill -9 {t.pid}')
            t.status = TaskStatus.Failed.value
            logger.error(
                f'process with pid = {t.pid} does not exist. '
                f'Set task to failed state',
                ComponentType.WorkerSupervisor, hostname, t.id)
            provider.commit()

            additional_info = yaml_load(t.additional_info)
            for p in additional_info.get('child_processes', []):
                logger.info(f'killing child process = {p}')
                os.system(f'kill -9 {p}')
def create_base(self):
    self.info('create_base')

    self.provider = TaskProvider(self.session)
    self.library_provider = DagLibraryProvider(self.session)
    self.storage = Storage(self.session)

    self.task = self.provider.by_id(
        self.id, joinedload(Task.dag_rel, innerjoin=True))
    if not self.task:
        raise Exception(f'task with id = {self.id} is not found')

    self.dag = self.task.dag_rel
    self.executor = None
    self.hostname = socket.gethostname()

    self.docker_img = DOCKER_IMG
    self.worker_index = os.getenv('WORKER_INDEX', -1)
    self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                          f'{self.worker_index}'

    self.config = Config.from_yaml(self.dag.config)
    self.executor_type = self.config['executors'][
        self.task.executor]['type']
def __init__(self,
             session: Session,
             task: Task,
             layout: str,
             part: str = 'valid',
             name: str = 'img_segment',
             max_img_size: Tuple[int, int] = None,
             stack_type: str = 'vertical',
             main_metric: str = 'dice',
             plot_count: int = 0,
             colors: List[Tuple] = None):
    self.session = session
    self.task = task
    self.layout = layout
    self.part = part
    self.name = name or 'img_segment'
    self.max_img_size = max_img_size
    self.stack_type = stack_type
    self.main_metric = main_metric
    self.colors = colors
    self.plot_count = plot_count

    self.dag_provider = DagProvider(session)
    self.report_provider = ReportProvider(session)
    self.layout_provider = ReportLayoutProvider(session)
    self.task_provider = TaskProvider(session)
    self.report_img_provider = ReportImgProvider(session)
    self.report_task_provider = ReportTasksProvider(session)
    self.report_series_provider = ReportSeriesProvider(session)

    self.project = self.task_provider.project(task.id).id
    self.layout = self.layout_provider.by_name(layout)
    self.layout_dict = yaml_load(self.layout.content)

    self.create_base()
def describe_task_names(dag: int):
    pd.set_option('display.max_columns', None)
    pd.set_option('display.expand_frame_repr', False)
    pd.set_option('max_colwidth', -1)

    provider = TaskProvider()
    tasks = provider.by_dag(dag)
    return pd.DataFrame([{'id': t.id, 'name': t.name} for t in tasks])
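# A minimal usage sketch, assuming a configured mlcomp database session;
# the dag id below is hypothetical.
df = describe_task_names(dag=42)
print(df)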
def create_base(self):
    self.info('create_base')

    if app.current_task:
        app.current_task.update_state(state=states.SUCCESS)
        app.control.revoke(app.current_task.request.id, terminate=True)

    self.provider = TaskProvider(self.session)
    self.library_provider = DagLibraryProvider(self.session)
    self.storage = Storage(self.session)

    self.task = self.provider.by_id(
        self.id, joinedload(Task.dag_rel, innerjoin=True))
    if not self.task:
        raise Exception(f'task with id = {self.id} is not found')

    self.dag = self.task.dag_rel
    self.executor = None
    self.hostname = socket.gethostname()

    self.docker_img = DOCKER_IMG
    self.worker_index = os.getenv('WORKER_INDEX', -1)
    self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                          f'{self.worker_index}'

    self.config = Config.from_yaml(self.dag.config)
    set_global_seed(self.config['info'].get('seed', 0))

    self.executor_type = self.config['executors'][
        self.task.executor]['type']
    executor = self.config['executors'][self.task.executor]

    if os.getenv('CUDA_VISIBLE_DEVICES', '').strip() != '':
        # remap the task's logical gpu indices to the physical devices
        # visible inside this process; skip empty entries so an unset
        # gpu_assigned does not crash int()
        cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES',
                                         '').split(',')
        self.task.gpu_assigned = ','.join([
            cuda_visible_devices[int(g)]
            for g in (self.task.gpu_assigned or '').split(',')
            if g.strip() != ''
        ])
        cuda_visible_devices = self.task.gpu_assigned
    else:
        cuda_visible_devices = self.task.gpu_assigned

    cuda_visible_devices = cuda_visible_devices or ''

    env = {
        'MKL_NUM_THREADS': 1,
        'OMP_NUM_THREADS': 1,
        'CUDA_VISIBLE_DEVICES': cuda_visible_devices
    }
    env.update(executor.get('env', {}))
    for k, v in env.items():
        os.environ[k] = str(v)
        self.info(f'Set env. {k} = {v}')
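# A worked example of the gpu remap above: if the container was started
# with CUDA_VISIBLE_DEVICES=2,3 and the scheduler assigned the task the
# logical gpus '0,1', the task ends up pinned to physical devices 2 and 3.
visible = '2,3'.split(',')
gpu_assigned = '0,1'
remapped = ','.join(visible[int(g)] for g in gpu_assigned.split(',')
                    if g.strip() != '')
assert remapped == '2,3'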
def create_providers(self):
    self.provider = TaskProvider(self.session)
    self.report_provider = ReportProvider(self.session)
    self.report_tasks_provider = ReportTasksProvider(self.session)
    self.report_layout_provider = ReportLayoutProvider(self.session)
    self.project_provider = ProjectProvider(self.session)
    self.storage = Storage(self.session)
    self.dag_provider = DagProvider(self.session)
def dag_stop():
    data = request_data()
    provider = TaskProvider(_write_session)
    id = int(data['id'])
    tasks = provider.by_dag(id)
    supervisor.stop_tasks(tasks)

    dag_provider = DagProvider(_write_session)
    return {'dag': dag_provider.get({'id': id})['data'][0]}
def create_providers(self):
    self.log_info('create_providers')

    self.provider = TaskProvider(self.session)
    self.report_provider = ReportProvider(self.session)
    self.report_tasks_provider = ReportTasksProvider(self.session)
    self.report_layout_provider = ReportLayoutProvider(self.session)
    self.project_provider = ProjectProvider(self.session)
    self.storage = Storage(self.session,
                           logger=self.logger,
                           component=self.component)
    self.dag_provider = DagProvider(self.session)
def __init__(self,
             session: Session,
             logger=None,
             component: ComponentType = None,
             max_file_size: int = 10 ** 5,
             max_count=10 ** 3):
    self.file_provider = FileProvider(session)
    self.provider = DagStorageProvider(session)
    self.task_provider = TaskProvider(session)
    self.library_provider = DagLibraryProvider(session)
    self.dag_provider = DagProvider(session)
    self.logger = logger
    self.component = component
    self.max_file_size = max_file_size
    self.max_count = max_count
def task_stop():
    data = request_data()
    provider = TaskProvider(_write_session)
    task = provider.by_id(data['id'],
                          joinedload(Task.dag_rel, innerjoin=True))
    dag = task.dag_rel
    status = celery_tasks.stop(logger, _write_session, task, dag)

    child_tasks = provider.children(task.id)
    for t in child_tasks:
        celery_tasks.stop(logger, _write_session, t, dag)

    return {'status': to_snake(TaskStatus(status).name)}
def execute(config: str, debug: bool, params):
    check_statuses()
    _create_computer()
    _create_docker()

    # Fail all InProgress tasks left over on this worker
    logger = create_logger(_session, __name__)
    provider = TaskProvider(_session)
    step_provider = StepProvider(_session)
    for t in provider.by_status(TaskStatus.InProgress,
                                worker_index=WORKER_INDEX):
        step = step_provider.last_for_task(t.id)
        logger.error(
            f'Task Id = {t.id} was in InProgress state '
            f'when another task arrived at the same worker',
            ComponentType.Worker, t.computer_assigned, t.id, step)
        provider.change_status(t, TaskStatus.Failed)

    # Create dags
    dags = _dag(config, debug, params=params)
    for dag in dags:
        for ids in dag.values():
            for id in ids:
                task = provider.by_id(id)
                task.gpu_assigned = ','.join(
                    [str(i) for i in range(torch.cuda.device_count())])
                provider.commit()

                execute_by_id(id, exit=False)
def execute(config: str, debug: bool):
    _create_computer()

    # Fail all InProgress tasks left over on this worker
    logger = create_logger(_session, __name__)
    provider = TaskProvider(_session)
    step_provider = StepProvider(_session)
    for t in provider.by_status(TaskStatus.InProgress,
                                worker_index=WORKER_INDEX):
        step = step_provider.last_for_task(t.id)
        logger.error(
            f'Task Id = {t.id} was in InProgress state '
            f'when another task arrived at the same worker',
            ComponentType.Worker, t.computer_assigned, t.id, step)
        provider.change_status(t, TaskStatus.Failed)

    # Create dag
    created_dag = _dag(config, debug)
    for ids in created_dag.values():
        for id in ids:
            task = provider.by_id(id)
            task.gpu_assigned = ','.join(
                [str(i) for i, _ in enumerate(GPUtil.getGPUs())])
            provider.commit()

            execute_by_id(id, exit=False)
def task_before_update(mapper, connection, target):
    target.last_activity = now()

    if target.parent:
        provider = TaskProvider(_session)
        parent = provider.by_id(target.parent)
        if parent is None:
            return
        parent.last_activity = target.last_activity
        try:
            provider.commit()
        except StaleDataError:
            pass
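# A minimal sketch of wiring the hook up, assuming the mlcomp Task model:
# the (mapper, connection, target) signature above matches SQLAlchemy's
# mapper-level 'before_update' event.
from sqlalchemy import event

event.listen(Task, 'before_update', task_before_update)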
def stop_all_dags():
    data = request_data()
    provider = TaskProvider(_write_session)
    tasks = provider.by_status(TaskStatus.InProgress,
                               TaskStatus.Queued,
                               TaskStatus.NotRan,
                               project=data['project'])
    for t in tasks:
        info = yaml_load(t.additional_info)
        info['stopped'] = True
        t.additional_info = yaml_dump(info)
    provider.update()

    supervisor.stop_tasks(tasks)
def create_base(self):
    self.session.commit()

    self.provider = TaskProvider(self.session)
    self.computer_provider = ComputerProvider(self.session)
    self.docker_provider = DockerProvider(self.session)
    self.auxiliary_provider = AuxiliaryProvider(self.session)
    self.dag_provider = DagProvider(self.session)

    # only queues whose docker reported activity in the last 15 seconds
    self.queues = [
        f'{d.computer}_{d.name}' for d in self.docker_provider.all()
        if d.last_activity >= now() - datetime.timedelta(seconds=15)
    ]
    self.auxiliary['queues'] = self.queues
def work(self):
    project = ProjectProvider(self.session).by_id(self.project)
    self.info(f'Task = {self.train_task} child_task: {self.child_task}')

    model = Model(created=now(),
                  name=self.name,
                  project=self.project,
                  equations='',
                  fold=self.fold)
    provider = ModelProvider(self.session)
    if self.train_task:
        task_provider = TaskProvider(self.session)
        dag_provider = DagProvider(self.session)
        task = task_provider.by_id(self.train_task)
        dag = dag_provider.by_id(task.dag)
        task_dir = join(TASK_FOLDER, str(self.child_task or task.id))

        # get the log directory from the executor's catalyst config
        config = yaml_load(dag.config)
        executor_config = config['executors'][task.executor]
        catalyst_config_file = executor_config['args']['config']
        catalyst_config_file = join(task_dir, catalyst_config_file)
        catalyst_config = yaml_load(file=catalyst_config_file)
        catalyst_logdir = catalyst_config['args']['logdir']

        model.score_local = task.score
        src_log = f'{task_dir}/{catalyst_logdir}'
        models_dir = join(MODEL_FOLDER, project.name)
        os.makedirs(models_dir, exist_ok=True)

        model_path_tmp = f'{src_log}/traced.pth'
        traced = trace_model_from_checkpoint(src_log, self, file=self.file)

        model_path = f'{models_dir}/{model.name}.pth'
        model_weight_path = f'{models_dir}/{model.name}_weight.pth'
        torch.jit.save(traced, model_path_tmp)
        shutil.copy(model_path_tmp, model_path)

        file = self.file = 'best_full'
        shutil.copy(f'{src_log}/checkpoints/{file}.pth', model_weight_path)

    provider.add(model)
def dag_model_add(session: Session, data: dict):
    if not data.get('task'):
        model = Model(name=data['name'],
                      project=data['project'],
                      equations=data['equations'],
                      created=now())
        ModelProvider(session).add(model)
        return

    task_provider = TaskProvider(session)
    task = task_provider.by_id(data['task'],
                               options=joinedload(Task.dag_rel,
                                                  innerjoin=True))
    child_tasks = task_provider.children(task.id)

    computer = task.computer_assigned
    child_task = None
    if len(child_tasks) > 0:
        child_task = child_tasks[0].id
        computer = child_tasks[0].computer_assigned

    project = ProjectProvider(session).by_id(task.dag_rel.project)
    config = {
        'info': {
            'name': 'model_add',
            'project': project.name,
            'computer': computer
        },
        'executors': {
            'model_add': {
                'type': 'model_add',
                'project': data['project'],
                'task': data.get('task'),
                'name': data['name'],
                'file': data['file'],
                'child_task': child_task,
                'fold': data['fold']
            }
        }
    }

    dag_standard(session=session,
                 config=config,
                 debug=False,
                 upload_files=False)
def work(self):
    task_provider = TaskProvider(self.session)
    task = task_provider.by_id(self.train_task)
    dag = DagProvider(self.session).by_id(self.dag_pipe,
                                          joined_load=[Dag.project_rel])
    task_dir = join(TASK_FOLDER, str(self.child_task or task.id))
    src_log = f'{task_dir}/log'
    models_dir = join(MODEL_FOLDER, dag.project_rel.name)
    os.makedirs(models_dir, exist_ok=True)

    self.info(f'Task = {self.task} child_task: {self.child_task}')

    model_path_tmp = f'{src_log}/traced.pth'
    traced = trace_model_from_checkpoint(src_log, self)

    model = Model(dag=self.dag_pipe,
                  interface=self.interface,
                  slot=self.slot,
                  score_local=task.score,
                  created=now(),
                  name=self.name,
                  project=dag.project,
                  interface_params=yaml_dump(self.interface_params))
    provider = ModelProvider(self.session)
    provider.add(model, commit=False)

    try:
        model_path = f'{models_dir}/{model.name}.pth'
        model_weight_path = f'{models_dir}/{model.name}_weight.pth'
        torch.jit.save(traced, model_path_tmp)
        shutil.copy(model_path_tmp, model_path)
        shutil.copy(f'{src_log}/checkpoints/best.pth', model_weight_path)

        interface_params = yaml_load(model.interface_params)
        interface_params['file'] = join('models', model.name + '.pth')
        model.interface_params = yaml_dump(interface_params)
        provider.update()
    except Exception as e:
        provider.rollback()
        raise e
def dag_model_add(session: Session, data: dict):
    task_provider = TaskProvider(session)
    task = task_provider.by_id(data['task'],
                               options=joinedload(Task.dag_rel,
                                                  innerjoin=True))
    child_tasks = task_provider.children(task.id)

    computer = task.computer_assigned
    child_task = None
    if len(child_tasks) > 0:
        child_task = child_tasks[0].id
        computer = child_tasks[0].computer_assigned

    project = ProjectProvider(session).by_id(task.dag_rel.project)
    interface_params = data.get('interface_params', '')
    interface_params = yaml_load(interface_params)
    config = {
        'info': {
            'name': 'model_add',
            'project': project.name,
            'computer': computer
        },
        'executors': {
            'model_add': {
                'type': 'model_add',
                'dag': data['dag'],
                'slot': data['slot'],
                'interface': data['interface'],
                'task': data.get('task'),
                'name': data['name'],
                'interface_params': interface_params,
                'child_task': child_task
            }
        }
    }

    dag_standard(session=session,
                 config=config,
                 debug=False,
                 upload_files=False)
def task_info():
    data = request_data()
    task = TaskProvider(_read_session).by_id(
        data['id'], joinedload(Task.dag_rel, innerjoin=True))
    return {
        'pid': task.pid,
        'worker_index': task.worker_index,
        'gpu_assigned': task.gpu_assigned,
        'celery_id': task.celery_id,
        'additional_info': task.additional_info or '',
        'result': task.result or '',
        'id': task.id
    }
def create_base(self):
    self.info('create_base')

    if app.current_task:
        app.current_task.update_state(state=states.SUCCESS)
        app.control.revoke(app.current_task.request.id, terminate=True)

    self.provider = TaskProvider(self.session)
    self.library_provider = DagLibraryProvider(self.session)
    self.storage = Storage(self.session)

    self.task = self.provider.by_id(
        self.id, joinedload(Task.dag_rel, innerjoin=True))
    if not self.task:
        raise Exception(f'task with id = {self.id} is not found')

    self.dag = self.task.dag_rel
    self.executor = None
    self.hostname = socket.gethostname()

    self.docker_img = DOCKER_IMG
    self.worker_index = os.getenv('WORKER_INDEX', -1)
    self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                          f'{self.worker_index}'

    self.config = Config.from_yaml(self.dag.config)
    self.executor_type = self.config['executors'][
        self.task.executor]['type']
    executor = self.config['executors'][self.task.executor]

    env = {'MKL_NUM_THREADS': 1, 'OMP_NUM_THREADS': 1}
    env.update(executor.get('env', {}))
    for k, v in env.items():
        os.environ[k] = str(v)
        self.info(f'Set env. {k} = {v}')
def stop_processes_not_exist(session: Session, logger):
    provider = TaskProvider(session)
    hostname = socket.gethostname()
    tasks = provider.by_status(TaskStatus.InProgress,
                               task_docker_assigned=DOCKER_IMG,
                               computer_assigned=hostname)

    # Fail tasks whose process no longer exists
    for t in tasks:
        if not psutil.pid_exists(t.pid):
            # tasks can retry the execution
            if (now() - t.last_activity).total_seconds() < 30:
                continue

            os.system(f'kill -9 {t.pid}')
            t.status = TaskStatus.Failed.value
            logger.error(
                f'process with pid = {t.pid} does not exist. '
                f'Set task to failed state',
                ComponentType.WorkerSupervisor, hostname, t.id)
            provider.commit()

            additional_info = yaml_load(t.additional_info)
            for p in additional_info.get('child_processes', []):
                logger.info(f'killing child process = {p}')
                os.system(f'kill -9 {p}')

    # Kill processes that exist but should not
    processes = get_pid('worker ')
    ids = [p['PID'] for p in processes]
    tasks = provider.by_ids(ids)
    tasks = {t.pid: t for t in tasks}

    for p in processes:
        pid = p['PID']
        if pid in tasks:
            task = tasks[pid]
            if task.status in [TaskStatus.Stopped.value,
                               TaskStatus.Failed.value,
                               TaskStatus.Skipped.value]:
                logger.info(f'Kill processes that exist but should not. '
                            f'Pid = {pid}')
                os.system(f'kill -9 {pid}')
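# A minimal sketch, assuming this check runs inside the worker supervisor's
# periodic loop; the function name and the 60-second interval are
# hypothetical.
import time

def supervisor_loop(session: Session, logger, interval: int = 60):
    while True:
        stop_processes_not_exist(session, logger)
        time.sleep(interval)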
class SegmentationReportBuilder:
    def __init__(self,
                 session: Session,
                 task: Task,
                 layout: str,
                 part: str = 'valid',
                 name: str = 'img_segment',
                 max_img_size: Tuple[int, int] = None,
                 stack_type: str = 'vertical',
                 main_metric: str = 'dice',
                 plot_count: int = 0,
                 colors: List[Tuple] = None):
        self.session = session
        self.task = task
        self.layout = layout
        self.part = part
        self.name = name or 'img_segment'
        self.max_img_size = max_img_size
        self.stack_type = stack_type
        self.main_metric = main_metric
        self.colors = colors
        self.plot_count = plot_count

        self.dag_provider = DagProvider(session)
        self.report_provider = ReportProvider(session)
        self.layout_provider = ReportLayoutProvider(session)
        self.task_provider = TaskProvider(session)
        self.report_img_provider = ReportImgProvider(session)
        self.report_task_provider = ReportTasksProvider(session)
        self.report_series_provider = ReportSeriesProvider(session)

        self.project = self.task_provider.project(task.id).id
        self.layout = self.layout_provider.by_name(layout)
        self.layout_dict = yaml_load(self.layout.content)

        self.create_base()

    def create_base(self):
        report = Report(config=yaml_dump(self.layout_dict),
                        time=now(),
                        layout=self.layout.name,
                        project=self.project,
                        name=self.name)
        self.report_provider.add(report)
        self.report_task_provider.add(
            ReportTasks(report=report.id, task=self.task.id))

        self.task.report = report.id
        self.task_provider.update()

    def encode_pred(self, mask: np.array):
        res = np.zeros((*mask.shape[1:], 3), dtype=np.uint8)
        for i, c in enumerate(mask):
            c = np.repeat(c[:, :, None], 3, axis=2)
            color = self.colors[i] if self.colors is not None \
                else (255, 255, 255)
            res += (c * color).astype(np.uint8)
        return res

    def plot_mask(self, img: np.array, mask: np.array):
        if len(img.shape) == 2:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

        img = img.astype(np.uint8)
        mask = mask.astype(np.uint8)
        for i, c in enumerate(mask):
            contours, _ = cv2.findContours(c, cv2.RETR_LIST,
                                           cv2.CHAIN_APPROX_NONE)
            color = self.colors[i] if self.colors else (0, 255, 0)
            # use j here so the channel index i is not shadowed
            for j in range(0, len(contours)):
                cv2.polylines(img, contours[j], True, color, 2)
        return img

    def process_scores(self, scores):
        for key, item in self.layout_dict['items'].items():
            item['name'] = key
            if item['type'] == 'series' and item['key'] in scores:
                series = ReportSeries(name=item['name'],
                                      value=scores[item['key']],
                                      epoch=0,
                                      time=now(),
                                      task=self.task.id,
                                      part='valid',
                                      stage='stage1')
                self.report_series_provider.add(series)

    def process_pred(self,
                     imgs: np.array,
                     preds: dict,
                     targets: np.array = None,
                     attrs=None,
                     scores=None):
        for key, item in self.layout_dict['items'].items():
            item['name'] = key
            if item['type'] != 'img_segment':
                continue

            report_imgs = []
            dag = self.dag_provider.by_id(self.task.dag)

            for i in range(len(imgs)):
                if self.plot_count <= 0:
                    break

                if targets is not None:
                    img = self.plot_mask(imgs[i], targets[i])
                else:
                    img = imgs[i]

                imgs_add = [img]
                # pred_key avoids shadowing the outer layout key
                for pred_key, value in preds.items():
                    imgs_add.append(self.encode_pred(value[i]))

                for j in range(len(imgs_add)):
                    imgs_add[j] = resize_saving_ratio(imgs_add[j],
                                                      self.max_img_size)

                if self.stack_type == 'horizontal':
                    img = np.hstack(imgs_add)
                else:
                    img = np.vstack(imgs_add)

                attr = attrs[i] if attrs else {}
                score = None
                if targets is not None:
                    score = scores[self.main_metric][i]

                retval, buffer = cv2.imencode('.jpg', img)
                report_img = ReportImg(group=item['name'],
                                       epoch=0,
                                       task=self.task.id,
                                       img=buffer,
                                       dag=self.task.dag,
                                       part=self.part,
                                       project=self.project,
                                       score=score,
                                       **attr)
                self.plot_count -= 1
                report_imgs.append(report_img)
                dag.img_size += report_img.size

            self.dag_provider.commit()
            self.report_img_provider.bulk_save_objects(report_imgs)
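# A minimal usage sketch; the layout name and the prediction arrays are
# hypothetical. Build a segmentation report for a finished task and feed
# it one batch of predictions.
builder = SegmentationReportBuilder(session=session,
                                    task=task,
                                    layout='img_segment_layout',
                                    plot_count=8,
                                    colors=[(255, 0, 0), (0, 255, 0)])
builder.process_pred(imgs=imgs,
                     preds={'pred': pred_masks},
                     targets=target_masks,
                     scores={'dice': dice_per_image})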
def remove_dag(session: Session, id: int):
    tasks = TaskProvider(session).by_dag(id)
    for task in tasks:
        remove_task(session, task.id)
class ExecuteBuilder:
    def __init__(self, id: int, repeat_count: int = 1, exit=True):
        self.session = Session.create_session(key='ExecuteBuilder')
        self.id = id
        self.repeat_count = repeat_count
        self.logger = create_logger(self.session, 'ExecuteBuilder')
        self.logger_db = create_logger(self.session, 'ExecuteBuilder.db',
                                       console=False)
        self.exit = exit

        self.provider = None
        self.library_provider = None
        self.storage = None
        self.task = None
        self.dag = None
        self.executor = None
        self.hostname = None
        self.docker_img = None
        self.worker_index = None
        self.queue_personal = None
        self.config = None
        self.executor_type = None

    def info(self, msg: str, step=None):
        self.logger.info(msg, ComponentType.Worker, self.hostname, self.id,
                         step)

    def error(self, msg: str, step=None):
        self.logger.error(msg, ComponentType.Worker, self.hostname,
                          self.id, step)

    def warning(self, msg: str, step=None):
        self.logger.warning(msg, ComponentType.Worker, self.hostname,
                            self.id, step)

    def debug(self, msg: str, step=None):
        self.logger.debug(msg, ComponentType.Worker, self.hostname,
                          self.id, step)

    def create_base(self):
        self.info('create_base')

        if app.current_task:
            app.current_task.update_state(state=states.SUCCESS)
            app.control.revoke(app.current_task.request.id, terminate=True)

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        self.worker_index = os.getenv('WORKER_INDEX', -1)
        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)
        set_global_seed(self.config['info'].get('seed', 0))

        self.executor_type = self.config['executors'][
            self.task.executor]['type']
        executor = self.config['executors'][self.task.executor]

        cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', '')
        self.info(f'Env. before execution '
                  f'CUDA_VISIBLE_DEVICES={cuda_visible_devices}')

        if cuda_visible_devices.strip() != '':
            # remap the task's logical gpu indices to the physical devices
            # visible inside this process
            gpu_assigned = self.task.gpu_assigned or ''
            cuda_visible_devices = cuda_visible_devices.split(',')
            cuda_visible_devices = ','.join([
                cuda_visible_devices[int(g)]
                for g in gpu_assigned.split(',') if g.strip() != ''
            ])
        else:
            cuda_visible_devices = self.task.gpu_assigned

        cuda_visible_devices = cuda_visible_devices or ''

        env = {
            'MKL_NUM_THREADS': 1,
            'OMP_NUM_THREADS': 1,
            'CUDA_VISIBLE_DEVICES': cuda_visible_devices
        }
        env.update(executor.get('env', {}))
        for k, v in env.items():
            os.environ[k] = str(v)
            self.info(f'Set env. {k} = {v}')

    def check_status(self):
        self.info('check_status')

        assert self.dag is not None, 'You must fetch task with dag_rel'

        if self.task.status >= TaskStatus.InProgress.value:
            msg = f'Task = {self.task.id}. Status = {self.task.status}, ' \
                  f'before the execute_by_id invocation.'
            if app.current_task:
                msg += f' Request Id = {app.current_task.request.id}'
            self.error(msg)
            return True

    def change_status(self):
        self.info('change_status')

        self.task.computer_assigned = self.hostname
        self.task.pid = os.getpid()
        self.task.worker_index = self.worker_index
        self.task.docker_assigned = self.docker_img
        self.provider.change_status(self.task, TaskStatus.InProgress)

    def download(self):
        self.info('download')

        if not self.task.debug:
            folder = self.storage.download(task=self.id)
        else:
            folder = os.getcwd()

        os.chdir(folder)

        libraries = self.library_provider.dag(self.task.dag)
        executor_type = self.executor_type

        self.info('download. folder changed')

        mlcomp_executors_folder = join(dirname(abspath(__file__)),
                                       'executors')
        mlcomp_base_folder = os.path.abspath(
            join(mlcomp_executors_folder, '../../../'))

        imported, was_installation = self.storage.import_executor(
            mlcomp_executors_folder, mlcomp_base_folder, executor_type)
        if not imported:
            imported, was_installation = self.storage.import_executor(
                folder, folder, executor_type, libraries)
            if not imported:
                raise Exception(f'Executor = {executor_type} not found')

        self.info('download. executor imported')

        if was_installation and not self.task.debug:
            if self.repeat_count > 0:
                self.info('was installation. '
                          'set task status to Queued. '
                          'And resending the task to a queue')
                self.task.status = TaskStatus.Queued.value
                self.provider.commit()
                try:
                    execute.apply_async((self.id, self.repeat_count - 1),
                                        queue=self.queue_personal,
                                        retry=False)
                except Exception:
                    pass
                finally:
                    sys.exit()

        assert Executor.is_registered(executor_type), \
            f'Executor {executor_type} was not found'

    def create_executor(self):
        self.info('create_executor')

        additional_info = yaml_load(self.task.additional_info) \
            if self.task.additional_info else dict()
        self.executor = Executor.from_config(
            executor=self.task.executor,
            config=self.config,
            additional_info=additional_info,
            session=self.session,
            logger=self.logger,
            logger_db=self.logger_db)

    def execute(self):
        self.info('execute start')

        res = self.executor(task=self.task,
                            task_provider=self.provider,
                            dag=self.dag)
        self.info('execute executor finished')

        res = res or {}
        self.task.result = yaml_dump(res)
        self.provider.commit()

        if 'stage' in res and 'stages' in res:
            index = res['stages'].index(res['stage'])
            if index < len(res['stages']) - 1:
                # requeue the task to continue with the next stage
                self.executor.info(f'stage = {res["stage"]} done. '
                                   f'Go to the stage = '
                                   f'{res["stages"][index + 1]}')
                time.sleep(3)

                self.executor.info(
                    f'sending {(self.id, self.repeat_count)} '
                    f'to {self.queue_personal}')
                self.task.status = TaskStatus.Queued.value
                self.provider.commit()
                execute.apply_async((self.id, self.repeat_count),
                                    queue=self.queue_personal,
                                    retry=False)
                return

        self.executor.step.finish()
        self.provider.change_status(self.task, TaskStatus.Success)
        self.info('execute end')

    def build(self):
        try:
            self.create_base()

            bad_status = self.check_status()
            if bad_status:
                return

            self.change_status()
            self.download()
            self.create_executor()
            self.execute()
        except Exception as e:
            step = self.executor.step.id if \
                (self.executor and self.executor.step) else None

            if Session.sqlalchemy_error(e):
                Session.cleanup(key='ExecuteBuilder')
                self.session = Session.create_session(key='ExecuteBuilder')
                self.logger.session = create_logger(self.session,
                                                    'ExecuteBuilder')

            self.error(traceback.format_exc(), step)
            if self.task.status <= TaskStatus.InProgress.value:
                self.provider.change_status(self.task, TaskStatus.Failed)
            raise e
        finally:
            if app.current_task:
                app.close()

            if self.exit:
                # noinspection PyProtectedMember
                os._exit(0)
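# A hedged sketch of how the builder is driven: execute_by_id (referenced
# by the execute() entrypoints above) presumably wraps it roughly like
# this; the exact wiring is an assumption.
def execute_by_id(id: int, exit: bool = True):
    ExecuteBuilder(id, exit=exit).build()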