def dag_pipe(session: Session, config: dict, config_text: str = None):
    """Create a Pipe-type Dag from a parsed config and upload its code.

    Args:
        session: active DB session used by all providers.
        config: parsed configuration; must contain ``interfaces``,
            ``pipes`` and an ``info`` section with ``project`` and ``name``.
        config_text: raw config text stored verbatim on the Dag row;
            may be None.

    Raises:
        AssertionError: if ``interfaces`` or ``pipes`` are missing from
            the config.
    """
    # Validate explicitly instead of `assert`: asserts are stripped under
    # `python -O`. Raise AssertionError with the original messages so any
    # existing callers that catch it keep working.
    if 'interfaces' not in config:
        raise AssertionError('interfaces missed')
    if 'pipes' not in config:
        raise AssertionError('pipe missed')

    info = config['info']
    storage = Storage(session)
    dag_provider = DagProvider(session)

    # The dag's code is taken from the current working directory.
    folder = os.getcwd()
    project = ProjectProvider(session).by_name(info['project']).id
    dag = dag_provider.add(
        Dag(
            config=config_text,
            project=project,
            name=info['name'],
            docker_img=info.get('docker_img'),
            type=DagType.Pipe.value
        )
    )
    storage.upload(folder, dag)

    # Change model dags which have the same name
    ModelProvider(session).change_dag(
        project=project, name=info['name'], to=dag.id)
def create_base(self):
    """Load the task and its dag, then derive the execution context.

    Populates providers, storage, task/dag rows, host identity and the
    personal celery queue name. Statement order matters: providers must
    exist before ``by_id``; the dag config must be parsed before
    ``executor_type`` can be resolved.

    Raises:
        Exception: if no task with ``self.id`` exists.
    """
    self.info('create_base')

    self.provider = TaskProvider(self.session)
    self.library_provider = DagLibraryProvider(self.session)
    self.storage = Storage(self.session)

    # Eagerly join the dag relation — later code reads task.dag_rel.
    self.task = self.provider.by_id(
        self.id, joinedload(Task.dag_rel, innerjoin=True))
    if not self.task:
        raise Exception(f'task with id = {self.id} is not found')

    self.dag = self.task.dag_rel
    self.executor = None
    self.hostname = socket.gethostname()

    self.docker_img = DOCKER_IMG
    # NOTE(review): os.getenv default -1 is an int while a set variable
    # is a str — downstream only interpolates it into strings, so OK.
    self.worker_index = os.getenv('WORKER_INDEX', -1)

    # Queue dedicated to this host/docker-image/worker combination.
    self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                          f'{self.worker_index}'

    self.config = Config.from_yaml(self.dag.config)
    self.executor_type = self.config['executors'][
        self.task.executor]['type']
def create_base(self):
    """Load the task context and prepare the process environment.

    Besides loading providers and the task/dag rows, this variant:
    detaches the current celery task, seeds RNGs from the dag config,
    remaps task-relative GPU indexes onto the physical devices listed in
    ``CUDA_VISIBLE_DEVICES``, and exports executor environment variables.

    Raises:
        Exception: if no task with ``self.id`` exists.
    """
    self.info('create_base')

    # Mark the celery message done and revoke it so the broker does not
    # redeliver while this (long-running) task executes in-process.
    if app.current_task:
        app.current_task.update_state(state=states.SUCCESS)
        app.control.revoke(app.current_task.request.id, terminate=True)

    self.provider = TaskProvider(self.session)
    self.library_provider = DagLibraryProvider(self.session)
    self.storage = Storage(self.session)

    self.task = self.provider.by_id(
        self.id, joinedload(Task.dag_rel, innerjoin=True))
    if not self.task:
        raise Exception(f'task with id = {self.id} is not found')

    self.dag = self.task.dag_rel
    self.executor = None
    self.hostname = socket.gethostname()

    self.docker_img = DOCKER_IMG
    self.worker_index = os.getenv('WORKER_INDEX', -1)

    self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                          f'{self.worker_index}'

    self.config = Config.from_yaml(self.dag.config)
    set_global_seed(self.config['info'].get('seed', 0))

    self.executor_type = self.config['executors'][
        self.task.executor]['type']
    executor = self.config['executors'][self.task.executor]

    if os.getenv('CUDA_VISIBLE_DEVICES', '').strip() != '':
        cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES',
                                         '').split(',')
        # Map task-relative gpu indexes to the physical devices visible
        # to this process. BUGFIX: skip empty entries — gpu_assigned may
        # be None/'' and ''.split(',') yields [''], so int('') would
        # raise ValueError (the sibling implementation already guards).
        self.task.gpu_assigned = ','.join([
            cuda_visible_devices[int(g)]
            for g in (self.task.gpu_assigned or '').split(',')
            if g.strip() != ''
        ])
        cuda_visible_devices = self.task.gpu_assigned
    else:
        cuda_visible_devices = self.task.gpu_assigned

    cuda_visible_devices = cuda_visible_devices or ''

    # Pin BLAS/OpenMP to one thread by default; the executor config may
    # override or extend these.
    env = {
        'MKL_NUM_THREADS': 1,
        'OMP_NUM_THREADS': 1,
        'CUDA_VISIBLE_DEVICES': cuda_visible_devices
    }
    env.update(executor.get('env', {}))

    for k, v in env.items():
        os.environ[k] = str(v)
        self.info(f'Set env. {k} = {v}')
def create_providers(self):
    """Instantiate every data-access provider this builder needs.

    All providers share the builder's DB session; no I/O happens here.
    """
    session = self.session

    self.provider = TaskProvider(session)
    self.report_provider = ReportProvider(session)
    self.report_tasks_provider = ReportTasksProvider(session)
    self.report_layout_provider = ReportLayoutProvider(session)
    self.project_provider = ProjectProvider(session)

    self.storage = Storage(session)
    self.dag_provider = DagProvider(session)
def create_providers(self):
    """Instantiate the data-access providers and logging-aware storage.

    All providers share the builder's DB session; the Storage also gets
    the builder's logger and component for progress reporting.
    """
    self.log_info('create_providers')

    session = self.session

    self.provider = TaskProvider(session)
    self.report_provider = ReportProvider(session)
    self.report_tasks_provider = ReportTasksProvider(session)
    self.report_layout_provider = ReportLayoutProvider(session)
    self.project_provider = ProjectProvider(session)

    self.storage = Storage(
        session, logger=self.logger, component=self.component)
    self.dag_provider = DagProvider(session)
def code_download():
    """Flask endpoint: download a dag's code as a zip archive.

    Reads the dag id from the ``id`` query argument, downloads the dag's
    files into a temp folder, zips them and streams the archive back.
    Temp artifacts are removed even when an intermediate step fails.

    Raises:
        KeyError/ValueError: if the ``id`` query argument is missing or
            not an integer.
    """
    # Renamed from `id` — avoid shadowing the builtin.
    dag_id = int(request.args['id'])
    storage = Storage(_read_session)
    # NOTE(review): DagProvider() is created without a session here while
    # Storage receives _read_session — confirm DagProvider defaults its
    # session, otherwise this should be DagProvider(_read_session).
    dag = DagProvider().by_id(dag_id)

    folder = os.path.join(TMP_FOLDER, f'{dag.id}({dag.name})')
    file_name = f'{dag.id}({dag.name}).zip'
    dst = os.path.join(TMP_FOLDER, file_name)
    try:
        storage.download_dag(dag_id, folder)
        zip_folder(folder, dst)
        return send_from_directory(TMP_FOLDER, file_name)
    finally:
        # Clean up both the unpacked folder and the archive. The original
        # only removed the zip on the success path, leaking it whenever
        # zip_folder or send_from_directory raised.
        shutil.rmtree(folder, ignore_errors=True)
        if os.path.exists(dst):
            os.remove(dst)
def create_base(self):
    """Load the task context and export the executor's environment.

    Detaches the current celery message, loads providers and the
    task/dag rows, derives host identity and the personal queue, then
    sets MKL/OMP thread limits plus any executor-defined env vars.

    Raises:
        Exception: if no task with ``self.id`` exists.
    """
    self.info('create_base')

    # Detach the delivering celery message so the broker won't redeliver.
    if app.current_task:
        app.current_task.update_state(state=states.SUCCESS)
        app.control.revoke(app.current_task.request.id, terminate=True)

    session = self.session
    self.provider = TaskProvider(session)
    self.library_provider = DagLibraryProvider(session)
    self.storage = Storage(session)

    self.task = self.provider.by_id(
        self.id, joinedload(Task.dag_rel, innerjoin=True))
    if not self.task:
        raise Exception(f'task with id = {self.id} is not found')

    self.dag = self.task.dag_rel
    self.executor = None

    self.hostname = socket.gethostname()
    self.docker_img = DOCKER_IMG
    self.worker_index = os.getenv('WORKER_INDEX', -1)
    self.queue_personal = (
        f'{self.hostname}_{self.docker_img}_{self.worker_index}'
    )

    self.config = Config.from_yaml(self.dag.config)
    executor_config = self.config['executors'][self.task.executor]
    self.executor_type = executor_config['type']

    # Pin BLAS/OpenMP thread counts by default; executor config may
    # override or add variables.
    env = {'MKL_NUM_THREADS': 1, 'OMP_NUM_THREADS': 1}
    env.update(executor_config.get('env', {}))

    for name, value in env.items():
        os.environ[name] = str(value)
        self.info(f'Set env. {name} = {value}')
class ExecuteBuilder:
    """Runs a single task on a worker: loads context, imports the
    executor's code, executes it and finalizes the task status.

    The build() pipeline is: create_base -> check_status ->
    change_status -> download -> create_executor -> execute, with
    failure handling and a hard process exit at the end.
    """

    def __init__(self, id: int, repeat_count: int = 1, exit=True):
        # repeat_count: how many requeue attempts remain after a code
        # installation forces a restart. exit: hard-exit the process in
        # build()'s finally (disable in tests/embedded use).
        self.session = Session.create_session(key='ExecuteBuilder')
        self.id = id
        self.repeat_count = repeat_count
        self.logger = create_logger(self.session, 'ExecuteBuilder')
        # Separate DB-only logger (no console echo).
        self.logger_db = create_logger(self.session, 'ExecuteBuilder.db',
                                       console=False)
        self.exit = exit

        # Filled in by create_base().
        self.provider = None
        self.library_provider = None
        self.storage = None
        self.task = None
        self.dag = None
        self.executor = None
        self.hostname = None
        self.docker_img = None
        self.worker_index = None
        self.queue_personal = None
        self.config = None
        self.executor_type = None

    def info(self, msg: str, step=None):
        """Log at INFO with worker component/host/task context."""
        self.logger.info(msg, ComponentType.Worker, self.hostname,
                         self.id, step)

    def error(self, msg: str, step=None):
        """Log at ERROR with worker component/host/task context."""
        self.logger.error(msg, ComponentType.Worker, self.hostname,
                          self.id, step)

    def warning(self, msg: str, step=None):
        """Log at WARNING with worker component/host/task context."""
        self.logger.warning(msg, ComponentType.Worker, self.hostname,
                            self.id, step)

    def debug(self, msg: str, step=None):
        """Log at DEBUG with worker component/host/task context."""
        self.logger.debug(msg, ComponentType.Worker, self.hostname,
                          self.id, step)

    def create_base(self):
        """Load the task/dag, seed RNGs, remap GPUs and export env vars.

        Raises:
            Exception: if no task with ``self.id`` exists.
        """
        self.info('create_base')

        # Detach the delivering celery message so the broker does not
        # redeliver while the task runs in this process.
        if app.current_task:
            app.current_task.update_state(state=states.SUCCESS)
            app.control.revoke(app.current_task.request.id, terminate=True)

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        # Queue dedicated to this host/docker-image/worker combination.
        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)
        set_global_seed(self.config['info'].get('seed', 0))

        self.executor_type = self.config['executors'][
            self.task.executor]['type']
        executor = self.config['executors'][self.task.executor]

        cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', '')
        self.info(f'Env.before execution '
                  f'CUDA_VISIBLE_DEVICES={cuda_visible_devices}')

        if cuda_visible_devices.strip() != '':
            # Map task-relative gpu indexes onto the physical devices
            # currently visible; skip empty entries from '' / None.
            gpu_assigned = self.task.gpu_assigned or ''
            cuda_visible_devices = cuda_visible_devices.split(',')
            cuda_visible_devices = ','.join([
                cuda_visible_devices[int(g)]
                for g in gpu_assigned.split(',')
                if g.strip() != ''
            ])
        else:
            cuda_visible_devices = self.task.gpu_assigned

        cuda_visible_devices = cuda_visible_devices or ''

        # Pin BLAS/OpenMP thread counts; executor config may override.
        env = {
            'MKL_NUM_THREADS': 1,
            'OMP_NUM_THREADS': 1,
            'CUDA_VISIBLE_DEVICES': cuda_visible_devices
        }
        env.update(executor.get('env', {}))

        for k, v in env.items():
            os.environ[k] = str(v)
            self.info(f'Set env. {k} = {v}')

    def check_status(self):
        """Return True (bad status) if the task is already running/done.

        Requires create_base() to have loaded the dag relation.
        """
        self.info('check_status')

        assert self.dag is not None, 'You must fetch task with dag_rel'

        if self.task.status >= TaskStatus.InProgress.value:
            msg = f'Task = {self.task.id}. Status = {self.task.status}, ' \
                  f'before the execute_by_id invocation.'
            if app.current_task:
                msg += f' Request Id = {app.current_task.request.id}'
            self.error(msg)
            # Truthy return signals build() to stop without failing.
            return True

    def change_status(self):
        """Record this worker's identity on the task and mark InProgress."""
        self.info('change_status')

        self.task.computer_assigned = self.hostname
        self.task.pid = os.getpid()
        self.task.worker_index = self.worker_index
        self.task.docker_assigned = self.docker_img
        self.provider.change_status(self.task, TaskStatus.InProgress)

    def download(self):
        """Fetch the dag code and import the executor implementation.

        Tries the built-in mlcomp executors first, then the dag's own
        folder. If importing triggered a package installation, the task
        is requeued and this process exits so the new code is loaded
        fresh.
        """
        self.info('download')

        if not self.task.debug:
            folder = self.storage.download(task=self.id)
        else:
            # Debug tasks run against the current working directory.
            folder = os.getcwd()

        os.chdir(folder)

        libraries = self.library_provider.dag(self.task.dag)
        executor_type = self.executor_type

        self.info('download. folder changed')

        mlcomp_executors_folder = join(dirname(abspath(__file__)),
                                       'executors')
        mlcomp_base_folder = os.path.abspath(
            join(mlcomp_executors_folder, '../../../'))

        imported, was_installation = self.storage.import_executor(
            mlcomp_executors_folder, mlcomp_base_folder, executor_type)
        if not imported:
            imported, was_installation = self.storage.import_executor(
                folder, folder, executor_type, libraries)
        if not imported:
            raise Exception(f'Executor = {executor_type} not found')

        self.info('download. executor imported')

        if was_installation and not self.task.debug:
            if self.repeat_count > 0:
                self.info('was installation. '
                          'set task status to Queued. '
                          'And resending the task to a queue')
                self.task.status = TaskStatus.Queued.value
                self.provider.commit()
                try:
                    # Requeue with one fewer remaining attempt; best
                    # effort — the sys.exit in finally always runs.
                    execute.apply_async((self.id, self.repeat_count - 1),
                                        queue=self.queue_personal,
                                        retry=False)
                except Exception:
                    pass
                finally:
                    sys.exit()

        assert Executor.is_registered(executor_type), \
            f'Executor {executor_type} was not found'

    def create_executor(self):
        """Instantiate the executor object from the dag config."""
        self.info('create_executor')

        additional_info = yaml_load(self.task.additional_info) \
            if self.task.additional_info else dict()
        self.executor = Executor.from_config(
            executor=self.task.executor, config=self.config,
            additional_info=additional_info, session=self.session,
            logger=self.logger, logger_db=self.logger_db)

    def execute(self):
        """Run the executor; requeue for the next stage or mark Success.

        A multi-stage executor returns a dict with 'stage'/'stages';
        if more stages remain, the task is re-sent to the personal
        queue instead of being finished.
        """
        self.info('execute start')

        res = self.executor(task=self.task, task_provider=self.provider,
                            dag=self.dag)
        self.info('execute executor finished')

        res = res or {}
        self.task.result = yaml_dump(res)
        self.provider.commit()

        if 'stage' in res and 'stages' in res:
            index = res['stages'].index(res['stage'])
            if index < len(res['stages']) - 1:
                self.executor.info(f'stage = {res["stage"]} done. '
                                   f'Go to the stage = '
                                   f'{res["stages"][index + 1]}')
                # Brief pause before requeueing the next stage;
                # presumably lets commits/logs settle — TODO confirm.
                time.sleep(3)

                self.executor.info(
                    f'sending {(self.id, self.repeat_count)} '
                    f'to {self.queue_personal}')
                self.task.status = TaskStatus.Queued.value
                self.provider.commit()
                execute.apply_async((self.id, self.repeat_count),
                                    queue=self.queue_personal, retry=False)
                return

        self.executor.step.finish()
        self.provider.change_status(self.task, TaskStatus.Success)
        self.info('execute end')

    def build(self):
        """Run the full pipeline; on failure mark the task Failed.

        Recreates the DB session after sqlalchemy errors so failure
        logging still works; always closes the celery app and, when
        ``self.exit`` is set, hard-exits the process.
        """
        try:
            self.create_base()

            bad_status = self.check_status()
            if bad_status:
                return

            self.change_status()
            self.download()
            self.create_executor()
            self.execute()
        except Exception as e:
            step = self.executor.step.id if \
                (self.executor and self.executor.step) else None

            if Session.sqlalchemy_error(e):
                # The session is unusable; rebuild it so the error can
                # still be written to the DB.
                Session.cleanup(key='ExecuteBuilder')
                self.session = Session.create_session(key='ExecuteBuilder')
                self.logger.session = create_logger(self.session,
                                                    'ExecuteBuilder')

            self.error(traceback.format_exc(), step)
            if self.task.status <= TaskStatus.InProgress.value:
                self.provider.change_status(self.task, TaskStatus.Failed)
            raise e
        finally:
            if app.current_task:
                app.close()
            if self.exit:
                # noinspection PyProtectedMember
                os._exit(0)
class DagStandardBuilder:
    """Builds a Standard-type dag from a parsed config.

    The build() pipeline creates providers, ensures the project exists,
    creates the dag-level report (optional), the Dag row itself, uploads
    the code, then creates all tasks with their dependencies and
    per-task reports.
    """

    def __init__(self,
                 session: Session,
                 config: dict,
                 debug: bool,
                 config_text: str = None,
                 upload_files: bool = True,
                 copy_files_from: int = None,
                 config_path: str = None,
                 control_reqs: bool = True,
                 logger=None,
                 component: ComponentType = None):
        # config_text: raw config stored verbatim on the Dag row.
        # copy_files_from: dag id to copy code from instead of uploading.
        self.session = session
        self.config = config
        self.debug = debug
        self.config_text = config_text
        self.upload_files = upload_files
        self.copy_files_from = copy_files_from
        self.config_path = config_path
        self.control_reqs = control_reqs

        self.info = config['info']
        self.layout_name = self.info.get('layout')

        # Filled in by create_providers() / the build steps.
        self.provider = None
        self.report_provider = None
        self.report_tasks_provider = None
        self.report_layout_provider = None
        self.storage = None
        self.dag_provider = None
        self.logger = logger
        self.component = component

        self.project = None
        self.layouts = None
        self.dag = None
        self.dag_report_id = None
        self.created = None
        self.project_provider = None

    def log_info(self, message: str):
        """Log a progress message if a logger was supplied."""
        if self.logger:
            self.logger.info(message, self.component)

    def create_providers(self):
        """Instantiate every data-access provider this builder needs."""
        self.log_info('create_providers')

        self.provider = TaskProvider(self.session)
        self.report_provider = ReportProvider(self.session)
        self.report_tasks_provider = ReportTasksProvider(self.session)
        self.report_layout_provider = ReportLayoutProvider(self.session)
        self.project_provider = ProjectProvider(self.session)

        self.storage = Storage(self.session,
                               logger=self.logger,
                               component=self.component)
        self.dag_provider = DagProvider(self.session)

    def load_base(self):
        """Resolve (or create) the project and load all report layouts."""
        self.log_info('load_base')

        project = self.project_provider.by_name(self.info['project'])
        if project is None:
            project = self.project_provider.add_project(
                self.info['project'])
        self.project = project.id
        self.layouts = self.report_layout_provider.all()

    def create_report(self):
        """Create the dag-level report if a layout name was configured.

        Raises:
            Exception: if the configured layout is unknown.
        """
        self.log_info('create_report')

        self.dag_report_id = None
        layout_name = self.layout_name
        if layout_name:
            if layout_name not in self.layouts:
                raise Exception(f'Unknown layout = {layout_name}')

            report = Report(config=yaml_dump(self.layouts[layout_name]),
                            name=self.info['name'],
                            project=self.project,
                            layout=layout_name)
            self.report_provider.add(report)
            self.dag_report_id = report.id

    def create_dag(self):
        """Create and persist the Dag row for this build."""
        self.log_info('create_dag')

        dag = Dag(config=self.config_text or yaml_dump(self.config),
                  project=self.project,
                  name=self.info['name'],
                  docker_img=self.info.get('docker_img'),
                  type=DagType.Standard.value,
                  created=now(),
                  report=self.dag_report_id)

        self.dag = self.dag_provider.add(dag)

    def upload(self):
        """Upload the dag's code folder, or copy it from another dag."""
        self.log_info('upload')

        if self.upload_files:
            folder = os.path.dirname(os.path.abspath(self.config_path))
            if 'expdir' in self.config['info']:
                # expdir is resolved relative to the config file.
                path = os.path.dirname(os.path.abspath(self.config_path))
                folder = os.path.abspath(
                    os.path.join(path, self.config['info']['expdir']))
            self.storage.upload(folder, self.dag,
                                control_reqs=self.control_reqs)
        elif self.copy_files_from:
            self.storage.copy_from(self.copy_files_from, self.dag)

    def create_task(self, k: str, v: dict, name: str, info: dict):
        """Build one Task (and optional Report) for executor ``k``.

        Args:
            k: executor key in the config.
            v: the executor's config dict.
            name: task display name.
            info: additional info dict serialized onto the task.

        Returns:
            (task, report) — report is None for non-train tasks or when
            no layout is configured.

        Raises:
            Exception: on invalid gpu range or unknown layout.
        """
        task_type = TaskType.User.value
        if v.get('task_type') == 'train' or \
                Executor.is_trainable(v['type']):
            task_type = TaskType.Train.value

        # gpu may be a single count ('2') or a range ('1-4').
        gpu = str(v.get('gpu', '0'))
        if '-' not in gpu:
            gpu = int(gpu)
            gpu_max = gpu
        else:
            gpu, gpu_max = map(int, gpu.split('-'))

        if gpu == 0 and gpu_max > 0:
            raise Exception(f'Executor = {k} Gpu_max can"t be>0 when gpu=0')

        task = Task(name=name,
                    executor=k,
                    computer=self.info.get('computer') or v.get('computer'),
                    gpu=gpu,
                    gpu_max=gpu_max,
                    cpu=v.get('cpu', 1),
                    memory=v.get('memory', 0.1),
                    dag=self.dag.id,
                    debug=self.debug,
                    steps=int(v.get('steps', '1')),
                    type=task_type)
        task.additional_info = yaml_dump(info)

        report = None
        if self.layout_name and task_type == TaskType.Train.value:
            if self.layout_name not in self.layouts:
                raise Exception(f'Unknown report = {v["report"]}')

            report_config = self.layouts[self.layout_name]
            info['report_config'] = report_config
            # Re-dump: info now also carries the report config.
            task.additional_info = yaml_dump(info)
            report = Report(config=yaml_dump(report_config),
                            name=task.name,
                            project=self.project,
                            layout=self.layout_name)

        return task, report

    def create_tasks(self):
        """Create all tasks in dependency order, then bulk-persist them.

        Iterates until every executor has been created; an executor is
        only created once all of its dependencies already exist. Grid
        executors expand into one task per grid cell. Reports, tasks,
        report-task links and dependencies are bulk-saved in that order
        because later rows need the ids of earlier ones.
        """
        self.log_info('create_tasks')

        created = OrderedDict()
        executors = self.config['executors']

        tasks = []
        dependencies = []
        reports = []

        # NOTE(review): a dependency cycle (other than self-dependency,
        # which raises) would loop forever here — TODO confirm the
        # config is validated upstream.
        while len(created) < len(executors):
            for k, v in executors.items():
                valid = True
                if 'depends' in v:
                    depends = v['depends']
                    if not isinstance(depends, list):
                        depends = [depends]

                    for d in depends:
                        if d == k:
                            raise Exception(f'Executor {k} depends on itself')
                        if d not in executors:
                            raise Exception(f'Executor {k} depend on {d} '
                                            f'which does not exist')

                        valid = valid and d in created
                if valid:
                    names = []
                    infos = []
                    if 'grid' in v:
                        grid = v['grid']
                        cells = grid_cells(grid)
                        for i, (cell, cell_name) in enumerate(cells):
                            names.append(cell_name)
                            infos.append({'grid_cell': i})
                    else:
                        names.append(v.get('name', k))
                        infos.append({})

                    k_tasks = []
                    for name, info in zip(names, infos):
                        task, report = self.create_task(k, v,
                                                        name=name,
                                                        info=info)
                        tasks.append(task)
                        k_tasks.append(task)
                        reports.append(report)

                        if 'depends' in v:
                            depends = v['depends']
                            if not isinstance(depends, list):
                                depends = [depends]

                            for d in depends:
                                for dd in created[d]:
                                    dependencies.append((task, dd))

                    created[k] = k_tasks

        # Save reports first so tasks can reference report.id.
        not_empty_reports = [r for r in reports if r is not None]
        if len(not_empty_reports) > 0:
            self.provider.bulk_save_objects(not_empty_reports,
                                            return_defaults=True)
            for report, task in zip(reports, tasks):
                if report is not None:
                    task.report = report.id

        self.provider.bulk_save_objects(tasks, return_defaults=True)

        if len(not_empty_reports) > 0:
            report_tasks = []
            for report, task in zip(reports, tasks):
                if report is not None:
                    report_tasks.append(
                        ReportTasks(report=report.id, task=task.id))
            self.report_tasks_provider.bulk_save_objects(report_tasks)

        dependencies = [
            TaskDependence(task_id=task.id, depend_id=dd.id)
            for task, dd in dependencies
        ]
        self.provider.bulk_save_objects(dependencies)

        for k, v in created.items():
            created[k] = [vv.id for vv in v]

        self.created = created

    def build(self):
        """Run the full build pipeline.

        Returns:
            OrderedDict mapping executor key -> list of created task ids.
        """
        self.create_providers()
        self.load_base()
        self.create_report()
        self.create_dag()
        self.upload()
        self.create_tasks()
        self.log_info('Done')
        return self.created
class ExecuteBuilder:
    """Runs a single task on a worker (earlier variant).

    Pipeline: create_base -> check_status -> change_status -> download
    -> create_executor -> execute, with failure handling and a hard
    process exit at the end.
    """

    def __init__(self, id: int, repeat_count: int = 1, exit=True):
        # repeat_count: remaining requeue attempts after an installation
        # forces a restart. exit: hard-exit the process in build().
        self.session = Session.create_session(key='ExecuteBuilder')
        self.id = id
        self.repeat_count = repeat_count
        self.logger = create_logger(self.session, 'ExecuteBuilder')
        self.exit = exit

        # Filled in by create_base().
        self.provider = None
        self.library_provider = None
        self.storage = None
        self.task = None
        self.dag = None
        self.executor = None
        self.hostname = None
        self.docker_img = None
        self.worker_index = None
        self.queue_personal = None
        self.config = None
        self.executor_type = None

    def info(self, msg: str, step=None):
        """Log at INFO with worker component/host/task context."""
        self.logger.info(msg, ComponentType.Worker, self.hostname,
                         self.id, step)

    def error(self, msg: str, step=None):
        """Log at ERROR with worker component/host/task context."""
        self.logger.error(msg, ComponentType.Worker, self.hostname,
                          self.id, step)

    def warning(self, msg: str, step=None):
        """Log at WARNING with worker component/host/task context."""
        self.logger.warning(msg, ComponentType.Worker, self.hostname,
                            self.id, step)

    def debug(self, msg: str, step=None):
        """Log at DEBUG with worker component/host/task context."""
        self.logger.debug(msg, ComponentType.Worker, self.hostname,
                          self.id, step)

    def create_base(self):
        """Load the task/dag and derive the execution context.

        Raises:
            Exception: if no task with ``self.id`` exists.
        """
        self.info('create_base')

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        # Queue dedicated to this host/docker-image/worker combination.
        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)
        self.executor_type = self.config['executors'][
            self.task.executor]['type']

    def check_status(self):
        """Raise if the task has already run past InProgress.

        Raises:
            Exception: if the task status indicates it already executed.
        """
        self.info('check_status')

        assert self.dag is not None, 'You must fetch task with dag_rel'

        if self.task.status > TaskStatus.InProgress.value:
            msg = f'Task = {self.task.id}. Status = {self.task.status}, ' \
                  f'before the execute_by_id invocation'
            self.error(msg)
            raise Exception(msg)

    def change_status(self):
        """Record this worker's identity on the task and mark InProgress."""
        self.info('change_status')

        self.task.computer_assigned = self.hostname
        self.task.pid = os.getpid()
        self.task.worker_index = self.worker_index
        self.task.docker_assigned = self.docker_img
        self.provider.change_status(self.task, TaskStatus.InProgress)

    def download(self):
        """Fetch the dag code and import the executor implementation.

        Tries the built-in mlcomp executors first, then the dag's own
        folder. If importing triggered an installation, the task is
        requeued and this process exits so new code loads fresh.
        """
        self.info('download')

        if not self.task.debug:
            folder = self.storage.download(task=self.id)
        else:
            # Debug tasks run against the current working directory.
            folder = os.getcwd()

        os.chdir(folder)

        libraries = self.library_provider.dag(self.task.dag)
        executor_type = self.executor_type

        mlcomp_executors_folder = join(dirname(abspath(__file__)),
                                       'executors')
        mlcomp_base_folder = os.path.abspath(
            join(mlcomp_executors_folder, '../../../'))

        imported, was_installation = self.storage.import_executor(
            mlcomp_executors_folder, mlcomp_base_folder, executor_type)
        if not imported:
            imported, was_installation = self.storage.import_executor(
                folder, folder, executor_type, libraries)
        if not imported:
            raise Exception(f'Executor = {executor_type} not found')

        if was_installation and not self.task.debug:
            if self.repeat_count > 0:
                try:
                    # NOTE(review): format_exc() here logs whatever
                    # exception context is current (often none) — looks
                    # intentional as a breadcrumb; confirm.
                    self.warning(traceback.format_exc())
                    execute.apply_async((self.id, self.repeat_count - 1),
                                        queue=self.queue_personal)
                except Exception:
                    pass
                finally:
                    sys.exit()

        assert Executor.is_registered(executor_type), \
            f'Executor {executor_type} was not found'

    def create_executor(self):
        """Instantiate the executor object from the dag config."""
        self.info('create_executor')

        additional_info = yaml_load(self.task.additional_info) \
            if self.task.additional_info else dict()
        self.executor = Executor.from_config(
            executor=self.task.executor, config=self.config,
            additional_info=additional_info, session=self.session,
            logger=self.logger)

    def execute(self):
        """Run the executor; requeue for the next stage or mark Success."""
        self.info('execute start')

        res = self.executor(task=self.task, task_provider=self.provider,
                            dag=self.dag)
        self.info('execute executor finished')

        res = res or {}
        self.task.result = yaml_dump(res)
        self.provider.commit()

        if 'stage' in res and 'stages' in res:
            index = res['stages'].index(res['stage'])
            if index < len(res['stages']) - 1:
                self.executor.info(f'stage = {res["stage"]} done. '
                                   f'Go to the stage = '
                                   f'{res["stages"][index + 1]}')
                # Brief pause before requeueing the next stage;
                # presumably lets commits/logs settle — TODO confirm.
                time.sleep(3)

                self.executor.info(
                    f'sending {(self.id, self.repeat_count)} '
                    f'to {self.queue_personal}')
                execute.apply_async((self.id, self.repeat_count),
                                    queue=self.queue_personal)
                return

        self.executor.step.finish()
        self.provider.change_status(self.task, TaskStatus.Success)
        self.info('execute end')

    def build(self):
        """Run the full pipeline; on failure mark the task Failed.

        Recreates the DB session after sqlalchemy errors so failure
        logging still works; always closes the celery app and, when
        ``self.exit`` is set, hard-exits the process.
        """
        try:
            self.create_base()
            self.check_status()
            self.change_status()
            self.download()
            self.create_executor()
            self.execute()
        except Exception as e:
            if Session.sqlalchemy_error(e):
                # The session is unusable; rebuild it so the error can
                # still be written to the DB.
                Session.cleanup(key='ExecuteBuilder')
                self.session = Session.create_session(key='ExecuteBuilder')
                self.logger.session = create_logger(self.session,
                                                    'ExecuteBuilder')

            step = self.executor.step.id if \
                (self.executor and self.executor.step) else None

            self.error(traceback.format_exc(), step)
            self.provider.change_status(self.task, TaskStatus.Failed)
            raise e
        finally:
            if app.current_task:
                app.current_task.update_state(state=states.SUCCESS)
                app.close()
            if self.exit:
                # noinspection PyProtectedMember
                os._exit(0)