Example #1
0
class ExecuteBuilder:
    def __init__(self, id: int, repeat_count: int = 1, exit=True):
        self.session = Session.create_session(key='ExecuteBuilder')
        self.id = id
        self.repeat_count = repeat_count
        self.logger = create_logger(self.session, 'ExecuteBuilder')
        self.logger_db = create_logger(self.session,
                                       'ExecuteBuilder.db',
                                       console=False)
        self.exit = exit

        self.provider = None
        self.library_provider = None
        self.storage = None
        self.task = None
        self.dag = None
        self.executor = None
        self.hostname = None
        self.docker_img = None
        self.worker_index = None
        self.queue_personal = None
        self.config = None
        self.executor_type = None

    def info(self, msg: str, step=None):
        self.logger.info(msg, ComponentType.Worker, self.hostname, self.id,
                         step)

    def error(self, msg: str, step=None):
        self.logger.error(msg, ComponentType.Worker, self.hostname, self.id,
                          step)

    def warning(self, msg: str, step=None):
        self.logger.warning(msg, ComponentType.Worker, self.hostname, self.id,
                            step)

    def debug(self, msg: str, step=None):
        self.logger.debug(msg, ComponentType.Worker, self.hostname, self.id,
                          step)

    def create_base(self):
        self.info('create_base')

        if app.current_task:
            app.current_task.update_state(state=states.SUCCESS)
            app.control.revoke(app.current_task.request.id, terminate=True)

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)

        set_global_seed(self.config['info'].get('seed', 0))

        self.executor_type = self.config['executors'][
            self.task.executor]['type']

        executor = self.config['executors'][self.task.executor]

        cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', '')
        self.info(f'Env.before execution '
                  f'CUDA_VISIBLE_DEVICES={cuda_visible_devices}')

        if cuda_visible_devices.strip() != '':
            gpu_assigned = self.task.gpu_assigned or ''

            cuda_visible_devices = cuda_visible_devices.split(',')
            cuda_visible_devices = ','.join([
                cuda_visible_devices[int(g)] for g in gpu_assigned.split(',')
                if g.strip() != ''
            ])
        else:
            cuda_visible_devices = self.task.gpu_assigned

        cuda_visible_devices = cuda_visible_devices or ''

        env = {
            'MKL_NUM_THREADS': 1,
            'OMP_NUM_THREADS': 1,
            'CUDA_VISIBLE_DEVICES': cuda_visible_devices
        }
        env.update(executor.get('env', {}))

        for k, v in env.items():
            os.environ[k] = str(v)
            self.info(f'Set env. {k} = {v}')

    def check_status(self):
        self.info('check_status')

        assert self.dag is not None, 'You must fetch task with dag_rel'

        if self.task.status >= TaskStatus.InProgress.value:
            msg = f'Task = {self.task.id}. Status = {self.task.status}, ' \
                  f'before the execute_by_id invocation.'
            if app.current_task:
                msg += f' Request Id = {app.current_task.request.id}'
            self.error(msg)
            return True

    def change_status(self):
        self.info('change_status')

        self.task.computer_assigned = self.hostname
        self.task.pid = os.getpid()
        self.task.worker_index = self.worker_index
        self.task.docker_assigned = self.docker_img
        self.provider.change_status(self.task, TaskStatus.InProgress)

    def download(self):
        self.info('download')

        if not self.task.debug:
            folder = self.storage.download(task=self.id)
        else:
            folder = os.getcwd()

        os.chdir(folder)

        libraries = self.library_provider.dag(self.task.dag)
        executor_type = self.executor_type

        self.info('download. folder changed')

        mlcomp_executors_folder = join(dirname(abspath(__file__)), 'executors')
        mlcomp_base_folder = os.path.abspath(
            join(mlcomp_executors_folder, '../../../'))

        imported, was_installation = self.storage.import_executor(
            mlcomp_executors_folder, mlcomp_base_folder, executor_type)

        if not imported:
            imported, was_installation = self.storage.import_executor(
                folder, folder, executor_type, libraries)

            if not imported:
                raise Exception(f'Executor = {executor_type} not found')

        self.info('download. executor imported')

        if was_installation and not self.task.debug:
            if self.repeat_count > 0:
                self.info('was installation. '
                          'set task status to Queued. '
                          'And resending the task to a queue')
                self.task.status = TaskStatus.Queued.value
                self.provider.commit()

                try:
                    execute.apply_async((self.id, self.repeat_count - 1),
                                        queue=self.queue_personal,
                                        retry=False)
                except Exception:
                    pass
                finally:
                    sys.exit()

        assert Executor.is_registered(executor_type), \
            f'Executor {executor_type} was not found'

    def create_executor(self):
        self.info('create_executor')

        additional_info = yaml_load(self.task.additional_info) \
            if self.task.additional_info else dict()
        self.executor = Executor.from_config(executor=self.task.executor,
                                             config=self.config,
                                             additional_info=additional_info,
                                             session=self.session,
                                             logger=self.logger,
                                             logger_db=self.logger_db)

    def execute(self):
        self.info('execute start')

        res = self.executor(task=self.task,
                            task_provider=self.provider,
                            dag=self.dag)
        self.info('execute executor finished')

        res = res or {}
        self.task.result = yaml_dump(res)
        self.provider.commit()

        if 'stage' in res and 'stages' in res:
            index = res['stages'].index(res['stage'])
            if index < len(res['stages']) - 1:
                self.executor.info(f'stage = {res["stage"]} done. '
                                   f'Go to the stage = '
                                   f'{res["stages"][index + 1]}')

                time.sleep(3)

                self.executor.info(f'sending {(self.id, self.repeat_count)} '
                                   f'to {self.queue_personal}')

                self.task.status = TaskStatus.Queued.value
                self.provider.commit()

                execute.apply_async((self.id, self.repeat_count),
                                    queue=self.queue_personal,
                                    retry=False)
                return

        self.executor.step.finish()
        self.provider.change_status(self.task, TaskStatus.Success)

        self.info('execute end')

    def build(self):
        try:
            self.create_base()

            bad_status = self.check_status()
            if bad_status:
                return

            self.change_status()

            self.download()

            self.create_executor()

            self.execute()

        except Exception as e:
            step = self.executor.step.id if \
                (self.executor and self.executor.step) else None

            if Session.sqlalchemy_error(e):
                Session.cleanup(key='ExecuteBuilder')
                self.session = Session.create_session(key='ExecuteBuilder')
                self.logger.session = create_logger(self.session,
                                                    'ExecuteBuilder')

            self.error(traceback.format_exc(), step)
            if self.task.status <= TaskStatus.InProgress.value:
                self.provider.change_status(self.task, TaskStatus.Failed)
            raise e
        finally:
            if app.current_task:
                app.close()

            if self.exit:
                # noinspection PyProtectedMember
                os._exit(0)
Example #2
0
class ExecuteBuilder:
    def __init__(self, id: int, repeat_count: int = 1, exit=True):
        self.session = Session.create_session(key='ExecuteBuilder')
        self.id = id
        self.repeat_count = repeat_count
        self.logger = create_logger(self.session, 'ExecuteBuilder')
        self.exit = exit

        self.provider = None
        self.library_provider = None
        self.storage = None
        self.task = None
        self.dag = None
        self.executor = None
        self.hostname = None
        self.docker_img = None
        self.worker_index = None
        self.queue_personal = None
        self.config = None
        self.executor_type = None

    def info(self, msg: str, step=None):
        self.logger.info(msg, ComponentType.Worker, self.hostname, self.id,
                         step)

    def error(self, msg: str, step=None):
        self.logger.error(msg, ComponentType.Worker, self.hostname, self.id,
                          step)

    def warning(self, msg: str, step=None):
        self.logger.warning(msg, ComponentType.Worker, self.hostname, self.id,
                            step)

    def debug(self, msg: str, step=None):
        self.logger.debug(msg, ComponentType.Worker, self.hostname, self.id,
                          step)

    def create_base(self):
        self.info('create_base')

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)
        self.executor_type = self.config['executors'][
            self.task.executor]['type']

    def check_status(self):
        self.info('check_status')

        assert self.dag is not None, 'You must fetch task with dag_rel'

        if self.task.status > TaskStatus.InProgress.value:
            msg = f'Task = {self.task.id}. Status = {self.task.status}, ' \
                  f'before the execute_by_id invocation'
            self.error(msg)
            raise Exception(msg)

    def change_status(self):
        self.info('change_status')

        self.task.computer_assigned = self.hostname
        self.task.pid = os.getpid()
        self.task.worker_index = self.worker_index
        self.task.docker_assigned = self.docker_img
        self.provider.change_status(self.task, TaskStatus.InProgress)

    def download(self):
        self.info('download')

        if not self.task.debug:
            folder = self.storage.download(task=self.id)
        else:
            folder = os.getcwd()

        os.chdir(folder)

        libraries = self.library_provider.dag(self.task.dag)
        executor_type = self.executor_type

        mlcomp_executors_folder = join(dirname(abspath(__file__)), 'executors')
        mlcomp_base_folder = os.path.abspath(
            join(mlcomp_executors_folder, '../../../'))

        imported, was_installation = self.storage.import_executor(
            mlcomp_executors_folder, mlcomp_base_folder, executor_type)

        if not imported:
            imported, was_installation = self.storage.import_executor(
                folder, folder, executor_type, libraries)

            if not imported:
                raise Exception(f'Executor = {executor_type} not found')

        if was_installation and not self.task.debug:
            if self.repeat_count > 0:
                try:
                    self.warning(traceback.format_exc())
                    execute.apply_async((self.id, self.repeat_count - 1),
                                        queue=self.queue_personal)
                except Exception:
                    pass
                finally:
                    sys.exit()

        assert Executor.is_registered(executor_type), \
            f'Executor {executor_type} was not found'

    def create_executor(self):
        self.info('create_executor')

        additional_info = yaml_load(self.task.additional_info) \
            if self.task.additional_info else dict()
        self.executor = Executor.from_config(executor=self.task.executor,
                                             config=self.config,
                                             additional_info=additional_info,
                                             session=self.session,
                                             logger=self.logger)

    def execute(self):
        self.info('execute start')

        res = self.executor(task=self.task,
                            task_provider=self.provider,
                            dag=self.dag)
        self.info('execute executor finished')

        res = res or {}
        self.task.result = yaml_dump(res)
        self.provider.commit()

        if 'stage' in res and 'stages' in res:
            index = res['stages'].index(res['stage'])
            if index < len(res['stages']) - 1:
                self.executor.info(f'stage = {res["stage"]} done. '
                                   f'Go to the stage = '
                                   f'{res["stages"][index + 1]}')

                time.sleep(3)

                self.executor.info(f'sending {(self.id, self.repeat_count)} '
                                   f'to {self.queue_personal}')

                execute.apply_async((self.id, self.repeat_count),
                                    queue=self.queue_personal)
                return

        self.executor.step.finish()
        self.provider.change_status(self.task, TaskStatus.Success)

        self.info('execute end')

    def build(self):
        try:
            self.create_base()

            self.check_status()

            self.change_status()

            self.download()

            self.create_executor()

            self.execute()

        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(key='ExecuteBuilder')
                self.session = Session.create_session(key='ExecuteBuilder')
                self.logger.session = create_logger(self.session,
                                                    'ExecuteBuilder')

            step = self.executor.step.id if \
                (self.executor and self.executor.step) else None

            self.error(traceback.format_exc(), step)
            self.provider.change_status(self.task, TaskStatus.Failed)
            raise e
        finally:
            if app.current_task:
                app.current_task.update_state(state=states.SUCCESS)
                app.close()

            if self.exit:
                # noinspection PyProtectedMember
                os._exit(0)