Example #1
    def work(self):
        project = ProjectProvider(self.session).by_id(self.project)

        self.info(f'Task = {self.train_task} child_task: {self.child_task}')

        model = Model(created=now(),
                      name=self.name,
                      project=self.project,
                      equations='',
                      fold=self.fold)

        provider = ModelProvider(self.session)
        if self.train_task:
            task_provider = TaskProvider(self.session)
            task = task_provider.by_id(self.train_task)
            model.score_local = task.score

            task_dir = join(TASK_FOLDER, str(self.child_task or task.id))
            src_log = f'{task_dir}/log'
            models_dir = join(MODEL_FOLDER, project.name)
            os.makedirs(models_dir, exist_ok=True)

            model_path_tmp = f'{src_log}/traced.pth'
            traced = trace_model_from_checkpoint(src_log, self, file=self.file)

            model_path = f'{models_dir}/{model.name}.pth'
            model_weight_path = f'{models_dir}/{model.name}_weight.pth'
            torch.jit.save(traced, model_path_tmp)
            shutil.copy(model_path_tmp, model_path)
            file = self.file = 'best_full'
            shutil.copy(f'{src_log}/checkpoints/{file}.pth', model_weight_path)

        provider.add(model)
Example #2
def task_stop():
    data = request_data()
    provider = TaskProvider(_write_session)
    task = provider.by_id(data['id'], joinedload(Task.dag_rel, innerjoin=True))

    tasks = [task] + provider.children(task.id)
    supervisor.stop_tasks(tasks)
Example #3
def stop(logger, session: Session, task: Task, dag: Dag):
    provider = TaskProvider(session)
    if task.status > TaskStatus.InProgress.value:
        return task.status

    status = TaskStatus.Stopped
    try:
        if task.status != TaskStatus.NotRan.value:
            app.control.revoke(task.celery_id, terminate=True)
        else:
            status = TaskStatus.Skipped
    except Exception as e:
        if Session.sqlalchemy_error(e):
            try:
                logger.error(traceback.format_exc(), ComponentType.API)
            except Exception:
                pass
            raise
        logger.error(traceback.format_exc(), ComponentType.API)
    finally:
        if task.pid:
            queue = f'{task.computer_assigned}_' \
                    f'{dag.docker_img or "default"}_supervisor'
            kill.apply_async((task.pid, ), queue=queue, retry=False)

            additional_info = yaml_load(task.additional_info)
            for p in additional_info.get('child_processes', []):
                kill.apply_async((p, ), queue=queue, retry=False)
        provider.change_status(task, status)

    return task.status
Example #4
    def create_providers(self):
        self.log_info('create_providers')

        self.dag_provider = DagProvider(self.session)
        self.task_provider = TaskProvider(self.session)
        self.file_provider = FileProvider(self.session)
        self.dag_storage_provider = DagStorageProvider(self.session)
Example #5
    def __init__(self,
                 session: Session,
                 task: Task,
                 layout: str,
                 part: str = 'valid',
                 name: str = 'img_classify',
                 max_img_size: Tuple[int, int] = None,
                 main_metric: str = 'accuracy',
                 plot_count: int = 0):
        self.session = session
        self.task = task
        self.layout = layout
        self.part = part
        self.name = name or 'img_classify'
        self.max_img_size = max_img_size
        self.main_metric = main_metric
        self.plot_count = plot_count

        self.dag_provider = DagProvider(session)
        self.report_provider = ReportProvider(session)
        self.layout_provider = ReportLayoutProvider(session)
        self.task_provider = TaskProvider(session)
        self.report_img_provider = ReportImgProvider(session)
        self.report_task_provider = ReportTasksProvider(session)
        self.report_series_provider = ReportSeriesProvider(session)

        self.project = self.task_provider.project(task.id).id
        self.layout = self.layout_provider.by_name(layout)
        self.layout_dict = yaml_load(self.layout.content)
Example #6
def stop_processes_not_exist(session: Session, logger):
    provider = TaskProvider(session)
    hostname = socket.gethostname()
    tasks = provider.by_status(TaskStatus.InProgress,
                               task_docker_assigned=DOCKER_IMG,
                               computer_assigned=hostname)
    for t in tasks:
        if not psutil.pid_exists(t.pid):
            # the task may be retrying execution; allow a 30-second grace period
            if (now() - t.last_activity).total_seconds() < 30:
                continue

            os.system(f'kill -9 {t.pid}')
            t.status = TaskStatus.Failed.value
            logger.error(
                f'process with pid = {t.pid} does not exist. '
                f'Set task to failed state', ComponentType.WorkerSupervisor,
                hostname, t.id)

            provider.commit()

            additional_info = yaml_load(t.additional_info)
            for p in additional_info.get('child_processes', []):
                logger.info(f'killing child process = {p}')
                os.system(f'kill -9 {p}')
Example #7
    def create_base(self):
        self.info('create_base')

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)
        self.executor_type = self.config['executors'][
            self.task.executor]['type']
Example #8
    def __init__(self,
                 session: Session,
                 task: Task,
                 layout: str,
                 part: str = 'valid',
                 name: str = 'img_segment',
                 max_img_size: Tuple[int, int] = None,
                 stack_type: str = 'vertical',
                 main_metric: str = 'dice',
                 plot_count: int = 0,
                 colors: List[Tuple] = None):
        self.session = session
        self.task = task
        self.layout = layout
        self.part = part
        self.name = name or 'img_segment'
        self.max_img_size = max_img_size
        self.stack_type = stack_type
        self.main_metric = main_metric
        self.colors = colors
        self.plot_count = plot_count

        self.dag_provider = DagProvider(session)
        self.report_provider = ReportProvider(session)
        self.layout_provider = ReportLayoutProvider(session)
        self.task_provider = TaskProvider(session)
        self.report_img_provider = ReportImgProvider(session)
        self.report_task_provider = ReportTasksProvider(session)
        self.report_series_provider = ReportSeriesProvider(session)

        self.project = self.task_provider.project(task.id).id
        self.layout = self.layout_provider.by_name(layout)
        self.layout_dict = yaml_load(self.layout.content)

        self.create_base()
Example #9
def describe_task_names(dag: int):
    pd.set_option('display.max_columns', None)
    pd.set_option('display.expand_frame_repr', False)
    pd.set_option('display.max_colwidth', None)  # None disables truncation

    provider = TaskProvider()
    tasks = provider.by_dag(dag)
    return pd.DataFrame([{'id': t.id, 'name': t.name} for t in tasks])
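
A quick usage sketch of the helper above; the dag id is made up, and the result is a pandas DataFrame with one row per task:

df = describe_task_names(dag=42)
print(df)  # columns: id, name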
Example #10
    def create_base(self):
        self.info('create_base')

        if app.current_task:
            app.current_task.update_state(state=states.SUCCESS)
            app.control.revoke(app.current_task.request.id, terminate=True)

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)

        set_global_seed(self.config['info'].get('seed', 0))

        self.executor_type = self.config['executors'][
            self.task.executor]['type']

        executor = self.config['executors'][self.task.executor]

        if os.getenv('CUDA_VISIBLE_DEVICES', '').strip() != '':
            cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES',
                                             '').split(',')
            # remap task-relative gpu indices to physical ids; skip empty
            # entries so an unassigned task does not crash on int('')
            self.task.gpu_assigned = ','.join([
                cuda_visible_devices[int(g)]
                for g in (self.task.gpu_assigned or '').split(',')
                if g.strip() != ''
            ])
            cuda_visible_devices = self.task.gpu_assigned
        else:
            cuda_visible_devices = self.task.gpu_assigned

        cuda_visible_devices = cuda_visible_devices or ''

        env = {
            'MKL_NUM_THREADS': 1,
            'OMP_NUM_THREADS': 1,
            'CUDA_VISIBLE_DEVICES': cuda_visible_devices
        }
        env.update(executor.get('env', {}))

        for k, v in env.items():
            os.environ[k] = str(v)
            self.info(f'Set env. {k} = {v}')
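
The CUDA_VISIBLE_DEVICES branch above remaps task-relative GPU indices onto the physical device ids already visible to the process. A standalone sketch of the same arithmetic, with invented values:

# the scheduler exposed physical devices 2 and 3 to this worker
cuda_visible_devices = '2,3'.split(',')          # ['2', '3']
gpu_assigned = '0,1'                             # task-relative indices

# relative index g selects the g-th physical id
remapped = ','.join(cuda_visible_devices[int(g)]
                    for g in gpu_assigned.split(',') if g.strip())
assert remapped == '2,3'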
Example #11
    def create_providers(self):
        self.provider = TaskProvider(self.session)
        self.report_provider = ReportProvider(self.session)
        self.report_tasks_provider = ReportTasksProvider(self.session)
        self.report_layout_provider = ReportLayoutProvider(self.session)
        self.project_provider = ProjectProvider(self.session)

        self.storage = Storage(self.session)
        self.dag_provider = DagProvider(self.session)
Example #12
def dag_stop():
    data = request_data()
    provider = TaskProvider(_write_session)
    id = int(data['id'])
    tasks = provider.by_dag(id)

    supervisor.stop_tasks(tasks)

    dag_provider = DagProvider(_write_session)
    return {'dag': dag_provider.get({'id': id})['data'][0]}
Example #13
    def create_providers(self):
        self.log_info('create_providers')

        self.provider = TaskProvider(self.session)
        self.report_provider = ReportProvider(self.session)
        self.report_tasks_provider = ReportTasksProvider(self.session)
        self.report_layout_provider = ReportLayoutProvider(self.session)
        self.project_provider = ProjectProvider(self.session)

        self.storage = Storage(self.session,
                               logger=self.logger,
                               component=self.component)
        self.dag_provider = DagProvider(self.session)
Example #14
    def __init__(self, session: Session, logger=None,
                 component: ComponentType = None,
                 max_file_size: int = 10 ** 5, max_count=10 ** 3):
        self.file_provider = FileProvider(session)
        self.provider = DagStorageProvider(session)
        self.task_provider = TaskProvider(session)
        self.library_provider = DagLibraryProvider(session)
        self.dag_provider = DagProvider(session)

        self.logger = logger
        self.component = component
        self.max_file_size = max_file_size
        self.max_count = max_count
Example #15
def task_stop():
    data = request_data()
    provider = TaskProvider(_write_session)
    task = provider.by_id(data['id'], joinedload(Task.dag_rel, innerjoin=True))

    dag = task.dag_rel
    status = celery_tasks.stop(logger, _write_session, task, dag)

    child_tasks = provider.children(task.id)
    for t in child_tasks:
        celery_tasks.stop(logger, _write_session, t, dag)

    return {'status': to_snake(TaskStatus(status).name)}
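
The response converts the numeric status back into an API-friendly name. A small sketch of that conversion, assuming TaskStatus is an IntEnum and to_snake turns CamelCase into snake_case:

status = TaskStatus.Stopped.value        # an integer; exact values are not shown above
TaskStatus(status).name                  # 'Stopped'
to_snake(TaskStatus(status).name)        # presumably 'stopped'; 'NotRan' -> 'not_ran'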
Example #16
def execute(config: str, debug: bool, params):
    check_statuses()

    _create_computer()
    _create_docker()

    # Fail all InProgress Tasks
    logger = create_logger(_session, __name__)

    provider = TaskProvider(_session)
    step_provider = StepProvider(_session)

    for t in provider.by_status(TaskStatus.InProgress,
                                worker_index=WORKER_INDEX):
        step = step_provider.last_for_task(t.id)
        logger.error(
            f'Task Id = {t.id} was in InProgress state '
            f'when another task arrived at the same worker',
            ComponentType.Worker, t.computer_assigned, t.id, step)
        provider.change_status(t, TaskStatus.Failed)

    # Create dags
    dags = _dag(config, debug, params=params)
    for dag in dags:
        for ids in dag.values():
            for id in ids:
                task = provider.by_id(id)
                task.gpu_assigned = ','.join(
                    [str(i) for i in range(torch.cuda.device_count())])

                provider.commit()
                execute_by_id(id, exit=False)
Example #17
def execute(config: str, debug: bool):
    _create_computer()

    # Fail all InProgress Tasks
    logger = create_logger(_session, __name__)

    provider = TaskProvider(_session)
    step_provider = StepProvider(_session)

    for t in provider.by_status(TaskStatus.InProgress,
                                worker_index=WORKER_INDEX):
        step = step_provider.last_for_task(t.id)
        logger.error(
            f'Task Id = {t.id} was in InProgress state '
            f'when another task arrived at the same worker',
            ComponentType.Worker, t.computer_assigned, t.id, step)
        provider.change_status(t, TaskStatus.Failed)

    # Create dag
    created_dag = _dag(config, debug)
    for ids in created_dag.values():
        for id in ids:
            task = provider.by_id(id)
            task.gpu_assigned = ','.join(
                [str(i) for i, _ in enumerate(GPUtil.getGPUs())])

            provider.commit()
            execute_by_id(id, exit=False)
Example #18
def task_before_update(mapper, connection, target):
    target.last_activity = now()
    if target.parent:
        provider = TaskProvider(_session)
        parent = provider.by_id(target.parent)
        if parent is None:
            return

        parent.last_activity = target.last_activity

        try:
            provider.commit()
        except StaleDataError:
            pass
Example #19
def stop_all_dags():
    data = request_data()
    provider = TaskProvider(_write_session)
    tasks = provider.by_status(TaskStatus.InProgress,
                               TaskStatus.Queued,
                               TaskStatus.NotRan,
                               project=data['project'])

    for t in tasks:
        info = yaml_load(t.additional_info)
        info['stopped'] = True
        t.additional_info = yaml_dump(info)

    provider.update()
    supervisor.stop_tasks(tasks)
Example #20
    def create_base(self):
        self.session.commit()

        self.provider = TaskProvider(self.session)
        self.computer_provider = ComputerProvider(self.session)
        self.docker_provider = DockerProvider(self.session)
        self.auxiliary_provider = AuxiliaryProvider(self.session)
        self.dag_provider = DagProvider(self.session)

        self.queues = [
            f'{d.computer}_{d.name}' for d in self.docker_provider.all()
            if d.last_activity >= now() - datetime.timedelta(seconds=15)
        ]

        self.auxiliary['queues'] = self.queues
Example #21
    def work(self):
        project = ProjectProvider(self.session).by_id(self.project)

        self.info(f'Task = {self.train_task} child_task: {self.child_task}')

        model = Model(
            created=now(),
            name=self.name,
            project=self.project,
            equations='',
            fold=self.fold
        )

        provider = ModelProvider(self.session)
        if self.train_task:
            task_provider = TaskProvider(self.session)
            dag_provider = DagProvider(self.session)
            task = task_provider.by_id(self.train_task)
            dag = dag_provider.by_id(task.dag)

            task_dir = join(TASK_FOLDER, str(self.child_task or task.id))

            # get log directory
            config = yaml_load(dag.config)
            executor_config = config['executors'][task.executor]
            catalyst_config_file = executor_config['args']['config']
            catalyst_config_file = join(task_dir, catalyst_config_file)
            catalyst_config = yaml_load(file=catalyst_config_file)
            catalyst_logdir = catalyst_config['args']['logdir']

            model.score_local = task.score

            src_log = f'{task_dir}/{catalyst_logdir}'
            models_dir = join(MODEL_FOLDER, project.name)
            os.makedirs(models_dir, exist_ok=True)

            model_path_tmp = f'{src_log}/traced.pth'
            traced = trace_model_from_checkpoint(src_log, self, file=self.file)

            model_path = f'{models_dir}/{model.name}.pth'
            model_weight_path = f'{models_dir}/{model.name}_weight.pth'
            torch.jit.save(traced, model_path_tmp)
            shutil.copy(model_path_tmp, model_path)
            file = self.file = 'best_full'
            shutil.copy(f'{src_log}/checkpoints/{file}.pth',
                        model_weight_path)

        provider.add(model)
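
The log-directory lookup in this version goes through two levels of configuration: the dag config names a catalyst config file for the executor, and that file in turn names the logdir. A sketch of the shapes this code assumes, with hypothetical contents:

dag_config = {
    'executors': {
        'train': {
            'type': 'catalyst',
            'args': {'config': 'catalyst.yml'}   # path relative to task_dir
        }
    }
}
catalyst_config = {
    'args': {'logdir': 'log'}   # where catalyst writes checkpoints
}
# with task.executor == 'train', src_log resolves to f'{task_dir}/log'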
Example #22
def dag_model_add(session: Session, data: dict):
    if not data.get('task'):
        model = Model(name=data['name'],
                      project=data['project'],
                      equations=data['equations'],
                      created=now())
        ModelProvider(session).add(model)
        return

    task_provider = TaskProvider(session)
    task = task_provider.by_id(data['task'],
                               options=joinedload(Task.dag_rel,
                                                  innerjoin=True))
    child_tasks = task_provider.children(task.id)
    computer = task.computer_assigned
    child_task = None
    if len(child_tasks) > 0:
        child_task = child_tasks[0].id
        computer = child_tasks[0].computer_assigned

    project = ProjectProvider(session).by_id(task.dag_rel.project)
    config = {
        'info': {
            'name': 'model_add',
            'project': project.name,
            'computer': computer
        },
        'executors': {
            'model_add': {
                'type': 'model_add',
                'project': data['project'],
                'task': data.get('task'),
                'name': data['name'],
                'file': data['file'],
                'child_task': child_task,
                'fold': data['fold']
            }
        }
    }

    dag_standard(session=session,
                 config=config,
                 debug=False,
                 upload_files=False)
Example #23
    def work(self):
        task_provider = TaskProvider(self.session)
        task = task_provider.by_id(self.train_task)
        dag = DagProvider(self.session).by_id(self.dag_pipe,
                                              joined_load=[Dag.project_rel])

        task_dir = join(TASK_FOLDER, str(self.child_task or task.id))
        src_log = f'{task_dir}/log'
        models_dir = join(MODEL_FOLDER, dag.project_rel.name)
        os.makedirs(models_dir, exist_ok=True)

        self.info(f'Task = {self.train_task} child_task: {self.child_task}')

        model_path_tmp = f'{src_log}/traced.pth'
        traced = trace_model_from_checkpoint(src_log, self)

        model = Model(dag=self.dag_pipe,
                      interface=self.interface,
                      slot=self.slot,
                      score_local=task.score,
                      created=now(),
                      name=self.name,
                      project=dag.project,
                      interface_params=yaml_dump(self.interface_params))
        provider = ModelProvider(self.session)
        provider.add(model, commit=False)
        try:
            model_path = f'{models_dir}/{model.name}.pth'
            model_weight_path = f'{models_dir}/{model.name}_weight.pth'
            torch.jit.save(traced, model_path_tmp)
            shutil.copy(model_path_tmp, model_path)
            shutil.copy(f'{src_log}/checkpoints/best.pth', model_weight_path)

            interface_params = yaml_load(model.interface_params)
            interface_params['file'] = join('models', model.name + '.pth')
            model.interface_params = yaml_dump(interface_params)
            provider.update()
        except Exception as e:
            provider.rollback()
            raise e
Example #24
def dag_model_add(session: Session, data: dict):
    task_provider = TaskProvider(session)
    task = task_provider.by_id(data['task'],
                               options=joinedload(Task.dag_rel,
                                                  innerjoin=True))
    child_tasks = task_provider.children(task.id)
    computer = task.computer_assigned
    child_task = None
    if len(child_tasks) > 0:
        child_task = child_tasks[0].id
        computer = child_tasks[0].computer_assigned

    project = ProjectProvider(session).by_id(task.dag_rel.project)
    interface_params = data.get('interface_params', '')
    interface_params = yaml_load(interface_params)
    config = {
        'info': {
            'name': 'model_add',
            'project': project.name,
            'computer': computer
        },
        'executors': {
            'model_add': {
                'type': 'model_add',
                'dag': data['dag'],
                'slot': data['slot'],
                'interface': data['interface'],
                'task': data.get('task'),
                'name': data['name'],
                'interface_params': interface_params,
                'child_task': child_task
            }
        }
    }

    dag_standard(session=session,
                 config=config,
                 debug=False,
                 upload_files=False)
Example #25
def task_info():
    data = request_data()
    task = TaskProvider(_read_session).by_id(
        data['id'], joinedload(Task.dag_rel, innerjoin=True))
    return {
        'pid': task.pid,
        'worker_index': task.worker_index,
        'gpu_assigned': task.gpu_assigned,
        'celery_id': task.celery_id,
        'additional_info': task.additional_info or '',
        'result': task.result or '',
        'id': task.id
    }
Example #26
    def create_base(self):
        self.info('create_base')

        if app.current_task:
            app.current_task.update_state(state=states.SUCCESS)
            app.control.revoke(app.current_task.request.id, terminate=True)

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)
        self.executor_type = self.config['executors'][
            self.task.executor]['type']

        executor = self.config['executors'][self.task.executor]
        env = {'MKL_NUM_THREADS': 1, 'OMP_NUM_THREADS': 1}
        env.update(executor.get('env', {}))

        for k, v in env.items():
            os.environ[k] = str(v)
            self.info(f'Set env. {k} = {v}')
Example #27
def stop_processes_not_exist(session: Session, logger):
    provider = TaskProvider(session)
    hostname = socket.gethostname()
    tasks = provider.by_status(
        TaskStatus.InProgress,
        task_docker_assigned=DOCKER_IMG,
        computer_assigned=hostname
    )
    # Fail tasks whose recorded process no longer exists
    for t in tasks:
        if not psutil.pid_exists(t.pid):
            # the task may be retrying execution; allow a 30-second grace period
            if (now() - t.last_activity).total_seconds() < 30:
                continue

            os.system(f'kill -9 {t.pid}')
            t.status = TaskStatus.Failed.value
            logger.error(
                f'process with pid = {t.pid} does not exist. '
                f'Set task to failed state',
                ComponentType.WorkerSupervisor, hostname, t.id
            )

            provider.commit()

            additional_info = yaml_load(t.additional_info)
            for p in additional_info.get('child_processes', []):
                logger.info(f'killing child process = {p}')
                os.system(f'kill -9 {p}')

    # Kill processes which exist but should not
    processes = get_pid('worker ')
    ids = [p['PID'] for p in processes]
    tasks = provider.by_ids(ids)
    tasks = {t.pid: t for t in tasks}

    for p in processes:
        pid = p['PID']
        if pid in tasks:
            task = tasks[pid]
            if task.status in [TaskStatus.Stopped.value,
                               TaskStatus.Failed.value,
                               TaskStatus.Skipped.value]:

                logger.info(f'Kill processes which exist but should not. '
                            f'Pid = {pid}')
                os.system(f'kill -9 {pid}')
Example #28
class SegmentationReportBuilder:
    def __init__(self,
                 session: Session,
                 task: Task,
                 layout: str,
                 part: str = 'valid',
                 name: str = 'img_segment',
                 max_img_size: Tuple[int, int] = None,
                 stack_type: str = 'vertical',
                 main_metric: str = 'dice',
                 plot_count: int = 0,
                 colors: List[Tuple] = None):
        self.session = session
        self.task = task
        self.layout = layout
        self.part = part
        self.name = name or 'img_segment'
        self.max_img_size = max_img_size
        self.stack_type = stack_type
        self.main_metric = main_metric
        self.colors = colors
        self.plot_count = plot_count

        self.dag_provider = DagProvider(session)
        self.report_provider = ReportProvider(session)
        self.layout_provider = ReportLayoutProvider(session)
        self.task_provider = TaskProvider(session)
        self.report_img_provider = ReportImgProvider(session)
        self.report_task_provider = ReportTasksProvider(session)
        self.report_series_provider = ReportSeriesProvider(session)

        self.project = self.task_provider.project(task.id).id
        self.layout = self.layout_provider.by_name(layout)
        self.layout_dict = yaml_load(self.layout.content)

        self.create_base()

    def create_base(self):
        report = Report(config=yaml_dump(self.layout_dict),
                        time=now(),
                        layout=self.layout.name,
                        project=self.project,
                        name=self.name)
        self.report_provider.add(report)
        self.report_task_provider.add(
            ReportTasks(report=report.id, task=self.task.id))

        self.task.report = report.id
        self.task_provider.update()

    def encode_pred(self, mask: np.ndarray):
        res = np.zeros((*mask.shape[1:], 3), dtype=np.uint8)
        for i, c in enumerate(mask):
            c = np.repeat(c[:, :, None], 3, axis=2)
            color = self.colors[i] if self.colors is not None else (255, 255,
                                                                    255)
            res += (c * color).astype(np.uint8)

        return res

    def plot_mask(self, img: np.ndarray, mask: np.ndarray):
        if len(img.shape) == 2:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        img = img.astype(np.uint8)
        mask = mask.astype(np.uint8)

        for i, c in enumerate(mask):
            contours, _ = cv2.findContours(c, cv2.RETR_LIST,
                                           cv2.CHAIN_APPROX_NONE)
            color = self.colors[i] if self.colors else (0, 255, 0)
            for contour in contours:
                # wrap the contour in a list so polylines draws one closed
                # polygon rather than a series of single-point curves
                cv2.polylines(img, [contour], True, color, 2)

        return img

    def process_scores(self, scores):
        for key, item in self.layout_dict['items'].items():
            item['name'] = key
            if item['type'] == 'series' and item['key'] in scores:
                series = ReportSeries(name=item['name'],
                                      value=scores[item['key']],
                                      epoch=0,
                                      time=now(),
                                      task=self.task.id,
                                      part='valid',
                                      stage='stage1')

                self.report_series_provider.add(series)

    def process_pred(self,
                     imgs: np.ndarray,
                     preds: dict,
                     targets: np.ndarray = None,
                     attrs=None,
                     scores=None):
        for key, item in self.layout_dict['items'].items():
            item['name'] = key
            if item['type'] != 'img_segment':
                continue

            report_imgs = []
            dag = self.dag_provider.by_id(self.task.dag)

            for i in range(len(imgs)):
                if self.plot_count <= 0:
                    break

                if targets is not None:
                    img = self.plot_mask(imgs[i], targets[i])
                else:
                    img = imgs[i]

                imgs_add = [img]
                for pred in preds.values():
                    imgs_add.append(self.encode_pred(pred[i]))

                for j in range(len(imgs_add)):
                    imgs_add[j] = resize_saving_ratio(imgs_add[j],
                                                      self.max_img_size)

                if self.stack_type == 'horizontal':
                    img = np.hstack(imgs_add)
                else:
                    img = np.vstack(imgs_add)

                attr = attrs[i] if attrs else {}

                score = None
                if targets is not None:
                    score = scores[self.main_metric][i]

                retval, buffer = cv2.imencode('.jpg', img)
                report_img = ReportImg(group=item['name'],
                                       epoch=0,
                                       task=self.task.id,
                                       img=buffer,
                                       dag=self.task.dag,
                                       part=self.part,
                                       project=self.project,
                                       score=score,
                                       **attr)

                self.plot_count -= 1
                report_imgs.append(report_img)
                dag.img_size += report_img.size

            self.dag_provider.commit()
            self.report_img_provider.bulk_save_objects(report_imgs)
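
encode_pred above paints each channel of a (channels, height, width) binary mask with its configured color and sums the results into one RGB image. A self-contained sketch of the same composition, with made-up data:

import numpy as np

mask = np.zeros((2, 4, 4), dtype=np.uint8)   # two classes on a 4x4 grid
mask[0, :2] = 1                              # class 0 occupies the top half
mask[1, 2:] = 1                              # class 1 occupies the bottom half
colors = [(255, 0, 0), (0, 0, 255)]

res = np.zeros((*mask.shape[1:], 3), dtype=np.uint8)
for i, c in enumerate(mask):
    c = np.repeat(c[:, :, None], 3, axis=2)  # (4, 4) -> (4, 4, 3)
    res += (c * colors[i]).astype(np.uint8)

assert res.shape == (4, 4, 3)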
Example #29
def remove_dag(session: Session, id: int):
    tasks = TaskProvider(session).by_dag(id)
    for task in tasks:
        remove_task(session, task.id)
Example #30
class ExecuteBuilder:
    def __init__(self, id: int, repeat_count: int = 1, exit=True):
        self.session = Session.create_session(key='ExecuteBuilder')
        self.id = id
        self.repeat_count = repeat_count
        self.logger = create_logger(self.session, 'ExecuteBuilder')
        self.logger_db = create_logger(self.session,
                                       'ExecuteBuilder.db',
                                       console=False)
        self.exit = exit

        self.provider = None
        self.library_provider = None
        self.storage = None
        self.task = None
        self.dag = None
        self.executor = None
        self.hostname = None
        self.docker_img = None
        self.worker_index = None
        self.queue_personal = None
        self.config = None
        self.executor_type = None

    def info(self, msg: str, step=None):
        self.logger.info(msg, ComponentType.Worker, self.hostname, self.id,
                         step)

    def error(self, msg: str, step=None):
        self.logger.error(msg, ComponentType.Worker, self.hostname, self.id,
                          step)

    def warning(self, msg: str, step=None):
        self.logger.warning(msg, ComponentType.Worker, self.hostname, self.id,
                            step)

    def debug(self, msg: str, step=None):
        self.logger.debug(msg, ComponentType.Worker, self.hostname, self.id,
                          step)

    def create_base(self):
        self.info('create_base')

        if app.current_task:
            app.current_task.update_state(state=states.SUCCESS)
            app.control.revoke(app.current_task.request.id, terminate=True)

        self.provider = TaskProvider(self.session)
        self.library_provider = DagLibraryProvider(self.session)
        self.storage = Storage(self.session)

        self.task = self.provider.by_id(
            self.id, joinedload(Task.dag_rel, innerjoin=True))
        if not self.task:
            raise Exception(f'task with id = {self.id} is not found')

        self.dag = self.task.dag_rel
        self.executor = None
        self.hostname = socket.gethostname()

        self.docker_img = DOCKER_IMG
        self.worker_index = os.getenv('WORKER_INDEX', -1)

        self.queue_personal = f'{self.hostname}_{self.docker_img}_' \
                              f'{self.worker_index}'

        self.config = Config.from_yaml(self.dag.config)

        set_global_seed(self.config['info'].get('seed', 0))

        self.executor_type = self.config['executors'][
            self.task.executor]['type']

        executor = self.config['executors'][self.task.executor]

        cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', '')
        self.info(f'Env before execution: '
                  f'CUDA_VISIBLE_DEVICES={cuda_visible_devices}')

        if cuda_visible_devices.strip() != '':
            gpu_assigned = self.task.gpu_assigned or ''

            cuda_visible_devices = cuda_visible_devices.split(',')
            cuda_visible_devices = ','.join([
                cuda_visible_devices[int(g)] for g in gpu_assigned.split(',')
                if g.strip() != ''
            ])
        else:
            cuda_visible_devices = self.task.gpu_assigned

        cuda_visible_devices = cuda_visible_devices or ''

        env = {
            'MKL_NUM_THREADS': 1,
            'OMP_NUM_THREADS': 1,
            'CUDA_VISIBLE_DEVICES': cuda_visible_devices
        }
        env.update(executor.get('env', {}))

        for k, v in env.items():
            os.environ[k] = str(v)
            self.info(f'Set env. {k} = {v}')

    def check_status(self):
        self.info('check_status')

        assert self.dag is not None, 'You must fetch task with dag_rel'

        if self.task.status >= TaskStatus.InProgress.value:
            msg = f'Task = {self.task.id}. Status = {self.task.status}, ' \
                  f'before the execute_by_id invocation.'
            if app.current_task:
                msg += f' Request Id = {app.current_task.request.id}'
            self.error(msg)
            return True

    def change_status(self):
        self.info('change_status')

        self.task.computer_assigned = self.hostname
        self.task.pid = os.getpid()
        self.task.worker_index = self.worker_index
        self.task.docker_assigned = self.docker_img
        self.provider.change_status(self.task, TaskStatus.InProgress)

    def download(self):
        self.info('download')

        if not self.task.debug:
            folder = self.storage.download(task=self.id)
        else:
            folder = os.getcwd()

        os.chdir(folder)

        libraries = self.library_provider.dag(self.task.dag)
        executor_type = self.executor_type

        self.info('download. folder changed')

        mlcomp_executors_folder = join(dirname(abspath(__file__)), 'executors')
        mlcomp_base_folder = os.path.abspath(
            join(mlcomp_executors_folder, '../../../'))

        imported, was_installation = self.storage.import_executor(
            mlcomp_executors_folder, mlcomp_base_folder, executor_type)

        if not imported:
            imported, was_installation = self.storage.import_executor(
                folder, folder, executor_type, libraries)

            if not imported:
                raise Exception(f'Executor = {executor_type} not found')

        self.info('download. executor imported')

        if was_installation and not self.task.debug:
            if self.repeat_count > 0:
                self.info('was installation. '
                          'set task status to Queued. '
                          'And resending the task to a queue')
                self.task.status = TaskStatus.Queued.value
                self.provider.commit()

                try:
                    execute.apply_async((self.id, self.repeat_count - 1),
                                        queue=self.queue_personal,
                                        retry=False)
                except Exception:
                    pass
                finally:
                    sys.exit()

        assert Executor.is_registered(executor_type), \
            f'Executor {executor_type} was not found'

    def create_executor(self):
        self.info('create_executor')

        additional_info = yaml_load(self.task.additional_info) \
            if self.task.additional_info else dict()
        self.executor = Executor.from_config(executor=self.task.executor,
                                             config=self.config,
                                             additional_info=additional_info,
                                             session=self.session,
                                             logger=self.logger,
                                             logger_db=self.logger_db)

    def execute(self):
        self.info('execute start')

        res = self.executor(task=self.task,
                            task_provider=self.provider,
                            dag=self.dag)
        self.info('execute executor finished')

        res = res or {}
        self.task.result = yaml_dump(res)
        self.provider.commit()

        if 'stage' in res and 'stages' in res:
            index = res['stages'].index(res['stage'])
            if index < len(res['stages']) - 1:
                self.executor.info(f'stage = {res["stage"]} done. '
                                   f'Go to the stage = '
                                   f'{res["stages"][index + 1]}')

                time.sleep(3)

                self.executor.info(f'sending {(self.id, self.repeat_count)} '
                                   f'to {self.queue_personal}')

                self.task.status = TaskStatus.Queued.value
                self.provider.commit()

                execute.apply_async((self.id, self.repeat_count),
                                    queue=self.queue_personal,
                                    retry=False)
                return

        self.executor.step.finish()
        self.provider.change_status(self.task, TaskStatus.Success)

        self.info('execute end')

    def build(self):
        try:
            self.create_base()

            bad_status = self.check_status()
            if bad_status:
                return

            self.change_status()

            self.download()

            self.create_executor()

            self.execute()

        except Exception as e:
            step = self.executor.step.id if \
                (self.executor and self.executor.step) else None

            if Session.sqlalchemy_error(e):
                Session.cleanup(key='ExecuteBuilder')
                self.session = Session.create_session(key='ExecuteBuilder')
                self.logger = create_logger(self.session, 'ExecuteBuilder')

            self.error(traceback.format_exc(), step)
            if self.task.status <= TaskStatus.InProgress.value:
                self.provider.change_status(self.task, TaskStatus.Failed)
            raise e
        finally:
            if app.current_task:
                app.close()

            if self.exit:
                # noinspection PyProtectedMember
                os._exit(0)
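
build() strings the whole lifecycle together: create_base, check_status, change_status, download, create_executor, execute. A minimal driver, which appears to be roughly what execute_by_id in Examples #16 and #17 wraps (task_id is a placeholder):

# exit=False skips os._exit(0), so the calling process keeps running
builder = ExecuteBuilder(id=task_id, repeat_count=1, exit=False)
builder.build()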