Example #1
    def __init__(self,
                 session: Session,
                 task: Task,
                 layout: str,
                 part: str = 'valid',
                 name: str = 'img_classify',
                 max_img_size: Tuple[int, int] = None,
                 main_metric: str = 'accuracy',
                 plot_count: int = 0):
        self.session = session
        self.task = task
        self.layout = layout
        self.part = part
        self.name = name or 'img_classify'
        self.max_img_size = max_img_size
        self.main_metric = main_metric
        self.plot_count = plot_count

        self.dag_provider = DagProvider(session)
        self.report_provider = ReportProvider(session)
        self.layout_provider = ReportLayoutProvider(session)
        self.task_provider = TaskProvider(session)
        self.report_img_provider = ReportImgProvider(session)
        self.report_task_provider = ReportTasksProvider(session)
        self.report_series_provider = ReportSeriesProvider(session)

        self.project = self.task_provider.project(task.id).id
        self.layout = self.layout_provider.by_name(layout)
        self.layout_dict = yaml_load(self.layout.content)
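
A minimal usage sketch: the enclosing class name (ImgClassifyReport here) and
the origin of `session` and `task` are assumptions, not verified source.

def build_report(session: Session, task: Task):
    # Hypothetical helper; ImgClassifyReport is an assumed class name,
    # and the constructor arguments mirror the ones shown above.
    return ImgClassifyReport(session=session,
                             task=task,
                             layout='img_classify',
                             part='valid',
                             plot_count=10)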
Example #2
    def create_providers(self):
        self.log_info('create_providers')

        self.dag_provider = DagProvider(self.session)
        self.task_provider = TaskProvider(self.session)
        self.file_provider = FileProvider(self.session)
        self.dag_storage_provider = DagStorageProvider(self.session)
Example #3
def dag_pipe(session: Session, config: dict, config_text: str = None):
    assert 'interfaces' in config, 'interfaces are missing'
    assert 'pipes' in config, 'pipes are missing'

    info = config['info']

    storage = Storage(session)
    dag_provider = DagProvider(session)

    folder = os.getcwd()
    project = ProjectProvider(session).by_name(info['project']).id
    dag = dag_provider.add(
        Dag(
            config=config_text,
            project=project,
            name=info['name'],
            docker_img=info.get('docker_img'),
            type=DagType.Pipe.value
        )
    )
    storage.upload(folder, dag)

    # Change model dags which have the same name
    ModelProvider(session).change_dag(
        project=project, name=info['name'], to=dag.id)
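
A hedged usage sketch for dag_pipe: the config below is illustrative only,
with keys mirroring the assertions and lookups above.

config = {
    'info': {'name': 'my_pipe', 'project': 'demo', 'docker_img': None},
    'interfaces': {},
    'pipes': {'infer': {}},  # hypothetical pipe contents
}
dag_pipe(session, config, config_text=yaml_dump(config))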
Example #4
    def __init__(self,
                 session: Session,
                 task: Task,
                 layout: str,
                 part: str = 'valid',
                 name: str = 'img_segment',
                 max_img_size: Tuple[int, int] = None,
                 stack_type: str = 'vertical',
                 main_metric: str = 'dice',
                 plot_count: int = 0,
                 colors: List[Tuple] = None):
        self.session = session
        self.task = task
        self.layout = layout
        self.part = part
        self.name = name or 'img_segment'
        self.max_img_size = max_img_size
        self.stack_type = stack_type
        self.main_metric = main_metric
        self.colors = colors
        self.plot_count = plot_count

        self.dag_provider = DagProvider(session)
        self.report_provider = ReportProvider(session)
        self.layout_provider = ReportLayoutProvider(session)
        self.task_provider = TaskProvider(session)
        self.report_img_provider = ReportImgProvider(session)
        self.report_task_provider = ReportTasksProvider(session)
        self.report_series_provider = ReportSeriesProvider(session)

        self.project = self.task_provider.project(task.id).id
        self.layout = self.layout_provider.by_name(layout)
        self.layout_dict = yaml_load(self.layout.content)

        self.create_base()
Example #5
def dag_stop():
    data = request_data()
    provider = DagProvider(_write_session)
    id = int(data['id'])
    dag = provider.by_id(id, joined_load=['tasks'])
    for t in dag.tasks:
        celery_tasks.stop(logger, _write_session, t, dag)
    return {'dag': provider.get({'id': id})['data'][0]}
Example #6
def dag_model_start(session: Session, data: dict):
    provider = ModelProvider(session)
    model = provider.by_id(data['model_id'])
    dag_provider = DagProvider(session)
    dag = dag_provider.by_id(data['dag'], joined_load=[Dag.project_rel])

    project = dag.project_rel
    src_config = Config.from_yaml(dag.config)
    pipe = src_config['pipes'][data['pipe']['name']]

    equations = yaml_load(model.equations)
    versions = data['pipe']['versions']

    if len(versions) > 0:
        version = data['pipe']['version']
        pipe_equations = yaml_load(version['equations'])
        found_version = versions[0]
        for v in versions:
            if v['name'] == version['name']:
                found_version = v
                break

        found_version['used'] = now()

        if len(pipe) == 1:
            pipe[list(pipe)[0]].update(pipe_equations)
        else:
            pipe.update(pipe_equations)

    equations[data['pipe']['name']] = versions
    model.equations = yaml_dump(equations)

    for v in pipe.values():
        v['model_id'] = model.id
        v['model_name'] = model.name

    config = {
        'info': {
            'name': data['pipe']['name'],
            'project': project.name
        },
        'executors': pipe
    }

    if model.dag:
        old_dag = dag_provider.by_id(model.dag)
        if old_dag.name != dag.name:
            model.dag = dag.id
    else:
        model.dag = dag.id

    provider.commit()

    dag_standard(session=session,
                 config=config,
                 debug=False,
                 upload_files=False,
                 copy_files_from=data['dag'])
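
The payload shape this handler expects can be reconstructed from the keys it
reads; every value below is hypothetical.

data = {
    'model_id': 1,
    'dag': 2,
    'pipe': {
        'name': 'infer',
        'version': {'name': 'v1', 'equations': 'threshold: 0.5'},
        'versions': [{'name': 'v1', 'equations': 'threshold: 0.5'}],
    },
}
dag_model_start(session, data)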
Example #7
    def create_providers(self):
        self.provider = TaskProvider(self.session)
        self.report_provider = ReportProvider(self.session)
        self.report_tasks_provider = ReportTasksProvider(self.session)
        self.report_layout_provider = ReportLayoutProvider(self.session)
        self.project_provider = ProjectProvider(self.session)

        self.storage = Storage(self.session)
        self.dag_provider = DagProvider(self.session)
Example #8
def dag_stop():
    data = request_data()
    provider = TaskProvider(_write_session)
    id = int(data['id'])
    tasks = provider.by_dag(id)

    supervisor.stop_tasks(tasks)

    dag_provider = DagProvider(_write_session)
    return {'dag': dag_provider.get({'id': id})['data'][0]}
Example #9
def describe_dag(dag, axis):
    provider = DagProvider()
    graph = provider.graph(dag)

    status_colors = {
        'not_ran': '#808080',
        'queued': '#add8e6',
        'in_progress': '#bfff00',
        'failed': '#e83217',
        'stopped': '#cb88ea',
        'skipped': '#ffa500',
        'success': '#006400'
    }
    node_color = []
    edge_color = []

    G = nx.DiGraph()
    labels = dict()
    for n in graph['nodes']:
        G.add_node(n['id'])
        labels[n['id']] = n['id']
        node_color.append(status_colors[n['status']])

    edges = []
    for e in graph['edges']:
        G.add_edge(e['from'], e['to'])
        edges.append((e['from'], e['to']))
        edge_color.append(status_colors[e['status']])

    pos = nx.spring_layout(G, seed=0)
    nx.draw_networkx_nodes(G,
                           pos,
                           node_color=node_color,
                           ax=axis,
                           node_size=2000)
    nx.draw_networkx_labels(G,
                            pos,
                            labels,
                            ax=axis,
                            font_color='orange',
                            font_weight='bold',
                            font_size=18)
    nx.draw_networkx_edges(G,
                           pos,
                           edgelist=edges,
                           edge_color=edge_color,
                           arrows=True,
                           arrowsize=80,
                           ax=axis)

    axis.set_xticks([])
    axis.axis('off')
    axis.set_title('Graph')
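
A minimal sketch of calling describe_dag; the dag id is an assumption and
matplotlib supplies the axis.

import matplotlib.pyplot as plt

fig, axis = plt.subplots(figsize=(12, 8))
describe_dag(1, axis)  # dag id 1 is assumed to exist in the database
plt.show()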
Example #10
    def create_providers(self):
        self.log_info('create_providers')

        self.provider = TaskProvider(self.session)
        self.report_provider = ReportProvider(self.session)
        self.report_tasks_provider = ReportTasksProvider(self.session)
        self.report_layout_provider = ReportLayoutProvider(self.session)
        self.project_provider = ProjectProvider(self.session)

        self.storage = Storage(self.session,
                               logger=self.logger,
                               component=self.component)
        self.dag_provider = DagProvider(self.session)
Example #11
    def __init__(self, session: Session, logger=None,
                 component: ComponentType = None,
                 max_file_size: int = 10 ** 5, max_count=10 ** 3):
        self.file_provider = FileProvider(session)
        self.provider = DagStorageProvider(session)
        self.task_provider = TaskProvider(session)
        self.library_provider = DagLibraryProvider(session)
        self.dag_provider = DagProvider(session)

        self.logger = logger
        self.component = component
        self.max_file_size = max_file_size
        self.max_count = max_count
Example #12
    def create_base(self):
        self.session.commit()

        self.provider = TaskProvider(self.session)
        self.computer_provider = ComputerProvider(self.session)
        self.docker_provider = DockerProvider(self.session)
        self.auxiliary_provider = AuxiliaryProvider(self.session)
        self.dag_provider = DagProvider(self.session)

        self.queues = [
            f'{d.computer}_{d.name}' for d in self.docker_provider.all()
            if d.last_activity >= now() - datetime.timedelta(seconds=15)
        ]

        self.auxiliary['queues'] = self.queues
Example #13
    def work(self):
        project = ProjectProvider(self.session).by_id(self.project)

        self.info(f'Task = {self.train_task} child_task: {self.child_task}')

        model = Model(
            created=now(),
            name=self.name,
            project=self.project,
            equations='',
            fold=self.fold
        )

        provider = ModelProvider(self.session)
        if self.train_task:
            task_provider = TaskProvider(self.session)
            dag_provider = DagProvider(self.session)
            task = task_provider.by_id(self.train_task)
            dag = dag_provider.by_id(task.dag)

            task_dir = join(TASK_FOLDER, str(self.child_task or task.id))

            # get log directory
            config = yaml_load(dag.config)
            executor_config = config['executors'][task.executor]
            catalyst_config_file = executor_config['args']['config']
            catalyst_config_file = join(task_dir, catalyst_config_file)
            catalyst_config = yaml_load(file=catalyst_config_file)
            catalyst_logdir = catalyst_config['args']['logdir']

            model.score_local = task.score

            src_log = f'{task_dir}/{catalyst_logdir}'
            models_dir = join(MODEL_FOLDER, project.name)
            os.makedirs(models_dir, exist_ok=True)

            model_path_tmp = f'{src_log}/traced.pth'
            traced = trace_model_from_checkpoint(src_log, self, file=self.file)

            model_path = f'{models_dir}/{model.name}.pth'
            model_weight_path = f'{models_dir}/{model.name}_weight.pth'
            torch.jit.save(traced, model_path_tmp)
            shutil.copy(model_path_tmp, model_path)
            file = self.file = 'best_full'
            shutil.copy(f'{src_log}/checkpoints/{file}.pth',
                        model_weight_path)

        provider.add(model)
Example #14
def code_download():
    id = int(request.args['id'])
    storage = Storage(_read_session)
    dag = DagProvider().by_id(id)
    folder = os.path.join(TMP_FOLDER, f'{dag.id}({dag.name})')

    try:
        storage.download_dag(id, folder)

        file_name = f'{dag.id}({dag.name}).zip'
        dst = os.path.join(TMP_FOLDER, file_name)
        zip_folder(folder, dst)
        res = send_from_directory(TMP_FOLDER, file_name)
        os.remove(dst)
        return res
    finally:
        shutil.rmtree(folder, ignore_errors=True)
Example #15
def dag_model_start(session: Session, data: dict):
    provider = ModelProvider(session)
    model = provider.by_id(data['model_id'])
    dag = DagProvider(session).by_id(
        data['dag'], joined_load=[Dag.project_rel])

    project = dag.project_rel
    src_config = Config.from_yaml(dag.config)
    pipe = src_config['pipes'][data['pipe']]
    for k, v in pipe.items():
        if v.get('slot') != data['slot']:
            continue
        params = yaml_load(data['interface_params'])
        slot = {
            'interface': data['interface'],
            'interface_params': params,
            'slot': k,
            'name': model.name,
            'id': data['model_id']
        }
        v['slot'] = slot

    config = {
        'info': {
            'name': data['pipe'],
            'project': project.name
        },
        'executors': pipe
    }

    dag_standard(
        session=session,
        config=config,
        debug=False,
        upload_files=False,
        copy_files_from=data['dag']
    )

    model.dag = data['dag']
    model.interface = data['interface']
    model.interface_params = data['interface_params']
    model.slot = data['slot']

    provider.commit()
Example #16
    def work(self):
        task_provider = TaskProvider(self.session)
        task = task_provider.by_id(self.train_task)
        dag = DagProvider(self.session).by_id(self.dag_pipe,
                                              joined_load=[Dag.project_rel])

        task_dir = join(TASK_FOLDER, str(self.child_task or task.id))
        src_log = f'{task_dir}/log'
        models_dir = join(MODEL_FOLDER, dag.project_rel.name)
        os.makedirs(models_dir, exist_ok=True)

        self.info(f'Task = {self.task} child_task: {self.child_task}')

        model_path_tmp = f'{src_log}/traced.pth'
        traced = trace_model_from_checkpoint(src_log, self)

        model = Model(dag=self.dag_pipe,
                      interface=self.interface,
                      slot=self.slot,
                      score_local=task.score,
                      created=now(),
                      name=self.name,
                      project=dag.project,
                      interface_params=yaml_dump(self.interface_params))
        provider = ModelProvider(self.session)
        provider.add(model, commit=False)
        try:
            model_path = f'{models_dir}/{model.name}.pth'
            model_weight_path = f'{models_dir}/{model.name}_weight.pth'
            torch.jit.save(traced, model_path_tmp)
            shutil.copy(model_path_tmp, model_path)
            shutil.copy(f'{src_log}/checkpoints/best.pth', model_weight_path)

            interface_params = yaml_load(model.interface_params)
            interface_params['file'] = join('models', model.name + '.pth')
            model.interface_params = yaml_dump(interface_params)
            provider.update()
        except Exception as e:
            provider.rollback()
            raise e
Example #17
class DagCopyBuilder:
    def __init__(self,
                 session: Session,
                 dag: int,
                 file_changes: str = '',
                 dag_suffix: str = '',
                 logger=None,
                 component: ComponentType = None):
        self.dag = dag
        self.file_changes = file_changes
        self.session = session
        self.logger = logger
        self.component = component
        self.dag_suffix = dag_suffix

        self.dag_db = None

        self.dag_provider = None
        self.task_provider = None
        self.file_provider = None
        self.dag_storage_provider = None

    def log_info(self, message: str):
        if self.logger:
            self.logger.info(message, self.component)

    def create_providers(self):
        self.log_info('create_providers')

        self.dag_provider = DagProvider(self.session)
        self.task_provider = TaskProvider(self.session)
        self.file_provider = FileProvider(self.session)
        self.dag_storage_provider = DagStorageProvider(self.session)

    def create_dag(self):
        dag = self.dag_provider.by_id(self.dag)
        name = dag.name
        if self.dag_suffix:
            name += ' ' + self.dag_suffix
        dag_new = Dag(name=name,
                      created=now(),
                      config=dag.config,
                      project=dag.project,
                      docker_img=dag.docker_img,
                      img_size=0,
                      file_size=0,
                      type=dag.type)
        self.dag_provider.add(dag_new)
        self.dag_db = dag_new

    def find_replace(self, changes: dict, path: str):
        for k, v in changes.items():
            if not re.match(k, path):
                continue
            return v

    def create_tasks(self):
        tasks = self.task_provider.by_dag(self.dag)
        tasks_new = []
        tasks_old = []

        for t in tasks:
            if t.parent:
                continue

            task = Task(
                name=t.name,
                status=TaskStatus.NotRan.value,
                computer=t.computer,
                gpu=t.gpu,
                gpu_max=t.gpu_max,
                cpu=t.cpu,
                executor=t.executor,
                memory=t.memory,
                steps=t.steps,
                dag=self.dag_db.id,
                debug=t.debug,
                type=t.type,
            )
            task.additional_info = t.additional_info
            tasks_new.append(task)
            tasks_old.append(t)

        self.task_provider.bulk_save_objects(tasks_new, return_defaults=True)
        old2new = {
            t_old.id: t_new.id
            for t_new, t_old in zip(tasks_new, tasks_old)
        }
        dependencies = self.task_provider.get_dependencies(self.dag)
        dependencies_new = []
        for d in dependencies:
            d_new = TaskDependence(task_id=old2new[d.task_id],
                                   depend_id=old2new[d.depend_id])
            dependencies_new.append(d_new)

        self.task_provider.bulk_save_objects(dependencies_new,
                                             return_defaults=False)

        changes = yaml_load(self.file_changes)
        storages = self.dag_storage_provider.by_dag(self.dag)
        storages_new = []

        for s, f in storages:
            replace = None
            if isinstance(changes, dict):
                replace = self.find_replace(changes, s.path)

            if replace is not None and f:
                content = f.content.decode('utf-8')
                if s.path.endswith('.yml'):
                    data = yaml_load(content)
                    data = merge_dicts_smart(data, replace)
                    content = yaml_dump(data)
                else:
                    for k, v in replace.items():
                        if k not in content:
                            raise Exception(f'{k} is not in the content')
                        content = content.replace(k, v)
                content = content.encode('utf-8')
                md5 = hashlib.md5(content).hexdigest()
                f = self.file_provider.by_md5(md5)
                if not f:
                    f = File(content=content,
                             created=now(),
                             project=self.dag_db.project,
                             md5=md5,
                             dag=self.dag_db.id)
                self.file_provider.add(f)

            s_new = DagStorage(dag=self.dag_db.id,
                               file=f.id,
                               path=s.path,
                               is_dir=s.is_dir)
            storages_new.append(s_new)

        self.dag_storage_provider.bulk_save_objects(storages_new,
                                                    return_defaults=False)

    def build(self):
        self.create_providers()
        self.create_dag()
        self.create_tasks()
Example #18
def dag_remove():
    id = request_data()['id']
    celery_tasks.remove_dag(_write_session, id)
    DagProvider(_write_session).remove(id)
Example #19
def dag_start():
    data = request_data()
    provider = DagProvider(_write_session)
    task_provider = TaskProvider(_write_session)

    id = int(data['id'])
    dag = provider.by_id(id, joined_load=['tasks'])
    can_start_statuses = [
        TaskStatus.Failed.value, TaskStatus.Skipped.value,
        TaskStatus.Stopped.value
    ]

    tasks = list(dag.tasks)

    def find_resume(task):
        children = task_provider.children(task.id)
        children = sorted(children, key=lambda x: x.id, reverse=True)

        if len(children) > 0:
            for c in children:
                if c.parent != task.id:
                    continue

                info = yaml_load(c.additional_info)
                if 'distr_info' not in info:
                    continue

                if info['distr_info']['rank'] == 0:
                    return {
                        'master_computer': c.computer_assigned,
                        'master_task_id': c.id,
                        'load_last': True
                    }
            raise Exception('Master task not found')
        else:
            return {
                'master_computer': task.computer_assigned,
                'master_task_id': task.id,
                'load_last': True
            }

    for t in tasks:
        if t.status not in can_start_statuses:
            continue

        if t.parent:
            continue

        info = yaml_load(t.additional_info)
        info['resume'] = find_resume(t)
        t.additional_info = yaml_dump(info)

        t.status = TaskStatus.NotRan.value
        t.pid = None
        t.started = None
        t.finished = None
        t.computer_assigned = None
        t.celery_id = None
        t.worker_index = None
        t.docker_assigned = None

    provider.commit()
Example #20
def dag_tag_add():
    data = request_data()
    provider = DagProvider(_write_session)
    tag = DagTag(dag=data['dag'], tag=data['tag'])
    provider.add(tag)
Example #21
def graph():
    id = request_data()
    res = DagProvider(_read_session).graph(id)
    return res
Example #22
def config():
    id = request_data()
    res = DagProvider(_read_session).config(id)
    return {'data': res}
Example #23
class DagStandardBuilder:
    def __init__(self,
                 session: Session,
                 config: dict,
                 debug: bool,
                 config_text: str = None,
                 upload_files: bool = True,
                 copy_files_from: int = None,
                 config_path: str = None,
                 control_reqs: bool = True,
                 logger=None,
                 component: ComponentType = None):
        self.session = session
        self.config = config
        self.debug = debug
        self.config_text = config_text
        self.upload_files = upload_files
        self.copy_files_from = copy_files_from
        self.config_path = config_path
        self.control_reqs = control_reqs

        self.info = config['info']
        self.layout_name = self.info.get('layout')

        self.provider = None
        self.report_provider = None
        self.report_tasks_provider = None
        self.report_layout_provider = None
        self.storage = None
        self.dag_provider = None
        self.logger = logger
        self.component = component

        self.project = None
        self.layouts = None
        self.dag = None
        self.dag_report_id = None
        self.created = None
        self.project_provider = None

    def log_info(self, message: str):
        if self.logger:
            self.logger.info(message, self.component)

    def create_providers(self):
        self.log_info('create_providers')

        self.provider = TaskProvider(self.session)
        self.report_provider = ReportProvider(self.session)
        self.report_tasks_provider = ReportTasksProvider(self.session)
        self.report_layout_provider = ReportLayoutProvider(self.session)
        self.project_provider = ProjectProvider(self.session)

        self.storage = Storage(self.session,
                               logger=self.logger,
                               component=self.component)
        self.dag_provider = DagProvider(self.session)

    def load_base(self):
        self.log_info('load_base')

        project = self.project_provider.by_name(self.info['project'])
        if project is None:
            project = self.project_provider.add_project(self.info['project'])

        self.project = project.id
        self.layouts = self.report_layout_provider.all()

    def create_report(self):
        self.log_info('create_report')

        self.dag_report_id = None
        layout_name = self.layout_name
        if layout_name:
            if layout_name not in self.layouts:
                raise Exception(f'Unknown layout = {layout_name}')

            report = Report(config=yaml_dump(self.layouts[layout_name]),
                            name=self.info['name'],
                            project=self.project,
                            layout=layout_name)
            self.report_provider.add(report)
            self.dag_report_id = report.id

    def create_dag(self):
        self.log_info('create_dag')

        dag = Dag(config=self.config_text or yaml_dump(self.config),
                  project=self.project,
                  name=self.info['name'],
                  docker_img=self.info.get('docker_img'),
                  type=DagType.Standard.value,
                  created=now(),
                  report=self.dag_report_id)

        self.dag = self.dag_provider.add(dag)

    def upload(self):
        self.log_info('upload')

        if self.upload_files:
            folder = os.path.dirname(os.path.abspath(self.config_path))
            if 'expdir' in self.config['info']:
                path = os.path.dirname(os.path.abspath(self.config_path))
                folder = os.path.abspath(
                    os.path.join(path, self.config['info']['expdir']))
            self.storage.upload(folder,
                                self.dag,
                                control_reqs=self.control_reqs)
        elif self.copy_files_from:
            self.storage.copy_from(self.copy_files_from, self.dag)

    def create_task(self, k: str, v: dict, name: str, info: dict):
        task_type = TaskType.User.value
        if v.get('task_type') == 'train' or \
                Executor.is_trainable(v['type']):
            task_type = TaskType.Train.value

        gpu = str(v.get('gpu', '0'))
        if '-' not in gpu:
            gpu = int(gpu)
            gpu_max = gpu
        else:
            gpu, gpu_max = map(int, gpu.split('-'))

        if gpu == 0 and gpu_max > 0:
            raise Exception(
                f'Executor = {k}: gpu_max cannot be > 0 when gpu = 0')

        task = Task(name=name,
                    executor=k,
                    computer=self.info.get('computer') or v.get('computer'),
                    gpu=gpu,
                    gpu_max=gpu_max,
                    cpu=v.get('cpu', 1),
                    memory=v.get('memory', 0.1),
                    dag=self.dag.id,
                    debug=self.debug,
                    steps=int(v.get('steps', '1')),
                    type=task_type)
        task.additional_info = yaml_dump(info)
        report = None
        if self.layout_name and task_type == TaskType.Train.value:
            if self.layout_name not in self.layouts:
                raise Exception(f'Unknown layout = {self.layout_name}')

            report_config = self.layouts[self.layout_name]
            info['report_config'] = report_config

            task.additional_info = yaml_dump(info)
            report = Report(config=yaml_dump(report_config),
                            name=task.name,
                            project=self.project,
                            layout=self.layout_name)

        return task, report

    def create_tasks(self):
        self.log_info('create_tasks')

        created = OrderedDict()
        executors = self.config['executors']

        tasks = []
        dependencies = []
        reports = []

        while len(created) < len(executors):
            for k, v in executors.items():
                valid = True
                if 'depends' in v:
                    depends = v['depends']
                    if not isinstance(depends, list):
                        depends = [depends]

                    for d in depends:
                        if d == k:
                            raise Exception(f'Executor {k} depends on itself')

                        if d not in executors:
                            raise Exception(f'Executor {k} depend on {d} '
                                            f'which does not exist')

                        valid = valid and d in created
                if valid:
                    names = []
                    infos = []
                    if 'grid' in v:
                        grid = v['grid']
                        cells = grid_cells(grid)
                        for i, (cell, cell_name) in enumerate(cells):
                            names.append(cell_name)
                            infos.append({'grid_cell': i})
                    else:
                        names.append(v.get('name', k))
                        infos.append({})

                    k_tasks = []
                    for name, info in zip(names, infos):
                        task, report = self.create_task(k,
                                                        v,
                                                        name=name,
                                                        info=info)
                        tasks.append(task)
                        k_tasks.append(task)
                        reports.append(report)

                        if 'depends' in v:
                            depends = v['depends']
                            if not isinstance(depends, list):
                                depends = [depends]

                            for d in depends:
                                for dd in created[d]:
                                    dependencies.append((task, dd))
                    created[k] = k_tasks

        not_empty_reports = [r for r in reports if r is not None]
        if len(not_empty_reports) > 0:
            self.provider.bulk_save_objects(not_empty_reports,
                                            return_defaults=True)
            for report, task in zip(reports, tasks):
                if report is not None:
                    task.report = report.id

        self.provider.bulk_save_objects(tasks, return_defaults=True)

        if len(not_empty_reports) > 0:
            report_tasks = []
            for report, task in zip(reports, tasks):
                if report is not None:
                    report_tasks.append(
                        ReportTasks(report=report.id, task=task.id))
            self.report_tasks_provider.bulk_save_objects(report_tasks)

        dependencies = [
            TaskDependence(task_id=task.id, depend_id=dd.id)
            for task, dd in dependencies
        ]
        self.provider.bulk_save_objects(dependencies)

        for k, v in created.items():
            created[k] = [vv.id for vv in v]
        self.created = created

    def build(self):
        self.create_providers()

        self.load_base()

        self.create_report()

        self.create_dag()

        self.upload()

        self.create_tasks()

        self.log_info('Done')

        return self.created
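
Several examples above call dag_standard(...); a plausible thin wrapper over
DagStandardBuilder might look like the sketch below (not the verified source).

def dag_standard(session: Session, config: dict, debug: bool, **kwargs):
    # Sketch: forward everything to the builder and return the created tasks.
    return DagStandardBuilder(session, config, debug, **kwargs).build()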
Example #24
class SupervisorBuilder:
    def __init__(self):
        self.session = Session.create_session(key='SupervisorBuilder')
        self.logger = create_logger(self.session, 'SupervisorBuilder')
        self.provider = None
        self.computer_provider = None
        self.docker_provider = None
        self.auxiliary_provider = None
        self.dag_provider = None
        self.queues = None
        self.not_ran_tasks = None
        self.dep_status = None
        self.computers = None
        self.auxiliary = {}

        self.tasks = []
        self.tasks_stop = []
        self.dags_start = []
        self.sent_tasks = 0

    def create_base(self):
        self.session.commit()

        self.provider = TaskProvider(self.session)
        self.computer_provider = ComputerProvider(self.session)
        self.docker_provider = DockerProvider(self.session)
        self.auxiliary_provider = AuxiliaryProvider(self.session)
        self.dag_provider = DagProvider(self.session)

        self.queues = [
            f'{d.computer}_{d.name}' for d in self.docker_provider.all()
            if d.last_activity >= now() - datetime.timedelta(seconds=15)
        ]

        self.auxiliary['queues'] = self.queues

    def load_tasks(self):
        self.tasks = self.provider.by_status(TaskStatus.NotRan,
                                             TaskStatus.InProgress,
                                             TaskStatus.Queued)

        not_ran_tasks = [t for t in self.tasks if
                         t.status == TaskStatus.NotRan.value]

        self.not_ran_tasks = [task for task in not_ran_tasks if not task.debug]
        self.not_ran_tasks = sorted(
            self.not_ran_tasks, key=lambda x: x.gpu or 0,
            reverse=True)

        self.logger.debug(
            f'Found {len(not_ran_tasks)} not ran tasks',
            ComponentType.Supervisor
        )

        self.dep_status = self.provider.dependency_status(self.not_ran_tasks)

        self.auxiliary['not_ran_tasks'] = [
            {
                'id': t.id,
                'name': t.name,
                'dep_status': [
                    TaskStatus(s).name
                    for s in self.dep_status.get(t.id, set())
                ]
            } for t in not_ran_tasks[:5]
        ]

    def load_computers(self):
        computers = self.computer_provider.computers()
        for computer in computers.values():
            computer['gpu'] = [0] * computer['gpu']
            computer['ports'] = set()
            computer['cpu_total'] = computer['cpu']
            computer['memory_total'] = computer['memory']
            computer['gpu_total'] = len(computer['gpu'])

        tasks = [
            t for t in self.tasks if
            t.status in [TaskStatus.InProgress.value,
                         TaskStatus.Queued.value]
        ]

        for task in tasks:
            if task.computer_assigned is None:
                continue
            assigned = task.computer_assigned
            comp_assigned = computers[assigned]
            comp_assigned['cpu'] -= task.cpu

            if task.gpu_assigned is not None:
                for g in task.gpu_assigned.split(','):
                    comp_assigned['gpu'][int(g)] = task.id
            comp_assigned['memory'] -= task.memory * 1024

            info = yaml_load(task.additional_info)
            if 'distr_info' in info:
                dist_info = info['distr_info']
                if dist_info['rank'] == 0:
                    comp_assigned['ports'].add(dist_info['master_port'])

        self.computers = [
            {
                **value, 'name': name
            } for name, value in computers.items()
        ]

        self.auxiliary['computers'] = self.computers

    def process_to_celery(self, task: Task, queue: str, computer: dict):
        r = execute.apply_async((task.id,), queue=queue, retry=False)
        task.status = TaskStatus.Queued.value
        task.computer_assigned = computer['name']
        task.celery_id = r.id

        if task.computer_assigned is not None:
            if task.gpu_assigned:
                for g in map(int, task.gpu_assigned.split(',')):
                    computer['gpu'][g] = task.id
            computer['cpu'] -= task.cpu
            computer['memory'] -= task.memory * 1024

        self.logger.info(
            f'Sent task={task.id} to celery. Queue = {queue} '
            f'Task status = {task.status} Celery_id = {r.id}',
            ComponentType.Supervisor)
        self.provider.update()

    def create_service_task(
            self,
            task: Task,
            gpu_assigned=None,
            distr_info: dict = None,
            resume: dict = None
    ):
        new_task = Task(
            name=task.name,
            computer=task.computer,
            executor=task.executor,
            status=TaskStatus.NotRan.value,
            type=TaskType.Service.value,
            gpu_assigned=gpu_assigned,
            parent=task.id,
            report=task.report,
            dag=task.dag
        )
        new_task.additional_info = task.additional_info

        if distr_info:
            additional_info = yaml_load(new_task.additional_info)
            additional_info['distr_info'] = distr_info
            new_task.additional_info = yaml_dump(additional_info)

        if resume:
            additional_info = yaml_load(new_task.additional_info)
            additional_info['resume'] = resume
            new_task.additional_info = yaml_dump(additional_info)

        return self.provider.add(new_task)

    def find_port(self, c: dict, docker_name: str):
        docker = self.docker_provider.get(c['name'], docker_name)
        ports = list(map(int, docker.ports.split('-')))
        for p in range(ports[0], ports[1] + 1):
            if p not in c['ports']:
                return p
        raise Exception(f'All ports in {c["name"]} are taken')

    def _process_task_valid_computer(self, task: Task, c: dict,
                                     single_node: bool):
        if not c['can_process_tasks']:
            return 'this computer can not process tasks'

        if task.computer is not None and task.computer != c['name']:
            return 'computer name in the config != name of this computer'

        if task.cpu > c['cpu']:
            return f'task cpu = {task.cpu} > computer' \
                   f' free cpu = {c["cpu"]}'

        if task.memory > c['memory']:
            return f'task memory = {task.memory} > computer ' \
                   f'free memory = {c["memory"]}'

        queue = f'{c["name"]}_' \
                f'{task.dag_rel.docker_img or "default"}'
        if queue not in self.queues:
            return f'required queue = {queue} not in queues'

        if task.gpu > 0 and not any(g == 0 for g in c['gpu']):
            return 'task requires gpu, but there are no free gpus'

        free_gpu = sum(g == 0 for g in c['gpu'])
        if single_node and task.gpu > free_gpu:
            return f'task requires {task.gpu} gpus, ' \
                   f'but there are only {free_gpu} free'

    def _process_task_get_computers(
            self, executor: dict, task: Task, auxiliary: dict
    ):
        single_node = executor.get('single_node', True)

        computers = []
        for c in self.computers:
            error = self._process_task_valid_computer(task, c, single_node)
            auxiliary['computers'].append({'name': c['name'], 'error': error})
            if not error:
                computers.append(c)

        if task.gpu > 0 and single_node and len(computers) > 0:
            computers = sorted(
                computers,
                key=lambda x: sum(g == 0 for g in x['gpu']),
                reverse=True
            )[:1]

        free_gpu = sum(sum(g == 0 for g in c['gpu']) for c in computers)
        if task.gpu > free_gpu:
            auxiliary['not_valid'] = f'gpu required by the ' \
                                     f'task = {task.gpu},' \
                                     f' but there are only {free_gpu} ' \
                                     f'free gpus'
            return []
        return computers

    def _process_task_to_send(
            self, executor: dict, task: Task, computers: List[dict]
    ):
        distr = executor.get('distr', True)
        to_send = []
        for computer in computers:
            queue = f'{computer["name"]}_' \
                    f'{task.dag_rel.docker_img or "default"}'

            if task.gpu_max > 1 and distr:
                for index, task_taken_gpu in enumerate(computer['gpu']):
                    if task_taken_gpu:
                        continue
                    to_send.append([computer, queue, index])

                    if len(to_send) >= task.gpu_max:
                        break

                if len(to_send) >= task.gpu_max:
                    break
            elif task.gpu_max > 0:
                cuda_devices = []
                for index, task_taken_gpu in enumerate(computer['gpu']):
                    if task_taken_gpu:
                        continue

                    cuda_devices.append(index)
                    if len(cuda_devices) >= task.gpu_max:
                        break

                task.gpu_assigned = ','.join(map(str, cuda_devices))
                self.process_to_celery(task, queue, computer)
                break
            else:
                self.process_to_celery(task, queue, computer)
                break
        return to_send

    def process_task(self, task: Task):
        auxiliary = self.auxiliary['process_tasks'][-1]
        auxiliary['computers'] = []

        config = yaml_load(task.dag_rel.config)
        executor = config['executors'][task.executor]

        computers = self._process_task_get_computers(executor, task, auxiliary)
        if len(computers) == 0:
            return

        to_send = self._process_task_to_send(executor, task, computers)
        auxiliary['to_send'] = to_send[:5]
        additional_info = yaml_load(task.additional_info)

        rank = 0
        master_port = None
        if len(to_send) > 0:

            master_port = self.find_port(
                to_send[0][0], to_send[0][1].split('_')[1]
            )
            computer_names = {c['name'] for c, _, __ in to_send}
            if len(computer_names) == 1:
                task.computer_assigned = list(computer_names)[0]

        for computer, queue, gpu_assigned in to_send:
            main_cmp = to_send[0][0]
            # noinspection PyTypeChecker
            ip = 'localhost' if computer['name'] == main_cmp['name'] \
                else main_cmp['ip']

            distr_info = {
                'master_addr': ip,
                'rank': rank,
                'local_rank': gpu_assigned,
                'master_port': master_port,
                'world_size': len(to_send),
                'master_computer': main_cmp['name']
            }
            service_task = self.create_service_task(
                task,
                distr_info=distr_info,
                gpu_assigned=gpu_assigned,
                resume=additional_info.get('resume')
            )
            self.process_to_celery(service_task, queue, computer)
            rank += 1
            main_cmp['ports'].add(master_port)

        if len(to_send) > 0:
            task.status = TaskStatus.Queued.value
            self.sent_tasks += len(to_send)

    def process_tasks(self):
        self.auxiliary['process_tasks'] = []

        for task in self.not_ran_tasks:
            auxiliary = {'id': task.id, 'name': task.name}
            self.auxiliary['process_tasks'].append(auxiliary)

            if task.dag_rel is None:
                task.dag_rel = self.dag_provider.by_id(task.dag)

            if TaskStatus.Stopped.value in self.dep_status[task.id] \
                    or TaskStatus.Failed.value in self.dep_status[task.id] or \
                    TaskStatus.Skipped.value in self.dep_status[task.id]:
                auxiliary['not_valid'] = 'stopped or failed in dep_status'
                self.provider.change_status(task, TaskStatus.Skipped)
                continue

            if len(self.dep_status[task.id]) != 0 \
                    and self.dep_status[task.id] != {TaskStatus.Success.value}:
                auxiliary['not_valid'] = 'not all dep tasks are finished'
                continue
            self.process_task(task)

        self.auxiliary['process_tasks'] = self.auxiliary['process_tasks'][:5]

    def _stop_child_tasks(self, task: Task):
        self.provider.commit()

        children = self.provider.children(task.id, [Task.dag_rel])
        dags = [c.dag_rel for c in children]
        for c, d in zip(children, dags):
            celery_tasks.stop(self.logger, self.session, c, d)

    def process_parent_tasks(self):
        tasks = self.provider.parent_tasks_stats()

        was_change = False
        for task, started, finished, statuses in tasks:
            status = task.status
            if statuses[TaskStatus.Failed] > 0:
                status = TaskStatus.Failed.value
            elif statuses[TaskStatus.Skipped] > 0:
                status = TaskStatus.Skipped.value
            elif statuses[TaskStatus.Queued] > 0:
                status = TaskStatus.Queued.value
            elif statuses[TaskStatus.InProgress] > 0:
                status = TaskStatus.InProgress.value
            elif statuses[TaskStatus.Success] > 0:
                status = TaskStatus.Success.value

            if status != task.status:
                if status == TaskStatus.InProgress.value:
                    task.started = started
                elif status >= TaskStatus.Failed.value:
                    task.started = started
                    task.finished = finished
                    self._stop_child_tasks(task)

                was_change = True
                task.status = status

        if was_change:
            self.provider.commit()

        self.auxiliary['parent_tasks_stats'] = [
            {
                'name': task.name,
                'id': task.id,
                'started': task.started,
                'finished': finished,
                'statuses': [
                    {
                        'name': k.name,
                        'count': v
                    } for k, v in statuses.items()
                ],
            } for task, started, finished, statuses in tasks[:5]
        ]

    def write_auxiliary(self):
        duration = now() - self.auxiliary['time']
        self.auxiliary['duration'] = duration.total_seconds()

        auxiliary = Auxiliary(
            name='supervisor', data=yaml_dump(self.auxiliary)
        )
        if len(auxiliary.data) > 16000:
            return

        self.auxiliary_provider.create_or_update(auxiliary, 'name')

    def stop_tasks(self, tasks: List[Task]):
        self.tasks_stop.extend([t.id for t in tasks])

    def process_stop_tasks(self):
        # Stop not running tasks
        if len(self.tasks_stop) == 0:
            return

        tasks = self.provider.by_ids(self.tasks_stop)
        tasks_not_ran = [t.id for t in tasks if
                         t.status in [TaskStatus.NotRan.value,
                                      TaskStatus.Queued.value]]
        tasks_started = [t for t in tasks if
                         t.status in [TaskStatus.InProgress.value]]
        tasks_started_ids = [t.id for t in tasks_started]

        self.provider.change_status_all(tasks=tasks_not_ran,
                                        status=TaskStatus.Skipped)

        pids = []
        for task in tasks_started:
            if task.pid:
                pids.append((task.computer_assigned, task.pid))

            additional_info = yaml_load(task.additional_info)
            for p in additional_info.get('child_processes', []):
                pids.append((task.computer_assigned, p))

        for computer, queue in self.docker_provider.queues_online():
            pids_computer = [p for c, p in pids if c == computer]
            if len(pids_computer) > 0:
                celery_tasks.kill_all.apply_async((pids_computer,),
                                                  queue=queue,
                                                  retry=False)

        self.provider.change_status_all(tasks=tasks_started_ids,
                                        status=TaskStatus.Stopped)

        self.tasks_stop = []

    def fast_check(self):
        if self.provider is None or self.computer_provider is None:
            return False

        if self.not_ran_tasks is None or self.queues is None:
            return False

        if len(self.tasks_stop) > 0:
            return False

        if len(self.dags_start) > 0:
            return False

        if len(self.auxiliary.get('to_send', [])) > 0:
            return False

        queues = {
            f'{d.computer}_{d.name}' for d in self.docker_provider.all()
            if d.last_activity >= now() - datetime.timedelta(seconds=15)
        }

        if queues != set(self.queues):
            return False

        tasks = self.provider.by_status(TaskStatus.NotRan,
                                        TaskStatus.Queued,
                                        TaskStatus.InProgress)
        tasks_set = {t.id for t in tasks if
                     t.status == TaskStatus.NotRan.value and not t.debug}
        tasks_set2 = {t.id for t in self.tasks if
                      t.status == TaskStatus.NotRan.value}

        if tasks_set != tasks_set2:
            return False

        tasks_set = {t.id for t in tasks if
                     t.status == TaskStatus.InProgress.value}
        tasks_set2 = {t.id for t in self.tasks if
                      t.status == TaskStatus.InProgress.value}

        if tasks_set != tasks_set2:
            return False

        tasks_set = {t.id for t in tasks if
                     t.status == TaskStatus.Queued.value}
        tasks_set2 = {t.id for t in self.tasks if
                      t.status == TaskStatus.Queued.value}

        if tasks_set != tasks_set2:
            return False

        return True

    def start_dag(self, id: int):
        self.dags_start.append(id)

    def process_start_dags(self):
        if len(self.dags_start) == 0:
            return

        for id in self.dags_start:
            can_start_statuses = [
                TaskStatus.Failed.value, TaskStatus.Skipped.value,
                TaskStatus.Stopped.value
            ]

            tasks = self.provider.by_dag(id)
            children_all = self.provider.children([t.id for t in tasks])

            def find_resume(task):
                children = [c for c in children_all if c.parent == task.id]
                children = sorted(children, key=lambda x: x.id, reverse=True)

                if len(children) > 0:
                    for c in children:
                        if c.parent != task.id:
                            continue

                        info = yaml_load(c.additional_info)
                        if 'distr_info' not in info:
                            continue

                        if info['distr_info']['rank'] == 0:
                            return {
                                'master_computer': c.computer_assigned,
                                'master_task_id': c.id,
                                'load_last': True
                            }
                    raise Exception('Master task not found')
                else:
                    return {
                        'master_computer': task.computer_assigned,
                        'master_task_id': task.id,
                        'load_last': True
                    }

            for t in tasks:
                if t.status not in can_start_statuses:
                    continue

                if t.parent:
                    continue

                if t.type == TaskType.Train.value:
                    info = yaml_load(t.additional_info)
                    info['resume'] = find_resume(t)
                    t.additional_info = yaml_dump(info)

                t.status = TaskStatus.NotRan.value
                t.pid = None
                t.started = None
                t.finished = None
                t.computer_assigned = None
                t.celery_id = None
                t.worker_index = None
                t.docker_assigned = None

        self.provider.commit()
        self.dags_start = []

    def build(self):
        try:
            # if self.fast_check():
            #     return

            self.auxiliary = {'time': now()}

            self.create_base()

            self.process_stop_tasks()

            self.process_start_dags()

            self.process_parent_tasks()

            self.load_tasks()

            self.load_computers()

            self.process_tasks()

            self.write_auxiliary()

        except ObjectDeletedError:
            pass
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(key='SupervisorBuilder')
                self.session = Session.create_session(key='SupervisorBuilder')
                self.logger = create_logger(self.session, 'SupervisorBuilder')

            self.logger.error(traceback.format_exc(), ComponentType.Supervisor)
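
A minimal sketch of driving the supervisor: build() appears designed to be
polled; the one-second interval below is an assumption.

import time

builder = SupervisorBuilder()
while True:
    builder.build()  # processes stops, dag restarts and schedulable tasks
    time.sleep(1)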
Example #25
class Storage:
    def __init__(self,
                 session: Session,
                 logger=None,
                 component: ComponentType = None,
                 max_file_size: int = 10**5,
                 max_count=10**3):
        self.file_provider = FileProvider(session)
        self.provider = DagStorageProvider(session)
        self.task_provider = TaskProvider(session)
        self.library_provider = DagLibraryProvider(session)
        self.dag_provider = DagProvider(session)

        self.logger = logger
        self.component = component
        self.max_file_size = max_file_size
        self.max_count = max_count

    def log_info(self, message: str):
        if self.logger:
            self.logger.info(message, self.component)

    def copy_from(self, src: int, dag: Dag):
        storages = self.provider.query(DagStorage). \
            filter(DagStorage.dag == src). \
            all()
        libraries = self.library_provider.query(DagLibrary). \
            filter(DagLibrary.dag == src). \
            all()

        s_news = []
        for s in storages:
            s_new = DagStorage(dag=dag.id,
                               file=s.file,
                               path=s.path,
                               is_dir=s.is_dir)
            s_news.append(s_new)
        l_news = []
        for l in libraries:
            l_new = DagLibrary(dag=dag.id,
                               library=l.library,
                               version=l.version)
            l_news.append(l_new)

        self.provider.add_all(s_news)
        self.library_provider.add_all(l_news)

    def _build_spec(self, folder: str):
        ignore_file = os.path.join(folder, 'file.ignore.txt')
        if not os.path.exists(ignore_file):
            ignore_patterns = []
        else:
            ignore_patterns = read_lines(ignore_file)
        ignore_patterns.extend(
            ['log', '/data', '/models', '__pycache__', '*.ipynb'])

        return pathspec.PathSpec.from_lines(
            pathspec.patterns.GitWildMatchPattern, ignore_patterns)

    def upload(self, folder: str, dag: Dag, control_reqs: bool = True):
        self.log_info('upload started')
        hashs = self.file_provider.hashs(dag.project)
        self.log_info('hashes are retrieved')

        all_files = []
        spec = self._build_spec(folder)

        files = glob(os.path.join(folder, '**'))
        for file in files[:]:
            path = os.path.relpath(file, folder)
            if spec.match_file(path) or path == '.':
                continue
            if os.path.isdir(file):
                child_files = glob(os.path.join(file, '**'),
                                   recursive=True)
                files.extend(child_files)

        if self.max_count and len(files) > self.max_count:
            raise Exception(f'files count = {len(files)} '
                            f'But max count = {self.max_count}')

        self.log_info('list of files formed')

        folders_to_add = []
        files_to_add = []
        files_storage_to_add = []

        total_size_added = 0

        for o in files:
            path = os.path.relpath(o, folder)
            if spec.match_file(path) or path == '.':
                continue

            if isdir(o):
                folder_to_add = DagStorage(dag=dag.id, path=path, is_dir=True)
                folders_to_add.append(folder_to_add)
                continue
            with open(o, 'rb') as f:
                content = f.read()
            # len() is the actual byte count; sys.getsizeof would also count
            # the Python object overhead
            size = len(content)
            if self.max_file_size and size > self.max_file_size:
                raise Exception(
                    f'file = {o} has size {size}, '
                    f'but max size is set to {self.max_file_size}')
            md5 = hashlib.md5(content).hexdigest()

            all_files.append(o)

            if md5 not in hashs:
                file = File(md5=md5,
                            content=content,
                            project=dag.project,
                            dag=dag.id,
                            created=now())
                hashs[md5] = file
                files_to_add.append(file)
                total_size_added += size

            file_storage = DagStorage(dag=dag.id,
                                      path=path,
                                      file=hashs[md5],
                                      is_dir=False)
            files_storage_to_add.append(file_storage)

        self.log_info('inserting DagStorage folders')

        if len(folders_to_add) > 0:
            self.provider.bulk_save_objects(folders_to_add)

        self.log_info('inserting Files')

        if len(files_to_add) > 0:
            self.file_provider.bulk_save_objects(files_to_add,
                                                 return_defaults=True)

        self.log_info('inserting DagStorage Files')

        if len(files_storage_to_add) > 0:
            for file_storage in files_storage_to_add:
                if isinstance(file_storage.file, File):
                    # noinspection PyUnresolvedReferences
                    file_storage.file = file_storage.file.id

            self.provider.bulk_save_objects(files_storage_to_add)

        dag.file_size += total_size_added

        self.dag_provider.update()

        if INSTALL_DEPENDENCIES and control_reqs:
            reqs = control_requirements(folder, files=all_files)
            for name, rel, version in reqs:
                self.library_provider.add(
                    DagLibrary(dag=dag.id, library=name, version=version))

    def download_dag(self, dag: int, folder: str):
        os.makedirs(folder, exist_ok=True)

        items = self.provider.by_dag(dag)
        # Directories carry no file payload (file is None), so they sort
        # first and get created before the files inside them are written
        items = sorted(items, key=lambda x: x[1] is not None)
        for item, file in items:
            path = os.path.join(folder, item.path)
            if item.is_dir:
                os.makedirs(path, exist_ok=True)
            else:
                with open(path, 'wb') as f:
                    f.write(file.content)

    def download(self, task: int):
        task = self.task_provider.by_id(
            task, joinedload(Task.dag_rel, innerjoin=True))
        folder = join(TASK_FOLDER, str(task.id))
        self.download_dag(task.dag, folder)

        config = Config.from_yaml(task.dag_rel.config)
        info = config['info']
        try:
            data_folder = os.path.join(DATA_FOLDER, info['project'])
            os.makedirs(data_folder, exist_ok=True)

            os.symlink(data_folder,
                       os.path.join(folder, 'data'),
                       target_is_directory=True)
        except FileExistsError:
            pass

        try:
            model_folder = os.path.join(MODEL_FOLDER, info['project'])
            os.makedirs(model_folder, exist_ok=True)

            os.symlink(model_folder,
                       os.path.join(folder, 'models'),
                       target_is_directory=True)
        except FileExistsError:
            pass

        sys.path.insert(0, folder)
        return folder

    def import_executor(self,
                        folder: str,
                        base_folder: str,
                        executor: str,
                        libraries: List[Tuple] = None):

        sys.path.insert(0, base_folder)

        spec = self._build_spec(folder)
        was_installation = False

        # recursive=True has no effect without '**' in the pattern, so only
        # top-level packages are scanned here
        folders = [
            p for p in glob(f'{folder}/*')
            if os.path.isdir(p) and not spec.match_file(p)
        ]
        folders += [folder]
        library_names = set(n for n, v in (libraries or []))
        library_versions = {n: v for n, v in (libraries or [])}

        for n in library_names:
            try:
                version = pkg_resources.get_distribution(n).version
                need_install = library_versions[n] != version
            except Exception:
                need_install = True

            if INSTALL_DEPENDENCIES and need_install:
                os.system(f'pip install {n}=={library_versions[n]}')
                was_installation = True

        def is_valid_class(cls: pyclbr.Class):
            return cls.name == executor or \
                   cls.name.lower() == executor or \
                   to_snake(cls.name) == executor

        def relative_name(path: str):
            rel = os.path.relpath(path, base_folder)
            parts = [str(p).split('.')[0] for p in rel.split(os.sep)]
            return '.'.join(parts)

        for (module_loader, module_name,
             ispkg) in pkgutil.iter_modules(folders):
            module = module_loader.find_module(module_name)
            rel_path = os.path.relpath(
                os.path.splitext(module.path)[0],
                base_folder).replace(os.sep, '.')
            try:
                classes = pyclbr.readmodule(rel_path, path=[base_folder])
            except Exception:
                continue
            for k, v in classes.items():
                if is_valid_class(v):
                    importlib.import_module(relative_name(module.path))
                    return True, was_installation

        return False, was_installation
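
Taken together, upload and download_dag round-trip a project folder through
the database: upload hashes and stores every non-ignored file, download_dag
writes the snapshot back out, directories first. A minimal usage sketch,
assuming a configured SQLAlchemy Session and an existing Dag row (the session
construction and the dag id are illustrative, not part of the example):

session = Session()  # hypothetical: build the session however this project does
storage = Storage(session, max_file_size=10 ** 5, max_count=10 ** 3)

dag = DagProvider(session).by_id(1)  # assumes a Dag with id=1 exists

# Snapshot the current working directory into the dag's file storage
storage.upload(os.getcwd(), dag, control_reqs=False)

# Later, materialize the same snapshot into a scratch folder
storage.download_dag(dag.id, '/tmp/dag_1')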
Example #26
def dag_tag_remove():
    data = request_data()
    provider = DagProvider(_write_session)
    provider.remove_tag(dag=data['dag'], tag=data['tag'])
Example #27
class SegmentationReportBuilder:
    def __init__(self,
                 session: Session,
                 task: Task,
                 layout: str,
                 part: str = 'valid',
                 name: str = 'img_segment',
                 max_img_size: Tuple[int, int] = None,
                 stack_type: str = 'vertical',
                 main_metric: str = 'dice',
                 plot_count: int = 0,
                 colors: List[Tuple] = None):
        self.session = session
        self.task = task
        self.layout = layout
        self.part = part
        self.name = name or 'img_segment'
        self.max_img_size = max_img_size
        self.stack_type = stack_type
        self.main_metric = main_metric
        self.colors = colors
        self.plot_count = plot_count

        self.dag_provider = DagProvider(session)
        self.report_provider = ReportProvider(session)
        self.layout_provider = ReportLayoutProvider(session)
        self.task_provider = TaskProvider(session)
        self.report_img_provider = ReportImgProvider(session)
        self.report_task_provider = ReportTasksProvider(session)
        self.report_series_provider = ReportSeriesProvider(session)

        self.project = self.task_provider.project(task.id).id
        self.layout = self.layout_provider.by_name(layout)
        self.layout_dict = yaml_load(self.layout.content)

        self.create_base()

    def create_base(self):
        report = Report(config=yaml_dump(self.layout_dict),
                        time=now(),
                        layout=self.layout.name,
                        project=self.project,
                        name=self.name)
        self.report_provider.add(report)
        self.report_task_provider.add(
            ReportTasks(report=report.id, task=self.task.id))

        self.task.report = report.id
        self.task_provider.update()

    def encode_pred(self, mask: np.ndarray):
        # Paint each mask channel with its color and sum into one RGB image
        res = np.zeros((*mask.shape[1:], 3), dtype=np.uint8)
        for i, c in enumerate(mask):
            c = np.repeat(c[:, :, None], 3, axis=2)
            color = self.colors[i] if self.colors is not None \
                else (255, 255, 255)
            res += (c * color).astype(np.uint8)

        return res

    def plot_mask(self, img: np.ndarray, mask: np.ndarray):
        if len(img.shape) == 2:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        img = img.astype(np.uint8)
        mask = mask.astype(np.uint8)

        for i, c in enumerate(mask):
            contours, _ = cv2.findContours(c, cv2.RETR_LIST,
                                           cv2.CHAIN_APPROX_NONE)
            color = self.colors[i] if self.colors else (0, 255, 0)
            # Draw every contour of this channel at once; the original inner
            # loop shadowed the channel index i and passed a single contour
            # where polylines expects a sequence of them
            cv2.polylines(img, contours, True, color, 2)

        return img

    def process_scores(self, scores):
        for key, item in self.layout_dict['items'].items():
            item['name'] = key
            if item['type'] == 'series' and item['key'] in scores:
                series = ReportSeries(name=item['name'],
                                      value=scores[item['key']],
                                      epoch=0,
                                      time=now(),
                                      task=self.task.id,
                                      part='valid',
                                      stage='stage1')

                self.report_series_provider.add(series)

    def process_pred(self,
                     imgs: np.ndarray,
                     preds: dict,
                     targets: np.ndarray = None,
                     attrs=None,
                     scores=None):
        for key, item in self.layout_dict['items'].items():
            item['name'] = key
            if item['type'] != 'img_segment':
                continue

            report_imgs = []
            dag = self.dag_provider.by_id(self.task.dag)

            for i in range(len(imgs)):
                if self.plot_count <= 0:
                    break

                if targets is not None:
                    img = self.plot_mask(imgs[i], targets[i])
                else:
                    img = imgs[i]

                imgs_add = [img]
                for pred in preds.values():
                    imgs_add.append(self.encode_pred(pred[i]))

                for j in range(len(imgs_add)):
                    imgs_add[j] = resize_saving_ratio(imgs_add[j],
                                                      self.max_img_size)

                if self.stack_type == 'horizontal':
                    img = np.hstack(imgs_add)
                else:
                    img = np.vstack(imgs_add)

                attr = attrs[i] if attrs else {}

                score = None
                if targets is not None:
                    score = scores[self.main_metric][i]

                _, buffer = cv2.imencode('.jpg', img)
                report_img = ReportImg(group=item['name'],
                                       epoch=0,
                                       task=self.task.id,
                                       img=buffer,
                                       dag=self.task.dag,
                                       part=self.part,
                                       project=self.project,
                                       score=score,
                                       **attr)

                self.plot_count -= 1
                report_imgs.append(report_img)
                dag.img_size += report_img.size

            self.dag_provider.commit()
            self.report_img_provider.bulk_save_objects(report_imgs)
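
The per-channel composition in encode_pred is easy to verify in isolation.
A self-contained sketch of the same arithmetic with a toy two-class mask (the
mask and colors are made up; note OpenCV treats channel order as BGR when the
image is eventually encoded):

import numpy as np

# Toy 2-class mask: channel 0 marks the left half, channel 1 the right half
mask = np.zeros((2, 4, 4), dtype=np.uint8)
mask[0, :, :2] = 1
mask[1, :, 2:] = 1
colors = [(255, 0, 0), (0, 0, 255)]

# Same composition as SegmentationReportBuilder.encode_pred
res = np.zeros((*mask.shape[1:], 3), dtype=np.uint8)
for i, c in enumerate(mask):
    c = np.repeat(c[:, :, None], 3, axis=2)
    res += (c * colors[i]).astype(np.uint8)

assert res[0, 0].tolist() == [255, 0, 0]   # left half painted
assert res[0, -1].tolist() == [0, 0, 255]  # right half painted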
Example #28
def dags():
    data = request_data()
    options = PaginatorOptions(**data['paginator'])
    provider = DagProvider(_read_session)
    res = provider.get(data, options)
    return res
Example #29
def dag_tags():
    data = request_data()
    provider = DagProvider(_write_session)
    return provider.tags(data['name'])
Example #30
def remove_all_dags():
    data = request_data()
    provider = DagProvider(_write_session)
    dags = provider.by_project(data['project'])
    provider.remove_all([d.id for d in dags])
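
Examples #26 and #28-#30 share one handler shape: parse the request payload,
build a provider over the module-level session, delegate. A minimal sketch
that factors the pattern out (make_handler and the lambda are made-up
illustrations; request_data, DagProvider and _write_session come from the
examples above):

def make_handler(provider_cls, session, action):
    # Parse the payload, build the provider, delegate to the action
    def handler():
        data = request_data()
        provider = provider_cls(session)
        return action(provider, data)
    return handler

# Mirrors Example #29
dag_tags_handler = make_handler(DagProvider, _write_session,
                                lambda p, d: p.tags(d['name']))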