Example #1
    def create_dag(self):
        dag = Dag(config=self.config_text or yaml_dump(self.config),
                  project=self.project,
                  name=self.info['name'],
                  docker_img=self.info.get('docker_img'),
                  type=DagType.Standard.value,
                  created=now(),
                  report=self.dag_report_id)

        self.dag = self.dag_provider.add(dag)
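
All of the snippets on this page serialize to and from YAML through the project's yaml_dump / yaml_load helpers. A minimal sketch of such helpers, assuming they are thin wrappers around PyYAML's safe API (the project's own implementation may differ):

# Hypothetical sketch of the yaml_dump / yaml_load helpers assumed by the
# examples below; not the project's actual implementation.
import yaml


def yaml_load(text=None):
    # The examples call yaml_load on columns that may be empty or None,
    # so treat missing input as an empty mapping.
    if not text:
        return {}
    return yaml.safe_load(text)


def yaml_dump(data) -> str:
    # Block style keeps the stored config / additional_info readable.
    return yaml.safe_dump(data, default_flow_style=False)
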
Example #2
def report_add_end():
    data = request_data()

    provider = ReportProvider(_write_session)
    layouts = ReportLayoutProvider(_write_session).all()
    layout = layouts[data['layout']]
    report = Report(
        name=data['name'], project=data['project'], config=yaml_dump(layout)
    )
    provider.add(report)
Example #3
    def create_task(self, k: str, v: dict, name: str, info: dict):
        task_type = TaskType.User.value
        if v.get('task_type') == 'train' or \
                Executor.is_trainable(v['type']):
            task_type = TaskType.Train.value

        gpu = str(v.get('gpu', '0'))
        if '-' not in gpu:
            gpu = int(gpu)
            gpu_max = gpu
        else:
            gpu, gpu_max = map(int, gpu.split('-'))

        if gpu == 0 and gpu_max > 0:
            raise Exception(
                f'Executor = {k}: gpu_max cannot be > 0 when gpu = 0')

        task = Task(name=name,
                    executor=k,
                    computer=self.info.get('computer') or v.get('computer'),
                    gpu=gpu,
                    gpu_max=gpu_max,
                    cpu=v.get('cpu', 1),
                    memory=v.get('memory', 0.1),
                    dag=self.dag.id,
                    debug=self.debug,
                    steps=int(v.get('steps', '1')),
                    type=task_type)
        task.additional_info = yaml_dump(info)
        report = None
        if self.layout_name and task_type == TaskType.Train.value:
            if self.layout_name not in self.layouts:
                raise Exception(f'Unknown layout = {self.layout_name}')

            report_config = self.layouts[self.layout_name]
            info['report_config'] = report_config

            task.additional_info = yaml_dump(info)
            report = Report(config=yaml_dump(report_config),
                            name=task.name,
                            project=self.project,
                            layout=self.layout_name)

        return task, report
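
The gpu field in Example #3 is either a single count ('2') or a range ('1-4'). A standalone sketch of that parsing, under a hypothetical helper name, for quick verification:

# Hypothetical helper illustrating the gpu parsing from Example #3;
# the name parse_gpu_range is not part of the original project.
def parse_gpu_range(raw='0'):
    gpu = str(raw)
    if '-' not in gpu:
        gpu = gpu_max = int(gpu)
    else:
        gpu, gpu_max = map(int, gpu.split('-'))
    if gpu == 0 and gpu_max > 0:
        raise ValueError('gpu_max cannot be > 0 when gpu = 0')
    return gpu, gpu_max


assert parse_gpu_range('2') == (2, 2)
assert parse_gpu_range('1-4') == (1, 4)
assert parse_gpu_range() == (0, 0)
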
Example #4
    def add_project(self,
                    name: str,
                    class_names: dict = None,
                    ignore_folders: List[str] = None):
        class_names = class_names or {}
        ignore_folders = ignore_folders or []

        assert isinstance(class_names, dict), 'class_names type must be dict'
        assert isinstance(ignore_folders, list), \
            'ignore_folders type must be list'

        project = Project(name=name,
                          class_names=yaml_dump(class_names),
                          ignore_folders=yaml_dump(ignore_folders))
        project = self.session.add(project)

        os.makedirs(os.path.join(DATA_FOLDER, name), exist_ok=True)
        os.makedirs(os.path.join(MODEL_FOLDER, name), exist_ok=True)

        return project
Example #5
    def write_auxiliary(self):
        self.auxiliary['duration'] = (
            now() - self.auxiliary['time']).total_seconds()

        auxiliary = Auxiliary(
            name='supervisor', data=yaml_dump(self.auxiliary)
        )
        if len(auxiliary.data) > 16000:
            return

        self.auxiliary_provider.create_or_update(auxiliary, 'name')
Example #6
    def create_base(self):
        report = Report(config=yaml_dump(self.layout_dict),
                        time=now(),
                        layout=self.layout.name,
                        project=self.project,
                        name=self.name)
        self.report_provider.add(report)
        self.report_task_provider.add(
            ReportTasks(report=report.id, task=self.task.id))

        self.task.report = report.id
        self.task_provider.update()
Example #7
    def work(self):
        task_provider = TaskProvider(self.session)
        task = task_provider.by_id(self.train_task)
        dag = DagProvider(self.session).by_id(self.dag_pipe,
                                              joined_load=[Dag.project_rel])

        task_dir = join(TASK_FOLDER, str(self.child_task or task.id))
        src_log = f'{task_dir}/log'
        models_dir = join(MODEL_FOLDER, dag.project_rel.name)
        os.makedirs(models_dir, exist_ok=True)

        self.info(f'Task = {self.task} child_task: {self.child_task}')

        model_path_tmp = f'{src_log}/traced.pth'
        traced = trace_model_from_checkpoint(src_log, self)

        model = Model(dag=self.dag_pipe,
                      interface=self.interface,
                      slot=self.slot,
                      score_local=task.score,
                      created=now(),
                      name=self.name,
                      project=dag.project,
                      interface_params=yaml_dump(self.interface_params))
        provider = ModelProvider(self.session)
        provider.add(model, commit=False)
        try:
            model_path = f'{models_dir}/{model.name}.pth'
            model_weight_path = f'{models_dir}/{model.name}_weight.pth'
            torch.jit.save(traced, model_path_tmp)
            shutil.copy(model_path_tmp, model_path)
            shutil.copy(f'{src_log}/checkpoints/best.pth', model_weight_path)

            interface_params = yaml_load(model.interface_params)
            interface_params['file'] = join('models', model.name + '.pth')
            model.interface_params = yaml_dump(interface_params)
            provider.update()
        except Exception as e:
            provider.rollback()
            raise e
Example #8
def dag_model_start(session: Session, data: dict):
    provider = ModelProvider(session)
    model = provider.by_id(data['model_id'])
    dag_provider = DagProvider(session)
    dag = dag_provider.by_id(data['dag'], joined_load=[Dag.project_rel])

    project = dag.project_rel
    src_config = Config.from_yaml(dag.config)
    pipe = src_config['pipes'][data['pipe']['name']]

    equations = yaml_load(model.equations)
    versions = data['pipe']['versions']

    if len(versions) > 0:
        pipe_equations = yaml_load(versions[0]['equations'])
        versions[0]['used'] = now()

        if len(pipe) == 1:
            pipe[list(pipe)[0]].update(pipe_equations)
        else:
            pipe.update(pipe_equations)

    equations[data['pipe']['name']] = versions
    model.equations = yaml_dump(equations)

    for v in pipe.values():
        v['model_id'] = model.id

    config = {
        'info': {
            'name': data['pipe']['name'],
            'project': project.name
        },
        'executors': pipe
    }

    if model.dag:
        old_dag = dag_provider.by_id(model.dag)
        if old_dag.name != dag.name:
            model.dag = dag.id
    else:
        model.dag = dag.id

    provider.commit()

    dag_standard(
        session=session,
        config=config,
        debug=False,
        upload_files=False,
        copy_files_from=data['dag']
    )
Example #9
def computer_sync_end():
    data = request_data()
    provider = ComputerProvider(_write_session)
    for computer in provider.all():
        if data.get('computer') and data['computer'] != computer.name:
            continue
        meta = yaml_load(computer.meta)
        meta['manual_sync'] = {
            'project': data['id'],
            'ignore_folders': yaml_load(data['ignore_folders'])
        }
        computer.meta = yaml_dump(meta)
    provider.update()
Example #10
    def create_report(self):
        self.dag_report_id = None
        layout_name = self.layout_name
        if layout_name:
            if layout_name not in self.layouts:
                raise Exception(f'Unknown layout = {layout_name}')

            report = Report(config=yaml_dump(self.layouts[layout_name]),
                            name=self.info['name'],
                            project=self.project,
                            layout=layout_name)
            self.report_provider.add(report)
            self.dag_report_id = report.id
Example #11
    def sync_manual(self, computer: Computer, provider: ComputerProvider):
        """
        Called when the sync button was clicked manually.
        """
        if not computer.meta:
            return

        meta = yaml_load(computer.meta)
        if 'manual_sync' not in meta:
            return

        manual_sync = meta['manual_sync']

        project_provider = ProjectProvider(self.session)
        docker_provider = DockerProvider(self.session)

        dockers = docker_provider.get_online()
        project = project_provider.by_id(manual_sync['project'])
        sync_folders = manual_sync['sync_folders']
        ignore_folders = manual_sync['ignore_folders']

        sync_folders = correct_folders(sync_folders, project.name)
        ignore_folders = correct_folders(ignore_folders, project.name)

        if not isinstance(sync_folders, list):
            sync_folders = []
        if not isinstance(ignore_folders, list):
            ignore_folders = []

        for docker in dockers:
            if docker.computer == computer.name:
                continue

            source = provider.by_name(docker.computer)
            folders = [[s, ignore_folders] for s in sync_folders]

            computer.syncing_computer = source.name
            provider.update()

            try:
                sync_directed(
                    self.session,
                    target=computer,
                    source=source,
                    folders=folders
                )
            except Exception as e:
                self.process_error(e)
        del meta['manual_sync']
        computer.meta = yaml_dump(meta)
        provider.update()
Example #12
def stop_all_dags():
    data = request_data()
    provider = TaskProvider(_write_session)
    tasks = provider.by_status(TaskStatus.InProgress,
                               TaskStatus.Queued,
                               TaskStatus.NotRan,
                               project=data['project'])

    for t in tasks:
        info = yaml_load(t.additional_info)
        info['stopped'] = True
        t.additional_info = yaml_dump(info)

    provider.update()
    supervisor.stop_tasks(tasks)
Example #13
    def create_dag(self):
        self.log_info('create_dag')

        name = self.info['name']
        if self.grid_cell:
            name = f'{name} {self.grid_cell[1]}'

        dag = Dag(config=self.config_text or yaml_dump(self.config),
                  project=self.project,
                  name=name,
                  docker_img=self.info.get('docker_img'),
                  type=DagType.Standard.value,
                  created=now(),
                  report=self.dag_report_id)

        self.dag = self.dag_provider.add(dag)
Example #14
def _dag(config: str,
         debug: bool = False,
         control_reqs=True,
         params: Tuple[str] = ()):
    logger = create_logger(_session, name='_dag')
    logger.info('started', ComponentType.Client)

    with open(config, 'r') as f:
        config_text = f.read()
    config_parsed = yaml_load(config_text)
    params = dict_from_list_str(params)
    config_parsed = merge_dicts_smart(config_parsed, params)
    config_text = yaml_dump(config_parsed)

    logger.info('config parsed', ComponentType.Client)

    try:
        commit = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip()
        config_parsed['info']['name'] += f'_{commit.decode("utf-8")[:6]}'
    except Exception:
        logger.info('commit not parsed')

    type_name = config_parsed['info'].get('type', 'standard')
    if type_name == DagType.Standard.name.lower():
        cells = grid_cells(
            config_parsed['grid']) if 'grid' in config_parsed else [None]
        dags = []
        for cell in cells:
            dag = dag_standard(session=_session,
                               config=config_parsed,
                               debug=debug,
                               config_text=config_text,
                               config_path=config,
                               control_reqs=control_reqs,
                               logger=logger,
                               component=ComponentType.Client,
                               grid_cell=cell)
            dags.append(dag)

        return dags

    return [
        dag_pipe(session=_session,
                 config=config_parsed,
                 config_text=config_text)
    ]
Example #15
def _dag(config: str,
         debug: bool = False,
         control_reqs=True,
         params: Tuple[str] = ()):
    logger = create_logger(_session, name='_dag')
    logger.info('started', ComponentType.Client)

    with open(config, 'r') as f:
        config_text = f.read()
    config_parsed = yaml_load(config_text)
    params = dict_from_list_str(params)
    config_parsed = merge_dicts_smart(config_parsed, params)
    config_text = yaml_dump(config_parsed)

    logger.info('config parsed', ComponentType.Client)

    type_name = config_parsed['info'].get('type', 'standard')
    if type_name == DagType.Standard.name.lower():
        cells = grid_cells(
            config_parsed['grid']) if 'grid' in config_parsed else [None]
        dags = []
        for cell in cells:
            dag = dag_standard(session=_session,
                               config=config_parsed,
                               debug=debug,
                               config_text=config_text,
                               config_path=config,
                               control_reqs=control_reqs,
                               logger=logger,
                               component=ComponentType.Client,
                               grid_cell=cell)
            dags.append(dag)

        return dags

    return [
        dag_pipe(session=_session,
                 config=config_parsed,
                 config_text=config_text)
    ]
Example #16
    def execute(self):
        self.info('execute start')

        res = self.executor(task=self.task,
                            task_provider=self.provider,
                            dag=self.dag)
        self.info('execute executor finished')

        res = res or {}
        self.task.result = yaml_dump(res)
        self.provider.commit()

        if 'stage' in res and 'stages' in res:
            index = res['stages'].index(res['stage'])
            if index < len(res['stages']) - 1:
                self.executor.info(f'stage = {res["stage"]} done. '
                                   f'Go to the stage = '
                                   f'{res["stages"][index + 1]}')

                time.sleep(3)

                self.executor.info(f'sending {(self.id, self.repeat_count)} '
                                   f'to {self.queue_personal}')

                self.task.status = TaskStatus.Queued.value
                self.provider.commit()

                execute.apply_async((self.id, self.repeat_count),
                                    queue=self.queue_personal,
                                    retry=False)
                return

        self.executor.step.finish()
        self.provider.change_status(self.task, TaskStatus.Success)

        self.info('execute end')
Example #17
def _dag(config: str,
         debug: bool = False,
         control_reqs=True,
         params: Tuple[str] = ()):
    migrate()

    with open(config, 'r') as f:
        config_text = f.read()
    config_parsed = yaml_load(config_text)
    params = dict_from_list_str(params)
    config_parsed = merge_dicts_smart(config_parsed, params)
    config_text = yaml_dump(config_parsed)

    type_name = config_parsed['info'].get('type', 'standard')
    if type_name == DagType.Standard.name.lower():
        return dag_standard(session=_session,
                            config=config_parsed,
                            debug=debug,
                            config_text=config_text,
                            config_path=config,
                            control_reqs=control_reqs)

    return dag_pipe(session=_session,
                    config=config_parsed,
                    config_text=config_text)
Example #18
    def update_layout_end(self, id: int, layout: str, layouts: dict):
        layout_content = yaml_dump(layouts[layout])
        report = self.by_id(id)
        report.config = layout_content
        report.layout = layout
        self.commit()
Example #19
    def work(self):
        args, config = self.parse_args_uargs()
        set_global_seed(args.seed)

        Experiment, R = import_experiment_and_runner(Path(args.expdir))

        runner_params = config.pop('runner_params', {})

        experiment = Experiment(config)
        runner: Runner = R(**runner_params)

        register()

        self.experiment = experiment
        self.runner = runner

        stages = experiment.stages[:]

        if self.master:
            task = self.task if not self.task.parent \
                else self.task_provider.by_id(self.task.parent)
            task.steps = len(stages)
            self.task_provider.commit()

        self._checkpoint_fix_config(experiment)

        _get_callbacks = experiment.get_callbacks

        def get_callbacks(stage):
            res = self.callbacks()
            for k, v in _get_callbacks(stage).items():
                res[k] = v

            self._checkpoint_fix_callback(res)
            return res

        experiment.get_callbacks = get_callbacks

        if experiment.logdir is not None:
            dump_environment(config, experiment.logdir, args.configs)

        if self.distr_info:
            info = yaml_load(self.task.additional_info)
            info['resume'] = {
                'master_computer': self.distr_info['master_computer'],
                'master_task_id': self.task.id - self.distr_info['rank'],
                'load_best': True
            }
            self.task.additional_info = yaml_dump(info)
            self.task_provider.commit()

            experiment.stages_config = {
                k: v
                for k, v in experiment.stages_config.items()
                if k == experiment.stages[0]
            }

        runner.run_experiment(experiment, check=args.check)

        if self.master and self.trace:
            traced = trace_model_from_checkpoint(self.experiment.logdir, self)
            torch.jit.save(traced, self.trace)

        return {'stage': experiment.stages[-1], 'stages': stages}
Example #20
    def add_item(self, k: str, v: dict):
        self.add(
            ReportLayout(content=yaml_dump(v), name=k, last_modified=now())
        )
Example #21
    def add_child_process(self, pid: int):
        additional_info = yaml_load(self.task.additional_info)
        additional_info['child_processes'] = additional_info.get(
            'child_processes', []) + [pid]
        self.task.additional_info = yaml_dump(additional_info)
        self.task_provider.update()
Example #22
def dag_start():
    data = request_data()
    provider = DagProvider(_write_session)
    task_provider = TaskProvider(_write_session)

    id = int(data['id'])
    dag = provider.by_id(id, joined_load=['tasks'])
    can_start_statuses = [
        TaskStatus.Failed.value, TaskStatus.Skipped.value,
        TaskStatus.Stopped.value
    ]

    tasks = list(dag.tasks)

    def find_resume(task):
        children = task_provider.children(task.id)
        children = sorted(children, key=lambda x: x.id, reverse=True)

        if len(children) > 0:
            for c in children:
                if c.parent != task.id:
                    continue

                info = yaml_load(c.additional_info)
                if 'distr_info' not in info:
                    continue

                if info['distr_info']['rank'] == 0:
                    return {
                        'master_computer': c.computer_assigned,
                        'master_task_id': c.id,
                        'load_last': True
                    }
            raise Exception('Master task not found')
        else:
            return {
                'master_computer': task.computer_assigned,
                'master_task_id': task.id,
                'load_last': True
            }

    for t in tasks:
        if t.status not in can_start_statuses:
            continue

        if t.parent:
            continue

        info = yaml_load(t.additional_info)
        info['resume'] = find_resume(t)
        t.additional_info = yaml_dump(info)

        t.status = TaskStatus.NotRan.value
        t.pid = None
        t.started = None
        t.finished = None
        t.computer_assigned = None
        t.celery_id = None
        t.worker_index = None
        t.docker_assigned = None

    provider.commit()
Example #23
    def process_start_dags(self):
        if len(self.dags_start) == 0:
            return

        for id in self.dags_start:
            can_start_statuses = [
                TaskStatus.Failed.value, TaskStatus.Skipped.value,
                TaskStatus.Stopped.value
            ]

            tasks = self.provider.by_dag(id)
            children_all = self.provider.children([t.id for t in tasks])

            def find_resume(task):
                children = [c for c in children_all if c.parent == task.id]
                children = sorted(children, key=lambda x: x.id, reverse=True)

                if len(children) > 0:
                    for c in children:
                        if c.parent != task.id:
                            continue

                        info = yaml_load(c.additional_info)
                        if 'distr_info' not in info:
                            continue

                        if info['distr_info']['rank'] == 0:
                            return {
                                'master_computer': c.computer_assigned,
                                'master_task_id': c.id,
                                'load_last': True
                            }
                    raise Exception('Master task not found')
                else:
                    return {
                        'master_computer': task.computer_assigned,
                        'master_task_id': task.id,
                        'load_last': True
                    }

            for t in tasks:
                if t.status not in can_start_statuses:
                    continue

                if t.parent:
                    continue

                if t.type == TaskType.Train.value:
                    info = yaml_load(t.additional_info)
                    info['resume'] = find_resume(t)
                    t.additional_info = yaml_dump(info)

                t.status = TaskStatus.NotRan.value
                t.pid = None
                t.started = None
                t.finished = None
                t.computer_assigned = None
                t.celery_id = None
                t.worker_index = None
                t.docker_assigned = None

        self.provider.commit()
        self.dags_start = []
Example #24
    def get(self, filter: dict, options: PaginatorOptions):
        query = self.query(Task, Project.name).\
            join(Dag, Dag.id == Task.dag).\
            join(Project, Project.id == Dag.project).\
            options(joinedload(Task.dag_rel, innerjoin=True))

        query = self._get_filter(query, filter)

        total = query.count()
        paginator = self.paginator(query, options)
        res = []

        for p, project_name in paginator.all():
            if p.dag_rel is None:
                continue

            item = {**self.to_dict(p, rules=('-additional_info', ))}
            item['status'] = to_snake(TaskStatus(item['status']).name)
            item['type'] = to_snake(TaskType(item['type']).name)
            item['dag_rel']['project'] = {
                'id': item['dag_rel']['project'],
                'name': project_name
            }
            if p.started is None:
                delta = 0
            elif p.status == TaskStatus.InProgress.value:
                delta = (now() - p.started).total_seconds()
            else:
                finish = (p.finished or p.last_activity)
                delta = (finish - p.started).total_seconds()
            item['duration'] = duration_format(delta)
            if p.dag_rel is not None:
                res.append(item)

        if filter.get('report'):
            tasks_within_report = self.query(
                ReportTasks.task
            ).filter(ReportTasks.report == int(filter['report']))
            tasks_within_report = {t[0] for t in tasks_within_report}
            for r in res:
                r['report_full'] = r['id'] in tasks_within_report

        projects = self.query(Project.name, Project.id). \
            order_by(Project.id.desc()). \
            limit(20). \
            all()
        dags = self.query(Dag.name, Dag.id). \
            order_by(Dag.id.desc()). \
            limit(20). \
            all()
        projects = [{'name': name, 'id': id} for name, id in projects]
        dags = [{'name': name, 'id': id} for name, id in dags]

        dags_model = self.query(Dag.name, Dag.id, Dag.config). \
            filter(Dag.type == DagType.Pipe.value). \
            order_by(Dag.id.desc()). \
            all()

        dags_model_dict = []
        used_dag_names = set()

        for name, id, config in dags_model:
            if name in used_dag_names:
                continue

            config = Config.from_yaml(config)
            slots = []
            for pipe in config['pipes'].values():
                for k, v in pipe.items():
                    if 'slot' in v:
                        slots.append(v['slot'])
                    elif 'slots' in v:
                        slots.extend(v['slots'])

            dag = {
                'name': name,
                'id': id,
                'slots': slots,
                'interfaces': [
                    {
                        'name': k,
                        'params': yaml_dump(v)
                    } for k, v in config['interfaces'].items()
                ]
            }
            dags_model_dict.append(dag)
            used_dag_names.add(name)

        return {
            'total': total,
            'data': res,
            'projects': projects,
            'dags': dags,
            'dags_model': dags_model_dict
        }
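
The slot/interface extraction in Example #24 assumes a pipe-style DAG config shaped roughly like the dictionary below (executor and interface names are purely illustrative):

# Illustrative config (made-up names) that the slot-collection loop from
# Example #24 would accept; each executor may declare 'slot' or 'slots'.
sample_config = {
    'interfaces': {
        'classifier': {'file': 'models/classifier.pth'}
    },
    'pipes': {
        'infer': {
            'predict': {'type': 'infer', 'slot': 'model'},
            'ensemble': {'type': 'blend', 'slots': ['model_a', 'model_b']}
        }
    }
}

slots = []
for pipe in sample_config['pipes'].values():
    for v in pipe.values():
        if 'slot' in v:
            slots.append(v['slot'])
        elif 'slots' in v:
            slots.extend(v['slots'])

assert slots == ['model', 'model_a', 'model_b']
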
Example #25
    def create_tasks(self):
        tasks = self.task_provider.by_dag(self.dag)
        tasks_new = []
        tasks_old = []

        for t in tasks:
            if t.parent:
                continue

            task = Task(
                name=t.name,
                status=TaskStatus.NotRan.value,
                computer=t.computer,
                gpu=t.gpu,
                gpu_max=t.gpu_max,
                cpu=t.cpu,
                executor=t.executor,
                memory=t.memory,
                steps=t.steps,
                dag=self.dag_db.id,
                debug=t.debug,
                type=t.type,
            )
            task.additional_info = t.additional_info
            tasks_new.append(task)
            tasks_old.append(t)

        self.task_provider.bulk_save_objects(tasks_new, return_defaults=True)
        old2new = {
            t_old.id: t_new.id
            for t_new, t_old in zip(tasks_new, tasks_old)
        }
        dependencies = self.task_provider.get_dependencies(self.dag)
        dependencies_new = []
        for d in dependencies:
            d_new = TaskDependence(task_id=old2new[d.task_id],
                                   depend_id=old2new[d.depend_id])
            dependencies_new.append(d_new)

        self.task_provider.bulk_save_objects(dependencies_new,
                                             return_defaults=False)

        changes = yaml_load(self.file_changes)
        storages = self.dag_storage_provider.by_dag(self.dag)
        storages_new = []

        for s, f in storages:
            if not isinstance(changes, dict):
                continue

            replace = self.find_replace(changes, s.path)
            if replace is not None and f:
                content = f.content.decode('utf-8')
                if s.path.endswith('.yml'):
                    data = yaml_load(content)
                    data = merge_dicts_smart(data, replace)
                    content = yaml_dump(data)
                else:
                    # replace maps old substrings to their new values
                    for k, v in replace.items():
                        if k not in content:
                            raise Exception(f'{k} is not in the content')
                        content = content.replace(k, v)
                content = content.encode('utf-8')
                md5 = hashlib.md5(content).hexdigest()
                f = self.file_provider.by_md5(md5)
                if not f:
                    f = File(content=content,
                             created=now(),
                             project=self.dag_db.project,
                             md5=md5,
                             dag=self.dag_db.id)
                self.file_provider.add(f)

            s_new = DagStorage(dag=self.dag_db.id,
                               file=f.id,
                               path=s.path,
                               is_dir=s.is_dir)
            storages_new.append(s_new)

        self.dag_storage_provider.bulk_save_objects(storages_new,
                                                    return_defaults=False)