Ejemplo n.º 1
0
    def create_service_task(
            self,
            task: Task,
            gpu_assigned=None,
            distr_info: dict = None,
            resume: dict = None
    ):
        new_task = Task(
            name=task.name,
            computer=task.computer,
            executor=task.executor,
            status=TaskStatus.NotRan.value,
            type=TaskType.Service.value,
            gpu_assigned=gpu_assigned,
            parent=task.id,
            report=task.report,
            dag=task.dag
        )
        new_task.additional_info = task.additional_info

        if distr_info:
            additional_info = yaml_load(new_task.additional_info)
            additional_info['distr_info'] = distr_info
            new_task.additional_info = yaml_dump(additional_info)

        if resume:
            additional_info = yaml_load(new_task.additional_info)
            additional_info['resume'] = resume
            new_task.additional_info = yaml_dump(additional_info)

        return self.provider.add(new_task)
Ejemplo n.º 2
0
    def create_task(self, k: str, v: dict, name: str, info: dict):
        task_type = TaskType.User.value
        if v.get('task_type') == 'train' or \
                Executor.is_trainable(v['type']):
            task_type = TaskType.Train.value

        gpu = str(v.get('gpu', '0'))
        if '-' not in gpu:
            gpu = int(gpu)
            gpu_max = gpu
        else:
            gpu, gpu_max = map(int, gpu.split('-'))

        if gpu == 0 and gpu_max > 0:
            raise Exception(f'Executor = {k} Gpu_max can"t be>0 when gpu=0')

        task = Task(name=name,
                    executor=k,
                    computer=self.info.get('computer'),
                    gpu=gpu,
                    gpu_max=gpu_max,
                    cpu=v.get('cpu', 1),
                    memory=v.get('memory', 0.1),
                    dag=self.dag.id,
                    debug=self.debug,
                    steps=int(v.get('steps', '1')),
                    type=task_type)

        if self.layout_name and task_type == TaskType.Train.value:
            if self.layout_name not in self.layouts:
                raise Exception(f'Unknown report = {v["report"]}')

            report_config = self.layouts[self.layout_name]
            info['report_config'] = report_config

            task.additional_info = yaml_dump(info)
            self.provider.add(task, commit=False)
            report = Report(config=yaml_dump(report_config),
                            name=task.name,
                            project=self.project,
                            layout=self.layout_name)
            self.report_provider.add(report)
            task.report = report.id

            self.report_tasks_provider.add(
                ReportTasks(report=report.id, task=task.id))

            self.report_tasks_provider.add(
                ReportTasks(report=self.dag_report_id, task=task.id))

            self.provider.commit()
        else:
            task.additional_info = yaml_dump(self.additional_info)
            self.provider.add(task)

        return task.id
Ejemplo n.º 3
0
    def create_task(self,
                    k: str,
                    v: dict,
                    name: str,
                    info: dict,
                    cell: dict = None):
        task_type = TaskType.User.value
        v = deepcopy(v)
        if v.get('task_type') == 'train' or \
                Executor.is_trainable(v['type']):
            task_type = TaskType.Train.value

        gpu = str(v.get('gpu', '0'))
        if '-' not in gpu:
            gpu = int(gpu)
            gpu_max = gpu
        else:
            gpu, gpu_max = map(int, gpu.split('-'))

        if gpu == 0 and gpu_max > 0:
            raise Exception(f'Executor = {k} Gpu_max can"t be>0 when gpu=0')

        task = Task(name=name,
                    executor=k,
                    computer=self.info.get('computer') or v.get('computer'),
                    gpu=gpu,
                    gpu_max=gpu_max,
                    cpu=v.get('cpu', 1),
                    memory=v.get('memory', 0.1),
                    dag=self.dag.id,
                    debug=self.debug,
                    steps=int(v.get('steps', '1')),
                    type=task_type)

        if cell is not None:
            v.update(cell)

        info['executor'] = v
        task.additional_info = yaml_dump(info)
        report = None

        if self.layout_name and task_type == TaskType.Train.value:
            if self.layout_name not in self.layouts:
                raise Exception(f'Unknown report = {v["report"]}')

            report_config = self.layouts[self.layout_name]
            info['report_config'] = report_config

            task.additional_info = yaml_dump(info)
            report = Report(config=yaml_dump(report_config),
                            name=task.name,
                            project=self.project,
                            layout=self.layout_name)

        return task, report
Ejemplo n.º 4
0
    def process_task(self, task: Task):
        auxiliary = self.auxiliary['process_tasks'][-1]
        auxiliary['computers'] = []

        config = yaml_load(task.dag_rel.config)
        executor = config['executors'][task.executor]

        computers = self._process_task_get_computers(executor, task, auxiliary)
        if len(computers) == 0:
            return

        to_send = self._process_task_to_send(executor, task, computers)
        auxiliary['to_send'] = to_send[:5]
        additional_info = yaml_load(task.additional_info)

        rank = 0
        master_port = None
        if len(to_send) > 0:

            master_port = self.find_port(
                to_send[0][0], to_send[0][1].split('_')[1]
            )
            computer_names = {c['name'] for c, _, __ in to_send}
            if len(computer_names) == 1:
                task.computer_assigned = list(computer_names)[0]

        for computer, queue, gpu_assigned in to_send:
            main_cmp = to_send[0][0]
            # noinspection PyTypeChecker
            ip = 'localhost' if computer['name'] == main_cmp['name'] \
                else main_cmp['ip']

            distr_info = {
                'master_addr': ip,
                'rank': rank,
                'local_rank': gpu_assigned,
                'master_port': master_port,
                'world_size': len(to_send),
                'master_computer': main_cmp['name']
            }
            service_task = self.create_service_task(
                task,
                distr_info=distr_info,
                gpu_assigned=gpu_assigned,
                resume=additional_info.get('resume')
            )
            self.process_to_celery(service_task, queue, computer)
            rank += 1
            main_cmp['ports'].add(master_port)

        if len(to_send) > 0:
            task.status = TaskStatus.Queued.value
            self.sent_tasks += len(to_send)
Ejemplo n.º 5
0
    def process_to_celery(self, task: Task, queue: str, computer: dict):
        r = execute.apply_async((task.id, ), queue=queue)
        task.status = TaskStatus.Queued.value
        task.computer_assigned = computer['name']
        task.celery_id = r.id

        if task.gpu_assigned is not None:
            for g in map(int, task.gpu_assigned.split(',')):
                computer['gpu'][g] = task.id
            computer['cpu'] -= task.cpu
            computer['memory'] -= task.memory * 1024

        self.provider.update()
Ejemplo n.º 6
0
    def _process_task_to_send(self, executor: dict, task: Task,
                              computers: List[dict]):
        distr = executor.get('distr', True)
        to_send = []
        for computer in computers:
            queue = f'{computer["name"]}_' \
                    f'{task.dag_rel.docker_img or "default"}'

            if task.gpu_max > 1 and distr:
                for index, task_taken_gpu in enumerate(computer['gpu']):
                    if task_taken_gpu:
                        continue
                    to_send.append([computer, queue, index])

                    if len(to_send) >= task.gpu_max:
                        break

                if len(to_send) >= task.gpu_max:
                    break
            elif task.gpu_max > 0:
                cuda_devices = []
                for index, task_taken_gpu in enumerate(computer['gpu']):
                    if task_taken_gpu:
                        continue

                    cuda_devices.append(index)
                    if len(cuda_devices) >= task.gpu_max:
                        break

                task.gpu_assigned = ','.join(map(str, cuda_devices))
                self.process_to_celery(task, queue, computer)
            else:
                self.process_to_celery(task, queue, computer)
                break
        return to_send
Ejemplo n.º 7
0
    def process_to_celery(self, task: Task, queue: str, computer: dict):
        r = execute.apply_async((task.id, ), queue=queue, retry=False)
        task.status = TaskStatus.Queued.value
        task.computer_assigned = computer['name']
        task.celery_id = r.id

        if task.gpu_assigned is not None:
            for g in map(int, task.gpu_assigned.split(',')):
                computer['gpu'][g] = task.id
            computer['cpu'] -= task.cpu
            computer['memory'] -= task.memory * 1024

        self.logger.info(
            f'Sent task={task.id} to celery. Queue = {queue} '
            f'Task status = {task.status} Celery_id = {r.id}',
            ComponentType.Supervisor)
        self.provider.update()
Ejemplo n.º 8
0
    def create_tasks(self):
        tasks = self.task_provider.by_dag(self.dag)
        tasks_new = []
        tasks_old = []

        for t in tasks:
            if t.parent:
                continue

            task = Task(
                name=t.name,
                status=TaskStatus.NotRan.value,
                computer=t.computer,
                gpu=t.gpu,
                gpu_max=t.gpu_max,
                cpu=t.cpu,
                executor=t.executor,
                memory=t.memory,
                steps=t.steps,
                dag=self.dag_db.id,
                debug=t.debug,
                type=t.type,
            )
            task.additional_info = t.additional_info
            tasks_new.append(task)
            tasks_old.append(t)

        self.task_provider.bulk_save_objects(tasks_new, return_defaults=True)
        old2new = {
            t_old.id: t_new.id
            for t_new, t_old in zip(tasks_new, tasks_old)
        }
        dependencies = self.task_provider.get_dependencies(self.dag)
        dependencies_new = []
        for d in dependencies:
            d_new = TaskDependence(task_id=old2new[d.task_id],
                                   depend_id=old2new[d.depend_id])
            dependencies_new.append(d_new)

        self.task_provider.bulk_save_objects(dependencies_new,
                                             return_defaults=False)

        changes = yaml_load(self.file_changes)
        storages = self.dag_storage_provider.by_dag(self.dag)
        storages_new = []

        for s, f in storages:
            if not isinstance(changes, dict):
                continue

            replace = self.find_replace(changes, s.path)
            if replace is not None and f:
                content = f.content.decode('utf-8')
                if s.path.endswith('.yml'):
                    data = yaml_load(content)
                    data = merge_dicts_smart(data, replace)
                    content = yaml_dump(data)
                else:
                    for k, v in replace:
                        if k not in content:
                            raise Exception(f'{k} is not in the content')
                        content = content.replace(k, v)
                content = content.encode('utf-8')
                md5 = hashlib.md5(content).hexdigest()
                f = self.file_provider.by_md5(md5)
                if not f:
                    f = File(content=content,
                             created=now(),
                             project=self.dag_db.project,
                             md5=md5,
                             dag=self.dag_db.id)
                self.file_provider.add(f)

            s_new = DagStorage(dag=self.dag_db.id,
                               file=f.id,
                               path=s.path,
                               is_dir=s.is_dir)
            storages_new.append(s_new)

        self.dag_storage_provider.bulk_save_objects(storages_new,
                                                    return_defaults=False)