Example #1
0
def sync(project: str, computer: str, only_from: bool, only_to: bool):
    """
    Sync the given project between this computer and every other
    registered computer.

    :param project: project name; must exist in the database
    :param computer: computer name; defaults to this host's name
    :param only_from: when True, do not push from this computer
    :param only_to: when True, do not pull to this computer
    """
    _create_computer()

    computer_provider = ComputerProvider(_session)
    project_provider = ProjectProvider(_session)

    self_computer = computer_provider.by_name(
        computer or socket.gethostname())
    known_computers = computer_provider.all()

    p = project_provider.by_name(project)
    assert p, f'Project={project} is not found'

    # data/ honors the project's ignore list; models/ excludes nothing.
    excluded = [str(f) for f in yaml_load(p.ignore_folders)]
    folders_excluded = [
        [join('data', p.name), excluded],
        [join('models', p.name), []],
    ]

    for other in known_computers:
        if other.name == self_computer.name:
            continue
        if not only_from:
            sync_directed(_session, self_computer, other, folders_excluded)
        if not only_to:
            sync_directed(_session, other, self_computer, folders_excluded)
Example #2
0
def computers():
    """
    Return a paginated list of computers, always sorted by name.
    """
    payload = request_data()

    paginator = PaginatorOptions(**payload['paginator'])
    # Sorting is fixed regardless of what the client asked for.
    paginator.sort_column = 'name'

    return ComputerProvider(_read_session).get(payload, paginator)
Example #3
0
    def sync_manual(self, computer: Computer, provider: ComputerProvider):
        """
        Perform a manual sync requested through ``computer.meta``.

        A request handler stores a 'manual_sync' entry in the computer's
        meta; this method pulls the project's models folder from every
        other online docker host, then deletes the entry so the sync
        runs exactly once.
        """
        # No meta at all means no request was stored.
        if not computer.meta:
            return

        meta = yaml_load(computer.meta)
        if 'manual_sync' not in meta:
            return

        manual_sync = meta['manual_sync']

        project_provider = ProjectProvider(self.session)
        docker_provider = DockerProvider(self.session)

        dockers = docker_provider.get_online()
        project = project_provider.by_id(manual_sync['project'])

        for docker in dockers:
            # Never sync a computer with itself.
            if docker.computer == computer.name:
                continue

            source = provider.by_name(docker.computer)
            # Only the project's models folder is synced, nothing
            # inside it is excluded.
            ignore_folders = [
                [join('models', project.name), []]
            ]
            sync_directed(self.session, target=computer, source=source,
                          ignore_folders=ignore_folders)

        # Clear the request so it is not processed again.
        del meta['manual_sync']
        computer.meta = yaml_dump(meta)
        provider.update()
Example #4
0
    def __init__(
            self,
            args: Args,
            report: ReportLayoutInfo,
            distr_info: dict,
            resume: dict,
            grid_config: dict,
            trace: str,
            params: dict,
            **kwargs
    ):
        """
        Initialize executor state.

        :param args: parsed arguments for this run
        :param report: layout info of the report to fill in
        :param distr_info: distributed-training info (rank, master, ...)
        :param resume: checkpoint-resume info, if any
        :param grid_config: grid configuration for this run
        :param trace: trace path — presumably where a traced model is
            written; confirm against the caller
        :param params: extra executor parameters
        :param kwargs: forwarded to the base class
        """
        super().__init__(**kwargs)

        self.order = 0
        self.resume = resume
        self.distr_info = distr_info
        self.args = args
        self.report = report
        # Filled in later, once the experiment/runner are created.
        self.experiment = None
        self.runner = None
        self.series_provider = ReportSeriesProvider(self.session)
        self.computer_provider = ComputerProvider(self.session)
        self.grid_config = grid_config
        # Assumed master until distributed setup says otherwise.
        self.master = True
        self.checkpoint_resume = False
        self.checkpoint_stage_epoch = 0
        self.trace = trace
        self.params = params
        # Progress / loader bookkeeping.
        self.last_batch_logged = None
        self.loader_started_time = None
        self.parent = None
        self.loader_step_start = 0
Example #5
0
def copy_remote(session: Session, computer_from: str, path_from: str,
                path_to: str):
    """
    Copy a file from ``computer_from`` to this machine.

    Uses ``scp`` when the source is a remote computer and a plain
    ``cp`` when the source is this very host.

    :param session: database session used to look up the source
        computer's connection details
    :param computer_from: name of the computer that holds the file
    :param path_from: path of the file on the source computer
    :param path_to: destination path on this computer
    :return: True if the destination file exists after the copy
    """
    provider = ComputerProvider(session)
    src = provider.by_name(computer_from)
    host = socket.gethostname()
    if host != computer_from:
        c = f'scp -P {src.port} {src.user}@{src.ip}:{path_from} {path_to}'
    else:
        # Bug fix: the command was built but never assigned, so this
        # branch raised NameError at check_output below.
        c = f'cp {path_from} {path_to}'
    subprocess.check_output(c, shell=True)
    return os.path.exists(path_to)
Example #6
0
def computer_sync_end():
    """
    Register a manual-sync request on every matching computer.

    Writes a 'manual_sync' entry into each computer's meta; a worker
    picks it up later and performs the actual sync.
    """
    data = request_data()
    provider = ComputerProvider(_write_session)

    target = data.get('computer')
    for comp in provider.all():
        # When a specific computer was requested, skip all others.
        if target and target != comp.name:
            continue

        meta = yaml_load(comp.meta)
        meta['manual_sync'] = {
            'project': data['id'],
            'ignore_folders': yaml_load(data['ignore_folders'])
        }
        comp.meta = yaml_dump(meta)

    provider.update()
Example #7
0
    def sync_manual(self, computer: Computer, provider: ComputerProvider):
        """
        Perform a manual sync requested through ``computer.meta``.

        A request handler stores a 'manual_sync' entry (project id plus
        sync/ignore folder lists) in the computer's meta; this method
        pulls those folders from every other online docker host, then
        removes the entry so the sync runs only once.
        """
        if not computer.meta:
            return

        meta = yaml_load(computer.meta)
        if 'manual_sync' not in meta:
            return

        manual_sync = meta['manual_sync']

        project_provider = ProjectProvider(self.session)
        docker_provider = DockerProvider(self.session)

        dockers = docker_provider.get_online()
        project = project_provider.by_id(manual_sync['project'])
        sync_folders = manual_sync['sync_folders']
        ignore_folders = manual_sync['ignore_folders']

        # Normalize folder lists against the project name.
        sync_folders = correct_folders(sync_folders, project.name)
        ignore_folders = correct_folders(ignore_folders, project.name)

        # Anything that is not a list is treated as "no folders".
        if not isinstance(sync_folders, list):
            sync_folders = []
        if not isinstance(ignore_folders, list):
            ignore_folders = []

        for docker in dockers:
            # Never sync a computer with itself.
            if docker.computer == computer.name:
                continue

            source = provider.by_name(docker.computer)
            folders = [[s, ignore_folders] for s in sync_folders]

            # Expose which computer we are currently pulling from.
            computer.syncing_computer = source.name
            provider.update()

            try:
                sync_directed(
                    self.session,
                    target=computer,
                    source=source,
                    folders=folders
                )
            except Exception as e:
                self.process_error(e)
        # NOTE(review): the request is cleared even when some syncs
        # failed above, so failed manual syncs are not retried —
        # confirm this is intended.
        del meta['manual_sync']
        computer.meta = yaml_dump(meta)
        provider.update()
Example #8
0
    def create_base(self):
        """
        Commit the session and (re)build all providers plus the list of
        live celery queues (one per online docker per computer).
        """
        self.session.commit()

        session = self.session
        self.provider = TaskProvider(session)
        self.computer_provider = ComputerProvider(session)
        self.docker_provider = DockerProvider(session)
        self.auxiliary_provider = AuxiliaryProvider(session)
        self.dag_provider = DagProvider(session)

        # A docker counts as online if it reported activity within the
        # last 15 seconds.
        threshold = now() - datetime.timedelta(seconds=15)
        self.queues = [
            f'{d.computer}_{d.name}'
            for d in self.docker_provider.all()
            if d.last_activity >= threshold
        ]

        self.auxiliary['queues'] = self.queues
Example #9
0
    def sync(self):
        """
        One iteration of the background file-sync loop.

        Pulls project files from every recently-active computer that
        has tasks not yet synced to this machine, records the sync
        time, and on database errors recreates the session and logger.
        """
        hostname = socket.gethostname()
        try:
            provider = ComputerProvider(self.session)
            task_synced_provider = TaskSyncedProvider(self.session)

            computer = provider.by_name(hostname)
            sync_start = now()

            # Interval 0 effectively disables syncing; just idle.
            if FILE_SYNC_INTERVAL == 0:
                time.sleep(1)
            else:
                # (sic) the provider method name contains a typo.
                computers = provider.all_with_last_activtiy()
                # Only computers active within the last 10 seconds.
                computers = [
                    c for c in computers
                    if (now() - c.last_activity).total_seconds() < 10
                ]
                computers_names = {c.name for c in computers}

                for c, project, tasks in task_synced_provider.for_computer(
                        computer.name):
                    if c.name not in computers_names:
                        self.logger.info(
                            f'Computer = {c.name} '
                            f'is offline. Can not sync',
                            ComponentType.WorkerSupervisor, hostname)
                        continue

                    # Skip sources that are busy syncing themselves.
                    if c.syncing_computer:
                        continue

                    # data/ honors the project's ignore list; models/
                    # excludes nothing.
                    excluded = list(map(str,
                                        yaml_load(project.ignore_folders)))
                    folders_excluded = [[join('data', project.name), excluded],
                                        [join('models', project.name), []]]

                    computer.syncing_computer = c.name
                    provider.update()
                    sync_directed(self.session, c, computer, folders_excluded)

                    # Mark each task as synced to this computer.
                    for t in tasks:
                        task_synced_provider.add(
                            TaskSynced(computer=computer.name, task=t.id))

                    time.sleep(FILE_SYNC_INTERVAL)

            computer.last_synced = sync_start
            computer.syncing_computer = None
            provider.update()
        except Exception as e:
            # A sqlalchemy error leaves the session unusable; rebuild
            # it (and the logger bound to it) before reporting.
            if Session.sqlalchemy_error(e):
                Session.cleanup('FileSync')
                self.session = Session.create_session(key='FileSync')
                self.logger = create_logger(self.session, 'FileSync')

            self.logger.error(traceback.format_exc(),
                              ComponentType.WorkerSupervisor, hostname)
Example #10
0
def describe_resources(computer: str, axis):
    """
    Plot the mean resource-usage history of one computer on the given
    matplotlib axis. Disk usage is skipped; values are percentages.
    """
    provider = ComputerProvider()
    rows = provider.get({})['data']
    row = [r for r in rows if r['name'] == computer][0]
    usage = row['usage_history']

    times = [
        datetime.datetime.strptime(t, provider.datetime_format)
        for t in usage['time']
    ]

    for item in usage['mean']:
        # Disk fills slowly and would flatten the other curves.
        if item['name'] == 'disk':
            continue
        axis.plot(times, item['value'], label=item['name'])

    axis.set_title('Resources')
    axis.set_ylabel('%')
    axis.legend(loc='lower left')
Example #11
0
def sync(project: str, computer: str, only_from: bool, only_to: bool,
         online: bool):
    """
    Syncs specified project on this computer with other computers
    """
    check_statuses()

    _create_computer()
    _create_docker()

    provider = ComputerProvider(_session)
    project_provider = ProjectProvider(_session)

    self_computer = provider.by_name(computer or socket.gethostname())
    computers = provider.all_with_last_activtiy()

    p = project_provider.by_name(project)
    assert p, f'Project={project} is not found'

    # Normalize the project's folder lists; non-lists mean "nothing".
    sync_folders = correct_folders(yaml_load(p.sync_folders), p.name)
    ignore_folders = correct_folders(yaml_load(p.ignore_folders), p.name)

    if not isinstance(sync_folders, list):
        sync_folders = []
    if not isinstance(ignore_folders, list):
        ignore_folders = []

    folders = [[folder, ignore_folders] for folder in sync_folders]

    for other in computers:
        if other.name == self_computer.name:
            continue

        # In online mode skip computers silent for over 100 seconds.
        if online and (now() - other.last_activity).total_seconds() > 100:
            continue

        if not only_from:
            sync_directed(_session, self_computer, other, folders)
        if not only_to:
            sync_directed(_session, other, self_computer, folders)
Example #12
0
def worker_usage(session: Session, logger):
    """
    Sample this machine's resource usage and store the averaged result.

    Takes one sample in economic mode (or ten otherwise), publishes
    each sample as the current usage, then persists the mean as a
    ComputerUsage row.
    """
    provider = ComputerProvider(session)
    docker_provider = DockerProvider(session)

    hostname = socket.gethostname()
    docker = docker_provider.get(hostname, DOCKER_IMG)

    samples = []
    for _ in range(1 if MODE_ECONOMIC else 10):
        # noinspection PyProtectedMember
        mem = dict(psutil.virtual_memory()._asdict())

        sample = {
            'cpu': psutil.cpu_percent(),
            'disk': disk(ROOT_FOLDER)[1],
            'memory': mem['percent'],
            'gpu': [{
                'memory': g.memoryUtil * 100,
                'load': g.load * 100
            } for g in GPUtil.getGPUs()]
        }

        provider.current_usage(hostname, sample)
        samples.append(sample)

        # Keep the docker record marked as alive while sampling.
        docker.last_activity = now()
        docker_provider.update()

        time.sleep(10 if MODE_ECONOMIC else 1)

    mean_usage = json.dumps({'mean': dict_func(samples, np.mean)})
    provider.add(
        ComputerUsage(computer=hostname, usage=mean_usage, time=now()))
Example #13
0
def _create_computer():
    """
    Register (or refresh) this machine in the Computer table, keyed by
    its hostname.
    """
    # memory()/disk() return (total, used, free); only totals matter.
    total_memory, _, _ = memory()
    total_disk, _, _ = disk(ROOT_FOLDER)

    record = Computer(
        name=socket.gethostname(),
        gpu=len(GPUtil.getGPUs()),
        cpu=cpu_count(),
        memory=total_memory,
        ip=IP,
        port=PORT,
        user=get_username(),
        disk=total_disk,
        root_folder=ROOT_FOLDER
    )
    ComputerProvider(_session).create_or_update(record, 'name')
Example #14
0
def _create_computer():
    """
    Register (or refresh) this machine in the Computer table.

    The record is keyed by hostname; the GPU count comes from torch and
    the sync/processing flags from module-level configuration.
    """
    # memory()/disk() return (total, used, free); only totals are kept.
    tot_m, used_m, free_m = memory()
    tot_d, used_d, free_d = disk(ROOT_FOLDER)
    computer = Computer(name=socket.gethostname(),
                        gpu=torch.cuda.device_count(),
                        cpu=cpu_count(),
                        memory=tot_m,
                        ip=IP,
                        port=PORT,
                        user=get_username(),
                        disk=tot_d,
                        root_folder=ROOT_FOLDER,
                        sync_with_this_computer=SYNC_WITH_THIS_COMPUTER,
                        can_process_tasks=CAN_PROCESS_TASKS)
    ComputerProvider(_session).create_or_update(computer, 'name')
Example #15
0
def worker_usage(session: Session, logger):
    """
    Sample this machine's resource usage and store the averaged result.

    Samples cpu/disk/memory/gpu every WORKER_USAGE_INTERVAL seconds for
    roughly 10 seconds total, publishes each sample as the current
    usage, then persists the mean as a ComputerUsage row.
    """
    provider = ComputerProvider(session)
    docker_provider = DockerProvider(session)

    computer = socket.gethostname()
    docker = docker_provider.get(computer, DOCKER_IMG)
    usages = []

    # ~10 seconds worth of samples, but always at least one.
    count = int(10 / WORKER_USAGE_INTERVAL)
    count = max(1, count)

    for _ in range(count):
        # noinspection PyProtectedMember
        memory = dict(psutil.virtual_memory()._asdict())

        # GPUtil can raise ValueError when no GPU is active; treat
        # that as "no gpus".
        try:
            gpus = GPUtil.getGPUs()
        except ValueError as err:
            logger.info(f"Active GPUs not found: {err}")
            gpus = []

        usage = {
            'cpu':
            psutil.cpu_percent(),
            'disk':
            disk(ROOT_FOLDER)[1],
            'memory':
            memory['percent'],
            'gpu': [{
                'memory': g.memoryUtil * 100,
                'load': g.load * 100
            } for g in gpus]
        }

        provider.current_usage(computer, usage)
        usages.append(usage)
        # Keep the docker record marked as alive while sampling.
        docker.last_activity = now()
        docker_provider.update()

        time.sleep(WORKER_USAGE_INTERVAL)

    usage = json.dumps({'mean': dict_func(usages, np.mean)})
    provider.add(ComputerUsage(computer=computer, usage=usage, time=now()))
Example #16
0
class SupervisorBuilder:
    def __init__(self):
        """
        Create an empty supervisor; providers are built lazily in
        create_base once the session is known to be usable.
        """
        self.session = Session.create_session(key='SupervisorBuilder')
        self.logger = create_logger(self.session, 'SupervisorBuilder')
        # Providers, filled in by create_base.
        self.provider = None
        self.computer_provider = None
        self.docker_provider = None
        self.auxiliary_provider = None
        self.dag_provider = None
        # Scheduling state, refreshed on every pass.
        self.queues = None
        self.not_ran_tasks = None
        self.dep_status = None
        self.computers = None
        self.auxiliary = {}

        # Work queued from outside (stop_tasks / start_dag) plus stats.
        self.tasks = []
        self.tasks_stop = []
        self.dags_start = []
        self.sent_tasks = 0

    def create_base(self):
        """
        Commit the session and (re)create all providers plus the list
        of live celery queues (one per online docker per computer).
        """
        self.session.commit()

        self.provider = TaskProvider(self.session)
        self.computer_provider = ComputerProvider(self.session)
        self.docker_provider = DockerProvider(self.session)
        self.auxiliary_provider = AuxiliaryProvider(self.session)
        self.dag_provider = DagProvider(self.session)

        # A docker counts as online if it reported within 15 seconds.
        self.queues = [
            f'{d.computer}_{d.name}' for d in self.docker_provider.all()
            if d.last_activity >= now() - datetime.timedelta(seconds=15)
        ]

        self.auxiliary['queues'] = self.queues

    def load_tasks(self):
        """
        Load all active tasks and prepare the queue of not-ran tasks.

        Non-debug not-ran tasks are sorted by requested gpu count
        (descending) and their dependency statuses are collected.
        """
        self.tasks = self.provider.by_status(
            TaskStatus.NotRan, TaskStatus.InProgress, TaskStatus.Queued)

        not_ran = [
            t for t in self.tasks if t.status == TaskStatus.NotRan.value
        ]

        # Debug tasks are never scheduled here; gpu-hungry tasks first.
        self.not_ran_tasks = sorted(
            (t for t in not_ran if not t.debug),
            key=lambda t: t.gpu or 0,
            reverse=True
        )

        self.logger.debug(
            f'Found {len(not_ran)} not ran tasks',
            ComponentType.Supervisor
        )

        self.dep_status = self.provider.dependency_status(self.not_ran_tasks)

        # Short preview of pending tasks for the auxiliary report.
        self.auxiliary['not_ran_tasks'] = [
            {
                'id': t.id,
                'name': t.name,
                'dep_status': [
                    TaskStatus(s).name
                    for s in self.dep_status.get(t.id, set())
                ]
            } for t in not_ran[:5]
        ]

    def load_computers(self):
        """
        Build the free-resource view of every computer.

        Starts from each computer's totals and subtracts the resources
        of all queued/in-progress tasks already assigned to it, so the
        scheduler only sees what is actually free.
        """
        computers = self.computer_provider.computers()
        for computer in computers.values():
            # gpu becomes a slot list: 0 = free, task id = taken.
            computer['gpu'] = [0] * computer['gpu']
            computer['ports'] = set()
            computer['cpu_total'] = computer['cpu']
            computer['memory_total'] = computer['memory']
            computer['gpu_total'] = len(computer['gpu'])
            computer['can_process_tasks'] = computer['can_process_tasks']

        tasks = [
            t for t in self.tasks if
            t.status in [TaskStatus.InProgress.value,
                         TaskStatus.Queued.value]
        ]

        for task in tasks:
            if task.computer_assigned is None:
                continue
            assigned = task.computer_assigned
            comp_assigned = computers[assigned]
            comp_assigned['cpu'] -= task.cpu

            # Mark each gpu slot occupied by the task with its id.
            if task.gpu_assigned is not None:
                for g in task.gpu_assigned.split(','):
                    comp_assigned['gpu'][int(g)] = task.id
            # presumably task.memory is in GB while computer memory is
            # in MB (hence * 1024) — TODO confirm
            comp_assigned['memory'] -= task.memory * 1024

            # Reserve the master port of distributed rank-0 tasks.
            info = yaml_load(task.additional_info)
            if 'distr_info' in info:
                dist_info = info['distr_info']
                if dist_info['rank'] == 0:
                    comp_assigned['ports'].add(dist_info['master_port'])

        self.computers = [
            {
                **value, 'name': name
            } for name, value in computers.items()
        ]

        self.auxiliary['computers'] = self.computers

    def process_to_celery(self, task: Task, queue: str, computer: dict):
        """
        Send the task to celery and book its resources on ``computer``.

        :param task: task to enqueue; status/celery_id updated in place
        :param queue: celery queue name ('<computer>_<docker-image>')
        :param computer: mutable free-resource dict from load_computers
        """
        r = execute.apply_async((task.id,), queue=queue, retry=False)
        task.status = TaskStatus.Queued.value
        task.computer_assigned = computer['name']
        task.celery_id = r.id

        # Book resources immediately so later tasks in the same pass
        # see the reduced free capacity.
        if task.computer_assigned is not None:
            if task.gpu_assigned:
                for g in map(int, task.gpu_assigned.split(',')):
                    computer['gpu'][g] = task.id
            computer['cpu'] -= task.cpu
            computer['memory'] -= task.memory * 1024

        self.logger.info(
            f'Sent task={task.id} to celery. Queue = {queue} '
            f'Task status = {task.status} Celery_id = {r.id}',
            ComponentType.Supervisor)
        self.provider.update()

    def create_service_task(
            self,
            task: Task,
            gpu_assigned=None,
            distr_info: dict = None,
            resume: dict = None
    ):
        """
        Create and persist a child service task mirroring ``task``.

        :param task: parent task to copy name/executor/report/dag from
        :param gpu_assigned: gpu index (or indices) for the new task
        :param distr_info: stored in additional_info as 'distr_info'
        :param resume: stored in additional_info as 'resume'
        :return: the newly added task
        """
        new_task = Task(
            name=task.name,
            computer=task.computer,
            executor=task.executor,
            status=TaskStatus.NotRan.value,
            type=TaskType.Service.value,
            gpu_assigned=gpu_assigned,
            parent=task.id,
            report=task.report,
            dag=task.dag
        )
        new_task.additional_info = task.additional_info

        # One yaml round-trip instead of one per optional field.
        if distr_info or resume:
            additional_info = yaml_load(new_task.additional_info)
            if distr_info:
                additional_info['distr_info'] = distr_info
            if resume:
                additional_info['resume'] = resume
            new_task.additional_info = yaml_dump(additional_info)

        return self.provider.add(new_task)

    def find_port(self, c: dict, docker_name: str):
        """
        Return the first free port in the docker's configured port
        range on computer ``c``; raise if every port is taken.
        """
        docker = self.docker_provider.get(c['name'], docker_name)
        low, high = map(int, docker.ports.split('-'))
        taken = c['ports']
        for port in range(low, high + 1):
            if port not in taken:
                return port
        raise Exception(f'All ports in {c["name"]} are taken')

    def _process_task_valid_computer(self, task: Task, c: dict,
                                     single_node: bool):
        """
        Check whether computer ``c`` can run ``task``.

        :param task: task to place
        :param c: free-resource dict built by load_computers
        :param single_node: True when the task must fit on one computer
        :return: None when suitable, otherwise a reason string
        """
        if not c['can_process_tasks']:
            return 'this computer can not process tasks'

        if task.computer is not None and task.computer != c['name']:
            return 'name set in the config!= name of this computer'

        if task.cpu > c['cpu']:
            return f'task cpu = {task.cpu} > computer' \
                   f' free cpu = {c["cpu"]}'

        if task.memory > c['memory']:
            # Bug fix: this message reported cpu instead of memory.
            return f'task memory = {task.memory} > computer ' \
                   f'free memory = {c["memory"]}'

        queue = f'{c["name"]}_' \
                f'{task.dag_rel.docker_img or "default"}'
        if queue not in self.queues:
            return f'required queue = {queue} not in queues'

        if task.gpu > 0 and not any(g == 0 for g in c['gpu']):
            return 'task requires gpu, but there is not any free'

        free_gpu = sum(g == 0 for g in c['gpu'])
        if single_node and task.gpu > free_gpu:
            return f'task requires {task.gpu} ' \
                   f'but there are only {free_gpu} free'

    def _process_task_get_computers(
            self, executor: dict, task: Task, auxiliary: dict
    ):
        """
        Collect the computers able to run ``task``.

        For a single-node gpu task only the computer with the most free
        gpus is kept. Returns [] (and records the reason in
        ``auxiliary``) when the free gpus cannot satisfy the task.
        """
        single_node = executor.get('single_node', True)

        computers = []
        for c in self.computers:
            error = self._process_task_valid_computer(task, c, single_node)
            auxiliary['computers'].append({'name': c['name'], 'error': error})
            if not error:
                computers.append(c)

        if task.gpu > 0 and single_node and len(computers) > 0:
            # Bug fix: the sort key closed over the leaked loop
            # variable `c` instead of the element being sorted, so the
            # computer with the most free gpus was never selected.
            computers = sorted(
                computers,
                key=lambda x: sum(g == 0 for g in x['gpu']),
                reverse=True
            )[:1]

        free_gpu = sum(sum(g == 0 for g in c['gpu']) for c in computers)
        if task.gpu > free_gpu:
            auxiliary['not_valid'] = f'gpu required by the ' \
                                     f'task = {task.gpu},' \
                                     f' but there are only {free_gpu} ' \
                                     f'free gpus'
            return []
        return computers

    def _process_task_to_send(
            self, executor: dict, task: Task, computers: List[dict]
    ):
        """
        Decide how the task is sent to the candidate computers.

        Distributed multi-gpu tasks produce one (computer, queue,
        gpu-index) triple per claimed gpu — the caller creates a
        service sub-task for each. Other tasks are sent to celery
        directly here and yield an empty result.

        :return: list of [computer, queue, gpu_index] for distributed
            tasks, [] otherwise
        """
        distr = executor.get('distr', True)
        to_send = []
        for computer in computers:
            queue = f'{computer["name"]}_' \
                    f'{task.dag_rel.docker_img or "default"}'

            if task.gpu_max > 1 and distr:
                # Claim free gpu slots one by one, up to gpu_max,
                # possibly spanning several computers.
                for index, task_taken_gpu in enumerate(computer['gpu']):
                    if task_taken_gpu:
                        continue
                    to_send.append([computer, queue, index])

                    if len(to_send) >= task.gpu_max:
                        break

                if len(to_send) >= task.gpu_max:
                    break
            elif task.gpu_max > 0:
                # Non-distributed gpu task: give it up to gpu_max free
                # gpus on this computer and enqueue immediately.
                cuda_devices = []
                for index, task_taken_gpu in enumerate(computer['gpu']):
                    if task_taken_gpu:
                        continue

                    cuda_devices.append(index)
                    if len(cuda_devices) >= task.gpu_max:
                        break

                task.gpu_assigned = ','.join(map(str, cuda_devices))
                self.process_to_celery(task, queue, computer)
                # NOTE(review): no break here, so the task may be
                # enqueued on every suitable computer — confirm this
                # is intended.
            else:
                # Cpu-only task: first suitable computer wins.
                self.process_to_celery(task, queue, computer)
                break
        return to_send

    def process_task(self, task: Task):
        """
        Try to schedule one task.

        Picks suitable computers, then either enqueues the task
        directly (handled inside _process_task_to_send) or, for
        distributed training, creates one service sub-task per gpu and
        wires up master address/port and ranks.
        """
        auxiliary = self.auxiliary['process_tasks'][-1]
        auxiliary['computers'] = []

        config = yaml_load(task.dag_rel.config)
        executor = config['executors'][task.executor]

        computers = self._process_task_get_computers(executor, task, auxiliary)
        if len(computers) == 0:
            return

        to_send = self._process_task_to_send(executor, task, computers)
        auxiliary['to_send'] = to_send[:5]
        additional_info = yaml_load(task.additional_info)

        rank = 0
        master_port = None
        if len(to_send) > 0:

            # The first assignment hosts the master process; the queue
            # suffix after '_' is the docker image name.
            master_port = self.find_port(
                to_send[0][0], to_send[0][1].split('_')[1]
            )
            computer_names = {c['name'] for c, _, __ in to_send}
            if len(computer_names) == 1:
                task.computer_assigned = list(computer_names)[0]

        for computer, queue, gpu_assigned in to_send:
            main_cmp = to_send[0][0]
            # noinspection PyTypeChecker
            ip = 'localhost' if computer['name'] == main_cmp['name'] \
                else main_cmp['ip']

            distr_info = {
                'master_addr': ip,
                'rank': rank,
                'local_rank': gpu_assigned,
                'master_port': master_port,
                'world_size': len(to_send),
                'master_computer': main_cmp['name']
            }
            service_task = self.create_service_task(
                task,
                distr_info=distr_info,
                gpu_assigned=gpu_assigned,
                resume=additional_info.get('resume')
            )
            self.process_to_celery(service_task, queue, computer)
            rank += 1
            main_cmp['ports'].add(master_port)

        if len(to_send) > 0:
            # The parent task itself is only marked Queued; the
            # service sub-tasks do the actual work.
            task.status = TaskStatus.Queued.value
            self.sent_tasks += len(to_send)

    def process_tasks(self):
        """
        Try to schedule every not-ran task whose dependencies allow it.

        Tasks with a stopped/failed/skipped dependency are skipped;
        tasks whose dependencies are not all finished are left for a
        later pass.
        """
        self.auxiliary['process_tasks'] = []

        blocking = (
            TaskStatus.Stopped.value,
            TaskStatus.Failed.value,
            TaskStatus.Skipped.value,
        )

        for task in self.not_ran_tasks:
            auxiliary = {'id': task.id, 'name': task.name}
            self.auxiliary['process_tasks'].append(auxiliary)

            if task.dag_rel is None:
                task.dag_rel = self.dag_provider.by_id(task.dag)

            dep_status = self.dep_status[task.id]

            if any(s in dep_status for s in blocking):
                auxiliary['not_valid'] = 'stopped or failed in dep_status'
                self.provider.change_status(task, TaskStatus.Skipped)
                continue

            if len(dep_status) != 0 \
                    and dep_status != {TaskStatus.Success.value}:
                auxiliary['not_valid'] = 'not all dep tasks are finished'
                continue

            self.process_task(task)

        self.auxiliary['process_tasks'] = self.auxiliary['process_tasks'][:5]

    def _stop_child_tasks(self, task: Task):
        """
        Stop every child of ``task`` through celery.
        """
        self.provider.commit()

        children = self.provider.children(task.id, [Task.dag_rel])
        for child in children:
            celery_tasks.stop(self.logger, self.session, child,
                              child.dag_rel)

    def process_parent_tasks(self):
        """
        Derive each parent task's status from its children's statuses.

        Priority: Failed > Skipped > Queued > InProgress > Success.
        On a change, started/finished timestamps are propagated and
        children of finished parents are stopped.
        """
        tasks = self.provider.parent_tasks_stats()

        was_change = False
        for task, started, finished, statuses in tasks:
            status = task.status
            if statuses[TaskStatus.Failed] > 0:
                status = TaskStatus.Failed.value
            elif statuses[TaskStatus.Skipped] > 0:
                status = TaskStatus.Skipped.value
            elif statuses[TaskStatus.Queued] > 0:
                status = TaskStatus.Queued.value
            elif statuses[TaskStatus.InProgress] > 0:
                status = TaskStatus.InProgress.value
            elif statuses[TaskStatus.Success] > 0:
                status = TaskStatus.Success.value

            if status != task.status:
                if status == TaskStatus.InProgress.value:
                    task.started = started
                # assumes statuses >= Failed.value are terminal —
                # TODO confirm against the TaskStatus enum ordering
                elif status >= TaskStatus.Failed.value:
                    task.started = started
                    task.finished = finished
                    self._stop_child_tasks(task)

                was_change = True
                task.status = status

        if was_change:
            self.provider.commit()

        # Short preview for the auxiliary report.
        self.auxiliary['parent_tasks_stats'] = [
            {
                'name': task.name,
                'id': task.id,
                'started': task.started,
                'finished': finished,
                'statuses': [
                    {
                        'name': k.name,
                        'count': v
                    } for k, v in statuses.items()
                ],
            } for task, started, finished, statuses in tasks[:5]
        ]

    def write_auxiliary(self):
        """
        Persist the collected auxiliary info, unless it is too large.
        """
        elapsed = (now() - self.auxiliary['time']).total_seconds()
        self.auxiliary['duration'] = elapsed

        record = Auxiliary(
            name='supervisor', data=yaml_dump(self.auxiliary)
        )
        # Guard against oversized rows.
        if len(record.data) > 16000:
            return

        self.auxiliary_provider.create_or_update(record, 'name')

    def stop_tasks(self, tasks: List[Task]):
        """
        Queue the given tasks for stopping; process_stop_tasks does the
        actual work on a later pass.
        """
        self.tasks_stop.extend(t.id for t in tasks)

    def process_stop_tasks(self):
        """
        Stop everything queued via stop_tasks.

        Not-yet-running tasks are simply marked Skipped; running tasks
        get their processes killed on their computers and are marked
        Stopped.
        """
        # Stop not running tasks
        if len(self.tasks_stop) == 0:
            return

        tasks = self.provider.by_ids(self.tasks_stop)
        tasks_not_ran = [t.id for t in tasks if
                         t.status in [TaskStatus.NotRan.value,
                                      TaskStatus.Queued.value]]
        tasks_started = [t for t in tasks if
                         t.status in [TaskStatus.InProgress.value]]
        tasks_started_ids = [t.id for t in tasks_started]

        self.provider.change_status_all(tasks=tasks_not_ran,
                                        status=TaskStatus.Skipped)

        # Collect (computer, pid) pairs for main and child processes.
        pids = []
        for task in tasks_started:
            if task.pid:
                pids.append((task.computer_assigned, task.pid))

            additional_info = yaml_load(task.additional_info)
            for p in additional_info.get('child_processes', []):
                pids.append((task.computer_assigned, p))

        # Ask each online computer to kill only its own pids.
        for computer, queue in self.docker_provider.queues_online():
            pids_computer = [p for c, p in pids if c == computer]
            if len(pids_computer) > 0:
                celery_tasks.kill_all.apply_async((pids_computer,),
                                                  queue=queue,
                                                  retry=False)

        self.provider.change_status_all(tasks=tasks_started_ids,
                                        status=TaskStatus.Stopped)

        self.tasks_stop = []

    def fast_check(self):
        """
        Return True when nothing relevant changed since the last full
        pass, so the expensive rebuild can be skipped.

        Checks, in order: providers built, no pending stop/start work,
        queue set unchanged, and the id sets of tasks in every tracked
        status unchanged.
        """
        if self.provider is None or self.computer_provider is None:
            return False

        if self.not_ran_tasks is None or self.queues is None:
            return False

        if len(self.tasks_stop) > 0:
            return False

        if len(self.dags_start) > 0:
            return False

        if len(self.auxiliary.get('to_send', [])) > 0:
            return False

        # Same 15-second online criterion as create_base. The original
        # wrapped this in a redundant extra set() conversion.
        queues = {
            f'{d.computer}_{d.name}' for d in self.docker_provider.all()
            if d.last_activity >= now() - datetime.timedelta(seconds=15)
        }
        if queues != set(self.queues):
            return False

        tasks = self.provider.by_status(TaskStatus.NotRan,
                                        TaskStatus.Queued,
                                        TaskStatus.InProgress)

        # NotRan is compared separately because freshly loaded tasks
        # exclude debug ones (matching load_tasks) while self.tasks
        # does not.
        fresh_not_ran = {t.id for t in tasks if
                         t.status == TaskStatus.NotRan.value and not t.debug}
        known_not_ran = {t.id for t in self.tasks if
                         t.status == TaskStatus.NotRan.value}
        if fresh_not_ran != known_not_ran:
            return False

        for status in (TaskStatus.InProgress, TaskStatus.Queued):
            fresh = {t.id for t in tasks if t.status == status.value}
            known = {t.id for t in self.tasks if t.status == status.value}
            if fresh != known:
                return False

        return True

    def start_dag(self, id: int):
        """Schedule DAG *id* to be (re)started on the next build pass."""
        self.dags_start += [id]

    def process_start_dags(self):
        """Restart every DAG queued via :meth:`start_dag`.

        For each restartable task (Failed/Skipped/Stopped, top-level only)
        the scheduling fields are reset so it will be picked up again as
        NotRan; Train tasks additionally get a ``resume`` record pointing at
        the master checkpoint so training continues instead of restarting.
        """
        if len(self.dags_start) == 0:
            return

        for id in self.dags_start:
            # statuses from which a task may be restarted
            can_start_statuses = [
                TaskStatus.Failed.value, TaskStatus.Skipped.value,
                TaskStatus.Stopped.value
            ]

            tasks = self.provider.by_dag(id)
            children_all = self.provider.children([t.id for t in tasks])

            def find_resume(task):
                # locate the distributed rank-0 child (the "master") whose
                # checkpoint should be resumed from; newest children first
                children = [c for c in children_all if c.parent == task.id]
                children = sorted(children, key=lambda x: x.id, reverse=True)

                if len(children) > 0:
                    for c in children:
                        # NOTE(review): redundant — children is already
                        # filtered by parent == task.id above
                        if c.parent != task.id:
                            continue

                        info = yaml_load(c.additional_info)
                        if 'distr_info' not in info:
                            continue

                        if info['distr_info']['rank'] == 0:
                            return {
                                'master_computer': c.computer_assigned,
                                'master_task_id': c.id,
                                'load_last': True
                            }
                    raise Exception('Master task not found')
                else:
                    # non-distributed task: it is its own master
                    return {
                        'master_computer': task.computer_assigned,
                        'master_task_id': task.id,
                        'load_last': True
                    }

            for t in tasks:
                if t.status not in can_start_statuses:
                    continue

                # child tasks are restarted via their parent
                if t.parent:
                    continue

                if t.type == TaskType.Train.value:
                    info = yaml_load(t.additional_info)
                    info['resume'] = find_resume(t)
                    t.additional_info = yaml_dump(info)

                # wipe all scheduling state so the task is re-assigned
                t.status = TaskStatus.NotRan.value
                t.pid = None
                t.started = None
                t.finished = None
                t.computer_assigned = None
                t.celery_id = None
                t.worker_index = None
                t.docker_assigned = None

        self.provider.commit()
        self.dags_start = []

    def build(self):
        """Run one full supervisor build iteration.

        The processing steps are order-dependent: stop requests are handled
        before restart requests, and tasks/computers must be loaded before
        scheduling. On a SQLAlchemy-level failure the session is recreated
        so the next iteration can proceed; other exceptions are only logged.
        """
        try:
            # fast-path skip is currently disabled
            # if self.fast_check():
            #     return

            self.auxiliary = {'time': now()}

            self.create_base()

            self.process_stop_tasks()

            self.process_start_dags()

            self.process_parent_tasks()

            self.load_tasks()

            self.load_computers()

            self.process_tasks()

            self.write_auxiliary()

        except ObjectDeletedError:
            # a row vanished mid-build; safe to retry on the next iteration
            pass
        except Exception as e:
            # on DB/session errors, rebuild session and logger before logging
            if Session.sqlalchemy_error(e):
                Session.cleanup(key='SupervisorBuilder')
                self.session = Session.create_session(key='SupervisorBuilder')
                self.logger = create_logger(self.session, 'SupervisorBuilder')

            self.logger.error(traceback.format_exc(), ComponentType.Supervisor)
Exemple #17
0
    def sync(self):
        """Pull task files from other online computers to this machine.

        For every (computer, project, tasks) group that has unsynced tasks
        for this host, rsync the project's sync folders from the source
        computer, then mark the tasks as synced. Skips sources that are
        offline, opted out, or themselves currently syncing.
        """
        hostname = socket.gethostname()
        try:
            provider = ComputerProvider(self.session)
            task_synced_provider = TaskSyncedProvider(self.session)

            computer = provider.by_name(hostname)
            sync_start = now()

            if FILE_SYNC_INTERVAL == 0:
                # syncing disabled; idle briefly to avoid a busy loop
                time.sleep(1)
            else:
                self.sync_manual(computer, provider)

                # NOTE(review): 'all_with_last_activtiy' is misspelled, but
                # that is the provider's method name — fix at the source
                computers = provider.all_with_last_activtiy()
                computers = [
                    c for c in computers
                    if (now() - c.last_activity).total_seconds() < 10
                ]
                computers_names = {c.name for c in computers}

                for c, project, tasks in task_synced_provider.for_computer(
                        computer.name):
                    if c.sync_with_this_computer:
                        if c.name not in computers_names:
                            self.logger.info(f'Computer = {c.name} '
                                             f'is offline. Can not sync',
                                             ComponentType.WorkerSupervisor,
                                             hostname)
                            continue

                        # don't pull from a computer that is mid-sync itself
                        if c.syncing_computer:
                            continue

                        sync_folders = yaml_load(project.sync_folders)
                        ignore_folders = yaml_load(project.ignore_folders)

                        sync_folders = correct_folders(sync_folders,
                                                       project.name)
                        ignore_folders = correct_folders(ignore_folders,
                                                         project.name)

                        if not isinstance(sync_folders, list):
                            sync_folders = []
                        if not isinstance(ignore_folders, list):
                            ignore_folders = []

                        # pair every sync folder with the full ignore list
                        folders = [[s, ignore_folders] for s in sync_folders]

                        computer.syncing_computer = c.name
                        provider.update()

                        sync_directed(self.session, c, computer, folders)

                    # tasks are marked synced even when the source was
                    # skipped above — presumably intentional; verify
                    for t in tasks:
                        task_synced_provider.add(
                            TaskSynced(computer=computer.name, task=t.id)
                        )

                    time.sleep(FILE_SYNC_INTERVAL)

            computer.last_synced = sync_start
            computer.syncing_computer = None
            provider.update()
        except Exception as e:
            self.process_error(e)
Exemple #18
0
class Catalyst(Executor, Callback):
    """Executor that runs a Catalyst experiment (RunnerState-era API) and
    mirrors its progress back into the task database.

    Acts both as the mlcomp ``Executor`` entry point (:meth:`work`) and as a
    Catalyst ``Callback`` — the callback is registered on the master process
    only, and reports per-epoch metric series, step progress and best score.
    """

    def __init__(self, args: Args, report: ReportLayoutInfo, distr_info: dict,
                 resume: dict, grid_config: dict, trace: str, params: dict):
        super().__init__(order=0)

        self.resume = resume              # checkpoint-resume descriptor (may be falsy)
        self.distr_info = distr_info      # distributed-run info ({} when single-node)
        self.args = args
        self.report = report
        self.experiment = None
        self.runner = None
        self.series_provider = ReportSeriesProvider(self.session)
        self.computer_provider = ComputerProvider(self.session)
        self.grid_config = grid_config    # grid-search cell merged into the config
        self.master = True                # False on distributed ranks > 0
        self.checkpoint_resume = False
        self.checkpoint_stage_epoch = 0   # epoch offset when resuming mid-stage
        self.trace = trace                # path to save a traced model, optional
        self.params = params

    def callbacks(self):
        """Return callbacks to register; only the master process reports."""
        result = OrderedDict()
        if self.master:
            result['catalyst'] = self

        return result

    def on_epoch_start(self, state: RunnerState):
        # when resuming, Catalyst restarts epoch numbering; bump it so the
        # reported epoch continues from the checkpoint
        if self.checkpoint_resume and state.stage_epoch == 0:
            state.epoch += 1

        # shift stage_epoch by the resume offset and persist it in the
        # checkpoint so a later resume can pick it up
        state.stage_epoch = state.stage_epoch + self.checkpoint_stage_epoch
        state.checkpoint_data = {'stage_epoch': state.stage_epoch}
        if self.master:
            if state.stage_epoch == 0:
                self.step.start(1, name=state.stage)

            self.step.start(2,
                            name=f'epoch {state.stage_epoch}',
                            index=state.stage_epoch)

    def on_epoch_end(self, state: RunnerState):
        """Record train/valid series for every reported metric and update
        the (parent) task's best score on the report's main metric."""
        self.step.end(2)

        for s in self.report.series:
            train = state.metrics.epoch_values['train'][s.key]
            val = state.metrics.epoch_values['valid'][s.key]

            # series are attached to the parent task for distributed runs
            task_id = self.task.parent or self.task.id
            train = ReportSeries(part='train',
                                 name=s.key,
                                 epoch=state.epoch,
                                 task=task_id,
                                 value=train,
                                 time=now(),
                                 stage=state.stage)

            val = ReportSeries(part='valid',
                               name=s.key,
                               epoch=state.epoch,
                               task=task_id,
                               value=val,
                               time=now(),
                               stage=state.stage)

            self.series_provider.add(train)
            self.series_provider.add(val)

            if s.key == self.report.metric.name:
                best = False
                task = self.task
                if task.parent:
                    task = self.task_provider.by_id(task.parent)

                # best = improved according to the metric's direction
                if self.report.metric.minimize:
                    if task.score is None or val.value < task.score:
                        best = True
                else:
                    if task.score is None or val.value > task.score:
                        best = True
                if best:
                    task.score = val.value
                    self.task_provider.update()

    def on_stage_start(self, state: RunnerState):
        # replace default loggers: verbose console + raise-on-exception
        state.loggers = {
            'console': VerboseLogger(),
            'raise': RaiseExceptionLogger()
        }

    def on_stage_end(self, state: RunnerState):
        # resume offsets apply to the first stage only
        self.checkpoint_resume = False
        self.checkpoint_stage_epoch = 0
        self.step.end(1)

    @classmethod
    def _from_config(cls, executor: dict, config: Config,
                     additional_info: dict):
        """Build a Catalyst executor from its DAG config entry.

        Coerces string arg values to bool/int, loads the report layout, and
        resolves the grid-search cell (if any) into a config override.
        """
        args = Args()
        for k, v in executor['args'].items():
            v = str(v)
            # args arrive stringified; restore bool/int types
            if v in ['False', 'True']:
                v = v == 'True'
            elif v.isnumeric():
                v = int(v)

            setattr(args, k, v)

        assert 'report_config' in additional_info, 'layout was not filled'
        report_config = additional_info['report_config']
        grid_cell = additional_info.get('grid_cell')
        report = ReportLayoutInfo(report_config)
        if len(args.configs) == 0:
            args.configs = [args.config]

        grid_config = {}
        if grid_cell is not None:
            grid_config = grid_cells(executor['grid'])[grid_cell][0]

        distr_info = additional_info.get('distr_info', {})
        resume = additional_info.get('resume')
        params = executor.get('params', {})

        return cls(args=args,
                   report=report,
                   grid_config=grid_config,
                   distr_info=distr_info,
                   resume=resume,
                   trace=executor.get('trace'),
                   params=params)

    def set_dist_env(self, config):
        """Export torch.distributed environment variables and mark non-zero
        ranks as non-master (they skip reporting)."""
        info = self.distr_info
        os.environ['MASTER_ADDR'] = info['master_addr']
        os.environ['MASTER_PORT'] = str(info['master_port'])
        os.environ['WORLD_SIZE'] = str(info['world_size'])

        os.environ['RANK'] = str(info['rank'])
        distributed_params = config.get('distributed_params', {})
        distributed_params['rank'] = 0
        config['distributed_params'] = distributed_params

        if info['rank'] > 0:
            self.master = False

    def parse_args_uargs(self):
        """Parse Catalyst args/config and layer grid + params overrides."""
        args, config = parse_args_uargs(self.args, [])
        config = merge_dicts_smart(config, self.grid_config)
        config = merge_dicts_smart(config, self.params)

        if self.distr_info:
            self.set_dist_env(config)
        return args, config

    def _checkpoint_fix_config(self, experiment):
        """Adjust the experiment's stage config to resume from a checkpoint.

        Fetches the checkpoint file from the master computer/task when it is
        not local, drops already-completed stages, and shortens the current
        stage by the number of epochs already done.
        """
        resume = self.resume
        if not resume:
            return

        checkpoint_dir = join(experiment.logdir, 'checkpoints')
        os.makedirs(checkpoint_dir, exist_ok=True)

        file = 'last_full.pth' if resume.get('load_last') else 'best_full.pth'

        path = join(checkpoint_dir, file)
        computer = socket.gethostname()
        if computer != resume['master_computer']:
            # checkpoint lives on another machine: copy it over
            master_computer = self.computer_provider.by_name(
                resume['master_computer'])
            path_from = join(master_computer.root_folder,
                             str(resume['master_task_id']), 'log',
                             'checkpoints', file)
            self.info(f'copying checkpoint from: computer = '
                      f'{resume["master_computer"]} path_from={path_from} '
                      f'path_to={path}')

            success = copy_remote(session=self.session,
                                  computer_from=resume['master_computer'],
                                  path_from=path_from,
                                  path_to=path)

            if not success:
                self.error(f'copying from '
                           f'{resume["master_computer"]}/'
                           f'{path_from} failed')
            else:
                self.info('checkpoint copied successfully')

        elif self.task.id != resume['master_task_id']:
            # same machine, different task: read the master task's folder
            path = join(TASK_FOLDER, str(resume['master_task_id']), 'log',
                        'checkpoints', file)
            self.info(f'master_task_id!=task.id, using checkpoint'
                      f' from task_id = {resume["master_task_id"]}')

        if not os.path.exists(path):
            self.info(f'no checkpoint at {path}')
            return

        ckpt = load_checkpoint(path)
        stages_config = experiment.stages_config
        # drop stages before the checkpointed one; trim the current stage
        for k, v in list(stages_config.items()):
            if k == ckpt['stage']:
                stage_epoch = ckpt['checkpoint_data']['stage_epoch'] + 1

                # if it is the last epoch in the stage
                if stage_epoch == v['state_params']['num_epochs'] \
                        or resume.get('load_best'):
                    del stages_config[k]
                    break

                self.checkpoint_stage_epoch = stage_epoch
                v['state_params']['num_epochs'] -= stage_epoch
                break
            del stages_config[k]

        # point CheckpointCallback of the (new) first stage at the file
        stage = experiment.stages_config[experiment.stages[0]]
        for k, v in stage['callbacks_params'].items():
            if v.get('callback') == 'CheckpointCallback':
                v['resume'] = path

        self.info(f'found checkpoint at {path}')

    def _checkpoint_fix_callback(self, callbacks: dict):
        """Detect resume mode and silence checkpoint saving on workers."""
        def mock(state):
            pass

        for k, c in callbacks.items():
            if not isinstance(c, CheckpointCallback):
                continue

            if c.resume:
                self.checkpoint_resume = True

            # only the master writes checkpoints
            if not self.master:
                c.on_epoch_end = mock
                c.on_stage_end = mock

    def work(self):
        """Run the experiment end to end; return final stage info."""
        args, config = self.parse_args_uargs()
        set_global_seed(args.seed)

        Experiment, R = import_experiment_and_runner(Path(args.expdir))

        runner_params = config.pop('runner_params', {})

        experiment = Experiment(config)
        runner: Runner = R(**runner_params)

        register()

        self.experiment = experiment
        self.runner = runner

        stages = experiment.stages[:]

        if self.master:
            # progress steps are tracked on the parent task
            task = self.task if not self.task.parent \
                else self.task_provider.by_id(self.task.parent)
            task.steps = len(stages)
            self.task_provider.commit()

        self._checkpoint_fix_config(experiment)

        # wrap get_callbacks so our callback + checkpoint fixes are injected
        _get_callbacks = experiment.get_callbacks

        def get_callbacks(stage):
            res = self.callbacks()
            for k, v in _get_callbacks(stage).items():
                res[k] = v

            self._checkpoint_fix_callback(res)
            return res

        experiment.get_callbacks = get_callbacks

        if experiment.logdir is not None:
            dump_environment(config, experiment.logdir, args.configs)

        if self.distr_info:
            # record resume info pointing at the rank-0 (master) task so
            # later stages load its best checkpoint
            info = yaml_load(self.task.additional_info)
            info['resume'] = {
                'master_computer': self.distr_info['master_computer'],
                'master_task_id': self.task.id - self.distr_info['rank'],
                'load_best': True
            }
            self.task.additional_info = yaml_dump(info)
            self.task_provider.commit()

            # distributed runs execute one stage per task invocation
            experiment.stages_config = {
                k: v
                for k, v in experiment.stages_config.items()
                if k == experiment.stages[0]
            }

        runner.run_experiment(experiment, check=args.check)

        if self.master and self.trace:
            traced = trace_model_from_checkpoint(self.experiment.logdir, self)
            torch.jit.save(traced, self.trace)

        return {'stage': experiment.stages[-1], 'stages': stages}
Exemple #19
0
class Catalyst(Executor, Callback):
    """Executor that runs a Catalyst experiment (State-era API) and mirrors
    its progress back into the task database.

    Newer variant of the Catalyst executor: adds per-batch progress
    reporting, memory-based batch-size selection, and proper
    ``torch.distributed`` initialisation with NCCL.
    """

    def __init__(self, args: Args, report: ReportLayoutInfo, distr_info: dict,
                 resume: dict, grid_config: dict, trace: str, params: dict,
                 **kwargs):
        super().__init__(**kwargs)

        self.series_provider = ReportSeriesProvider(self.session)
        self.computer_provider = ComputerProvider(self.session)
        self.memory_provider = MemoryProvider(self.session)

        self.order = 0
        self.resume = resume              # checkpoint-resume descriptor (may be falsy)
        self.distr_info = distr_info      # distributed-run info ({} when single-node)
        self.args = args
        self.report = report
        self.experiment = None
        self.runner = None
        self.grid_config = grid_config
        self.master = True                # False on distributed ranks > 0
        self.trace = trace                # path to save a traced model, optional
        self.params = params
        self.last_batch_logged = None     # throttles per-batch DB updates
        self.loader_started_time = None
        self.parent = None                # parent task row, set in work()
        self.node = CallbackNode.All

    def get_parent_task(self):
        """Return the parent task when present, else this task."""
        if self.parent:
            return self.parent
        return self.task

    def callbacks(self):
        """Return callbacks to register; only the master process reports."""
        result = OrderedDict()
        if self.master:
            result['catalyst'] = self

        return result

    def on_loader_start(self, state: State):
        self.loader_started_time = now()

    def on_epoch_start(self, state: State):
        stage_index = self.experiment.stages.index(state.stage_name)
        self.step.start(1, name=state.stage_name, index=stage_index)

        self.step.start(2, name=f'epoch {state.epoch}', index=state.epoch - 1)

    def on_batch_start(self, state: State):
        """Update batch progress / ETA / loss on the task row, throttled to
        at most once per 10 seconds (except on the loader's last batch)."""
        if self.last_batch_logged and state.loader_step != state.loader_len:
            if (now() - self.last_batch_logged).total_seconds() < 10:
                return

        task = self.get_parent_task()
        task.batch_index = state.loader_step
        task.batch_total = state.loader_len
        task.loader_name = state.loader_name

        # linear extrapolation of remaining epoch time
        # assumes loader_step >= 1 here (Catalyst steps start at 1) —
        # otherwise this would divide by zero; verify
        duration = int((now() - self.loader_started_time).total_seconds())
        task.epoch_duration = duration
        task.epoch_time_remaining = int(
            duration *
            (task.batch_total / task.batch_index)) - task.epoch_duration
        if state.epoch_metrics.get('train_loss') is not None:
            task.loss = float(state.epoch_metrics['train_loss'])
        if state.epoch_metrics.get('valid_loss') is not None:
            task.loss = float(state.epoch_metrics['valid_loss'])

        self.task_provider.update()
        self.last_batch_logged = now()

    def on_epoch_end(self, state: State):
        """Persist all epoch metrics as report series and update the
        (parent) task's best score on the report's main metric."""
        self.step.end(2)

        values = state.epoch_metrics

        for k, v in values.items():
            # metric keys are prefixed with the loader name, e.g.
            # 'train_loss' -> part='train', name='loss'
            part = ''
            name = k

            for loader in state.loaders:
                if k.startswith(loader):
                    part = loader
                    name = k.replace(loader, '')
                    if name.startswith('_'):
                        name = name[1:]

            # series are attached to the parent task for distributed runs
            task_id = self.task.parent or self.task.id
            series = ReportSeries(part=part,
                                  name=name,
                                  epoch=state.epoch - 1,
                                  task=task_id,
                                  value=v,
                                  time=now(),
                                  stage=state.stage_name)
            self.series_provider.add(series)

            if name == self.report.metric.name:
                best = False
                task = self.task
                if task.parent:
                    task = self.task_provider.by_id(task.parent)

                # best = improved according to the metric's direction
                if self.report.metric.minimize:
                    if task.score is None or v < task.score:
                        best = True
                else:
                    if task.score is None or v > task.score:
                        best = True
                if best:
                    task.score = v
                    self.task_provider.update()

    def on_stage_end(self, state: State):
        self.step.end(1)

    @classmethod
    def _from_config(cls, executor: dict, config: Config,
                     additional_info: dict):
        """Build a Catalyst executor from its DAG config entry.

        Coerces string arg values to bool/int, loads the report layout, and
        uses the remaining executor entries (minus 'args') as the grid
        config override.
        """
        args = Args()
        for k, v in executor['args'].items():
            v = str(v)
            # args arrive stringified; restore bool/int types
            if v in ['False', 'True']:
                v = v == 'True'
            elif v.isnumeric():
                v = int(v)

            setattr(args, k, v)

        assert 'report_config' in additional_info, 'layout was not filled'
        report_config = additional_info['report_config']
        report = ReportLayoutInfo(report_config)
        if len(args.configs) == 0:
            args.configs = [args.config]

        distr_info = additional_info.get('distr_info', {})
        resume = additional_info.get('resume')
        params = executor.get('params', {})
        params.update(additional_info.get('params', {}))

        grid_config = executor.copy()
        grid_config.pop('args', '')

        return cls(args=args,
                   report=report,
                   grid_config=grid_config,
                   distr_info=distr_info,
                   resume=resume,
                   trace=executor.get('trace'),
                   params=params)

    def set_dist_env(self, config):
        """Export torch.distributed env vars, init the NCCL process group,
        and set master/worker callback roles from the rank."""
        info = self.distr_info
        os.environ['MASTER_ADDR'] = info['master_addr']
        os.environ['MASTER_PORT'] = str(info['master_port'])
        os.environ['WORLD_SIZE'] = str(info['world_size'])

        os.environ['RANK'] = str(info['rank'])
        os.environ['LOCAL_RANK'] = "0"
        distributed_params = config.get('distributed_params', {})
        distributed_params['rank'] = info['rank']
        config['distributed_params'] = distributed_params

        torch.cuda.set_device(0)

        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")

        if info['rank'] > 0:
            self.master = False
            self.node = CallbackNode.Worker
        else:
            self.node = CallbackNode.Master

    def parse_args_uargs(self):
        """Parse Catalyst args/config and layer grid + params overrides."""
        args, config = parse_args_uargs(self.args, [])
        config = merge_dicts_smart(config, self.grid_config)
        config = merge_dicts_smart(config, self.params)

        if self.distr_info:
            self.set_dist_env(config)
        return args, config

    def _fix_memory(self, experiment):
        """Pick the largest known-safe batch size per stage from recorded
        memory measurements that fit on the current GPU."""
        if not torch.cuda.is_available():
            return
        # total GPU memory in GiB
        max_memory = torch.cuda.get_device_properties(0).total_memory / (2**30)
        stages_config = experiment.stages_config
        for k, v in list(stages_config.items()):
            # query memory records by model + stage data params
            query = {}
            # noinspection PyProtectedMember
            for kk, vv in experiment._config['model_params'].items():
                query[kk] = vv
            for kk, vv in v['data_params'].items():
                query[kk] = vv
            variants = self.memory_provider.find(query)
            variants = [v for v in variants if v.memory < max_memory]
            if len(variants) == 0:
                continue
            variant = max(variants, key=lambda x: x.memory)
            v['data_params']['batch_size'] = variant.batch_size

    def _checkpoint_fix_config(self, experiment):
        """Adjust the experiment's stage config to resume from a checkpoint.

        Fetches the checkpoint file from the master computer/task when it is
        not local, drops already-completed stages, and shortens the current
        stage by the number of epochs already done.
        """
        resume = self.resume
        if not resume:
            return
        if experiment.logdir is None:
            return

        checkpoint_dir = join(experiment.logdir, 'checkpoints')
        os.makedirs(checkpoint_dir, exist_ok=True)

        file = 'last_full.pth' if resume.get('load_last') else 'best_full.pth'

        path = join(checkpoint_dir, file)
        computer = socket.gethostname()
        if computer != resume['master_computer']:
            # checkpoint lives on another machine: copy it over
            master_computer = self.computer_provider.by_name(
                resume['master_computer'])
            path_from = join(master_computer.root_folder,
                             str(resume['master_task_id']), experiment.logdir,
                             'checkpoints', file)
            self.info(f'copying checkpoint from: computer = '
                      f'{resume["master_computer"]} path_from={path_from} '
                      f'path_to={path}')

            success = copy_remote(session=self.session,
                                  computer_from=resume['master_computer'],
                                  path_from=path_from,
                                  path_to=path)

            if not success:
                self.error(f'copying from '
                           f'{resume["master_computer"]}/'
                           f'{path_from} failed')
            else:
                self.info('checkpoint copied successfully')

        elif self.task.id != resume['master_task_id']:
            # same machine, different task: read the master task's folder
            path = join(TASK_FOLDER, str(resume['master_task_id']),
                        experiment.logdir, 'checkpoints', file)
            self.info(f'master_task_id!=task.id, using checkpoint'
                      f' from task_id = {resume["master_task_id"]}')

        if not os.path.exists(path):
            self.info(f'no checkpoint at {path}')
            return

        ckpt = load_checkpoint(path)
        stages_config = experiment.stages_config
        # drop stages before the checkpointed one; trim the current stage
        for k, v in list(stages_config.items()):
            if k == ckpt['stage']:
                stage_epoch = ckpt['checkpoint_data']['epoch'] + 1

                # if it is the last epoch in the stage
                if stage_epoch >= v['state_params']['num_epochs'] \
                        or resume.get('load_best'):
                    del stages_config[k]
                    break

                self.checkpoint_stage_epoch = stage_epoch
                v['state_params']['num_epochs'] -= stage_epoch
                break
            del stages_config[k]

        # point CheckpointCallback of the (new) first stage at the file
        stage = experiment.stages_config[experiment.stages[0]]
        for k, v in stage['callbacks_params'].items():
            if v.get('callback') == 'CheckpointCallback':
                v['resume'] = path

        self.info(f'found checkpoint at {path}')

    def _checkpoint_fix_callback(self, callbacks: dict):
        """Detect resume mode and silence checkpoint saving on workers."""
        def mock(state):
            pass

        for k, c in callbacks.items():
            if not isinstance(c, CheckpointCallback):
                continue

            if c.resume:
                self.checkpoint_resume = True

            # only the master writes checkpoints
            if not self.master:
                c.on_epoch_end = mock
                c.on_stage_end = mock
                c.on_batch_start = mock

    def work(self):
        """Run the experiment end to end; return final stage info."""
        args, config = self.parse_args_uargs()
        set_global_seed(args.seed)

        Experiment, R = import_experiment_and_runner(Path(args.expdir))

        runner_params = config.pop('runner_params', {})

        experiment = Experiment(config)
        runner: Runner = R(**runner_params)

        self.experiment = experiment
        self.runner = runner

        stages = experiment.stages[:]

        if self.task.parent:
            self.parent = self.task_provider.by_id(self.task.parent)

        if self.master:
            # progress steps are tracked on the parent task
            task = self.get_parent_task()
            task.steps = len(stages)
            self.task_provider.commit()

        self._checkpoint_fix_config(experiment)
        self._fix_memory(experiment)

        # wrap get_callbacks so our callback + checkpoint fixes are injected
        _get_callbacks = experiment.get_callbacks

        def get_callbacks(stage):
            res = self.callbacks()
            for k, v in _get_callbacks(stage).items():
                res[k] = v

            self._checkpoint_fix_callback(res)
            return res

        experiment.get_callbacks = get_callbacks

        if experiment.logdir is not None:
            dump_environment(config, experiment.logdir, args.configs)

        if self.distr_info:
            # record resume info pointing at the rank-0 (master) task so
            # later stages load its best checkpoint
            info = yaml_load(self.task.additional_info)
            info['resume'] = {
                'master_computer': self.distr_info['master_computer'],
                'master_task_id': self.task.id - self.distr_info['rank'],
                'load_best': True
            }
            self.task.additional_info = yaml_dump(info)
            self.task_provider.commit()

            # distributed runs execute one stage per task invocation
            experiment.stages_config = {
                k: v
                for k, v in experiment.stages_config.items()
                if k == experiment.stages[0]
            }

        runner.run_experiment(experiment)
        # new-API runner swallows exceptions into state; re-raise them
        if runner.state.exception:
            raise runner.state.exception

        if self.master and self.trace:
            traced = trace_model_from_checkpoint(self.experiment.logdir, self)
            torch.jit.save(traced, self.trace)
        return {'stage': experiment.stages[-1], 'stages': stages}
Exemple #20
0
def computer_sync_start():
    """Kick off a manual computer file-sync and return the provider result."""
    return ComputerProvider(_read_session).sync_start()