Example #1
    def release(self):
        """
        Releases the lock.
        """
        if self.is_locked:
            self.is_locked = False
            # Removing the lock file is what lets other processes acquire the lock.
            os.remove(self.lockfile)
            logger.debug(f'Released {self.lockfile}')
        else:
            raise RuntimeError('The file lock is not owned by this instance.')
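Example #5 below uses ``FileLock`` as a context manager, so the class presumably also defines ``__enter__``/``__exit__``. A minimal sketch, assuming they simply delegate to ``acquire`` and ``release`` (these two methods are not shown in the source):

    def __enter__(self):
        # Block until the lock is acquired, enabling `with FileLock(path):`.
        self.acquire(blocking=True)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always release, even if the body of the `with` block raised.
        if self.is_locked:
            self.release()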
Example #2
from contextlib import contextmanager


@contextmanager
def acquire_device(queue):
    """
    Context manager that acquires a device from the device queue and releases
    it when the worker is done working.

    Args:
        queue (Queue): a blocking queue with available devices.
    """
    idx = queue.get()
    logger.debug(f'Acquired device {idx}.')
    try:
        yield idx
    finally:
        # Return the device to the queue even if the worker raised,
        # so the device is never leaked.
        queue.put(idx)
        logger.debug(f'Released device {idx}.')
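A minimal usage sketch, assuming the queue is pre-filled with one entry per visible GPU (consistent with how ``generic_worker`` consumes it in Example #5); ``run_worker`` is a hypothetical stand-in for the worker body:

from multiprocessing import Manager

manager = Manager()
device_queue = manager.Queue()
for gpu_idx in range(4):  # hypothetical: indices of the available GPUs
    device_queue.put(gpu_idx)

with acquire_device(device_queue) as device_idx:
    run_worker(device_idx)  # hypothetical worker body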
Example #3
    def acquire(self, blocking=True):
        """
        Tries to acquire the lock.

        Args:
            blocking: if the lock is unavailable and ``blocking`` is False, returns
                False immediately; otherwise blocks until the lock becomes available.
        """
        logger.debug(f'Attempting to acquire {self.lockfile}')
        while True:
            try:
                # O_CREAT | O_EXCL makes creation atomic: os.open raises
                # FileExistsError if another process already created the lock file.
                fd = os.open(self.lockfile, os.O_CREAT | os.O_EXCL | os.O_RDWR)
                with os.fdopen(fd, 'w') as f:
                    f.write('\n')
                break
            except FileExistsError:
                if not blocking:
                    return False
                time.sleep(self.delay)
        self.is_locked = True
        logger.debug(f'Acquired {self.lockfile}')
        return True
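A short self-contained sketch of the non-blocking path; the ``FileLock('/tmp/demo.lock')`` constructor signature is an assumption, since ``__init__`` is not among these examples:

lock_a = FileLock('/tmp/demo.lock')  # hypothetical constructor signature
lock_b = FileLock('/tmp/demo.lock')

assert lock_a.acquire(blocking=False) is True   # creates the lock file atomically
assert lock_b.acquire(blocking=False) is False  # file already exists, returns at once
lock_a.release()                                # removes the lock file
assert lock_b.acquire(blocking=False) is True   # now succeeds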
Example #4
def baseline_worker(device_idx, config, reward_metric, name):
    """
    Concurrent baseline evaluator.

    Args:
        device_idx (int): index of the dedicated CUDA device.
        config (dict, str): configuration dictionary or path to YAML file.
        reward_metric (str): key for the returned evaluation stats dictionary.
        name (str): baseline name.

    Returns:
        float: achieved ``reward_metric`` value
    """
    if isinstance(config, str):
        with open(config) as f:
            # safe_load avoids executing arbitrary tags; bare yaml.load
            # without a Loader is deprecated.
            config = yaml.safe_load(f)

    log_dir = config['log_dir']
    data_dir = config['data_dir']

    baseline_dir = join(log_dir, 'baselines', name)

    if exists(baseline_dir):
        rmtree(baseline_dir)
    make_dirs(baseline_dir)

    logger = get_logger(f'baseline_{name}', join(baseline_dir, 'training.log'))
    summary_writer = SummaryWriter(baseline_dir)

    logger.debug(f'Worker {device_idx}: initialization done.')

    config = config.get('child_training', config)

    batch_size = config.pop('batch_size')
    keep_data_on_device = config.pop('keep_data_on_device')
    adaptive_batch_size = config.pop('adaptive_batch_size')

    if adaptive_batch_size:
        min_batch_size = config.pop('min_batch_size')
        max_batch_size = config.pop('max_batch_size')
        batch_size_decay = config.pop('batch_size_decay')

        assert isinstance(min_batch_size, int)
        assert isinstance(max_batch_size, int)
        assert 0 < batch_size_decay < 1

        # Linear scaling rule: keep lr / batch_size constant while starting
        # from the largest batch size.
        config['initial_lr'] *= max_batch_size / batch_size
        batch_size = max_batch_size
    else:
        min_batch_size = batch_size

    datasets = torch.load(join(data_dir, 'preprocessed.pth'))

    with torch.cuda.device(device_idx):
        model_path = join(log_dir, 'model.pth')

        model = torch.load(model_path)
        logger.debug(f'Worker {device_idx}: model loaded.')

        model = model.cuda()
        logger.debug(f'Worker {device_idx}: placed on device.')

        if keep_data_on_device:
            for key in datasets.keys():
                datasets[key].tensors = tuple(
                    map(lambda t: t.cuda(device_idx), datasets[key].tensors))
            gc.collect()

        while min_batch_size <= batch_size:
            try:
                loaders = Bunch()
                for k in datasets.keys():
                    loaders[k] = DataLoader(datasets[k],
                                            batch_size,
                                            shuffle=True,
                                            pin_memory=not keep_data_on_device)

                space_coach = FeedForwardCoach(
                    model,
                    loaders,
                    name=name,
                    logger=logger,
                    log_dir=log_dir,
                    tensorboard=summary_writer,
                    tqdm=get_tqdm(position=device_idx),
                    **config)
                logger.debug(f'Worker {device_idx}: beginning training.')
                space_coach.train_until_convergence(description={})
                logger.debug(f'Worker {device_idx}: beginning evaluation')
                stats = space_coach.evaluate({}, loaders.validation)
                mean_reward = np.mean(stats[reward_metric])

                return mean_reward

            except RuntimeError as e:
                # PyTorch reports CUDA OOM as a RuntimeError whose message
                # contains 'out of memory'.
                if 'out of memory' in str(e):
                    if adaptive_batch_size:
                        batch_size = int(batch_size * batch_size_decay)
                        config['initial_lr'] *= batch_size_decay
                        logger.info(
                            f'Out of memory, decreasing batch size to {batch_size}.'
                        )
                    else:
                        logger.info('Out of memory on fixed batch size.')
                        return None
                else:
                    logger.error(e)
                    raise

            except LossIsNoneError:
                logger.info('Loss is NaN.')
                return np.nan

        else:
            # The while-else branch runs only when the loop condition fails,
            # i.e. the batch size has decayed below min_batch_size.
            logger.info('Out of memory with minimum batch size.')
            return None
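The OOM-retry logic above can be distilled into a small self-contained sketch; the function and parameter names are illustrative, not from the source:

def fit_with_adaptive_batch_size(train_fn, batch_size, min_batch_size, decay=0.7):
    """Retry train_fn on CUDA OOM, geometrically decaying the batch size."""
    while batch_size >= min_batch_size:
        try:
            return train_fn(batch_size)
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise
            # Shrink the batch and retry; the caller may also want to rescale
            # the learning rate by the same factor, as the workers above do.
            batch_size = int(batch_size * decay)
    return None  # even the minimum batch size did not fit in memory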
Example #5
def generic_worker(description, device_queue, current_complexity, config,
                   space_type, reward_metric):
    """
    Concurrent description evaluator.

    Args:
        description (dict): description to be evaluated
        device_queue (Queue): a blocking queue with available devices.
        current_complexity (int): current curriculum complexity level.
        config (dict, str): configuration dictionary or path to YAML file.
        space_type (str): the name of root search space.
        reward_metric (str): key for the returned evaluation stats dictionary.

    Returns:
        Tuple of (description, mean ``reward_metric`` value).

        If the loss is NaN, the mean ``reward_metric`` value is 0.

        If OOM is raised with ``adaptive_batch_size`` disabled, or if even
        ``min_batch_size`` causes OOM, the mean ``reward_metric`` value is ``None``.
    """
    with acquire_device(device_queue) as device_idx:
        description = dict(description)

        if isinstance(config, str):
            with open(config) as f:
                config = yaml.safe_load(f)

        log_dir = config['log_dir']
        data_dir = config['data_dir']
        storage_path = config.get('storage',
                                  join(log_dir, 'description_reward.json'))

        logger, summary_writer, description_dir = worker_init(
            description, log_dir)
        logger.debug(f'Worker {device_idx}: initialization done.')

        def cleanup():
            # Remove the TensorBoard event files written for this description.
            for f in os.listdir(description_dir):
                if f.startswith('events.out.tfevents'):
                    os.remove(join(description_dir, f))

        config = config.get('child_training', config)

        batch_size = config.pop('batch_size')
        keep_data_on_device = config.pop('keep_data_on_device')
        adaptive_batch_size = config.pop('adaptive_batch_size')

        if adaptive_batch_size:
            min_batch_size = config.pop('min_batch_size')
            max_batch_size = config.pop('max_batch_size')
            batch_size_decay = config.pop('batch_size_decay')

            assert isinstance(min_batch_size, int)
            assert isinstance(max_batch_size, int)
            assert 0 < batch_size_decay < 1

            config['initial_lr'] *= max_batch_size / batch_size
            batch_size = max_batch_size
        else:
            min_batch_size = batch_size

        datasets = torch.load(join(data_dir, 'preprocessed.pth'))
        with torch.cuda.device(device_idx):
            save_path = join(log_dir, space_type, 'merged_space.pth')
            model_path = join(log_dir, 'model.pth')

            model = torch.load(model_path)
            logger.debug(f'Worker {device_idx}: model loaded.')
            with FileLock(save_path):
                model.space = torch.load(save_path)
                logger.debug(f'Worker {device_idx}: search space loaded.')
                model = model.cuda()
                logger.debug(f'Worker {device_idx}: placed on device.')
            model.space.logger = logger

            desc = model.space.preprocess(description,
                                          (-1, model.space_input_size))

            model.space.draw(desc, join(description_dir, 'graph.png'))
            img = np.array(Image.open(join(description_dir, 'graph.png')))
            summary_writer.add_image('graph', img)
            logger.debug(f'Worker {device_idx}: graph visualization drawn.')

            if keep_data_on_device:
                for key in datasets.keys():
                    datasets[key].tensors = tuple(
                        map(lambda t: t.cuda(device_idx),
                            datasets[key].tensors))
                gc.collect()
                logger.debug(
                    f'Worker {device_idx}: transferred data to device.')

            while min_batch_size <= batch_size:
                try:
                    loaders = Bunch()
                    for k in datasets.keys():
                        loaders[k] = DataLoader(
                            datasets[k],
                            batch_size,
                            shuffle=True,
                            pin_memory=not keep_data_on_device)

                    space_coach = FeedForwardCoach(
                        model,
                        loaders,
                        logger=logger,
                        log_dir=log_dir,
                        tensorboard=summary_writer,
                        tqdm=get_tqdm(position=device_idx),
                        **config)
                    logger.debug(f'Worker {device_idx}: beginning training.')
                    space_coach.train_until_convergence(description=desc)
                    logger.debug(f'Worker {device_idx}: beginning evaluation')
                    stats = space_coach.evaluate(desc, loaders.validation)
                    mean_reward = np.mean(stats[reward_metric])

                    with FileLock(save_path):
                        if exists(save_path):
                            other = torch.load(save_path)
                            model.space.merge(other.to(model.space.device))
                        model.space.cpu().save(log_dir, 'merged_space')

                    # Serialize the read-modify-write of the shared results
                    # file across workers with the file lock.
                    with FileLock(storage_path):
                        with open(storage_path, 'r') as f:
                            existing = json.load(f)

                        if current_complexity is not None:
                            assert isinstance(existing, dict)
                            if str(current_complexity) not in existing:
                                existing[str(current_complexity)] = []

                            existing[str(current_complexity)].append(
                                [description, mean_reward])
                        else:
                            assert isinstance(existing, list)
                            existing.append([description, mean_reward])

                        with open(storage_path, 'w+') as f:
                            json.dump(existing, f)

                    cleanup()
                    return description, mean_reward

                except RuntimeError as e:
                    if 'out of memory' in str(e):
                        if adaptive_batch_size:
                            batch_size = int(batch_size * batch_size_decay)
                            config['initial_lr'] *= batch_size_decay
                            logger.info(
                                f'Out of memory, decreasing batch size to {batch_size}.'
                            )
                        else:
                            logger.info(
                                'Out of memory on fixed batch size. Terminating.'
                            )
                            cleanup()
                            return description, None
                    else:
                        logger.error(e)
                        raise

                except LossIsNoneError:
                    logger.info('Loss is NaN. Terminating.')
                    cleanup()
                    return description, 0.

            else:
                logger.info(
                    'Out of memory with minimum batch size. Terminating.')
                cleanup()
                return description, None
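The lock-guarded update of the shared results file is the coordination point between workers. A minimal standalone sketch of the same pattern, assuming the ``FileLock`` from Examples #1 and #3 (the helper name ``append_result`` is hypothetical):

import json

def append_result(storage_path, description, mean_reward, complexity=None):
    # The file lock serializes concurrent read-modify-write cycles.
    with FileLock(storage_path):
        with open(storage_path) as f:
            existing = json.load(f)
        if complexity is not None:
            # Curriculum mode: results are bucketed by complexity level.
            existing.setdefault(str(complexity), []).append([description, mean_reward])
        else:
            existing.append([description, mean_reward])
        with open(storage_path, 'w') as f:
            json.dump(existing, f)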
Example #6
def train(config, worker, resume, num_gpus, gpu_idx):
    """
    Curriculum architect training procedure.
    Samples descriptions with complexity :math:`i`, evaluates them, trains the
    architect, and starts over with complexity :math:`i+1`.
    When ``max_complexity`` is reached, disables the curriculum, flattens the
    :class:`CurriculumStorage`, and trains plainly from then on.

    Args:
        config (dict, str): configuration dictionary or path to YAML file.
        worker (callable): worker callable.
        resume (bool): whether to try resuming the previous session.
        num_gpus (int): number of GPUs to use.
        gpu_idx (str, optional): string of comma-separated GPU indices. If ``None``,
            ``range(num_gpus)`` is used.

    Returns:
        On keyboard interrupt, returns the storage filled with everything that
        has been found and evaluated.
    """
    config, space, architect, archicoach, gpu_indices, log_dir =\
        train_init(config, resume, gpu_idx, num_gpus)

    input_shape = config['child_training']['input_shape']
    training_config = config['architect_training']

    load_architect = training_config['load_architect'] and resume
    epochs_per_loop = training_config['epochs_per_loop']
    architect_lr_decay = training_config['lr_decay']
    curriculum = training_config['curriculum']
    assert 0 < architect_lr_decay < 1

    if curriculum:
        storage_surplus_factor = training_config.get('storage_surplus_factor',
                                                     1)
        assert storage_surplus_factor >= 1

    storage = archicoach.storage
    summary_writer = archicoach.summary_writer

    loops = 0
    points_per_epoch = archicoach.batch_size * archicoach.epoch_steps

    while True:
        try:
            if curriculum:
                curriculum_complexity = loops + 1
                desired_storage_len = len(storage) + points_per_epoch

                if curriculum_complexity <= storage.max_complexity:
                    # Still climbing the curriculum: constrain the search space
                    # and the storage to the current complexity level.
                    archicoach.stats.curriculum_complexity = curriculum_complexity
                    architect.search_space.set_curriculum_complexity(
                        curriculum_complexity)
                    storage.set_complexity(curriculum_complexity)
                    desired_storage_len = points_per_epoch * storage_surplus_factor

                elif curriculum_complexity == storage.max_complexity + 1:
                    # Maximum complexity reached: disable the curriculum and
                    # train on the flattened storage from now on.
                    curriculum_complexity = 0
                    archicoach.curriculum = False

                    architect.search_space.release_all_constraints()
                    archicoach.storage = archicoach.storage.flatten()

                else:
                    curriculum_complexity = 0
            else:
                curriculum_complexity = None
                desired_storage_len = (loops + 1) * points_per_epoch

            evaluation_list, deterministic_is_viable = sample_loop(
                architect, storage, space, input_shape, desired_storage_len)

            worker_fn = partial(worker,
                                current_complexity=curriculum_complexity)
            result = evaluate(evaluation_list, worker_fn, gpu_indices)
            logger.debug(
                f'Architect training: Evaluated {len(result)} descriptions.')

            accuracies = []
            for description, reward in result:
                if reward == 'exists':
                    # The description was evaluated before: recover its stored
                    # reward instead of re-evaluating it.
                    r = np.nan
                    if curriculum:
                        for level, index in storage.find(description).items():
                            r = storage.storages[level].rewards[index]
                            if not np.isnan(r.mean().item()): break
                    else:
                        index = storage.find(description)
                        r = storage.rewards[index]

                    if not np.isnan(r.mean().item()):
                        reward = r.mean().item()
                        accuracies.append(reward)
                        storage.reward(description, float(reward))

                elif reward is not None:
                    accuracies.append(reward)
                    storage.reward(description, float(reward))

            logger.debug(
                f'Architect training: Updated storage with {len(accuracies)} items.'
            )

            # The last entry of `result` corresponds to the deterministically
            # sampled description.
            if len(result) > 0:
                if result[-1][1] is None:
                    deterministic_is_viable = False

                if deterministic_is_viable:
                    summary_writer.add_scalar('stochastic_acc',
                                              np.mean(accuracies[:-1]), loops)
                    summary_writer.add_scalar('deterministic_acc',
                                              accuracies[-1], loops)
                else:
                    summary_writer.add_scalar('stochastic_acc',
                                              np.mean(accuracies), loops)
                    logger.debug(
                        'Architect training: deterministic description is not viable.'
                    )

            storage.filter_na()
            if len(storage) < desired_storage_len and curriculum_complexity != 0:
                logger.debug(
                    'Architect training: not enough samples evaluated, rerunning the loop.'
                )
                continue

            try:
                if len(result) > 0 or not load_architect:
                    logger.debug(
                        'Architect training: beginning the training.')
                    archicoach.train(epochs_per_loop)
                    architect.save(log_dir, 'checkpoint')
                archicoach.decay_lr(architect_lr_decay)
                loops += 1
            except ValueError as e:
                if 'Storage does not contain enough' in str(e):
                    # Not enough samples in storage yet: rerun the outer loop.
                    continue
                raise

        except KeyboardInterrupt:
            return storage
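``evaluate`` itself is not among these examples. A minimal sketch of how it could dispatch ``worker_fn`` across GPUs, reusing the ``acquire_device`` queue pattern from Example #2 and assuming the remaining worker arguments (``config``, ``space_type``, ``reward_metric``) were bound upstream with ``functools.partial`` (purely an assumption):

from multiprocessing import Manager
from multiprocessing.pool import ThreadPool

def evaluate(evaluation_list, worker_fn, gpu_indices):
    # One queue entry per available GPU; workers borrow and return devices
    # through acquire_device (Example #2).
    manager = Manager()
    device_queue = manager.Queue()
    for idx in gpu_indices:
        device_queue.put(idx)

    # Pool sized to the number of GPUs, so at most one description trains
    # per device at any time.
    with ThreadPool(len(gpu_indices)) as pool:
        return pool.map(
            lambda description: worker_fn(description, device_queue=device_queue),
            evaluation_list)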