def release(self):
    """ Releases the lock. """
    if self.is_locked:
        self.is_locked = False
        os.remove(self.lockfile)
        logger.debug(f'Released {self.lockfile}')
    else:
        raise RuntimeError('The file lock is not owned by this instance.')
# ``acquire_device`` is used as ``with acquire_device(queue) as idx`` below,
# so it needs the ``contextlib.contextmanager`` decorator.
from contextlib import contextmanager


@contextmanager
def acquire_device(queue):
    """ Context manager which acquires a device from the device queue and
    releases it when the worker is done working.

    Args:
        queue (Queue): a blocking queue with available devices.
    """
    idx = queue.get()
    logger.debug(f'Acquired device {idx}.')
    try:
        yield idx
    finally:
        # Return the device even if the worker raised, so it is not lost.
        queue.put(idx)
        logger.debug(f'Released device {idx}.')
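# --- Illustrative sketch, not part of the original module ------------------
# One way to build the blocking queue that ``acquire_device`` expects is a
# ``multiprocessing.Manager`` queue pre-filled with one entry per CUDA device
# index. The helper name and setup below are assumptions for illustration
# only; the project may construct its queue differently.
from multiprocessing import Manager


def make_device_queue(gpu_indices):
    """ Creates a shared queue holding the given CUDA device indices. """
    queue = Manager().Queue()
    for idx in gpu_indices:
        queue.put(idx)
    return queue
# Example: ``device_queue = make_device_queue([0, 1, 2, 3])``; each worker
# then blocks inside ``acquire_device`` until a device frees up.
# ---------------------------------------------------------------------------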
def acquire(self, blocking=True):
    """ Tries to acquire the lock.

    Args:
        blocking (bool): if False and the lock is unavailable, return False
            immediately; otherwise block until the lock becomes available.
    """
    logger.debug(f'Attempting to acquire {self.lockfile}')
    while True:
        try:
            # O_CREAT | O_EXCL makes the call fail if the file already
            # exists, so acquisition is atomic across processes.
            fd = os.open(self.lockfile, os.O_CREAT | os.O_EXCL | os.O_RDWR)
            with os.fdopen(fd, 'w') as f:
                f.write('\n')
            break
        except FileExistsError:
            if not blocking:
                return False
            time.sleep(self.delay)
    self.is_locked = True
    logger.debug(f'Acquired {self.lockfile}')
    return True
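# --- Illustrative sketch, not part of the original class -------------------
# ``generic_worker`` below uses the lock as a context manager
# (``with FileLock(save_path):``), which implies dunder methods roughly like
# the following; the actual implementation may differ.
def __enter__(self):
    # Block until the lock file can be created exclusively.
    self.acquire(blocking=True)
    return self


def __exit__(self, exc_type, exc_value, traceback):
    # Always release, even if the guarded block raised.
    if self.is_locked:
        self.release()
# ---------------------------------------------------------------------------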
def baseline_worker(device_idx, config, reward_metric, name):
    """ Concurrent baseline evaluator.

    Args:
        device_idx (int): a dedicated CUDA device index.
        config (dict, str): configuration dictionary or path to a YAML file.
        reward_metric (str): key into the returned evaluation stats dictionary.
        name (str): baseline name.

    Returns:
        float: achieved ``reward_metric`` value.
    """
    if isinstance(config, str):
        with open(config) as f:
            config = yaml.safe_load(f)
    log_dir = config['log_dir']
    data_dir = config['data_dir']
    baseline_dir = join(log_dir, 'baselines', name)
    if exists(baseline_dir):
        rmtree(baseline_dir)
    make_dirs(baseline_dir)
    logger = get_logger(f'baseline_{name}', join(baseline_dir, 'training.log'))
    summary_writer = SummaryWriter(baseline_dir)
    logger.debug(f'Worker {device_idx}: initialization done.')

    config = config.get('child_training', config)
    batch_size = config.pop('batch_size')
    keep_data_on_device = config.pop('keep_data_on_device')
    adaptive_batch_size = config.pop('adaptive_batch_size')
    if adaptive_batch_size:
        min_batch_size = config.pop('min_batch_size')
        max_batch_size = config.pop('max_batch_size')
        batch_size_decay = config.pop('batch_size_decay')
        assert isinstance(min_batch_size, int)
        assert isinstance(max_batch_size, int)
        assert 0 < batch_size_decay < 1
        # Scale the learning rate linearly with the enlarged batch size.
        config['initial_lr'] *= max_batch_size / batch_size
        batch_size = max_batch_size
    else:
        min_batch_size = batch_size

    datasets = torch.load(join(data_dir, 'preprocessed.pth'))
    with torch.cuda.device(device_idx):
        model_path = join(log_dir, 'model.pth')
        model = torch.load(model_path)
        logger.debug(f'Worker {device_idx}: model loaded.')
        model = model.cuda()
        logger.debug(f'Worker {device_idx}: placed on device.')

        if keep_data_on_device:
            for key in datasets.keys():
                datasets[key].tensors = tuple(
                    map(lambda t: t.cuda(device_idx), datasets[key].tensors))
            gc.collect()

        while min_batch_size <= batch_size:
            try:
                loaders = Bunch()
                for k in datasets.keys():
                    loaders[k] = DataLoader(
                        datasets[k], batch_size, shuffle=True,
                        pin_memory=not keep_data_on_device)
                space_coach = FeedForwardCoach(
                    model, loaders, name=name, logger=logger, log_dir=log_dir,
                    tensorboard=summary_writer,
                    tqdm=get_tqdm(position=device_idx), **config)
                logger.debug(f'Worker {device_idx}: beginning training.')
                space_coach.train_until_convergence(description={})
                logger.debug(f'Worker {device_idx}: beginning evaluation.')
                stats = space_coach.evaluate({}, loaders.validation)
                mean_reward = np.mean(stats[reward_metric])
                return mean_reward
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    if adaptive_batch_size:
                        batch_size = int(batch_size * batch_size_decay)
                        config['initial_lr'] *= batch_size_decay
                        logger.info(
                            f'Out of memory, decreasing batch size to '
                            f'{batch_size}.')
                    else:
                        logger.info('Out of memory on fixed batch size.')
                        return None
                else:
                    logger.error(e)
                    raise e
            except LossIsNoneError:
                logger.info('Loss is NaN.')
                return np.nan
        else:
            # Runs when the batch size was decayed below ``min_batch_size``.
            logger.info('Out of memory with minimum batch size.')
            return None
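# --- Illustrative sketch, not part of the original module ------------------
# ``baseline_worker`` receives its CUDA device index explicitly, so a set of
# baselines can be spread over GPUs with a plain process pool. The helper and
# the example names below are assumptions for illustration; the project may
# launch its baselines differently. With CUDA in subprocesses, a 'spawn'
# start method is usually required.
from multiprocessing import Pool


def run_baselines(baseline_names, config, reward_metric, gpu_indices):
    """ Evaluates each named baseline on its own GPU and returns a dict of
    mean rewards keyed by baseline name. """
    jobs = [(gpu_indices[i % len(gpu_indices)], config, reward_metric, name)
            for i, name in enumerate(baseline_names)]
    with Pool(processes=len(gpu_indices)) as pool:
        rewards = pool.starmap(baseline_worker, jobs)
    return dict(zip(baseline_names, rewards))
# Example (hypothetical names):
# ``run_baselines(['lstm', 'mlp'], 'config.yaml', 'accuracy', [0, 1])``.
# ---------------------------------------------------------------------------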
def generic_worker(description, device_queue, current_complexity, config,
                   space_type, reward_metric):
    """ Concurrent description evaluator.

    Args:
        description (dict): description to be evaluated.
        device_queue (Queue): a blocking queue with available devices.
        current_complexity (int): current curriculum complexity level.
        config (dict, str): configuration dictionary or path to a YAML file.
        space_type (str): the name of the root search space.
        reward_metric (str): key into the returned evaluation stats dictionary.

    Returns:
        Tuple of (description, mean ``reward_metric`` value). If the loss is
        NaN, the mean ``reward_metric`` value is 0. If OOM was raised and
        ``adaptive_batch_size`` is disabled, or even ``min_batch_size`` causes
        OOM, the mean ``reward_metric`` value is None.
    """
    with acquire_device(device_queue) as device_idx:
        description = dict(description)
        if isinstance(config, str):
            with open(config) as f:
                config = yaml.safe_load(f)
        log_dir = config['log_dir']
        data_dir = config['data_dir']
        storage_path = config.get('storage',
                                  join(log_dir, 'description_reward.json'))
        logger, summary_writer, description_dir = worker_init(
            description, log_dir)
        logger.debug(f'Worker {device_idx}: initialization done.')

        def cleanup():
            # Remove TensorBoard event files produced for this description.
            for f in os.listdir(description_dir):
                if f.startswith('events.out.tfevents'):
                    os.remove(join(description_dir, f))

        config = config.get('child_training', config)
        batch_size = config.pop('batch_size')
        keep_data_on_device = config.pop('keep_data_on_device')
        adaptive_batch_size = config.pop('adaptive_batch_size')
        if adaptive_batch_size:
            min_batch_size = config.pop('min_batch_size')
            max_batch_size = config.pop('max_batch_size')
            batch_size_decay = config.pop('batch_size_decay')
            assert isinstance(min_batch_size, int)
            assert isinstance(max_batch_size, int)
            assert 0 < batch_size_decay < 1
            # Scale the learning rate linearly with the enlarged batch size.
            config['initial_lr'] *= max_batch_size / batch_size
            batch_size = max_batch_size
        else:
            min_batch_size = batch_size

        datasets = torch.load(join(data_dir, 'preprocessed.pth'))
        with torch.cuda.device(device_idx):
            save_path = join(log_dir, space_type, 'merged_space.pth')
            model_path = join(log_dir, 'model.pth')
            model = torch.load(model_path)
            logger.debug(f'Worker {device_idx}: model loaded.')
            with FileLock(save_path):
                model.space = torch.load(save_path)
            logger.debug(f'Worker {device_idx}: search space loaded.')
            model = model.cuda()
            logger.debug(f'Worker {device_idx}: placed on device.')
            model.space.logger = logger

            desc = model.space.preprocess(description,
                                          (-1, model.space_input_size))
            model.space.draw(desc, join(description_dir, 'graph.png'))
            img = np.array(Image.open(join(description_dir, 'graph.png')))
            summary_writer.add_image('graph', img)
            logger.debug(f'Worker {device_idx}: graph visualization drawn.')

            if keep_data_on_device:
                for key in datasets.keys():
                    datasets[key].tensors = tuple(
                        map(lambda t: t.cuda(device_idx),
                            datasets[key].tensors))
                gc.collect()
                logger.debug(
                    f'Worker {device_idx}: transferred data to device.')

            while min_batch_size <= batch_size:
                try:
                    loaders = Bunch()
                    for k in datasets.keys():
                        loaders[k] = DataLoader(
                            datasets[k], batch_size, shuffle=True,
                            pin_memory=not keep_data_on_device)
                    space_coach = FeedForwardCoach(
                        model, loaders, logger=logger, log_dir=log_dir,
                        tensorboard=summary_writer,
                        tqdm=get_tqdm(position=device_idx), **config)
                    logger.debug(f'Worker {device_idx}: beginning training.')
                    space_coach.train_until_convergence(description=desc)
                    logger.debug(f'Worker {device_idx}: beginning evaluation.')
                    stats = space_coach.evaluate(desc, loaders.validation)
                    mean_reward = np.mean(stats[reward_metric])

                    # Merge the locally updated search space back into the
                    # shared one on disk.
                    with FileLock(save_path):
                        if exists(save_path):
                            other = torch.load(save_path)
                            model.space.merge(other.to(model.space.device))
                        model.space.cpu().save(log_dir, 'merged_space')

                    # Append the (description, reward) pair to the shared
                    # storage file.
                    with FileLock(storage_path):
                        with open(storage_path, 'r') as f:
                            existing = json.load(f)
                        if current_complexity is not None:
                            assert isinstance(existing, dict)
                            if str(current_complexity) not in existing:
                                existing[str(current_complexity)] = []
                            existing[str(current_complexity)].append(
                                [description, mean_reward])
                        else:
                            assert isinstance(existing, list)
                            existing.append([description, mean_reward])
                        with open(storage_path, 'w+') as f:
                            json.dump(existing, f)
                    cleanup()
                    return description, mean_reward
                except RuntimeError as e:
                    if 'out of memory' in str(e):
                        if adaptive_batch_size:
                            batch_size = int(batch_size * batch_size_decay)
                            config['initial_lr'] *= batch_size_decay
                            logger.info(
                                f'Out of memory, decreasing batch size to '
                                f'{batch_size}.')
                        else:
                            logger.info(
                                'Out of memory on fixed batch size. '
                                'Terminating.')
                            cleanup()
                            return description, None
                    else:
                        logger.error(e)
                        raise e
                except LossIsNoneError:
                    logger.info('Loss is NaN. Terminating.')
                    cleanup()
                    return description, 0.
            else:
                # Runs when the batch size was decayed below
                # ``min_batch_size``.
                logger.info(
                    'Out of memory with minimum batch size. Terminating.')
                cleanup()
                return description, None
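# --- Illustrative sketch, not part of the original module ------------------
# ``train`` below calls ``evaluate(evaluation_list, worker_fn, gpu_indices)``,
# which is not shown in this file. A minimal version compatible with
# ``generic_worker``'s signature could hand every worker the same device
# queue and fan the descriptions out over a process pool; this is an
# assumption, not the project's actual implementation. It presumes that
# ``config``, ``space_type`` and ``reward_metric`` are already bound into
# ``worker_fn``.
from multiprocessing import Manager, Pool


def evaluate_sketch(evaluation_list, worker_fn, gpu_indices):
    """ Runs ``worker_fn`` on every description, at most one job per GPU. """
    device_queue = Manager().Queue()
    for idx in gpu_indices:
        device_queue.put(idx)
    jobs = [(description, device_queue) for description in evaluation_list]
    with Pool(processes=len(gpu_indices)) as pool:
        # Each call returns a (description, mean_reward) tuple.
        return pool.starmap(worker_fn, jobs)
# ---------------------------------------------------------------------------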
def train(config, worker, resume, num_gpus, gpu_idx):
    """ Curriculum architect training procedure.

    Consists of sampling descriptions of complexity :math:`i`, evaluating
    them, training the architect, and starting over with complexity
    :math:`i+1`. When ``max_complexity`` is reached, disables the curriculum,
    flattens the :class:`CurriculumStorage` and trains plainly from then on.

    Args:
        config (dict, str): configuration dictionary or path to a YAML file.
        worker (callable): worker callable.
        resume (bool): whether to try resuming the previous session.
        num_gpus (int): number of GPUs to use.
        gpu_idx (str, optional): string of comma separated GPU indices.
            If ``None``, ``range(num_gpus)`` is used.

    Returns:
        On keyboard interrupt, returns the storage filled with everything
        that has been found and evaluated.
    """
    config, space, architect, archicoach, gpu_indices, log_dir = train_init(
        config, resume, gpu_idx, num_gpus)
    input_shape = config['child_training']['input_shape']

    training_config = config['architect_training']
    load_architect = training_config['load_architect'] and resume
    epochs_per_loop = training_config['epochs_per_loop']
    architect_lr_decay = training_config['lr_decay']
    curriculum = training_config['curriculum']
    assert 0 < architect_lr_decay < 1
    if curriculum:
        storage_surplus_factor = training_config.get('storage_surplus_factor',
                                                     1)
        assert storage_surplus_factor >= 1

    storage = archicoach.storage
    summary_writer = archicoach.summary_writer
    loops = 0
    points_per_epoch = archicoach.batch_size * archicoach.epoch_steps
    while True:
        try:
            if curriculum:
                curriculum_complexity = loops + 1
                desired_storage_len = len(storage) + points_per_epoch
                if curriculum_complexity <= storage.max_complexity:
                    archicoach.stats.curriculum_complexity = \
                        curriculum_complexity
                    architect.search_space.set_curriculum_complexity(
                        curriculum_complexity)
                    storage.set_complexity(curriculum_complexity)
                    desired_storage_len = (points_per_epoch *
                                           storage_surplus_factor)
                elif curriculum_complexity == storage.max_complexity + 1:
                    # Maximum complexity reached: disable the curriculum and
                    # flatten the storage.
                    curriculum_complexity = 0
                    archicoach.curriculum = False
                    architect.search_space.release_all_constraints()
                    archicoach.storage = archicoach.storage.flatten()
                else:
                    curriculum_complexity = 0
            else:
                curriculum_complexity = None
                desired_storage_len = (loops + 1) * points_per_epoch

            evaluation_list, deterministic_is_viable = sample_loop(
                architect, storage, space, input_shape, desired_storage_len)
            worker_fn = partial(worker,
                                current_complexity=curriculum_complexity)
            result = evaluate(evaluation_list, worker_fn, gpu_indices)
            logger.debug(
                f'Architect training: Evaluated {len(result)} descriptions.')

            accuracies = []
            for description, reward in result:
                if reward == 'exists':
                    # The description was evaluated before: look up its
                    # stored reward instead of the sentinel value.
                    r = np.nan
                    if curriculum:
                        for level, index in storage.find(description).items():
                            r = storage.storages[level].rewards[index]
                            if not np.isnan(r.mean().item()):
                                break
                    else:
                        index = storage.find(description)
                        r = storage.rewards[index]
                    if not np.isnan(r.mean().item()):
                        reward = r.mean().item()
                        accuracies.append(reward)
                        storage.reward(description, float(reward))
                elif reward is not None:
                    accuracies.append(reward)
                    storage.reward(description, float(reward))
            logger.debug(
                f'Architect training: Updated storage with '
                f'{len(accuracies)} items.')

            # The last reward corresponds to the deterministic description.
            if len(result) > 0:
                if result[-1][1] is None:
                    deterministic_is_viable = False
                if deterministic_is_viable:
                    summary_writer.add_scalar('stochastic_acc',
                                              np.mean(accuracies[:-1]), loops)
                    summary_writer.add_scalar('deterministic_acc',
                                              accuracies[-1], loops)
                else:
                    summary_writer.add_scalar('stochastic_acc',
                                              np.mean(accuracies), loops)
                    logger.debug('Architect training: Deterministic '
                                 'description is not viable.')

            storage.filter_na()
            if (len(storage) < desired_storage_len
                    and curriculum_complexity != 0):
                logger.debug('Architect training: not enough samples '
                             'evaluated, rerunning the loop.')
                continue

            try:
                if len(result) > 0 or not load_architect:
                    logger.debug('Architect training: beginning the training.')
                    archicoach.train(epochs_per_loop)
                    architect.save(log_dir, 'checkpoint')
                    archicoach.decay_lr(architect_lr_decay)
                    loops += 1
            except ValueError as e:
                if 'Storage does not contain enough' in str(e):
                    continue
        except KeyboardInterrupt:
            return storage
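# --- Illustrative sketch, not part of the original module ------------------
# A hypothetical entry point showing how ``train`` could be driven from a
# YAML config: everything but the description, device queue and current
# complexity is bound into ``generic_worker`` in advance, matching the
# ``worker_fn = partial(worker, current_complexity=...)`` call inside
# ``train``. The paths and values are placeholders.
if __name__ == '__main__':
    worker = partial(generic_worker,
                     config='config.yaml',
                     space_type='feed_forward',
                     reward_metric='accuracy')
    train(config='config.yaml', worker=worker, resume=False,
          num_gpus=2, gpu_idx=None)
# ---------------------------------------------------------------------------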