def make_device_groups(worker_count, devices, shared, cpu_mode):
    if not devices or cpu_mode:
        print('Warning: no devices detected, running in CPU mode')
        devices = [0]

    if shared or cpu_mode:
        return [devices for _ in range(worker_count)]

    device_per_worker = len(devices) // worker_count
    remaining_devices = len(devices) % worker_count

    if device_per_worker == 0:
        raise RuntimeError(
            f'Not enough devices (devices: {len(devices)}) < (workers: {worker_count}); '
            'use --device-sharing or --cpu to bypass this error')

    groups = []
    for wid in range(worker_count):
        groups.append(devices[device_per_worker * wid: device_per_worker * (wid + 1)])

    if remaining_devices > 0:
        warning('Some devices were not assigned to a worker')

    return groups
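# Minimal usage sketch (hypothetical helper, not part of the module above):
# with 4 devices and 2 workers each worker gets a disjoint slice; with device
# sharing every worker sees the full device list.
def _example_device_groups():
    dedicated = make_device_groups(2, [0, 1, 2, 3], False, False)
    # -> [[0, 1], [2, 3]]
    shared = make_device_groups(2, [0, 1, 2, 3], True, False)
    # -> [[0, 1, 2, 3], [0, 1, 2, 3]]
    return dedicated, shared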
def init(self, params=None, override=False, **kwargs):
    """Instantiate the underlying optimizer

    Raises
    ------
    MissingParameters
        if a hyper parameter is missing
    """
    if params is not None:
        params = list(params)
        assert isinstance(params, (list, tuple))

    if self._optimizer and not override:
        warning('Optimizer is already set, use override=True to force reinitialization')
        return self

    # add missing hyper parameters
    self.hyper_parameters.add_parameters(**kwargs)

    if params is None:
        params = self._model_parameters

    if params is None:
        raise MissingArgument('Missing model parameters!')

    self._optimizer = self._wrap_optimizer(
        self.optimizer_builder(
            params,
            **self.hyper_parameters.parameters(strict=True)))

    return self
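# Usage sketch for init (hypothetical hyper parameter names; assumes this
# method belongs to the Optimizer wrapper whose constructor appears below and
# that an 'sgd' optimizer is registered):
def _example_optimizer_init(model):
    opt = Optimizer('sgd', params=model.parameters())
    opt.init(lr=0.01, momentum=0.9, weight_decay=1e-4)
    # calling init again is ignored with a warning unless override=True
    opt.init(lr=0.1, override=True)
    return opt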
def observe(self, hpo):
    debug('observe')
    new_results = 0

    m = self.pop_result()
    while m is not None:
        actioned = True

        if m.mtype == RESULT_ITEM:
            info(f'HPO {self.experiment} observed {m.message[0]["uid"]}')

            try:
                hpo.observe(m.message[0], m.message[1])
                new_results += 1

            except TrialDoesNotExist as e:
                warning(f'Could not observe trial: {e}')
                actioned = False

        elif m.mtype == WORKER_JOIN:
            self.worker_count += 1

        elif m.mtype == WORKER_LEFT:
            self.worker_count -= 1

        else:
            debug(f'Received: {m}')

        if actioned:
            self.future_client.mark_actioned(RESULT_QUEUE, m)

        m = self.pop_result()

    return new_results
def save(self, task):
    if self.uid is None:
        raise BadCheckpoint('No uid was given; cannot save state')

    was_saved = False

    state = state_dict(task)
    state['rng'] = get_rng_states()

    # Has enough time passed since the last save?
    now = datetime.utcnow()
    elapsed = now - self.last_save
    should_save = elapsed.total_seconds() > self.time_buffer

    # Is it the best model we have seen so far?
    is_best = True
    if self.keep_best is not None:
        is_best = self.keep_best(task.metrics.value())

    if state:
        # The current model is not the best and the last (best) state was not
        # saved under a different path, so move the last state so it does not
        # get overridden by the current state
        if not is_best and self.best_name is None:
            info(f'Saving best ({self.keep_best.metric}: {self.keep_best.best})')
            self.best_name = self.new_best_name()

            was_pending = self.save_pending()
            if not was_pending:
                self.storage.rename(self.uid, self.best_name)

        if should_save:
            was_saved = self.storage.save(self.uid, state)
            self.save_pending()
            self.pending = None
            self.last_save = datetime.utcnow()
        else:
            self.save_pending()
            self.pending = (is_best, state)

        # We have a new best and the previous best was saved under a different
        # filename, so both the best state and the latest state need to change
        if is_best and self.best_name is not None:
            info(f'New best ({self.keep_best.metric}: {self.keep_best.best})')
            self.storage.remove(self.best_name)
            self.best_name = self.new_best_name()

            was_pending = self.save_pending()
            if not was_pending:
                self.storage.copyfile(self.uid, self.best_name)
    else:
        warning('The state dictionary was empty!')

    if was_saved:
        info('Checkpoint saved')
        return

    info('Skipped Checkpoint')
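# Training-loop sketch (hypothetical names; `checkpointer` is an instance of
# the class that owns save() above, configured with a time_buffer and an
# optional keep_best criterion):
def _example_checkpointing(checkpointer, task, epochs):
    for _ in range(epochs):
        task.fit_one_epoch()      # hypothetical training step
        checkpointer.save(task)   # skipped if time_buffer has not elapsed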
def __init__(self, name=None, *, params=None, optimizer=None, half=False,
             loss_scale=1, dynamic_loss_scale=False, scale_window=1000,
             scale_factor=2, min_loss_scale=None, max_loss_scale=2.**24,
             **kwargs):
    self._optimizer = None

    if params is not None:
        params = list(params)
        assert isinstance(params, (list, tuple))

    self._model_parameters = params
    self._half_parameters(half, loss_scale, dynamic_loss_scale, scale_window,
                          scale_factor, min_loss_scale, max_loss_scale)

    # Track defined hyper parameters
    self.hyper_parameters = HyperParameters(space={})

    if optimizer:
        warning('Using custom optimizer')

        if isinstance(optimizer, type):
            self.optimizer_builder = optimizer

            if hasattr(optimizer, 'get_space'):
                self.hyper_parameters.space = optimizer.get_space()
        else:
            self._optimizer = self._wrap_optimizer(optimizer)

            if hasattr(self._optimizer, 'get_space'):
                self.hyper_parameters.space = self._optimizer.get_space()

    elif name:
        # load an olympus optimizer
        self.optimizer_builder = registered_optimizers.get(name.lower())

        if not self.optimizer_builder:
            raise RegisteredOptimizerNotFound(name)

        if hasattr(self.optimizer_builder, 'get_space'):
            self.hyper_parameters.space = self.optimizer_builder.get_space()

    else:
        raise MissingArgument('optimizer or name needs to be set')

    # All additional args are hyper parameters
    self.hyper_parameters.add_parameters(**kwargs)
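# Construction sketch: the optimizer can come from the registry by name or be
# a user-supplied class (hypothetical; assumes the constructor above belongs
# to a class named Optimizer and MyOptimizer is any torch.optim.Optimizer
# subclass, optionally exposing get_space()):
def _example_optimizer_construction(model, MyOptimizer):
    by_name = Optimizer('sgd', params=model.parameters())
    custom = Optimizer(optimizer=MyOptimizer, params=model.parameters())
    return by_name, custom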
def register_hpo(name, factory, override=False):
    global registered_optimizer

    if name in registered_optimizer:
        warning(f'{name} was already registered, use override=True to ignore')

        if not override:
            return

    registered_optimizer[name] = factory
def register_initialization(name, factory, override=False):
    global registered_initialization

    if name in registered_initialization:
        warning(f'{name} was already registered, use override=True to ignore')

        if not override:
            return

    registered_initialization[name] = factory
def register_dataset(name, factory, override=False):
    global registered_datasets

    if name in registered_datasets:
        warning(f'{name} was already registered, use override=True to ignore')

        if not override:
            return

    registered_datasets[name] = factory
def register_adversary(name, factory, override=False):
    global registered_adversary

    if name in registered_adversary:
        warning(f'{name} was already registered, use override=True to ignore')

        if not override:
            return

    registered_adversary[name] = factory
def register_environment(name, factory, override=False):
    """Register a new environment backend"""
    global registered_environment

    if name in registered_environment:
        warning(f'{name} was already registered, use override=True to ignore')

        if not override:
            return

    registered_environment[name] = factory
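# Usage sketch for the registry helpers above (hypothetical factory name):
def _example_register():
    def my_dataset_factory(**kwargs):
        # build and return a torch.utils.data.Dataset here
        ...

    register_dataset('my-dataset', my_dataset_factory)
    # registering the same name again only takes effect with override=True
    register_dataset('my-dataset', my_dataset_factory, override=True)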
def init(self, override=False, **kwargs):
    if self._initializer and not override:
        warning('Initializer is already set, use override=True to force reinitialization')
        return self

    self.hyper_parameters.add_parameters(**kwargs)
    self._initializer = self.initializer_ctor(
        **self.hyper_parameters.parameters(strict=True))

    return self
def __init__(self, dataset, device):
    """
    Args:
        dataset (torch.utils.data.Dataset): dataset to cache
        device (torch.device): device where cached samples will be stored
    """
    warning('DatasetCache must only be used with small datasets')

    if device.type == 'cuda':
        warning('pin_memory must be set to False when caching to a cuda device')

    self.dataset = dataset
    self.device = device
    self.cache = {}
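# Usage sketch (hypothetical; assumes DatasetCache proxies __getitem__/__len__
# and stores each sample on `device` after its first access):
def _example_dataset_cache(small_dataset):
    import torch
    from torch.utils.data import DataLoader

    cached = DatasetCache(small_dataset, torch.device('cuda'))
    # pin_memory stays False, as the warning above requires for cuda caching
    return DataLoader(cached, batch_size=32, pin_memory=False)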
def init(self, optimizer=None, override=False, **kwargs):
    """Initialize the LR schedule with the given hyper parameters"""
    if self._schedule:
        warning('LRSchedule is already set, use override=True to force reinitialization')

        if not override:
            return self._schedule

    if optimizer is None:
        optimizer = self._optimizer

    if optimizer is None:
        raise MissingArgument('Missing optimizer argument!')

    self.hyper_parameters.add_parameters(**kwargs)
    self._schedule = self._schedule_builder(
        optimizer,
        **self.hyper_parameters.parameters(strict=True))

    return self
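# Sketch: building the schedule from an already-built optimizer (hypothetical
# class name LRSchedule, schedule name 'exponential' and gamma value):
def _example_schedule(optimizer):
    schedule = LRSchedule('exponential')
    return schedule.init(optimizer=optimizer, gamma=0.97)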
def add_apsect(obj_type, aspect):
    if obj_type in Resumable._aspects:
        warning(f'Overriding the aspect of {obj_type}')

    Resumable._aspects[obj_type] = aspect
def load_state_dict(self, state_dict):
    """Load a state dictionary to resume a previous training"""
    warning(f'This metric {type(self)} does not support resuming')
def state_dict(self):
    """Return a state dictionary used for checkpointing and resuming"""
    warning(f'This metric {type(self)} does not support resuming')
    return {}
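# Sketch of a metric that does support resuming by overriding both hooks
# (hypothetical counter metric; assumes the metric base class above is
# named Metric):
class _ExampleCounter(Metric):
    def __init__(self):
        self.count = 0

    def state_dict(self):
        return {'count': self.count}

    def load_state_dict(self, state_dict):
        self.count = state_dict['count']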
def get_space(self) -> Dict[str, str]:
    """Return the dimension space of each parameter"""
    if self._optimizer:
        warning('Optimizer is already set')

    return self.hyper_parameters.missing_parameters()
def get_space(self):
    """Return the missing hyper parameters required to initialize the LR schedule"""
    if self._schedule:
        warning('LRSchedule is already set')

    return self.hyper_parameters.missing_parameters()