def run(self):
    """Run the training main loop until it finishes or fails.

    The loop stops once a training extension makes a
    `training_finish_requested` record in the log.

    Notes
    -----
    ``SIGINT`` is routed to `_handle_epoch_interrupt` and ``SIGTERM`` to
    `_handle_batch_interrupt` for the duration of the run; the previous
    handlers are kept on the instance and put back when the run ends.

    """
    # A no-op if the user has configured logging already; otherwise it
    # at least makes error messages visible.
    logging.basicConfig()

    # Crucial when resuming from a checkpoint, harmless in any other
    # case: start with an empty `profile.current`.
    self.profile.current = []

    with change_recursion_limit(config.recursion_limit):
        self.original_sigint_handler = signal.signal(
            signal.SIGINT, self._handle_epoch_interrupt)
        self.original_sigterm_handler = signal.signal(
            signal.SIGTERM, self._handle_batch_interrupt)
        try:
            logger.info("Entered the main loop")
            if not self.status['training_started']:
                for ext in self.extensions:
                    ext.main_loop = self
                self._run_extensions('before_training')
                with Timer('initialization', self.profile):
                    # Both algorithms are initialized, `g` before `d`.
                    self.algorithm_g.initialize()
                    self.algorithm_d.initialize()
                self.status['training_started'] = True

            # Deliberately a separate `if` rather than an `else`: the
            # "before_training" extensions may have modified the status
            # of the main loop.
            if self.log.status['iterations_done'] > 0:
                self.log.resume()
                self._run_extensions('on_resumption')
                self.status['epoch_interrupt_received'] = False
                self.status['batch_interrupt_received'] = False

            with Timer('training', self.profile):
                more_epochs = self._run_epoch()
                while more_epochs:
                    more_epochs = self._run_epoch()
        except TrainingFinish:
            self.log.current_row['training_finished'] = True
        except Exception as error:
            self._restore_signal_handlers()
            trace = traceback.format_exc()
            self.log.current_row['got_exception'] = trace
            training_failure = "Error occured during training." + error_message
            logger.error(training_failure)
            try:
                self._run_extensions('on_error')
            except Exception:
                # A failing error handler is logged, not propagated, so
                # that the original error is the one handed on below.
                logger.error(traceback.format_exc())
                handler_failure = ("Error occured when running extensions." +
                                   error_in_error_handling_message)
                logger.error(handler_failure)
            reraise_as(error)
        finally:
            self._restore_signal_handlers()
            training_finished = self.log.current_row.get(
                'training_finished', False)
            if training_finished:
                self._run_extensions('after_training')
            if config.profile:
                self.profile.report()
def continue_training(path):
    """Resume training from a checkpoint file.

    Parameters
    ----------
    path : str
        Path to the checkpoint.

    Notes
    -----
    Python picklers can only restore objects from the global namespace
    when those objects exist in the namespace where unpickling takes
    place. Global functions are often used for mapping, filtering and
    other data stream operations. If the main loop relies on such global
    objects and this function fails with a message like

    ```
    AttributeError: 'module' object has no attribute '...'
    ```

    the missing objects have to be imported first.

    Examples
    --------
    There are two ways to use this function: from the script in which
    the main loop is defined, or from a different script. For the latter
    option see the Notes section.

    """
    # The checkpoint is read with the configured recursion limit in
    # effect; the file is closed as soon as loading is done.
    with change_recursion_limit(config.recursion_limit), \
            open(path, "rb") as checkpoint:
        main_loop = load(checkpoint)
    main_loop.run()
def run(self):
    """Starts the main loop.

    The main loop ends when a training extension makes
    a `training_finish_requested` record in the log.

    Notes
    -----
    Any exception other than `TrainingFinish` raised while training is
    stored as a formatted traceback in the `got_exception` field of the
    current log row, the 'on_error' extensions are run, and the
    exception is then handed to `reraise_as`.

    """
    # This should do nothing if the user has already configured
    # logging, and will at least enable error messages otherwise.
    logging.basicConfig()

    if self._model and isinstance(self.algorithm,
                                  DifferentiableCostMinimizer):
        # Sanity check: model and algorithm should be configured
        # similarly. Mismatches are only warned about.
        if not self._model.get_objective() == self.algorithm.cost:
            logger.warning("different costs for model and algorithm")
        if not (set(self._model.get_params().values()) ==
                set(self.algorithm.params)):
            logger.warning("different params for model and algorithm")

    with change_recursion_limit(config.recursion_limit):
        self.original_sigint_handler = signal.signal(
            signal.SIGINT, self._handle_epoch_interrupt)
        self.original_sigterm_handler = signal.signal(
            signal.SIGTERM, self._handle_batch_interrupt)
        try:
            logger.info("Entered the main loop")
            if not self.status._training_started:
                for extension in self.extensions:
                    extension.main_loop = self
                self._run_extensions('before_training')
                self.algorithm.initialize()
                self.status._training_started = True
            # We can not write "else:" here because extensions
            # called "before_training" could have changed the status
            # of the main loop.
            if self.log.status.iterations_done > 0:
                self._run_extensions('on_resumption')
            while self._run_epoch():
                pass
        except TrainingFinish:
            self.log.current_row.training_finished = True
        except Exception as e:
            self._restore_signal_handlers()
            # `format_exc` formats the exception currently being handled;
            # its first positional argument is `limit`, so the exception
            # object must not be passed to it.
            self.log.current_row.got_exception = traceback.format_exc()
            logger.error("Error occured during training." + error_message)
            try:
                self._run_extensions('on_error')
            except Exception:
                logger.error(traceback.format_exc())
                logger.error("Error occured when running extensions." +
                             error_in_error_handling_message)
            reraise_as(e)
        finally:
            # Restore the signal handlers even if an 'after_training'
            # extension raises.
            try:
                if self.log.current_row.training_finished:
                    self._run_extensions('after_training')
            finally:
                self._restore_signal_handlers()
def dump(pickle_path, dump_path):
    """Load a pickled main loop and dump it with `MainLoopDumpManager`.

    Parameters
    ----------
    pickle_path : str
        Path to the pickled main loop.
    dump_path : str
        Destination handed to `MainLoopDumpManager`. If it is empty or
        ``None``, `pickle_path` with its extension stripped is used.

    Raises
    ------
    ValueError
        If `dump_path` is not given and `pickle_path` has no extension,
        so no distinct destination can be derived from it.

    """
    if not dump_path:
        root, ext = os.path.splitext(pickle_path)
        if not ext:
            raise ValueError(
                "Cannot infer a dump path from '{}': it has no "
                "extension to strip.".format(pickle_path))
        dump_path = root
    with change_recursion_limit(config.recursion_limit):
        # NOTE(security): unpickling can execute arbitrary code; only
        # use this on pickles from a trusted source.
        # The context manager closes the file once it has been read.
        with open(pickle_path, "rb") as source:
            main_loop = cPickle.load(source)
        MainLoopDumpManager(dump_path).dump(main_loop)
def run(self):
    """Starts the main loop.

    Installs the interrupt handlers, runs the 'before_training'
    extensions and initializes the algorithm on the first start, runs
    the 'on_resumption' extensions when iterations have already been
    done, and then runs epochs until `_run_epoch` returns a false value
    or `TrainingFinish` is raised.

    Notes
    -----
    Any other exception raised while training is stored as a formatted
    traceback in the 'got_exception' field of the current log row, the
    'on_error' extensions are run, and the exception is then handed to
    `reraise_as`.

    """
    # Does nothing if the user has already configured logging.
    logging.basicConfig()
    with change_recursion_limit(cfg.recursion_limit):
        self.original_sigint_handler = signal.signal(
            signal.SIGINT, self._handle_epoch_interrupt)
        self.original_sigterm_handler = signal.signal(
            signal.SIGTERM, self._handle_batch_interrupt)
        try:
            logger.info("Entered the main loop")
            if not self.status['training_started']:
                for extension in self.extensions:
                    extension.main_loop = self
                self._run_extensions('before_training')
                with Timer('initialization', self.profile):
                    self.algorithm.initialize()
                self.status['training_started'] = True
            # Not an "else:" branch: the 'before_training' extensions
            # run above may have changed the status of the main loop.
            if self.log.status['iterations_done'] > 0:
                self._run_extensions('on_resumption')
                self.status['epoch_interrupt_received'] = False
                self.status['batch_interrupt_received'] = False
            with Timer('training', self.profile):
                while self._run_epoch():
                    pass
        except TrainingFinish:
            self.log.current_row['training_finished'] = True
        except Exception as e:
            self._restore_signal_handlers()
            # `format_exc` formats the exception currently being handled;
            # its first positional argument is `limit`, so the exception
            # object must not be passed to it.
            self.log.current_row['got_exception'] = traceback.format_exc()
            logger.error("Error occured during training." + error_message)
            try:
                self._run_extensions('on_error')
            except Exception:
                logger.error(traceback.format_exc())
                logger.error("Error occured when running extensions." +
                             error_in_error_handling_message)
            reraise_as(e)
        finally:
            # Restore the signal handlers even if an 'after_training'
            # extension or the profile report raises.
            try:
                if self.log.current_row.get('training_finished', False):
                    self._run_extensions('after_training')
                if cfg.profile:
                    self.profile.report()
            finally:
                self._restore_signal_handlers()
def load_log(fname):
    """Load a :class:`TrainingLog` object from disk.

    This function automatically handles various file formats that
    contain an instance of a :class:`TrainingLog`. This includes a
    pickled Log object, a pickled :class:`MainLoop` or an experiment
    dump (TODO).

    Parameters
    ----------
    fname : str
        Path to the file to load.

    Returns
    -------
    log : :class:`TrainingLog`
        The loaded object itself if it is a :class:`TrainingLog`, or
        the `log` attribute of a loaded :class:`MainLoop`.

    Raises
    ------
    ValueError
        If the loaded object is neither a :class:`TrainingLog` nor a
        :class:`MainLoop`.

    """
    with change_recursion_limit(config.recursion_limit):
        with open(fname, 'rb') as f:
            from_disk = load(f)
    # TODO: Load "dumped" experiments
    if isinstance(from_disk, TrainingLog):
        log = from_disk
    elif isinstance(from_disk, MainLoop):
        log = from_disk.log
        # Only the log is returned; drop the local reference to the
        # main loop it came from.
        del from_disk
    else:
        # The placeholder must actually be filled in with the file name.
        raise ValueError(
            "Could not load '{}': Unrecognized content.".format(fname))
    return log
def run(self):
    """Starts the main loop.

    The main loop ends when a training extension makes
    a `training_finish_requested` record in the log.

    Notes
    -----
    Any exception other than `TrainingFinish` raised while training is
    stored as a formatted traceback in the 'got_exception' field of the
    current log row, the 'on_error' extensions are run, and the
    exception is then handed to `reraise_as`.

    """
    # This should do nothing if the user has already configured
    # logging, and will at least enable error messages otherwise.
    logging.basicConfig()

    if self._model and isinstance(self.algorithm,
                                  DifferentiableCostMinimizer):
        # Sanity check: model and algorithm should be configured
        # similarly. Mismatches are only warned about.
        if not self._model.get_objective() == self.algorithm.cost:
            logger.warning("different costs for model and algorithm")
        if not (set(self._model.get_params().values()) ==
                set(self.algorithm.params)):
            logger.warning("different params for model and algorithm")

    with change_recursion_limit(config.recursion_limit):
        self.original_sigint_handler = signal.signal(
            signal.SIGINT, self._handle_epoch_interrupt)
        self.original_sigterm_handler = signal.signal(
            signal.SIGTERM, self._handle_batch_interrupt)
        try:
            logger.info("Entered the main loop")
            if not self.status['training_started']:
                for extension in self.extensions:
                    extension.main_loop = self
                self._run_extensions('before_training')
                with Timer('initialization', self.profile):
                    self.algorithm.initialize()
                self.status['training_started'] = True
            # We can not write "else:" here because extensions
            # called "before_training" could have changed the status
            # of the main loop.
            if self.log.status['iterations_done'] > 0:
                self._run_extensions('on_resumption')
                self.status['epoch_interrupt_received'] = False
                self.status['batch_interrupt_received'] = False
            with Timer('training', self.profile):
                while self._run_epoch():
                    pass
        except TrainingFinish:
            self.log.current_row['training_finished'] = True
        except Exception as e:
            self._restore_signal_handlers()
            # `format_exc` formats the exception currently being handled;
            # its first positional argument is `limit`, so the exception
            # object must not be passed to it.
            self.log.current_row['got_exception'] = traceback.format_exc()
            logger.error("Error occured during training." + error_message)
            try:
                self._run_extensions('on_error')
            except Exception:
                logger.error(traceback.format_exc())
                logger.error("Error occured when running extensions." +
                             error_in_error_handling_message)
            reraise_as(e)
        finally:
            # Restore the signal handlers even if an 'after_training'
            # extension or the profile report raises.
            try:
                if self.log.current_row.get('training_finished', False):
                    self._run_extensions('after_training')
                if config.profile:
                    self.profile.report()
            finally:
                self._restore_signal_handlers()
def continue_training(path):
    """Continues training using a pickled main loop.

    Parameters
    ----------
    path : str
        Path to the pickled main loop.

    """
    with change_recursion_limit(config.recursion_limit):
        # NOTE(security): unpickling can execute arbitrary code; only
        # use this on checkpoints from a trusted source.
        # The context manager closes the file once it has been read.
        with open(path, "rb") as source:
            main_loop = cPickle.load(source)
    main_loop.run()
def dump(pickle_path, dump_path):
    """Load a pickled main loop and dump it with `MainLoopDumpManager`.

    Parameters
    ----------
    pickle_path : str
        Path to the pickled main loop.
    dump_path : str
        Destination handed to `MainLoopDumpManager`.

    """
    with change_recursion_limit(config.recursion_limit):
        # NOTE(security): unpickling can execute arbitrary code; only
        # use this on pickles from a trusted source.
        # The context manager closes the file once it has been read.
        with open(pickle_path, "rb") as source:
            main_loop = cPickle.load(source)
        MainLoopDumpManager(dump_path).dump(main_loop)
def run(self):
    """Run the training main loop until it finishes or fails.

    The loop stops once a training extension makes a
    `training_finish_requested` record in the log.

    Notes
    -----
    ``SIGINT`` is routed to `_handle_epoch_interrupt` and ``SIGTERM`` to
    `_handle_batch_interrupt` for the duration of the run; the previous
    handlers are kept on the instance and put back when the run ends.

    """
    # A no-op if the user has configured logging already; otherwise it
    # at least makes error messages visible.
    logging.basicConfig()

    # Crucial when resuming from a checkpoint, harmless in any other
    # case: start with an empty `profile.current`.
    self.profile.current = []

    # The sanity check is opt-in: it runs only for models that provide
    # a `check_sanity` attribute.
    if hasattr(self._model, 'check_sanity'):
        self._model.check_sanity(self.algorithm)

    with change_recursion_limit(config.recursion_limit):
        self.original_sigint_handler = signal.signal(
            signal.SIGINT, self._handle_epoch_interrupt)
        self.original_sigterm_handler = signal.signal(
            signal.SIGTERM, self._handle_batch_interrupt)
        try:
            logger.info("Entered the main loop")
            if not self.status['training_started']:
                for ext in self.extensions:
                    ext.main_loop = self
                self._run_extensions('before_training')
                with Timer('initialization', self.profile):
                    self.algorithm.initialize()
                self.status['training_started'] = True

            # Deliberately a separate `if` rather than an `else`: the
            # "before_training" extensions may have modified the status
            # of the main loop.
            if self.log.status['iterations_done'] > 0:
                self.log.resume()
                self._run_extensions('on_resumption')
                self.status['epoch_interrupt_received'] = False
                self.status['batch_interrupt_received'] = False

            with Timer('training', self.profile):
                more_epochs = self._run_epoch()
                while more_epochs:
                    more_epochs = self._run_epoch()
        except TrainingFinish:
            self.log.current_row['training_finished'] = True
        except Exception as error:
            self._restore_signal_handlers()
            trace = traceback.format_exc()
            self.log.current_row['got_exception'] = trace
            training_failure = "Error occured during training." + error_message
            logger.error(training_failure)
            try:
                # The 'on_error' extensions receive the exception itself.
                self._run_extensions('on_error', error)
            except Exception:
                # A failing error handler is logged, not propagated, so
                # that the original error is the one handed on below.
                logger.error(traceback.format_exc())
                handler_failure = ("Error occured when running extensions." +
                                   error_in_error_handling_message)
                logger.error(handler_failure)
            reraise_as(error)
        finally:
            self._restore_signal_handlers()
            training_finished = self.log.current_row.get(
                'training_finished', False)
            if training_finished:
                self._run_extensions('after_training')
            if config.profile:
                self.profile.report()