import os

import torch
import torch.distributed as dist

# Configurable, State, and SignalMonitor are project helpers; the import paths below
# assume the repository's `concern` package layout.
from concern.config import Configurable, State
from concern.signal_monitor import SignalMonitor


class FileMonitorLearningRate(Configurable):
    # Path of the file watched for a manual learning-rate override.
    file_path = State()

    def __init__(self, **kwargs):
        self.load_all(**kwargs)
        self.monitor = SignalMonitor(self.file_path)

    def get_learning_rate(self, epoch, step):
        # Return the value written to the monitored file, or None to leave the
        # current learning rate untouched.
        signal = self.monitor.get_signal()
        if signal is not None:
            return float(signal)
        return None
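# Hedged usage sketch (not part of the original file): assumes load_all() accepts the
# State fields as keyword arguments and that SignalMonitor.get_signal() returns the
# monitored file's new contents (or None). `lr_control` and `optimizer` are
# placeholders supplied by the caller.
def _example_apply_file_lr(lr_control, optimizer, epoch, step):
    """Override the optimizer's learning rate when the monitored file provides one."""
    new_lr = lr_control.get_learning_rate(epoch, step)
    if new_lr is not None:
        # Apply the file's value to every parameter group.
        for group in optimizer.param_groups:
            group['lr'] = new_lr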
class ModelSaver(Configurable):
    dir_path = State()
    save_interval = State(default=1000)
    signal_path = State()

    def __init__(self, **kwargs):
        self.load_all(**kwargs)
        # BUG: signal path should not be global
        self.monitor = SignalMonitor(self.signal_path)

    def maybe_save_model(self, model, epoch, step, logger):
        # Save on the configured interval, or immediately when the signal file fires.
        if step % self.save_interval == 0 or self.monitor.get_signal() is not None:
            self.save_model(model, epoch, step)
            logger.report_time('Saving ')
            logger.iter(step)

    def save_model(self, model, epoch=None, step=None):
        if isinstance(model, dict):
            # A dict of named sub-networks: checkpoint each one separately.
            for name, net in model.items():
                checkpoint_name = self.make_checkpoint_name(name, epoch, step)
                self.save_checkpoint(net, checkpoint_name)
        else:
            checkpoint_name = self.make_checkpoint_name('model', epoch, step)
            self.save_checkpoint(model, checkpoint_name)

    def save_checkpoint(self, net, name):
        # In distributed training, only rank 0 writes checkpoints.
        if dist.is_available() and dist.is_initialized() and dist.get_rank() != 0:
            return
        os.makedirs(self.dir_path, exist_ok=True)
        torch.save(net.state_dict(), os.path.join(self.dir_path, name))

    def make_checkpoint_name(self, name, epoch=None, step=None):
        if epoch is None or step is None:
            c_name = name + '_latest'
        else:
            c_name = '{}_epoch_{}_minibatch_{}'.format(name, epoch, step)
        return c_name
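# Hedged usage sketch (not part of the original file): shows where a training loop
# might call maybe_save_model(). `net`, `optimizer`, `data_loader`, and `logger`
# (which needs report_time() and iter() methods) are placeholders supplied by the
# caller; the paths below are hypothetical, and passing State fields as keyword
# arguments is assumed to work via load_all().
def _example_training_loop(net, optimizer, data_loader, logger, epochs=1):
    saver = ModelSaver(dir_path='./workspace', save_interval=1000,
                       signal_path='./save_signal')
    lr_control = FileMonitorLearningRate(file_path='./lr_override')

    step = 0
    for epoch in range(epochs):
        for batch in data_loader:
            # ... forward / backward / optimizer.step() elided ...
            _example_apply_file_lr(lr_control, optimizer, epoch, step)
            # Checkpoints every `save_interval` steps, or as soon as the signal
            # file changes.
            saver.maybe_save_model(net, epoch, step, logger)
            step += 1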