Example #1
0
    def train(self, auto_resume=True, eval_freq=0):
        '''
        Run the training loop up to self.optimizer.max_epochs.

        Before every epoch the current model is saved via self.save_model(epoch);
        after the loop the model is saved once more with max_epochs as the
        epoch index and a final evaluation is run.

        Args:
            auto_resume(boolean): if True, call self.resume_latest_model() and
                start from the epoch it returns (0 when it returns None).
            eval_freq(int): if > 0, run self.eval() at the start of every epoch
                that is a positive multiple of eval_freq. default is 0 (no
                periodic evaluation).
        '''

        # pick the first epoch: 0 unless a checkpoint could be resumed
        first_epoch = 0
        if auto_resume:
            resumed_epoch = self.resume_latest_model()
            if resumed_epoch is not None:
                first_epoch = resumed_epoch

        for epoch in range(first_epoch, self.optimizer.max_epochs):
            # checkpoint the state reached *before* training this epoch
            self.save_model(epoch)

            # periodic evaluation; epoch 0 is skipped on purpose
            is_eval_epoch = eval_freq > 0 and epoch % eval_freq == 0 and epoch > 0
            if is_eval_epoch:
                self.eval(onebyone=False)

            # train one epoch, then log its results with a timestamp
            epoch_results = self.train_an_epoch(epoch)
            self.logging(EPOCH=epoch, TIME=time_now(), RESULTS=epoch_results)

        # checkpoint and evaluate the fully trained model
        self.save_model(self.optimizer.max_epochs)
        self.eval(onebyone=False)
Example #2
0
 def resume_latest_model(self):
     '''
     Resume from the latest model checkpoint in path self.results_dir.

     Only files named exactly ``model_<epoch>.pth`` (decimal epoch) are
     treated as checkpoints. Any other file in the directory (e.g.
     'final_model.pth.tar', 'model_best.pth', 'optimizer_3.pth') is ignored
     instead of crashing the epoch parsing or pointing at a model file that
     does not exist.

     Returns:
         int or None: epoch number of the checkpoint that was loaded into
         self.model, or None when no matching checkpoint file was found.
     '''
     prefix, suffix = 'model_', '.pth'
     # os_walk is a project helper; it is unpacked here as (root, dirs, files)
     root, _, files = os_walk(self.results_dir)

     # collect (epoch, file name) for every well-formed checkpoint name
     checkpoints = []
     for file_name in files:
         if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
             continue
         epoch_text = file_name[len(prefix):-len(suffix)]
         # isdecimal() guarantees int() succeeds and rejects the empty string
         if epoch_text.isdecimal():
             checkpoints.append((int(epoch_text), file_name))

     if not checkpoints:
         return None

     # load the file that was actually found, so names such as
     # 'model_05.pth' resolve to an existing path
     max_epoch, latest_file = max(checkpoints)
     model_path = os.path.join(root, latest_file)
     self.model.load_state_dict(torch.load(model_path), strict=True)
     self.logging(time_now(), 'restore from {}'.format(model_path))
     return max_epoch