def __init__(self, model, resume, config, logger_path):
    self.config = config
    self.device, device_ids = self._prepare_device(config.device)
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.epochs = config.trainer.epochs
    self.save_freq = config.trainer.save_freq
    self.verbosity = config.trainer.verbosity

    self.checkpoint_dir = config.trainer.checkpoint_dir
    mkdir_dir(self.checkpoint_dir)
    self.train_logger = Logger(logger_path)

    self.monitor = config.trainer.monitor
    self.monitor_mode = config.trainer.monitor_mode
    assert self.monitor_mode in ['min', 'max', 'off']
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.start_epoch = 1

    self.writer = WriterTensorboardX(config.trainer.checkpoint_dir,
                                     self.train_logger,
                                     config.visualization.tensorboardX)

    if resume:
        self._resume_checkpoint(resume)
def __init__(self, model, loss, optimizer, resume, config):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.optimizer = optimizer

    self.steps = config['trainer']['steps']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']
    self.start_step = 0

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], config['name'], start_time)

    # setup visualization writer instance
    writer_dir = os.path.join(config['visualization']['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, config['visualization']['tensorboardX'])

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
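# The ensure_dir helper called throughout these constructors is not defined in
# this file. A minimal sketch, assuming it simply creates the directory tree
# when it is missing (os.makedirs with exist_ok=True is the idiomatic
# one-liner equivalent):
import os

def ensure_dir(path):
    # create the directory (and any missing parents) if it does not exist yet
    if not os.path.exists(path):
        os.makedirs(path)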
def __init__(self, model, loss, metrics, optimizer, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)  # create a logger named after this class

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:  # parallelize across GPUs when more than one is available
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)
    # self.device = torch.device('cuda:0' if config['n_gpu'] > 0 else 'cpu')
    # self.model = model.to(self.device)
    # if config['n_gpu'] > 1:
    #     self.model = torch.nn.DataParallel(model)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer
    self.epochs = config['trainer']['epochs']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']
    self.train_logger = train_logger

    # configuration to monitor model performance and save best
    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode in ['min', 'max', 'off']  # raise if monitor_mode is not one of these three
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf  # initialize the best monitored value
    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')  # current timestamp
    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], config['name'], start_time)  # checkpoint directory name

    # setup visualization writer instance
    writer_dir = os.path.join(config['visualization']['log_dir'], config['name'], start_time)  # visualization log path
    self.writer = WriterTensorboardX(writer_dir, self.logger,
                                     config['visualization']['tensorboardX'])  # third argument toggles visualization

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)  # create checkpoint_dir if it does not exist
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:  # write the config into the checkpoint directory
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def __init__(self, model, loss, metrics, optimizer, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer
    self.train_logger = train_logger

    cfg_trainer = config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    self.verbosity = cfg_trainer['verbosity']
    self.monitor = cfg_trainer.get('monitor', 'off')
    self.validation_every = cfg_trainer['validation_every']

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
        self.early_stop = cfg_trainer.get('early_stop', math.inf)

    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], config['name'], start_time)
    # setup visualization writer instance
    writer_dir = os.path.join(cfg_trainer['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, cfg_trainer['tensorboardX'])

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
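# A minimal sketch of the trainer config these constructors parse. The
# 'monitor' entry packs mode and metric into one string ('min val_loss'),
# which the code above splits into mnt_mode and mnt_metric; 'off' disables
# monitoring. Key names follow the snippets above; the concrete values are
# illustrative assumptions, not taken from any original config.
example_config = {
    "name": "experiment",           # hypothetical experiment name
    "n_gpu": 1,
    "trainer": {
        "epochs": 100,
        "save_period": 1,
        "verbosity": 2,
        "monitor": "min val_loss",  # '<mode> <metric>' or 'off'
        "early_stop": 10,
        "validation_every": 1,
        "save_dir": "saved/",
        "log_dir": "saved/runs/",
        "tensorboardX": True,
    },
}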
def __init__(self, modelD, modelG, loss1, loss2, metrics, optimizerD, optimizerG,
             resume, config, train_logger=None):
    # base trainer for GANs: takes two models, a discriminator D and a generator G
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move models into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.modelD = modelD.to(self.device)
    self.modelG = modelG.to(self.device)
    if len(device_ids) > 1:
        self.modelD = torch.nn.DataParallel(modelD, device_ids=device_ids)
        self.modelG = torch.nn.DataParallel(modelG, device_ids=device_ids)

    self.loss1 = loss1
    self.loss2 = loss2
    self.metrics = metrics
    self.optimizerD = optimizerD
    self.optimizerG = optimizerG
    self.epochs = config['trainer']['epochs']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']
    self.train_logger = train_logger

    # configuration to monitor model performance and save best
    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode in ['min', 'max', 'off']
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], config['name'], start_time)
    # setup visualization writer instance
    writer_dir = os.path.join(config['visualization']['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, config['visualization']['tensorboardX'])

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def __init__(
    self,
    model,
    losses,
    metrics,
    optimizer_g,
    optimizer_d_s,
    optimizer_d_t,
    resume,
    config,
    train_logger=None,
    pretrained_path=None,
):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)

    self.losses = losses
    self.metrics = metrics
    self.optimizer_g = optimizer_g
    self.optimizer_d_s = optimizer_d_s
    self.optimizer_d_t = optimizer_d_t

    self.epochs = config['trainer']['epochs']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']
    # Set pretrained_load_strict to False to load a model without strict
    # state-name matching. This is useful when the pretrained model was trained
    # without GAN components but we want to use a GAN this time.
    self.pretrained_load_strict = config['trainer']['pretrained_load_strict']
    self.train_logger = train_logger

    # configuration to monitor model performance and save best
    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode in ['min', 'max', 'off']
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], config['name'], start_time)
    # setup visualization writer instance
    writer_dir = os.path.join(config['visualization']['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, config['visualization']['tensorboardX'])

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
    elif pretrained_path is not None:
        self._load_pretrained(pretrained_path)

    # put model into DataParallel module only after the checkpoint is loaded
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)
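# _load_pretrained is called above but not defined in this snippet. A minimal
# sketch, assuming the checkpoint stores weights under 'state_dict' and that
# pretrained_load_strict toggles strict key matching (both are assumptions):
def _load_pretrained(self, pretrained_path):
    self.logger.info("Loading pretrained weights: {}".format(pretrained_path))
    checkpoint = torch.load(pretrained_path, map_location=self.device)
    # strict=False lets a GAN-augmented model absorb a non-GAN checkpoint,
    # ignoring discriminator keys that have no pretrained counterpart
    self.model.load_state_dict(checkpoint['state_dict'],
                               strict=self.pretrained_load_strict)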
def __init__(self, model, loss, metrics, optimizer, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model = model.to(self.device)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer
    self.train_logger = train_logger

    cfg_trainer = config['train']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_p']
    self.verbosity = cfg_trainer['verbosity']
    self.monitor = cfg_trainer.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
        self.early_stop = cfg_trainer.get('early_stop', math.inf)

    self.start_epoch = 1

    # setup directories for checkpoint saving and logs
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], start_time, 'checkpoints')
    self.log_dir = os.path.join(cfg_trainer['save_dir'], start_time, 'logs')
    self.writer = WriterTensorboardX(self.log_dir, self.logger, cfg_trainer['tbX'])

    # Save configuration file into checkpoint directory:
    mkdir_p(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def __init__(self, model, loss, metrics, optimizer, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.with_cuda = config['cuda'] and torch.cuda.is_available()
    if config['cuda'] and not torch.cuda.is_available():
        self.logger.warning("Warning: There's no GPU available on this machine, "
                            "training will be performed on CPU.")
    self.device = torch.device('cuda:' + str(config['gpu']) if self.with_cuda else 'cpu')
    self.model = model.to(self.device)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer
    self.epochs = config['trainer']['epochs']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']
    self.train_logger = train_logger

    # configuration to monitor model performance and save best
    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode in ['min', 'max', 'off']
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], config['name'], start_time)
    # setup visualization writer instance
    writer_dir = os.path.join(config['visualization']['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, config['visualization']['tensorboardX'])

    # Save configuration into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def __init__(self, model, loss, metrics, optimizer, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config["n_gpu"])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer
    self.train_logger = train_logger

    cfg_trainer = config["trainer"]
    self.epochs = cfg_trainer["epochs"]
    self.save_period = cfg_trainer["save_period"]
    self.verbosity = cfg_trainer["verbosity"]
    self.monitor = cfg_trainer.get("monitor", "off")

    # configuration to monitor model performance and save best
    if self.monitor == "off":
        self.mnt_mode = "off"
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ["min", "max"]
        self.mnt_best = math.inf if self.mnt_mode == "min" else -math.inf
        self.early_stop = cfg_trainer.get("early_stop", math.inf)

    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime("%m%d_%H%M%S")
    self.checkpoint_dir = os.path.join(cfg_trainer["save_dir"], config["name"], start_time)
    # setup visualization writer instance
    writer_dir = os.path.join(cfg_trainer["log_dir"], config["name"], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, cfg_trainer["tensorboardX"])

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, "config.json")
    with open(config_save_path, "w") as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def __init__(self, model, loss, metrics, optimizer, resume, config):
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        print("Using DataParallel for loss")
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer

    cfg_trainer = config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    self.verbosity = cfg_trainer['verbosity']
    self.monitor = cfg_trainer.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf

        # Since early stopping was not available in earlier versions of the codebase,
        # we disable it completely (this allows previous experiments to be reproduced
        # without modifying configs)
        # self.early_stop = cfg_trainer.get('early_stop', math.inf)
        self.early_stop = math.inf

    self.start_epoch = 1
    self.latest_log = None

    # setup directory for checkpoint saving
    # if resume:
    #     start_time = os.path.split(os.path.split(resume)[0])[1]
    # else:
    #     start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance
    # writer_dir = os.path.join(cfg_trainer['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(config.log_dir, self.logger, cfg_trainer['tensorboardX'])

    if resume:
        self._resume_checkpoint(resume)
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in main.py.
    """

    def __init__(self, game, nnet, args):
        self.game = game
        self.args = args
        self.nnet = nnet
        self.pnet = self.nnet.__class__(self.game, self.args)  # the competitor network
        self.mcts = MCTS(self.game, self.nnet, self.args)
        # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.trainExamplesHistory = []
        self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()
        self.elo = 0  # Elo score of the current model
        self.logger = logging.getLogger(self.__class__.__name__)
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        # setup visualization writer instance
        writer_dir = os.path.join(self.args.log_dir, self.args.name, start_time)
        self.writer = WriterTensorboardX(writer_dir, self.logger, self.args.tensorboardX)

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played until it ends. After the game ends,
        the outcome of the game is used to assign values to each example in
        trainExamples.

        It uses temp=1 if episodeStep < tempThreshold, and thereafter uses temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard, pi, v)
                           pi is the MCTS-informed policy vector, v is +1 if the
                           player eventually won the game, else -1.
        """
        self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
        trainExamples = []
        board = self.game.getInitBoard()
        self.curPlayer = 1
        episodeStep = 0

        while True:
            episodeStep += 1
            canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
            temp = int(episodeStep < self.args.tempThreshold)

            pi = self.mcts.getActionProb(canonicalBoard, temp=temp)
            sym = self.game.getSymmetries(canonicalBoard, pi)
            for b, p in sym:
                trainExamples.append([b, self.curPlayer, p, None])

            action = np.random.choice(len(pi), p=pi)
            board, self.curPlayer = self.game.getNextState(board, self.curPlayer, action)

            r = self.game.getGameEnded(board, self.curPlayer)

            if r != 0:
                return [(x[0], x[2], r * ((-1) ** (x[1] != self.curPlayer)))
                        for x in trainExamples]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains the neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """
        for i in tqdm(range(1, self.args.numIters + 1), desc='Iteration'):
            # examples of the iteration
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)
                for eps in tqdm(range(self.args.numEps), desc='mcts.Episode'):
                    iterationTrainExamples += self.executeEpisode()

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
                print("len(trainExamplesHistory) =", len(self.trainExamplesHistory),
                      " => remove the oldest trainExamples")
                self.trainExamplesHistory.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(i - 1)

            # shuffle examples before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            pmcts = MCTS(self.game, self.pnet, self.args)

            self.nnet.train(trainExamples, self.writer)
            self.writer.set_step(i - 1, "learning")
            nmcts = MCTS(self.game, self.nnet, self.args)

            print("PITTING AGAINST METRIC COMPONENTS")
            for metric_opponent in self.args.metric_opponents:
                arena = Arena(lambda x: np.argmax(nmcts.getActionProb(x, temp=0)),
                              metric_opponent(self.game).play, self.game)
                nwins, owins, draws = arena.playGames(self.args.metricArenaCompare)
                print('%s WINS : %d / %d ; DRAWS : %d' %
                      (metric_opponent.__name__, nwins, owins, draws))
                if nwins + owins == 0:
                    win_prct = 0
                else:
                    win_prct = float(nwins) / (nwins + owins)
                self.writer.add_scalar('{}_win'.format(metric_opponent.__name__), win_prct)

                # Reset nmcts
                nmcts = MCTS(self.game, self.nnet, self.args)

            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                          lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
            pwins, nwins, draws = arena.playGames(self.args.arenaCompare)
            if nwins + pwins == 0:
                win_prct = 0
            else:
                win_prct = float(nwins) / (nwins + pwins)
            self.writer.add_scalar('self_win', win_prct)

            # Calculate Elo score for self-play
            results = [-x for x in arena.get_results()]  # flip so wins count for the new network
            nelo, pelo = elo(self.elo, self.elo, results)

            print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
            if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold:
                print('REJECTING NEW MODEL')
                self.elo = pelo
                self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            else:
                print('ACCEPTING NEW MODEL')
                self.elo = nelo
                self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                          filename=self.getCheckpointFile(i))
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
            self.writer.add_scalar('self_elo', self.elo)

    def getCheckpointFile(self, iteration):
        return 'checkpoint_' + str(iteration) + '.pth.tar'

    def saveTrainExamples(self, iteration):
        folder = self.args.checkpoint
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename = os.path.join(folder, self.getCheckpointFile(iteration) + ".examples")
        with open(filename, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory)

    def loadTrainExamples(self):
        modelFile = os.path.join(self.args.load_folder_file[0],
                                 self.args.load_folder_file[1])
        examplesFile = modelFile + ".examples"
        if not os.path.isfile(examplesFile):
            print(examplesFile)
            r = input("File with trainExamples not found. Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            print("File with trainExamples found. Read it.")
            with open(examplesFile, "rb") as f:
                self.trainExamplesHistory = Unpickler(f).load()
            # examples based on the model were already collected (loaded)
            self.skipFirstSelfPlay = True
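# The elo() function used in learn() is not defined in this snippet. A minimal
# sketch of a standard Elo update over a list of game results (+1 new-net win,
# -1 loss, 0 draw, matching the flipped arena results above); the K-factor of
# 32 and the 400-point scale are conventional assumptions, not taken from the
# original code:
def elo(rating_a, rating_b, results, k=32):
    for r in results:
        expected_a = 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400.0))
        score_a = (r + 1) / 2.0  # map -1/0/+1 to 0/0.5/1
        rating_a += k * (score_a - expected_a)
        rating_b += k * ((1 - score_a) - (1 - expected_a))
    return rating_a, rating_b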
def __init__(
    self,
    model: Module,
    loss: Callable,
    loss_args: dict,
    metrics: List[Callable],
    metric_args: List[dict],
    optimizer: Optimizer,
    config: dict,
    resume: Optional[str] = None,
    train_logger: Optional[Logger] = None
) -> None:
    self.config: dict = config
    self.logger: logging.Logger = logging.getLogger(self.__class__.__name__)

    # Setup GPU device if available.
    self.device: str
    device_ids: List[int]
    self.device, device_ids = self._prepare_device(config["n_gpu"])

    # Move model into device(s).
    self.model: Module = model.to(self.device)
    if len(device_ids) > 1:
        self.model = DataParallel(model, device_ids=device_ids)

    self.loss: Callable = loss
    self.loss_args: dict = loss_args
    self.metrics: List[Callable] = metrics
    self.metric_args: List[dict] = metric_args
    self.optimizer: Optimizer = optimizer
    self.train_logger: Optional[Logger] = train_logger

    cfg_trainer: dict = config["trainer"]
    self.epochs: int = cfg_trainer["epochs"]
    self.save_period: int = cfg_trainer["save_period"]
    self.verbosity: int = cfg_trainer["verbosity"]
    self.monitor: str = cfg_trainer.get("monitor", "off")

    self.mnt_mode: str
    self.mnt_best: float
    # Configuration to monitor model performance and save the best result.
    if self.monitor == "off":
        self.mnt_mode = "off"
        self.mnt_best = 0
    else:
        self.mnt_metric: str
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ["min", "max"]
        self.mnt_best = math.inf if self.mnt_mode == "min" else -math.inf
        self.early_stop: float = cfg_trainer.get("early_stop", math.inf)

    self.start_epoch = 1

    # Setup directory for saving checkpoints.
    start_time: str = datetime.datetime.now().strftime("%m%d_%H%M%S")
    self.checkpoint_dir: str = os.path.join(
        cfg_trainer["save_dir"], config["name"], start_time
    )

    # Setup visualization writer instance.
    writer_dir: str = os.path.join(
        cfg_trainer["log_dir"], config["name"], start_time
    )
    self.writer: WriterTensorboardX = WriterTensorboardX(
        writer_dir, self.logger, cfg_trainer["tensorboardX"]
    )

    # Save configuration file into checkpoint directory.
    ensure_dir(self.checkpoint_dir)
    config_save_path: str = os.path.join(self.checkpoint_dir, "config.json")
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def __init__(self, model, loss, metrics, optimizer, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer
    self.train_logger = train_logger

    cfg_trainer = config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    self.verbosity = cfg_trainer['verbosity']
    self.monitor = cfg_trainer.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
        self.early_stop = cfg_trainer.get('early_stop', math.inf)

    self.start_epoch = 1

    # UTC time to Beijing time would be +8 hours; the shift is left disabled
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    # start_time = start_time[:5] + str((int(start_time[5:7]) + 8) % 24) + start_time[7:]

    # setup directory for checkpoint saving
    self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], config['name'], start_time)
    # setup visualization writer instance
    writer_dir = os.path.join(cfg_trainer['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, cfg_trainer['tensorboardX'])

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')

    # logging to file
    fh = logging.handlers.RotatingFileHandler(
        Path(config['trainer']['save_dir']) / config['name'] / start_time / 'main.log',
        'w+', 20 * 1024 * 1024, 5)
    formatter = logging.Formatter(
        '%(asctime)s %(levelname)5s - %(name)s '
        '[%(filename)s line %(lineno)d] - %(message)s',
        datefmt='%m-%d %H:%M:%S')
    fh.setFormatter(formatter)
    self.logger.addHandler(fh)
    # print(self.model)
    # self.logger.info(self.model)
    print('saving weight/log/config to {}'.format(start_time))

    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        # a bare filename (no '/') is resolved by the outer resume helper
        if len(resume.split('/')) == 1:
            self._resume_checkpoint_outer(resume)
        else:
            self._resume_checkpoint(resume)
def __init__(self, resume, config, train_logger=None):
    self.config = config
    # used for displaying logging and warning info
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.device, self.device_ids = self._prepare_device(self.config['n_gpu'])

    # dataloader
    self.train_set, self.val_set = get_INBreast_dataloader(self.config)
    # class weights to balance the dataset
    self.class_weight = [torch.FloatTensor([1, 20]), torch.FloatTensor([1, 4])]
    self.label_weight = [2, 1]
    self.output_ch = len(self.class_weight)

    # build model architecture
    self.model = module_arch.AttU_Net(img_ch=1, output_ch=self.output_ch)
    # print(self.model)

    # model parallelism across multiple GPUs
    self.model = model_parallel(self.model, self.device, self.device_ids)

    # build optimizer and learning rate scheduler
    self.optimizer, self.scheduler = build_optimizer(self.model, self.config)

    self.train_logger = train_logger  # used for saving logging info

    trainer_config = self.config['trainer']
    self.max_epochs = trainer_config['max_epochs']
    self.save_period = trainer_config['save_period']
    self.val_period = trainer_config['val_period']
    self.verbosity = trainer_config['verbosity']
    self.monitor = trainer_config.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
        self.early_stop = trainer_config.get('early_stop', math.inf)

    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    if self.save_period != 0:
        self.checkpoint_dir = os.path.join(trainer_config['save_dir'],
                                           self.config['name'], start_time)
        ensure_dir(self.checkpoint_dir)

    # setup visualization writer instance
    writer_dir = os.path.join(trainer_config['log_dir'], self.config['name'], start_time)
    ensure_dir(writer_dir)
    self.writer = WriterTensorboardX(writer_dir, self.logger, trainer_config['tensorboardX'])

    # Save configuration file into logging directory:
    self._save_config(writer_dir)

    if resume:
        self._resume_checkpoint(resume)
def __init__(self, resume, config, train_logger=None):
    self.config = config
    # used for displaying logging and warning info
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.device, self.device_ids = self._prepare_device(self.config['n_gpu'])

    # dataloader
    self.train_set, self.val_set = get_UCLA_dataset(self.config)
    # class weights to balance the dataset
    self.class_weight = torch.FloatTensor([1, 1])

    # build model architecture
    self.model = module_arch.AttU_Net_Classification(img_ch=1)

    # load the pretrained segmentation weights
    pretrain_path = '../deployment/checkpoint/segmentation_model.pth'
    checkpoint = torch.load(pretrain_path)
    self.model.load_state_dict(checkpoint['model_state_dict'], strict=False)

    # freeze all parameters ...
    for param in self.model.parameters():
        param.requires_grad = False
    # ... then unfreeze the classification head
    for param in self.model.classification.parameters():
        param.requires_grad = True
    for param in self.model.fc.parameters():
        param.requires_grad = True

    # model parallelism across multiple GPUs
    self.model = model_parallel(self.model, self.device, self.device_ids)

    # build optimizer and learning rate scheduler
    self.optimizer, self.scheduler = build_optimizer(self.model, self.config)

    self.train_logger = train_logger  # used for saving logging info

    trainer_config = self.config['trainer']
    self.max_epochs = trainer_config['max_epochs']
    self.save_period = trainer_config['save_period']
    self.val_period = trainer_config['val_period']
    self.verbosity = trainer_config['verbosity']
    self.monitor = trainer_config.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
        self.early_stop = trainer_config.get('early_stop', math.inf)

    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    if self.save_period != 0:
        self.checkpoint_dir = os.path.join(trainer_config['save_dir'],
                                           self.config['name'], start_time)
        ensure_dir(self.checkpoint_dir)

    # setup visualization writer instance
    writer_dir = os.path.join(trainer_config['log_dir'], self.config['name'], start_time)
    ensure_dir(writer_dir)
    self.writer = WriterTensorboardX(writer_dir, self.logger, trainer_config['tensorboardX'])

    # Save configuration file into logging directory:
    self._save_config(writer_dir)

    if resume:
        self._resume_checkpoint(resume)
def __init__(self, models, optimizers, loss, metrics, resume, config, train_logger=None):
    """
    Class initialization.

    :param models: models dictionary containing the generator model, the local
                   discriminator model, and the global discriminator model
    :param optimizers: optimizers dictionary containing the generator optimizer
                       and the discriminator optimizers
    :param loss: loss dictionary containing the loss objectives
    :param metrics: metrics other than the loss to display during training
    :param resume: resume checkpoint
    :param config: config file
    :param train_logger: logger
    """
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move models into configured device
    self.device, self.device_ids = self._prepare_device(config['n_gpu'])
    self.generator = models["generator"].to(self.device)
    self.local_discriminator = models["local_discriminator"].to(self.device)

    # parallelize the models when multiple GPUs are available
    if len(self.device_ids) > 1:
        self.generator = torch.nn.DataParallel(self.generator, device_ids=self.device_ids)
        self.local_discriminator = torch.nn.DataParallel(
            self.local_discriminator, device_ids=self.device_ids)

    self.loss = loss
    self.metrics = metrics
    self.train_logger = train_logger
    self.generator_optimizer = optimizers["generator"]
    self.local_discriminator_optimizer = optimizers["local_discriminator"]

    # read training settings
    cfg_trainer = config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    self.verbosity = cfg_trainer['verbosity']
    self.monitor = cfg_trainer.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
        self.early_stop = cfg_trainer.get('early_stop', math.inf)

    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], config['name'], start_time)
    # setup visualization writer instance
    writer_dir = os.path.join(cfg_trainer['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, cfg_trainer['tensorboardX'])

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    self.config["checkpoint_dir"] = self.checkpoint_dir
    config_save_path = os.path.join(self.checkpoint_dir, 'line_gan_local_config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
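# The GAN-style constructors above (generator + discriminator, one optimizer
# each) leave the actual training step to subclasses. A minimal sketch of the
# usual alternating update, assuming self.loss holds 'adversarial' (BCE-style,
# taking predictions and a real/fake target) and 'reconstruction' entries as
# hinted by the loss-dictionary docstring; variable names are illustrative:
def _train_step(self, real, masked):
    adv = self.loss['adversarial']     # assumed BCE on discriminator outputs
    rec = self.loss['reconstruction']  # assumed pixel-wise loss, e.g. L1
    ones = torch.ones(real.size(0), 1, device=self.device)
    zeros = torch.zeros(real.size(0), 1, device=self.device)

    # 1) discriminator step: real vs. generated samples
    self.local_discriminator_optimizer.zero_grad()
    fake = self.generator(masked).detach()  # stop gradients into G
    d_loss = adv(self.local_discriminator(real), ones) + \
             adv(self.local_discriminator(fake), zeros)
    d_loss.backward()
    self.local_discriminator_optimizer.step()

    # 2) generator step: fool the discriminator and reconstruct the target
    self.generator_optimizer.zero_grad()
    fake = self.generator(masked)
    g_loss = adv(self.local_discriminator(fake), ones) + rec(fake, real)
    g_loss.backward()
    self.generator_optimizer.step()
    return d_loss.item(), g_loss.item()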
class BaseTrainer:
    """
    Base class for all trainers
    """

    def __init__(self, model, loss, metrics, optimizer, resume, config,
                 train_logger=None, data_loader=None):
        self.config = config
        self.data_loader = data_loader  # if provided

        # Setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(config['trainer']['save_dir'],
                                           config['name'], start_time)
        os.makedirs(self.checkpoint_dir, exist_ok=True)

        # Setup logger
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s %(message)s",
            handlers=[
                logging.FileHandler(os.path.join(self.checkpoint_dir, "train.log")),
                logging.StreamHandler(),
            ])
        self.logger = logging.getLogger(self.__class__.__name__)

        # Setup GPU device if available, move model into configured device
        self.device, device_ids = self._prepare_device(config['n_gpu'])
        self.model = model.to(self.device)
        if len(device_ids) > 1:
            self.model = torch.nn.DataParallel(model, device_ids=device_ids)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer
        self.epochs = config['trainer']['epochs']
        self.save_freq = config['trainer']['save_freq']
        self.verbosity = config['trainer']['verbosity']
        self.train_logger = train_logger

        # configuration to monitor model performance and save best
        self.monitor = config['trainer']['monitor']
        self.monitor_mode = config['trainer']['monitor_mode']
        assert self.monitor_mode in ['min', 'max', 'off']
        self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
        self.start_epoch = 1

        # setup visualization writer instances
        writer_train_dir = os.path.join(config['visualization']['log_dir'],
                                        config['name'], start_time, "train")
        writer_valid_dir = os.path.join(config['visualization']['log_dir'],
                                        config['name'], start_time, "valid")
        self.writer_train = WriterTensorboardX(
            writer_train_dir, self.logger, config['visualization']['tensorboardX'])
        self.writer_valid = WriterTensorboardX(
            writer_valid_dir, self.logger, config['visualization']['tensorboardX'])

        # Save configuration file into checkpoint directory
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(config, handle, indent=4, sort_keys=False)

        # Resume
        if resume:
            self._resume_checkpoint(resume)

    def _prepare_device(self, n_gpu_use):
        """
        Setup GPU device if available, move model into configured device
        """
        n_gpu = torch.cuda.device_count()
        if n_gpu_use > 0 and n_gpu == 0:
            self.logger.warning(
                "Warning: There's no GPU available on this machine, "
                "training will be performed on CPU.")
            n_gpu_use = 0
        if n_gpu_use > n_gpu:
            msg = ("Warning: The number of GPUs configured to use is {}, "
                   "but only {} are available on this machine.").format(n_gpu_use, n_gpu)
            self.logger.warning(msg)
            n_gpu_use = n_gpu
        device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu')
        list_ids = list(range(n_gpu_use))
        return device, list_ids

    def addGraph_Tensorboard(self):
        if self.data_loader is None:
            return
        data, _ = next(iter(self.data_loader))
        data = data.to(self.device)
        self.writer_train.addGraph(self.model, data)

    def train(self):
        # first, write the graph to tensorboardX if possible
        self.addGraph_Tensorboard()
        for epoch in range(self.start_epoch, self.epochs + 1):
            self.logger.info(
                "\n----------------------------------------------------------------")
            self.logger.info("[EPOCH %d]" % (epoch))
            start_time = time()
            result = self._train_epoch(epoch)
            finish_time = time()
            self.logger.info("Finish at {}, Runtime: {:.3f} [s]".format(
                datetime.datetime.now(), finish_time - start_time))

            # save logged informations into log dict
            log = {}
            for key, value in result.items():
                if key == 'train_metrics':
                    log.update({'train_' + mtr.__name__: value[i]
                                for i, mtr in enumerate(self.metrics)})
                elif key == 'valid_metrics':
                    log.update({'valid_' + mtr.__name__: value[i]
                                for i, mtr in enumerate(self.metrics)})
                else:
                    log[key] = value

            # print logged informations to the screen
            if self.train_logger is not None:
                self.train_logger.add_entry(log)
                if self.verbosity >= 1:
                    for key, value in sorted(list(log.items())):
                        self.logger.info('{:25s}: {}'.format(str(key), value))

            # evaluate model performance according to configured metric,
            # save best checkpoint as model_best
            best = False
            if self.monitor_mode != 'off':
                try:
                    if (self.monitor_mode == 'min' and log[self.monitor] < self.monitor_best) or \
                       (self.monitor_mode == 'max' and log[self.monitor] > self.monitor_best):
                        self.logger.info("Monitor improved from %f to %f" %
                                         (self.monitor_best, log[self.monitor]))
                        self.monitor_best = log[self.monitor]
                        best = True
                except KeyError:
                    if epoch == 1:
                        msg = ("Warning: Can't recognize metric named '{}' "
                               "for performance monitoring. model_best checkpoint "
                               "won't be updated.").format(self.monitor)
                        self.logger.warning(msg)

            # Save checkpoint
            self._save_checkpoint(epoch, save_best=best)

    def _train_epoch(self, epoch):
        """
        Training logic for an epoch

        :param epoch: Current epoch number
        """
        raise NotImplementedError

    def _save_checkpoint(self, epoch, save_best=False):
        """
        Saving checkpoints

        :param epoch: current epoch number
        :param save_best: if True, rename the saved checkpoint to 'model_best.pth'
        """
        # Construct save dict
        arch = type(self.model).__name__
        state = {
            'arch': arch,
            'epoch': epoch,
            'logger': self.train_logger,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'monitor_best': self.monitor_best,
            'config': self.config
        }

        # Save checkpoint for each epoch
        if self.save_freq is not None:  # set save_freq=None to avoid running out of disk space with large models
            if epoch % self.save_freq == 0:
                filename = os.path.join(self.checkpoint_dir, 'epoch{}.pth'.format(epoch))
                torch.save(state, filename)
                self.logger.info("Saving checkpoint at {}".format(filename))

        # Save the best checkpoint
        if save_best:
            best_path = os.path.join(self.checkpoint_dir, 'model_best.pth')
            torch.save(state, best_path)
            self.logger.info("Saving current best at {}".format(best_path))
        else:
            self.logger.info("Monitor is not improved from %f" % (self.monitor_best))

    def _resume_checkpoint(self, resume_path):
        """
        Resume from saved checkpoints

        :param resume_path: Checkpoint path to be resumed
        """
        self.logger.info("Loading checkpoint: {}".format(resume_path))
        checkpoint = torch.load(resume_path)
        self.start_epoch = checkpoint['epoch'] + 1
        self.monitor_best = checkpoint['monitor_best']

        # load architecture params from checkpoint.
        if checkpoint['config']['arch'] != self.config['arch']:
            self.logger.warning(
                'Warning: Architecture configuration given in config file is different from '
                'that of checkpoint. This may yield an exception while state_dict is being loaded.')
        self.model.load_state_dict(checkpoint['state_dict'], strict=True)

        # # load optimizer state from checkpoint only when optimizer type is not changed.
        # if checkpoint['config']['optimizer']['type'] != self.config['optimizer']['type']:
        #     self.logger.warning('Warning: Optimizer type given in config file is different from '
        #                         'that of checkpoint. Optimizer parameters not being resumed.')
        # else:
        #     self.optimizer.load_state_dict(checkpoint['optimizer'])

        self.train_logger = checkpoint['logger']
        self.logger.info("Checkpoint '{}' (epoch {}) loaded".format(
            resume_path, self.start_epoch - 1))
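# The BaseTrainer above leaves _train_epoch abstract. A minimal sketch of a
# concrete subclass, assuming a classification setup; the forward/backward
# details and the returned keys are illustrative (only 'train_metrics' /
# 'valid_metrics' have special meaning in BaseTrainer.train() above):
class ClassifierTrainer(BaseTrainer):
    def _train_epoch(self, epoch):
        self.model.train()
        total_loss = 0.0
        metric_totals = [0.0] * len(self.metrics)
        for data, target in self.data_loader:
            data, target = data.to(self.device), target.to(self.device)
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = self.loss(output, target)
            loss.backward()
            self.optimizer.step()
            total_loss += loss.item()
            for i, mtr in enumerate(self.metrics):
                metric_totals[i] += mtr(output, target)
        n = len(self.data_loader)
        return {
            'loss': total_loss / n,
            'train_metrics': [t / n for t in metric_totals],
        }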
def __init__(self, model, loss, metrics, optimizer, resume, config, train_logger=None):
    self.config = config

    # Setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'],
                                       config['name'], start_time)
    os.makedirs(self.checkpoint_dir, exist_ok=True)

    # Setup logger
    # logging.basicConfig(
    #     level=logging.INFO,
    #     format="%(asctime)s %(message)s",
    #     handlers=[
    #         logging.FileHandler(os.path.join(self.checkpoint_dir, "train.log")),
    #     ])
    # self.logger = logging.getLogger(self.__class__.__name__)
    fh = logging.FileHandler(os.path.join(self.checkpoint_dir, "train.log"))
    fh.setLevel(logging.INFO)
    self.logger = logging.getLogger(self.__class__.__name__)
    self.logger.setLevel(logging.INFO)
    self.logger.addHandler(fh)

    # Setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer
    self.epochs = config['trainer']['epochs']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']
    self.train_logger = train_logger

    # configuration to monitor model performance and save best
    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode in ['min', 'max', 'off']
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.start_epoch = 1

    # setup visualization writer instances
    writer_train_dir = os.path.join(config['visualization']['log_dir'],
                                    config['name'], start_time, "train")
    writer_valid_dir = os.path.join(config['visualization']['log_dir'],
                                    config['name'], start_time, "valid")
    self.writer_train = WriterTensorboardX(
        writer_train_dir, self.logger, config['visualization']['tensorboardX'])
    self.writer_valid = WriterTensorboardX(
        writer_valid_dir, self.logger, config['visualization']['tensorboardX'])

    # Save configuration file into checkpoint directory
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    # Resume
    if resume:
        self._resume_checkpoint(resume)
def __init__(self, model, loss, metrics, optimizer, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer
    self.train_logger = train_logger

    cfg_trainer = config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    self.verbosity = cfg_trainer['verbosity']
    self.monitor = cfg_trainer.get('monitor', 'off')

    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
        self.early_stop = cfg_trainer.get('early_stop', math.inf)

    self.start_epoch = 1

    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], config['name'], start_time)
    writer_dir = os.path.join(cfg_trainer['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, cfg_trainer['tensorboardX'])

    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)

def _prepare_device(self, n_gpu_use):
    n_gpu = torch.cuda.device_count()
    if n_gpu_use > 0 and n_gpu == 0:
        self.logger.warning("Warning: There's no GPU available on this machine, "
                            "training will be performed on CPU.")
        n_gpu_use = 0
    if n_gpu_use > n_gpu:
        self.logger.warning("Warning: The number of GPUs configured to use is {}, "
                            "but only {} are available on this machine.".format(n_gpu_use, n_gpu))
        n_gpu_use = n_gpu
    device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu')
    list_ids = list(range(n_gpu_use))
    return device, list_ids

def train(self):
    not_improved_count = 0
    for epoch in range(self.start_epoch, self.epochs + 1):
        result = self._train_epoch(epoch)

        log = {'epoch': epoch}
        for key, value in result.items():
            if key == 'metrics':
                log.update({mtr.__name__: value[i] for i, mtr in enumerate(self.metrics)})
            elif key == 'val_metrics':
                log.update({'val_' + mtr.__name__: value[i] for i, mtr in enumerate(self.metrics)})
            else:
                log[key] = value

        if self.train_logger is not None:
            self.train_logger.add_entry(log)
            if self.verbosity >= 1:
                for key, value in log.items():
                    self.logger.info('    {:15s}: {}'.format(str(key), value))

        best = False
        if self.mnt_mode != 'off':
            try:
                improved = (self.mnt_mode == 'min' and log[self.mnt_metric] < self.mnt_best) or \
                           (self.mnt_mode == 'max' and log[self.mnt_metric] > self.mnt_best)
            except KeyError:
                self.logger.warning("Warning: Metric '{}' is not found. Model performance "
                                    "monitoring is disabled.".format(self.mnt_metric))
                self.mnt_mode = 'off'
                improved = False
                not_improved_count = 0

            if improved:
                self.mnt_best = log[self.mnt_metric]
                not_improved_count = 0
                best = True
            else:
                not_improved_count += 1

            if not_improved_count > self.early_stop:
                self.logger.info("Validation performance didn't improve for {} epochs. "
                                 "Training stops.".format(self.early_stop))
                break

        if epoch % self.save_period == 0:
            self._save_checkpoint(epoch, save_best=best)

def _train_epoch(self, epoch):
    raise NotImplementedError

def _save_checkpoint(self, epoch, save_best=False):
    arch = type(self.model).__name__
    state = {
        'arch': arch,
        'epoch': epoch,
        'logger': self.train_logger,
        'state_dict': self.model.state_dict(),
        'optimizer': self.optimizer.state_dict(),
        'monitor_best': self.mnt_best,
        'config': self.config
    }
    filename = os.path.join(self.checkpoint_dir, 'checkpoint-epoch{}.pth'.format(epoch))
    torch.save(state, filename)
    self.logger.info("Saving checkpoint: {} ...".format(filename))
    if save_best:
        best_path = os.path.join(self.checkpoint_dir, 'model_best.pth')
        torch.save(state, best_path)
        self.logger.info("Saving current best: {} ...".format('model_best.pth'))

def _resume_checkpoint(self, resume_path):
    self.logger.info("Loading checkpoint: {} ...".format(resume_path))
    checkpoint = torch.load(resume_path)
    self.start_epoch = checkpoint['epoch'] + 1
    self.mnt_best = checkpoint['monitor_best']
    if checkpoint['config']['arch'] != self.config['arch']:
        self.logger.warning('Warning: Architecture configuration given in config file is '
                            'different from that of checkpoint. This may yield an exception '
                            'while state_dict is being loaded.')
    self.model.load_state_dict(checkpoint['state_dict'])
    if checkpoint['config']['optimizer']['type'] != self.config['optimizer']['type']:
        self.logger.warning('Warning: Optimizer type given in config file is different from '
                            'that of checkpoint. Optimizer parameters not being resumed.')
    else:
        self.optimizer.load_state_dict(checkpoint['optimizer'])
    self.train_logger = checkpoint['logger']
    self.logger.info("Checkpoint '{}' (epoch {}) loaded".format(resume_path, self.start_epoch))
class BaseTrainer:
    """
    Base class for all trainers
    """

    def __init__(self, model, loss, metrics, optimizer, resume, config, train_logger=None):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)

        self.loss = loss
        self.metrics = metrics
        self.optimizer = optimizer
        self.train_logger = train_logger

        cfg_trainer = config['train']
        self.epochs = cfg_trainer['epochs']
        self.save_period = cfg_trainer['save_p']
        self.verbosity = cfg_trainer['verbosity']
        self.monitor = cfg_trainer.get('monitor', 'off')

        # configuration to monitor model performance and save best
        if self.monitor == 'off':
            self.mnt_mode = 'off'
            self.mnt_best = 0
        else:
            self.mnt_mode, self.mnt_metric = self.monitor.split()
            assert self.mnt_mode in ['min', 'max']
            self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
            self.early_stop = cfg_trainer.get('early_stop', math.inf)

        self.start_epoch = 1

        # setup directory for checkpoint saving
        start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
        self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], start_time, 'checkpoints')
        self.log_dir = os.path.join(cfg_trainer['save_dir'], start_time, 'logs')
        self.writer = WriterTensorboardX(self.log_dir, self.logger, cfg_trainer['tbX'])

        # Save configuration file into checkpoint directory:
        mkdir_p(self.checkpoint_dir)
        if self.config.get('cfg', None) is not None:
            cfg_save_path = os.path.join(self.checkpoint_dir, 'model.cfg')
            with open(cfg_save_path, 'w') as fw:
                fw.write(open(self.config['cfg']).read())
            self.config['cfg'] = cfg_save_path
        config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
        with open(config_save_path, 'w') as handle:
            json.dump(self.config, handle, indent=4, sort_keys=False)

        if resume:
            self._resume_checkpoint(resume)

    def train(self):
        """
        Full training logic
        """
        best_df = None
        not_improved_count = 0
        # f = open(os.path.join(self.log_dir, 'lr.txt'), 'w')
        for epoch in range(self.start_epoch, self.epochs + 1):
            # _train_epoch returns a dict with train metrics ("metrics"), validation
            # metrics ("val_metrics") and other key/value pairs. Store/update them in log.
            result = self._train_epoch(epoch)

            # save logged informations into log dict
            log = {'epoch': epoch}
            for key, value in result.items():
                if key == 'metrics':
                    log.update({mtr.__name__: value[i]
                                for i, mtr in enumerate(self.metrics)})
                elif key == 'val_metrics':
                    log.update({'val_' + mtr.__name__: value[i]
                                for i, mtr in enumerate(self.metrics)})
                else:
                    log[key] = value
            c_lr = self.optimizer.param_groups[0]['lr']

            # print logged informations to the screen
            if self.train_logger is not None:
                self.train_logger.add_entry(log)
                if self.verbosity >= 1:
                    df = pd.DataFrame.from_dict([log]).T
                    df.columns = ['']
                    # self.logger.info('Epoch: {}'.format(epoch))
                    self.logger.info('{}'.format(df.loc[df.index != 'epoch']))
                    self.logger.info('lr_0: {}'.format(c_lr))
            # f.write('%.5f\t%.5f\t%.5f\n' % (c_lr, result['loss'], result['metrics'][0]))
            # f.flush()
            self.writer.add_scalar('lr', c_lr)

            # evaluate model performance according to configured metric,
            # save best checkpoint as model_best
            best = False
            if self.mnt_mode != 'off':
                try:
                    # check whether model performance improved or not,
                    # according to the specified metric (mnt_metric)
                    improved = (self.mnt_mode == 'min' and log[self.mnt_metric] < self.mnt_best) or \
                               (self.mnt_mode == 'max' and log[self.mnt_metric] > self.mnt_best)
                except KeyError:
                    self.logger.warning(
                        "Warning: Metric '{}' is not found. Model performance "
                        "monitoring is disabled.".format(self.mnt_metric))
                    self.mnt_mode = 'off'
                    improved = False
                    not_improved_count = 0

                if improved:
                    self.mnt_best = log[self.mnt_metric]
                    not_improved_count = 0
                    best = True
                    best_df = df
                else:
                    not_improved_count += 1

                if not_improved_count > self.early_stop:
                    self.logger.info("Validation performance didn't improve for {} epochs. "
                                     "Training stops.".format(self.early_stop))
                    self.logger.info('Final:\n{}'.format(best_df.loc[best_df.index != 'epoch']))
                    break

            if len(self.writer) > 0:
                self.logger.info('\nRun TensorboardX:\ntensorboard --logdir={}\n'.format(self.log_dir))

            if epoch % self.save_period == 0:
                self._save_checkpoint(epoch, save_best=best)
        # self.logger.info('\n\n\tTensorboardX Path: {}\n'.format(self.log_dir))

    def _train_epoch(self, epoch):
        """
        Training logic for an epoch

        :param epoch: Current epoch number
        """
        raise NotImplementedError

    def _save_checkpoint(self, epoch, save_best=False):
        """
        Saving checkpoints

        :param epoch: current epoch number
        :param save_best: if True, rename the saved checkpoint to 'model_best.pth'
        """
        arch = type(self.model).__name__
        state = {
            'arch': arch,
            'epoch': epoch,
            'logger': self.train_logger,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'monitor_best': self.mnt_best,
            'config': self.config,
            'classes': self.model.classes
        }
        filename = os.path.join(self.checkpoint_dir, 'checkpoint-current.pth')
        # filename = os.path.join(self.checkpoint_dir, 'checkpoint-epoch{}.pth'.format(epoch))
        torch.save(state, filename)
        self.logger.info("Saving checkpoint: {} ...".format(filename))
        if save_best:
            best_path = os.path.join(self.checkpoint_dir, 'model_best.pth')
            torch.save(state, best_path)
            self.logger.info("Saving current best: {} ...".format('model_best.pth'))
            self.logger.info("[IMPROVED]")

    def _resume_checkpoint(self, resume_path):
        """
        Resume from saved checkpoints

        :param resume_path: Checkpoint path to be resumed
        """
        self.logger.info("Loading checkpoint: {} ...".format(resume_path))
        checkpoint = torch.load(resume_path)
        self.start_epoch = checkpoint['epoch'] + 1
        self.mnt_best = checkpoint['monitor_best']
        # load architecture params from checkpoint.
        self.model.load_state_dict(checkpoint['state_dict'])
        # load optimizer state from checkpoint only when optimizer type is not changed.
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.train_logger = checkpoint['logger']
        self.logger.info("Checkpoint '{}' (epoch {}) loaded".format(resume_path, self.start_epoch))
def __init__(self, model, loss, metrics, optimizer, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer
    self.train_logger = train_logger

    cfg_trainer = config['trainer']
    cfg_model = config['arch']['args']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    self.verbosity = cfg_trainer['verbosity']
    self.monitor = cfg_trainer.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
        self.mnt_metric = 'off'  # keep defined: it is used in the run directory name below
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
        self.early_stop = cfg_trainer.get('early_stop', math.inf)

    self.start_epoch = 1
    self.not_improved_count = 0

    # encode the key hyper-parameters into a human-readable run directory name
    ss = str(config['trainer']['label_portion'])
    logvar_trainable = int(cfg_model['logvar_trainable'])
    pow_exp = int(cfg_model['pow_exp'])
    latent_dim = cfg_model['latent_dim']
    is_pitch_condition = int(cfg_model['is_pitch_condition'])
    is_pitch_discriminate = int(cfg_model['is_pitch_discriminate'])
    model_dir = '-'.join([
        config['data_loader']['args']['data_dir'].split('/')[-1],
        'ss_%s' % ss,
        'latent_%d' % latent_dim,
        'pow_%d' % pow_exp,
        'lvl_%d' % logvar_trainable,
        'pc_%d' % is_pitch_condition,
        'pd_%d' % is_pitch_discriminate,
        self.mnt_metric
    ])
    self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], config['name'], model_dir)

    # setup visualization writer instance
    writer_dir = os.path.join(cfg_trainer['log_dir'], config['name'], model_dir)
    self.writer = WriterTensorboardX(writer_dir, self.logger, cfg_trainer['tensorboardX'])

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
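# For illustration only: with hypothetical config values (all made up), the
# run-directory name built above resolves as follows.

data_dir = 'data/nsynth'
ss, latent_dim, pow_exp = '0.1', 16, 2
logvar_trainable, is_pitch_condition, is_pitch_discriminate = 1, 1, 0
mnt_metric = 'val_loss'

model_dir = '-'.join([
    data_dir.split('/')[-1],
    'ss_%s' % ss,
    'latent_%d' % latent_dim,
    'pow_%d' % pow_exp,
    'lvl_%d' % logvar_trainable,
    'pc_%d' % is_pitch_condition,
    'pd_%d' % is_pitch_discriminate,
    mnt_metric,
])
print(model_dir)  # -> nsynth-ss_0.1-latent_16-pow_2-lvl_1-pc_1-pd_0-val_loss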
def __init__(self, model, loss, metrics, optimizer, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    n_gpu_use = config['n_gpu']
    n_gpu = torch.cuda.device_count()
    if n_gpu_use > 0 and n_gpu == 0:
        self.logger.warning(
            "Warning: There's no GPU available on this machine, training will be performed on CPU.")
        n_gpu_use = 0
    if n_gpu_use > n_gpu:
        msg = "Warning: The number of GPUs configured to use is {}, but only {} are available on this machine.".format(
            n_gpu_use, n_gpu)
        self.logger.warning(msg)
        n_gpu_use = n_gpu
    self.device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu')
    self.model = model.to(self.device)
    if n_gpu_use > 1:
        self.model = torch.nn.DataParallel(model, device_ids=list(range(n_gpu_use)))

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer

    self.epochs = config['trainer']['epochs']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']
    self.train_logger = train_logger

    # configuration to monitor model performance and save best
    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode in ['min', 'max', 'off']
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], config['name'], start_time)

    # setup visualization writer instance
    writer_dir = os.path.join(config['visualization']['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, config['visualization']['tensorboardX'])

    # Save configuration into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
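# Several of the constructors above delegate GPU selection to a
# _prepare_device helper. The sketch below is inferred from the inline logic
# above and from the call sites (self.device, device_ids = ...); it is not a
# verbatim copy of any particular project's implementation.

def _prepare_device(self, n_gpu_use):
    """Pick the training device and the GPU id list for DataParallel."""
    n_gpu = torch.cuda.device_count()
    if n_gpu_use > 0 and n_gpu == 0:
        self.logger.warning("Warning: There's no GPU available on this machine, "
                            "training will be performed on CPU.")
        n_gpu_use = 0
    if n_gpu_use > n_gpu:
        self.logger.warning("Warning: The number of GPUs configured to use is {}, "
                            "but only {} are available on this machine.".format(n_gpu_use, n_gpu))
        n_gpu_use = n_gpu
    device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu')
    device_ids = list(range(n_gpu_use))
    return device, device_ids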
def train(self, examples, writer=None):
    """
    examples: list of examples, each example is of form (board, pi, v)
    writer: optional tensorboardX writer
    """
    optimizer = self.args.optimizer(self.nnet.parameters(), lr=self.args.lr, **self.args.optimizer_kwargs)
    scheduler = self.args.lr_scheduler(optimizer, **self.args.lr_scheduler_kwargs)

    # If no writer was given, fall back to a disabled no-op writer
    if writer is None:
        writer = WriterTensorboardX(None, None, False)

    epoch_bar = tqdm(desc="Training Epoch", total=self.args.epochs)
    for epoch in range(self.args.epochs):
        self.nnet.train()
        pi_losses = AverageMeter()
        v_losses = AverageMeter()
        total_losses = AverageMeter()
        num_batches = len(examples) // self.args.batch_size

        bar = tqdm(desc='Batch', total=num_batches)
        batch_idx = 0
        while batch_idx < num_batches:
            writer.set_step((self.train_iteration * self.args.epochs * num_batches) +
                            (epoch * num_batches) + batch_idx)
            # sample a random minibatch of (board, pi, v) examples
            sample_ids = np.random.randint(len(examples), size=self.args.batch_size)
            boards, pis, vs = list(zip(*[examples[i] for i in sample_ids]))
            boards = torch.FloatTensor(np.array(boards).astype(np.float64))
            target_pis = torch.FloatTensor(np.array(pis))
            target_vs = torch.FloatTensor(np.array(vs).astype(np.float64))

            # predict
            if self.args.cuda:
                boards, target_pis, target_vs = boards.contiguous().cuda(), target_pis.contiguous().cuda(), target_vs.contiguous().cuda()

            # compute output
            out_pi, out_v = self.nnet(boards)
            l_pi = self.loss_pi(target_pis, out_pi)
            l_v = self.loss_v(target_vs, out_v)
            total_loss = l_pi + l_v

            pi_losses.update(l_pi.item(), boards.size(0))
            v_losses.update(l_v.item(), boards.size(0))
            total_losses.update(total_loss.item(), boards.size(0))

            # record loss
            writer.add_scalar('pi_loss', l_pi.item())
            writer.add_scalar('v_loss', l_v.item())
            writer.add_scalar('loss', total_loss.item())

            # compute gradient and do SGD step
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            # advance to the next batch and update the progress bar
            batch_idx += 1
            bar.set_postfix(lpi=l_pi.item(), lv=l_v.item(), loss=total_loss.item())
            bar.update()
        bar.close()

        # step the LR scheduler once per epoch, after the optimizer updates
        # (the order required by PyTorch >= 1.1)
        scheduler.step()

        writer.set_step((self.train_iteration * self.args.epochs) + epoch, 'train_epoch')
        writer.add_scalar('epoch_pi_loss', pi_losses.avg)
        writer.add_scalar('epoch_v_loss', v_losses.avg)
        writer.add_scalar('epoch_loss', total_losses.avg)
        epoch_bar.set_postfix(avg_lpi=pi_losses.avg, avg_lv=v_losses.avg, avg_l=total_losses.avg)
        epoch_bar.update()
    epoch_bar.close()
    self.train_iteration += 1
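# The loop above assumes loss_pi and loss_v methods it does not define. In
# AlphaZero-style training these are commonly a policy cross-entropy against
# the MCTS visit distribution and a mean-squared value error; a sketch under
# that assumption, with out_pi taken to be log-probabilities:

def loss_pi(self, targets, outputs):
    # cross-entropy between the target policy (MCTS visit distribution) and
    # the network's log-probability output, averaged over the batch
    return -torch.sum(targets * outputs) / targets.size(0)

def loss_v(self, targets, outputs):
    # mean-squared error between target and predicted game outcome
    return torch.sum((targets - outputs.view(-1)) ** 2) / targets.size(0)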