def main():
    """Train a model and evaluate it on the validation split each epoch.

    Reads CLI arguments, builds the dataset/model/optimizer, then runs
    `args.epochs` rounds of training followed by validation, printing
    loss/accuracy metrics after each pass.
    """
    # TODO: add a random-seed argument for reproducibility (original "Add seed" note).
    args = parser.get()
    data_class = data.Dataset(args)
    train, validation = data_class.train(), data_class.validation()
    model = models.get(args)
    # BUG FIX: the original wrote `optimizer = optimizer.get(...)`, which makes
    # `optimizer` a function-local name for the whole body and raises
    # UnboundLocalError before the module attribute can be read. Use a distinct
    # local name for the optimizer instance.
    opt = optimizer.get(args, model.parameters())
    criterion = torch.nn.CrossEntropyLoss()
    for epoch in range(args.epochs):
        # Training pass (train=True enables gradient updates in the runner).
        train_metrics = runner.run(
            model,
            criterion,
            opt,
            train,
            True,
            {"loss": metrics.loss, "accuracy": metrics.accuracy},
        )
        metrics.print(train_metrics)
        # Validation pass (train=False: evaluation only).
        validation_metrics = runner.run(
            model,
            criterion,
            opt,
            validation,
            False,
            {"loss": metrics.loss, "accuracy": metrics.accuracy},
        )
        metrics.print(validation_metrics)
def __init__(self, params):
    """Initialize the distributed trainer.

    Builds the network/criterion, loads data iterators, wraps the model for
    multi-GPU (DDP or apex, depending on fp16), creates the optimizer, and
    sets up validation / early-stopping / statistics bookkeeping, then
    reloads any existing checkpoint.

    Args:
        params: runtime parameters; `params.local_rank` identifies this
            process within the distributed group.
    """
    self.params = params

    # epoch / iteration size
    assert isinstance(config.epoch_size, int)
    assert config.epoch_size >= 1
    self.epoch_size = config.epoch_size

    # network and criterion
    net, criterion = model.get()
    self.net = net
    self.criterion = criterion

    # data iterators
    self.iterators = {}
    train_iter, valid_iter, SRC_TEXT, TGT_TEXT = dataset.load()
    # Synchronize all ranks so no process proceeds before every rank has
    # finished loading the dataset.
    torch.distributed.barrier()
    print("Process {}, dataset loaded.".format(params.local_rank))
    self.iterators["train"] = train_iter
    self.iterators["valid"] = valid_iter
    self.num_train = len(train_iter)
    self.SRC_TEXT = SRC_TEXT
    self.TGT_TEXT = TGT_TEXT
    torch.distributed.barrier()

    # Multi-GPU
    # fp16 requires an amp optimization level of at least 1.
    assert config.amp >= 1 or not config.fp16
    # Plain (fp32) DDP path; the fp16 path uses apex DDP below instead.
    # NOTE(review): `config.fp16 == False` would read better as
    # `not config.fp16` — behavior is the same for a bool config value.
    if config.multi_gpu and config.fp16 == False:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        self.net = nn.parallel.DistributedDataParallel(
            self.net, device_ids=[params.local_rank],
            output_device=params.local_rank)

    # set optimizers
    # The optimizer is built AFTER the (possible) fp32 DDP wrap but BEFORE
    # the apex wrap — preserve this ordering.
    self.opt = optimizer.get(self.net)
    torch.distributed.barrier()

    # Float16 / distributed
    if config.fp16:
        self.init_amp()
        if config.multi_gpu:
            # apex DDP with delayed allreduce (gradients reduced once per
            # backward pass rather than per-bucket).
            logger.info("Using apex.parallel.DistributedDataParallel ...")
            self.net = apex.parallel.DistributedDataParallel(
                self.net, delay_allreduce=True)

    # validation metrics
    # Each entry is [best_value_so_far, factor]; `factor` encodes the
    # comparison direction (e.g. +1 maximize, -1 minimize) — TODO confirm
    # against config.valid_metrics semantics.
    self.best_metrics = {}
    for k in config.valid_metrics.keys():
        factor = config.valid_metrics[k]
        self.best_metrics[k] = [config.init_metric * factor, factor]

    # early stopping metrics
    # NOTE(review): this copies the list REFERENCE, so early_stopping_metrics[k]
    # and best_metrics[k] alias the same list — mutating one mutates the
    # other. If independent tracking is intended, a copy (list(...)) is
    # needed; verify against how these are updated during training.
    self.early_stopping_metrics = {}
    for k in self.best_metrics:
        self.early_stopping_metrics[k] = self.best_metrics[k]

    self.decrease_counts = 0
    self.decrease_counts_max = config.decrease_counts_max
    self.stopping_criterion = config.stopping_criterion
    if config.multi_gpu:
        # A CUDA byte tensor so the termination flag can be broadcast /
        # reduced across ranks.
        self.should_terminate = torch.tensor(0).byte()
        self.should_terminate = self.should_terminate.cuda()
    else:
        self.should_terminate = False
    assert (self.stopping_criterion in self.best_metrics) or (self.stopping_criterion is None)

    # training statistics
    self.epoch = 0
    self.n_iter = 0
    self.n_total_iter = 0
    self.n_sentences = 0
    self.stats = OrderedDict([('processed_s', 0), ('processed_w', 0)] +
                             [('MT-%s-%s-loss' % (config.SRC_LAN, config.TGT_LAN), [])] +
                             [('MT-%s-%s-ppl' % (config.SRC_LAN, config.TGT_LAN), [])])
    self.last_time = time.time()

    # reload potential checkpoints
    self.reload_checkpoint(network_only=config.reload_network_only)
    print("Process {}, trainer initialized.".format(params.local_rank))
def __init__(self, params):
    """Set up the trainer: tensorboard logging, model, data, optimizer,
    multi-GPU wrapping, and training/validation bookkeeping state.

    Args:
        params: runtime parameters; `params.local_rank` selects the GPU
            for distributed training.
    """
    self.params = params

    # Tensorboard writers — one log directory per split under a common root.
    log_root = os.path.join(config.tensorboard_log_path, "log")
    self._tensorboard = TensorboardWriter(
        SummaryWriter(os.path.join(log_root, "train")),
        SummaryWriter(os.path.join(log_root, "valid")),
    )

    # Number of iterations that make up one "epoch".
    assert isinstance(config.epoch_size, int)
    assert config.epoch_size >= 1
    self.epoch_size = config.epoch_size

    # Model and loss criterion.
    self.net, self.criterion = model.get()

    # Data iterators and the source/target text fields.
    train_iter, valid_iter, SRC_TEXT, TGT_TEXT = dataset.get()
    self.iterators = {"train": train_iter, "valid": valid_iter}
    self.num_train = len(train_iter)
    self.SRC_TEXT = SRC_TEXT
    self.TGT_TEXT = TGT_TEXT

    # Multi-GPU: wrap the network in DDP before the optimizer is built.
    # (Wrapping the criterion in DDP as well was previously disabled here.)
    if config.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        self.net = nn.parallel.DistributedDataParallel(
            self.net,
            device_ids=[params.local_rank],
            output_device=params.local_rank,
        )

    # Optimizer over the (possibly DDP-wrapped) network.
    self.opt = optimizer.get(self.net)

    # Best-so-far validation metrics: [initial value scaled by factor, factor].
    self.best_metrics = {
        name: [config.init_metric * factor, factor]
        for name, factor in config.valid_metrics.items()
    }

    # Training statistics and per-metric histories.
    self.epoch = 0
    self.n_iter = 0
    self.n_total_iter = 0
    self.n_sentences = 0
    lang_pair = (config.SRC_LAN, config.TGT_LAN)
    self.stats = OrderedDict([
        ('processed_s', 0),
        ('processed_w', 0),
        ('MT-%s-%s-loss' % lang_pair, []),
        ('MT-%s-%s-ppl' % lang_pair, []),
    ])
    self.last_time = time.time()

    # Resume from an existing checkpoint if one is available.
    self.reload_checkpoint()