Example #1
import torch

# parser, data, models, optimizer, runner and metrics are project-local
# helper modules used by this example.


def main():
    # Parse command-line arguments; RNG seeding for reproducibility would go here.
    args = parser.get()

    # Build the dataset and split it into training and validation loaders.
    data_class = data.Dataset(args)
    train, validation = data_class.train(), data_class.validation()

    # Model, optimizer and loss. The optimizer is bound to a new name so the
    # optimizer helper module is not shadowed.
    model = models.get(args)
    optim = optimizer.get(args, model.parameters())
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(args.epochs):
        # Training pass (the boolean flag selects training mode).
        train_metrics = runner.run(
            model,
            criterion,
            optim,
            train,
            True,
            {
                "loss": metrics.loss,
                "accuracy": metrics.accuracy
            },
        )
        metrics.print(train_metrics)

        # Validation pass (no parameter updates).
        validation_metrics = runner.run(
            model,
            criterion,
            optim,
            validation,
            False,
            {
                "loss": metrics.loss,
                "accuracy": metrics.accuracy
            },
        )
        metrics.print(validation_metrics)
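
The example above delegates the per-epoch work to runner.run, which is not shown. The following is a minimal sketch of what such a function could look like, assuming each metric entry is a callable taking (outputs, targets) and returning a float, and that the loader yields (inputs, targets) batches. The name run and its signature mirror the call sites above but are otherwise an assumption, not the original implementation.

import torch


def run(model, criterion, optimizer, loader, is_train, metric_fns):
    # Hypothetical sketch, not part of the original example.
    if is_train:
        model.train()
    else:
        model.eval()
    totals = {name: 0.0 for name in metric_fns}
    batches = 0
    # Gradients are only needed for the training pass.
    with torch.set_grad_enabled(is_train):
        for inputs, targets in loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # Accumulate each metric over the epoch.
            for name, fn in metric_fns.items():
                totals[name] += fn(outputs, targets)
            batches += 1
    return {name: total / max(batches, 1) for name, total in totals.items()}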
Example #2
    def __init__(self, params):
        """
        Initialize trainer.
        """
        self.params = params

        # epoch / iteration size
        assert isinstance(config.epoch_size, int)
        assert config.epoch_size >= 1
        self.epoch_size = config.epoch_size

        # network and criterion
        net, criterion = model.get()
        self.net = net
        self.criterion = criterion

        # data iterators
        self.iterators = {}
        train_iter, valid_iter, SRC_TEXT, TGT_TEXT = dataset.load()
        torch.distributed.barrier()
        print("Process {}, dataset loaded.".format(params.local_rank))
        self.iterators["train"] = train_iter
        self.iterators["valid"] = valid_iter
        self.num_train = len(train_iter)
        self.SRC_TEXT = SRC_TEXT
        self.TGT_TEXT = TGT_TEXT

        torch.distributed.barrier()

        # Multi-GPU
        # fp16 training requires an amp optimization level of at least 1.
        assert config.amp >= 1 or not config.fp16
        if config.multi_gpu and not config.fp16:
            logger.info("Using nn.parallel.DistributedDataParallel ...")
            self.net = nn.parallel.DistributedDataParallel(
                self.net,
                device_ids=[params.local_rank],
                output_device=params.local_rank)

        # set optimizers
        self.opt = optimizer.get(self.net)

        torch.distributed.barrier()
        # Float16 / distributed
        if config.fp16:
            self.init_amp()
            if config.multi_gpu:
                logger.info("Using apex.parallel.DistributedDataParallel ...")
                self.net = apex.parallel.DistributedDataParallel(
                    self.net, delay_allreduce=True)

        # validation metrics
        self.best_metrics = {}
        for k in config.valid_metrics.keys():
            factor = config.valid_metrics[k]
            self.best_metrics[k] = [config.init_metric * factor, factor]

        # early stopping metrics
        self.early_stopping_metrics = {}
        for k in self.best_metrics:
            # Copy the [value, factor] pair so later in-place updates to
            # best_metrics do not silently change the early-stopping state.
            self.early_stopping_metrics[k] = list(self.best_metrics[k])

        self.decrease_counts = 0
        self.decrease_counts_max = config.decrease_counts_max
        self.stopping_criterion = config.stopping_criterion
        if config.multi_gpu:
            self.should_terminate = torch.tensor(0).byte()
            self.should_terminate = self.should_terminate.cuda()
        else:
            self.should_terminate = False
        assert (self.stopping_criterion
                in self.best_metrics) or (self.stopping_criterion is None)

        # training statistics
        self.epoch = 0
        self.n_iter = 0
        self.n_total_iter = 0
        self.n_sentences = 0
        self.stats = OrderedDict([('processed_s', 0), ('processed_w', 0)] +
                                 [('MT-%s-%s-loss' %
                                   (config.SRC_LAN, config.TGT_LAN), [])] +
                                 [('MT-%s-%s-ppl' %
                                   (config.SRC_LAN, config.TGT_LAN), [])])
        self.last_time = time.time()

        # reload potential checkpoints
        self.reload_checkpoint(network_only=config.reload_network_only)
        print("Process {}, trainer initialized.".format(params.local_rank))
Example #3
    def __init__(self, params):
        """
        Initialize trainer.
        """
        self.params = params

        # Initialize tensorboard writer
        train_log = SummaryWriter(
            os.path.join(config.tensorboard_log_path, "log", "train"))
        valid_log = SummaryWriter(
            os.path.join(config.tensorboard_log_path, "log", "valid"))
        self._tensorboard = TensorboardWriter(train_log, valid_log)

        # epoch / iteration size
        assert isinstance(config.epoch_size, int)
        assert config.epoch_size >= 1
        self.epoch_size = config.epoch_size

        # network and criterion
        net, criterion = model.get()
        self.net = net
        self.criterion = criterion

        # data iterators
        self.iterators = {}
        train_iter, valid_iter, SRC_TEXT, TGT_TEXT = dataset.get()
        self.iterators["train"] = train_iter
        self.iterators["valid"] = valid_iter
        self.num_train = len(train_iter)
        self.SRC_TEXT = SRC_TEXT
        self.TGT_TEXT = TGT_TEXT

        # Multi-GPU
        if config.multi_gpu:
            logger.info("Using nn.parallel.DistributedDataParallel ...")
            self.net = nn.parallel.DistributedDataParallel(
                self.net,
                device_ids=[params.local_rank],
                output_device=params.local_rank)
            """
            self.criterion = nn.parallel.DistributedDataParallel(
                    self.criterion, device_ids=[params.local_rank], output_device=params.local_rank
                    )
            """

        # set optimizers
        self.opt = optimizer.get(self.net)

        # validation metrics
        self.best_metrics = {}
        for k in config.valid_metrics.keys():
            factor = config.valid_metrics[k]
            self.best_metrics[k] = [config.init_metric * factor, factor]

        # training statistics
        self.epoch = 0
        self.n_iter = 0
        self.n_total_iter = 0
        self.n_sentences = 0
        self.stats = OrderedDict([('processed_s', 0), ('processed_w', 0)] +
                                 [('MT-%s-%s-loss' %
                                   (config.SRC_LAN, config.TGT_LAN), [])] +
                                 [('MT-%s-%s-ppl' %
                                   (config.SRC_LAN, config.TGT_LAN), [])])
        self.last_time = time.time()

        # reload potential checkpoints
        self.reload_checkpoint()
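
Both trainers initialize best_metrics as {name: [config.init_metric * factor, factor]}, where the sign of factor encodes whether a metric should be maximized (factor = 1) or minimized (factor = -1), assuming config.init_metric is a large negative sentinel value. The code that updates these entries after validation is not included; a minimal sketch consistent with that convention could look as follows. The method name update_best_metrics, the scores dict and the save_checkpoint helper are assumptions for illustration only.

    def update_best_metrics(self, scores):
        # Hypothetical sketch, not part of the original trainer.
        for name, (best, factor) in self.best_metrics.items():
            value = scores[name]
            # Multiplying by factor turns both cases into "bigger is better".
            if factor * value > factor * best:
                logger.info("New best %s: %s" % (name, value))
                self.best_metrics[name] = [value, factor]
                self.save_checkpoint("best-%s" % name)  # assumed helper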