Example #1
    def _default_log_interval_event(self, run_states: List[RunState]):
        '''
        PaddleHub default handler for log_interval_event; it writes the training loss and
        metrics to VisualDL and logs them to the console.

        Args:
            run_states (List[RunState]): the run states collected during the training phase
        '''
        scores, avg_loss, run_speed = self._calculate_metrics(run_states)
        # write the averaged loss of this interval to VisualDL
        self.vdl_writer.add_scalar(
            tag='Loss_{}'.format(self.phase), value=avg_loss, step=self._envs['train'].current_step)
        log_scores = ''
        for metric in scores:
            # write each metric to VisualDL and collect it for the console message
            self.vdl_writer.add_scalar(
                tag='{}_{}'.format(metric, self.phase), value=scores[metric], step=self._envs['train'].current_step)
            log_scores += '{}={:.5f} '.format(metric, scores[metric])
        logger.train('step {} / {}: loss={:.5f} {}[step/sec: {:.2f}]'.format(self.current_step, self.max_train_steps,
                                                                             avg_loss, log_scores, run_speed))
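
For context, the vdl_writer used above is PaddleHub's VisualDL writer. Below is a minimal, self-contained sketch of the same scalar-logging pattern; the log directory and the dummy loss/metric values are illustrative assumptions, not taken from the source.

from visualdl import LogWriter

# Dummy values; in the handler above they come from self._calculate_metrics(run_states).
avg_loss, scores, step = 0.4321, {'acc': 0.91}, 100

with LogWriter(logdir='./vdl_log') as writer:  # assumed log directory
    writer.add_scalar(tag='Loss_train', value=avg_loss, step=step)
    for metric, value in scores.items():
        writer.add_scalar(tag='{}_train'.format(metric), value=value, step=step)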
Example #2
    def train_one_epoch(self, loader: paddle.io.DataLoader, timer: Timer,
                        current_epoch: int, epochs: int, log_interval: int,
                        steps_per_epoch: int) -> None:
        avg_loss = 0
        avg_metrics = defaultdict(int)
        self.model.train()

        for batch_idx, batch in enumerate(loader):
            loss, metrics = self.training_step(batch, batch_idx)
            self.optimizer_step(current_epoch, batch_idx, self.optimizer, loss)
            self.optimizer_zero_grad(current_epoch, batch_idx, self.optimizer)

            # accumulate loss and metrics over the logging interval
            avg_loss += loss.numpy()[0]
            for metric, value in metrics.items():
                avg_metrics[metric] += value.numpy()[0]

            timer.count()

            if (batch_idx + 1) % log_interval == 0 and self.local_rank == 0:
                lr = self.optimizer.get_lr()
                avg_loss /= log_interval
                if self.use_vdl:
                    self.log_writer.add_scalar(tag='TRAIN/loss',
                                               step=timer.current_step,
                                               value=avg_loss)

                print_msg = 'Epoch={}/{}, Step={}/{}'.format(
                    current_epoch, epochs, batch_idx + 1, steps_per_epoch)
                print_msg += ' loss={:.4f}'.format(avg_loss)

                for metric, value in avg_metrics.items():
                    value /= log_interval
                    if self.use_vdl:
                        self.log_writer.add_scalar(
                            tag='TRAIN/{}'.format(metric),
                            step=timer.current_step,
                            value=value)
                    print_msg += ' {}={:.4f}'.format(metric, value)

                print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(
                    lr, timer.timing, timer.eta)

                logger.train(print_msg)

                avg_loss = 0
                avg_metrics = defaultdict(int)
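
train_one_epoch delegates the forward/backward pass to training_step, optimizer_step and optimizer_zero_grad. As a rough equivalent, here is a plain-Paddle sketch of the same accumulate-log-reset pattern with a toy model and random data; all names, shapes and hyperparameters are illustrative assumptions.

import paddle

model = paddle.nn.Linear(10, 1)
optimizer = paddle.optimizer.Adam(learning_rate=1e-3, parameters=model.parameters())
log_interval = 10
avg_loss = 0.0

model.train()
for step in range(100):
    x, y = paddle.randn([8, 10]), paddle.randn([8, 1])
    loss = paddle.nn.functional.mse_loss(model(x), y)

    # roughly what training_step + optimizer_step + optimizer_zero_grad do above
    loss.backward()
    optimizer.step()
    optimizer.clear_grad()

    avg_loss += float(loss)
    if (step + 1) % log_interval == 0:
        print('step {}: loss={:.4f} lr={:.6f}'.format(
            step + 1, avg_loss / log_interval, optimizer.get_lr()))
        avg_loss = 0.0  # reset the running average, as the loop above does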
Example #3
    def train(self,
              train_dataset: paddle.io.Dataset,
              epochs: int = 1,
              batch_size: int = 1,
              num_workers: int = 0,
              eval_dataset: paddle.io.Dataset = None,
              log_interval: int = 10,
              save_interval: int = 10,
              collate_fn: Callable = None):
        '''
        Train a model with the specified config.

        Args:
            train_dataset(paddle.io.Dataset) : Dataset to train the model.
            epochs(int) : Number of training epochs, default is 1.
            batch_size(int) : Batch size per step, default is 1.
            num_workers(int) : Number of subprocesses used to load data, default is 0.
            eval_dataset(paddle.io.Dataset) : The validation dataset, default is None. If set, the Trainer will
                execute the evaluate function every `save_interval` epochs.
            log_interval(int) : Log the training information every `log_interval` steps.
            save_interval(int) : Save the checkpoint every `save_interval` epochs.
            collate_fn(callable): Function to generate mini-batch data by merging the sample list.
                None means each field of the samples is stacked along axis 0 (same as :attr:`np.stack(..., axis=0)`).
                Default is None.
        '''
        batch_sampler = paddle.io.DistributedBatchSampler(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            drop_last=False)
        loader = paddle.io.DataLoader(train_dataset,
                                      batch_sampler=batch_sampler,
                                      num_workers=num_workers,
                                      return_list=True,
                                      use_buffer_reader=True,
                                      collate_fn=collate_fn)

        steps_per_epoch = len(batch_sampler)
        timer = Timer(steps_per_epoch * epochs)
        timer.start()

        for i in range(epochs):
            self.current_epoch += 1
            avg_loss = 0
            avg_metrics = defaultdict(int)
            self.model.train()

            for batch_idx, batch in enumerate(loader):
                loss, metrics = self.training_step(batch, batch_idx)
                self.optimizer_step(self.current_epoch, batch_idx,
                                    self.optimizer, loss)
                self.optimizer_zero_grad(self.current_epoch, batch_idx,
                                         self.optimizer)

                # accumulate loss and metrics over the logging interval
                avg_loss += loss.numpy()[0]
                for metric, value in metrics.items():
                    if isinstance(value, paddle.Tensor):
                        value = value.numpy()
                    avg_metrics[metric] += value

                timer.count()

                if (batch_idx + 1) % log_interval == 0 and self.local_rank == 0:
                    lr = self.optimizer.get_lr()
                    avg_loss /= log_interval
                    if self.use_vdl:
                        self.log_writer.add_scalar(tag='TRAIN/loss',
                                                   step=timer.current_step,
                                                   value=avg_loss)

                    print_msg = 'Epoch={}/{}, Step={}/{}'.format(
                        self.current_epoch, epochs, batch_idx + 1,
                        steps_per_epoch)
                    print_msg += ' loss={:.4f}'.format(avg_loss)

                    for metric, value in avg_metrics.items():
                        value /= log_interval
                        if self.use_vdl:
                            self.log_writer.add_scalar(
                                tag='TRAIN/{}'.format(metric),
                                step=timer.current_step,
                                value=value)
                        if isinstance(value, np.ndarray):
                            value = value.item()
                        print_msg += ' {}={:.4f}'.format(metric, value)

                    print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(
                        lr, timer.timing, timer.eta)

                    logger.train(print_msg)

                    avg_loss = 0
                    avg_metrics = defaultdict(int)

                if self.current_epoch % save_interval == 0 and batch_idx + 1 == steps_per_epoch and self.local_rank == 0:
                    if eval_dataset:
                        result = self.evaluate(eval_dataset,
                                               batch_size,
                                               num_workers,
                                               collate_fn=collate_fn)
                        eval_loss = result.get('loss', None)
                        eval_metrics = result.get('metrics', {})
                        if self.use_vdl:
                            if eval_loss:
                                self.log_writer.add_scalar(
                                    tag='EVAL/loss',
                                    step=timer.current_step,
                                    value=eval_loss)

                            for metric, value in eval_metrics.items():
                                self.log_writer.add_scalar(
                                    tag='EVAL/{}'.format(metric),
                                    step=timer.current_step,
                                    value=value)

                        if not self.best_metrics or self.compare_metrics(
                                self.best_metrics, eval_metrics):
                            self.best_metrics = eval_metrics
                            best_model_path = os.path.join(
                                self.checkpoint_dir, 'best_model')
                            self.save_model(best_model_path)
                            self._save_metrics()

                            metric_msg = [
                                '{}={:.4f}'.format(metric, value)
                                for metric, value in self.best_metrics.items()
                            ]
                            metric_msg = ' '.join(metric_msg)
                            logger.eval(
                                'Saving best model to {} [best {}]'.format(
                                    best_model_path, metric_msg))

                    self._save_checkpoint()
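
The first half of train only builds the data pipeline; the rest is the per-step logging and per-epoch evaluation/checkpointing shown above. The sketch below isolates that data-loading setup with a toy dataset so the role of steps_per_epoch is visible; the dataset class, sample count and feature shapes are illustrative assumptions.

import numpy as np
import paddle

class RandomDataset(paddle.io.Dataset):
    '''Toy stand-in for train_dataset.'''

    def __init__(self, num_samples=64):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        return np.random.randn(10).astype('float32'), np.random.randn(1).astype('float32')

    def __len__(self):
        return self.num_samples

train_dataset = RandomDataset()
batch_sampler = paddle.io.DistributedBatchSampler(
    train_dataset, batch_size=8, shuffle=True, drop_last=False)
loader = paddle.io.DataLoader(
    train_dataset, batch_sampler=batch_sampler, num_workers=0, return_list=True)

steps_per_epoch = len(batch_sampler)  # the same quantity train() uses to size its Timer
for batch_idx, (features, labels) in enumerate(loader):
    # with collate_fn=None each field is stacked along axis 0, as the docstring notes
    pass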