Example #1
0
    def evaluate(self, step):
        """Run one full pass over the validation set and log the results.

        Accumulates every loss term returned by the model across the
        validation loader, writes the per-term averages to tensorboard
        (main process only), checkpoints on the averaged total loss, and
        prints a progress summary.

        Args:
            step: global training step, used as the x-axis for tensorboard
                scalars and passed through to ``self.checkpoint``.

        Returns:
            The averaged total validation loss, rounded to 6 decimals.

        Note:
            The model is left in train mode throughout — presumably because
            this detection-style model only returns a loss dict in train
            mode (torchvision convention); confirm against the model class.
        """
        accumulated = {"loss": 0}

        print("Evaluating model...")
        with torch.no_grad():
            for images, annotations in self.val_loader:
                device = self.config.device
                images = [img.to(device) for img in images]
                annotations = [
                    {key: value.to(device) for key, value in ann.items()}
                    for ann in annotations
                ]

                if self.config.distributed:
                    torch.cuda.synchronize()

                batch_losses = self.img2pose_model.forward(images, annotations)

                if self.config.distributed:
                    # average the loss dict across processes before logging
                    batch_losses = reduce_dict(batch_losses)

                total = sum(value for value in batch_losses.values())

                # accumulate each individual loss term, creating keys lazily
                for name, value in batch_losses.items():
                    accumulated[name] = accumulated.get(name, 0) + value.item()

                accumulated["loss"] += total.item()

        num_batches = len(self.val_loader)
        for name in accumulated:
            # only the main process owns the tensorboard writer
            if is_main_process():
                self.writer.add_scalar(
                    f"val_{name}",
                    round(accumulated[name] / num_batches, 6),
                    step,
                )

        val_loss = round(accumulated["loss"] / num_batches, 6)
        self.checkpoint(val_loss, step)

        print("Current validation loss: " + f"{val_loss:.6f} at step {step}" +
              " - Best validation loss: " +
              f"{self.best_val_loss:.6f} at step {self.best_step}")

        self.img2pose_model.train()

        return val_loss
Example #2
0
    def run(self):
        """Main training loop.

        Trains for ``self.config.epochs`` epochs, logging running losses to
        tensorboard every ``self.tensorboard_loss_every`` steps. After each
        epoch, either evaluates on the validation set (which also handles
        checkpointing) or, when no validation source is configured, saves
        the model directly. Optionally steps an LR-plateau scheduler and an
        early-stopping monitor on the validation loss, and runs a final
        evaluation after the last epoch.
        """
        self.img2pose_model.train()

        # accumulate running loss to log into tensorboard
        running_losses = {"loss": 0}

        step = 0

        # prints the best step and loss every time it does a validation
        self.best_step = 0
        self.best_val_loss = float("Inf")

        for epoch in range(self.config.epochs):
            train_logger = TrainLogger(self.config.batch_size,
                                       self.config.frequency_log)
            for idx, data in enumerate(self.train_loader):
                imgs, targets = data
                imgs = [image.to(self.config.device) for image in imgs]
                targets = [{k: v.to(self.config.device)
                            for k, v in t.items()} for t in targets]
                self.optimizer.zero_grad()

                # forward pass: the model returns a dict of loss terms
                losses = self.img2pose_model.forward(imgs, targets)

                loss = sum(loss for loss in losses.values())

                # backward propagation through the network
                loss.backward()

                # clip gradients to stabilize training
                torch.nn.utils.clip_grad_norm_(
                    self.img2pose_model.fpn_model.parameters(), 10)

                self.optimizer.step()

                if self.config.distributed:
                    # re-reduce the losses across processes so the logged
                    # values are consistent (the backward pass above used
                    # the local, un-reduced loss)
                    losses = reduce_dict(losses)
                    loss = sum(loss for loss in losses.values())

                # accumulate each individual loss term, creating keys lazily
                for loss_name, loss_value in losses.items():
                    running_losses[loss_name] = (
                        running_losses.get(loss_name, 0) + loss_value.item())

                running_losses["loss"] += loss.item()

                # saves loss into tensorboard
                if step % self.tensorboard_loss_every == 0 and step != 0:
                    for loss_name in running_losses:
                        self.writer.add_scalar(
                            f"train_{loss_name}",
                            running_losses[loss_name] /
                            self.tensorboard_loss_every,
                            step,
                        )

                        running_losses[loss_name] = 0

                train_logger(epoch, self.config.epochs, idx,
                             len(self.train_loader), loss.item())
                step += 1

            # evaluate model using validation set (if set)
            if self.config.val_source is not None:
                val_loss = self.evaluate(step)

            else:
                # otherwise just save the model
                save_model(
                    self.img2pose_model.fpn_model_without_ddp,
                    self.optimizer,
                    self.config,
                    step=step,
                )

            # if validation loss stops decreasing, decrease lr
            if self.config.lr_plateau and self.config.val_source is not None:
                self.scheduler.step(val_loss)

            # early stop model to prevent overfitting
            if self.config.early_stop and self.config.val_source is not None:
                self.early_stop(val_loss)
                if self.early_stop.stop:
                    print("Early stopping model...")
                    break

        if self.config.val_source is not None:
            val_loss = self.evaluate(step)