Example 1
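All four snippets are methods lifted from a PyTorch trainer class. They assume imports such as os, numpy as np, torch, and torch.cuda.amp as amp (NVIDIA's apex.amp in Example 4), plus project-level helpers (Timer, LossLogger, MetricLogger, reduce_dict, get_current_lr, logger) that are not shown here.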
    def run_step(self, scaler, model, sample, optimizer, lossLogger, performanceLogger, prefix):
        '''
            Run a single train/val/infer step, including the forward pass
            :param scaler: torch.cuda.amp.GradScaler handling loss scaling
            :param model: model to train
            :param sample: a batch of input data
            :param optimizer: optimizer updating the model parameters
            :param lossLogger: logger accumulating loss statistics (may be None)
            :param performanceLogger: logger accumulating performance metrics (may be None)
            :param prefix: 'train', 'val' or 'infer'
            :return: losses
        '''
        imgs, targets = sample['image'], sample['target']
        imgs = [img.cuda() for img in imgs] if isinstance(imgs, list) else imgs.cuda()
        if isinstance(targets, list):
            if isinstance(targets[0], torch.Tensor):
                targets = [t.cuda() for t in targets]
            elif isinstance(targets[0], np.ndarray):
                targets = [torch.from_numpy(t).cuda() for t in targets]
            else:
                targets = [{k: v.cuda() for k, v in t.items()} for t in targets]
        elif isinstance(targets, dict):
            for (k, v) in targets.items():
                if isinstance(v, torch.Tensor):
                    targets[k] = v.cuda()
                elif isinstance(v, list):
                    if isinstance(v[0], torch.Tensor):
                        targets[k] = [t.cuda() for t in v]
                    elif isinstance(v[0], np.ndarray):
                        targets[k] = [torch.from_numpy(t).cuda() for t in v]
        else:
            targets = targets.cuda()

        if prefix == 'train':
            # zero the parameter gradients
            optimizer.zero_grad()

            # Autocast
            with amp.autocast(enabled=True):
                out = model(imgs, targets, prefix)
                if not isinstance(out, tuple):
                    losses, predicts = out, None
                else:
                    losses, predicts = out

            # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
            # Backward passes under autocast are not recommended.
            # Backward ops run in the same dtype autocast chose for corresponding forward ops.
            scaler.scale(losses["loss"]).backward()

            if self.cfg.GRAD_CLIP and self.cfg.GRAD_CLIP.VALUE:
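                # with GradScaler, gradients should be unscaled first
                # (scaler.unscale_(optimizer)) so the clipping threshold
                # applies to true gradient magnitudes; clip_grad is assumed
                # to account for this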
                self.clip_grad(model)

            # scaler.step() first unscales the gradients of the optimizer's assigned params.
            # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
            # otherwise, optimizer.step() is skipped.
            scaler.step(optimizer)
            # Updates the scale for next iteration.
            scaler.update()
        else:
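            # val/infer forward pass; callers are assumed to wrap this in
            # torch.no_grad() when gradients are not needed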
            losses, predicts = model(imgs, targets, prefix)

        if lossLogger is not None:
            if self.cfg.distributed:
                # reduce losses over all GPUs for logging purposes
                loss_dict_reduced = reduce_dict(losses)
                lossLogger.update(**loss_dict_reduced)
                del loss_dict_reduced
            else:
                lossLogger.update(**losses)

        if performanceLogger is not None:
            if predicts is not None:
                if self.cfg.distributed:
                    # reduce performances over all GPUs for logging purposes
                    predicts_dict_reduced = reduce_dict(predicts)
                    performanceLogger.update(targets, predicts_dict_reduced)
                    del predicts_dict_reduced
                else:
                    performanceLogger.update(targets, predicts)
                del predicts

        del imgs, targets
        return losses
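A minimal sketch of how this step might be driven, assuming a trainer instance exposing run_step, a CUDA-capable model, and a DataLoader yielding dicts with 'image' and 'target' keys (trainer, model, and train_loader are hypothetical names):

    import torch
    from torch.cuda import amp

    scaler = amp.GradScaler()  # one scaler, reused across every step
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    for sample in train_loader:  # each sample: {'image': ..., 'target': ...}
        losses = trainer.run_step(scaler, model, sample, optimizer,
                                  lossLogger=None, performanceLogger=None,
                                  prefix='train')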
Example 2
    def train_epoch(self,
                    scaler,
                    epoch,
                    model,
                    dataloader,
                    optimizer,
                    prefix="train"):
        model.train()

        _timer = Timer()
        lossLogger = LossLogger()
        performanceLogger = MetricLogger(self.dictionary, self.cfg)

        for i, sample in enumerate(dataloader):
            imgs, targets = sample['image'], sample['target']
            _timer.tic()
            # zero the parameter gradients
            optimizer.zero_grad()

            imgs = ([img.cuda() for img in imgs]
                    if isinstance(imgs, list) else imgs.cuda())
            if isinstance(targets, list):
                if isinstance(targets[0], torch.Tensor):
                    targets = [t.cuda() for t in targets]
                else:
                    targets = [{k: v.cuda()
                                for k, v in t.items()} for t in targets]
            else:
                targets = targets.cuda()

            # Autocast
            with amp.autocast(enabled=True):
                out = model(imgs, targets, prefix)

            if not isinstance(out, tuple):
                losses, predicts = out, None
            else:
                losses, predicts = out

            self.n_iters_elapsed += 1

            # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
            # Backward passes under autocast are not recommended.
            # Backward ops run in the same dtype autocast chose for corresponding forward ops.
            scaler.scale(losses["loss"]).backward()
            # scaler.step() first unscales the gradients of the optimizer's assigned params.
            # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
            # otherwise, optimizer.step() is skipped.
            scaler.step(optimizer)
            # Updates the scale for next iteration.
            scaler.update()

            # torch.cuda.synchronize()
            _timer.toc()

            if (i + 1) % self.cfg.N_ITERS_TO_DISPLAY_STATUS == 0:
                if self.cfg.distributed:
                    # reduce losses over all GPUs for logging purposes
                    loss_dict_reduced = reduce_dict(losses)
                    lossLogger.update(**loss_dict_reduced)
                    del loss_dict_reduced
                else:
                    lossLogger.update(**losses)

                if predicts is not None:
                    if self.cfg.distributed:
                        # reduce performances over all GPUs for logging purposes
                        predicts_dict_reduced = reduce_dict(predicts)
                        performanceLogger.update(targets,
                                                 predicts_dict_reduced)
                        del predicts_dict_reduced
                    else:
                        performanceLogger.update(targets, predicts)
                    del predicts

                if self.cfg.local_rank == 0:
                    template = "[epoch {}/{}, iter {}, lr {}] Total train loss: {:.4f} (ips = {:.2f})\n{}"
                    logger.info(
                        template.format(
                            epoch,
                            self.cfg.N_MAX_EPOCHS,
                            i,
                            round(get_current_lr(optimizer), 6),
                            lossLogger.meters["loss"].value,
                            self.batch_size *
                            self.cfg.N_ITERS_TO_DISPLAY_STATUS / _timer.diff,
                            "\n".join([
                                "{}: {:.4f}".format(n, l.value)
                                for n, l in lossLogger.meters.items()
                                if n != "loss"
                            ]),
                        ))

            del imgs, targets, losses

        if self.cfg.TENSORBOARD and self.cfg.local_rank == 0:
            # Logging train losses
            for n, l in lossLogger.meters.items():
                self.tb_writer.add_scalar(f"loss/{prefix}_{n}",
                                          l.global_avg, epoch)
            performances = performanceLogger.compute()
            if len(performances):
                for k, v in performances.items():
                    self.tb_writer.add_scalar(f"performance/{prefix}_{k}",
                                              v, epoch)

        if self.cfg.TENSORBOARD_WEIGHT and False:  # weight-histogram logging disabled via `and False`
            for name, param in model.named_parameters():
                layer, attr = os.path.splitext(name)
                attr = attr[1:]
                self.tb_writer.add_histogram("{}/{}".format(layer, attr),
                                             param, epoch)
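A sketch of the surrounding loop, continuing the hypothetical names from the sketch above; a single GradScaler instance must be shared across epochs so its scale factor can adapt over training:

    from torch.cuda import amp

    scaler = amp.GradScaler()  # shared across epochs
    for epoch in range(trainer.cfg.N_MAX_EPOCHS):
        trainer.train_epoch(scaler, epoch, model, train_loader, optimizer)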
Example 3
    def val_epoch(self, epoch, model, dataloader, prefix="val"):
        model.eval()

        lossLogger = LossLogger()
        performanceLogger = MetricLogger(self.dictionary, self.cfg)

        with torch.no_grad():
            for sample in dataloader:
                imgs, targets = sample['image'], sample['target']
                imgs = ([img.cuda() for img in imgs]
                        if isinstance(imgs, list) else imgs.cuda())
                if isinstance(targets, list):
                    if isinstance(targets[0], torch.Tensor):
                        targets = [t.cuda() for t in targets]
                    else:
                        targets = [{k: v.cuda()
                                    for k, v in t.items()} for t in targets]
                else:
                    targets = targets.cuda()

                losses, predicts = model(imgs, targets, prefix)

                if self.cfg.distributed:
                    # reduce losses over all GPUs for logging purposes
                    loss_dict_reduced = reduce_dict(losses)
                    lossLogger.update(**loss_dict_reduced)
                    del loss_dict_reduced
                else:
                    lossLogger.update(**losses)

                if predicts is not None:
                    if self.cfg.distributed:
                        # reduce performances over all GPUs for logging purposes
                        predicts_dict_reduced = reduce_dict(predicts)
                        performanceLogger.update(targets,
                                                 predicts_dict_reduced)
                        del predicts_dict_reduced
                    else:
                        performanceLogger.update(targets, predicts)
                    del predicts

                del imgs, targets, losses

        performances = performanceLogger.compute()
        if self.cfg.TENSORBOARD and self.cfg.local_rank == 0:
            # Logging val losses
            for n, l in lossLogger.meters.items():
                self.tb_writer.add_scalar(f"loss/{prefix}_{n}",
                                          l.global_avg, epoch)
            if len(performances):
                # Logging val performances
                for k, v in performances.items():
                    self.tb_writer.add_scalar(f"performance/{prefix}_{k}",
                                              v, epoch)

        if self.cfg.local_rank == 0:
            template = "[epoch {}] Total {} loss: {:.4f}\n{}"
            logger.info(
                template.format(
                    epoch,
                    prefix,
                    lossLogger.meters["loss"].global_avg,
                    "\n".join([
                        "{}: {:.4f}".format(n, l.global_avg)
                        for n, l in lossLogger.meters.items() if n != "loss"
                    ]),
                ))

            perf_log_str = f"\n------------ Performances ({prefix}) ----------\n"
            for k, v in performances.items():
                perf_log_str += "{:}: {:.4f}\n".format(k, v)
            perf_log_str += "------------------------------------\n"
            logger.info(perf_log_str)

        acc = performances['performance']

        return acc
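Since val_epoch returns a single accuracy-like scalar, a typical use is checkpointing on the best validation score; a sketch under the same hypothetical names as the earlier examples:

    best_acc = 0.0
    for epoch in range(trainer.cfg.N_MAX_EPOCHS):
        trainer.train_epoch(scaler, epoch, model, train_loader, optimizer)
        acc = trainer.val_epoch(epoch, model, val_loader)
        if acc > best_acc:  # keep the checkpoint with the best validation score
            best_acc = acc
            torch.save(model.state_dict(), 'best.pth')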
Example 4
    def train_epoch(self,
                    epoch,
                    model,
                    dataloader,
                    optimizer,
                    lr_scheduler,
                    grad_normalizer=None,
                    prefix="train"):
        model.train()

        _timer = Timer()
        lossLogger = MetricLogger(delimiter="  ")
        performanceLogger = MetricLogger(delimiter="  ")

        for i, (imgs, targets) in enumerate(dataloader):
            _timer.tic()
            # zero the parameter gradients
            optimizer.zero_grad()

            imgs = ([img.cuda() for img in imgs]
                    if isinstance(imgs, list) else imgs.cuda())
            if isinstance(targets, list):
                if isinstance(targets[0], torch.Tensor):
                    targets = [t.cuda() for t in targets]
                else:
                    targets = [{k: v.cuda()
                                for k, v in t.items()} for t in targets]
            else:
                targets = targets.cuda()

            out = model(imgs, targets, prefix)

            if not isinstance(out, tuple):
                losses, performances = out, None
            else:
                losses, performances = out

            self.n_iters_elapsed += 1

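            # apex amp: scale_loss multiplies the loss by the current loss
            # scale; gradients are unscaled again when the context exits,
            # so optimizer.step() sees true-magnitude gradients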
            with amp.scale_loss(losses["loss"], optimizer) as scaled_loss:
                scaled_loss.backward()

            optimizer.step()

            torch.cuda.synchronize()
            _timer.toc()

            if (i + 1) % self.cfg.N_ITERS_TO_DISPLAY_STATUS == 0:
                if self.cfg.distributed:
                    # reduce losses over all GPUs for logging purposes
                    loss_dict_reduced = reduce_dict(losses)
                    lossLogger.update(**loss_dict_reduced)
                else:
                    lossLogger.update(**losses)

                if performances is not None and len(performances):
                    if self.cfg.distributed:
                        # reduce performances over all GPUs for logging purposes
                        performance_dict_reduced = reduce_dict(performances)
                        performanceLogger.update(**performance_dict_reduced)
                    else:
                        performanceLogger.update(**performances)

                if self.cfg.local_rank == 0:
                    template = "[epoch {}/{}, iter {}, lr {}] Total train loss: {:.4f} (ips = {:.2f})\n{}"
                    logger.info(
                        template.format(
                            epoch,
                            self.cfg.N_MAX_EPOCHS,
                            i,
                            round(get_current_lr(optimizer), 6),
                            lossLogger.meters["loss"].value,
                            self.batch_size *
                            self.cfg.N_ITERS_TO_DISPLAY_STATUS /
                            _timer.total_time,
                            "\n".join([
                                "{}: {:.4f}".format(n, l.value)
                                for n, l in lossLogger.meters.items()
                                if n != "loss"
                            ]),
                        ))

            del imgs, targets

        if self.cfg.TENSORBOARD and self.cfg.local_rank == 0:
            # Logging train losses
            for n, l in lossLogger.meters.items():
                self.tb_writer.add_scalar(f"loss/{prefix}_{n}",
                                          l.global_avg, epoch)
            if len(performanceLogger.meters):
                for k, v in performanceLogger.meters.items():
                    self.tb_writer.add_scalar(f"performance/{prefix}_{k}",
                                              v.global_avg, epoch)

        if self.cfg.TENSORBOARD_WEIGHT and False:  # weight-histogram logging disabled via `and False`
            for name, param in model.named_parameters():
                layer, attr = os.path.splitext(name)
                attr = attr[1:]
                self.tb_writer.add_histogram("{}/{}".format(layer, attr),
                                             param, epoch)
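Unlike the torch.cuda.amp examples above, this variant uses NVIDIA's apex amp, which requires wrapping the model and optimizer once before training so that amp.scale_loss works; a minimal setup sketch (model and optimizer names hypothetical):

    from apex import amp  # NVIDIA apex, not torch.cuda.amp

    model = model.cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    # opt_level 'O1' patches eligible ops to run in float16
    model, optimizer = amp.initialize(model, optimizer, opt_level='O1')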