Example #1
class Trainer(Commoner):
    def __init__(self, args):
        super(Trainer, self).__init__(args)
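        # NOTE: self.args (used below for the lr_scheduler check) is assumed to be set by the base-class __init__.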

        #### 0. Setup
        self.save_dir = tools.set_save_dir(args)
        with open(os.path.join(self.save_dir, "args.json"), "w") as j:
            json.dump(vars(args), j)

        #### 1. Models
        model = getattr(models, args.model)(args)
        print(
            "Model param nums: ",
            sum(p.numel() for p in model.parameters() if p.requires_grad),
        )

        self.model = model.cuda()

        #### 2. Opt
        self.optimizer = opts.get_optimizer(args, self.model)
        self.scheduler = None
        if self.args.lr_scheduler is not None:
            self.scheduler = opts.get_scheduler(args, self.optimizer)

        #### 3. Data

        if args.augment is not None:
            augmentation = getattr(augmentations, args.augment)
        else:
            augmentation = None
        self.train_loader, self.val_loader = inputs.get_dataloader(
            args, transform=augmentation)

        #### 4. Logger
        self.writer = writer.Writer(log_dir=self.save_dir)
        self.logger = logger.Logger()
        self.logger.open(os.path.join(self.save_dir, "log.train.txt"),
                         mode="a")
        self.logger.write("\n>> Pytorch version: {}".format(torch.__version__))
        self.logger.write("\n>> Args: {}".format(args))

        # Validator
        self.validator = Validator(
            args,
            is_trainval=True,
            writer=self.writer,
            val_loader=self.val_loader,
        )
Example #2
class Trainer:
    def __init__(self, args):
        self.args = args

        #### 0. Setup
        self.save_dir = tools.set_save_dir(args)
        with open(os.path.join(self.save_dir, "args.json"), "w") as j:
            json.dump(vars(args), j)

        #### 1. Data
        # TODO: augmentation
        augmentation = getattr(augmentations, args.augment)
        self.train_loader, self.val_loader = my_input.get_dataloader(
            args, transform=augmentation)

        #### 2. Model
        model = models.PENetClassifier(**vars(args))
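        # NOTE: PRETRAINED_WEIGHTS is assumed to be a module-level checkpoint path defined elsewhere in this file;
        # the second argument to load_pretrained appears to select the device used while loading.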
        model.load_pretrained(PRETRAINED_WEIGHTS, "0")
        self.model = model.cuda()

        #### 3. Opt
        self.optimizer = opts.get_optimizer(args, self.model)
        self.scheduler = None
        if self.args.lr_scheduler is not None:
            self.scheduler = opts.get_scheduler(args, self.optimizer)

        #### 4. Logger
        self.writer = writer.Writer(log_dir=self.save_dir)
        self.logger = logger.Logger()
        self.logger.open(os.path.join(self.save_dir, "log.train.txt"),
                         mode="a")
        self.logger.write("\n>> Pytorch version: {}".format(torch.__version__))
        self.logger.write("\n>> Args: {}".format(args))

        # self.visualizer = visualizer.Visualizer(
        #     args, "train", self.save_dir, self.writer
        # )

        # Validator
        self.validator = Validator(
            args,
            is_trainval=True,
            writer=self.writer,
            val_loader=self.val_loader,
        )
Example #3
class Trainer(object):
    def __init__(self, cfgs):

        save_dict = OrderedDict()

        save_dict["fold"] = cfgs["fold"]
        if cfgs["memo"] is not None:
            save_dict["memo"] = cfgs["memo"]  # 1,2,3
        specific_dir = ["{}-{}".format(key, save_dict[key]) for key in save_dict.keys()]

        cfgs["save_dir"] = os.path.join(
            cfgs["save_dir"],
            # cfgs["model"]["meta"],
            # cfgs["model"]["inputs"]["label"],
            "_".join(specific_dir),
        )
        os.makedirs(cfgs["save_dir"], exist_ok=True)

        ####### CONFIGS
        self.cfgs = cfgs

        ####### Logging
        self.tb_writer = utils.get_writer(self.cfgs)
        self.txt_logger = utils.get_logger(self.cfgs)

        self.do_logging = True
        if len(self.cfgs["gpu"]) > 1:
            if dist.get_rank() != 0:
                self.do_logging = False

        if self.do_logging:
            self.txt_logger.write("\n\n----train.py----")
            self.txt_logger.write("\n{}".format(datetime.datetime.now()))
            self.txt_logger.write(
                "\n\nSave Directory: \n{}".format(self.cfgs["save_dir"])
            )
            self.txt_logger.write("\n\nConfigs: \n{}\n".format(self.cfgs))

        ####### MODEL
        model = models.get_model(self.cfgs)
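        # Multi-GPU path: convert BatchNorm layers to SyncBatchNorm and wrap the model in
        # DistributedDataParallel. This assumes torch.distributed.init_process_group has already
        # been called by the launch script before the Trainer is constructed.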
        if len(self.cfgs["gpu"]) > 1:
            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
            self.device = torch.device("cuda:{}".format(self.cfgs["local_rank"]))
            self.model = model.to(self.device)
            self.model = DistributedDataParallel(
                self.model,
                device_ids=[self.cfgs["local_rank"]],
                output_device=self.cfgs["local_rank"],
            )
        else:
            self.device = torch.device("cuda:{}".format(self.cfgs["local_rank"]))
            self.model = model.to(self.device)

        ####### Data

        train_dataset = inputs.get_dataset(self.cfgs, mode="train")
        if len(self.cfgs["gpu"]) > 1:
            train_sampler = DistributedSampler(
                train_dataset,
                num_replicas=len(self.cfgs["gpu"]),
                rank=self.cfgs["local_rank"],
            )
        else:
            train_sampler = None

        self.train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=self.cfgs["batch_size"],
            num_workers=self.cfgs["num_workers"],
            pin_memory=True,
            drop_last=False,
            collate_fn=inputs.get_collater(),
            sampler=train_sampler,
        )

        # if self.do_logging:
        #     self.txt_logger.write("\nDataset: ")
        #     self.txt_logger.write(
        #         "\nTRAIN Abnormal/Normal: {}/{}".format(
        #             len(train_dataset.abnormal_meta_df),
        #             len(train_dataset.normal_meta_df),
        #         )
        #     )

        ####### Opts
        self.optimizer = opts.get_optimizer(self.cfgs, self.model.parameters())
        self.scheduler = opts.get_scheduler(self.cfgs, self.optimizer)
        self.grad_scaler = GradScaler(enabled=self.cfgs["use_amp"])

        ####### Validator
        self.validator = Validator(self.cfgs, self.device)
Example #4
class Trainer(object):
    def __init__(self, cfgs):

        save_dict = OrderedDict()

        save_dict["fold"] = cfgs["fold"]
        if cfgs["memo"] is not None:
            save_dict["memo"] = cfgs["memo"]  # 1,2,3
        specific_dir = ["{}-{}".format(key, save_dict[key]) for key in save_dict.keys()]

        cfgs["save_dir"] = os.path.join(
            cfgs["save_dir"],
            # cfgs["model"]["meta"],
            # cfgs["model"]["inputs"]["label"],
            "_".join(specific_dir),
        )
        os.makedirs(cfgs["save_dir"], exist_ok=True)

        ####### CONFIGS
        self.cfgs = cfgs

        ####### Logging
        self.tb_writer = utils.get_writer(self.cfgs)
        self.txt_logger = utils.get_logger(self.cfgs)

        self.do_logging = True
        if len(self.cfgs["gpu"]) > 1:
            if dist.get_rank() != 0:
                self.do_logging = False

        if self.do_logging:
            self.txt_logger.write("\n\n----train.py----")
            self.txt_logger.write("\n{}".format(datetime.datetime.now()))
            self.txt_logger.write(
                "\n\nSave Directory: \n{}".format(self.cfgs["save_dir"])
            )
            self.txt_logger.write("\n\nConfigs: \n{}\n".format(self.cfgs))

        ####### MODEL
        model = models.get_model(self.cfgs)
        if len(self.cfgs["gpu"]) > 1:
            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
            self.device = torch.device("cuda:{}".format(self.cfgs["local_rank"]))
            self.model = model.to(self.device)
            self.model = DistributedDataParallel(
                self.model,
                device_ids=[self.cfgs["local_rank"]],
                output_device=self.cfgs["local_rank"],
            )
        else:
            self.device = torch.device("cuda:{}".format(self.cfgs["local_rank"]))
            self.model = model.to(self.device)

        ####### Data

        train_dataset = inputs.get_dataset(self.cfgs, mode="train")
        if len(self.cfgs["gpu"]) > 1:
            train_sampler = DistributedSampler(
                train_dataset,
                num_replicas=len(self.cfgs["gpu"]),
                rank=self.cfgs["local_rank"],
            )
        else:
            train_sampler = None

        self.train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=self.cfgs["batch_size"],
            num_workers=self.cfgs["num_workers"],
            pin_memory=True,
            drop_last=False,
            collate_fn=inputs.get_collater(),
            sampler=train_sampler,
        )

        # if self.do_logging:
        #     self.txt_logger.write("\nDataset: ")
        #     self.txt_logger.write(
        #         "\nTRAIN Abnormal/Normal: {}/{}".format(
        #             len(train_dataset.abnormal_meta_df),
        #             len(train_dataset.normal_meta_df),
        #         )
        #     )

        ####### Opts
        self.optimizer = opts.get_optimizer(self.cfgs, self.model.parameters())
        self.scheduler = opts.get_scheduler(self.cfgs, self.optimizer)
        self.grad_scaler = GradScaler(enabled=self.cfgs["use_amp"])
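        # NOTE: GradScaler only handles loss scaling; when use_amp is True, the forward pass and loss
        # are assumed to run under torch.cuda.amp.autocast elsewhere (e.g. inside the model or calc_loss).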

        ####### Validator
        self.validator = Validator(self.cfgs, self.device)
        # if self.do_logging:
        #     self.txt_logger.write(
        #         "\nVAL   Abnormal/Normal: {}/{}".format(
        #             len(self.validator.val_loader.dataset.abnormal_meta_df),
        #             len(self.validator.val_loader.dataset.normal_meta_df),
        #         )
        #     )

        # if self.cfgs["model"]["val"]["ignore_normal"]:
        #     self.txt_logger.write("\nVAL   Ignore Normal")
        #     self.validator.val_loader.dataset.meta_df = (
        #         self.validator.val_loader.dataset.abnormal_meta_df
        #     )

    def do_train(self):

        ####### Setup Train
        self.epoch, self.iter, self.resume_epoch = 0, 0, 0
        self.tot_val_record = {
            "best": {"det_recl": -1, "det_prec": -1, "det_f1": -1, "loss": np.inf}
        }

        if self.cfgs["model"]["train"]["resume_train"]:
            with open(
                os.path.join(self.cfgs["save_dir"], "tot_val_record.pkl"), "rb"
            ) as f:
                self.tot_val_record = pickle.load(f)
                self.iter, self.resume_epoch = (
                    self.tot_val_record["best"]["iteration"],
                    self.tot_val_record["best"]["epoch"],
                )
                resume_model_dir = os.path.join(
                    self.cfgs["save_dir"], "epoch_{}.pt".format(self.resume_epoch)
                )
                checkpoint = torch.load(resume_model_dir)
                self.model.load_state_dict(checkpoint["model"], strict=True)
                self.optimizer.load_state_dict(checkpoint["optimizer"])
                self.grad_scaler.load_state_dict(checkpoint["scaler"])
                self.txt_logger.write("\n\nResume Training Here! \n\n")

        if self.do_logging:
            self.txt_logger.write("\n\nStart Training! \n\n")
            header_columns = ["epoch", "iter", "time", "train_loss", "val_loss"]
            header_columns += ["det_recl", "det_prec", "det_fppi", "det_f1"]
            header_columns += ["cls_auc", "cls_sens", "cls_spec"]
            header_columns += ["best_epoch"]
            self.txt_logger.log_header(header_columns)

        ####### Train
        self.start_time = time.time()
        self.endurance = 0
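        # `endurance` counts consecutive epochs without validation improvement; training stops
        # once it exceeds cfgs["model"]["train"]["endurance"].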
        for epoch in range(self.resume_epoch, self.cfgs["model"]["train"]["max_epoch"]):
            # self.train_loader.dataset.shuffle()
            # self.train_loader.dataset.meta_df = (
            #     self.train_loader.dataset.abnormal_meta_df
            # )
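            # NOTE: with a DistributedSampler, sampler.set_epoch(epoch) would normally be called here
            # so each rank reshuffles every epoch; the sampler is not kept as an attribute in this snippet.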

            self.one_epoch_steps = len(self.train_loader)
            self.display_step = (
                self.one_epoch_steps // self.cfgs["model"]["train"]["display_interval"]
            )

            self.epoch = epoch
            if self.endurance > self.cfgs["model"]["train"]["endurance"]:
                if self.do_logging:
                    self.txt_logger.write(
                        "\nStop training! No more performance gain expected!"
                    )
                    best_epoch = self.tot_val_record["best"]["epoch"]
                    self.txt_logger.write(
                        "\n\nBest saved at: {}, {} epoch\n\n".format(
                            self.cfgs["save_dir"], best_epoch
                        )
                    )
                break
            self.train_val_one_epoch()

    def train_val_one_epoch(self):

        self.optimizer.zero_grad()
        self.model.train()

        t0 = time.time()

        for i, data in enumerate(self.train_loader):
            t1 = time.time()
            img = data["img"].permute(0, 3, 1, 2).to(self.device)
            logit = self.model(img)

            t2 = time.time()

            # FIXME: GPU utilization stays low here
            loss = opts.calc_loss(self.cfgs, self.device, data, logit)

            t3 = time.time()

            self.grad_scaler.scale(loss).backward()
            self.grad_scaler.step(self.optimizer)
            self.grad_scaler.update()

            self.optimizer.zero_grad()

            t4 = time.time()

            # NOTE: Try to avoid excessive CPU-GPU synchronization (.item() calls, or printing values from CUDA tensors).

            if self.do_logging:
                loss = loss.detach().item()
                take_time = tools.convert_time(time.time() - self.start_time)
                train_logs = [loss, "-"]
                self.txt_logger.log_result(
                    [self.epoch, "{}/{}".format(i, self.one_epoch_steps), take_time]
                    + train_logs
                )
                self.tb_writer.write_scalars(
                    {"loss": {"train loss": loss}},
                    self.iter,
                )

                if self.iter % self.display_step == 0:
                    # Visualize
                    # Find abnormal
                    for viz_bi in range(len(data["fp"])):
                        if data["bbox"][viz_bi, 0, -1] != -1:
                            break

                    with torch.no_grad():
                        self.model.eval()
                        det_preds_viz = (
                            self.model(img, mode="viz")["preds"][viz_bi]
                            .detach()
                            .cpu()
                            .numpy()
                        )

                        if len(det_preds_viz) != 0:
                            # sigmoid
                            det_preds_viz[:, -1] = 1 / (
                                1 + np.exp(-1 * det_preds_viz[:, -1])
                            )
                        else:
                            det_preds_viz = np.ones((1, 6)) * -1

                        det_anns_viz = data["bbox"][viz_bi].numpy()

                        self.tb_writer.write_images(
                            data["fp"][viz_bi],
                            data["img"][viz_bi].numpy(),
                            det_preds_viz,
                            det_anns_viz,
                            self.iter,
                            "train",
                        )
                        self.model.train()

            self.iter += 1

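            # Linear LR warmup over the first `warmup_epoch` epochs; afterwards the scheduler takes over,
            # stepped with the epoch offset by the warmup length.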
            lr0 = self.cfgs["model"]["opts"]["learning_rate"]
            wep = self.cfgs["model"]["opts"]["warmup_epoch"]
            if self.epoch < wep:
                for pg in self.optimizer.param_groups:
                    pg["lr"] = lr0 / wep * (self.epoch + i / self.one_epoch_steps)
            else:
                if self.scheduler is not None:
                    self.scheduler.step(self.epoch - wep + i / self.one_epoch_steps)

            t5 = time.time()
            if self.cfgs["do_profiling"]:
                print("\ndata", t1 - t0)
                print("forward", t2 - t1)
                print("calc loss", t3 - t2)
                print("backward", t4 - t3)
                print("logging", t5 - t4)
            t0 = t5

        if self.epoch > self.cfgs["model"]["val"]["ignore_epoch"]:

            # Do Validation
            val_record, val_viz = self.validator.do_validate(self.model)
            self.tot_val_record[str(self.epoch + 1)] = val_record
            val_best = val_record[self.cfgs["model"]["val"]["best"]]

            # Save Model
            select_metric = self.cfgs["model"]["val"]["best"]
            val_improved = False
            if select_metric == "loss":
                if val_best < self.tot_val_record["best"][select_metric]:
                    val_improved = True
            elif select_metric == "det_f1":
                if val_best > self.tot_val_record["best"][select_metric]:
                    val_improved = True

            if val_improved:
                checkpoint = {
                    "epoch": self.epoch,
                    "model": self.model.state_dict(),
                    "optimizer": self.optimizer.state_dict(),
                    "scaler": self.grad_scaler.state_dict(),
                }
                model_name = os.path.join(
                    self.cfgs["save_dir"], "epoch_" + str(self.epoch + 1) + ".pt"
                )
                torch.save(checkpoint, model_name)
                self.tot_val_record["best"] = val_record
                self.tot_val_record["best"]["epoch"] = self.epoch + 1
                self.tot_val_record["best"]["iteration"] = self.iter
                self.endurance = 0
            else:
                self.endurance += 1

            if self.do_logging:
                take_time = utils.tools.convert_time(time.time() - self.start_time)
                vloss = val_record["loss"]
                vbest_epoch = self.tot_val_record["best"]["epoch"]
                metric_keys = ["det_recl", "det_prec", "det_fppi", "det_f1"]
                metric_keys += ["cls_auc", "cls_sens", "cls_spec"]
                val_logs = [vloss] + [val_record[k] for k in metric_keys]
                self.txt_logger.log_result(
                    [self.epoch + 1, self.iter, take_time, loss]
                    + val_logs
                    + [vbest_epoch],
                    txt_write=True,
                )
                self.txt_logger.write("\n", txt_write=True)
                self.tb_writer.write_images(
                    val_viz["fp"],
                    val_viz["img"],
                    val_viz["pred"],
                    val_viz["ann"],
                    self.iter,
                    "val",
                )

                self.tb_writer.write_scalars(
                    {
                        "metrics": {
                            "{}".format(key): val_record[key] for key in metric_keys
                        }
                    },
                    self.iter,
                )
                self.tb_writer.write_scalars({"loss": {"val loss": vloss}}, self.iter)

                with open(
                    os.path.join(self.cfgs["save_dir"], "tot_val_record.pkl"), "wb"
                ) as f:
                    pickle.dump(self.tot_val_record, f)
Example #5
class Trainer:
    def __init__(self, args):
        self.args = args

        #### 0. Setup
        self.save_dir = tools.set_save_dir(args)
        with open(os.path.join(self.save_dir, "args.json"), "w") as j:
            json.dump(vars(args), j)

        #### 1. Data
        # TODO: augmentation
        augmentation = getattr(augmentations, args.augment)
        self.train_loader, self.val_loader = my_input.get_dataloader(
            args, transform=augmentation)

        #### 2. Model
        model = models.PENetClassifier(**vars(args))
        model.load_pretrained(PRETRAINED_WEIGHTS, "0")
        self.model = model.cuda()

        #### 3. Opt
        self.optimizer = opts.get_optimizer(args, self.model)
        self.scheduler = None
        if self.args.lr_scheduler is not None:
            self.scheduler = opts.get_scheduler(args, self.optimizer)

        #### 4. Logger
        self.writer = writer.Writer(log_dir=self.save_dir)
        self.logger = logger.Logger()
        self.logger.open(os.path.join(self.save_dir, "log.train.txt"),
                         mode="a")
        self.logger.write("\n>> Pytorch version: {}".format(torch.__version__))
        self.logger.write("\n>> Args: {}".format(args))

        # self.visualizer = visualizer.Visualizer(
        #     args, "train", self.save_dir, self.writer
        # )

        # Validator
        self.validator = Validator(
            args,
            is_trainval=True,
            writer=self.writer,
            val_loader=self.val_loader,
        )

    def setup_resume(self):
        with open(os.path.join(self.save_dir, "tot_val_record.pkl"),
                  "rb") as f:
            self.tot_val_record = pickle.load(f)

        self.iteration, self.resume_epoch = (
            self.tot_val_record["best"]["iteration"],
            self.tot_val_record["best"]["epoch"],
        )

        rep = str(self.resume_epoch)
        print("\nResume training from here: ", self.tot_val_record[rep])

        resume_model_dir = os.path.join(
            self.save_dir, "epoch_{}.pt".format(self.resume_epoch))
        checkpoint = torch.load(resume_model_dir)
        self.model.load_state_dict(checkpoint["model"], strict=True)
        self.optimizer.load_state_dict(checkpoint["optimizer"])

    def setup_train(self):
        self.epoch = 0
        self.iteration = 0

        self.resume_epoch = 0

        self.tot_val_record = {
            "best": {
                "loss": -1,
                "precision": -1,
                "recall": -1,
                "f1": -1,
                "acc": 0,
                "epoch": -1,
            }
        }

        # FIXME:
        if self.args.resume_train:
            self.setup_resume()
            self.logger.write("\n\n** Resume Training Here! **")
            self.logger.write("\n>> Save Directory: {}\n".format(
                self.save_dir))

        else:
            self.logger.write("\n\n** Start Training Here! **")
            self.logger.write("\n>> Save Directory: {}\n\n".format(
                self.save_dir))

        print("\nStart Training\n")

        self.logger.set_header_columns([
            "epoch",
            "iter",
            "time",
            "train_loss",
            "val_loss",
            "acc",
            "precision",
            "recall",
            "f1",
            "best_epoch",
        ])
        self.logger.log_header()

        self.one_epoch_steps = len(self.train_loader)
        self.display_step = self.one_epoch_steps // self.args.display_interval

    def do_train(self):

        self.setup_train()

        # print("\nStart Training!\n",)

        self.start_time = time.time()
        endurance = 0
        for epoch in range(self.resume_epoch, self.args.max_epoch):
            self.epoch = epoch

            if endurance > self.args.endurance:
                print("Stop training! No more performance gain expected!")
                print(
                    "Best saved at: ",
                    self.iteration,
                    self.epoch,
                    self.start_time,
                    self.save_dir,
                    self.tot_val_record["best"]["epoch"],
                )
                break

            self.train_one_epoch()

            # print("precision / recall: ", pc, rc)

            if (epoch + 1) >= self.args.val_epoch:
                if (epoch + 1) % self.args.val_interval == 0:
                    val_record = self.validator.do_validate(
                        model=self.model, iteration=self.iteration)

                    endurance = self.save_model(val_record, endurance)
                    self.val_log_and_write(val_record)

    def train_one_epoch(self):

        # Shuffle
        if self.epoch > 0:
            if not self.args.is_debugging:  # overfit test
                self.train_loader.dataset.loc_df = (
                    self.train_loader.dataset.get_loc_df())

        self.optimizer.zero_grad()
        self.model.train()

        self.init_results()
        for i, data in enumerate(self.train_loader):

            fp = data["fp"]
            img = data["img"].cuda()  # .permute(0, 4, 1, 2, 3)
            anns = data["anns"].cuda()

            # self.scheduler_step(i)

            # if not (self.iteration % self.display_step == 0):

            outputs = self.model(img)

            if self.args.loss_type == "bce":
                criterion = torch.nn.BCEWithLogitsLoss()
            elif self.args.loss_type == "focal":
                criterion = losses.BinaryFocalLoss()
            else:
                raise ValueError(
                    "Unknown loss_type: {}".format(self.args.loss_type))

            loss = criterion(outputs, anns)

            loss.backward()

            self.optimizer.step()
            self.optimizer.zero_grad()

            self.update_results(anns, outputs, loss)
            self.train_log_and_write(i)

            # FIXME: visualizer
            # else:
            #     pass

            if self.scheduler is not None:
                opts.step_scheduler(self.scheduler, global_step=self.iteration)

            self.iteration += 1

        # self.last_loss = loss

    def save_model(self, val_record, endurance):
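        # NOTE: this comparison assumes args.best names a higher-is-better metric (e.g. acc or f1);
        # a lower-is-better metric such as loss would need the inequality flipped.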
        if np.mean(val_record[self.args.best]) > np.mean(
                self.tot_val_record["best"][self.args.best]):
            model_state_dict = self.model.state_dict()

            checkpoint = {
                "epoch": self.epoch,
                "model": model_state_dict,
                "optimizer": self.optimizer.state_dict(),
            }
            model_name = os.path.join(self.save_dir,
                                      "epoch_" + repr(self.epoch + 1) + ".pt")
            torch.save(checkpoint, model_name)

            self.tot_val_record["best"] = val_record
            self.tot_val_record["best"]["epoch"] = self.epoch + 1
            self.tot_val_record["best"]["iteration"] = self.iteration

            endurance = 0
        else:
            endurance += 1

        self.tot_val_record[str(self.epoch + 1)] = val_record
        with open(os.path.join(self.save_dir, "tot_val_record.pkl"),
                  "wb") as f:
            pickle.dump(self.tot_val_record, f)

        return endurance

    def init_results(self):
        self.tot_nums = tools.AverageMeter()
        self.loss = tools.AverageMeter()

        self.gt_nums = tools.AverageMeter()
        self.tp_nums = tools.AverageMeter()
        self.pred_nums = tools.AverageMeter()
        self.correct_nums = tools.AverageMeter()

    def update_results(self, anns, outputs, loss):
        gts = anns.detach().cpu().numpy()
        # outputs are raw logits (BCEWithLogitsLoss is used); thresholding at 0 corresponds to sigmoid > 0.5
        preds = (outputs.detach().cpu().numpy() > 0).astype(np.float32)

        self.tot_nums.update(len(gts))
        self.loss.update(loss.item())

        self.correct_nums.update(np.sum(gts == preds))
        self.gt_nums.update(np.sum(gts == 1))
        self.pred_nums.update(np.sum(preds))
        self.tp_nums.update(np.sum(gts * preds))

    def train_log_and_write(self, i):

        acc = self.correct_nums.sum / self.tot_nums.sum
        pc = self.tp_nums.sum / (self.pred_nums.sum + 1e-6)
        rc = self.tp_nums.sum / (self.gt_nums.sum + 1e-6)
        f1 = (2 * rc * pc) / (rc + pc + 1e-6)

        take_time = tools.convert_time(time.time() - self.start_time)

        self.logger.log_result([
            self.epoch,
            "{}/{}".format(i, self.one_epoch_steps),
            take_time,
            self.loss.avg,
            "-",
            acc,
            pc,
            rc,
            f1,
            "-",
        ])

        self.writer.write_scalar({"lr": self.optimizer.param_groups[0]["lr"]},
                                 self.iteration)
        self.writer.write_scalars(
            {
                "statistics": {
                    "mean_{}".format(key): np.mean(value)
                    for key, value in zip(["acc", "precision", "recall", "f1"],
                                          [acc, pc, rc, f1])
                },
            },
            self.iteration,
        )

        self.writer.write_scalars(
            {"loss": {
                "train loss": self.loss.avg
            }},
            self.iteration,
        )

    def val_log_and_write(self, val_record):
        take_time = tools.convert_time(time.time() - self.start_time)
        self.logger.log_result([
            self.epoch + 1,
            self.iteration,
            take_time,
            self.loss.avg,
            val_record["loss"],
            val_record["acc"],
            val_record["precision"],
            val_record["recall"],
            val_record["f1"],
            self.tot_val_record["best"]["epoch"],
        ])
        print("\r")
        self.writer.write_scalars(
            {
                "statistics": {
                    "mean_{}".format(key): np.mean(val_record[key])
                    for key in ["acc", "precision", "recall", "f1"]
                },
            },
            self.iteration,
        )
        self.writer.write_scalars(
            {"loss": {
                "val loss": val_record["loss"]
            }},
            self.iteration,
        )
Example #6
class Trainer(Commoner):
    def __init__(self, args):
        super(Trainer, self).__init__(args)

        #### 0. Setup
        self.save_dir = tools.set_save_dir(args)
        with open(os.path.join(self.save_dir, "args.json"), "w") as j:
            json.dump(vars(args), j)

        #### 1. Models
        model = getattr(models, args.model)(args)
        print(
            "Model param nums: ",
            sum(p.numel() for p in model.parameters() if p.requires_grad),
        )

        self.model = model.cuda()

        #### 2. Opt
        self.optimizer = opts.get_optimizer(args, self.model)
        self.scheduler = None
        if self.args.lr_scheduler is not None:
            self.scheduler = opts.get_scheduler(args, self.optimizer)

        #### 3. Data

        if args.augment is not None:
            augmentation = getattr(augmentations, args.augment)
        else:
            augmentation = None
        self.train_loader, self.val_loader = inputs.get_dataloader(
            args, transform=augmentation)

        #### 4. Logger
        self.writer = writer.Writer(log_dir=self.save_dir)
        self.logger = logger.Logger()
        self.logger.open(os.path.join(self.save_dir, "log.train.txt"),
                         mode="a")
        self.logger.write("\n>> Pytorch version: {}".format(torch.__version__))
        self.logger.write("\n>> Args: {}".format(args))

        # Validator
        self.validator = Validator(
            args,
            is_trainval=True,
            writer=self.writer,
            val_loader=self.val_loader,
        )

    def do_train(self):

        self.setup_train()

        self.start_time = time.time()
        endurance = 0
        for epoch in range(self.resume_epoch, self.args.max_epoch):
            self.epoch = epoch

            if endurance > self.args.endurance:
                print("Stop training! No more performance gain expected!")
                print(
                    "Best saved at: ",
                    self.iteration,
                    self.epoch,
                    self.start_time,
                    self.save_dir,
                    self.tot_val_record["best"]["epoch"],
                )
                break

            self.train_one_epoch()

            if (epoch + 1) >= self.args.val_epoch:
                if (epoch + 1) % self.args.val_interval == 0:
                    val_record = self.validator.do_validate(
                        model=self.model, iteration=self.iteration)

                    endurance = self.save_model(val_record, endurance)
                    self.val_log_and_write(val_record)

    def train_one_epoch(self):

        # Shuffle
        if self.epoch > 0:
            if not self.args.is_debugging:  # overfit test
                self.train_loader.dataset.loc_df = (
                    self.train_loader.dataset.get_loc_df())

        self.optimizer.zero_grad()
        self.model.train()

        self.init_results()
        for i, data in enumerate(self.train_loader):

            fps = data["fp"]
            imgs = data["img"].cuda()  # .permute(0, 4, 1, 2, 3)
            anns = data["anns"].cuda()

            # self.scheduler_step(i)
            # if not (self.iteration % self.display_step == 0):

            outputs = self.model(imgs)
            if self.args.print_io:
                print("train inputs: ", imgs[0])
                print("train outputs: ", outputs[0])

            loss = self.calc_loss(fps, anns, outputs)
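            # self.calc_loss is defined outside this snippet; the guard below skips backward/step
            # when it returns a non-positive loss (presumably a batch with no usable targets).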

            if loss > 0:
                loss.backward()

                self.optimizer.step()
                self.optimizer.zero_grad()

            self.update_results(fps, anns, outputs, loss)
            self.train_log_and_write(i)

            if self.scheduler is not None:
                opts.step_scheduler(self.scheduler, global_step=self.iteration)

            self.iteration += 1

    def setup_resume(self):
        with open(os.path.join(self.save_dir, "tot_val_record.pkl"),
                  "rb") as f:
            self.tot_val_record = pickle.load(f)

        self.iteration, self.resume_epoch = (
            self.tot_val_record["best"]["iteration"],
            self.tot_val_record["best"]["epoch"],
        )

        rep = str(self.resume_epoch)
        print("\nResume training from here: ", self.tot_val_record[rep])

        resume_model_dir = os.path.join(
            self.save_dir, "epoch_{}.pt".format(self.resume_epoch))
        checkpoint = torch.load(resume_model_dir)
        self.model.load_state_dict(checkpoint["model"], strict=True)
        self.optimizer.load_state_dict(checkpoint["optimizer"])

    def setup_train(self):
        self.epoch = 0
        self.iteration = 0

        self.resume_epoch = 0

        self.tot_val_record = {
            "best": {
                "loss": np.inf,
                "comp_metric": np.inf,
                "precision": -1,
                "recall": -1,
                "f1": -1,
                "acc": -1,
                "epoch": -1,
            }
        }

        # FIXME:
        if self.args.resume_train:
            self.setup_resume()
            self.logger.write("\n\n** Resume Training Here! **")
            self.logger.write("\n>> Save Directory: {}\n".format(
                self.save_dir))

        else:
            self.logger.write("\n\n** Start Training Here! **")
            self.logger.write("\n>> Save Directory: {}\n\n".format(
                self.save_dir))

        print("\nStart Training\n")

        self.logger.set_header_columns([
            "epoch",
            "iter",
            "time",
            "train_loss",
            "train_metric",
            "val_loss",
            "val_metric",
            "acc",
            "precision",
            "recall",
            "f1",
            "best_epoch",
        ])
        self.logger.log_header()

        self.one_epoch_steps = len(self.train_loader)
        self.display_step = self.one_epoch_steps // self.args.display_interval

    def save_model(self, val_record, endurance):
        current = np.mean(val_record[self.args.best])
        prev_best = np.mean(self.tot_val_record["best"][self.args.best])

        model_improved = False
        if (self.args.best == "loss") or (self.args.best == "comp_metric"):
            if current < prev_best:
                model_improved = True
        else:
            if current > prev_best:
                model_improved = True

        if model_improved:

            checkpoint = {
                "epoch": self.epoch,
                "model": self.model.state_dict(),
                "optimizer": self.optimizer.state_dict(),
            }
            model_name = os.path.join(self.save_dir,
                                      "epoch_" + repr(self.epoch + 1) + ".pt")
            torch.save(checkpoint, model_name)

            self.tot_val_record["best"] = val_record
            self.tot_val_record["best"]["epoch"] = self.epoch + 1
            self.tot_val_record["best"]["iteration"] = self.iteration

            endurance = 0
        else:
            endurance += 1

        self.tot_val_record[str(self.epoch + 1)] = val_record
        with open(os.path.join(self.save_dir, "tot_val_record.pkl"),
                  "wb") as f:
            pickle.dump(self.tot_val_record, f)

        return endurance

    def train_log_and_write(self, i):

        result = self.get_results()

        take_time = tools.convert_time(time.time() - self.start_time)

        self.logger.log_result([
            self.epoch,
            "{}/{}".format(i, self.one_epoch_steps),
            take_time,
            result["loss"],
            result["comp_metric"],
            "-",
            "-",
            result["acc"],
            result["precision"],
            result["recall"],
            result["f1"],
            "-",
        ])

        self.writer.write_scalar({"lr": self.optimizer.param_groups[0]["lr"]},
                                 self.iteration)
        self.writer.write_scalars(
            {
                "statistics": {
                    "mean_{}".format(key): np.mean(value)
                    for key, value in zip(
                        ["acc", "precision", "recall", "f1"],
                        [
                            result["acc"],
                            result["precision"],
                            result["recall"],
                            result["f1"],
                        ],
                    )
                },
            },
            self.iteration,
        )

        self.writer.write_scalars(
            {"loss": {
                "train loss": result["loss"]
            }},
            self.iteration,
        )
        self.writer.write_scalars(
            {"comp_metric": {
                "train comp metric": result["comp_metric"]
            }},
            self.iteration,
        )

    def val_log_and_write(self, val_record):
        take_time = tools.convert_time(time.time() - self.start_time)
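        # comp_metric_loss / comp_metric_weight are assumed to be AverageMeter-style accumulators
        # maintained outside this snippet (e.g. in init_results or the base class).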

        train_comp_metric = (self.comp_metric_loss.sum) / (
            self.comp_metric_weight.sum + 1e-15)

        self.logger.log_result([
            self.epoch + 1,
            self.iteration,
            take_time,
            self.loss.avg,
            train_comp_metric,
            val_record["loss"],
            val_record["comp_metric"],
            val_record["acc"],
            val_record["precision"],
            val_record["recall"],
            val_record["f1"],
            self.tot_val_record["best"]["epoch"],
        ])
        print("\r")
        self.writer.write_scalars(
            {
                "statistics": {
                    "mean_{}".format(key): np.mean(val_record[key])
                    for key in ["acc", "precision", "recall", "f1"]
                },
            },
            self.iteration,
        )
        self.writer.write_scalars(
            {"loss": {
                "val loss": val_record["loss"]
            }},
            self.iteration,
        )
        self.writer.write_scalars(
            {"comp_metric": {
                "val comp metric": val_record["comp_metric"]
            }},
            self.iteration,
        )
Example #7
import matplotlib

matplotlib.use("Agg")  # tensorboardX
import os, sys
import json
import torch
import torch.backends.cudnn as cudnn

from args import args

if __name__ == "__main__":
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    if len(args.gpu.split(",")) > 1:
        args.multi_gpu = True

    torch.autograd.set_detect_anomaly(True)
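    # NOTE: anomaly detection adds considerable overhead and is usually enabled only while debugging;
    # cudnn.deterministic=True / benchmark=False likewise trade speed for reproducibility.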
    cudnn.deterministic = True
    cudnn.benchmark = False

    if args.mode == "train":
        from scripts.train import Trainer

        Trainer(args).do_train()

    elif args.mode == "val":  # same as 'test'; modes: /train/trainval/val(test)
        from scripts.validate import Validator

        Validator(args, is_trainval=False).do_validate()
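
# Usage sketch (assumption: this entry script is saved as main.py and args.py defines matching
# argparse flags; neither name is confirmed by this dump):
#   python main.py --mode train --gpu 0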