class Trainer(object):

    def __init__(self, cfgs):
        save_dict = OrderedDict()
        save_dict["fold"] = cfgs["fold"]
        if cfgs["memo"] is not None:
            save_dict["memo"] = cfgs["memo"]  # 1,2,3

        specific_dir = ["{}-{}".format(key, save_dict[key]) for key in save_dict.keys()]
        cfgs["save_dir"] = os.path.join(
            cfgs["save_dir"],
            # cfgs["model"]["meta"],
            # cfgs["model"]["inputs"]["label"],
            "_".join(specific_dir),
        )
        os.makedirs(cfgs["save_dir"], exist_ok=True)

        ####### CONFIGS
        self.cfgs = cfgs

        ####### Logging
        self.tb_writer = utils.get_writer(self.cfgs)
        self.txt_logger = utils.get_logger(self.cfgs)

        # Only rank 0 writes logs when training on multiple GPUs
        self.do_logging = True
        if len(self.cfgs["gpu"]) > 1:
            if dist.get_rank() != 0:
                self.do_logging = False

        if self.do_logging:
            self.txt_logger.write("\n\n----train.py----")
            self.txt_logger.write("\n{}".format(datetime.datetime.now()))
            self.txt_logger.write(
                "\n\nSave Directory: \n{}".format(self.cfgs["save_dir"])
            )
            self.txt_logger.write("\n\nConfigs: \n{}\n".format(self.cfgs))

        ####### MODEL
        model = models.get_model(self.cfgs)
        if len(self.cfgs["gpu"]) > 1:
            model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
            self.device = torch.device("cuda:{}".format(self.cfgs["local_rank"]))
            self.model = model.to(self.device)
            self.model = DistributedDataParallel(
                self.model,
                device_ids=[self.cfgs["local_rank"]],
                output_device=self.cfgs["local_rank"],
            )
        else:
            self.device = torch.device("cuda:{}".format(self.cfgs["local_rank"]))
            self.model = model.to(self.device)

        ####### Data
        train_dataset = inputs.get_dataset(self.cfgs, mode="train")
        if len(self.cfgs["gpu"]) > 1:
            train_sampler = DistributedSampler(
                train_dataset,
                num_replicas=len(self.cfgs["gpu"]),
                rank=self.cfgs["local_rank"],
            )
        else:
            train_sampler = None

        self.train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=self.cfgs["batch_size"],
            num_workers=self.cfgs["num_workers"],
            pin_memory=True,
            drop_last=False,
            collate_fn=inputs.get_collater(),
            sampler=train_sampler,
        )
        # if self.do_logging:
        #     self.txt_logger.write("\nDataset: ")
        #     self.txt_logger.write(
        #         "\nTRAIN Abnormal/Normal: {}/{}".format(
        #             len(train_dataset.abnormal_meta_df),
        #             len(train_dataset.normal_meta_df),
        #         )
        #     )

        ####### Opts
        self.optimizer = opts.get_optimizer(self.cfgs, self.model.parameters())
        self.scheduler = opts.get_scheduler(self.cfgs, self.optimizer)
        self.grad_scaler = GradScaler(enabled=self.cfgs["use_amp"])

        ####### Validator
        self.validator = Validator(self.cfgs, self.device)
        # if self.do_logging:
        #     self.txt_logger.write(
        #         "\nVAL Abnormal/Normal: {}/{}".format(
        #             len(self.validator.val_loader.dataset.abnormal_meta_df),
        #             len(self.validator.val_loader.dataset.normal_meta_df),
        #         )
        #     )
        #     if self.cfgs["model"]["val"]["ignore_normal"]:
        #         self.txt_logger.write("\nVAL Ignore Normal")
        #         self.validator.val_loader.dataset.meta_df = (
        #             self.validator.val_loader.dataset.abnormal_meta_df
        #         )

    def do_train(self):
        ####### Setup Train
        self.epoch, self.iter, self.resume_epoch = 0, 0, 0
        self.tot_val_record = {
            "best": {"det_recl": -1, "det_prec": -1, "det_f1": -1, "loss": np.inf}
        }
        if self.cfgs["model"]["train"]["resume_train"]:
            with open(
                os.path.join(self.cfgs["save_dir"], "tot_val_record.pkl"), "rb"
            ) as f:
                self.tot_val_record = pickle.load(f)
            self.iter, self.resume_epoch = (
                self.tot_val_record["best"]["iteration"],
                self.tot_val_record["best"]["epoch"],
            )
            resume_model_dir = os.path.join(
                self.cfgs["save_dir"], "epoch_{}.pt".format(self.resume_epoch)
            )
            checkpoint = torch.load(resume_model_dir)
            self.model.load_state_dict(checkpoint["model"], strict=True)
            self.optimizer.load_state_dict(checkpoint["optimizer"])
            self.grad_scaler.load_state_dict(checkpoint["scaler"])
            self.txt_logger.write("\n\nResume Training Here! \n\n")

        if self.do_logging:
            self.txt_logger.write("\n\nStart Training! \n\n")
            header_columns = ["epoch", "iter", "time", "train_loss", "val_loss"]
            header_columns += ["det_recl", "det_prec", "det_fppi", "det_f1"]
            header_columns += ["cls_auc", "cls_sens", "cls_spec"]
            header_columns += ["best_epoch"]
            self.txt_logger.log_header(header_columns)

        ####### Train
        self.start_time = time.time()
        self.endurance = 0
        for epoch in range(self.resume_epoch, self.cfgs["model"]["train"]["max_epoch"]):
            # self.train_loader.dataset.shuffle()
            # self.train_loader.dataset.meta_df = (
            #     self.train_loader.dataset.abnormal_meta_df
            # )
            self.one_epoch_steps = len(self.train_loader)
            self.display_step = (
                self.one_epoch_steps // self.cfgs["model"]["train"]["display_interval"]
            )
            self.epoch = epoch

            if self.endurance > self.cfgs["model"]["train"]["endurance"]:
                # Early stopping: no improvement for `endurance` validation rounds
                if self.do_logging:
                    self.txt_logger.write(
                        "\nStop training! No more performance gain expected!"
                    )
                    best_epoch = self.tot_val_record["best"]["epoch"]
                    self.txt_logger.write(
                        "\n\nBest saved at: {}, {} epoch\n\n".format(
                            self.cfgs["save_dir"], best_epoch
                        )
                    )
                break

            self.train_val_one_epoch()

    def train_val_one_epoch(self):
        self.optimizer.zero_grad()
        self.model.train()

        t0 = time.time()
        for i, data in enumerate(self.train_loader):
            t1 = time.time()
            img = data["img"].permute(0, 3, 1, 2).to(self.device)
            logit = self.model(img)
            t2 = time.time()

            # FIXME: GPU utilization stays low
            loss = opts.calc_loss(self.cfgs, self.device, data, logit)
            t3 = time.time()

            self.grad_scaler.scale(loss).backward()
            self.grad_scaler.step(self.optimizer)
            self.grad_scaler.update()
            self.optimizer.zero_grad()
            t4 = time.time()

            # NOTE: Try to avoid excessive CPU-GPU synchronization
            # (.item() calls, or printing values from CUDA tensors).
            if self.do_logging:
                loss = loss.detach().item()
                take_time = tools.convert_time(time.time() - self.start_time)
                train_logs = [loss, "-"]
                self.txt_logger.log_result(
                    [self.epoch, "{}/{}".format(i, self.one_epoch_steps), take_time]
                    + train_logs
                )
                self.tb_writer.write_scalars(
                    {"loss": {"train loss": loss}},
                    self.iter,
                )

                if self.iter % self.display_step == 0:
                    # Visualize: find an abnormal sample in the batch
                    for viz_bi in range(len(data["fp"])):
                        if data["bbox"][viz_bi, 0, -1] != -1:
                            break

                    with torch.no_grad():
                        self.model.eval()
                        det_preds_viz = (
                            self.model(img, mode="viz")["preds"][viz_bi]
                            .detach()
                            .cpu()
                            .numpy()
                        )
                        if len(det_preds_viz) != 0:
                            # sigmoid
                            det_preds_viz[:, -1] = 1 / (
                                1 + np.exp(-1 * det_preds_viz[:, -1])
                            )
                        else:
                            det_preds_viz = np.ones((1, 6)) * -1

                        det_anns_viz = data["bbox"][viz_bi].numpy()
                        self.tb_writer.write_images(
                            data["fp"][viz_bi],
                            data["img"][viz_bi].numpy(),
                            det_preds_viz,
                            det_anns_viz,
                            self.iter,
                            "train",
                        )
                        self.model.train()

            self.iter += 1

            # Linear warmup for the first `warmup_epoch` epochs, then the scheduler
            lr0 = self.cfgs["model"]["opts"]["learning_rate"]
            wep = self.cfgs["model"]["opts"]["warmup_epoch"]
            if self.epoch < wep:
                for pg in self.optimizer.param_groups:
                    pg["lr"] = lr0 / wep * (self.epoch + i / self.one_epoch_steps)
            else:
                if self.scheduler is not None:
                    self.scheduler.step(self.epoch - wep + i / self.one_epoch_steps)

            t5 = time.time()
            if self.cfgs["do_profiling"]:
                print("\ndata", t1 - t0)
                print("forward", t2 - t1)
                print("calc loss", t3 - t2)
                print("backward", t4 - t3)
                print("logging", t5 - t4)
            t0 = t5

        if self.epoch > self.cfgs["model"]["val"]["ignore_epoch"]:
            # Do Validation
            val_record, val_viz = self.validator.do_validate(self.model)
            self.tot_val_record[str(self.epoch + 1)] = val_record
            val_best = val_record[self.cfgs["model"]["val"]["best"]]

            # Save Model ("loss" is minimized, "det_f1" is maximized)
            select_metric = self.cfgs["model"]["val"]["best"]
            val_improved = False
            if select_metric == "loss":
                if val_best < self.tot_val_record["best"][select_metric]:
                    val_improved = True
            elif select_metric == "det_f1":
                if val_best > self.tot_val_record["best"][select_metric]:
                    val_improved = True

            if val_improved:
                checkpoint = {
                    "epoch": self.epoch,
                    "model": self.model.state_dict(),
                    "optimizer": self.optimizer.state_dict(),
                    "scaler": self.grad_scaler.state_dict(),
                }
                model_name = os.path.join(
                    self.cfgs["save_dir"], "epoch_" + str(self.epoch + 1) + ".pt"
                )
                torch.save(checkpoint, model_name)
                self.tot_val_record["best"] = val_record
                self.tot_val_record["best"]["epoch"] = self.epoch + 1
                self.tot_val_record["best"]["iteration"] = self.iter
                self.endurance = 0
            else:
                self.endurance += 1

            if self.do_logging:
                take_time = utils.tools.convert_time(time.time() - self.start_time)
                vloss = val_record["loss"]
                vbest_epoch = self.tot_val_record["best"]["epoch"]
                metric_keys = ["det_recl", "det_prec", "det_fppi", "det_f1"]
                metric_keys += ["cls_auc", "cls_sens", "cls_spec"]
                val_logs = [vloss] + [val_record[k] for k in metric_keys]
                self.txt_logger.log_result(
                    [self.epoch + 1, self.iter, take_time, loss]
                    + val_logs
                    + [vbest_epoch],
                    txt_write=True,
                )
                self.txt_logger.write("\n", txt_write=True)
                self.tb_writer.write_images(
                    val_viz["fp"],
                    val_viz["img"],
                    val_viz["pred"],
                    val_viz["ann"],
                    self.iter,
                    "val",
                )
                self.tb_writer.write_scalars(
                    {
                        "metrics": {
                            "{}".format(key): val_record[key] for key in metric_keys
                        }
                    },
                    self.iter,
                )
                self.tb_writer.write_scalars({"loss": {"val loss": vloss}}, self.iter)

            with open(
                os.path.join(self.cfgs["save_dir"], "tot_val_record.pkl"), "wb"
            ) as f:
                pickle.dump(self.tot_val_record, f)
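# ---------------------------------------------------------------------------
# A minimal, standalone sketch (not part of this repository) of the AMP step
# pattern that train_val_one_epoch() above relies on. In the excerpt above a
# GradScaler is used, but the forward pass does not appear to run inside an
# autocast context; wrapping the forward/loss in torch.cuda.amp.autocast, as
# below, is what actually enables mixed-precision compute. All names here
# (model, img, target, criterion, ...) are illustrative assumptions.
from torch.cuda.amp import autocast


def amp_train_step(model, img, target, criterion, optimizer, scaler, use_amp=True):
    """One optimizer step with optional automatic mixed precision.

    `scaler` is expected to be a torch.cuda.amp.GradScaler(enabled=use_amp).
    """
    optimizer.zero_grad()
    with autocast(enabled=use_amp):  # run forward + loss in reduced precision where safe
        logit = model(img)
        loss = criterion(logit, target)
    scaler.scale(loss).backward()  # scale the loss to avoid fp16 gradient underflow
    scaler.step(optimizer)         # unscales gradients; skips the step on inf/nan
    scaler.update()                # adjust the scale factor for the next iteration
    return loss.detach()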
class Trainer:

    def __init__(self, args):
        self.args = args

        #### 0. Setup
        self.save_dir = tools.set_save_dir(args)
        with open(os.path.join(self.save_dir, "args.json"), "w") as j:
            json.dump(vars(args), j)

        #### 1. Data
        # TODO: augmentation
        augmentation = getattr(augmentations, args.augment)
        self.train_loader, self.val_loader = my_input.get_dataloader(
            args, transform=augmentation)

        #### 2. Model
        model = models.PENetClassifier(**vars(args))
        model.load_pretrained(PRETRAINED_WEIGHTS, "0")
        self.model = model.cuda()

        #### 3. Opt
        self.optimizer = opts.get_optimizer(args, self.model)
        self.scheduler = None
        if self.args.lr_scheduler is not None:
            self.scheduler = opts.get_scheduler(args, self.optimizer)

        #### 4. Logger
        self.writer = writer.Writer(log_dir=self.save_dir)
        self.logger = logger.Logger()
        self.logger.open(os.path.join(self.save_dir, "log.train.txt"), mode="a")
        self.logger.write("\n>> Pytorch version: {}".format(torch.__version__))
        self.logger.write("\n>> Args: {}".format(args))
        # self.visualizer = visualizer.Visualizer(
        #     args, "train", self.save_dir, self.writer
        # )

        # Validator
        self.validator = Validator(
            args,
            is_trainval=True,
            writer=self.writer,
            val_loader=self.val_loader,
        )

    def setup_resume(self):
        with open(os.path.join(self.save_dir, "tot_val_record.pkl"), "rb") as f:
            self.tot_val_record = pickle.load(f)
        self.iteration, self.resume_epoch = (
            self.tot_val_record["best"]["iteration"],
            self.tot_val_record["best"]["epoch"],
        )
        rep = str(self.resume_epoch)
        print("\nResume training from here: ", self.tot_val_record[rep])
        resume_model_dir = os.path.join(
            self.save_dir, "epoch_{}.pt".format(self.resume_epoch))
        checkpoint = torch.load(resume_model_dir)
        self.model.load_state_dict(checkpoint["model"], strict=True)
        self.optimizer.load_state_dict(checkpoint["optimizer"])

    def setup_train(self):
        self.epoch = 0
        self.iteration = 0
        self.resume_epoch = 0
        self.tot_val_record = {
            "best": {
                # NOTE: save_model() always treats a larger value as better, so
                # initializing "loss" to -1 does not handle best="loss" correctly.
                "loss": -1,
                "precision": -1,
                "recall": -1,
                "f1": -1,
                "acc": 0,
                "epoch": -1,
            }
        }

        # FIXME:
        if self.args.resume_train:
            self.setup_resume()
            self.logger.write("\n\n** Resume Training Here! **")
            self.logger.write("\n>> Save Directory: {}\n".format(self.save_dir))
        else:
            self.logger.write("\n\n** Start Training Here! **")
            self.logger.write("\n>> Save Directory: {}\n\n".format(self.save_dir))

        print("\nStart Training\n")
        self.logger.set_header_columns([
            "epoch",
            "iter",
            "time",
            "train_loss",
            "val_loss",
            "acc",
            "precision",
            "recall",
            "f1",
            "best_epoch",
        ])
        self.logger.log_header()
        self.one_epoch_steps = len(self.train_loader)
        self.display_step = self.one_epoch_steps // self.args.display_interval

    def do_train(self):
        self.setup_train()
        # print("\nStart Training!\n",)

        self.start_time = time.time()
        endurance = 0
        for epoch in range(self.resume_epoch, self.args.max_epoch):
            self.epoch = epoch
            if endurance > self.args.endurance:
                print("Stop training! No more performance gain expected!")
                print(
                    "Best saved at: ",
                    self.iteration,
                    self.epoch,
                    self.start_time,
                    self.save_dir,
                    self.tot_val_record["best"]["epoch"],
                )
                break
            self.train_one_epoch()
            # print("precision / recall: ", pc, rc)
            if (epoch + 1) >= self.args.val_epoch:
                if (epoch + 1) % self.args.val_interval == 0:
                    val_record = self.validator.do_validate(
                        model=self.model, iteration=self.iteration)
                    # save_model() returns the updated endurance counter;
                    # keep it so early stopping can actually trigger.
                    endurance = self.save_model(val_record, endurance)
                    self.val_log_and_write(val_record)

    def train_one_epoch(self):
        # Shuffle
        if self.epoch > 0:
            if not self.args.is_debugging:  # overfit test
                self.train_loader.dataset.loc_df = (
                    self.train_loader.dataset.get_loc_df())

        self.optimizer.zero_grad()
        self.model.train()
        self.init_results()

        for i, data in enumerate(self.train_loader):
            fp = data["fp"]
            img = data["img"].cuda()  # .permute(0, 4, 1, 2, 3)
            anns = data["anns"].cuda()
            # self.scheduler_step(i)
            # if not (self.iteration % self.display_step == 0):

            outputs = self.model(img)

            if self.args.loss_type == "bce":
                criterion = torch.nn.BCEWithLogitsLoss()
            elif self.args.loss_type == "focal":
                criterion = losses.BinaryFocalLoss()

            loss = criterion(outputs, anns)
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()

            self.update_results(anns, outputs, loss)
            self.train_log_and_write(i)
            # FIXME: visualizer
            # else:
            #     pass

            if self.scheduler is not None:
                opts.step_scheduler(self.scheduler, global_step=self.iteration)

            self.iteration += 1
            # self.last_loss = loss

    def save_model(self, val_record, endurance):
        if np.mean(val_record[self.args.best]) > np.mean(
                self.tot_val_record["best"][self.args.best]):
            model_state_dict = self.model.state_dict()
            checkpoint = {
                "epoch": self.epoch,
                "model": model_state_dict,
                "optimizer": self.optimizer.state_dict(),
            }
            model_name = os.path.join(self.save_dir,
                                      "epoch_" + repr(self.epoch + 1) + ".pt")
            torch.save(checkpoint, model_name)
            self.tot_val_record["best"] = val_record
            self.tot_val_record["best"]["epoch"] = self.epoch + 1
            self.tot_val_record["best"]["iteration"] = self.iteration
            endurance = 0
        else:
            endurance += 1
        self.tot_val_record[str(self.epoch + 1)] = val_record
        with open(os.path.join(self.save_dir, "tot_val_record.pkl"), "wb") as f:
            pickle.dump(self.tot_val_record, f)
        return endurance

    def init_results(self):
        self.tot_nums = tools.AverageMeter()
        self.loss = tools.AverageMeter()
        self.gt_nums = tools.AverageMeter()
        self.tp_nums = tools.AverageMeter()
        self.pred_nums = tools.AverageMeter()
        self.correct_nums = tools.AverageMeter()

    def update_results(self, anns, outputs, loss):
        gts = anns.detach().cpu().numpy()
        # NOTE: outputs are logits (BCEWithLogitsLoss), so apply a sigmoid
        # before thresholding at 0.5.
        preds = (torch.sigmoid(outputs).detach().cpu().numpy() > 0.5).astype(np.float32)
        self.tot_nums.update(len(gts))
        self.loss.update(loss.item())
        self.correct_nums.update(np.sum(gts == preds))
        self.gt_nums.update(np.sum(gts == 1))
        self.pred_nums.update(np.sum(preds))
        self.tp_nums.update(np.sum(gts * preds))

    def train_log_and_write(self, i):
        acc = self.correct_nums.sum / self.tot_nums.sum
        pc = self.tp_nums.sum / (self.pred_nums.sum + 1e-6)
        rc = self.tp_nums.sum / (self.gt_nums.sum + 1e-6)
        f1 = (2 * rc * pc) / (rc + pc + 1e-6)
        take_time = tools.convert_time(time.time() - self.start_time)
        self.logger.log_result([
            self.epoch,
            "{}/{}".format(i, self.one_epoch_steps),
            take_time,
            self.loss.avg,
            "-",
            acc,
            pc,
            rc,
            f1,
            "-",
        ])
        self.writer.write_scalar({"lr": self.optimizer.param_groups[0]["lr"]},
                                 self.iteration)
        self.writer.write_scalars(
            {
                "statistics": {
                    "mean_{}".format(key): np.mean(value)
                    for key, value in zip(["acc", "precision", "recall", "f1"],
                                          [acc, pc, rc, f1])
                },
            },
            self.iteration,
        )
        self.writer.write_scalars(
            {"loss": {
                "train loss": self.loss.avg
            }},
            self.iteration,
        )

    def val_log_and_write(self, val_record):
        take_time = tools.convert_time(time.time() - self.start_time)
        self.logger.log_result([
            self.epoch + 1,
            self.iteration,
            take_time,
            self.loss.avg,
            val_record["loss"],
            val_record["acc"],
            val_record["precision"],
            val_record["recall"],
            val_record["f1"],
            self.tot_val_record["best"]["epoch"],
        ])
        print("\r")
        self.writer.write_scalars(
            {
                "statistics": {
                    "mean_{}".format(key): np.mean(val_record[key])
                    for key in ["acc", "precision", "recall", "f1"]
                },
            },
            self.iteration,
        )
        self.writer.write_scalars(
            {"loss": {
                "val loss": val_record["loss"]
            }},
            self.iteration,
        )
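# ---------------------------------------------------------------------------
# tools.AverageMeter is not defined in this excerpt. A typical implementation
# that is consistent with how it is used above (.update(value), .sum, .avg) is
# sketched here as an assumption; the real helper in this repository may differ.
class AverageMeter:
    """Tracks the running sum, count, and average of scalar values."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, value, n=1):
        # Add `value` (weighted by `n`) to the running totals and refresh the average.
        self.sum += value * n
        self.count += n
        self.avg = self.sum / max(self.count, 1)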
class Trainer(Commoner):

    def __init__(self, args):
        super(Trainer, self).__init__(args)

        #### 0. Setup
        self.save_dir = tools.set_save_dir(args)
        with open(os.path.join(self.save_dir, "args.json"), "w") as j:
            json.dump(vars(args), j)

        #### 1. Models
        model = getattr(models, args.model)(args)
        print(
            "Model param nums: ",
            sum(p.numel() for p in model.parameters() if p.requires_grad),
        )
        self.model = model.cuda()

        #### 2. Opt
        self.optimizer = opts.get_optimizer(args, self.model)
        self.scheduler = None
        if self.args.lr_scheduler is not None:
            self.scheduler = opts.get_scheduler(args, self.optimizer)

        #### 3. Data
        if args.augment is not None:
            augmentation = getattr(augmentations, args.augment)
        else:
            augmentation = None
        self.train_loader, self.val_loader = inputs.get_dataloader(
            args, transform=augmentation)

        #### 4. Logger
        self.writer = writer.Writer(log_dir=self.save_dir)
        self.logger = logger.Logger()
        self.logger.open(os.path.join(self.save_dir, "log.train.txt"), mode="a")
        self.logger.write("\n>> Pytorch version: {}".format(torch.__version__))
        self.logger.write("\n>> Args: {}".format(args))

        # Validator
        self.validator = Validator(
            args,
            is_trainval=True,
            writer=self.writer,
            val_loader=self.val_loader,
        )

    def do_train(self):
        self.setup_train()

        self.start_time = time.time()
        endurance = 0
        for epoch in range(self.resume_epoch, self.args.max_epoch):
            self.epoch = epoch
            if endurance > self.args.endurance:
                print("Stop training! No more performance gain expected!")
                print(
                    "Best saved at: ",
                    self.iteration,
                    self.epoch,
                    self.start_time,
                    self.save_dir,
                    self.tot_val_record["best"]["epoch"],
                )
                break
            self.train_one_epoch()
            if (epoch + 1) >= self.args.val_epoch:
                if (epoch + 1) % self.args.val_interval == 0:
                    val_record = self.validator.do_validate(
                        model=self.model, iteration=self.iteration)
                    # save_model() returns the updated endurance counter;
                    # keep it so early stopping can actually trigger.
                    endurance = self.save_model(val_record, endurance)
                    self.val_log_and_write(val_record)

    def train_one_epoch(self):
        # Shuffle
        if self.epoch > 0:
            if not self.args.is_debugging:  # overfit test
                self.train_loader.dataset.loc_df = (
                    self.train_loader.dataset.get_loc_df())

        self.optimizer.zero_grad()
        self.model.train()
        self.init_results()

        for i, data in enumerate(self.train_loader):
            fps = data["fp"]
            imgs = data["img"].cuda()  # .permute(0, 4, 1, 2, 3)
            anns = data["anns"].cuda()
            # self.scheduler_step(i)
            # if not (self.iteration % self.display_step == 0):

            outputs = self.model(imgs)
            if self.args.print_io:
                print("train inputs: ", imgs[0])
                print("train outputs: ", outputs[0])

            loss = self.calc_loss(fps, anns, outputs)
            if loss > 0:
                loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()

            self.update_results(fps, anns, outputs, loss)
            self.train_log_and_write(i)

            if self.scheduler is not None:
                opts.step_scheduler(self.scheduler, global_step=self.iteration)

            self.iteration += 1

    def setup_resume(self):
        with open(os.path.join(self.save_dir, "tot_val_record.pkl"), "rb") as f:
            self.tot_val_record = pickle.load(f)
        self.iteration, self.resume_epoch = (
            self.tot_val_record["best"]["iteration"],
            self.tot_val_record["best"]["epoch"],
        )
        rep = str(self.resume_epoch)
        print("\nResume training from here: ", self.tot_val_record[rep])
        resume_model_dir = os.path.join(
            self.save_dir, "epoch_{}.pt".format(self.resume_epoch))
        checkpoint = torch.load(resume_model_dir)
        self.model.load_state_dict(checkpoint["model"], strict=True)
        self.optimizer.load_state_dict(checkpoint["optimizer"])

    def setup_train(self):
        self.epoch = 0
        self.iteration = 0
        self.resume_epoch = 0
        self.tot_val_record = {
            "best": {
                "loss": np.inf,
                "comp_metric": np.inf,
                "precision": -1,
                "recall": -1,
                "f1": -1,
                "acc": -1,
                "epoch": -1,
            }
        }

        # FIXME:
        if self.args.resume_train:
            self.setup_resume()
            self.logger.write("\n\n** Resume Training Here! **")
            self.logger.write("\n>> Save Directory: {}\n".format(self.save_dir))
        else:
            self.logger.write("\n\n** Start Training Here! **")
            self.logger.write("\n>> Save Directory: {}\n\n".format(self.save_dir))

        print("\nStart Training\n")
        self.logger.set_header_columns([
            "epoch",
            "iter",
            "time",
            "train_loss",
            "train_metric",
            "val_loss",
            "val_metric",
            "acc",
            "precision",
            "recall",
            "f1",
            "best_epoch",
        ])
        self.logger.log_header()
        self.one_epoch_steps = len(self.train_loader)
        self.display_step = self.one_epoch_steps // self.args.display_interval

    def save_model(self, val_record, endurance):
        current = np.mean(val_record[self.args.best])
        prev_best = np.mean(self.tot_val_record["best"][self.args.best])

        # Direction-aware comparison: loss/comp_metric are minimized,
        # every other selection metric is maximized.
        model_improved = False
        if (self.args.best == "loss") or (self.args.best == "comp_metric"):
            if current < prev_best:
                model_improved = True
        else:
            if current > prev_best:
                model_improved = True

        if model_improved:
            checkpoint = {
                "epoch": self.epoch,
                "model": self.model.state_dict(),
                "optimizer": self.optimizer.state_dict(),
            }
            model_name = os.path.join(self.save_dir,
                                      "epoch_" + repr(self.epoch + 1) + ".pt")
            torch.save(checkpoint, model_name)
            self.tot_val_record["best"] = val_record
            self.tot_val_record["best"]["epoch"] = self.epoch + 1
            self.tot_val_record["best"]["iteration"] = self.iteration
            endurance = 0
        else:
            endurance += 1

        self.tot_val_record[str(self.epoch + 1)] = val_record
        with open(os.path.join(self.save_dir, "tot_val_record.pkl"), "wb") as f:
            pickle.dump(self.tot_val_record, f)
        return endurance

    def train_log_and_write(self, i):
        result = self.get_results()
        take_time = tools.convert_time(time.time() - self.start_time)
        self.logger.log_result([
            self.epoch,
            "{}/{}".format(i, self.one_epoch_steps),
            take_time,
            result["loss"],
            result["comp_metric"],
            "-",
            "-",
            result["acc"],
            result["precision"],
            result["recall"],
            result["f1"],
            "-",
        ])
        self.writer.write_scalar({"lr": self.optimizer.param_groups[0]["lr"]},
                                 self.iteration)
        self.writer.write_scalars(
            {
                "statistics": {
                    "mean_{}".format(key): np.mean(value)
                    for key, value in zip(
                        ["acc", "precision", "recall", "f1"],
                        [
                            result["acc"],
                            result["precision"],
                            result["recall"],
                            result["f1"],
                        ],
                    )
                },
            },
            self.iteration,
        )
        self.writer.write_scalars(
            {"loss": {
                "train loss": result["loss"]
            }},
            self.iteration,
        )
        self.writer.write_scalars(
            {"comp_metric": {
                "train comp metric": result["comp_metric"]
            }},
            self.iteration,
        )

    def val_log_and_write(self, val_record):
        take_time = tools.convert_time(time.time() - self.start_time)
        train_comp_metric = (self.comp_metric_loss.sum) / (
            self.comp_metric_weight.sum + 1e-15)
        self.logger.log_result([
            self.epoch + 1,
            self.iteration,
            take_time,
            self.loss.avg,
            train_comp_metric,
            val_record["loss"],
            val_record["comp_metric"],
            val_record["acc"],
            val_record["precision"],
            val_record["recall"],
            val_record["f1"],
            self.tot_val_record["best"]["epoch"],
        ])
        print("\r")
        self.writer.write_scalars(
            {
                "statistics": {
                    "mean_{}".format(key): np.mean(val_record[key])
                    for key in ["acc", "precision", "recall", "f1"]
                },
            },
            self.iteration,
        )
        self.writer.write_scalars(
            {"loss": {
                "val loss": val_record["loss"]
            }},
            self.iteration,
        )
        self.writer.write_scalars(
            {"comp_metric": {
                "val comp metric": val_record["comp_metric"]
            }},
            self.iteration,
        )
import matplotlib

matplotlib.use("Agg")  # tensorboardX

import os, sys
import json

import torch
import torch.backends.cudnn as cudnn

from args import args

if __name__ == "__main__":
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    if len(args.gpu.split(",")) > 1:
        args.multi_gpu = True

    torch.autograd.set_detect_anomaly(True)
    cudnn.deterministic = True
    cudnn.benchmark = False

    if args.mode == "train":
        from scripts.train import Trainer

        Trainer(args).do_train()

    elif args.mode == "val":  # same as 'test'; splits: /train/trainval/val(test)
        from scripts.validate import Validator

        Validator(args, is_trainval=False).do_validate()
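# ---------------------------------------------------------------------------
# Example invocation (an assumption: the entry-point file name and the flag
# spellings are inferred from how `args` is used above -- args.mode, args.gpu --
# and may differ in the actual args module):
#
#   python main.py --mode train --gpu 0
#   python main.py --mode val --gpu 0,1
#
# Passing more than one id in --gpu makes the script set args.multi_gpu = True.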