def inference(self, dataset, model):
    """Sliding-window inference over one whole slide.

    Feeds every patch of `dataset` through `model`, accumulates the per-class
    logits into a slide-sized canvas, averages overlapping regions by their
    coverage count, and returns the argmax label map rendered as RGB.

    Args:
        dataset: patch dataset for a single slide; must expose `slide_size`
            (H, W) and `slide_step` — assumed (row, col) step sizes, TODO confirm.
        model: segmentation network returning (B, n_class, h, w) logits.

    Returns:
        RGB image (via class_to_RGB) of the per-pixel argmax prediction.
    """
    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=self.batch_size,
                                         num_workers=self.num_workers,
                                         collate_fn=collate,
                                         shuffle=False,
                                         pin_memory=True)
    # Accumulators over the full slide: summed logits and an overlap counter.
    logits_sum = np.zeros((self.n_class, dataset.slide_size[0],
                           dataset.slide_size[1]))  # n_class x H x W
    coverage = np.zeros(dataset.slide_size, dtype='uint8')  # H x W
    step = dataset.slide_step
    for batch in loader:
        images = batch['image']
        coords = batch['coord']
        with torch.no_grad():
            # Half precision to save GPU memory (float32 -> float16).
            images = images.cuda().half()
            batch_logits = model.forward(images).cpu().detach().numpy()
        _, _, tile_h, tile_w = batch_logits.shape
        for idx in range(images.shape[0]):
            # Patch-grid coordinate * stride -> top-left pixel of this tile.
            row = math.floor(coords[idx][0] * step[0])
            col = math.floor(coords[idx][1] * step[1])
            logits_sum[:, row:row + tile_h, col:col + tile_w] += batch_logits[idx]
            coverage[row:row + tile_h, col:col + tile_w] += 1
    # Uncovered pixels keep a count of 1 so the division below is safe.
    coverage[coverage == 0] = 1
    averaged = logits_sum / coverage
    label_map = np.argmax(averaged, axis=0)
    return class_to_RGB(label_map)
# Fragment of a slide-level evaluation script. The enclosing per-slide loop
# (which defines `i`, `tbar`, `start_time`, `output`, `dataset`, `dataloader`,
# `evaluator`, `slide_time`, `output_path`, `task_name`) is not visible in this
# chunk; the indentation below is reconstructed — TODO confirm against the
# original file.
for sample in dataloader:
    # print(dataset.slide_mask)
    # Accumulate this batch of patch predictions into the whole-slide output.
    output = evaluator.eval(sample, model, output)
slide_time.update(time.time()-start_time)
start_time = time.time()
slide = dataset.slide  # slide identifier, used to name the saved images
# mask = dataset.slide_mask
mask = dataset.get_slide_mask_from_index(i)  # `i` comes from the unseen outer loop
evaluator.update_scores(mask, output)
scores = evaluator.get_scores()
# save result
output_rgb = class_to_RGB(output)
mask_rgb = class_to_RGB(mask)
# presumably class_to_RGB returns RGB and cv2.imwrite expects BGR, hence the
# channel swap before saving — TODO confirm
output_rgb = cv2.cvtColor(output_rgb, cv2.COLOR_BGR2RGB)
mask_rgb = cv2.cvtColor(mask_rgb, cv2.COLOR_BGR2RGB)
cv2.imwrite(os.path.join(output_path, slide+'_output.png'), output_rgb)
cv2.imwrite(os.path.join(output_path, slide+'_mask.png'), mask_rgb)
tbar.set_description('Slide: {}'.format(slide) + ', mIOU: %.5f; slide time: %.2f' % (scores['iou_mean'], slide_time.avg))
# --- NOTE(review): the lines below likely sit AFTER the unseen per-slide loop
# (final summary over all slides); exact dedent level cannot be recovered here.
scores = evaluator.get_scores()
print(evaluator.metrics.confusion_matrix)
log = ""
log = log + str(task_name) + ' slide inference \n'
log = log + "mIOU = " + str(scores['iou_mean']) + '\n'
log = log + "IOU: " + str(scores['iou']) + '\n'
log += "================================\n"
# Fragment of a patch-level validation/test loop. The enclosing function and
# loop headers (defining `sample`, `i_batch`, `test`, `evaluation`, `epoch`,
# `predictions`, `batch_size`, `dataloader_val`, `output_path`, `task_name`)
# are outside this chunk; indentation is reconstructed — TODO confirm.
batch_time.update(time.time() - start_time)
tbar.set_description('mIoU: %.4f; data time: %.2f; batch time: %.2f' % (scores_val["iou_mean"], data_time.avg, batch_time.avg))
if not test:  # has label
    masks = sample['mask']  # PIL images
if test:  # save predictions
    output_save_path = os.path.join(output_path, task_name)
    if not os.path.isdir(output_save_path):
        os.makedirs(output_save_path)
    for i in range(len(sample['id'])):
        transforms.functional.to_pil_image(
            class_to_RGB(predictions[i])).save(
                os.path.join(output_save_path,
                             sample['id'][i] + "_mask.png"))
if not evaluation and not test:  # train:val
    # Select the one batch containing sample index (epoch % len(dataloader_val))
    # so a different validation example is visualized each epoch.
    if i_batch * batch_size + len(sample['id']) > (
            epoch % len(dataloader_val)) and i_batch * batch_size <= (
                epoch % len(dataloader_val)):
        # writer.add_image('image', transforms.ToTensor()(images[(epoch % len(dataloader_val)) - i_batch * batch_size]), epoch)
        if not test:
            mask_rgb = class_to_RGB(
                masks[epoch % len(dataloader_val) - i_batch * batch_size].numpy())
            mask_rgb = ToTensor()(mask_rgb)
            # NOTE(review): the next statement is truncated in this chunk of the
            # source — its argument list continues beyond the visible text.
            predictions_rgb = class_to_RGB(
def main(cfg, distributed=True):
    """Train a GLNet segmentation model, optionally under DistributedDataParallel.

    Args:
        cfg: configuration object whose attributes are read directly
            (mode, n_class, paths, batch sizes, loss name, scheduler, ...).
        distributed: when True, join the NCCL process group and shard the
            training set with a DistributedSampler; when False, run on cuda:0.

    Side effects (rank 0 only): creates model/log/output directories, writes a
    plain-text log and TensorBoard events, and saves checkpoints via
    save_ckpt_model.
    """
    if distributed:
        # DDP setup: one process per GPU, rank taken from the process group.
        dist.init_process_group('nccl')
        local_rank = dist.get_rank()
        print(local_rank)
        torch.cuda.set_device(local_rank)
        device = torch.device('cuda', local_rank)
    else:
        device = torch.device("cuda:0")
        local_rank = 0

    ###################################################
    mode = cfg.mode
    n_class = cfg.n_class
    model_path = cfg.model_path  # checkpoint directory
    log_path = cfg.log_path
    output_path = cfg.output_path
    if local_rank == 0:  # only rank 0 touches the filesystem
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        if not os.path.exists(log_path):
            os.makedirs(log_path)
        if not os.path.exists(output_path):
            os.makedirs(output_path)
    task_name = cfg.task_name
    print(task_name)

    ###################################
    print("preparing datasets and dataloaders......")
    batch_size = cfg.batch_size
    sub_batch_size = cfg.sub_batch_size
    size_g = (cfg.size_g, cfg.size_g)  # global-branch input size
    size_p = (cfg.size_p, cfg.size_p)  # local patch size
    num_workers = cfg.num_workers
    trainset_cfg = cfg.trainset_cfg
    valset_cfg = cfg.valset_cfg

    data_time = AverageMeter("DataTime", ':3.3f')
    batch_time = AverageMeter("BatchTime", ':3.3f')

    transformer_train = TransformerSegGL(crop_size=cfg.crop_size)
    dataset_train = OralDatasetSeg(
        trainset_cfg["img_dir"],
        trainset_cfg["mask_dir"],
        trainset_cfg["meta_file"],
        label=trainset_cfg["label"],
        transform=transformer_train,
    )
    if distributed:
        sampler_train = DistributedSampler(dataset_train, shuffle=True)
        dataloader_train = DataLoader(dataset_train,
                                      num_workers=num_workers,
                                      batch_size=batch_size,
                                      collate_fn=collateGL,
                                      sampler=sampler_train,
                                      pin_memory=True)
    else:
        dataloader_train = DataLoader(dataset_train,
                                      num_workers=num_workers,
                                      batch_size=batch_size,
                                      collate_fn=collateGL,
                                      shuffle=True,
                                      pin_memory=True)
    transformer_val = TransformerSegGLVal()
    dataset_val = OralDatasetSeg(valset_cfg["img_dir"],
                                 valset_cfg["mask_dir"],
                                 valset_cfg["meta_file"],
                                 label=valset_cfg["label"],
                                 transform=transformer_val)
    # Validation runs on rank 0 only, so a fixed small worker count suffices.
    dataloader_val = DataLoader(dataset_val,
                                num_workers=2,
                                batch_size=batch_size,
                                collate_fn=collateGL,
                                shuffle=False,
                                pin_memory=True)

    ###################################
    print("creating models......")
    path_g = cfg.path_g
    path_g2l = cfg.path_g2l
    path_l2g = cfg.path_l2g
    model = GLNet(n_class, cfg.encoder, **cfg.model_cfg)
    if mode == 3:
        # Mode 3 additionally uses a frozen copy of the global branch.
        global_fixed = GLNet(n_class, cfg.encoder, **cfg.model_cfg)
    else:
        global_fixed = None
    model, global_fixed = create_model_load_weights(model,
                                                    global_fixed,
                                                    device,
                                                    mode=mode,
                                                    distributed=distributed,
                                                    local_rank=local_rank,
                                                    evaluation=False,
                                                    path_g=path_g,
                                                    path_g2l=path_g2l,
                                                    path_l2g=path_l2g)

    ###################################
    num_epochs = cfg.num_epochs
    learning_rate = cfg.lr
    optimizer = get_optimizer(model, mode, learning_rate=learning_rate)
    scheduler = LR_Scheduler(cfg.scheduler, learning_rate, num_epochs,
                             len(dataloader_train))

    ##################################
    if cfg.loss == "ce":
        criterion = nn.CrossEntropyLoss(reduction='mean')
    elif cfg.loss == "sce":
        criterion = SymmetricCrossEntropyLoss(alpha=cfg.alpha,
                                              beta=cfg.beta,
                                              num_classes=cfg.n_class)
    elif cfg.loss == "focal":
        criterion = FocalLoss(gamma=3)
    elif cfg.loss == "ce-dice":
        # NOTE(review): the dice term is not implemented; this currently
        # falls back to plain cross-entropy.
        criterion = nn.CrossEntropyLoss(reduction='mean')

    #######################################
    trainer = Trainer(criterion, optimizer, n_class, size_g, size_p,
                      sub_batch_size, mode, cfg.lamb_fmreg)
    evaluator = Evaluator(n_class, size_g, size_p, sub_batch_size, mode)
    evaluation = cfg.evaluation
    val_vis = cfg.val_vis
    best_pred = 0.0
    print("start training......")

    # log (rank 0 only)
    if local_rank == 0:
        # FIX: was os.path.join(log_path, ".log"), which wrote a single hidden
        # ".log" file shared by every task; name the log after the task instead
        # (consistent with the Trainer.train log naming elsewhere in the project).
        f_log = open(os.path.join(log_path, task_name + ".log"), 'w')
        log = task_name + '\n'
        for k, v in cfg.__dict__.items():
            log += str(k) + ' = ' + str(v) + '\n'
        f_log.write(log)
        f_log.flush()
    # writer (rank 0 only)
    if local_rank == 0:
        writer = SummaryWriter(log_dir=log_path)
        writer_info = {}

    for epoch in range(num_epochs):
        trainer.set_train(model)
        optimizer.zero_grad()
        tbar = tqdm(dataloader_train)
        train_loss = 0
        start_time = time.time()
        for i_batch, sample in enumerate(tbar):
            data_time.update(time.time() - start_time)
            scheduler(optimizer, i_batch, epoch, best_pred)
            loss = trainer.train(sample, model, global_fixed)
            train_loss += loss.item()
            score_train, score_train_global, score_train_local = trainer.get_scores()
            batch_time.update(time.time() - start_time)
            start_time = time.time()
            if i_batch % 20 == 0 and local_rank == 0:  # throttle tqdm updates
                if mode == 1:
                    tbar.set_description(
                        'Train loss: %.4f;global mIoU: %.4f; data time: %.2f; batch time: %.2f'
                        % (train_loss / (i_batch + 1),
                           score_train_global["iou_mean"], data_time.avg,
                           batch_time.avg))
                elif mode == 2:
                    tbar.set_description(
                        'Train loss: %.4f;agg mIoU: %.4f; local mIoU: %.4f; data time: %.2f; batch time: %.2f'
                        % (train_loss / (i_batch + 1),
                           score_train["iou_mean"],
                           score_train_local["iou_mean"], data_time.avg,
                           batch_time.avg))
                else:
                    # FIX: key was misspelled "iouu_mean", which raised a
                    # KeyError the first time mode 3 reached this branch.
                    tbar.set_description(
                        'Train loss: %.4f;agg mIoU: %.4f; global mIoU: %.4f; local mIoU: %.4f; data time: %.2f; batch time: %.2f'
                        % (train_loss / (i_batch + 1),
                           score_train["iou_mean"],
                           score_train_global["iou_mean"],
                           score_train_local["iou_mean"], data_time.avg,
                           batch_time.avg))
        score_train, score_train_global, score_train_local = trainer.get_scores()
        trainer.reset_metrics()
        data_time.reset()
        batch_time.reset()

        if evaluation and epoch % 1 == 0 and local_rank == 0:
            with torch.no_grad():
                model.eval()
                print("evaluating...")
                tbar = tqdm(dataloader_val)
                start_time = time.time()
                for i_batch, sample in enumerate(tbar):
                    data_time.update(time.time() - start_time)
                    predictions, predictions_global, predictions_local = evaluator.eval_test(
                        sample, model, global_fixed)
                    score_val, score_val_global, score_val_local = evaluator.get_scores()
                    batch_time.update(time.time() - start_time)
                    if i_batch % 20 == 0 and local_rank == 0:
                        if mode == 1:
                            tbar.set_description(
                                'global mIoU: %.4f; data time: %.2f; batch time: %.2f'
                                % (score_val_global["iou_mean"],
                                   data_time.avg, batch_time.avg))
                        elif mode == 2:
                            tbar.set_description(
                                'agg mIoU: %.4f; local mIoU: %.4f; data time: %.2f; batch time: %.2f'
                                % (score_val["iou_mean"],
                                   score_val_local["iou_mean"], data_time.avg,
                                   batch_time.avg))
                        else:
                            tbar.set_description(
                                'agg mIoU: %.4f; global mIoU: %.4f; local mIoU: %.4f; data time: %.2f; batch time: %.2f'
                                % (score_val["iou_mean"],
                                   score_val_global["iou_mean"],
                                   score_val_local["iou_mean"], data_time.avg,
                                   batch_time.avg))
                    if val_vis and i_batch == len(tbar) // 2:
                        # Visualize one mid-epoch validation sample in TensorBoard.
                        mask_rgb = class_to_RGB(np.array(sample['mask'][1]))
                        mask_rgb = ToTensor()(mask_rgb)
                        writer_info.update(mask=mask_rgb,
                                           prediction_global=ToTensor()(
                                               class_to_RGB(
                                                   predictions_global[1])))
                        if mode == 2 or mode == 3:
                            # FIX: was writer.update(...); SummaryWriter has no
                            # update() method — the visualization payload dict
                            # is writer_info (as in the line just above).
                            writer_info.update(
                                prediction=ToTensor()(class_to_RGB(
                                    predictions[1])),
                                prediction_local=ToTensor()(class_to_RGB(
                                    predictions_local[1])))
                    start_time = time.time()
                data_time.reset()
                batch_time.reset()
                score_val, score_val_global, score_val_local = evaluator.get_scores()
                evaluator.reset_metrics()

                # save model
                best_pred = save_ckpt_model(model, cfg, score_val,
                                            score_val_global, best_pred, epoch)
                # log
                update_log(
                    f_log, cfg,
                    [score_train, score_train_global, score_train_local],
                    [score_val, score_val_global, score_val_local], epoch)
                # writer: class indices 2/3 are assumed to be mucosa/tumor —
                # TODO confirm against the dataset's label mapping.
                if mode == 1:
                    writer_info.update(
                        loss=train_loss / len(tbar),
                        lr=optimizer.param_groups[0]['lr'],
                        mIOU={
                            "train": score_train_global["iou_mean"],
                            "val": score_val_global["iou_mean"],
                        },
                        global_mIOU={
                            "train": score_train_global["iou_mean"],
                            "val": score_val_global["iou_mean"],
                        },
                        mucosa_iou={
                            "train": score_train_global["iou"][2],
                            "val": score_val_global["iou"][2],
                        },
                        tumor_iou={
                            "train": score_train_global["iou"][3],
                            "val": score_val_global["iou"][3],
                        },
                    )
                else:
                    writer_info.update(
                        loss=train_loss / len(tbar),
                        lr=optimizer.param_groups[0]['lr'],
                        mIOU={
                            "train": score_train["iou_mean"],
                            "val": score_val["iou_mean"],
                        },
                        global_mIOU={
                            "train": score_train_global["iou_mean"],
                            "val": score_val_global["iou_mean"],
                        },
                        local_mIOU={
                            "train": score_train_local["iou_mean"],
                            "val": score_val_local["iou_mean"],
                        },
                        mucosa_iou={
                            "train": score_train["iou"][2],
                            "val": score_val["iou"][2],
                        },
                        tumor_iou={
                            "train": score_train["iou"][3],
                            "val": score_val["iou"][3],
                        },
                    )
                update_writer(writer, writer_info, epoch)
    if local_rank == 0:
        f_log.close()
def train(self, dataset_train, dataset_val, criterion, optimizer_func,
          trainer_func, evaluator_func, collate, dataset_test=None,
          tester_func=None):
    """Run the full train / validate / (optional) slide-level test loop.

    Args:
        dataset_train, dataset_val: patch datasets for training and validation.
        criterion: loss module passed to the trainer factory.
        optimizer_func, trainer_func, evaluator_func, tester_func: factories
            producing the optimizer, trainer, evaluator and slide tester.
        collate: collate_fn for the DataLoaders.
        dataset_test: optional slide-level test dataset; when given (together
            with tester_func), whole-slide testing runs after each validation.

    Side effects (rank 0 only): writes "<log_path><task_name>.log", TensorBoard
    events, per-slide validation/test PNGs, and checkpoints via save_ckpt_model.

    NOTE(review): the per-epoch logging/writer section below reads scores_test,
    test_model_fr and test_seg_fr unconditionally, so calling train() without
    dataset_test (its default) raises NameError once evaluation runs — confirm
    whether a guard is intended.
    """
    if self.distributed:
        sampler_train = DistributedSampler(dataset_train, shuffle=True)
        dataloader_train = DataLoader(dataset_train,
                                      num_workers=self.cfg.num_workers,
                                      batch_size=self.cfg.batch_size,
                                      collate_fn=collate,
                                      sampler=sampler_train,
                                      pin_memory=True)
    else:
        dataloader_train = DataLoader(dataset_train,
                                      num_workers=self.cfg.num_workers,
                                      batch_size=self.cfg.batch_size,
                                      collate_fn=collate,
                                      shuffle=True,
                                      pin_memory=True)
    dataloader_val = DataLoader(dataset_val,
                                num_workers=self.cfg.num_workers,
                                batch_size=self.cfg.batch_size,
                                collate_fn=collate,
                                shuffle=False,
                                pin_memory=True)

    ###################################
    print("creating models......")
    model = self.model_loader(self.model,
                              self.device,
                              distributed=self.distributed,
                              local_rank=self.local_rank,
                              evaluation=True,
                              ckpt_path=self.cfg.ckpt_path)

    ###################################
    num_epochs = self.cfg.num_epochs
    learning_rate = self.cfg.lr
    data_time = AverageMeter("DataTime", ':3.3f')
    batch_time = AverageMeter("BatchTime", ':3.3f')
    optimizer = optimizer_func(model, learning_rate=learning_rate)
    scheduler = LR_Scheduler(self.cfg.scheduler, learning_rate, num_epochs,
                             len(dataloader_train))

    ##################################
    trainer = trainer_func(criterion, optimizer, self.cfg.n_class)
    evaluator = evaluator_func(self.cfg.n_class)
    if tester_func:
        tester = tester_func(self.cfg.n_class, self.cfg.num_workers,
                             self.cfg.batch_size)
    evaluation = self.cfg.evaluation
    val_vis = self.cfg.val_vis
    best_pred = 0.0
    print("start training......")

    # log (rank 0 only)
    if self.local_rank == 0:
        f_log = open(self.cfg.log_path + self.cfg.task_name + ".log", 'w')
        log = self.cfg.task_name + '\n'
        for k, v in self.cfg.__dict__.items():
            log += str(k) + ' = ' + str(v) + '\n'
        print(log)
        f_log.write(log)
        f_log.flush()
    # writer (rank 0 only)
    if self.local_rank == 0:
        writer = SummaryWriter(log_dir=self.cfg.writer_path)
        writer_info = {}

    for epoch in range(num_epochs):
        optimizer.zero_grad()
        num_batch = len(dataloader_train)
        tbar = tqdm(dataloader_train)
        train_loss = 0
        start_time = time.time()
        model.train()
        for i_batch, sample in enumerate(tbar):
            data_time.update(time.time() - start_time)
            scheduler(optimizer, i_batch, epoch, best_pred)
            if self.distributed:
                loss = trainer.train(sample, model)
            else:
                # Single-GPU path uses gradient accumulation (factor 2).
                loss = trainer.train_acc(sample, model, i_batch, 2, num_batch)
            train_loss += loss.item()
            scores_train = trainer.get_scores()
            batch_time.update(time.time() - start_time)
            start_time = time.time()
            if i_batch % 20 == 0 and self.local_rank == 0:  # throttle tqdm
                tbar.set_description(
                    'Train loss: %.4f; mIoU: %.4f; data time: %.2f; batch time: %.2f'
                    % (train_loss / (i_batch + 1), scores_train["iou_mean"],
                       data_time.avg, batch_time.avg))
        trainer.reset_metrics()
        data_time.reset()
        batch_time.reset()
        train_model_fr, train_seg_fr = trainer.calculate_avg_fr()

        if evaluation and epoch % 1 == 0 and self.local_rank == 0:
            with torch.no_grad():
                model.eval()
                ##--** evaluating **--
                print("evaluating...")
                tbar = tqdm(dataloader_val)
                start_time = time.time()
                for i_batch, sample in enumerate(tbar):
                    data_time.update(time.time() - start_time)
                    predictions = evaluator.eval(sample, model)
                    scores_val = evaluator.get_scores()
                    batch_time.update(time.time() - start_time)
                    if i_batch % 20 == 0 and self.local_rank == 0:
                        tbar.set_description(
                            'mIoU: %.4f; data time: %.2f; batch time: %.2f' %
                            (scores_val["iou_mean"], data_time.avg,
                             batch_time.avg))
                    if val_vis and (1 + epoch) % 10 == 0:
                        # Dump colorized predictions every 10th epoch, grouped
                        # per slide (slide id assumed to be the prefix of the
                        # patch id before '_' — TODO confirm naming scheme).
                        for i in range(len(sample['id'])):
                            name = sample['id'][i] + '.png'
                            slide = name.split('_')[0]
                            slide_dir = os.path.join(
                                self.cfg.val_output_path, slide)
                            if not os.path.exists(slide_dir):
                                os.makedirs(slide_dir)
                            predictions_rgb = class_to_RGB(predictions[i])
                            predictions_rgb = cv2.cvtColor(
                                predictions_rgb, cv2.COLOR_BGR2RGB)
                            cv2.imwrite(os.path.join(slide_dir, name),
                                        predictions_rgb)
                    start_time = time.time()
                data_time.reset()
                batch_time.reset()
                scores_val = evaluator.get_scores()
                evaluator.reset_metrics()
                val_model_fr, val_seg_fr = evaluator.calculate_avg_fr()

                ##--** testing **--
                if dataset_test:
                    print("testing...")
                    num_slides = len(dataset_test.slides)
                    tbar2 = tqdm(range(num_slides))
                    start_time = time.time()
                    for i in tbar2:
                        dataset_test.get_patches_from_index(i)
                        data_time.update(time.time() - start_time)
                        predictions, output, _ = tester.inference(
                            dataset_test, model)
                        mask = dataset_test.get_slide_mask_from_index(i)
                        tester.update_scores(mask, predictions)
                        scores_test = tester.get_scores()
                        batch_time.update(time.time() - start_time)
                        tbar2.set_description(
                            'mIoU: %.4f; data time: %.2f; slide time: %.2f' %
                            (scores_test["iou_mean"], data_time.avg,
                             batch_time.avg))
                        output = cv2.cvtColor(output, cv2.COLOR_BGR2RGB)
                        cv2.imwrite(
                            os.path.join(self.cfg.test_output_path,
                                         dataset_test.slide + '.png'), output)
                        start_time = time.time()
                    data_time.reset()
                    batch_time.reset()
                    scores_test = tester.get_scores()
                    tester.reset_metrics()
                    test_model_fr, test_seg_fr = tester.calculate_avg_fr()

                # save model
                best_pred = save_ckpt_model(model, self.cfg, scores_val,
                                            best_pred, epoch)
                # log
                update_log(f_log,
                           self.cfg,
                           scores_train,
                           scores_val, [train_model_fr, train_seg_fr],
                           [val_model_fr, val_seg_fr],
                           epoch,
                           scores_test=scores_test,
                           test_fr=[test_model_fr, test_seg_fr])
                # writer: class indices 2/3 are assumed to be mucosa/tumor
                # (or 2 = merged class when n_class != 4) — TODO confirm.
                if self.cfg.n_class == 4:
                    writer_info.update(
                        loss=train_loss / len(tbar),
                        lr=optimizer.param_groups[0]['lr'],
                        mIOU={
                            "train": scores_train["iou_mean"],
                            "val": scores_val["iou_mean"],
                            "test": scores_test["iou_mean"],
                        },
                        mucosa_iou={
                            "train": scores_train["iou"][2],
                            "val": scores_val["iou"][2],
                            "test": scores_test["iou"][2],
                        },
                        tumor_iou={
                            "train": scores_train["iou"][3],
                            "val": scores_val["iou"][3],
                            "test": scores_test["iou"][3],
                        },
                        mucosa_model_fr={
                            "train": train_model_fr[0],
                            "val": val_model_fr[0],
                            "test": test_model_fr[0],
                        },
                        tumor_model_fr={
                            "train": train_model_fr[1],
                            "val": val_model_fr[1],
                            # FIX: "test" previously reported val_model_fr[1]
                            # (copy-paste error); use the test-set value.
                            "test": test_model_fr[1],
                        },
                        mucosa_seg_fr={
                            "train": train_seg_fr[0],
                            "val": val_seg_fr[0],
                            "test": test_seg_fr[0],
                        },
                        tumor_seg_fr={
                            "train": train_seg_fr[1],
                            "val": val_seg_fr[1],
                            "test": test_seg_fr[1],
                        })
                else:
                    writer_info.update(
                        loss=train_loss / len(tbar),
                        lr=optimizer.param_groups[0]['lr'],
                        mIOU={
                            "train": scores_train["iou_mean"],
                            "val": scores_val["iou_mean"],
                            "test": scores_test["iou_mean"],
                        },
                        merge_iou={
                            "train": scores_train["iou"][2],
                            "val": scores_val["iou"][2],
                            "test": scores_test["iou"][2],
                        },
                        merge_model_fr={
                            "train": train_model_fr[0],
                            "val": val_model_fr[0],
                            "test": test_model_fr[0],
                        },
                        merge_seg_fr={
                            "train": train_seg_fr[0],
                            "val": val_seg_fr[0],
                            # FIX: "test" previously reported val_seg_fr[0]
                            # (copy-paste error); use the test-set value.
                            "test": test_seg_fr[0],
                        })
                update_writer(writer, writer_info, epoch)
    if self.local_rank == 0:
        f_log.close()