from select import select, error as selecterror
from signal import signal, SIGINT, SIG_DFL
from socket import (socket, error as sockerr, has_ipv6, AF_UNSPEC, AF_INET,
                    AF_INET6, SOCK_DGRAM, IPPROTO_UDP)
from sys import exit, stderr
from time import time

# Local imports
from config import config, ConfigError
from config import log, LOG_ERROR, LOG_PRINT, LOG_VERBOSE, LOG_DEBUG
from db import dbconnect
# inet_pton isn't defined on windows, so use our own
from utils import inet_pton, stringtosockaddr, valid_addr

try:
    config.parse()
except ConfigError as err:
    # Note that we don't know how much user config is loaded at this stage
    log(LOG_ERROR, err)
    exit(1)

try:
    log_client, log_gamestat, db_id = dbconnect(config.db)
except ImportError as ex:
    def nodb(*args):
        '''This function is defined and used when the database import fails'''
        log(LOG_DEBUG, 'No database, not logged:', args)
    log_client = log_gamestat = nodb
    log(LOG_PRINT, 'Warning: database not available')
else:
    log(LOG_VERBOSE, db_id)
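# The custom inet_pton imported from utils above is not shown in this file. A
# minimal sketch of such a fallback, assuming the standard-library ipaddress
# module is acceptable, might look like the following (the real utils.inet_pton
# may well be implemented differently):
from ipaddress import ip_address
from socket import AF_INET as _AF_INET


def inet_pton_sketch(family, addr):
    """Pack a textual IPv4/IPv6 address into bytes without socket.inet_pton."""
    packed = ip_address(addr).packed  # raises ValueError on a malformed address
    expected = 4 if family == _AF_INET else 16
    if len(packed) != expected:
        raise ValueError('address does not match the requested family')
    return packed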
def train(**kwargs):
    config.parse(kwargs)
    vis = Visualizer(port=2333, env=config.env)
    vis.log('Use config:')
    for k, v in config.__class__.__dict__.items():
        if not k.startswith('__'):
            vis.log(f"{k}: {getattr(config, k)}")

    # prepare data
    train_data = VB_Dataset(config.train_paths, phase='train', useRGB=config.useRGB,
                            usetrans=config.usetrans, padding=config.padding,
                            balance=config.data_balance)
    val_data = VB_Dataset(config.test_paths, phase='val', useRGB=config.useRGB,
                          usetrans=config.usetrans, padding=config.padding, balance=False)
    print('Training Images:', len(train_data), 'Validation Images:', len(val_data))
    dist = train_data.dist()
    print('Train Data Distribution:', dist, 'Val Data Distribution:', val_data.dist())
    train_dataloader = DataLoader(train_data, batch_size=config.batch_size, shuffle=True,
                                  num_workers=config.num_workers)
    val_dataloader = DataLoader(val_data, batch_size=config.batch_size, shuffle=False,
                                num_workers=config.num_workers)

    # prepare model
    # model = ResNet18(num_classes=config.num_classes)
    # model = Vgg16(num_classes=config.num_classes)
    # model = densenet_collapse(num_classes=config.num_classes)
    model = ShallowVgg(num_classes=config.num_classes)
    print(model)
    if config.load_model_path:
        model.load(config.load_model_path)
    if config.use_gpu:
        model.cuda()
    if config.parallel:
        model = torch.nn.DataParallel(model, device_ids=list(range(config.num_of_gpu)))

    # criterion and optimizer
    # weight = torch.Tensor([1/dist['0'], 1/dist['1'], 1/dist['2'], 1/dist['3']])
    # weight = torch.Tensor([1/dist['0'], 1/dist['1']])
    # weight = torch.Tensor([dist['1'], dist['0']])
    # weight = torch.Tensor([1, 10])
    # vis.log(f'loss weight: {weight}')
    # print('loss weight:', weight)
    # weight = weight.cuda()
    # criterion = torch.nn.CrossEntropyLoss()
    criterion = LabelSmoothing(size=config.num_classes, smoothing=0.1)
    # criterion = torch.nn.CrossEntropyLoss(weight=weight)
    # criterion = FocalLoss(gamma=4, alpha=None)
    lr = config.lr
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=config.weight_decay)

    # metric
    softmax = functional.softmax
    log_softmax = functional.log_softmax
    loss_meter = meter.AverageValueMeter()
    epoch_loss = meter.AverageValueMeter()
    train_cm = meter.ConfusionMeter(config.num_classes)
    train_AUC = meter.AUCMeter()
    previous_avgse = 0
    # previous_AUC = 0

    if config.parallel:
        save_model_dir = config.save_model_dir if config.save_model_dir else model.module.model_name
        save_model_name = config.save_model_name if config.save_model_name else model.module.model_name + '_best_model.pth'
    else:
        save_model_dir = config.save_model_dir if config.save_model_dir else model.model_name
        save_model_name = config.save_model_name if config.save_model_name else model.model_name + '_best_model.pth'
    save_epoch = 1  # epoch of the model that performs best on the validation set

    # process_record = {'epoch_loss': [],  # training curves, recorded for plotting later
    #                   'train_avgse': [], 'train_se0': [], 'train_se1': [], 'train_se2': [], 'train_se3': [],
    #                   'val_avgse': [], 'val_se0': [], 'val_se1': [], 'val_se2': [], 'val_se3': []}
    process_record = {
        'epoch_loss': [],  # training curves, recorded for plotting later
        'train_avgse': [], 'train_se0': [], 'train_se1': [],
        'val_avgse': [], 'val_se0': [], 'val_se1': [],
        'train_AUC': [], 'val_AUC': []
    }

    # train
    for epoch in range(config.max_epoch):
        print(f"epoch: [{epoch + 1}/{config.max_epoch}] {config.save_model_name[:-4]} ==================================")
        epoch_loss.reset()
        train_cm.reset()
        train_AUC.reset()

        # train
        model.train()
        for i, (image, label, image_path) in tqdm(enumerate(train_dataloader)):
            loss_meter.reset()

            # prepare input
            if config.use_gpu:
                image = image.cuda()
                label = label.cuda()

            # go through the model
            score = model(image)

            # backpropagate
            optimizer.zero_grad()
            # loss = criterion(score, label)
            loss = criterion(log_softmax(score, dim=1), label)
            loss.backward()
            optimizer.step()

            loss_meter.add(loss.item())
            epoch_loss.add(loss.item())
            train_cm.add(softmax(score, dim=1).data, label.data)
            positive_score = np.array([item[1] for item in
                                       softmax(score, dim=1).data.cpu().numpy().tolist()])
            train_AUC.add(positive_score, label.data)

            if (i + 1) % config.print_freq == 0:
                vis.plot('loss', loss_meter.value()[0])

        # print result
        # (4-class sensitivity variant, kept for reference)
        # train_se = [100. * train_cm.value()[0][0] / (train_cm.value()[0][0] + train_cm.value()[0][1] + train_cm.value()[0][2] + train_cm.value()[0][3]),
        #             100. * train_cm.value()[1][1] / (train_cm.value()[1][0] + train_cm.value()[1][1] + train_cm.value()[1][2] + train_cm.value()[1][3]),
        #             100. * train_cm.value()[2][2] / (train_cm.value()[2][0] + train_cm.value()[2][1] + train_cm.value()[2][2] + train_cm.value()[2][3]),
        #             100. * train_cm.value()[3][3] / (train_cm.value()[3][0] + train_cm.value()[3][1] + train_cm.value()[3][2] + train_cm.value()[3][3])]
        train_se = [
            100. * train_cm.value()[0][0] / (train_cm.value()[0][0] + train_cm.value()[0][1]),
            100. * train_cm.value()[1][1] / (train_cm.value()[1][0] + train_cm.value()[1][1])
        ]

        # validate
        model.eval()
        if (epoch + 1) % 1 == 0:
            val_cm, val_se, val_accuracy, val_AUC = val_2class(model, val_dataloader)

            if np.average(val_se) > previous_avgse:  # save when the average validation sensitivity improves
            # if val_AUC.value()[0] > previous_AUC:  # or: save when the validation AUC improves
                save_dir = os.path.join('checkpoints', save_model_dir, save_model_name.split('.')[0])
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                if config.parallel:
                    model.module.save(os.path.join(save_dir, save_model_name))
                else:
                    model.save(os.path.join(save_dir, save_model_name))
                previous_avgse = np.average(val_se)
                # previous_AUC = val_AUC.value()[0]
                save_epoch = epoch + 1

            process_record['epoch_loss'].append(epoch_loss.value()[0])
            process_record['train_avgse'].append(np.average(train_se))
            process_record['train_se0'].append(train_se[0])
            process_record['train_se1'].append(train_se[1])
            # process_record['train_se2'].append(train_se[2])
            # process_record['train_se3'].append(train_se[3])
            process_record['train_AUC'].append(train_AUC.value()[0])
            process_record['val_avgse'].append(np.average(val_se))
            process_record['val_se0'].append(val_se[0])
            process_record['val_se1'].append(val_se[1])
            # process_record['val_se2'].append(val_se[2])
            # process_record['val_se3'].append(val_se[3])
            process_record['val_AUC'].append(val_AUC.value()[0])

            # (4-class logging variant, kept for reference)
            # vis.plot_many({'epoch_loss': epoch_loss.value()[0],
            #                'train_avgse': np.average(train_se), 'train_se0': train_se[0], 'train_se1': train_se[1], 'train_se2': train_se[2], 'train_se3': train_se[3],
            #                'val_avgse': np.average(val_se), 'val_se0': val_se[0], 'val_se1': val_se[1], 'val_se2': val_se[2], 'val_se3': val_se[3]})
            # vis.log(f"epoch: [{epoch+1}/{config.max_epoch}] =========================================")
            # vis.log(f"lr: {optimizer.param_groups[0]['lr']}, loss: {round(loss_meter.value()[0], 5)}")
            # vis.log(f"train_avgse: {round(np.average(train_se), 4)}, train_se0: {round(train_se[0], 4)}, train_se1: {round(train_se[1], 4)}, train_se2: {round(train_se[2], 4)}, train_se3: {round(train_se[3], 4)}")
            # vis.log(f"val_avgse: {round(np.average(val_se), 4)}, val_se0: {round(val_se[0], 4)}, val_se1: {round(val_se[1], 4)}, val_se2: {round(val_se[2], 4)}, val_se3: {round(val_se[3], 4)}")
            # vis.log(f'train_cm: {train_cm.value()}')
            # vis.log(f'val_cm: {val_cm.value()}')
            # print("lr:", optimizer.param_groups[0]['lr'], "loss:", round(epoch_loss.value()[0], 5))
            # print('train_avgse:', round(np.average(train_se), 4), 'train_se0:', round(train_se[0], 4), 'train_se1:', round(train_se[1], 4), 'train_se2:', round(train_se[2], 4), 'train_se3:', round(train_se[3], 4))
            # print('val_avgse:', round(np.average(val_se), 4), 'val_se0:', round(val_se[0], 4), 'val_se1:', round(val_se[1], 4), 'val_se2:', round(val_se[2], 4), 'val_se3:', round(val_se[3], 4))
            # print('train_cm:')
            # print(train_cm.value())
            # print('val_cm:')
            # print(val_cm.value())

            vis.plot_many({
                'epoch_loss': epoch_loss.value()[0],
                'train_avgse': np.average(train_se), 'train_se0': train_se[0], 'train_se1': train_se[1],
                'val_avgse': np.average(val_se), 'val_se0': val_se[0], 'val_se1': val_se[1],
                'train_AUC': train_AUC.value()[0], 'val_AUC': val_AUC.value()[0]
            })
            vis.log(f"epoch: [{epoch + 1}/{config.max_epoch}] =========================================")
            vis.log(f"lr: {optimizer.param_groups[0]['lr']}, loss: {round(loss_meter.value()[0], 5)}")
            vis.log(f"train_avgse: {round(np.average(train_se), 4)}, train_se0: {round(train_se[0], 4)}, train_se1: {round(train_se[1], 4)}")
            vis.log(f"val_avgse: {round(np.average(val_se), 4)}, val_se0: {round(val_se[0], 4)}, val_se1: {round(val_se[1], 4)}")
            vis.log(f'train_AUC: {train_AUC.value()[0]}')
            vis.log(f'val_AUC: {val_AUC.value()[0]}')
            vis.log(f'train_cm: {train_cm.value()}')
            vis.log(f'val_cm: {val_cm.value()}')
            print("lr:", optimizer.param_groups[0]['lr'], "loss:", round(epoch_loss.value()[0], 5))
            print('train_avgse:', round(np.average(train_se), 4),
                  'train_se0:', round(train_se[0], 4), 'train_se1:', round(train_se[1], 4))
            print('val_avgse:', round(np.average(val_se), 4),
                  'val_se0:', round(val_se[0], 4), 'val_se1:', round(val_se[1], 4))
            print('train_AUC:', train_AUC.value()[0], 'val_AUC:', val_AUC.value()[0])
            print('train_cm:')
            print(train_cm.value())
            print('val_cm:')
            print(val_cm.value())

        if os.path.exists(os.path.join('checkpoints', save_model_dir, save_model_name.split('.')[0])):
            write_json(file=os.path.join('checkpoints', save_model_dir,
                                         save_model_name.split('.')[0], 'process_record.json'),
                       content=process_record)

        # if (epoch+1) % 5 == 0:
        #     lr = lr * config.lr_decay
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr

    vis.log(f"Best Epoch: {save_epoch}")
    print("Best Epoch:", save_epoch)
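# val_2class is defined elsewhere in this repo. A minimal sketch that matches
# this call site -- returning the confusion meter, the two per-class
# sensitivities, the accuracy, and an AUC meter -- might look like the
# following (an illustration only; the real val_2class may differ):
def val_2class_sketch(model, dataloader):
    """Evaluate a binary classifier on `dataloader`."""
    cm = meter.ConfusionMeter(2)
    auc = meter.AUCMeter()
    model.eval()
    with torch.no_grad():
        for image, label, _ in dataloader:
            if config.use_gpu:
                image, label = image.cuda(), label.cuda()
            prob = functional.softmax(model(image), dim=1)
            cm.add(prob.data, label.data)
            auc.add(prob.data[:, 1], label.data)  # positive-class probability
    v = cm.value()
    se = [100. * v[0][0] / (v[0][0] + v[0][1]),   # class-0 sensitivity (specificity)
          100. * v[1][1] / (v[1][0] + v[1][1])]   # class-1 sensitivity
    accuracy = 100. * (v[0][0] + v[1][1]) / v.sum()
    return cm, se, accuracy, auc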
def iter_train(**kwargs):
    config.parse(kwargs)

    # ============================================ Visualization =============================================
    # vis = Visualizer(port=2333, env=config.env)
    # vis.log('Use config:')
    # for k, v in config.__class__.__dict__.items():
    #     if not k.startswith('__'):
    #         vis.log(f"{k}: {getattr(config, k)}")

    # ============================================= Prepare Data =============================================
    train_data = VB_Dataset(config.train_paths, phase='train', num_classes=config.num_classes,
                            useRGB=config.useRGB, usetrans=config.usetrans,
                            padding=config.padding, balance=config.data_balance)
    val_data = VB_Dataset(config.test_paths, phase='val', num_classes=config.num_classes,
                          useRGB=config.useRGB, usetrans=config.usetrans,
                          padding=config.padding, balance=config.data_balance)
    train_dist, val_dist = train_data.dist(), val_data.dist()
    train_data_scale, val_data_scale = train_data.scale, val_data.scale
    print('Training Images:', len(train_data), 'Validation Images:', len(val_data))
    print('Train Data Distribution:', train_dist, 'Val Data Distribution:', val_dist)
    train_dataloader = DataLoader(train_data, batch_size=config.batch_size, shuffle=True,
                                  num_workers=config.num_workers)
    val_dataloader = DataLoader(val_data, batch_size=config.batch_size, shuffle=False,
                                num_workers=config.num_workers)

    # ============================================= Prepare Model ============================================
    # model = ResNet18(num_classes=config.num_classes)
    # model = ResNet34(num_classes=config.num_classes)
    # model = ResNet50(num_classes=config.num_classes)
    model = Vgg16(num_classes=config.num_classes)
    # model = AlexNet(num_classes=config.num_classes)
    # model = densenet_collapse(num_classes=config.num_classes)
    # model = ShallowVgg(num_classes=config.num_classes)
    # model = CustomedNet(num_classes=config.num_classes)
    # model = DualNet(num_classes=config.num_classes)
    # model = SkipResNet18(num_classes=config.num_classes)
    # model = DensResNet18(num_classes=config.num_classes)
    # model = GuideResNet18(num_classes=config.num_classes)
    # print(model)
    if config.load_model_path:
        model.load(config.load_model_path)
    if config.use_gpu:
        model.cuda()
    if config.parallel:
        model = torch.nn.DataParallel(model, device_ids=list(range(config.num_of_gpu)))

    # =========================================== Criterion and Optimizer =====================================
    # weight = torch.Tensor([1/dist['0'], 1/dist['1'], 1/dist['2'], 1/dist['3']])
    # weight = torch.Tensor([1/dist['0'], 1/dist['1']])
    # weight = torch.Tensor([dist['1'], dist['0']])
    # weight = torch.Tensor([1, 10])
    # vis.log(f'loss weight: {weight}')
    # print('loss weight:', weight)
    # weight = weight.cuda()
    criterion = torch.nn.CrossEntropyLoss()
    # criterion = torch.nn.CrossEntropyLoss(weight=weight)
    # criterion = LabelSmoothing(size=config.num_classes, smoothing=0.2)
    # criterion = FocalLoss(gamma=4, alpha=None)
    lr = config.lr
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=config.weight_decay)

    # ================================================== Metrics ===============================================
    log_softmax = functional.log_softmax
    loss_meter = meter.AverageValueMeter()

    # ====================================== Saving and Recording Configuration =================================
    previous_AUC = 0
    previous_mAP = 0
    save_iter = 1  # iteration of the model that performs best on the validation set
    if config.parallel:
        save_model_dir = config.save_model_dir if config.save_model_dir else model.module.model_name
        save_model_name = config.save_model_name if config.save_model_name else model.module.model_name + '_best_model.pth'
    else:
        save_model_dir = config.save_model_dir if config.save_model_dir else model.model_name
        save_model_name = config.save_model_name if config.save_model_name else model.model_name + '_best_model.pth'

    if config.num_classes == 2:  # binary classification
        process_record = {
            'loss': [],  # training curves, recorded for plotting later
            'train_avg': [], 'train_sp': [], 'train_se': [],
            'val_avg': [], 'val_sp': [], 'val_se': [],
            'train_AUC': [], 'val_AUC': []
        }
    elif config.num_classes == 3:  # 3-class classification
        process_record = {
            'loss': [],  # training curves, recorded for plotting later
            'train_sp0': [], 'train_se0': [], 'train_sp1': [], 'train_se1': [], 'train_sp2': [], 'train_se2': [],
            'val_sp0': [], 'val_se0': [], 'val_sp1': [], 'val_se1': [], 'val_sp2': [], 'val_se2': [],
            'train_mAUC': [], 'val_mAUC': [],
            'train_mAP': [], 'val_mAP': []
        }
    else:
        raise ValueError

    # ================================================== Training ===============================================
    iteration = 0

    # ****************************************** train ****************************************
    train_iter = iter(train_dataloader)
    model.train()
    while iteration < config.max_iter:
        try:
            image, label, image_path = next(train_iter)
        except StopIteration:
            train_iter = iter(train_dataloader)
            image, label, image_path = next(train_iter)
        iteration += 1

        # ------------------------------------ prepare input ------------------------------------
        if config.use_gpu:
            image = image.cuda()
            label = label.cuda()

        # ---------------------------------- go through the model --------------------------------
        score = model(image)

        # ----------------------------------- backpropagate -------------------------------------
        optimizer.zero_grad()
        loss = criterion(score, label)
        # loss = criterion(log_softmax(score, dim=1), label)  # LabelSmoothing
        loss.backward()
        optimizer.step()

        # ------------------------------------ record loss ------------------------------------
        loss_meter.add(loss.item())

        if iteration % config.print_freq == 0:
            tqdm.write(f"iter: [{iteration}/{config.max_iter}] {config.save_model_name[:-4]} ==================================")

            # *************************************** validate ***************************************
            if config.num_classes == 2:  # binary classification
                model.eval()
                train_cm, train_AUC, train_sp, train_se, train_T, train_accuracy = val_2class(
                    model, train_dataloader, train_dist)
                val_cm, val_AUC, val_sp, val_se, val_T, val_accuracy = val_2class(
                    model, val_dataloader, val_dist)
                # vis.plot('loss', loss_meter.value()[0])
                model.train()

                # ------------------------------------ save model ------------------------------------
                if val_AUC > previous_AUC:  # save when the validation AUC improves
                    save_dir = os.path.join('checkpoints', save_model_dir, save_model_name[:-4])
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    if config.parallel:
                        model.module.save(os.path.join(save_dir, save_model_name))
                    else:
                        model.save(os.path.join(save_dir, save_model_name))
                    previous_AUC = val_AUC
                    save_iter = iteration

                # ---------------------------------- record and print ---------------------------------
                process_record['loss'].append(loss_meter.value()[0])
                process_record['train_avg'].append((train_sp + train_se) / 2)
                process_record['train_sp'].append(train_sp)
                process_record['train_se'].append(train_se)
                process_record['train_AUC'].append(train_AUC)
                process_record['val_avg'].append((val_sp + val_se) / 2)
                process_record['val_sp'].append(val_sp)
                process_record['val_se'].append(val_se)
                process_record['val_AUC'].append(val_AUC)

                # vis.plot_many({'loss': loss_meter.value()[0],
                #                'train_avg': (train_sp + train_se) / 2, 'train_sp': train_sp, 'train_se': train_se,
                #                'val_avg': (val_sp + val_se) / 2, 'val_sp': val_sp, 'val_se': val_se,
                #                'train_AUC': train_AUC, 'val_AUC': val_AUC})
                # vis.log(f"iter: [{iteration}/{config.max_iter}] =========================================")
                # vis.log(f"lr: {optimizer.param_groups[0]['lr']}, loss: {round(loss_meter.value()[0], 5)}")
                # vis.log(f"train_avg: {round((train_sp + train_se) / 2, 4)}, train_sp: {round(train_sp, 4)}, train_se: {round(train_se, 4)}")
                # vis.log(f"val_avg: {round((val_sp + val_se) / 2, 4)}, val_sp: {round(val_sp, 4)}, val_se: {round(val_se, 4)}")
                # vis.log(f'train_AUC: {train_AUC}')
                # vis.log(f'val_AUC: {val_AUC}')
                # vis.log(f'train_cm: {train_cm}')
                # vis.log(f'val_cm: {val_cm}')
                print("lr:", optimizer.param_groups[0]['lr'], "loss:", round(loss_meter.value()[0], 5))
                print('train_avg:', round((train_sp + train_se) / 2, 4),
                      'train_sp:', round(train_sp, 4), 'train_se:', round(train_se, 4))
                print('val_avg:', round((val_sp + val_se) / 2, 4),
                      'val_sp:', round(val_sp, 4), 'val_se:', round(val_se, 4))
                print('train_AUC:', train_AUC, 'val_AUC:', val_AUC)
                print('train_cm:')
                print(train_cm)
                print('val_cm:')
                print(val_cm)

            elif config.num_classes == 3:  # 3-class classification
                model.eval()
                train_cm, train_mAP, train_sp, train_se, train_mAUC, train_accuracy = val_3class(
                    model, train_dataloader, train_data_scale)
                val_cm, val_mAP, val_sp, val_se, val_mAUC, val_accuracy = val_3class(
                    model, val_dataloader, val_data_scale)
                model.train()

                # ------------------------------------ save model ------------------------------------
                if val_mAP > previous_mAP:  # save when the validation mAP improves
                    save_dir = os.path.join('checkpoints', save_model_dir, save_model_name[:-4])
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    if config.parallel:
                        model.module.save(os.path.join(save_dir, save_model_name))
                    else:
                        model.save(os.path.join(save_dir, save_model_name))
                    previous_mAP = val_mAP
                    save_iter = iteration

                # ---------------------------------- record and print ---------------------------------
                process_record['loss'].append(loss_meter.value()[0])
                process_record['train_sp0'].append(train_sp[0])
                process_record['train_se0'].append(train_se[0])
                process_record['train_sp1'].append(train_sp[1])
                process_record['train_se1'].append(train_se[1])
                process_record['train_sp2'].append(train_sp[2])
                process_record['train_se2'].append(train_se[2])
                process_record['train_mAUC'].append(float(train_mAUC))
                process_record['train_mAP'].append(float(train_mAP))
                process_record['val_sp0'].append(val_sp[0])
                process_record['val_se0'].append(val_se[0])
                process_record['val_sp1'].append(val_sp[1])
                process_record['val_se1'].append(val_se[1])
                process_record['val_sp2'].append(val_sp[2])
                process_record['val_se2'].append(val_se[2])
                process_record['val_mAUC'].append(float(val_mAUC))
                process_record['val_mAP'].append(float(val_mAP))

                # vis.plot_many({'loss': loss_meter.value()[0],
                #                'train_sp0': train_sp[0], 'train_sp1': train_sp[1], 'train_sp2': train_sp[2],
                #                'train_se0': train_se[0], 'train_se1': train_se[1], 'train_se2': train_se[2],
                #                'val_sp0': val_sp[0], 'val_sp1': val_sp[1], 'val_sp2': val_sp[2],
                #                'val_se0': val_se[0], 'val_se1': val_se[1], 'val_se2': val_se[2],
                #                'train_mAP': train_mAP, 'val_mAP': val_mAP})
                # vis.log(f"iter: [{iteration}/{config.max_iter}] =========================================")
                # vis.log(f"lr: {optimizer.param_groups[0]['lr']}, loss: {round(loss_meter.value()[0], 5)}")
                # vis.log(f"train_sp0: {round(train_sp[0], 4)}, train_sp1: {round(train_sp[1], 4)}, train_sp2: {round(train_sp[2], 4)}")
                # vis.log(f"train_se0: {round(train_se[0], 4)}, train_se1: {round(train_se[1], 4)}, train_se2: {round(train_se[2], 4)}")
                # vis.log(f"val_sp0: {round(val_sp[0], 4)}, val_sp1: {round(val_sp[1], 4)}, val_sp2: {round(val_sp[2], 4)}")
                # vis.log(f"val_se0: {round(val_se[0], 4)}, val_se1: {round(val_se[1], 4)}, val_se2: {round(val_se[2], 4)}")
                # vis.log(f"train_mAP: {train_mAP}, val_mAP: {val_mAP}")
                # vis.log(f'train_cm: {train_cm}')
                # vis.log(f'val_cm: {val_cm}')
                print("lr:", optimizer.param_groups[0]['lr'], "loss:", round(loss_meter.value()[0], 5))
                print('train_sp0:', round(train_sp[0], 4), 'train_sp1:', round(train_sp[1], 4), 'train_sp2:', round(train_sp[2], 4))
                print('train_se0:', round(train_se[0], 4), 'train_se1:', round(train_se[1], 4), 'train_se2:', round(train_se[2], 4))
                print('val_sp0:', round(val_sp[0], 4), 'val_sp1:', round(val_sp[1], 4), 'val_sp2:', round(val_sp[2], 4))
                print('val_se0:', round(val_se[0], 4), 'val_se1:', round(val_se[1], 4), 'val_se2:', round(val_se[2], 4))
                print('mSP:', round(sum(val_sp) / 3, 5), 'mSE:', round(sum(val_se) / 3, 5))
                print('train_mAUC:', train_mAUC, 'val_mAUC:', val_mAUC)
                print('train_mAP:', train_mAP, 'val_mAP:', val_mAP)
                print('train_cm:')
                print(train_cm)
                print('val_cm:')
                print(val_cm)
                print('Best mAP:', previous_mAP)

            loss_meter.reset()

            # ------------------------------------ save record ------------------------------------
            if os.path.exists(os.path.join('checkpoints', save_model_dir, save_model_name[:-4])):
                write_json(file=os.path.join('checkpoints', save_model_dir,
                                             save_model_name[:-4], 'process_record.json'),
                           content=process_record)

    # vis.log(f"Best Iter: {save_iter}")
    print("Best Iter:", save_iter)
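# write_json comes from a local utility module that is not shown here. A
# sketch consistent with how it is called above (keyword arguments `file` and
# `content`, where `content` is a dict of plain lists and numbers) might be:
import json


def write_json_sketch(file, content):
    """Dump the process record to disk as JSON."""
    with open(file, 'w') as f:
        json.dump(content, f, indent=2)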
def test(**kwargs):
    config.parse(kwargs)

    # prepare data
    test_data = PairSWDataset(config.test_paths, phase='test', useRGB=config.useRGB,
                              usetrans=config.usetrans, balance=False)
    test_dataloader = DataLoader(test_data, batch_size=config.batch_size, shuffle=False,
                                 num_workers=config.num_workers)
    print('Test Images:', len(test_data))

    # prepare model
    model = SiameseNet(num_classes=config.num_classes)
    print(model)
    if config.load_model_path:
        model.load(config.load_model_path)
        print('Model has been loaded!')
    else:
        print("Don't load model")
    if config.use_gpu:
        model.cuda()
    if config.parallel:
        model = torch.nn.DataParallel(model, device_ids=list(range(config.num_of_gpu)))
    model.eval()

    test_cm = meter.ConfusionMeter(config.num_classes)
    softmax = functional.softmax
    results = []

    # go through the model
    for i, (image, label, image_path) in tqdm(enumerate(test_dataloader)):
        img = Variable(image, volatile=True)
        target = Variable(label)
        if config.use_gpu:
            img = img.cuda()
            target = target.cuda()

        score = model(img)
        test_cm.add(softmax(score, dim=1).data, target.data)

        for l, p, ip in zip(label, softmax(score, dim=1).data, image_path):
            if p[0] >= p[1]:
                results.append((ip, l, 0, round(p[0], 4), round(p[1], 4)))
            else:
                results.append((ip, l, 1, round(p[0], 4), round(p[1], 4)))

        # for p, ip in zip(softmax(score, dim=1).data, image_path):
        #     # print(p)
        #     b = ip.split('/')[-1].split('.')[0].split('_')[2:6]
        #     if p[1] >= 0.5:
        #         if ip.split('/')[-2] in positive_bbox.keys():
        #             positive_bbox[ip.split('/')[-2]].append((int(b[0]), int(b[1]), int(b[2]), int(b[3]), p[1]))
        #         else:
        #             positive_bbox[ip.split('/')[-2]] = [(int(b[0]), int(b[1]), int(b[2]), int(b[3]), p[1])]
        #     else:
        #         pass

    ACC = 100. * sum([test_cm.value()[c][c] for c in range(config.num_classes)]) / test_cm.value().sum()
    SE = 100. * test_cm.value()[1][1] / (test_cm.value()[1][0] + test_cm.value()[1][1])
    print('confusion matrix:')
    print(test_cm.value())
    print('test accuracy:', ACC)
    print('Sensitivity:', SE)

    if config.result_file:
        write_csv(os.path.join('results', config.result_file),
                  tag=['path', 'label', 'predict', 'p1', 'p2'],
                  content=results)
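# write_csv is likewise assumed to come from a local utility module. A sketch
# consistent with the call above (a header row `tag` followed by the collected
# result tuples) might be:
import csv


def write_csv_sketch(path, tag, content):
    """Write a header row and then the result rows to `path`."""
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(tag)
        writer.writerows(content)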
def train_pair(**kwargs):
    config.parse(kwargs)
    vis = Visualizer(port=2333, env=config.env)
    vis.log('Use config:')
    for k, v in config.__class__.__dict__.items():
        if not k.startswith('__'):
            vis.log(f"{k}: {getattr(config, k)}")

    # prepare data
    train_data = PairSWDataset(config.train_paths, phase='train', useRGB=config.useRGB,
                               usetrans=config.usetrans, balance=config.data_balance)
    valpair_data = PairSWDataset(config.test_paths, phase='val_pair', useRGB=config.useRGB,
                                 usetrans=config.usetrans, balance=False)
    print('Training Samples:', len(train_data), 'ValPair Samples:', len(valpair_data))
    dist = train_data.dist()
    print('Train Data Distribution:', dist)
    train_dataloader = DataLoader(train_data, batch_size=config.batch_size, shuffle=True,
                                  num_workers=config.num_workers)
    valpair_dataloader = DataLoader(valpair_data, batch_size=config.batch_size, shuffle=False,
                                    num_workers=config.num_workers)

    # prepare model
    model = SiameseNet(num_classes=config.num_classes)
    print(model)
    if config.load_model_path:
        model.load(config.load_model_path)
    if config.use_gpu:
        model.cuda()
    if config.parallel:
        model = torch.nn.DataParallel(model, device_ids=list(range(config.num_of_gpu)))
    model.train()

    # criterion and optimizer
    weight_pair = torch.Tensor([1, 1.5])
    vis.log(f'pair loss weight: {weight_pair}')
    print('pair loss weight:', weight_pair)
    weight_pair = weight_pair.cuda()
    pair_criterion = torch.nn.CrossEntropyLoss(weight=weight_pair)
    lr = config.lr
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=config.weight_decay)

    # metric
    softmax = functional.softmax
    pair_loss_meter = meter.AverageValueMeter()
    pair_epoch_loss = meter.AverageValueMeter()
    pair_train_cm = meter.ConfusionMeter(config.num_classes)
    # previous_loss = 100
    pair_previous_avg_se = 0

    # train
    if config.parallel:
        if not os.path.exists(os.path.join('checkpoints', model.module.model_name)):
            os.mkdir(os.path.join('checkpoints', model.module.model_name))
    else:
        if not os.path.exists(os.path.join('checkpoints', model.model_name)):
            os.mkdir(os.path.join('checkpoints', model.model_name))

    for epoch in range(config.max_epoch):
        print(f"epoch: [{epoch+1}/{config.max_epoch}] =============================================")
        pair_train_cm.reset()
        pair_epoch_loss.reset()

        # train (switch back to training mode after the previous epoch's validation)
        model.train()
        for i, (image_1, image_2, label_1, label_2, label_res, _, _) in tqdm(enumerate(train_dataloader)):
            pair_loss_meter.reset()

            # prepare input
            image_1 = Variable(image_1)
            image_2 = Variable(image_2)
            target_res = Variable(label_res)
            if config.use_gpu:
                image_1 = image_1.cuda()
                image_2 = image_2.cuda()
                target_res = target_res.cuda()

            # go through the model
            score_1, score_2, score_res = model(image_1, image_2)

            # backpropagate
            optimizer.zero_grad()
            pair_loss = pair_criterion(score_res, target_res)
            pair_loss.backward()
            optimizer.step()

            pair_loss_meter.add(pair_loss.data[0])
            pair_epoch_loss.add(pair_loss.data[0])
            pair_train_cm.add(softmax(score_res, dim=1).data, target_res.data)

            if (i+1) % config.print_freq == 0:
                vis.plot('loss', pair_loss_meter.value()[0])

        # print result
        pair_train_se = [100. * pair_train_cm.value()[0][0] / (pair_train_cm.value()[0][0] + pair_train_cm.value()[0][1]),
                         100. * pair_train_cm.value()[1][1] / (pair_train_cm.value()[1][0] + pair_train_cm.value()[1][1])]

        model.eval()
        pair_val_cm, pair_val_accuracy, pair_val_se = val_pair(model, valpair_dataloader)

        if np.average(pair_val_se) > pair_previous_avg_se:  # save when the average validation sensitivity improves
            if config.parallel:
                save_model_dir = config.save_model_dir if config.save_model_dir else model.module.model_name
                save_model_name = config.save_model_name if config.save_model_name else model.module.model_name + '_best_model.pth'
                if not os.path.exists(os.path.join('checkpoints', save_model_dir)):
                    os.makedirs(os.path.join('checkpoints', save_model_dir))
                model.module.save(os.path.join('checkpoints', save_model_dir, save_model_name))
            else:
                save_model_dir = config.save_model_dir if config.save_model_dir else model.model_name
                save_model_name = config.save_model_name if config.save_model_name else model.model_name + '_best_model.pth'
                if not os.path.exists(os.path.join('checkpoints', save_model_dir)):
                    os.makedirs(os.path.join('checkpoints', save_model_dir))
                model.save(os.path.join('checkpoints', save_model_dir, save_model_name))
            pair_previous_avg_se = np.average(pair_val_se)

        if epoch+1 == config.max_epoch:  # also save the last model
            if config.parallel:
                save_model_dir = config.save_model_dir if config.save_model_dir else model.module.model_name
                save_model_name = config.save_model_name.split('.pth')[0]+'_last.pth' if config.save_model_name else model.module.model_name + '_last_model.pth'
            else:
                save_model_dir = config.save_model_dir if config.save_model_dir else model.model_name
                save_model_name = config.save_model_name.split('.pth')[0]+'_last.pth' if config.save_model_name else model.model_name + '_last_model.pth'
            if not os.path.exists(os.path.join('checkpoints', save_model_dir)):
                os.makedirs(os.path.join('checkpoints', save_model_dir))
            if config.parallel:  # DataParallel wraps the model, so save through .module
                model.module.save(os.path.join('checkpoints', save_model_dir, save_model_name))
            else:
                model.save(os.path.join('checkpoints', save_model_dir, save_model_name))

        vis.plot_many({'epoch_loss': pair_epoch_loss.value()[0],
                       'pair_train_avg_se': np.average(pair_train_se),
                       'pair_train_se_0': pair_train_se[0], 'pair_train_se_1': pair_train_se[1],
                       'pair_val_avg_se': np.average(pair_val_se),
                       'pair_val_se_0': pair_val_se[0], 'pair_val_se_1': pair_val_se[1]})
        vis.log(f"epoch: [{epoch+1}/{config.max_epoch}] ===============================================")
        vis.log(f"lr: {lr}, loss: {round(pair_epoch_loss.value()[0], 5)}")
        vis.log(f"pair_train_avg_se: {round(np.average(pair_train_se), 4)}, pair_train_se_0: {round(pair_train_se[0], 4)}, pair_train_se_1: {round(pair_train_se[1], 4)}")
        vis.log(f"pair_val_avg_se: {round(sum(pair_val_se) / len(pair_val_se), 4)}, pair_val_se_0: {round(pair_val_se[0], 4)}, pair_val_se_1: {round(pair_val_se[1], 4)}")
        vis.log(f'pair_train_cm: {pair_train_cm.value()}')
        vis.log(f'pair_val_cm: {pair_val_cm.value()}')
        print("lr:", lr, "loss:", round(pair_epoch_loss.value()[0], 5))
        print('pair_train_avg_se:', round(np.average(pair_train_se), 4),
              'pair_train_se_0:', round(pair_train_se[0], 4), 'pair_train_se_1:', round(pair_train_se[1], 4))
        print('pair_val_avg_se:', round(np.average(pair_val_se), 4),
              'pair_val_se_0:', round(pair_val_se[0], 4), 'pair_val_se_1:', round(pair_val_se[1], 4))
        print('pair_train_cm:')
        print(pair_train_cm.value())
        print('pair_val_cm:')
        print(pair_val_cm.value())

        # update learning rate
        # if loss_meter.value()[0] > previous_loss:
        #     lr = lr * config.lr_decay
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr
        # previous_loss = loss_meter.value()[0]
        if (epoch+1) % 5 == 0:
            lr = lr * config.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
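# val_pair is defined elsewhere in this repo. A sketch consistent with the
# call above -- returning the pair-head confusion meter, the accuracy, and the
# two per-class sensitivities -- might look like this (illustration only):
def val_pair_sketch(model, dataloader):
    """Evaluate the same/different pair head of a SiameseNet."""
    cm = meter.ConfusionMeter(2)
    model.eval()
    with torch.no_grad():
        for image_1, image_2, _, _, label_res, _, _ in dataloader:
            if config.use_gpu:
                image_1, image_2 = image_1.cuda(), image_2.cuda()
                label_res = label_res.cuda()
            _, _, score_res = model(image_1, image_2)
            cm.add(functional.softmax(score_res, dim=1).data, label_res.data)
    v = cm.value()
    accuracy = 100. * (v[0][0] + v[1][1]) / v.sum()
    se = [100. * v[0][0] / (v[0][0] + v[0][1]),
          100. * v[1][1] / (v[1][0] + v[1][1])]
    return cm, accuracy, se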
def train(**kwargs):
    config.parse(kwargs)

    # ============================================ Visualization =============================================
    vis = Visualizer(port=2333, env=config.env)
    vis.log('Use config:')
    for k, v in config.__class__.__dict__.items():
        if not k.startswith('__'):
            vis.log(f"{k}: {getattr(config, k)}")

    # ============================================= Prepare Data =============================================
    train_data_1 = SlideWindowDataset(config.train_paths, phase='train', useRGB=config.useRGB,
                                      usetrans=config.usetrans, balance=config.data_balance)
    train_data_2 = SlideWindowDataset(config.train_paths, phase='train', useRGB=config.useRGB,
                                      usetrans=config.usetrans, balance=config.data_balance)
    val_data = SlideWindowDataset(config.test_paths, phase='val', useRGB=config.useRGB,
                                  usetrans=config.usetrans, balance=False)
    print('Training Images:', len(train_data_1), 'Validation Images:', len(val_data))
    dist = train_data_1.dist()
    print('Train Data Distribution:', dist)
    train_dataloader_1 = DataLoader(train_data_1, batch_size=config.batch_size, shuffle=True,
                                    num_workers=config.num_workers)
    train_dataloader_2 = DataLoader(train_data_2, batch_size=config.batch_size, shuffle=True,
                                    num_workers=config.num_workers)
    val_dataloader = DataLoader(val_data, batch_size=config.batch_size, shuffle=False,
                                num_workers=config.num_workers)

    # ============================================= Prepare Model ============================================
    # model = PCResNet18(num_classes=config.num_classes)
    model = DualResNet18(num_classes=config.num_classes)
    print(model)
    if config.load_model_path:
        model.load(config.load_model_path)
    if config.use_gpu:
        model.cuda()
    if config.parallel:
        model = torch.nn.DataParallel(model, device_ids=list(range(config.num_of_gpu)))

    # =========================================== Criterion and Optimizer =====================================
    # weight = torch.Tensor([1, 1])
    # weight = torch.Tensor([dist['1']/(dist['0']+dist['1']), dist['0']/(dist['0']+dist['1'])])
    # (the two weights must be swapped relative to the class frequencies; beyond two classes, use reciprocals)
    # weight = torch.Tensor([1, 3.5])
    # weight = torch.Tensor([1, 5])
    weight = torch.Tensor([1, 7])
    vis.log(f'loss weight: {weight}')
    print('loss weight:', weight)
    weight = weight.cuda()
    criterion = torch.nn.CrossEntropyLoss(weight=weight)
    MSELoss = torch.nn.MSELoss()
    sycriterion = torch.nn.CrossEntropyLoss()
    lr = config.lr
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=config.weight_decay)

    # ================================================== Metrics ===============================================
    softmax = functional.softmax
    loss_meter = meter.AverageValueMeter()
    epoch_loss = meter.AverageValueMeter()
    mse_meter = meter.AverageValueMeter()
    epoch_mse = meter.AverageValueMeter()
    syloss_meter = meter.AverageValueMeter()
    epoch_syloss = meter.AverageValueMeter()
    total_loss_meter = meter.AverageValueMeter()
    epoch_total_loss = meter.AverageValueMeter()
    train_cm = meter.ConfusionMeter(config.num_classes)

    # ====================================== Saving and Recording Configuration =================================
    previous_auc = 0
    if config.parallel:
        save_model_dir = config.save_model_dir if config.save_model_dir else model.module.model_name
        save_model_name = config.save_model_name if config.save_model_name else model.module.model_name + '_best_model.pth'
    else:
        save_model_dir = config.save_model_dir if config.save_model_dir else model.model_name
        save_model_name = config.save_model_name if config.save_model_name else model.model_name + '_best_model.pth'
    save_epoch = 1  # epoch of the model that performs best on the validation set
    process_record = {'epoch_loss': [],  # training curves, recorded for plotting later
                      'train_avg_se': [], 'train_se_0': [], 'train_se_1': [],
                      'val_avg_se': [], 'val_se_0': [], 'val_se_1': [],
                      'AUC': []}

    # ================================================== Training ===============================================
    for epoch in range(config.max_epoch):
        print(f"epoch: [{epoch+1}/{config.max_epoch}] {config.save_model_name[:-4]} ==================================")
        train_cm.reset()
        epoch_loss.reset()
        epoch_mse.reset()
        epoch_syloss.reset()
        epoch_total_loss.reset()

        # ****************************************** train ****************************************
        model.train()
        for i, (item1, item2) in tqdm(enumerate(zip(train_dataloader_1, train_dataloader_2))):
            loss_meter.reset()
            mse_meter.reset()
            syloss_meter.reset()
            total_loss_meter.reset()

            # ------------------------------------ prepare input ------------------------------------
            image1, label1, image_path1 = item1
            image2, label2, image_path2 = item2
            if config.use_gpu:
                image1 = image1.cuda()
                image2 = image2.cuda()
                label1 = label1.cuda()
                label2 = label2.cuda()

            # ---------------------------------- go through the model --------------------------------
            # score1, score2, logits1, logits2 = model(image1, image2)  # Pairwise Confusion Network
            score1, score2, score3 = model(image1, image2)  # Dual CNN

            # ----------------------------------- backpropagate -------------------------------------
            # Variant 1: L2 penalty between the features of the two branches
            # optimizer.zero_grad()
            # cls_loss1 = criterion(score1, label1)
            # cls_loss2 = criterion(score2, label2)
            #
            # ch_weight = torch.where(label1 == label2, torch.Tensor([0]).cuda(), torch.Tensor([1]).cuda())
            # ch_weight = ch_weight.view(logits1.size(0), -1)
            # mse = MSELoss(logits1 * ch_weight, logits2 * ch_weight)  # only penalize different-class pairs; same-class pairs are zeroed
            #
            # total_loss = cls_loss1 + cls_loss2 + 10 * mse
            # total_loss.backward()
            # optimizer.step()

            # Variant 2 (active): a head on the two branches' logits predicts
            # whether the pair belongs to the same class
            optimizer.zero_grad()
            cls_loss1 = criterion(score1, label1)
            cls_loss2 = criterion(score2, label2)
            sylabel = torch.where(label1 == label2,
                                  torch.Tensor([0]).cuda(), torch.Tensor([1]).cuda()).long()
            sy_loss = sycriterion(score3, sylabel)
            total_loss = cls_loss1 + cls_loss2 + 2 * sy_loss
            total_loss.backward()
            optimizer.step()

            # ------------------------------------ record loss ------------------------------------
            loss_meter.add((cls_loss1 + cls_loss2).item())
            # mse_meter.add(mse.item())
            # syloss_meter.add(sy_loss.item())
            # total_loss_meter.add(total_loss.item())
            epoch_loss.add((cls_loss1 + cls_loss2).item())
            # epoch_mse.add(mse.item())
            epoch_syloss.add(sy_loss.item())
            epoch_total_loss.add(total_loss.item())
            train_cm.add(softmax(score1, dim=1).detach(), label1.detach())

            if (i+1) % config.print_freq == 0:
                vis.plot('loss', loss_meter.value()[0])

        train_se = [100. * train_cm.value()[0][0] / (train_cm.value()[0][0] + train_cm.value()[0][1]),
                    100. * train_cm.value()[1][1] / (train_cm.value()[1][0] + train_cm.value()[1][1])]

        # *************************************** validate ***************************************
        model.eval()
        if (epoch + 1) % 1 == 0:
            Best_T, val_cm, val_spse, val_accuracy, AUC = val(model, val_dataloader)

            # ------------------------------------ save model ------------------------------------
            if AUC > previous_auc and epoch + 1 > 5:  # after epoch 5, save when the validation AUC improves
                save_dir = os.path.join('checkpoints', save_model_dir, save_model_name.split('.')[0])
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                if config.parallel:
                    model.module.save(os.path.join(save_dir, save_model_name))
                else:
                    model.save(os.path.join(save_dir, save_model_name))
                previous_auc = AUC
                save_epoch = epoch + 1

            # ---------------------------------- record and print ---------------------------------
            process_record['epoch_loss'].append(epoch_loss.value()[0])
            process_record['train_avg_se'].append(np.average(train_se))
            process_record['train_se_0'].append(train_se[0])
            process_record['train_se_1'].append(train_se[1])
            process_record['val_avg_se'].append(np.average(val_spse))
            process_record['val_se_0'].append(val_spse[0])
            process_record['val_se_1'].append(val_spse[1])
            process_record['AUC'].append(AUC)

            # vis.plot('epoch_mse', epoch_mse.value()[0])
            vis.plot('epoch_syloss', epoch_syloss.value()[0])
            vis.plot_many({'epoch_loss': epoch_loss.value()[0],
                           'epoch_total_loss': epoch_total_loss.value()[0],
                           'train_avg_se': np.average(train_se),
                           'train_se_0': train_se[0], 'train_se_1': train_se[1],
                           'val_avg_se': np.average(val_spse),
                           'val_se_0': val_spse[0], 'val_se_1': val_spse[1],
                           'AUC': AUC})
            vis.log(f"epoch: [{epoch+1}/{config.max_epoch}] =========================================")
            vis.log(f"lr: {optimizer.param_groups[0]['lr']}, loss: {round(loss_meter.value()[0], 5)}")
            vis.log(f"train_avg_se: {round(np.average(train_se), 4)}, train_se_0: {round(train_se[0], 4)}, train_se_1: {round(train_se[1], 4)}")
            vis.log(f"val_avg_se: {round(sum(val_spse)/len(val_spse), 4)}, val_se_0: {round(val_spse[0], 4)}, val_se_1: {round(val_spse[1], 4)}")
            vis.log(f"AUC: {AUC}")
            vis.log(f'train_cm: {train_cm.value()}')
            vis.log(f'Best Threshold: {Best_T}')
            vis.log(f'val_cm: {val_cm}')
            print("lr:", optimizer.param_groups[0]['lr'], "loss:", round(epoch_loss.value()[0], 5))
            print('train_avg_se:', round(np.average(train_se), 4),
                  'train_se_0:', round(train_se[0], 4), 'train_se_1:', round(train_se[1], 4))
            print('val_avg_se:', round(np.average(val_spse), 4),
                  'val_se_0:', round(val_spse[0], 4), 'val_se_1:', round(val_spse[1], 4))
            print('AUC:', AUC)
            print('train_cm:')
            print(train_cm.value())
            print('Best Threshold:', Best_T, 'val_cm:')
            print(val_cm)

        # ------------------------------------ save record ------------------------------------
        if os.path.exists(os.path.join('checkpoints', save_model_dir, save_model_name.split('.')[0])):
            write_json(file=os.path.join('checkpoints', save_model_dir,
                                         save_model_name.split('.')[0], 'process_record.json'),
                       content=process_record)

        # if (epoch+1) % 5 == 0:
        #     lr = lr * config.lr_decay
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr

    vis.log(f"Best Epoch: {save_epoch}")
    print("Best Epoch:", save_epoch)
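# A tiny worked example of the same-class target built with torch.where above
# (hypothetical label values): 0 marks positions where the two branches see
# the same class, 1 marks positions where they differ.
def _sylabel_demo():
    a = torch.tensor([0, 1, 1, 0])
    b = torch.tensor([0, 0, 1, 1])
    sylabel = torch.where(a == b, torch.tensor([0]), torch.tensor([1])).long()
    assert sylabel.tolist() == [0, 1, 0, 1]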
def train(**kwargs):
    config.parse(kwargs)

    # ============================================ Visualization =============================================
    vis = Visualizer(port=2333, env=config.env)
    vis.log('Use config:')
    for k, v in config.__class__.__dict__.items():
        if not k.startswith('__'):
            vis.log(f"{k}: {getattr(config, k)}")

    # ============================================= Prepare Data =============================================
    train_data = SlideWindowDataset(config.train_paths, phase='train', useRGB=config.useRGB,
                                    usetrans=config.usetrans, balance=config.data_balance)
    val_data = SlideWindowDataset(config.test_paths, phase='val', useRGB=config.useRGB,
                                  usetrans=config.usetrans, balance=False)
    print('Training Images:', len(train_data), 'Validation Images:', len(val_data))
    dist = train_data.dist()
    print('Train Data Distribution:', dist)
    train_dataloader = DataLoader(train_data, batch_size=config.batch_size, shuffle=True,
                                  num_workers=config.num_workers)
    val_dataloader = DataLoader(val_data, batch_size=config.batch_size, shuffle=False,
                                num_workers=config.num_workers)

    # ============================================= Prepare Model ============================================
    model = UNet_Classifier(num_classes=config.num_classes)
    print(model)
    if config.load_model_path:
        model.load(config.load_model_path)
        print('Model loaded')
    if config.use_gpu:
        model.cuda()
    if config.parallel:
        model = torch.nn.DataParallel(model, device_ids=list(range(config.num_of_gpu)))

    # =========================================== Criterion and Optimizer =====================================
    # weight = torch.Tensor([1, 1])
    # weight = torch.Tensor([dist['1']/(dist['0']+dist['1']), dist['0']/(dist['0']+dist['1'])])
    # (the two weights must be swapped relative to the class frequencies; beyond two classes, use reciprocals)
    # weight = torch.Tensor([1, 3.5])
    # weight = torch.Tensor([1, 5])
    weight = torch.Tensor([1, 7])
    vis.log(f'loss weight: {weight}')
    print('loss weight:', weight)
    weight = weight.cuda()
    criterion = torch.nn.CrossEntropyLoss(weight=weight)
    lr = config.lr
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=config.weight_decay)

    # ================================================== Metrics ===============================================
    softmax = functional.softmax
    loss_meter_edge = meter.AverageValueMeter()
    epoch_loss_edge = meter.AverageValueMeter()
    loss_meter_cls = meter.AverageValueMeter()
    epoch_loss_cls = meter.AverageValueMeter()
    loss_meter = meter.AverageValueMeter()
    epoch_loss = meter.AverageValueMeter()
    train_cm = meter.ConfusionMeter(config.num_classes)

    # ====================================== Saving and Recording Configuration =================================
    previous_auc = 0
    if config.parallel:
        save_model_dir = config.save_model_dir if config.save_model_dir else model.module.model_name
        save_model_name = config.save_model_name if config.save_model_name else model.module.model_name + '_best_model.pth'
    else:
        save_model_dir = config.save_model_dir if config.save_model_dir else model.model_name
        save_model_name = config.save_model_name if config.save_model_name else model.model_name + '_best_model.pth'
    save_epoch = 1  # epoch of the model that performs best on the validation set
    process_record = {
        'epoch_loss': [], 'epoch_loss_edge': [], 'epoch_loss_cls': [],
        'train_avg_se': [], 'train_se_0': [], 'train_se_1': [],
        'val_avg_se': [], 'val_se_0': [], 'val_se_1': [],
        'AUC': [], 'DICE': []
    }  # training curves, recorded for plotting later

    # ================================================== Training ===============================================
    for epoch in range(config.max_epoch):
        print(f"epoch: [{epoch + 1}/{config.max_epoch}] {config.save_model_name[:-4]} ==================================")
        train_cm.reset()
        epoch_loss.reset()
        dice = []

        # ****************************************** train ****************************************
        model.train()
        for i, (image, label, edge_mask, image_path) in tqdm(enumerate(train_dataloader)):
            loss_meter.reset()

            # ------------------------------------ prepare input ------------------------------------
            if config.use_gpu:
                image = image.cuda()
                label = label.cuda()
                edge_mask = edge_mask.cuda()

            # ---------------------------------- go through the model --------------------------------
            score, score_mask = model(x=image)

            # ----------------------------------- backpropagate -------------------------------------
            optimizer.zero_grad()

            # classification loss
            loss_cls = criterion(score, label)

            # loss on the pixels inside the edge mask
            log_prob_mask = functional.logsigmoid(score_mask)
            count_edge = torch.sum(edge_mask, dim=(1, 2, 3), keepdim=True)
            loss_edge = -1 * torch.mean(
                torch.sum(edge_mask * log_prob_mask, dim=(1, 2, 3), keepdim=True) / (count_edge + 1e-8))

            # loss on the pixels outside the edge mask
            r_prob_mask = torch.Tensor([1.0]).cuda() - torch.sigmoid(score_mask)
            r_edge_mask = torch.Tensor([1.0]).cuda() - edge_mask
            log_rprob_mask = torch.log(r_prob_mask + 1e-5)
            count_redge = torch.sum(r_edge_mask, dim=(1, 2, 3), keepdim=True)
            loss_redge = -1 * torch.mean(
                torch.sum(r_edge_mask * log_rprob_mask, dim=(1, 2, 3), keepdim=True) / (count_redge + 1e-8))

            # weights follow the foreground/background pixel counts
            w1 = torch.sum(count_edge).item() / (torch.sum(count_edge).item() + torch.sum(count_redge).item())
            w2 = torch.sum(count_redge).item() / (torch.sum(count_edge).item() + torch.sum(count_redge).item())
            loss = loss_cls + w1 * loss_edge + w2 * loss_redge
            loss.backward()
            optimizer.step()

            # ------------------------------------ record loss ------------------------------------
            loss_meter_edge.add((w1 * loss_edge + w2 * loss_redge).item())
            epoch_loss_edge.add((w1 * loss_edge + w2 * loss_redge).item())
            loss_meter_cls.add(loss_cls.item())
            epoch_loss_cls.add(loss_cls.item())
            loss_meter.add(loss.item())
            epoch_loss.add(loss.item())
            train_cm.add(softmax(score, dim=1).detach(), label.detach())
            dice.append(dice_coeff(input=(score_mask > 0.5).float(),
                                   target=edge_mask[:, 0, :, :]).item())

            if (i + 1) % config.print_freq == 0:
                vis.plot_many({'loss': loss_meter.value()[0],
                               'loss_edge': loss_meter_edge.value()[0],
                               'loss_cls': loss_meter_cls.value()[0]})

        train_se = [100. * train_cm.value()[0][0] / (train_cm.value()[0][0] + train_cm.value()[0][1]),
                    100. * train_cm.value()[1][1] / (train_cm.value()[1][0] + train_cm.value()[1][1])]
        train_dice = sum(dice) / len(dice)

        # *************************************** validate ***************************************
        model.eval()
        if (epoch + 1) % 1 == 0:
            Best_T, val_cm, val_spse, val_accuracy, AUC, val_dice = val(model, val_dataloader)

            # ------------------------------------ save model ------------------------------------
            if AUC > previous_auc and epoch + 1 > 5:  # after epoch 5, save when the validation AUC improves
                save_dir = os.path.join('checkpoints', save_model_dir, save_model_name[:-4])
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                if config.parallel:
                    model.module.save(os.path.join(save_dir, save_model_name))
                else:
                    model.save(os.path.join(save_dir, save_model_name))
                previous_auc = AUC
                save_epoch = epoch + 1

            # ---------------------------------- record and print ---------------------------------
            process_record['epoch_loss'].append(epoch_loss.value()[0])
            process_record['epoch_loss_edge'].append(epoch_loss_edge.value()[0])
            process_record['epoch_loss_cls'].append(epoch_loss_cls.value()[0])
            process_record['train_avg_se'].append(np.average(train_se))
            process_record['train_se_0'].append(train_se[0])
            process_record['train_se_1'].append(train_se[1])
            process_record['val_avg_se'].append(np.average(val_spse))
            process_record['val_se_0'].append(val_spse[0])
            process_record['val_se_1'].append(val_spse[1])
            process_record['AUC'].append(AUC)
            process_record['DICE'].append(val_dice)

            vis.plot_many({
                'epoch_loss': epoch_loss.value()[0],
                'epoch_loss_edge': epoch_loss_edge.value()[0],
                'epoch_loss_cls': epoch_loss_cls.value()[0],
                'train_avg_se': np.average(train_se),
                'train_se_0': train_se[0], 'train_se_1': train_se[1],
                'val_avg_se': np.average(val_spse),
                'val_se_0': val_spse[0], 'val_se_1': val_spse[1],
                'AUC': AUC, 'train_dice': train_dice, 'val_dice': val_dice
            })
            vis.log(f"epoch: [{epoch + 1}/{config.max_epoch}] ===============================================")
            vis.log(f"lr: {optimizer.param_groups[0]['lr']}, loss: {round(loss_meter.value()[0], 5)}")
            vis.log(f"train_avg_se: {round(np.average(train_se), 4)}, train_se_0: {round(train_se[0], 4)}, train_se_1: {round(train_se[1], 4)}")
            vis.log(f"train_dice: {round(train_dice, 4)}")
            vis.log(f"val_avg_se: {round(sum(val_spse) / len(val_spse), 4)}, val_se_0: {round(val_spse[0], 4)}, val_se_1: {round(val_spse[1], 4)}")
            vis.log(f"val_dice: {round(val_dice, 4)}")
            vis.log(f"AUC: {AUC}")
            vis.log(f'train_cm: {train_cm.value()}')
            vis.log(f'Best Threshold: {Best_T}')
            vis.log(f'val_cm: {val_cm}')
            print("lr:", optimizer.param_groups[0]['lr'], "loss:", round(epoch_loss.value()[0], 5))
            print('train_avg_se:', round(np.average(train_se), 4),
                  'train_se_0:', round(train_se[0], 4), 'train_se_1:', round(train_se[1], 4))
            print('train_dice:', train_dice)
            print('val_avg_se:', round(np.average(val_spse), 4),
                  'val_se_0:', round(val_spse[0], 4), 'val_se_1:', round(val_spse[1], 4))
            print('val_dice:', val_dice)
            print('AUC:', AUC)
            print('train_cm:')
            print(train_cm.value())
            print('Best Threshold:', Best_T, 'val_cm:')
            print(val_cm)

        # ------------------------------------ save record ------------------------------------
        if os.path.exists(os.path.join('checkpoints', save_model_dir, save_model_name[:-4])):
            write_json(file=os.path.join('checkpoints', save_model_dir,
                                         save_model_name[:-4], 'process_record.json'),
                       content=process_record)

        # if (epoch+1) % 20 == 0:
        #     lr = lr * config.lr_decay
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr

    vis.log(f"Best Epoch: {save_epoch}")
    print("Best Epoch:", save_epoch)
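# dice_coeff is imported from a local module. A minimal sketch consistent with
# the call above (binarized prediction and target masks, scalar tensor result)
# might be:
def dice_coeff_sketch(input, target, eps=1e-8):
    """Dice = 2*|A∩B| / (|A| + |B|) over flattened binary masks."""
    input = input.contiguous().view(-1).float()
    target = target.contiguous().view(-1).float()
    intersection = torch.sum(input * target)
    return (2. * intersection + eps) / (torch.sum(input) + torch.sum(target) + eps)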
def iter_train(**kwargs): config.parse(kwargs) # ============================================ Visualization ============================================= # vis = Visualizer(port=2333, env=config.env) # vis.log('Use config:') # for k, v in config.__class__.__dict__.items(): # if not k.startswith('__'): # vis.log(f"{k}: {getattr(config, k)}") # ============================================= Prepare Data ============================================= train_data = ContextVB_Dataset(config.train_paths, phase='train', num_classes=config.num_classes, useRGB=config.useRGB, usetrans=config.usetrans, padding=config.padding, balance=config.data_balance) val_data = ContextVB_Dataset(config.test_paths, phase='val', num_classes=config.num_classes, useRGB=config.useRGB, usetrans=False, padding=config.padding, balance=config.data_balance) train_dist, val_dist = train_data.dist(), val_data.dist() train_data_scale, val_data_scale = train_data.scale, val_data.scale print('Training Images:', train_data.__len__(), 'Validation Images:', val_data.__len__()) print('Train Data Distribution:', train_dist, 'Val Data Distribution:', val_dist) train_dataloader = DataLoader(train_data, batch_size=config.batch_size, shuffle=True, num_workers=config.num_workers) val_dataloader = DataLoader(val_data, batch_size=config.batch_size, shuffle=False, num_workers=config.num_workers) # ============================================= Prepare Model ============================================ model = ContextAlexNet(num_classes=config.num_classes) # model = ContextVgg16(num_classes=config.num_classes) # model = ContextResNet18(num_classes=config.num_classes) # model = ContextShareNet(num_classes=config.num_classes) # model = ContextResNet50(num_classes=config.num_classes) # print(model) if config.load_model_path: model.load(config.load_model_path) if config.use_gpu: model.cuda() if config.parallel: model = torch.nn.DataParallel(model, device_ids=list(range( config.num_of_gpu))) # =========================================== Criterion and Optimizer ===================================== # criterion = torch.nn.CrossEntropyLoss(reduction='mean') criterion = torch.nn.CrossEntropyLoss( reduction='none') # for Self-paced Learning # criterion = LabelSmoothing(size=config.num_classes, smoothing=0.2) # criterion = LabelSmoothing(size=config.num_classes, smoothing=0.2, reduction='none') # for Self-paced Learning # criterion = FocalLoss(gamma=4, alpha=None) MSELoss = torch.nn.MSELoss() lr = config.lr optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=config.weight_decay) # ================================================== Metrics =============================================== log_softmax = functional.log_softmax loss_meter = meter.AverageValueMeter() mse_meter1_2 = meter.AverageValueMeter() mse_meter2_3 = meter.AverageValueMeter() total_loss_meter = meter.AverageValueMeter() # ====================================== Saving and Recording Configuration ================================= previous_AUC = 0 previous_mAP = 0 save_iter = 1 # 用于记录验证集上效果最好模型对应的epoch if config.parallel: save_model_dir = config.save_model_dir if config.save_model_dir else model.module.model_name save_model_name = config.save_model_name if config.save_model_name else model.module.model_name + '_best_model.pth' else: save_model_dir = config.save_model_dir if config.save_model_dir else model.model_name save_model_name = config.save_model_name if config.save_model_name else model.model_name + '_best_model.pth' if config.num_classes == 2: # 2分类 process_record = { 
'loss': [], 'mse': [], # 用于记录实验过程中的曲线,便于画曲线图 'train_avg': [], 'train_sp': [], 'train_se': [], 'val_avg': [], 'val_sp': [], 'val_se': [], 'train_AUC': [], 'val_AUC': [] } elif config.num_classes == 3: # 3分类 process_record = { 'loss': [], 'mse': [], # 用于记录实验过程中的曲线,便于画曲线图 'train_sp0': [], 'train_se0': [], 'train_sp1': [], 'train_se1': [], 'train_sp2': [], 'train_se2': [], 'val_sp0': [], 'val_se0': [], 'val_sp1': [], 'val_se1': [], 'val_sp2': [], 'val_se2': [], 'train_mAUC': [], 'val_mAUC': [], 'train_mAP': [], 'val_mAP': [] } else: raise ValueError # ================================================== Training =============================================== iteration = 0 # ****************************************** train **************************************** train_iter = iter(train_dataloader) model.train() while iteration < config.max_iter: # Fine-tune with clean data after 4000 epochs # if iteration == 4000: # train_data = ContextVB_Dataset(config.train_paths, phase='train', num_classes=config.num_classes, # useRGB=config.useRGB, usetrans=False, padding=config.padding, # balance=config.data_balance) # train_dataloader = DataLoader(train_data, batch_size=config.batch_size, shuffle=True, num_workers=config.num_workers) # train_iter = iter(train_dataloader) try: image, label, image_path = next(train_iter) except: train_iter = iter(train_dataloader) image, label, image_path = next(train_iter) iteration += 1 # ------------------------------------ prepare input ------------------------------------ if config.use_gpu: last_image, cur_image, next_image = image[0].cuda(), image[1].cuda( ), image[2].cuda() last_label, cur_label, next_label = label[0].cuda(), label[1].cuda( ), label[2].cuda() else: last_image, cur_image, next_image = image[0], image[1], image[2] last_label, cur_label, next_label = label[0], label[1], label[2] # ---------------------------------- go through the model -------------------------------- # score = model(last_image, cur_image, next_image) score, diff1, diff2 = model(last_image, cur_image, next_image) # score, f1, f2, f3 = model(last_image, cur_image, next_image) # ----------------------------------- backpropagate ------------------------------------- # 单支loss # optimizer.zero_grad() # loss = criterion(score, cur_label) # # loss = criterion(log_softmax(score, dim=1), cur_label) # LabelSmoothing # loss.backward() # optimizer.step() # 加入每两支之间的回归loss # optimizer.zero_grad() # loss = criterion(score, cur_label) # # loss = criterion(log_softmax(score, dim=1), cur_label) # LabelSmoothing # mse1_2 = MSELoss(diff1, torch.abs(cur_label - last_label).float()) # mse2_3 = MSELoss(diff2, torch.abs(cur_label - next_label).float()) # total_loss = loss + 0.2 * (mse1_2 + mse2_3) # total_loss.backward() # optimizer.step() # 使用Self-paced Learning + 每两支之间的MSE loss if iteration < 500: optimizer.zero_grad() loss = criterion(score, cur_label) # loss = criterion(log_softmax(score, dim=1), cur_label) # LabelSmoothing loss = torch.sum(loss) / config.batch_size mse1_2 = MSELoss(diff1, torch.abs(cur_label - last_label).float()) mse2_3 = MSELoss(diff2, torch.abs(cur_label - next_label).float()) total_loss = loss + 0.2 * (mse1_2 + mse2_3) total_loss.backward() optimizer.step() else: optimizer.zero_grad() loss = criterion(score, cur_label) # loss = criterion(log_softmax(score, dim=1), cur_label) # LabelSmoothing T = np.percentile(loss.data.cpu().numpy(), 90) loss = torch.where(loss > T, torch.Tensor([0]).cuda(), loss) count = torch.sum( torch.where(loss > 0, torch.Tensor([1]).cuda(), loss)) loss = 
        # mimic the pairwise-loss design: add an L2 penalty between the features of two branches
        # optimizer.zero_grad()
        # loss = criterion(score, cur_label)
        # ch_weight12 = torch.where(cur_label == last_label, torch.Tensor([0]).cuda(), torch.Tensor([1]).cuda())
        # ch_weight23 = torch.where(cur_label == next_label, torch.Tensor([0]).cuda(), torch.Tensor([1]).cuda())
        # ch_weight12 = ch_weight12.view(cur_label.size(0), 1, 1, 1)
        # ch_weight23 = ch_weight23.view(cur_label.size(0), 1, 1, 1)
        # mse1_2 = MSELoss(f1 * ch_weight12, f2 * ch_weight12)  # only penalize pairs from different classes; same-class pairs are zeroed
        # mse2_3 = MSELoss(f2 * ch_weight23, f3 * ch_weight23)
        # total_loss = loss + 1 * (mse1_2 + mse2_3)
        # total_loss.backward()
        # optimizer.step()

        # ------------------------------------ record loss ------------------------------------
        loss_meter.add(loss.item())
        mse_meter1_2.add(mse1_2.item())
        mse_meter2_3.add(mse2_3.item())
        total_loss_meter.add(total_loss.item())

        if iteration % config.print_freq == 0:
            tqdm.write(
                f"iter: [{iteration}/{config.max_iter}] {config.save_model_name[:-4]} =================================="
            )

            # *************************************** validate ***************************************
            if config.num_classes == 2:  # binary classification
                model.eval()
                train_cm, train_AUC, train_sp, train_se, train_T, train_accuracy = val_2class(
                    model, train_dataloader, train_dist)
                val_cm, val_AUC, val_sp, val_se, val_T, val_accuracy = val_2class(
                    model, val_dataloader, val_dist)
                model.train()

                # ------------------------------------ save model ------------------------------------
                if val_AUC > previous_AUC:  # save the model whenever the validation AUC improves
                    save_path = os.path.join('checkpoints', save_model_dir, save_model_name[:-4])
                    if not os.path.exists(save_path):
                        os.makedirs(save_path)
                    if config.parallel:
                        model.module.save(os.path.join(save_path, save_model_name))
                    else:
                        model.save(os.path.join(save_path, save_model_name))
                    previous_AUC = val_AUC
                    save_iter = iteration
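                # Design note: os.makedirs(save_path, exist_ok=True) would fold the
                # exists()/makedirs() pair above into one call and avoid a race between
                # the check and the creation (available since Python 3.2).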
4)}") # vis.log(f"val_avg: {round((val_sp + val_se) / 2, 4)}, val_sp: {round(val_sp, 4)}, val_se: {round(val_se, 4)}") # vis.log(f'train_AUC: {train_AUC}') # vis.log(f'val_AUC: {val_AUC}') # vis.log(f'train_cm: {train_cm}') # vis.log(f'val_cm: {val_cm}') print("lr:", optimizer.param_groups[0]['lr'], "loss:", round(loss_meter.value()[0], 5)) print('train_avg:', round((train_sp + train_se) / 2, 4), 'train_sp:', round(train_sp, 4), 'train_se:', round(train_se, 4)) print('val_avg:', round((val_sp + val_se) / 2, 4), 'val_sp:', round(val_sp, 4), 'val_se:', round(val_se, 4)) print('train_AUC:', train_AUC, 'val_AUC:', val_AUC) print('train_cm:') print(train_cm) print('val_cm:') print(val_cm) elif config.num_classes == 3: # 3分类 model.eval() train_cm, train_mAP, train_sp, train_se, train_mAUC, train_accuracy = val_3class( model, train_dataloader, train_data_scale) val_cm, val_mAP, val_sp, val_se, val_mAUC, val_accuracy = val_3class( model, val_dataloader, val_data_scale) model.train() # ------------------------------------ save model ------------------------------------ if val_mAP > previous_mAP: # 当测试集上的mAP升高时保存模型 if config.parallel: if not os.path.exists( os.path.join('checkpoints', save_model_dir, save_model_name[:-4])): os.makedirs( os.path.join('checkpoints', save_model_dir, save_model_name[:-4])) model.module.save( os.path.join('checkpoints', save_model_dir, save_model_name[:-4], save_model_name)) else: if not os.path.exists( os.path.join('checkpoints', save_model_dir, save_model_name[:-4])): os.makedirs( os.path.join('checkpoints', save_model_dir, save_model_name[:-4])) model.save( os.path.join('checkpoints', save_model_dir, save_model_name[:-4], save_model_name)) previous_mAP = val_mAP save_iter = iteration # ---------------------------------- recond and print --------------------------------- process_record['loss'].append(loss_meter.value()[0]) process_record['mse'].append(mse_meter1_2.value()[0] + mse_meter2_3.value()[0]) process_record['train_sp0'].append(train_sp[0]) process_record['train_se0'].append(train_se[0]) process_record['train_sp1'].append(train_sp[1]) process_record['train_se1'].append(train_se[1]) process_record['train_sp2'].append(train_sp[2]) process_record['train_se2'].append(train_se[2]) process_record['train_mAUC'].append(float(train_mAUC)) process_record['train_mAP'].append(float(train_mAP)) process_record['val_sp0'].append(val_sp[0]) process_record['val_se0'].append(val_se[0]) process_record['val_sp1'].append(val_sp[1]) process_record['val_se1'].append(val_se[1]) process_record['val_sp2'].append(val_sp[2]) process_record['val_se2'].append(val_se[2]) process_record['val_mAUC'].append(float(val_mAUC)) process_record['val_mAP'].append(float(val_mAP)) # vis.plot_many({'mse1': mse_meter1_2.value()[0], 'mse2': mse_meter2_3.value()[0], # 'total_loss': total_loss_meter.value()[0]}) # vis.plot_many({'loss': loss_meter.value()[0], # 'train_sp0': train_se[0], 'train_sp1': train_se[1], 'train_sp2': train_se[2], # 'train_se0': train_se[0], 'train_se1': train_se[1], 'train_se2': train_se[2], # 'val_sp0': val_se[0], 'val_sp1': val_se[1], 'val_sp2': val_se[2], # 'val_se0': val_se[0], 'val_se1': val_se[1], 'val_se2': val_se[2], # 'train_mAP': train_mAP, 'val_mAP': val_mAP}) # vis.log(f"iter: [{iteration}/{config.max_iter}] =========================================") # vis.log(f"lr: {optimizer.param_groups[0]['lr']}, loss: {round(loss_meter.value()[0], 5)}") # vis.log(f"train_sp0: {round(train_sp[0], 4)}, train_sp1: {round(train_sp[1], 4)}, train_sp2: {round(train_sp[2], 4)}") # 
vis.log(f"train_se0: {round(train_se[0], 4)}, train_se1: {round(train_se[1], 4)}, train_se2: {round(train_se[2], 4)}") # vis.log(f"val_sp0: {round(val_sp[0], 4)}, val_sp1: {round(val_sp[1], 4)}, val_sp2: {round(val_sp[2], 4)}") # vis.log(f"val_se0: {round(val_se[0], 4)}, val_se1: {round(val_se[1], 4)}, val_se2: {round(val_se[2], 4)}") # vis.log(f"train_mAP: {train_mAP}, val_mAP: {val_mAP}") # vis.log(f'train_cm: {train_cm}') # vis.log(f'val_cm: {val_cm}') # print("lr:", optimizer.param_groups[0]['lr'], "loss:", round(loss_meter.value()[0], 5)) print( "lr:", optimizer.param_groups[0]['lr'], "loss:", round(loss_meter.value()[0], 5), "mse:", round(mse_meter1_2.value()[0] + mse_meter2_3.value()[0], 5)) print('train_sp0:', round(train_sp[0], 4), 'train_sp1:', round(train_sp[1], 4), 'train_sp2:', round(train_sp[2], 4)) print('train_se0:', round(train_se[0], 4), 'train_se1:', round(train_se[1], 4), 'train_se2:', round(train_se[2], 4)) print('val_sp0:', round(val_sp[0], 4), 'val_sp1:', round(val_sp[1], 4), 'val_sp2:', round(val_sp[2], 4)) print('val_se0:', round(val_se[0], 4), 'val_se1:', round(val_se[1], 4), 'val_se2:', round(val_se[2], 4)) print('mSP:', round(sum(val_sp) / 3, 5), 'mSE:', round(sum(val_se) / 3, 5)) print('train_mAUC:', train_mAUC, 'val_mAUC:', val_mAUC) print('train_mAP:', train_mAP, 'val_mAP:', val_mAP) print('train_cm:') print(train_cm) print('val_cm:') print(val_cm) print('Best mAP:', previous_mAP) loss_meter.reset() # ------------------------------------ save record ------------------------------------ if os.path.exists( os.path.join('checkpoints', save_model_dir, save_model_name.split('.')[0])): write_json(file=os.path.join('checkpoints', save_model_dir, save_model_name.split('.')[0], 'process_record.json'), content=process_record) # vis.log(f"Best Iter: {save_iter}") print("Best Iter:", save_iter)