def train(args, snapshot_path):
    """Train a segmentation network with supervised and entropy losses.

    Every batch is run through the network once. The first
    ``args.labeled_bs`` samples are scored against their labels with the mean
    of cross-entropy and Dice loss; ``losses.entropy_loss`` is evaluated on
    the softmax output of the whole batch and added with the weight returned
    by ``get_current_consistency_weight``. Every 200 iterations the model is
    evaluated on the validation split and its weights are saved whenever the
    value logged as mean Dice improves; an extra snapshot is written every
    3000 iterations.

    Args:
        args: Namespace providing ``base_lr``, ``num_classes``,
            ``batch_size``, ``max_iterations``, ``seed``, ``model``,
            ``root_path``, ``patch_size``, ``labeled_num`` and ``labeled_bs``.
        snapshot_path: Directory that receives the TensorBoard log (in its
            ``/log`` sub-directory) and the ``.pth`` checkpoints.

    Returns:
        The string ``"Training Finished!"``.
    """
    base_lr = args.base_lr
    num_classes = args.num_classes
    batch_size = args.batch_size
    max_iterations = args.max_iterations

    def worker_init_fn(worker_id):
        # Seed Python's `random` differently, but reproducibly, per worker.
        random.seed(args.seed + worker_id)

    # NOTE(review): the model is never moved to a device in this function
    # although the batches are sent to CUDA; presumably net_factory does
    # that - confirm.
    model = net_factory(net_type=args.model, in_chns=1,
                        class_num=num_classes)

    db_train = BaseDataSets(
        base_dir=args.root_path, split="train", num=None,
        transform=transforms.Compose([RandomGenerator(args.patch_size)]))
    total_slices = len(db_train)
    labeled_slice = patients_to_slices(args.root_path, args.labeled_num)
    print("Total silices is: {}, labeled slices is: {}".format(
        total_slices, labeled_slice))

    # Dataset indices below `labeled_slice` are used as the labeled pool,
    # the remaining ones as the unlabeled pool.
    labeled_bs = args.labeled_bs
    labeled_idxs = list(range(0, labeled_slice))
    unlabeled_idxs = list(range(labeled_slice, total_slices))
    batch_sampler = TwoStreamBatchSampler(
        labeled_idxs, unlabeled_idxs, batch_size, batch_size - labeled_bs)
    trainloader = DataLoader(db_train, batch_sampler=batch_sampler,
                             num_workers=16, pin_memory=True,
                             worker_init_fn=worker_init_fn)

    db_val = BaseDataSets(base_dir=args.root_path, split="val")
    valloader = DataLoader(db_val, batch_size=1, shuffle=False,
                           num_workers=1)

    model.train()
    optimizer = optim.SGD(model.parameters(), lr=base_lr, momentum=0.9,
                          weight_decay=0.0001)
    ce_loss = CrossEntropyLoss()
    dice_loss = losses.DiceLoss(num_classes)

    writer = SummaryWriter(snapshot_path + '/log')
    try:
        logging.info("{} iterations per epoch".format(len(trainloader)))

        iter_num = 0
        max_epoch = max_iterations // len(trainloader) + 1
        best_performance = 0.0
        iterator = tqdm(range(max_epoch), ncols=70)
        for _ in iterator:
            for sampled_batch in trainloader:
                volume_batch = sampled_batch['image'].cuda()
                label_batch = sampled_batch['label'].cuda()

                outputs = model(volume_batch)
                outputs_soft = torch.softmax(outputs, dim=1)

                # Supervised terms only see the first `labeled_bs` samples.
                loss_ce = ce_loss(outputs[:labeled_bs],
                                  label_batch[:labeled_bs].long())
                loss_dice = dice_loss(outputs_soft[:labeled_bs],
                                      label_batch[:labeled_bs].unsqueeze(1))
                supervised_loss = 0.5 * (loss_dice + loss_ce)

                consistency_weight = get_current_consistency_weight(
                    iter_num // 150)
                # NOTE(review): C is hard-coded to 4 while the network is
                # built with `num_classes` outputs; if C is meant to be the
                # class count this should probably be `C=num_classes` -
                # confirm against losses.entropy_loss.
                consistency_loss = losses.entropy_loss(outputs_soft, C=4)
                loss = supervised_loss + consistency_weight * consistency_loss

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Polynomial decay of the learning rate towards zero.
                lr_ = base_lr * (1.0 - iter_num / max_iterations)**0.9
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_

                iter_num = iter_num + 1
                writer.add_scalar('info/lr', lr_, iter_num)
                writer.add_scalar('info/total_loss', loss, iter_num)
                writer.add_scalar('info/loss_ce', loss_ce, iter_num)
                writer.add_scalar('info/loss_dice', loss_dice, iter_num)
                writer.add_scalar('info/consistency_loss',
                                  consistency_loss, iter_num)
                writer.add_scalar('info/consistency_weight',
                                  consistency_weight, iter_num)

                logging.info(
                    'iteration %d : loss : %f, loss_ce: %f, loss_dice: %f',
                    iter_num, loss.item(), loss_ce.item(), loss_dice.item())

                if iter_num % 20 == 0:
                    # Log the second sample of the batch (index 1): first
                    # input channel, arg-max prediction and label. The label
                    # maps are multiplied by 50, presumably to make the
                    # classes distinguishable as grey levels.
                    image = volume_batch[1, 0:1, :, :]
                    writer.add_image('train/Image', image, iter_num)
                    prediction = torch.argmax(
                        torch.softmax(outputs, dim=1), dim=1, keepdim=True)
                    writer.add_image('train/Prediction',
                                     prediction[1, ...] * 50, iter_num)
                    labs = label_batch[1, ...].unsqueeze(0) * 50
                    writer.add_image('train/GroundTruth', labs, iter_num)

                if iter_num > 0 and iter_num % 200 == 0:
                    model.eval()
                    metric_list = 0.0
                    # Dedicated loop variable so the training batch is not
                    # overwritten by validation data.
                    for val_batch in valloader:
                        metric_i = test_single_volume(
                            val_batch["image"], val_batch["label"], model,
                            classes=num_classes)
                        metric_list += np.array(metric_i)
                    metric_list = metric_list / len(db_val)

                    # One row per class 1..num_classes-1; column 0 is logged
                    # as dice, column 1 as hd95.
                    for class_i in range(num_classes - 1):
                        writer.add_scalar(
                            'info/val_{}_dice'.format(class_i + 1),
                            metric_list[class_i, 0], iter_num)
                        writer.add_scalar(
                            'info/val_{}_hd95'.format(class_i + 1),
                            metric_list[class_i, 1], iter_num)

                    mean_metrics = np.mean(metric_list, axis=0)
                    performance = mean_metrics[0]
                    mean_hd95 = mean_metrics[1]
                    writer.add_scalar('info/val_mean_dice', performance,
                                      iter_num)
                    writer.add_scalar('info/val_mean_hd95', mean_hd95,
                                      iter_num)

                    if performance > best_performance:
                        best_performance = performance
                        save_mode_path = os.path.join(
                            snapshot_path,
                            'iter_{}_dice_{}.pth'.format(
                                iter_num, round(best_performance, 4)))
                        save_best = os.path.join(
                            snapshot_path,
                            '{}_best_model.pth'.format(args.model))
                        torch.save(model.state_dict(), save_mode_path)
                        torch.save(model.state_dict(), save_best)

                    logging.info(
                        'iteration %d : mean_dice : %f mean_hd95 : %f',
                        iter_num, performance, mean_hd95)
                    model.train()

                if iter_num % 3000 == 0:
                    save_mode_path = os.path.join(
                        snapshot_path, 'iter_' + str(iter_num) + '.pth')
                    torch.save(model.state_dict(), save_mode_path)
                    logging.info("save model to {}".format(save_mode_path))

                if iter_num >= max_iterations:
                    break
            if iter_num >= max_iterations:
                iterator.close()
                break
    finally:
        # Flush and close the event file even when training raises.
        writer.close()
    return "Training Finished!"
transform=transforms.Compose([ transforms.Resize((224, 224)), transforms.ToTensor(), normalize, ])) test_dataset = dataset.CheXpertDataset(root_dir=args.root_path, csv_file=args.csv_file_test, transform=transforms.Compose([ transforms.Resize((224, 224)), transforms.ToTensor(), normalize, ])) labeled_idxs = list(range(args.labeled_num)) unlabeled_idxs = list(range(args.labeled_num, 7000)) batch_sampler = TwoStreamBatchSampler(labeled_idxs, unlabeled_idxs, batch_size, batch_size - labeled_bs) def worker_init_fn(worker_id): random.seed(args.seed + worker_id) train_dataloader = DataLoader(dataset=train_dataset, batch_sampler=batch_sampler, num_workers=8, pin_memory=True, worker_init_fn=worker_init_fn) val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=8, pin_memory=True, worker_init_fn=worker_init_fn)
def train(args, snapshot_path):
    """Train a student network against an EMA model with an uncertainty mask.

    The student ``model`` is optimised with SGD. On the first
    ``args.labeled_bs`` samples of every batch it receives the mean of
    cross-entropy and Dice loss. On the remaining samples its output is
    compared (``losses.softmax_mse_loss``) with the output of ``ema_model``
    on a noise-perturbed copy of the input. That comparison is only counted
    where the entropy of the averaged softmax of eight further noisy
    ``ema_model`` passes is below a threshold that grows with ``iter_num``.
    After each optimiser step ``update_ema_variables`` is called on both
    models. Validation runs every 200 iterations and saves the student on
    improvement of the value logged as mean Dice; a snapshot is also written
    every 3000 iterations.

    Args:
        args: Namespace providing ``base_lr``, ``num_classes``,
            ``batch_size``, ``max_iterations``, ``seed``, ``model``,
            ``root_path``, ``patch_size``, ``labeled_num``, ``labeled_bs``
            and ``ema_decay``.
        snapshot_path: Directory that receives the TensorBoard log (in its
            ``/log`` sub-directory) and the ``.pth`` checkpoints.

    Returns:
        The string ``"Training Finished!"``.
    """
    base_lr = args.base_lr
    num_classes = args.num_classes
    batch_size = args.batch_size
    max_iterations = args.max_iterations

    def create_model(ema=False):
        # Network definition
        model = net_factory(net_type=args.model, in_chns=1,
                            class_num=num_classes)
        if ema:
            # The EMA copy is never optimised, so detach its parameters
            # from autograd.
            for param in model.parameters():
                param.detach_()
        return model

    model = create_model()
    ema_model = create_model(ema=True)

    def worker_init_fn(worker_id):
        # Seed Python's `random` differently, but reproducibly, per worker.
        random.seed(args.seed + worker_id)

    db_train = BaseDataSets(
        base_dir=args.root_path, split="train", num=None,
        transform=transforms.Compose([RandomGenerator(args.patch_size)]))
    db_val = BaseDataSets(base_dir=args.root_path, split="val")

    total_slices = len(db_train)
    labeled_slice = patients_to_slices(args.root_path, args.labeled_num)
    print("Total silices is: {}, labeled slices is: {}".format(
        total_slices, labeled_slice))

    # Dataset indices below `labeled_slice` are used as the labeled pool,
    # the remaining ones as the unlabeled pool.
    labeled_bs = args.labeled_bs
    labeled_idxs = list(range(0, labeled_slice))
    unlabeled_idxs = list(range(labeled_slice, total_slices))
    batch_sampler = TwoStreamBatchSampler(
        labeled_idxs, unlabeled_idxs, batch_size, batch_size - labeled_bs)
    trainloader = DataLoader(db_train, batch_sampler=batch_sampler,
                             num_workers=4, pin_memory=True,
                             worker_init_fn=worker_init_fn)

    model.train()

    valloader = DataLoader(db_val, batch_size=1, shuffle=False,
                           num_workers=1)

    optimizer = optim.SGD(model.parameters(), lr=base_lr, momentum=0.9,
                          weight_decay=0.0001)
    ce_loss = CrossEntropyLoss()
    dice_loss = losses.DiceLoss(num_classes)

    # Number of noisy EMA forward passes averaged for the uncertainty map.
    # Two passes are batched per loop iteration, so this must stay even.
    mc_passes = 8

    writer = SummaryWriter(snapshot_path + '/log')
    try:
        logging.info("{} iterations per epoch".format(len(trainloader)))

        iter_num = 0
        max_epoch = max_iterations // len(trainloader) + 1
        best_performance = 0.0
        iterator = tqdm(range(max_epoch), ncols=70)
        for _ in iterator:
            for sampled_batch in trainloader:
                volume_batch = sampled_batch['image'].cuda()
                label_batch = sampled_batch['label'].cuda()
                unlabeled_volume_batch = volume_batch[labeled_bs:]

                # Clamped Gaussian noise for the EMA model's input.
                noise = torch.clamp(
                    torch.randn_like(unlabeled_volume_batch) * 0.1,
                    -0.2, 0.2)
                ema_inputs = unlabeled_volume_batch + noise

                outputs = model(volume_batch)
                outputs_soft = torch.softmax(outputs, dim=1)
                with torch.no_grad():
                    ema_output = ema_model(ema_inputs)

                # Uncertainty estimate: stack `mc_passes` EMA predictions on
                # independently perturbed copies of the unlabeled samples.
                _, _, dim2, dim3 = unlabeled_volume_batch.shape
                volume_batch_r = unlabeled_volume_batch.repeat(2, 1, 1, 1)
                stride = volume_batch_r.shape[0] // 2
                # Allocate directly on the input's device instead of
                # building the buffer on the CPU and copying it over.
                preds = torch.zeros(
                    [stride * mc_passes, num_classes, dim2, dim3],
                    device=unlabeled_volume_batch.device)
                for i in range(mc_passes // 2):
                    ema_inputs = volume_batch_r + torch.clamp(
                        torch.randn_like(volume_batch_r) * 0.1, -0.2, 0.2)
                    with torch.no_grad():
                        preds[2 * stride * i:2 * stride * (i + 1)] = \
                            ema_model(ema_inputs)
                preds = F.softmax(preds, dim=1)
                preds = preds.reshape(mc_passes, stride, num_classes,
                                      dim2, dim3)
                preds = torch.mean(preds, dim=0)
                # Entropy of the averaged prediction, kept as a 1-channel
                # map: (stride, 1, dim2, dim3).
                uncertainty = -1.0 * torch.sum(
                    preds * torch.log(preds + 1e-6), dim=1, keepdim=True)

                # Supervised terms only see the first `labeled_bs` samples.
                loss_ce = ce_loss(outputs[:labeled_bs],
                                  label_batch[:labeled_bs].long())
                loss_dice = dice_loss(outputs_soft[:labeled_bs],
                                      label_batch[:labeled_bs].unsqueeze(1))
                supervised_loss = 0.5 * (loss_dice + loss_ce)

                consistency_weight = get_current_consistency_weight(
                    iter_num // 150)
                # NOTE(review): the masked sum below relies on this result
                # broadcasting against the (stride, 1, dim2, dim3) mask -
                # confirm against losses.softmax_mse_loss.
                consistency_dist = losses.softmax_mse_loss(
                    outputs[labeled_bs:], ema_output)
                # The threshold goes from 0.75*ln(2) towards ln(2) as
                # sigmoid_rampup goes from 0 to 1.
                # NOTE(review): ln(2) is used regardless of `num_classes` -
                # confirm this is intended.
                threshold = (0.75 + 0.25 * ramps.sigmoid_rampup(
                    iter_num, max_iterations)) * np.log(2)
                mask = (uncertainty < threshold).float()
                consistency_loss = torch.sum(mask * consistency_dist) / (
                    2 * torch.sum(mask) + 1e-16)

                loss = supervised_loss + consistency_weight * consistency_loss
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                update_ema_variables(model, ema_model, args.ema_decay,
                                     iter_num)

                # Polynomial decay of the learning rate towards zero.
                lr_ = base_lr * (1.0 - iter_num / max_iterations)**0.9
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_

                iter_num = iter_num + 1
                writer.add_scalar('info/lr', lr_, iter_num)
                writer.add_scalar('info/total_loss', loss, iter_num)
                writer.add_scalar('info/loss_ce', loss_ce, iter_num)
                writer.add_scalar('info/loss_dice', loss_dice, iter_num)
                writer.add_scalar('info/consistency_loss',
                                  consistency_loss, iter_num)
                writer.add_scalar('info/consistency_weight',
                                  consistency_weight, iter_num)

                logging.info(
                    'iteration %d : loss : %f, loss_ce: %f, loss_dice: %f',
                    iter_num, loss.item(), loss_ce.item(), loss_dice.item())

                if iter_num % 20 == 0:
                    # Log the second sample of the batch (index 1): first
                    # input channel, arg-max prediction and label. The label
                    # maps are multiplied by 50, presumably to make the
                    # classes distinguishable as grey levels.
                    image = volume_batch[1, 0:1, :, :]
                    writer.add_image('train/Image', image, iter_num)
                    prediction = torch.argmax(
                        torch.softmax(outputs, dim=1), dim=1, keepdim=True)
                    writer.add_image('train/Prediction',
                                     prediction[1, ...] * 50, iter_num)
                    labs = label_batch[1, ...].unsqueeze(0) * 50
                    writer.add_image('train/GroundTruth', labs, iter_num)

                if iter_num > 0 and iter_num % 200 == 0:
                    model.eval()
                    metric_list = 0.0
                    # Dedicated loop variable so the training batch is not
                    # overwritten by validation data.
                    for val_batch in valloader:
                        metric_i = test_single_volume(
                            val_batch["image"], val_batch["label"], model,
                            classes=num_classes)
                        metric_list += np.array(metric_i)
                    metric_list = metric_list / len(db_val)

                    # One row per class 1..num_classes-1; column 0 is logged
                    # as dice, column 1 as hd95.
                    for class_i in range(num_classes - 1):
                        writer.add_scalar(
                            'info/val_{}_dice'.format(class_i + 1),
                            metric_list[class_i, 0], iter_num)
                        writer.add_scalar(
                            'info/val_{}_hd95'.format(class_i + 1),
                            metric_list[class_i, 1], iter_num)

                    mean_metrics = np.mean(metric_list, axis=0)
                    performance = mean_metrics[0]
                    mean_hd95 = mean_metrics[1]
                    writer.add_scalar('info/val_mean_dice', performance,
                                      iter_num)
                    writer.add_scalar('info/val_mean_hd95', mean_hd95,
                                      iter_num)

                    if performance > best_performance:
                        best_performance = performance
                        save_mode_path = os.path.join(
                            snapshot_path,
                            'iter_{}_dice_{}.pth'.format(
                                iter_num, round(best_performance, 4)))
                        save_best = os.path.join(
                            snapshot_path,
                            '{}_best_model.pth'.format(args.model))
                        torch.save(model.state_dict(), save_mode_path)
                        torch.save(model.state_dict(), save_best)

                    logging.info(
                        'iteration %d : mean_dice : %f mean_hd95 : %f',
                        iter_num, performance, mean_hd95)
                    model.train()

                if iter_num % 3000 == 0:
                    save_mode_path = os.path.join(
                        snapshot_path, 'iter_' + str(iter_num) + '.pth')
                    torch.save(model.state_dict(), save_mode_path)
                    logging.info("save model to {}".format(save_mode_path))

                if iter_num >= max_iterations:
                    break
            if iter_num >= max_iterations:
                iterator.close()
                break
    finally:
        # Flush and close the event file even when training raises.
        writer.close()
    return "Training Finished!"
def train(args, snapshot_path):
    """Train two networks that supervise each other with pseudo labels.

    Two networks are built by ``net_factory`` and passed through
    ``kaiming_normal_init_weight`` and ``xavier_normal_init_weight``
    respectively. On the first ``args.labeled_bs`` samples of every batch
    each network receives the mean of cross-entropy and Dice loss. On the
    remaining samples each network is trained with cross-entropy against the
    detached arg-max prediction of the other one, weighted by
    ``get_current_consistency_weight``. Both networks are validated every 200
    iterations and saved when their value logged as mean Dice improves;
    snapshots of both are also written every 3000 iterations.

    Args:
        args: Namespace providing ``base_lr``, ``num_classes``,
            ``batch_size``, ``max_iterations``, ``seed``, ``model``,
            ``root_path``, ``patch_size``, ``labeled_num`` and ``labeled_bs``.
        snapshot_path: Directory that receives the TensorBoard log (in its
            ``/log`` sub-directory) and the ``.pth`` checkpoints.

    Returns:
        None.
    """
    base_lr = args.base_lr
    num_classes = args.num_classes
    batch_size = args.batch_size
    max_iterations = args.max_iterations

    def create_model(ema=False):
        # Network definition
        model = net_factory(net_type=args.model, in_chns=1,
                            class_num=num_classes)
        if ema:
            for param in model.parameters():
                param.detach_()
        return model

    model1 = kaiming_normal_init_weight(create_model())
    model2 = xavier_normal_init_weight(create_model())

    def worker_init_fn(worker_id):
        # Seed Python's `random` differently, but reproducibly, per worker.
        random.seed(args.seed + worker_id)

    db_train = BaseDataSets(
        base_dir=args.root_path, split="train", num=None,
        transform=transforms.Compose([RandomGenerator(args.patch_size)]))
    db_val = BaseDataSets(base_dir=args.root_path, split="val")

    total_slices = len(db_train)
    labeled_slice = patients_to_slices(args.root_path, args.labeled_num)
    print("Total silices is: {}, labeled slices is: {}".format(
        total_slices, labeled_slice))

    # Dataset indices below `labeled_slice` are used as the labeled pool,
    # the remaining ones as the unlabeled pool.
    labeled_bs = args.labeled_bs
    labeled_idxs = list(range(0, labeled_slice))
    unlabeled_idxs = list(range(labeled_slice, total_slices))
    batch_sampler = TwoStreamBatchSampler(
        labeled_idxs, unlabeled_idxs, batch_size, batch_size - labeled_bs)
    trainloader = DataLoader(db_train, batch_sampler=batch_sampler,
                             num_workers=4, pin_memory=True,
                             worker_init_fn=worker_init_fn)

    model1.train()
    model2.train()

    valloader = DataLoader(db_val, batch_size=1, shuffle=False,
                           num_workers=1)

    optimizer1 = optim.SGD(model1.parameters(), lr=base_lr, momentum=0.9,
                           weight_decay=0.0001)
    optimizer2 = optim.SGD(model2.parameters(), lr=base_lr, momentum=0.9,
                           weight_decay=0.0001)
    ce_loss = CrossEntropyLoss()
    dice_loss = losses.DiceLoss(num_classes)

    writer = SummaryWriter(snapshot_path + '/log')

    def _validate(model, tag, step, best_so_far):
        """Evaluate ``model`` on the validation set; save it if it improved.

        ``tag`` ("model1" / "model2") is inserted into the scalar names,
        checkpoint file names and the log line. Returns the best mean dice
        seen so far for this model, including the current evaluation.
        """
        model.eval()
        metric_list = 0.0
        for val_batch in valloader:
            metric_i = test_single_volume(
                val_batch["image"], val_batch["label"], model,
                classes=num_classes)
            metric_list += np.array(metric_i)
        metric_list = metric_list / len(db_val)

        # One row per class 1..num_classes-1; column 0 is logged as dice,
        # column 1 as hd95.
        for class_i in range(num_classes - 1):
            writer.add_scalar(
                'info/{}_val_{}_dice'.format(tag, class_i + 1),
                metric_list[class_i, 0], step)
            writer.add_scalar(
                'info/{}_val_{}_hd95'.format(tag, class_i + 1),
                metric_list[class_i, 1], step)

        mean_metrics = np.mean(metric_list, axis=0)
        performance = mean_metrics[0]
        mean_hd95 = mean_metrics[1]
        writer.add_scalar('info/{}_val_mean_dice'.format(tag),
                          performance, step)
        writer.add_scalar('info/{}_val_mean_hd95'.format(tag),
                          mean_hd95, step)

        if performance > best_so_far:
            best_so_far = performance
            # Four decimals for every model; the model2 branch used to call
            # round() without ndigits, which reduced the dice in the file
            # name to 0 or 1.
            save_mode_path = os.path.join(
                snapshot_path,
                '{}_iter_{}_dice_{}.pth'.format(
                    tag, step, round(best_so_far, 4)))
            save_best = os.path.join(
                snapshot_path, '{}_best_{}.pth'.format(args.model, tag))
            torch.save(model.state_dict(), save_mode_path)
            torch.save(model.state_dict(), save_best)

        logging.info(
            'iteration %d : %s_mean_dice : %f %s_mean_hd95 : %f',
            step, tag, performance, tag, mean_hd95)
        model.train()
        return best_so_far

    try:
        logging.info("{} iterations per epoch".format(len(trainloader)))

        iter_num = 0
        max_epoch = max_iterations // len(trainloader) + 1
        best_performance1 = 0.0
        best_performance2 = 0.0
        iterator = tqdm(range(max_epoch), ncols=70)
        for _ in iterator:
            for sampled_batch in trainloader:
                volume_batch = sampled_batch['image'].cuda()
                label_batch = sampled_batch['label'].cuda()

                outputs1 = model1(volume_batch)
                outputs_soft1 = torch.softmax(outputs1, dim=1)

                outputs2 = model2(volume_batch)
                outputs_soft2 = torch.softmax(outputs2, dim=1)

                consistency_weight = get_current_consistency_weight(
                    iter_num // 150)

                # Supervised terms only see the first `labeled_bs` samples.
                labeled_target = label_batch[:labeled_bs]
                loss1 = 0.5 * (
                    ce_loss(outputs1[:labeled_bs], labeled_target.long())
                    + dice_loss(outputs_soft1[:labeled_bs],
                                labeled_target.unsqueeze(1)))
                loss2 = 0.5 * (
                    ce_loss(outputs2[:labeled_bs], labeled_target.long())
                    + dice_loss(outputs_soft2[:labeled_bs],
                                labeled_target.unsqueeze(1)))

                # Hard pseudo labels on the remaining samples, detached so
                # that no gradient reaches the network that produced them.
                pseudo_outputs1 = torch.argmax(
                    outputs_soft1[labeled_bs:].detach(), dim=1,
                    keepdim=False)
                pseudo_outputs2 = torch.argmax(
                    outputs_soft2[labeled_bs:].detach(), dim=1,
                    keepdim=False)

                # Each network is trained on the other one's pseudo labels.
                pseudo_supervision1 = ce_loss(outputs1[labeled_bs:],
                                              pseudo_outputs2)
                pseudo_supervision2 = ce_loss(outputs2[labeled_bs:],
                                              pseudo_outputs1)

                model1_loss = loss1 + consistency_weight * pseudo_supervision1
                model2_loss = loss2 + consistency_weight * pseudo_supervision2
                loss = model1_loss + model2_loss

                optimizer1.zero_grad()
                optimizer2.zero_grad()
                loss.backward()
                optimizer1.step()
                optimizer2.step()

                iter_num = iter_num + 1

                # Polynomial decay of the learning rate towards zero.
                lr_ = base_lr * (1.0 - iter_num / max_iterations)**0.9
                for param_group in optimizer1.param_groups:
                    param_group['lr'] = lr_
                for param_group in optimizer2.param_groups:
                    param_group['lr'] = lr_

                writer.add_scalar('lr', lr_, iter_num)
                writer.add_scalar('consistency_weight/consistency_weight',
                                  consistency_weight, iter_num)
                writer.add_scalar('loss/model1_loss', model1_loss, iter_num)
                writer.add_scalar('loss/model2_loss', model2_loss, iter_num)
                logging.info(
                    'iteration %d : model1 loss : %f model2 loss : %f',
                    iter_num, model1_loss.item(), model2_loss.item())

                if iter_num % 50 == 0:
                    # Log the second sample of the batch (index 1): first
                    # input channel, both arg-max predictions and the label.
                    # The label maps are multiplied by 50, presumably to
                    # make the classes distinguishable as grey levels.
                    image = volume_batch[1, 0:1, :, :]
                    writer.add_image('train/Image', image, iter_num)
                    for tag, logits in (('model1', outputs1),
                                        ('model2', outputs2)):
                        prediction = torch.argmax(
                            torch.softmax(logits, dim=1), dim=1,
                            keepdim=True)
                        writer.add_image(
                            'train/{}_Prediction'.format(tag),
                            prediction[1, ...] * 50, iter_num)
                    labs = label_batch[1, ...].unsqueeze(0) * 50
                    writer.add_image('train/GroundTruth', labs, iter_num)

                if iter_num > 0 and iter_num % 200 == 0:
                    best_performance1 = _validate(
                        model1, 'model1', iter_num, best_performance1)
                    best_performance2 = _validate(
                        model2, 'model2', iter_num, best_performance2)

                # change lr
                # NOTE(review): this step value is overwritten by the
                # polynomial schedule above on the very next iteration, so
                # it only affects a single optimiser step - confirm whether
                # it is still wanted.
                if iter_num % 2500 == 0:
                    lr_ = base_lr * 0.1**(iter_num // 2500)
                    for param_group in optimizer1.param_groups:
                        param_group['lr'] = lr_
                    for param_group in optimizer2.param_groups:
                        param_group['lr'] = lr_

                if iter_num % 3000 == 0:
                    for tag, model in (('model1', model1),
                                       ('model2', model2)):
                        save_mode_path = os.path.join(
                            snapshot_path,
                            '{}_iter_{}.pth'.format(tag, iter_num))
                        torch.save(model.state_dict(), save_mode_path)
                        logging.info(
                            "save {} to {}".format(tag, save_mode_path))

                if iter_num >= max_iterations:
                    break
            if iter_num >= max_iterations:
                iterator.close()
                break
    finally:
        # Flush and close the event file even when training raises.
        writer.close()