def train(args, dataloader, model):
    """Run the Adam training loop for ``model``.

    For each of 5000 epochs the first batch yielded by ``dataloader`` is
    processed: its ``'features'`` and ``'adj'`` entries are cast to float,
    moved to the GPU and passed to ``model``, whose return value is treated
    as the loss and back-propagated.

    Args:
        args: Namespace exposing ``lr``; used as the Adam learning rate and
            as the ``gamma`` argument of the MultiStepLR schedule.
        dataloader: Iterable of mappings with ``'features'`` and ``'adj'``.
        model: Module called as ``model(features, adj_input)``.
    """
    optimizer = optim.Adam(list(model.parameters()), lr=args.lr)
    # NOTE(review): gamma is the multiplicative decay factor, yet it is fed
    # the learning rate here -- confirm this is intended.
    scheduler = MultiStepLR(optimizer,
                            milestones=LR_milestones,
                            gamma=args.lr)

    model.train()
    for epoch in range(5000):
        for batch_idx, data in enumerate(dataloader):
            model.zero_grad()

            features = Variable(data['features'].float()).cuda()
            adj_input = Variable(data['adj'].float()).cuda()

            loss = model(features, adj_input)
            print('Epoch: ', epoch, ', Iter: ', batch_idx, ', Loss: ', loss)

            loss.backward()
            optimizer.step()
            scheduler.step()
            # NOTE(review): the loop stops after the first batch of every
            # epoch, so the scheduler advances once per epoch -- confirm
            # this is not a debugging leftover.
            break
def _log_accuracy_report(log, class_title, class_acc, loc_sections):
    """Log a classification history followed by localisation histories.

    ``class_title`` and ``class_acc`` are logged first.  Then, for every
    ``(title, values)`` pair in ``loc_sections``, the title, the raw values
    and ``np.mean`` of the values are logged, in that order.
    """
    log.info(class_title)
    log.info(class_acc)
    for title, values in loc_sections:
        log.info(title)
        log.info(values)
        log.info(np.mean(np.array(values)))


def main(log, args=None, arglist=None):
    """Train and evaluate ``Net`` on CUB-200 and log accuracy summaries.

    Args:
        log: Logger-like object; only ``log.info`` is used.
        args: Pre-parsed arguments.  When falsy, ``arglist`` is parsed.
        arglist: Argument strings handed to ``parse_args`` (``None`` means
            ``sys.argv``).

    Side effects:
        Sets the module-level ``image_size`` and reads the module-level
        accuracy histories (``class_train_acc``, ``loc_train_acc_max``, ...)
        for the final report.
    """
    global image_size

    help_text = """ Collect the required arguments """
    parser = argparse.ArgumentParser(
        description=help_text,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-e", "--num_epochs", type=int,
                        help="number of training epochs", default=20)
    parser.add_argument("-bs", "--batch_size", type=int,
                        help="batch size", default=16)
    parser.add_argument("-ir", "--image_res", type=int,
                        help="image resolution (images are resized to a square)",
                        default=320)
    parser.add_argument("--plot_every", type=int,
                        help="plotting interval", default=50)
    if not args:
        args = parser.parse_args(arglist)

    # initialize the model
    model = Net().to(device)
    log.info("Model initialization completed")

    # The STN parameters are excluded from weight decay; the remaining groups
    # inherit the optimizer-level defaults.
    optimizer = optim.SGD(
        [
            {"params": model.stn.parameters(), "weight_decay": 0},
            {"params": model.features.parameters()},
            {"params": model.conv_last_10map.parameters()},
            {"params": model.bn_last_10map.parameters()},
        ],
        lr=1e-3, weight_decay=1e-4, momentum=0.9)
    scheduler = MultiStepLR(optimizer, milestones=[20, 30], gamma=0.1)

    # load the image databases
    train_imdb = cub_200("train")
    test_imdb = cub_200("test")
    image_size = args.image_res

    # rescale the ground truth according to image resolution
    log.info("Image resolution: {}".format(args.image_res))
    scale_ground_truth_boxes(train_imdb, "train", args.image_res)
    scale_ground_truth_boxes(test_imdb, "test", args.image_res)

    resolution = (args.image_res, args.image_res)
    transform_cub_train = transforms.Compose([
        transforms.Resize(resolution),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    transform_cub_test = transforms.Compose([
        transforms.Resize(resolution),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    # Reduced test set: a filtered copy, with the indices of the kept samples.
    test_imdb1 = deepcopy(test_imdb)
    test_imdb1, indicator = filter_test_for_scale(test_imdb1)
    indicator = np.where(indicator == 1)[0]

    area_hist = get_area_hist(test_imdb)
    item, count = np.unique(area_hist, return_counts=True)
    log.info("Histogram of areas")
    log.info(item)
    log.info(count)

    log.info("Train size: {}".format(len(train_imdb._image_index)))
    log.info("Test size: {}".format(len(test_imdb._image_index)))
    log.info("Test size after picking the selected scale: {}".format(
        len(test_imdb1._image_index)))

    for epoch in range(args.num_epochs):
        train_model(model, train_imdb, log, optimizer, epoch + 1,
                    args.batch_size, transform_cub_train)
        test_model(model, test_imdb1, log, epoch + 1, args.batch_size,
                   area_hist, transform_cub_test, indicator)
        test_model(model, test_imdb, log, epoch + 1, args.batch_size,
                   area_hist, transform_cub_test, indicator=None)
        # Advance the schedule only after this epoch's optimizer updates;
        # stepping first skips the initial learning-rate value on
        # PyTorch >= 1.1.
        scheduler.step()

    # NOTE(review): the source indentation was lost; the summary below is
    # assumed to run once, after the last epoch.
    _log_accuracy_report(
        log, "Training [CLASSIFICATION] accuracies", class_train_acc,
        [
            ("\nTrain [LOC] accuracy with max selection",
             loc_train_acc_max),
            ("\nTrain [LOC] accuracy with all boxes",
             loc_train_acc_all),
            ("\nTrain [LOC] accuracy with max selection and NO TRANSFORM",
             loc_train_acc_max_NT),
            ("\nTrain [LOC] accuracy with all boxes and NO TRANSFORM",
             loc_train_acc_all_NT),
        ])
    _log_accuracy_report(
        log, "Test [CLASSIFICATION] accuracies on [FULL SET]",
        class_test_acc1,
        [
            ("\nTest [LOC] accuracy with max selection [FULL SET]",
             loc_test_acc1_max),
            ("\nTest [LOC] accuracy with all boxes [FULL SET]",
             loc_test_acc1_all),
            ("\nTest [LOC] accuracy with max selection [FULL SET] NO TRANSFORM",
             loc_test_acc1_max_NT),
            ("\nTest [LOC] accuracy with all boxes [FULL SET] NO TRANSFORM",
             loc_test_acc1_all_NT),
        ])
    _log_accuracy_report(
        log, "Test [CLASSIFICATION] accuracies on [REDUCED SET]",
        class_test_acc2,
        [
            ("\nTest [LOC] accuracy with max selection [REDUCED SET]",
             loc_test_acc2_max),
            ("\nTest [LOC] accuracy with all boxes [REDUCED SET]",
             loc_test_acc2_all),
            ("\nTest [LOC] accuracy with max selection [REDUCED SET] NO TRANSFORM",
             loc_test_acc2_max_NT),
            ("\nTest [LOC] accuracy with all boxes [REDUCED SET] NO TRANSFORM",
             loc_test_acc2_all_NT),
        ])
milestones = [int(v.strip()) for v in milestones.split(",")] scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=0.1, last_epoch=last_epoch) elif scheduler == 'cosine': logging.info("Uses CosineAnnealingLR scheduler.") scheduler = CosineAnnealingLR(optimizer, t_max, last_epoch=last_epoch) else: logging.fatal(f"Unsupported Scheduler: {scheduler}.") # parser.print_help(sys.stderr) sys.exit(1) logging.info(f"Start training from epoch {last_epoch + 1}.") for epoch in range(last_epoch + 1, num_epochs): scheduler.step() train(train_loader, net, criterion, optimizer, device=DEVICE, debug_steps=debug_steps, epoch=epoch) if epoch % validation_epochs == 0 or epoch == num_epochs - 1: val_loss, val_regression_loss, val_classification_loss = test( val_loader, net, criterion, DEVICE) logging.info( f"Epoch: {epoch}, " + f"Validation Loss: {val_loss:.4f}, " + f"Validation Regression Loss {val_regression_loss:.4f}, " + f"Validation Classification Loss: {val_classification_loss:.4f}"
def experiment(args):
    """Train the model from ``get_models(args)`` and log metrics to CSV.

    Args:
        args: Namespace providing ``do_print``, ``seed``, ``load_checkpoint``,
            ``save_dir``, ``epochs``, ``do_classification`` and
            ``num_finetune_epochs``.  ``save_dir`` is rewritten and
            ``save_loc`` is set as a side effect.

    Returns:
        Tuple ``(train_loss, train_acc, val_loss, val_acc, test_loss,
        test_acc)`` measured after the last epoch.
    """
    if args.do_print:
        print(args)

    # Setup the random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    model, mixup_mat, train_loader, val_loader, test_loader = get_models(args)

    # Set up the logger and the save directories.
    # If we load a mixup grid, then save in a different location.
    # Otherwise, it's a baseline run.  args.save_loc is for model ckpts.
    if args.load_checkpoint:
        args.save_dir = os.path.join(args.save_dir, 'test_aug/')
        args.save_loc = os.path.join(args.save_dir, get_id(args),
                                     'test_aug_checkpoints/')
    else:
        args.save_dir = os.path.join(args.save_dir, 'test_no_aug/')
        args.save_loc = os.path.join(args.save_dir, get_id(args),
                                     'test_no_aug_checkpoints/')

    csv_logger, _ = load_logger(args)
    os.makedirs(args.save_loc, exist_ok=True)

    # Standard LR/schedule settings for CIFAR
    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9,
                          nesterov=True, weight_decay=5e-4)
    scheduler = MultiStepLR(optimizer, milestones=[30, 60, 80], gamma=0.1)

    def train_loss_func(x, y):
        """Return ``(mean loss, logits)`` for one training batch."""
        x, y = x.cuda(), y.cuda()
        if args.load_checkpoint:
            mixed_x, y_a, y_b, lam = mixup_data(x, y, mixup_mat)
            pred = model(mixed_x)
            xentropy_loss = mixup_criterion(pred, y_a, y_b, lam)
        else:
            pred = model(x)
            xentropy_loss = F.cross_entropy(pred, y, reduction='none')
        return xentropy_loss.mean(), pred

    def test(loader):
        """Return ``(mean batch loss, accuracy)`` of ``model`` on ``loader``."""
        model.eval()  # Change model to 'eval' mode (BN uses moving mean/var).
        correct, total = 0., 0.
        losses = []
        for images, labels in loader:
            images, labels = images.cuda(), labels.cuda()
            with torch.no_grad():
                pred = model(images)
                # Recorded once per batch: the original appended the same
                # value twice, which doubled the work without changing the
                # mean.
                losses.append(F.cross_entropy(pred, labels).item())
            pred = torch.max(pred.data, 1)[1]
            total += labels.size(0)
            correct += (pred == labels).sum().item()
        avg_loss = float(np.mean(losses))
        acc = correct / total
        model.train()
        return avg_loss, acc

    def log_row(epoch, train_loss, train_acc):
        """Append one metrics row; reads the current val/test metrics."""
        csv_logger.writerow({
            'epoch': str(epoch),
            'train_loss': str(train_loss),
            'train_acc': str(train_acc),
            'val_loss': str(val_loss),
            'val_acc': str(val_acc),
            'test_loss': str(test_loss),
            'test_acc': str(test_acc),
            'run_time': time.time() - init_time,
            'iteration': iteration
        })

    init_time = time.time()
    val_loss, val_acc = test(val_loader)
    test_loss, test_acc = test(test_loader)
    if args.do_print:
        print(f"Initial Val Loss: {val_loss, val_acc}")
        print(f"Initial Test Loss: {test_loss, test_acc}")

    iteration = 0
    for epoch in range(0, args.epochs):
        xentropy_loss_avg = 0.
        total_val_loss, val_loss = 0., 0.
        correct = 0.
        total = 0.
        weight_norm, grad_norm = .0, .0

        progress_bar = tqdm(train_loader) if args.do_print else train_loader
        for i, (images, labels) in enumerate(progress_bar):
            if args.do_print:
                progress_bar.set_description('Epoch ' + str(epoch))

            images, labels = images.cuda(), labels.cuda()
            optimizer.zero_grad()
            xentropy_loss, pred = train_loss_func(images, labels)
            xentropy_loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            xentropy_loss_avg += xentropy_loss.item()
            iteration += 1

            # Calculate running average of accuracy
            if args.do_classification:
                pred = torch.max(pred.data, 1)[1]
                total += labels.size(0)
                correct += (pred == labels.data).sum().item()
                accuracy = correct / total
            else:
                total = 1
                accuracy = 0

            if args.do_print:
                progress_bar.set_postfix(
                    train='%.4f' % (xentropy_loss_avg / (i + 1)),
                    val='%.4f' % (total_val_loss / (i + 1)),
                    acc='%.4f' % accuracy,
                    weight='%.10f' % weight_norm,
                    update='%.10f' % grad_norm)

            if i % 100 == 0:
                val_loss, val_acc = test(val_loader)
                test_loss, test_acc = test(test_loader)
                log_row(epoch, xentropy_loss_avg / (i + 1), accuracy)

        # NOTE(review): passing ``epoch`` explicitly is deprecated in recent
        # PyTorch; switching to ``scheduler.step()`` would move every
        # milestone one epoch earlier, so the call is left unchanged.
        scheduler.step(epoch)
        train_loss = xentropy_loss_avg / (i + 1)

        # Per-epoch evaluation is only done in verbose mode.  The original
        # also had an ``else`` branch guarded by ``args.do_print``, which
        # could never run and has been dropped.
        if args.do_print:
            val_loss, val_acc = test(val_loader)
            test_loss, test_acc = test(test_loader)
            tqdm.write(
                'val loss: {:6.4f} | val acc: {:6.4f} | test loss: {:6.4f} | test_acc: {:6.4f}'
                .format(val_loss, val_acc, test_loss, test_acc))
            log_row(epoch, train_loss, accuracy)

    val_loss, val_acc = test(val_loader)
    test_loss, test_acc = test(test_loader)
    saver(args.num_finetune_epochs, model, optimizer, args.save_loc)
    return train_loss, accuracy, val_loss, val_acc, test_loss, test_acc
class Solver(object):
    """Builds, trains and evaluates an ``AtecModel`` from a config object."""

    def __init__(self, config, reuse=False):
        """Build the model; load saved weights if ``reuse`` else load data.

        Args:
            config: Configuration object (``lr``, ``log_dir``, ``model_path``,
                ``num_epoch``, ``save_step``, ``log_step``, ``batch_size``,
                ``model_dir`` are read by this class).
            reuse: When true, weights are loaded from ``config.model_path``
                and no dataloader is created.
        """
        self.config = config
        self.reuse = reuse
        self.build_model()
        if reuse:
            # remember to manually load_data by specifying modes=[...]
            self.load_model(self.config.model_path)
        else:
            self.load_data()

    def load_data(self, modes=("train", "valid", "test")):
        """Create the dataloaders named in ``modes``; others are ``None``."""
        self.train_dataloader = get_dataloader(
            self.config, mode="train") if "train" in modes else None
        self.valid_dataloader = get_dataloader(
            self.config, mode="valid") if "valid" in modes else None
        self.test_dataloader = get_dataloader(
            self.config, mode="test") if "test" in modes else None

    def build_model(self):
        """Instantiate the model, loss, optimizer, LR schedule and writer."""
        # can add preprocessing logic layer here
        self.model = AtecModel(self.config)
        # training stuff
        self.criterion = nn.CrossEntropyLoss()
        self.trainable_params = list(self.model.encoder.parameters()) + list(
            self.model.comparator.parameters())
        self.optimizer = Adam(self.trainable_params, lr=self.config.lr)
        self.scheduler = MultiStepLR(self.optimizer,
                                     milestones=[10, 20, 30],
                                     gamma=0.1)
        # bookkeeping stuff
        self.writer = SummaryWriter(self.config.log_dir)

    def load_model(self, model_path):
        """Load model parameters from ``model_path``."""
        self.model.load_state_dict(torch.load(model_path))

    def save_model(self, model, model_path):
        """Save only the parameters of ``model`` to ``model_path``."""
        torch.save(model.state_dict(), model_path)

    def train(self):
        """Train for ``config.num_epoch`` epochs, checkpointing and logging."""
        for epoch in range(self.config.num_epoch):
            self.train_step(epoch)
            # Advance the schedule after this epoch's optimizer updates;
            # stepping first skips the initial learning-rate value on
            # PyTorch >= 1.1.
            self.scheduler.step()

            # save the model per epoch, only save parameters
            if (epoch + 1) % self.config.save_step == 0:
                model_path = os.path.join(self.config.model_dir,
                                          'model-%d.pkl' % (epoch + 1))
                self.save_model(self.model, model_path)

            # log model performance over epochs
            valid_acc = self.evaluate(self.valid_dataloader).item()
            test_acc = self.evaluate(self.test_dataloader).item()
            self.writer.add_scalars('data/accuracy', {
                'valid': valid_acc,
                'test': test_acc
            }, epoch)
            print('Epoch [%d/%d], valid acc: %.4f, test acc: %.4f' %
                  (epoch + 1, self.config.num_epoch, valid_acc, test_acc))
        self.close_log(self.writer)

    def train_step(self, epoch):
        """Run one pass over the training dataloader.

        Args:
            epoch: Zero-based epoch index, used for the global step of the
                logged scalars and for the progress message.
        """
        total_steps = len(
            self.train_dataloader.dataset) // self.config.batch_size + 1
        for i, (data, labels, indices, lengths) in enumerate(
                self.train_dataloader):
            logits = self.model(data, indices)
            preds = torch.argmax(logits, dim=1).long()
            loss = self.criterion(logits, labels)
            acc = self.metric(preds, labels)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # log loss, could visualize in tensorboard if needed
            if (i + 1) % self.config.log_step == 0:
                # ``.item()`` replaces ``.data[0]``, which raises on 0-dim
                # tensors in PyTorch >= 0.5.
                loss_value = loss.item()
                acc_value = acc.item()
                global_step = epoch * total_steps + i
                self.writer.add_scalar('data/loss', loss_value, global_step)
                self.writer.add_scalar('data/train_acc', acc_value,
                                       global_step)
                print('Epoch [%d/%d], Step[%d/%d], loss: %.4f, acc: %.4f' %
                      (epoch + 1, self.config.num_epoch, i + 1, total_steps,
                       loss_value, acc_value))

    def inference(self, data, indices):
        """Return the arg-max class index for every row of the logits."""
        logits = self.model(data, indices)
        preds = torch.argmax(logits, dim=1).long()
        return preds

    def evaluate(self, dataloader):
        """Return the mean per-batch accuracy over ``dataloader``.

        Returns:
            A 0-dim double tensor; call ``.item()`` for a Python float.

        Raises:
            ZeroDivisionError: If ``dataloader`` yields no batches.
        """
        accs = []
        # Predictions are arg-max indices, so no autograd graph is needed.
        with torch.no_grad():
            for data, labels, indices, lengths in dataloader:
                preds = self.inference(data, indices)
                accs.append(self.metric(preds, labels))
        return sum(accs) / float(len(accs))

    def metric(self, preds, labels):
        """Return accuracy (fraction of equal entries) as a double tensor."""
        matches = torch.eq(preds, labels)
        return torch.sum(matches).double() / matches.shape[0]

    def close_log(self, writer, log_path="./all_scalars.json"):
        """Export the logged scalars to ``log_path`` and close ``writer``."""
        # export scalar data to JSON for external processing
        writer.export_scalars_to_json(log_path)
        writer.close()
def main():
    """Parse the command line and run the train/validate/checkpoint loop.

    Every epoch trains the model, evaluates it on the validation set,
    advances the LR schedule, and writes ``latest.pth``; ``best.pth`` is
    written whenever the validation IoU improves.
    """
    parser = ArgumentParser()
    parser.add_argument('--tag', type=str, default='run',
                        help='optional tag to identify the run')
    parser.add_argument('--dataset', choices=['nuscenes', 'argoverse'],
                        default='nuscenes', help='dataset to train on')
    parser.add_argument('--model', choices=['pyramid', 'vpn', 'ved'],
                        default='pyramid', help='model to train')
    parser.add_argument('--experiment', default='test',
                        help='name of experiment config to load')
    parser.add_argument('--resume', default=None,
                        help='path to an experiment to resume')
    parser.add_argument('--options', nargs='*', default=[],
                        help='list of addition config options as key-val pairs')
    args = parser.parse_args()

    # Configuration, experiment directory and tensorboard writer
    config = get_configuration(args)
    logdir = create_experiment(config, args.tag, args.resume)
    summary = SummaryWriter(logdir)

    # The first configured GPU becomes the default device
    gpu_count = len(config.gpus)
    if gpu_count > 0:
        torch.cuda.set_device(config.gpus[0])

    # Model, loss and data
    model = build_model(config.model, config)
    criterion = build_criterion(config.model, config)
    train_loader, val_loader = build_dataloaders(config.train_dataset, config)

    # Optimiser and learning rate schedule
    optimiser = SGD(model.parameters(),
                    config.learning_rate,
                    weight_decay=config.weight_decay)
    lr_scheduler = MultiStepLR(optimiser, config.lr_milestones, 0.1)

    latest_path = os.path.join(logdir, 'latest.pth')
    best_path = os.path.join(logdir, 'best.pth')

    # Start from scratch unless a checkpoint is being resumed
    epoch, best_iou = 1, 0
    if args.resume:
        epoch, best_iou = load_checkpoint(latest_path, model, optimiser,
                                          lr_scheduler)

    while epoch <= config.num_epochs:
        print('\n\n=== Beginning epoch {} of {} ==='.format(
            epoch, config.num_epochs))

        train(train_loader, model, criterion, optimiser, summary, config,
              epoch)
        val_iou = evaluate(val_loader, model, criterion, summary, config,
                           epoch)
        lr_scheduler.step()

        # Keep the best-scoring weights in addition to the most recent ones
        improved = val_iou > best_iou
        if improved:
            best_iou = val_iou
            save_checkpoint(best_path, model, optimiser, lr_scheduler, epoch,
                            best_iou)
        save_checkpoint(latest_path, model, optimiser, lr_scheduler, epoch,
                        best_iou)

        epoch += 1

    print("\nTraining complete!")
def main():
    """Train ``MCNet`` with an L1 loss, validating and checkpointing per epoch.

    All settings are read from the module-level ``opt``.  When ``opt.show``
    is set, a module-level tensorboard ``writer`` is created under ``logs/``.

    Raises:
        Exception: If ``opt.cuda`` is set but no CUDA device is available.
    """
    global writer

    if opt.show:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs("logs/", exist_ok=True)
        writer = SummaryWriter(log_dir='logs')

    if opt.cuda:
        print("=> Use GPU ID: '{}'".format(opt.gpus))
        os.environ["CUDA_VISIBLE_DEVICES"] = opt.gpus
        if not torch.cuda.is_available():
            raise Exception(
                "No GPU found or Wrong gpu id, please run without --cuda")

    torch.manual_seed(opt.seed)
    if opt.cuda:
        torch.cuda.manual_seed(opt.seed)
    cudnn.benchmark = True

    # Loading datasets
    train_set = TrainsetFromFolder('/media/hdisk/liqiang/hyperSR/train/' +
                                   opt.datasetName + '/' +
                                   str(opt.upscale_factor) + '/')
    train_loader = DataLoader(dataset=train_set,
                              num_workers=opt.threads,
                              batch_size=opt.batchSize,
                              shuffle=True)
    val_set = ValsetFromFolder('/media/hdisk/liqiang/hyperSR/test/' +
                               opt.datasetName + '/' +
                               str(opt.upscale_factor))
    val_loader = DataLoader(dataset=val_set,
                            num_workers=opt.threads,
                            batch_size=1,
                            shuffle=False)

    # Building model
    model = MCNet(opt)
    criterion = nn.L1Loss()
    if opt.cuda:
        model = nn.DataParallel(model).cuda()
        criterion = criterion.cuda()
    else:
        model = model.cpu()
    print('# parameters:', sum(param.numel() for param in model.parameters()))

    # Setting Optimizer
    optimizer = optim.Adam(model.parameters(),
                           lr=opt.lr,
                           betas=(0.9, 0.999),
                           eps=1e-08)

    # optionally resuming from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            opt.start_epoch = checkpoint['epoch'] + 1
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Setting learning rate
    # NOTE(review): the scheduler always starts from last_epoch=-1, so after a
    # resume the milestones are counted from the resumed epoch rather than
    # from epoch 1 -- confirm whether the scheduler state should be restored.
    scheduler = MultiStepLR(optimizer,
                            milestones=[35, 70, 105, 140, 175],
                            gamma=0.5,
                            last_epoch=-1)

    # Training
    for epoch in range(opt.start_epoch, opt.nEpochs + 1):
        print("Epoch = {}, lr = {}".format(epoch,
                                           optimizer.param_groups[0]["lr"]))
        train(train_loader, optimizer, model, criterion, epoch)
        # Advance the schedule after this epoch's optimizer updates; stepping
        # first skips the initial learning-rate value on PyTorch >= 1.1.
        scheduler.step()
        val(val_loader, model, epoch)
        save_checkpoint(epoch, model, optimizer)
def _as_python_number(value):
    """Return ``value`` as a plain Python number, unwrapping 0-dim tensors."""
    return value.item() if hasattr(value, "item") else value


def train_(train_set, test_set, lr, depth, mixup_enbale, alpha,
           model_checkpoint, epochs):
    """Train ``Net(depth)`` with SGD, optionally with mixup, and evaluate it.

    Args:
        train_set: Training dataset (batches of 128, not shuffled).
        test_set: Evaluation dataset (batches of 100).
        lr: Initial SGD learning rate.
        depth: Passed to ``Net``.
        mixup_enbale: When true, batches go through ``mixup_data`` and the
            loss is computed with ``mixup_criterion``.
        alpha: Passed to ``mixup_data``.
        model_checkpoint: Path the best-scoring weights are saved to.
        epochs: Number of epochs.

    Returns:
        Tuple ``(acc_train, acc_test)`` of per-epoch accuracy lists.
    """
    torch.manual_seed(1)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=128,
                                               shuffle=False,
                                               pin_memory=True,
                                               num_workers=2)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=100,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=2)
    network = Net(depth).to(device)
    optimizer = optim.SGD(network.parameters(),
                          lr=lr,
                          momentum=0.9,
                          nesterov=True,
                          weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss().to(device)
    scheduler = MultiStepLR(optimizer, milestones=[30, 60, 80], gamma=0.2)

    acc_train = []
    acc_test = []
    acc = 0
    best_acc = 0
    for epoch in range(epochs):
        total_correct = 0
        network.train()
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            if mixup_enbale:
                images, targets_a, targets_b, lam = mixup_data(
                    images, labels, alpha)
                preds = network(images)
                loss = mixup_criterion(criterion, preds, targets_a,
                                       targets_b, lam)
                _, predicted = torch.max(preds.data, 1)
                # Credit each prediction against both mixed targets,
                # weighted by the mixing coefficient.
                correct = (
                    lam * predicted.eq(targets_a.data).cpu().sum().float() +
                    (1 - lam) *
                    predicted.eq(targets_b.data).cpu().sum().float())
            else:
                preds = network(images)
                correct = get_num_correct(preds, labels)
                loss = criterion(preds, labels)
            total_correct += correct

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # The counter may be a tensor or a plain number depending on the
        # branch above and on the loader being non-empty.
        train_correct = _as_python_number(total_correct)
        print("epoch: ", epoch, "total_correct: ", train_correct)
        print("training accuracy: ", train_correct / len(train_set))
        acc_train.append(float(train_correct) / len(train_set))

        # Evaluate in inference mode so dropout / batch-norm layers do not
        # use their training behaviour; train() is restored at the top of
        # the next epoch.
        network.eval()
        with torch.no_grad():
            correct_test = 0
            for images_test, labels_test in test_loader:
                images_test = images_test.to(device)
                labels_test = labels_test.to(device)
                preds_test = network(images_test)
                correct_test += get_num_correct(preds_test, labels_test)

        print("testing accuracy: ", correct_test / len(test_set))
        if epoch == epochs - 1:
            print(correct_test / len(test_set))
        acc = correct_test / len(test_set)
        acc_test.append(float(correct_test) / len(test_set))

        scheduler.step()
        if best_acc < acc:
            best_acc = acc
            torch.save(network.state_dict(), model_checkpoint)
    return (acc_train, acc_test)
def train_panet(device, resume=False, dataset_name='voc'):
    """Train a ``PANetFewShotSeg`` model on few-shot segmentation episodes.

    Args:
        device: Device the model and all episode tensors are moved to.
        resume: When true, model/optimizer/scheduler state is restored with
            ``load_state`` and its return value offsets the saved step count.
        dataset_name: ``'voc'`` or ``'ircadb'``; selects the config section,
            the input size and the episode dataset.

    Returns:
        The trained model.

    Raises:
        ValueError: If ``dataset_name`` is neither ``'voc'`` nor ``'ircadb'``
            (previously this surfaced later as an unbound-variable error).
    """
    panet_cfg = cfg['panet']
    pre_trained_encoder_path = ('../data/vgg16-397923af.pth'
                                if panet_cfg['use_pretrained'] else None)
    model = PANetFewShotSeg(in_channels=cfg[dataset_name]['channels'],
                            pretrained_path=pre_trained_encoder_path,
                            cfg={'align': True},
                            encoder_type=panet_cfg['backbone']).to(device)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=panet_cfg['lr'],
                                momentum=panet_cfg['momentum'],
                                weight_decay=panet_cfg['weight_decay'])
    scheduler = MultiStepLR(optimizer,
                            milestones=panet_cfg['lr_milestones'],
                            gamma=0.1)

    # Despite its name this is an iteration offset: it is added to the
    # iteration count when saving.
    epoch = 0
    model.train()
    if resume:
        epoch = load_state(cfg[dataset_name]['model_name'], model, optimizer,
                           scheduler)

    if dataset_name == 'voc':
        episode_transforms = Compose([
            Resize(size=panet_cfg['vgg_inp_size']),
        ])
        train_dataset = get_pascal_few_shot_datasets(
            range(1, 16), panet_cfg['train_iterations'], cfg['nshot'],
            cfg['nquery'], episode_transforms)
    elif dataset_name == 'ircadb':
        episode_transforms = Compose([
            Resize(size=panet_cfg['unet_inp_size']),
        ])
        train_dataset = get_ircadb_few_shot_datasets(
            organs=[
                "bone", "spleen", "leftkidney", "rightkidney", "leftlung",
                "rightlung", "gallbladder"
            ],
            patient_ids=range(1, 16),
            iterations=panet_cfg['train_iterations'],
            N_shot=cfg['nshot'],
            N_query=cfg['nquery'],
            transforms=episode_transforms)
    else:
        raise ValueError(
            "Unsupported dataset_name {!r}; expected 'voc' or 'ircadb'".format(
                dataset_name))

    trainloader = DataLoader(train_dataset,
                             batch_size=1,
                             shuffle=True,
                             num_workers=1,
                             pin_memory=True,
                             drop_last=True)

    criterion = nn.CrossEntropyLoss(ignore_index=255)
    log_loss = {'loss': 0, 'align_loss': 0}

    for i_iter, (support, query) in enumerate(tqdm(trainloader)):
        # The model takes support data as a list of lists; a single outer
        # entry is built per episode here.
        support_images = [[shot[0].to(device) for shot in support]]
        support_fg_mask = [[shot[1].to(device) for shot in support]]
        support_bg_mask = [[shot[2].to(device) for shot in support]]

        query_images = [sample[0].to(device) for sample in query]
        query_labels = [sample[1].to(device) for sample in query]
        query_labels = torch.cat(query_labels, dim=0).long().to(device)

        # Forward and Backward
        optimizer.zero_grad()
        query_pred, align_loss = model(support_images, support_fg_mask,
                                       support_bg_mask, query_images)
        query_loss = criterion(query_pred, query_labels)
        loss = query_loss + align_loss * panet_cfg['align_loss_scalar']
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Log loss
        query_loss = query_loss.detach().data.cpu().numpy()
        align_loss = align_loss.detach().data.cpu().numpy(
        ) if align_loss != 0 else 0
        log_loss['loss'] += query_loss
        log_loss['align_loss'] += align_loss

        # print loss and take snapshots
        step = i_iter + 1
        if step % panet_cfg['save_period'] == 0:
            mean_loss = log_loss['loss'] / step
            mean_align_loss = log_loss['align_loss'] / step
            print('\nstep {}: loss: {}, align_loss: {}'.format(
                step, mean_loss, mean_align_loss))
            save_state(cfg[dataset_name]['model_name'], model, optimizer,
                       scheduler, epoch + step)
            print("\nModel Saved On Iteration {} ...".format(epoch + step))

    return model
class Trainer(TorchTrainer):
    """Runs one training or validation epoch of the box-prediction model.

    The model, configuration, data loader, epoch counter and the
    train/validate switch are all read from the module-level ``context``.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; confirm it against the original file.
    """

    def __init__(self, config, validate_tracktor):
        """Set up loss, optimizer, LR scheduler and optional box coder.

        Args:
            config: Forwarded to ``TorchTrainer.__init__``.
            validate_tracktor: Callable invoked as
                ``validate_tracktor(model, epoch)``; its result is merged
                into the metrics dict returned by ``epoch``.

        Raises:
            ValueError: If ``context.cfg.scheduler_type`` is neither
                ``'plateau'`` nor ``'multistep'``.
        """
        super().__init__(config)
        self.validate_tracktor = validate_tracktor
        # Element-wise MSE; the reduction is applied in ``criterion``.
        self.mse = nn.MSELoss(reduction='none')
        self.optim = torch.optim.Adam(context.model.parameters(),
                                      lr=context.cfg.lr,
                                      weight_decay=context.cfg.weight_decay)
        if context.cfg.scheduler_type == 'plateau':
            self.sched = ReduceLROnPlateau(self.optim,
                                           verbose=True,
                                           **context.cfg.scheduler_args)
        elif context.cfg.scheduler_type == 'multistep':
            self.sched = MultiStepLR(self.optim, **context.cfg.scheduler_args)
        else:
            raise ValueError(
                f'Unknown scheduler: {context.cfg.scheduler_type}')
        if context.cfg.use_box_coding:
            self.use_box_coding = True
            self.predict_coded_a = context.cfg.predict_coded_a
            self.loss_coded = context.cfg.loss_coded
            self.box_coder = BoxCoder(context.cfg.box_coding_weights)
        else:
            self.use_box_coding = False
            self.loss_coded = False

    def criterion(self, input, target):
        """Return the mean squared error over all box coordinates.

        Both tensors are flattened to rows of four values first.  Only the
        ``'mse'`` loss is supported (asserted).
        """
        input = input.view(-1, 4)
        target = target.view(-1, 4)
        assert context.cfg.loss == 'mse'
        return self.mse(input, target).mean()

    def criterion_coded(self, prediction, x, target, last_coded):
        """Return the MSE between ``prediction`` and an encoded target.

        The target boxes are encoded relative to the last step of ``x``,
        and the last step of ``last_coded`` (first four channels) is
        subtracted from that encoding before comparing with the first four
        channels of ``prediction``.
        """
        target_coded = self.box_coder.encode(list(target[:, :, :4]),
                                             list(x[:, [-1], :4]))
        # Sliced here, so callers may pass the full diffs tensor.
        last_coded = last_coded[:, [-1], :4]
        loss = self.criterion(prediction[:, :, :4],
                              torch.stack(target_coded) - last_coded)
        return loss

    def epoch(self):
        """Process every batch of ``context.data_loader`` once.

        Optimizer updates are applied only when ``context.validate`` is
        false and ``context.epoch > 0``; in validation mode the LR scheduler
        is stepped at the end instead.

        Returns:
            Dict of metrics: ``loss``, ``iou``, ``miou``, the ``*_cva``
            baseline values, ``loss_1`` .. ``loss_6`` and, on selected
            validation epochs, whatever ``validate_tracktor`` returns.
        """
        loss_epoch = []
        loss_lengths = [[], [], [], [], [], []]  # losses for different episode lengths
        iou_epoch = []
        miou_epoch = []
        # Per-batch CPU copies, concatenated after the loop.
        all_input = []
        all_out = []
        all_diffs = []
        all_gt = []
        all_pred_pos = []
        all_prev_pos = []
        for boxes_in, boxes_target, boxes_resized, image_features, image_sizes, lengths, levels in tqdm(
                context.data_loader):
            # move tensors to GPU
            boxes_in = boxes_in.cuda()
            boxes_target = boxes_target.cuda()
            boxes_resized = boxes_resized.cuda()
            # in case we're working with float16 features, only convert to float32 once they're on the gpu
            if isinstance(image_features, list):
                image_features = [
                    feat.cuda().float() for feat in image_features
                ]
            else:
                image_features = image_features.cuda().float()

            # Model input: per-step box differences in channels 0-3 and a
            # flag in channel 5 (channel 4 stays zero).
            diffs = torch.zeros(boxes_in.shape[0],
                                context.cfg.model_args['input_length'],
                                6).cuda()
            if self.use_box_coding:
                encoded = self.box_coder.encode(list(boxes_in[:, 1:, :4]),
                                                list(boxes_in[:, :-1, :4]))
                diffs[:, :, :4] = torch.stack(encoded, dim=0)
            else:
                # raises error if model and dataset lengths do not match
                diffs[:, :, :4] = boxes_in[:, 1:, :4] - boxes_in[:, :-1, :4]
            diffs[:, :, 5] = 1.
            # Zero every step whose source box has channel 5 == 0
            # (presumably a validity flag -- TODO confirm).
            diffs[(boxes_in[:, :, 5] == 0.)[:, :-1]] = 0.

            if not context.validate:
                self.optim.zero_grad()
                # Teacher forcing is applied with probability
                # cfg.teacher_forcing.
                do_tf = context.cfg.teacher_forcing > 0 and np.random.uniform(
                ) < context.cfg.teacher_forcing
                out = context.model(diffs, boxes_target, boxes_resized,
                                    image_features, image_sizes, lengths,
                                    do_tf)
            else:
                out = context.model.predict(diffs, boxes_resized,
                                            image_features, image_sizes,
                                            lengths, boxes_target.shape[1])
            # Only single-step prediction is supported below.
            assert out.shape[1] == 1

            last_input = boxes_in[:, -1, :].unsqueeze(1)
            if self.use_box_coding:
                if self.predict_coded_a:
                    # out is the acceleration in encoding space
                    last_offset = diffs[:, [-1], :4]
                    pred_offset = last_offset + out[:, :, :4]
                    pred_pos = self.box_coder.decode(list(pred_offset),
                                                     list(last_input))
                else:
                    # out is the absolute encoded offset
                    pred_pos = self.box_coder.decode(list(out[:, :, :4]),
                                                     list(last_input))
            else:
                # Last box + last observed difference + predicted correction.
                pred_pos = last_input[:, :, :4] + diffs[:, [-1], :
                                                        4] + out[:, :, :4]

            # calculate loss
            if self.use_box_coding:
                last_coded = diffs[:, [-1], :4]
                loss = self.criterion_coded(out, boxes_in, boxes_target,
                                            last_coded)
            else:
                loss = self.criterion(pred_pos, boxes_target[:, :, :4])
            loss_epoch.append(loss.detach().cpu())

            # DIFFERENT LENGTH ANALYSIS
            # Bucket the loss by the value of ``lengths`` (2 .. 7).
            for i, loss_list in zip(range(2, 8), loss_lengths):
                mask = lengths == i
                if mask.any():
                    if self.use_box_coding:
                        loss_part = self.criterion_coded(
                            out[mask], boxes_in[mask], boxes_target[mask],
                            diffs[mask])
                    else:
                        loss_part = self.criterion(
                            pred_pos[mask], boxes_target[:, :, :4][mask])
                    loss_list.append(loss_part.detach().cpu())

            # No parameter update during epoch 0 or validation.
            if not context.validate and context.epoch > 0:
                loss.backward()
                # nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                self.optim.step()

            all_input.append(boxes_in.detach().cpu())
            all_out.append(out.detach().cpu())
            all_diffs.append(diffs.detach().cpu())
            all_gt.append(boxes_target[:, :, :4].detach().cpu())
            all_pred_pos.append(pred_pos.detach().cpu())
            all_prev_pos.append(
                torch.cat([
                    last_input[:, :, :4].detach().cpu(),
                    boxes_target[:, :-1, :4].detach().cpu()
                ],
                          dim=1))

            # evaluate iou
            iou = jaccard(
                pred_pos.view(-1, 4).detach(),
                boxes_target[:, :, :4].view(-1, 4).detach())
            iou = iou[~torch.isnan(iou)]
            iou_epoch.append(iou)
            # Fraction of predictions with IoU above 0.7.
            miou_epoch.append((iou > 0.7).sum().float() / len(iou))

        all_input = torch.cat(all_input)
        all_out = torch.cat(all_out)
        all_diffs = torch.cat(all_diffs)
        all_gt = torch.cat(all_gt)
        all_pred_pos = torch.cat(all_pred_pos)
        all_prev_pos = torch.cat(all_prev_pos)

        assert all_prev_pos.shape[1] == all_gt.shape[1] == all_pred_pos.shape[
            1] == 1
        eval_df = evaluate_classes(all_prev_pos.squeeze(1), all_gt.squeeze(1),
                                   all_pred_pos.squeeze(1))['df']

        # calculate cva performance for current epoch
        # The baseline extrapolates the last input box by the mean per-step
        # difference over steps whose channel-5 flag is 1 ("cva" presumably
        # stands for constant velocity assumption -- TODO confirm).
        diff = all_input[:, 1:, :4] - all_input[:, :-1, :4]
        m = (all_input[:, :, 5] == 1.)[:, :-1].unsqueeze(1).float()
        v_mean = torch.bmm(m, diff) / m.sum(dim=2).unsqueeze(2)
        # set NaNs to zero (https://discuss.pytorch.org/t/how-to-set-nan-in-tensor-to-0/3918/4)
        v_mean[v_mean != v_mean] = 0.
        v_mean = v_mean.squeeze(1)
        pred_cva = all_input[:, -1, :4] + v_mean
        # Keep only baseline boxes with non-negative width and height.
        val_mask = ((pred_cva[:, 2] - pred_cva[:, 0]) >=
                    0) & ((pred_cva[:, 3] - pred_cva[:, 1]) >= 0)

        iou_cva = jaccard(pred_cva, all_gt.squeeze(1)).mean()
        miou_cva = (jaccard(pred_cva, all_gt.squeeze(1)) >
                    0.7).sum().float() / len(pred_cva)
        if self.use_box_coding:
            offset_cva = self.box_coder.encode(
                list(pred_cva[val_mask].unsqueeze(1)[:, :, :4]),
                list(all_input[val_mask][:, [-1], :4]))
            coded_cva = torch.stack(offset_cva) - all_diffs[val_mask][:,
                                                                      [-1], :4]
            loss_cva = self.criterion_coded(coded_cva, all_input[val_mask],
                                            all_gt[val_mask],
                                            all_diffs[val_mask])
        else:
            loss_cva = self.criterion(pred_cva, all_gt.squeeze(1))

        if context.validate:
            # Validation metrics are recomputed over the whole epoch at once
            # rather than averaged per batch.
            if self.use_box_coding:
                loss_epoch = self.criterion_coded(all_out, all_input, all_gt,
                                                  all_diffs)
            else:
                loss_epoch = self.criterion(
                    all_pred_pos.squeeze(1)[:, :4], all_gt.squeeze(1)).mean()
            iou = jaccard(all_pred_pos.squeeze(1)[:, :4], all_gt.squeeze(1))
            iou_epoch = iou.mean()
            miou_epoch = ((iou > 0.7).sum().float() / len(iou))
            with open(context.log_path / f'{context.epoch}_df_val.txt',
                      'w') as fh:
                fh.write(eval_df.to_string())

            # The schedule is advanced once per validation epoch.
            if context.cfg.scheduler_type == 'plateau':
                self.sched.step(loss_epoch, epoch=context.epoch)
            elif context.cfg.scheduler_type == 'multistep':
                self.sched.step(epoch=context.epoch)
        else:
            loss_epoch = torch.tensor(loss_epoch).mean()
            iou_epoch = torch.cat(iou_epoch).mean()
            miou_epoch = torch.stack(miou_epoch).float().mean()
            with open(context.log_path / f'{context.epoch}_df_train.txt',
                      'w') as fh:
                fh.write(eval_df.to_string())

        metrics = {
            'loss': loss_epoch,
            'iou': iou_epoch,
            'miou': miou_epoch,
            'iou_cva': iou_cva,
            'miou_cva': miou_cva,
            'loss_cva': loss_cva,
            'loss_1': torch.tensor(loss_lengths[0]).mean(),
            'loss_2': torch.tensor(loss_lengths[1]).mean(),
            'loss_3': torch.tensor(loss_lengths[2]).mean(),
            'loss_4': torch.tensor(loss_lengths[3]).mean(),
            'loss_5': torch.tensor(loss_lengths[4]).mean(),
            'loss_6': torch.tensor(loss_lengths[5]).mean()
        }

        # The tracker validation runs only on every
        # ``cfg.tracktor_val_every``-th validation epoch.
        if context.epoch % context.cfg.tracktor_val_every == 0 and context.validate:
            with torch.no_grad():
                metrics = {
                    **metrics,
                    **self.validate_tracktor(context.model, context.epoch)
                }

        return metrics
def train(args):
    """Train the sketch generator adversarially against the discriminator.

    Builds the data pipeline, the generator/discriminator pair and a VGG-19
    feature network, then optimises the generator with a weighted sum of an
    MRF feature (style) loss, a least-squares adversarial loss and a
    total-variation loss, while the discriminator is trained with a
    least-squares real/fake loss. Generator and discriminator weights are
    written to ``args.save_weight_path`` after every epoch.

    Args:
        args: Parsed options. The fields read here are ``seed``,
            ``train_data``, ``batch_size``, ``Gnorm``, ``Dnorm``,
            ``vgg19_weight``, ``gpus`` (comma-separated device ids),
            ``resume``, ``save_weight_path``, ``epochs``, ``lr``,
            ``train_style`` (``'cufs'`` or ``'cufsf'``), ``flayers``,
            ``topk``, ``meanshift`` and ``weight`` (style, adversarial and
            total-variation weights, in that order). ``args.epochs`` is
            overwritten with the derived number of passes over the dataset.

    Raises:
        ValueError: If ``args.train_style`` is not a supported style.
    """
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True
    # Seed every random number generator used during training.
    np.random.seed(args.seed)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # -------------------- Load data ----------------------------------
    transform = transforms.Compose([
        Rescale((224, 224)),
        ColorJitter(0.5, 0.5, 0.5, 0.3, 0.5),
        ToTensor(),
    ])
    dataset = FaceDataset(args.train_data, True, transform=transform)
    data_loader = DataLoader(dataset,
                             shuffle=True,
                             batch_size=args.batch_size,
                             drop_last=True,
                             num_workers=4)

    # ----------------- Define networks ---------------------------------
    Gnet = SketchNet(in_channels=3, out_channels=1, norm_type=args.Gnorm)
    Dnet = DNet(norm_type=args.Dnorm)
    vgg19_model = vgg19(args.vgg19_weight)
    gpu_ids = [int(x) for x in args.gpus.split(',')]
    if gpu_ids:
        Gnet.cuda()
        Dnet.cuda()
        Gnet = nn.DataParallel(Gnet, device_ids=gpu_ids)
        Dnet = nn.DataParallel(Dnet, device_ids=gpu_ids)
        vgg19_model = nn.DataParallel(vgg19_model, device_ids=gpu_ids)

    Gnet.train()
    Dnet.train()

    if args.resume:
        weights = glob(os.path.join(args.save_weight_path, '*-*.pth'))
        # The newest checkpoint sorts last; dropping the trailing
        # 'G.pth' / 'D.pth' (5 characters) leaves the shared name prefix.
        weight_path = sorted(weights)[-1][:-5]
        Gnet.load_state_dict(torch.load(weight_path + 'G.pth'))
        Dnet.load_state_dict(torch.load(weight_path + 'D.pth'))

    # ---------------- set optimizer and learning rate ---------------------
    # Rescale the requested epoch count by 1000 / len(dataset), with a
    # minimum of 4 epochs; the LR drops at 1/4 and 2/4 of the run.
    args.epochs = np.ceil(args.epochs * 1000 / len(dataset))
    args.epochs = max(int(args.epochs), 4)
    ms = [int(1. / 4 * args.epochs), int(2. / 4 * args.epochs)]

    optim_G = Adam(Gnet.parameters(), args.lr)
    optim_D = Adam(Dnet.parameters(), args.lr)
    scheduler_G = MultiStepLR(optim_G, milestones=ms, gamma=0.1)
    scheduler_D = MultiStepLR(optim_D, milestones=ms, gamma=0.1)
    mse_crit = nn.MSELoss()

    # ---------------------- Define reference styles and feature loss layers ----------
    if args.train_style == 'cufs':
        ref_style_dataset = ['CUHK_student', 'AR', 'XM2VTS']
        ref_feature = './data/cufs_feature_dataset.pth'
        ref_img_list = './data/cufs_reference_img_list.txt'
    elif args.train_style == 'cufsf':
        ref_style_dataset = ['CUFSF']
        ref_feature = './data/cufsf_feature_dataset.pth'
        ref_img_list = './data/cufsf_reference_img_list.txt'
    else:
        # An explicit raise (instead of `assert`) still fires under `-O`.
        raise ValueError('Train style {} not supported.'.format(
            args.train_style))

    vgg_feature_layers = ['r11', 'r21', 'r31', 'r41', 'r51']
    # args.flayers acts as a selector mask over the VGG layer names.
    feature_loss_layers = list(
        itertools.compress(vgg_feature_layers, args.flayers))

    log = logger.Logger(args.save_weight_path)

    for e in range(args.epochs):
        sample_count = 0
        for batch_idx, batch_data in enumerate(data_loader):
            # ---------------- Load data -------------------
            start = time()
            train_img, train_img_org = [
                utils.tensorToVar(x) for x in batch_data
            ]
            topk_sketch_img, topk_photo_img = search_dataset.find_photo_sketch_batch(
                train_img_org,
                ref_feature,
                ref_img_list,
                vgg19_model,
                dataset_filter=ref_style_dataset,
                topk=args.topk)
            random_real_sketch = search_dataset.get_real_sketch_batch(
                train_img.size(0),
                ref_img_list,
                dataset_filter=ref_style_dataset)
            end = time()
            data_time = end - start
            sample_count += train_img.size(0)

            # ---------------- Model forward -------------------
            start = time()
            fake_sketch = Gnet(train_img)
            fake_score = Dnet(fake_sketch)
            real_score = Dnet(random_real_sketch)

            real_label = torch.ones_like(fake_score)
            fake_label = torch.zeros_like(fake_score)

            # ----------------- Calculate loss and backward -------------------
            train_img_org_vgg = img_process.subtract_mean_batch(
                train_img_org, 'face')
            topk_sketch_img_vgg = img_process.subtract_mean_batch(
                topk_sketch_img, 'sketch')
            topk_photo_img_vgg = img_process.subtract_mean_batch(
                topk_photo_img, 'face')
            fake_sketch_vgg = img_process.subtract_mean_batch(
                fake_sketch.expand_as(train_img_org), 'sketch',
                args.meanshift)

            style_loss = loss.feature_mrf_loss_func(
                fake_sketch_vgg,
                topk_sketch_img_vgg,
                vgg19_model,
                feature_loss_layers,
                [train_img_org_vgg, topk_photo_img_vgg],
                topk=args.topk)

            tv_loss = loss.total_variation(fake_sketch)

            # GAN Loss
            adv_loss = mse_crit(fake_score, real_label) * args.weight[1]
            tv_loss = tv_loss * args.weight[2]
            loss_G = style_loss * args.weight[0] + adv_loss + tv_loss
            loss_D = 0.5 * mse_crit(fake_score, fake_label) + 0.5 * mse_crit(
                real_score, real_label)

            # Update parameters
            # NOTE(review): loss_G is back-propagated through Dnet after
            # optim_D.step() has already updated Dnet's weights in place;
            # confirm this is accepted by the targeted PyTorch version.
            optim_D.zero_grad()
            loss_D.backward(retain_graph=True)
            optim_D.step()

            optim_G.zero_grad()
            loss_G.backward()
            optim_G.step()
            end = time()
            train_time = end - start

            # ----------------- Print result and log the output -------------------
            log.iterLogUpdate(loss_G.item())
            if batch_idx % 100 == 0:
                log.draw_loss_curve()
                msg = ("{:%Y-%m-%d %H:%M:%S}\tEpoch [{:03d}/{:03d}]"
                       "\tBatch [{:03d}/{:03d}]"
                       "\tData: {:.2f} Train: {:.2f}"
                       "\tLoss: G-{:.4f}, Adv-{:.4f}, tv-{:.4f}, D-{:.4f}"
                       ).format(
                           datetime.now(), e, args.epochs, sample_count,
                           len(dataset), data_time, train_time,
                           *(x.item()
                             for x in (loss_G, adv_loss, tv_loss, loss_D)))
                print(msg)
                # The context manager closes the file even if the write fails.
                with open(os.path.join(args.save_weight_path, 'log.txt'),
                          'a+') as log_file:
                    log_file.write(msg + '\n')

        # Advance the LR schedules only after this epoch's optimizer steps;
        # stepping them first skips the first value of the schedule on
        # PyTorch >= 1.1.
        scheduler_G.step()
        scheduler_D.step()

        # Save CPU copies of both networks once per epoch.
        save_weight_name = "epochs-{:03d}-".format(e)
        G_cpu_model = copy.deepcopy(Gnet).cpu()
        D_cpu_model = copy.deepcopy(Dnet).cpu()
        torch.save(
            G_cpu_model.state_dict(),
            os.path.join(args.save_weight_path, save_weight_name + 'G.pth'))
        torch.save(
            D_cpu_model.state_dict(),
            os.path.join(args.save_weight_path, save_weight_name + 'D.pth'))
class Experiment:
    """GAN experiment that trains a generator to turn real photos into
    anime-style images, driven by a JSON configuration file."""

    def __init__(self, config_file="config.json"):
        """Load the configuration and build data loaders, models and optimizers.

        Args:
            config_file: Path to the JSON configuration. The keys read are
                ``dataset.root``, ``dataset.batch_size``, ``num_epoch``,
                ``train.G_warming``, ``model.G_path``, ``model.D_path``,
                ``optim.{D,G}_lr``, ``optim.{D,G}_step`` and
                ``optim.{D,G}_gamma`` (``valid.save_path`` is read later by
                ``_valid``).

        Raises:
            FileNotFoundError: If ``config_file`` does not exist.
        """
        # read config.json file
        if not os.path.isfile(config_file):
            # FileNotFoundError is still an Exception, so existing
            # `except Exception` handlers keep working.
            raise FileNotFoundError("file does not exist: %s" % config_file)
        with open(config_file) as json_file:
            config = json.load(json_file)
        self.config = config

        # read in
        root = config["dataset"]["root"]
        self.root = os.path.abspath(root)
        self.num_epoch = config["num_epoch"]
        self.warmup_epoch = config["train"]["G_warming"]
        self.batch_size = config["dataset"]["batch_size"]
        self.G_path = config["model"]["G_path"]
        self.D_path = config["model"]["D_path"]

        # edge promoting (done once; the result is cached on disk)
        edge_dir = os.path.join(self.root, "edge_smoothed")
        if not os.path.isdir(edge_dir):
            src_dir = os.path.join(self.root, "violet", "train")
            utils.edge_promoting(src_dir, edge_dir)
        else:
            print("edge-promoting already done %s" % edge_dir)

        # initialize dataset
        train_real_dataset = MyDataset(self.root, style="real", mode="train")
        train_anim_dataset = MyDataset(self.root,
                                       style="edge_smoothed",
                                       mode="")
        val_real_dataset = MyDataset(self.root, style="real", mode="valid")
        val_anim_dataset = MyDataset(self.root, style="violet", mode="valid")
        test_dataset = MyDataset(self.root, style="real", mode="test")

        self.train_real_loader = DataLoader(train_real_dataset,
                                            batch_size=self.batch_size,
                                            shuffle=True,
                                            num_workers=12)
        self.train_anim_loader = DataLoader(train_anim_dataset,
                                            batch_size=self.batch_size,
                                            shuffle=True,
                                            num_workers=12)
        self.val_real_loader = DataLoader(val_real_dataset,
                                          batch_size=self.batch_size,
                                          shuffle=True,
                                          num_workers=12)
        self.val_anim_loader = DataLoader(val_anim_dataset,
                                          batch_size=self.batch_size,
                                          shuffle=True,
                                          num_workers=12)
        # NOTE(review): placeholder (Ellipsis) -- no test loader is built
        # here, so `test_dataset` is currently unused.
        self.test_loader = ...

        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')
        print("Using device: ", self.device)

        # initialize Discriminator and Generator
        self.D = model.discriminator()
        self.D.to(self.device)
        self.G = model.generator()
        self.G.to(self.device)

        # initialize vgg19 pretrained model
        self.vgg19 = torchvision.models.vgg19(pretrained=True)
        self.vgg19.to(self.device)
        self.vgg19.eval()
        # VGG is a fixed feature extractor (it belongs to no optimizer), so
        # do not compute or accumulate gradients for its weights.
        for param in self.vgg19.parameters():
            param.requires_grad = False

        # initialize optimizer
        self.D_optimizer = optim.Adam(self.D.parameters(),
                                      config["optim"]["D_lr"],
                                      betas=(0.5, 0.99))
        self.G_optimizer = optim.Adam(self.G.parameters(),
                                      config["optim"]["G_lr"],
                                      betas=(0.5, 0.99))

        # initialize loss function
        self.BCE_loss = nn.BCELoss().to(self.device)
        self.L1_loss = nn.L1Loss().to(self.device)
        self.content_loss_lambda = 10

        # initialize scheduler
        self.D_scheduler = MultiStepLR(self.D_optimizer,
                                       config["optim"]["D_step"],
                                       config["optim"]["D_gamma"])
        self.G_scheduler = MultiStepLR(self.G_optimizer,
                                       config["optim"]["G_step"],
                                       config["optim"]["G_gamma"])

    def _train(self, e):
        """Train discriminator and generator for one epoch.

        Args:
            e: Epoch index, used only for progress messages.

        Returns:
            Tuple ``(average_D_loss, average_G_loss, average_content_loss)``
            with the mean per-batch losses of this epoch.
        """
        # put model to training mode
        self.D.train()
        self.G.train()

        # arrays to store the losses
        D_losses = []
        G_losses = []
        Content_losses = []

        paired_batches = zip(self.train_real_loader, self.train_anim_loader)
        for i, (src, anim) in enumerate(paired_batches):
            # The first 256 entries along the last axis are used as the
            # original anime image, the remainder as its edge-smoothed copy
            # (presumably side-by-side image pairs -- confirm with MyDataset).
            origin_anim = anim[:, :, :, :256]
            edge_smooth_anim = anim[:, :, :, 256:]
            src = src.to(self.device)
            edge_smooth_anim = edge_smooth_anim.to(self.device)
            origin_anim = origin_anim.to(self.device)

            # ---- train discriminator ----
            # discriminate real anime image
            D_real = self.D(origin_anim)
            D_real_loss = self.BCE_loss(
                D_real, torch.ones_like(D_real, device=self.device))

            # discriminate generated/fake anime image; detach so the
            # discriminator update does not back-propagate into the generator
            fake_anim = self.G(src)
            D_fake = self.D(fake_anim.detach())
            D_fake_loss = self.BCE_loss(
                D_fake, torch.zeros_like(D_fake, device=self.device))

            # discriminate real anime image without clear edges
            D_edge = self.D(edge_smooth_anim)
            D_edge_loss = self.BCE_loss(
                D_edge, torch.zeros_like(D_edge, device=self.device))

            D_loss = D_real_loss + D_fake_loss + D_edge_loss
            self.D_optimizer.zero_grad()
            D_loss.backward()
            self.D_optimizer.step()

            # ---- train generator ----
            # generated/fake anime image, scored by the updated discriminator
            fake_anim = self.G(src)
            D_fake = self.D(fake_anim)
            D_fake_loss = self.BCE_loss(
                D_fake, torch.ones_like(D_fake, device=self.device))

            # content loss (L1); (x + 1) / 2 presumably maps images from
            # [-1, 1] to [0, 1] before VGG -- confirm against the dataset
            src_feature = self.vgg19((src + 1) / 2)
            G_feature = self.vgg19((fake_anim + 1) / 2)
            Content_loss = self.content_loss_lambda * self.L1_loss(
                G_feature, src_feature.detach())

            G_loss = D_fake_loss + Content_loss
            self.G_optimizer.zero_grad()
            G_loss.backward()
            self.G_optimizer.step()

            print(
                "Epoch: %s, Index: %s, Discriminator loss: %.3f, Generator loss: %.3f, Content loss: %.3f"
                % (e, i, D_loss.item(), G_loss.item(), Content_loss.item()))

            D_losses.append(D_loss.item())
            G_losses.append(G_loss.item())
            Content_losses.append(Content_loss.item())

        average_D_loss = np.mean(D_losses)
        average_G_loss = np.mean(G_losses)
        average_content_loss = np.mean(Content_losses)

        print()
        print(
            "Average: Epoch: %s, Discriminator loss: %.3f, Generator loss: %.3f, Content loss: %.3f"
            % (e, average_D_loss, average_G_loss, average_content_loss))
        print()

        # one scheduler step per epoch, after the optimizer updates
        self.G_scheduler.step()
        self.D_scheduler.step()

        return average_D_loss, average_G_loss, average_content_loss

    def _train_warming(self, e):
        """Warm up the generator for one epoch using only the content loss.

        Args:
            e: Epoch index, used only for progress messages.

        Returns:
            Mean per-batch content loss of this epoch.
        """
        # put generator to training mode
        self.G.train()

        # array to store the losses
        Content_losses = []

        for i, src in enumerate(self.train_real_loader):
            src = src.to(self.device)

            # generated/fake anime image
            fake_anim = self.G(src)

            # content loss (L1)
            src_feature = self.vgg19((src + 1) / 2)
            G_feature = self.vgg19((fake_anim + 1) / 2)
            Content_loss = self.content_loss_lambda * self.L1_loss(
                G_feature, src_feature.detach())

            self.G_optimizer.zero_grad()
            Content_loss.backward()
            self.G_optimizer.step()

            print("Epoch: %s, Index: %s, Content loss: %.3f" %
                  (e, i, Content_loss.item()))
            Content_losses.append(Content_loss.item())

        average_content_loss = np.mean(Content_losses)
        print()
        print("Epoch: %s, Average content loss: %.3f" %
              (e, average_content_loss))
        print()
        return average_content_loss

    def _valid(self, e, pretrain=False):
        """Save side-by-side source/generated images for visual inspection.

        Writes one image for each of the first seven validation batches to
        ``config["valid"]["save_path"]``.

        Args:
            e: Epoch index, embedded in the image file names.
            pretrain: Truthy during warm-up; selects the ``pretrain_`` file
                name prefix instead of ``during_train_``.
        """
        save_path = self.config["valid"]["save_path"]
        if save_path:
            # make sure the output directory exists before writing images
            os.makedirs(save_path, exist_ok=True)
        name_template = ("pretrain_%s_%s.png"
                         if pretrain else "during_train_%s_%s.png")

        with torch.no_grad():
            self.G.eval()
            for i, src in enumerate(self.val_real_loader):
                src = src.to(self.device)
                generated_img = self.G(src)
                # first sample of the batch: source and output joined along
                # the last axis, then channels moved last for plotting
                result = torch.cat((src[0], generated_img[0]), 2)
                result = (result.cpu().numpy().transpose(1, 2, 0) + 1) / 2
                path = os.path.join(save_path, name_template % (e, i))
                plt.imsave(path, result)
                if i == 6:
                    break

    def run(self):
        """Run the warm-up phase followed by adversarial training.

        Returns:
            Tuple ``(warm_up_content_losses, training_D_losses,
            training_G_losses, training_content_losses)``; each entry is a
            list with one average loss per epoch.
        """
        warm_up_content_losses = []
        # store the average loss at each epoch
        training_D_losses = []
        training_G_losses = []
        training_content_losses = []

        print("start warming up")
        for e in range(self.warmup_epoch):
            curr_content_loss = self._train_warming(e)
            warm_up_content_losses.append(curr_content_loss)
            self._valid(e, True)

        print("start training and validating")
        for e in range(self.num_epoch):
            curr_D_loss, curr_G_loss, curr_content_loss = self._train(e)
            training_D_losses.append(curr_D_loss)
            training_G_losses.append(curr_G_loss)
            training_content_losses.append(curr_content_loss)
            self._valid(e, False)

        return (warm_up_content_losses, training_D_losses, training_G_losses,
                training_content_losses)

    def _save_model(self, epoch, D_state, G_state, D_optim_state,
                    G_optim_state):
        """Write discriminator and generator checkpoints.

        Each checkpoint is a dict holding the epoch, the network state and
        its optimizer state, saved to ``self.D_path`` / ``self.G_path``.
        """
        torch.save(
            {
                "epoch": epoch,
                "D_state": D_state,
                "D_optim_state": D_optim_state
            }, self.D_path)
        torch.save(
            {
                "epoch": epoch,
                "G_state": G_state,
                "G_optim_state": G_optim_state
            }, self.G_path)

    def _test(self):
        """Placeholder: testing is not implemented; returns ``None``."""
        return
def train_model(model, args):
    """Train ``model`` on the LMDB training set, validating after every epoch.

    Weights are saved to ``<args.weights_path>/<epoch>.pt`` whenever the
    validation accuracy improves on the best value seen so far.

    Args:
        model: Network returning six outputs per batch (a length prediction
            and five digit predictions).
        args: Options; the fields read are ``train_lmdb_path``,
            ``val_lmdb_path``, ``batch_size``, ``learning_rate``, ``epochs``
            and ``weights_path``.

    Returns:
        Tuple ``(training_losses, training_accuracies,
        validation_accuracies, validation_losses)`` of dicts keyed by epoch
        index.
    """
    # Image normalization
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [109.9, 109.7, 113.8]],
        std=[x / 255.0 for x in [50.1, 50.6, 50.8]])

    # Compose transformations to be applied on image
    train_transform = transforms.Compose([
        transforms.Resize((54, 54)),
        transforms.ToTensor(),
        normalize,
    ])

    # Define train data set and data loader
    train_dataset = Dataset(args.train_lmdb_path, train_transform)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True)

    # Initialize loss, optimizer, scheduler
    loss_object = Custom_loss()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=0.9,
                                nesterov=True,
                                weight_decay=0.0005)
    scheduler = MultiStepLR(optimizer, milestones=[80, 120], gamma=0.1)

    # Move the model and loss to the GPU once instead of on every iteration.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()
        loss_object = loss_object.cuda()

    best_accuracy = 0
    training_accuracies = dict()
    training_losses = dict()
    validation_accuracies = dict()
    validation_losses = dict()

    # Start training
    # NOTE(review): the model is never switched back to train mode here; if
    # `validation` calls model.eval(), later epochs train in eval mode --
    # confirm against `validation`.
    for epoch in range(args.epochs):
        total_epoch_loss = 0
        total_epoch_accuracy = 0
        print(f'Starting training for epoch {epoch}')

        for i, data in enumerate(train_loader):
            print(f'Starting training for iteration {i}')
            images, lengths, labels = data
            if use_cuda:
                images = images.cuda()
                lengths = lengths.cuda()
                labels = labels.cuda()

            (pred_length, pred_digit1, pred_digit2, pred_digit3, pred_digit4,
             pred_digit5) = model(images)

            loss = loss_object.loss(pred_length, pred_digit1, pred_digit2,
                                    pred_digit3, pred_digit4, pred_digit5,
                                    lengths, labels) / args.batch_size

            # Detach before accumulating so the running total does not keep
            # every iteration's autograd graph alive.
            total_epoch_loss += loss.detach()
            total_epoch_accuracy += calculate_accuracy(
                pred_length, pred_digit1, pred_digit2, pred_digit3,
                pred_digit4, pred_digit5, lengths, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        training_accuracies[epoch] = total_epoch_accuracy / len(train_loader)
        training_losses[epoch] = total_epoch_loss / len(train_loader)

        validation_loss, validation_accuracy = validation(
            model, args.val_lmdb_path, args.batch_size, loss_object)
        validation_losses[epoch] = validation_loss
        validation_accuracies[epoch] = validation_accuracy

        if validation_accuracy > best_accuracy:
            # Remember the new best so only real improvements are saved.
            best_accuracy = validation_accuracy
            # `epoch` is an int and must be formatted into the file name.
            torch.save(model.state_dict(),
                       f'{args.weights_path}/{epoch}.pt')

        # Stepping without the deprecated epoch argument makes the LR drop
        # at the milestone epochs themselves rather than one epoch later.
        scheduler.step()

    return (training_losses, training_accuracies, validation_accuracies,
            validation_losses)
def main():
    """Train the BRN network with a negative-SSIM loss.

    Reads all settings from the module-level ``opt`` (``save_path``,
    ``data_path``, ``batchSize``, ``inter_iter``, ``use_GPU``, ``lr``,
    ``epochs``, ``save_freq``), resumes from the last checkpoint found in
    ``opt.save_path`` and logs scalars and image grids through a summary
    writer. ``net_latest.pth`` is rewritten after every epoch and
    ``net_epoch<N>.pth`` whenever ``epoch % opt.save_freq == 0``.
    """
    os.makedirs(opt.save_path, exist_ok=True)

    # Load dataset
    print('Loading dataset ...\n')
    if 'Light' in opt.data_path or 'Heavy' in opt.data_path:
        dataset_train = newDataset(data_path=opt.data_path)
    else:
        dataset_train = Dataset(data_path=opt.data_path)
    loader_train = DataLoader(dataset=dataset_train,
                              num_workers=4,
                              batch_size=opt.batchSize,
                              shuffle=True)
    print("# of training samples: %d\n" % int(len(dataset_train)))

    # Build model
    net = BRN(recurrent_iter=opt.inter_iter, use_GPU=opt.use_GPU)
    net = nn.DataParallel(net)
    criterion = pytorch_ssim.SSIM()

    # Move to GPU
    model = net.cuda()
    criterion.cuda()

    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=opt.lr)
    scheduler = MultiStepLR(optimizer, milestones=[30, 50, 80],
                            gamma=0.2)  # learning rates

    # training
    writer = SummaryWriter(opt.save_path)
    try:
        step = 0
        # load the last model in matconvnet style
        initial_epoch = findLastCheckpoint(save_dir=opt.save_path)
        if initial_epoch > 0:
            print('resuming by loading epoch %03d' % initial_epoch)
            model.load_state_dict(
                torch.load(
                    os.path.join(opt.save_path,
                                 'net_epoch%d.pth' % initial_epoch)))

        for epoch in range(initial_epoch, opt.epochs):
            # The epoch is passed explicitly so a resumed run (scheduler
            # state is not checkpointed) gets the LR of the current epoch.
            scheduler.step(epoch)
            for param_group in optimizer.param_groups:
                print('learning rate %f' % param_group["lr"])

            # train
            for i, (noisy, clean) in enumerate(loader_train, 0):
                # training step
                model.train()
                # the optimizer holds every model parameter, so this clears
                # all gradients
                optimizer.zero_grad()

                input_train, target_train = noisy.cuda(), clean.cuda()

                out_train, _, _, _ = model(input_train)
                pixel_loss = criterion(target_train, out_train)
                # the criterion is SSIM, which is maximised by minimising
                # its negative
                loss = -pixel_loss

                loss.backward()
                optimizer.step()

                # results: re-evaluate the batch in eval mode for PSNR
                model.eval()
                with torch.no_grad():
                    out_train, _, _, _ = model(input_train)
                    out_train = torch.clamp(out_train, 0., 1.)
                    psnr_train = batch_PSNR(out_train, target_train, 1.)
                print("[epoch %d][%d/%d] loss: %.4f, PSNR_train: %.4f" %
                      (epoch + 1, i + 1, len(loader_train), loss.item(),
                       psnr_train))

                # if you are using older version of PyTorch, you may need to
                # change loss.item() to loss.data[0]
                if step % 10 == 0:
                    # Log the scalar values
                    writer.add_scalar('loss', loss.item(), step)
                    writer.add_scalar('PSNR on training data', psnr_train,
                                      step)
                step += 1

            ## the end of each epoch
            model.eval()
            with torch.no_grad():
                # log the images of the last training batch
                out_train, _, _, _ = model(input_train)
                out_train = torch.clamp(out_train, 0., 1.)
            Img = utils.make_grid(target_train.data,
                                  nrow=8,
                                  normalize=True,
                                  scale_each=True)
            Imgn = utils.make_grid(input_train.data,
                                   nrow=8,
                                   normalize=True,
                                   scale_each=True)
            Irecon = utils.make_grid(out_train.data,
                                     nrow=8,
                                     normalize=True,
                                     scale_each=True)
            writer.add_image('clean image', Img, epoch)
            writer.add_image('noisy image', Imgn, epoch)
            writer.add_image('reconstructed image', Irecon, epoch)

            # save model
            torch.save(model.state_dict(),
                       os.path.join(opt.save_path, 'net_latest.pth'))
            if epoch % opt.save_freq == 0:
                torch.save(
                    model.state_dict(),
                    os.path.join(opt.save_path,
                                 'net_epoch%d.pth' % (epoch + 1)))
    finally:
        # Close the writer so buffered events are written out even when
        # training is interrupted.
        writer.close()
def _history_json_path(model_path):
    """Return the learning-history JSON path that accompanies ``model_path``.

    A trailing ``.pt`` is swapped for ``.json``; any other name gets
    ``.json`` appended so the history can never overwrite the model file.
    """
    suffix = '.pt'
    if model_path.endswith(suffix):
        return model_path[:-len(suffix)] + '.json'
    return model_path + '.json'


def main():
    """Train a stochastic MLP with a contrastive loss on the AUSLAN data.

    Parses the command line, builds the data loaders, model, optimizer and
    LR scheduler, then trains for ``args.epoch`` epochs. When
    ``args.criterion == 'loss'`` the stochastic and deterministic validation
    losses each drive their own early stopping; otherwise the average
    training objective is recorded and the final model is saved. The
    learning history is written as JSON next to the saved model.

    Raises:
        ValueError: If ``args.optim`` is not ``adam``, ``sgd`` or
            ``rmsprop``.
    """
    logger = get_logger()
    parser = common_parser(pac_bayes=True)
    args = parser.parse_args()
    check_args(args, pac_bayes=True)

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    is_criterion_val_loss = args.criterion == 'loss'

    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    rnd = np.random.RandomState(args.seed)

    iid = not args.non_iid
    device = torch.device('cuda' if use_cuda else 'cpu')
    contrastive_loss = ContrastiveLoss(loss_name=args.loss, device=device)

    # `num_blocks_per_class` is ignored due to non-iid setting.
    train_loader, val_loader = get_contrastive_data_loaders(
        rnd=rnd,
        data_name='auslan',
        validation_ratio=args.validation_ratio,
        mini_batch_size=args.batch_size,
        num_blocks_per_class=45 * 24,  # this value is ignored when iid is False.
        block_size=args.block_size,
        neg_size=args.neg_size,
        root=args.root,
        include_test=False,
        iid=iid)

    num_training_samples = len(train_loader.dataset)
    if val_loader is None:
        num_val_samples = 0
    else:
        num_val_samples = len(val_loader.dataset)
        if args.criterion == 'pb':
            # `Logger.warn` is a deprecated alias of `warning`.
            logger.warning(
                'You can pass 0. to `validation-ratio` argument. It could make performance better.'
            )

    logger.info('# training samples: {} # val samples: {}\n'.format(
        num_training_samples, num_val_samples))
    logger.info(
        'PAC-Bayes parameters: b: {}, c: {}, δ: {}, prior log std: {}\n'.
        format(args.b, args.c, args.delta, args.prior_log_std))

    model = StochasticMLP(
        num_training_samples=num_training_samples,
        rnd=rnd,
        num_last_units=args.dim_h,
        catoni_lambda=args.catoni_lambda,
        b=args.b,
        c=args.c,
        delta=args.delta,
        prior_log_std=args.prior_log_std,
    ).to(device)

    optimizer_name = args.optim.lower()
    if optimizer_name == 'adam':
        optimizer = optim.Adam(params=model.parameters(), lr=args.lr)
    elif optimizer_name == 'sgd':
        optimizer = optim.SGD(params=model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum)
    elif optimizer_name == 'rmsprop':
        optimizer = optim.RMSprop(params=model.parameters(), lr=args.lr)
    else:
        raise ValueError(
            'Optimizer must be adam, sgd, or rmsprop. Not {}'.format(
                optimizer_name))
    logger.info('optimizer: {}\n'.format(optimizer_name))

    scheduler = MultiStepLR(optimizer,
                            milestones=args.schedule,
                            gamma=args.gamma)

    save_name = 'lr-{}_{}_{}_{}'.format(args.lr, optimizer_name,
                                        args.criterion,
                                        args.output_model_name)
    if is_criterion_val_loss:
        # One early-stopping tracker, history and model file per
        # evaluation mode.
        early_stoppings = {
            'stochastic': EarlyStopping(mode='min', patience=args.patience),
            'deterministic': EarlyStopping(mode='min',
                                           patience=args.patience),
        }
        learning_histories = {
            'stochastic': {
                'val_loss': []
            },
            'deterministic': {
                'val_loss': []
            }
        }
        save_names = {
            'stochastic':
            'lr-{}_{}_{}_stochastic_{}'.format(args.lr, optimizer_name,
                                               args.criterion,
                                               args.output_model_name),
            'deterministic':
            'lr-{}_{}_{}_deterministic_{}'.format(args.lr, optimizer_name,
                                                  args.criterion,
                                                  args.output_model_name),
        }
    else:
        learning_history = {args.criterion: []}

    T = 0. if iid else args.block_size

    for epoch in range(1, args.epoch + 1):
        average_objective = train(args, model, device, train_loader,
                                  optimizer, epoch, contrastive_loss, T,
                                  logger)
        scheduler.step()

        # calculate criterion value for early-stopping
        if is_criterion_val_loss:
            # Finished evaluation modes are collected first and removed
            # after the loop so the dict is not mutated while iterating.
            delete_keys = []
            for eval_type, early_stopping in early_stoppings.items():
                is_deterministic = eval_type == 'deterministic'
                val_loss = validation_loss(args,
                                           model,
                                           device,
                                           val_loader,
                                           contrastive_loss,
                                           logger,
                                           args.num_snn,
                                           deterministic=is_deterministic)
                learning_histories[eval_type]['val_loss'].append(val_loss)

                # check early_stopping
                is_stopped = early_stopping.is_stopped_and_save(
                    val_loss, model, save_name=save_names[eval_type])
                if is_stopped:
                    delete_keys.append(eval_type)
                    learning_histories[eval_type][
                        'lowest_val_loss'] = early_stopping.best

            for delete_key in delete_keys:
                logger.info('Remove {} evaluation\n'.format(delete_key))
                del early_stoppings[delete_key]

            # if early stopping dict becomes empty, stop the training
            if not early_stoppings:
                break
        else:
            learning_history[args.criterion].append(average_objective)

    # save learning history to json
    if is_criterion_val_loss:
        # store the lowest validation loss of the modes that never stopped
        for eval_type, early_stopping in early_stoppings.items():
            learning_histories[eval_type][
                'lowest_val_loss'] = early_stopping.best

        for eval_type, fname in save_names.items():
            with open(_history_json_path(fname), 'w') as log_file:
                json.dump(learning_histories[eval_type], log_file)
    else:
        torch.save(model.state_dict(), save_name)
        with open(_history_json_path(save_name), 'w') as log_file:
            json.dump(learning_history, log_file)
class ExperimentBuilder(nn.Module): def __init__(self, network_model, experiment_name, num_epochs, train_data, val_data, test_data, use_gpu, continue_from_epoch=-1, scheduler=None, optimiser=None, sched_params=None, optim_params=None, pretrained_weights_locations=None): """ Initializes an ExperimentBuilder object. Such an object takes care of running training and evaluation of a deep net on a given dataset. It also takes care of saving per epoch models and automatically inferring the best val model to be used for evaluating the test set metrics. :param network_model: A pytorch nn.Module which implements a network architecture. :param experiment_name: The name of the experiment. This is used mainly for keeping track of the experiment and creating and directory structure that will be used to save logs, model parameters and other. :param num_epochs: Total number of epochs to run the experiment :param train_data: An object of the DataProvider type. Contains the training set. :param val_data: An object of the DataProvider type. Contains the val set. :param test_data: An object of the DataProvider type. Contains the test set. :param use_gpu: A boolean indicating whether to use a GPU or not. :param continue_from_epoch: An int indicating whether we'll start from scrach (-1) or whether we'll reload a previously saved model of epoch 'continue_from_epoch' and continue training from there. 
""" super(ExperimentBuilder, self).__init__() self.experiment_name = experiment_name self.model = network_model # self.model.reset_parameters() self.device = torch.cuda.current_device() if torch.cuda.device_count() > 1 and use_gpu: self.device = torch.cuda.current_device() self.model.to(self.device) self.model = nn.DataParallel(module=self.model) print('Use Multi GPU', self.device) elif torch.cuda.device_count() == 1 and use_gpu: self.device = torch.cuda.current_device() self.model.to( self.device) # sends the model from the cpu to the gpu print('Use GPU', self.device) else: print("use CPU") self.device = torch.device('cpu') # sets the device to be CPU print(self.device) # re-initialize network parameters self.train_data = train_data self.val_data = val_data self.test_data = test_data if optimiser is None or optimiser == 'Adam': self.optimizer = Adam(self.parameters(), amsgrad=False, weight_decay=optim_params['weight_decay'], lr=sched_params['lr_max']) elif optimiser == 'SGD': self.optimizer = SGD(self.parameters(), lr=sched_params['lr_max'], momentum=optim_params['momentum'], nesterov=optim_params['nesterov'], weight_decay=optim_params['weight_decay']) if scheduler == 'ERF': self.scheduler = ERF(self.optimizer, min_lr=sched_params['lr_min'], alpha=sched_params['erf_alpha'], beta=sched_params['erf_beta'], epochs=num_epochs) elif scheduler == 'Step': self.scheduler = MultiStepLR(self.optimizer, milestones=[30, 60], gamma=0.1) elif scheduler == 'Cos': self.scheduler = optim.lr_scheduler.CosineAnnealingLR( self.optimizer, T_max=num_epochs, eta_min=0.00001) else: self.scheduler = None print('System learnable parameters') num_conv_layers = 0 num_linear_layers = 0 total_num_parameters = 0 for name, value in self.named_parameters(): print(name, value.shape) if all(item in name for item in ['conv', 'weight']): num_conv_layers += 1 if all(item in name for item in ['linear', 'weight']): num_linear_layers += 1 total_num_parameters += np.prod(value.shape) print('Total number 
of parameters', total_num_parameters) print('Total number of conv layers', num_conv_layers) print('Total number of linear layers', num_linear_layers) # Generate the directory names self.experiment_folder = os.path.abspath(experiment_name) self.experiment_logs = os.path.abspath( os.path.join(self.experiment_folder, "result_outputs")) self.experiment_saved_models = os.path.abspath( os.path.join(self.experiment_folder, "saved_models")) print(self.experiment_folder, self.experiment_logs) # Set best models to be at 0 since we are just starting self.best_val_model_idx = 0 self.best_val_model_acc = 0. self.best_train_loss = math.inf if not os.path.exists(self.experiment_folder ): # If experiment directory does not exist os.mkdir(self.experiment_folder) # create the experiment directory if not os.path.exists(self.experiment_logs): os.mkdir( self.experiment_logs) # create the experiment log directory if not os.path.exists(self.experiment_saved_models): os.mkdir(self.experiment_saved_models ) # create the experiment saved models directory self.num_epochs = num_epochs self.criterion = nn.MSELoss().to( self.device) # send the loss computation to the GPU if continue_from_epoch == -2: try: self.best_val_model_idx, self.best_val_model_acc, self.state = self.load_model( model_save_dir=self.experiment_saved_models, model_save_name="train_model", model_idx='latest' ) # reload existing model from epoch and return best val model index # and the best val acc of that model self.starting_epoch = self.state['current_epoch_idx'] except: print( "Model objects cannot be found, initializing a new model and starting from scratch" ) self.starting_epoch = 0 self.state = dict() elif continue_from_epoch != -1: # if continue from epoch is not -1 then self.best_val_model_idx, self.best_val_model_acc, self.state = self.load_model( model_save_dir=self.experiment_saved_models, model_save_name="train_model", model_idx=continue_from_epoch ) # reload existing model from epoch and return best val model 
index # and the best val acc of that model self.starting_epoch = self.state['current_epoch_idx'] else: self.starting_epoch = 0 self.state = dict() if pretrained_weights_locations is not None: self.load_pre_trained_model( model_save_dir=pretrained_weights_locations, model_save_name="train_model", model_idx='best') def load_pre_trained_model(self, model_save_dir, model_save_name, model_idx): state = torch.load(f=os.path.join( model_save_dir, "{}_{}".format(model_save_name, str(model_idx)))) self.load_state_dict(state_dict=state['network']) def get_num_parameters(self): total_num_params = 0 for param in self.parameters(): total_num_params += np.prod(param.shape) return total_num_params def run_train_iter(self, image, image_with_noise): """ Receives the inputs and targets for the model and runs a training iteration. Returns loss and accuracy metrics. :param x: The inputs to the model. A numpy array of shape batch_size, channels, height, width :param y: The targets for the model. A numpy array of shape batch_size, num_classes :return: the loss and accuracy for this batch """ self.train( ) # sets model to training mode (in case batch normalization or other methods have different procedures for training and evaluation) image = image.to(self.device) image_with_noise = image_with_noise.to(self.device) out = self.model.forward( image_with_noise) # forward the data in the model loss = self.criterion(input=out, target=image) # compute loss self.optimizer.zero_grad( ) # set all weight grads from previous training iters to 0 loss.backward( ) # backpropagate to compute gradients for current iter loss self.optimizer.step() # update network parameters _, predicted = torch.max(out.data, 1) # get argmax of predictions # accuracy = np.mean(list(predicted.eq(y.data).cpu())) # compute accuracy for n, p in self.model.named_parameters(): if (p.requires_grad) and ("bias" not in n): if p.abs().max() < 10**(-30): raise Exception('Weights smaller than 10e-30') return 
loss.data.detach().cpu().numpy(), 0 def run_evaluation_iter(self, x, y): """ Receives the inputs and targets for the model and runs an evaluation iterations. Returns loss and accuracy metrics. :param x: The inputs to the model. A numpy array of shape batch_size, channels, height, width :param y: The targets for the model. A numpy array of shape batch_size, num_classes :return: the loss and accuracy for this batch """ self.eval() # sets the system to validation mode if len(y.shape) > 1: y = np.argmax( y, axis=1 ) # convert one hot encoded labels to single integer labels if type(x) is np.ndarray: x, y = torch.Tensor(x).float( ).to(device=self.device), torch.Tensor(y).long().to( device=self.device ) # convert data to pytorch tensors and send to the computation device x = x.to(self.device) y = y.to(self.device) out = self.model.forward(x) # forward the data in the model loss = F.cross_entropy(out, y) # compute loss _, predicted = torch.max(out.data, 1) # get argmax of predictions accuracy = np.mean(list(predicted.eq( y.data).cpu())) # compute accuracy y_cpu = y.data.cpu() predicted_cpu = predicted.cpu() f1 = f1_score(y_cpu, predicted_cpu, average='macro') precision = precision_score(y_cpu, predicted_cpu, average='macro') recall = recall_score(y_cpu, predicted_cpu, average='macro') return loss.data.detach().cpu().numpy( ), accuracy, f1, precision, recall def save_model(self, model_save_dir, model_save_name, model_idx, state): """ Save the network parameter state and current best val epoch idx and best val accuracy. :param model_save_name: Name to use to save model without the epoch index :param model_idx: The index to save the model with. :param best_validation_model_idx: The index of the best validation model to be stored for future use. :param best_validation_model_acc: The best validation accuracy to be stored for use at test time. :param model_save_dir: The directory to store the state at. :param state: The dictionary containing the system state. 
""" state['network'] = self.state_dict( ) # save network parameter and other variables. torch.save( state, f=os.path.join(model_save_dir, "{}_{}".format( model_save_name, str(model_idx)))) # save state at prespecified filepath def run_training_epoch(self, current_epoch_losses): with tqdm.tqdm(total=len(self.train_data), file=sys.stdout ) as pbar_train: # create a progress bar for training for idx, (image, image_with_noise) in enumerate( self.train_data): # get data batches loss, accuracy = self.run_train_iter( image=image, image_with_noise=image_with_noise ) # take a training iter step current_epoch_losses["train_loss"].append( loss) # add current iter loss to the train loss list current_epoch_losses["train_acc"].append( accuracy) # add current iter acc to the train acc list pbar_train.update(1) pbar_train.set_description( "loss: {:.4f}, accuracy: {:.4f}".format(loss, accuracy)) return current_epoch_losses def run_validation_epoch(self, current_epoch_losses): with tqdm.tqdm(total=len(self.val_data), file=sys.stdout ) as pbar_val: # create a progress bar for validation for x, y in self.val_data: # get data batches loss, accuracy, f1, precision, recall = self.run_evaluation_iter( x=x, y=y) # run a validation iter current_epoch_losses["val_loss"].append( loss) # add current iter loss to val loss list. current_epoch_losses["val_acc"].append( accuracy) # add current iter acc to val acc lst. 
current_epoch_losses["val_f1"].append(f1) current_epoch_losses["val_precision"].append(precision) current_epoch_losses["val_recall"].append(recall) pbar_val.update(1) # add 1 step to the progress bar pbar_val.set_description( "loss: {:.4f}, accuracy: {:.4f}".format(loss, accuracy)) return current_epoch_losses def run_testing_epoch(self, current_epoch_losses): with tqdm.tqdm(total=len(self.test_data), file=sys.stdout) as pbar_test: # ini a progress bar for x, y in self.test_data: # sample batch # compute loss and accuracy by running an evaluation step loss, accuracy, f1, precision, recall = self.run_evaluation_iter( x=x, y=y) current_epoch_losses["test_loss"].append( loss) # save test loss current_epoch_losses["test_acc"].append( accuracy) # save test accuracy current_epoch_losses["test_f1"].append(f1) current_epoch_losses["test_precision"].append(precision) current_epoch_losses["test_recall"].append(recall) pbar_test.update(1) # update progress bar status pbar_test.set_description( "loss: {:.4f}, accuracy: {:.4f}".format( loss, accuracy)) # update progress bar string output return current_epoch_losses def load_model(self, model_save_dir, model_save_name, model_idx): """ Load the network parameter state and the best val model idx and best val acc to be compared with the future val accuracies, in order to choose the best val model :param model_save_dir: The directory to store the state at. :param model_save_name: Name to use to save model without the epoch index :param model_idx: The index to save the model with. 
:return: best val idx and best val model acc, also it loads the network state into the system state without returning it """ state = torch.load(f=os.path.join( model_save_dir, "{}_{}".format(model_save_name, str(model_idx)))) self.load_state_dict(state_dict=state['network']) return state['best_val_model_idx'], state['best_val_model_acc'], state def run_experiment(self): """ Runs experiment train and evaluation iterations, saving the model and best val model and val model accuracy after each epoch :return: The summary current_epoch_losses from starting epoch to total_epochs. """ total_losses = { "train_acc": [], "train_loss": [], "val_acc": [], "val_loss": [], "val_f1": [], "val_precision": [], "val_recall": [], "curr_epoch": [] } # initialize a dict to keep the per-epoch metrics for i, epoch_idx in enumerate( range(self.starting_epoch, self.num_epochs)): epoch_start_time = time.time() current_epoch_losses = { "train_acc": [], "train_loss": [], "val_acc": [], "val_loss": [], "val_f1": [], "val_precision": [], "val_recall": [] } current_epoch_losses = self.run_training_epoch( current_epoch_losses) if self.scheduler is not None: self.scheduler.step() train_loss_average = np.mean(current_epoch_losses['train_loss']) if train_loss_average < self.best_train_loss: print(f'Saving Best Model') self.best_train_loss = train_loss_average self.save_model( model_save_dir=self.experiment_saved_models, # save model and best val idx and best val acc, using the model dir, model name and model idx model_save_name="train_model", model_idx='best', state=self.state) for key, value in current_epoch_losses.items(): total_losses[key].append(np.mean(value)) # get mean of all metrics of current epoch metrics dict, # to get them ready for storage and output on the terminal. 
total_losses['curr_epoch'].append(epoch_idx) save_statistics(experiment_log_dir=self.experiment_logs, filename='summary.csv', stats_dict=total_losses, current_epoch=i, continue_from_mode=True if (self.starting_epoch != 0 or i > 0) else False) # save statistics to stats file. # load_statistics(experiment_log_dir=self.experiment_logs, filename='summary.csv') # How to load a csv file if you need to out_string = "_".join([ "{}_{:.4f}".format(key, np.mean(value)) for key, value in current_epoch_losses.items() ]) # create a string to use to report our epoch metrics epoch_elapsed_time = time.time( ) - epoch_start_time # calculate time taken for epoch epoch_elapsed_time = "{:.4f}".format(epoch_elapsed_time) print("Epoch {}:".format(epoch_idx), out_string, "epoch time", epoch_elapsed_time, "seconds") self.state['current_epoch_idx'] = epoch_idx return total_losses
ax1.plot(np.arange(epoch + 1), loss[0], '-y', label='ste-model loss') ax1.plot(np.arange(epoch + 1), loss[1], '-r', label='discriminator loss') ax2.plot(np.arange(epoch + 1), acc[0], '-g', label='real_acc') ax2.plot(np.arange(epoch + 1), acc[1], '-b', label='wm_acc') ax1.set_xlabel('Epoch(' + ",".join(str(l) for l in args.hyper_parameters) + ')') ax1.set_ylabel('Train Loss') ax2.set_ylabel('Accuracy (%)') ax1.set_ylim(0, 5) ax2.set_ylim(0, 100) ax1.legend(loc=1) ax2.legend(loc=2) if train: plt.savefig(args.save_path + 'results_train_' + GPU + '.png') else: plt.savefig(args.save_path + 'results_test_' + GPU + '.png') plt.close() for epoch in range(args.num_epochs): train(epoch) val_hloss, val_disloss, val_dnnloss, acc, wm_acc, wm_inut_acc = test(epoch) schedulerH.step(val_hloss) schedulerD.step(val_disloss) schedulerN.step() print(acc, wm_acc, wm_inut_acc)
def main(args: argparse.Namespace):
    """Train or evaluate a keypoint-detection model on a source/target pair.

    Builds train/test loaders for the datasets named by ``args.source`` and
    ``args.target``, creates the model named by ``args.arch`` and an Adam
    optimizer driven by a ``MultiStepLR`` schedule, optionally restores a
    checkpoint from ``args.resume``, and then either

    * evaluates once on both test splits when ``args.phase == 'test'``, or
    * trains for ``args.epochs`` epochs, writing a checkpoint after every
      epoch and copying the one with the best target accuracy to ``'best'``.

    Args:
        args: Parsed command-line options. The attributes read here are
            ``log``, ``phase``, ``seed``, ``rotation``, ``image_size``,
            ``resize_scale``, ``heatmap_size``, ``source``, ``source_root``,
            ``target``, ``target_root``, ``batch_size``, ``workers``,
            ``arch``, ``lr``, ``lr_step``, ``lr_factor``, ``resume``,
            ``epochs`` and ``debug``.
    """
    logger = CompleteLogger(args.log, args.phase)
    print(args)

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    cudnn.benchmark = True

    # Data loading code
    normalize = T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    train_transform = T.Compose([
        T.RandomRotation(args.rotation),
        T.RandomResizedCrop(size=args.image_size, scale=args.resize_scale),
        T.ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25),
        T.GaussianBlur(),
        T.ToTensor(),
        normalize
    ])
    val_transform = T.Compose([
        T.Resize(args.image_size),
        T.ToTensor(),
        normalize
    ])
    image_size = (args.image_size, args.image_size)
    heatmap_size = (args.heatmap_size, args.heatmap_size)

    source_dataset = datasets.__dict__[args.source]
    train_source_dataset = source_dataset(root=args.source_root,
                                          transforms=train_transform,
                                          image_size=image_size,
                                          heatmap_size=heatmap_size)
    train_source_loader = DataLoader(train_source_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.workers,
                                     pin_memory=True,
                                     drop_last=True)
    val_source_dataset = source_dataset(root=args.source_root,
                                        split='test',
                                        transforms=val_transform,
                                        image_size=image_size,
                                        heatmap_size=heatmap_size)
    val_source_loader = DataLoader(val_source_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=False,
                                   pin_memory=True)

    target_dataset = datasets.__dict__[args.target]
    train_target_dataset = target_dataset(root=args.target_root,
                                          transforms=train_transform,
                                          image_size=image_size,
                                          heatmap_size=heatmap_size)
    train_target_loader = DataLoader(train_target_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.workers,
                                     pin_memory=True,
                                     drop_last=True)
    val_target_dataset = target_dataset(root=args.target_root,
                                        split='test',
                                        transforms=val_transform,
                                        image_size=image_size,
                                        heatmap_size=heatmap_size)
    val_target_loader = DataLoader(val_target_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=False,
                                   pin_memory=True)

    print("Source train:", len(train_source_loader))
    print("Target train:", len(train_target_loader))
    print("Source test:", len(val_source_loader))
    print("Target test:", len(val_target_loader))

    train_source_iter = ForeverDataIterator(train_source_loader)
    train_target_iter = ForeverDataIterator(train_target_loader)

    # create model
    model = models.__dict__[args.arch](
        num_keypoints=train_source_dataset.num_keypoints).to(device)
    criterion = JointsMSELoss()

    # define optimizer and lr scheduler
    optimizer = Adam(model.get_parameters(lr=args.lr))
    lr_scheduler = MultiStepLR(optimizer, args.lr_step, args.lr_factor)

    # optionally resume from a checkpoint
    start_epoch = 0
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        start_epoch = checkpoint['epoch'] + 1

    # define visualization function
    tensor_to_image = Compose([
        Denormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ToPILImage()
    ])

    def visualize(image, keypoint2d, name):
        """
        Args:
            image (tensor): image in shape 3 x H x W
            keypoint2d (tensor): keypoints in shape K x 2
            name: name of the saving image
        """
        train_source_dataset.visualize(
            tensor_to_image(image), keypoint2d,
            logger.get_image_path("{}.jpg".format(name)))

    if args.phase == 'test':
        # evaluate on validation set
        source_val_acc = validate(val_source_loader, model, criterion, None,
                                  args)
        target_val_acc = validate(val_target_loader, model, criterion,
                                  visualize, args)
        print("Source: {:4.3f} Target: {:4.3f}".format(
            source_val_acc['all'], target_val_acc['all']))
        for name, acc in target_val_acc.items():
            print("{}: {:4.3f}".format(name, acc))
        return

    # start training
    best_acc = 0
    for epoch in range(start_epoch, args.epochs):
        logger.set_epoch(epoch)

        # train for one epoch
        train(train_source_iter, train_target_iter, model, criterion,
              optimizer, epoch, visualize if args.debug else None, args)

        # Advance the schedule only after this epoch's optimizer updates.
        # Stepping the scheduler first skips the first value of the
        # schedule, so every milestone would fire one epoch too early.
        # The state saved below is then the one the next epoch (and a
        # resumed run starting at ``epoch + 1``) must use.
        lr_scheduler.step()

        # evaluate on validation set
        source_val_acc = validate(val_source_loader, model, criterion, None,
                                  args)
        target_val_acc = validate(val_target_loader, model, criterion,
                                  visualize if args.debug else None, args)

        # remember best acc and save checkpoint
        torch.save(
            {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch,
                'args': args
            }, logger.get_checkpoint_path(epoch))
        if target_val_acc['all'] > best_acc:
            shutil.copy(logger.get_checkpoint_path(epoch),
                        logger.get_checkpoint_path('best'))
            best_acc = target_val_acc['all']
        print("Source: {:4.3f} Target: {:4.3f} Target(best): {:4.3f}".format(
            source_val_acc['all'], target_val_acc['all'], best_acc))
        for name, acc in target_val_acc.items():
            print("{}: {:4.3f}".format(name, acc))

    logger.close()
def main_worker(args):
    """Train (or only evaluate) an ImageFolder classifier under DataParallel.

    Creates the model named by ``args.arch``, an SGD optimizer and a
    ``MultiStepLR`` schedule, builds the ``train``/``val`` ImageFolder
    loaders under ``args.data``, and then runs the epoch loop, tracking the
    best top-1 accuracy in the module-level ``best_acc1`` /
    ``best_acc1_index`` and saving a checkpoint after every epoch.

    Args:
        args: Parsed options. ``args.outpath`` is rewritten in place to
            ``<outpath>_<arch>`` before any output is produced.

    Returns:
        ``0`` when ``args.evaluate`` is set (validation only), otherwise
        ``None`` after training finishes.

    Raises:
        ValueError: If ``args.lr_scheduler`` is not ``'steplr'``.
    """
    global best_acc1
    global best_acc1_index

    args.outpath = args.outpath + '_' + args.arch
    output_process(args.outpath)
    write_settings(args)
    logger = get_logger(args.outpath, 'DataParallel')
    writer = SummaryWriter(args.outpath)
    logger.info(args)

    # create model
    if args.pretrained:
        logger.info("=> using pre-trained model: {}".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        logger.info('=> creating model: {}'.format(args.arch))
        model = models.__dict__[args.arch]()
    model = nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    if args.lr_scheduler == 'steplr':
        lr_scheduler = MultiStepLR(optimizer, milestones=args.step,
                                   gamma=args.gamma)
        logger.info('lr_scheduler: SGD MultiStepLR !!!')
    else:
        # An ``assert`` here would vanish under ``python -O`` and leave
        # ``lr_scheduler`` undefined; fail explicitly with a real message.
        message = "invalid lr_scheduler={}".format(args.lr_scheduler)
        logger.error(message)
        raise ValueError(message)

    # dataloader
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)

    val_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]))
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args, logger, writer, epoch=-1)
        return 0

    total_start = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        epoch_start = time.time()

        # NOTE(review): the explicit-epoch form is deprecated in recent
        # PyTorch releases, but it is what sets the correct learning rate
        # when the loop starts at a non-zero ``args.start_epoch``; kept
        # as-is so the per-epoch learning rates do not change.
        lr_scheduler.step(epoch)

        # train for every epoch
        train(train_loader, model, criterion, optimizer, epoch, args, logger,
              writer)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args, logger, writer,
                        epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        if is_best:
            best_acc1_index = epoch
            best_acc1 = acc1

        epoch_end = time.time()
        logger.info('||==> Epoch=[{:d}/{:d}]\tbest_acc1={:.4f}\tbest_acc1_index={}\ttime_cost={:.4f}s'
                    .format(epoch, args.epochs, best_acc1, best_acc1_index,
                            epoch_end - epoch_start))

        # save model; ``model.module`` unwraps the DataParallel container
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.module.state_dict(),
                'best_acc1': best_acc1,
            }, is_best, args.outpath)

    total_end = time.time()
    logger.info('||==> total_time_cost={:.4f}s'.format(total_end - total_start))
    writer.close()
def main():
    """Train an image classifier with optional AugMix consistency loss.

    Reads options from the module-level ``parser``, builds an AugMix-wrapped
    dataset from the ``train`` image folder concatenated with ``Val_Dataset``,
    trains the model selected by ``args.model_name`` with SGD, saves the
    weights after every epoch to ``<savedir>/<name>_e<epoch>.pt`` and logs
    train accuracy and summed loss to TensorBoard.

    Raises:
        ValueError: If ``args.model_name`` is not one of the supported names.
    """
    args = parser.parse_args()
    model_name = args.name
    writer = SummaryWriter(pathlib.Path(args.eventdir) / model_name)

    # Create a pytorch dataset
    data_dir = pathlib.Path(args.datadir)
    image_count = len(list(data_dir.glob('**/*.JPEG')))
    print('Discovered {} images'.format(image_count))

    # Create the training data generator
    batch_size = args.batch_size
    num_epochs = args.epochs
    mean = [0.4802, 0.4481, 0.3975]
    std = [0.2296, 0.2263, 0.2255]

    train_transform = transforms.Compose(
        [transforms.RandomHorizontalFlip()])
    # Tensor conversion and normalisation are handed to AugMixDataset
    # separately from the flip transform given to the underlying datasets.
    preprocess = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize(mean, std)])

    train_set = torchvision.datasets.ImageFolder(data_dir / 'train',
                                                 train_transform)
    val_set = Val_Dataset(train_transform, train_set.class_to_idx)
    full_train_set = torch.utils.data.ConcatDataset([train_set, val_set])
    am_train_set = AugMixDataset(full_train_set, preprocess,
                                 augmentations.augmentations_all)
    train_loader = torch.utils.data.DataLoader(am_train_set,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=4,
                                               pin_memory=True)

    # Create the model
    if args.model_name == 'vgg16_slim':
        model = vgg_slim.vgg16_slim().cuda()
    elif args.model_name == 'vgg16':
        model = vgg_slim.vgg16().cuda()
    elif args.model_name == 'efficientnet-b0':
        model = EfficientNet.from_name('efficientnet-b0').cuda()
    elif args.model_name == 'resnet-18':
        model = torch.hub.load('pytorch/vision:v0.6.0', 'resnet18',
                               pretrained=False).cuda()
    else:
        # Fail here with a clear message instead of crashing later on
        # ``None.parameters()``.
        raise ValueError(
            "Unknown model name: {}".format(args.model_name))

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # TODO: put the right milestones in here.
    # Epochs at which the LR decays by a factor of 0.1 (MultiStepLR default
    # gamma); an empty list keeps the learning rate constant.
    epoch_milestones = []
    scheduler = MultiStepLR(optimizer, epoch_milestones)
    print(get_n_params(model))

    criterion = nn.CrossEntropyLoss()
    for i in range(num_epochs):
        model.train()
        train_total, train_correct = 0, 0
        running_loss = 0.0
        for idx, (inputs, targets) in enumerate(train_loader):
            # ``inputs`` is indexed as a sequence whose first element is the
            # clean batch; its batch size is used to split the stacked
            # logits back apart below.
            split_s = inputs[0].size(0)
            if args.augmix_off:
                inputs = inputs[0].cuda()
            else:
                inputs = torch.cat(inputs, 0).cuda()
            targets = targets.cuda()

            optimizer.zero_grad()
            outputs = model(inputs)
            if args.augmix_off:
                logits_clean = outputs
            else:
                logits_clean, logits_aug1, logits_aug2 = torch.split(
                    outputs, split_s)

            loss = criterion(logits_clean, targets)
            if not args.augmix_off:
                # Consistency term: mean KL divergence between each of the
                # three predictive distributions and their clamped mixture.
                p_clean = F.softmax(logits_clean, dim=1)
                p_aug1 = F.softmax(logits_aug1, dim=1)
                p_aug2 = F.softmax(logits_aug2, dim=1)
                p_mixture = torch.clamp(
                    (p_clean + p_aug1 + p_aug2) / 3., 1e-7, 1).log()
                loss += 12 * (
                    F.kl_div(p_mixture, p_clean, reduction='batchmean') +
                    F.kl_div(p_mixture, p_aug1, reduction='batchmean') +
                    F.kl_div(p_mixture, p_aug2, reduction='batchmean')) / 3

            running_loss += loss.item()
            loss.backward()
            optimizer.step()

            _, predicted = logits_clean.max(1)
            train_total += targets.size(0)
            train_correct += predicted.eq(targets).sum().item()
            print("\r", end='')
            print(f'training {100 * idx / len(train_loader):.2f}%: {train_correct / train_total:.3f}', end='')

        torch.save({
            'net': model.state_dict(),
        }, args.savedir + '/' + model_name + '_e' + str(i) + '.pt')
        writer.add_scalar('Train Accuracy',
                          float(train_correct) / float(train_total), i)
        writer.add_scalar('Train Loss', running_loss, i)
        scheduler.step()

    writer.close()
print('Validation loss of last epoch: %f' % (Validation_loss[-1])) recon_loss_sum, kl_loss_sum = 0, 0 qsms = (qsms.to(device, dtype=torch.float) + trans) * scale masks = masks.to(device, dtype=torch.float) qsms = qsms * masks recon_loss, kl_loss = vae_train(model=vae3d, optimizer=optimizer, x=qsms, mask=masks) recon_loss_sum += recon_loss kl_loss_sum += kl_loss gen_iterations += 1 time.sleep(1) scheduler.step(epoch) # validation phase vae3d.eval() loss_total = 0 idx = 0 with torch.no_grad(): # to solve memory exploration issue for idx, (rdfs, masks, weights, qsms) in enumerate(valLoader): idx += 1 qsms = (qsms.to(device, dtype=torch.float) + trans) * scale masks = masks.to(device, dtype=torch.float) qsms = qsms * masks x_mu, x_var, z_mu, z_logvar = vae3d(qsms) x_factor = torch.prod(torch.tensor(x_mu.size())) z_factor = torch.prod(torch.tensor(z_mu.size()))
def main_worker(gpu, ngpus_per_node, cfg):
    """Train a backbone + caption decoder on the COCO caption dataset.

    Optionally initialises the distributed process group, builds the
    augmentation pipeline selected by ``cfg.task``, the backbone selected by
    ``cfg.model['net']`` together with a linear/batch-norm head and a
    ``DecoderRNN``, then calls ``train_ic`` once per epoch and steps a
    ``MultiStepLR`` schedule (milestones 60/120/160, gamma 0.1).

    Args:
        gpu: Unused here; kept for the spawn-style worker signature.
        ngpus_per_node: Unused here; kept for the same reason.
        cfg: Configuration object. ``cfg.rank`` is overwritten from the
            ``RANK`` environment variable when ``cfg.dist_url == "env://"``
            and ``cfg.rank == -1``.

    Raises:
        ValueError: If ``cfg.task`` or ``cfg.model['net']`` is not one of
            the supported values.
    """
    if cfg.gpu is not None:
        print("Use GPU: {} for training".format(cfg.gpu))

    if cfg.distributed:
        print('init distributing process')
        if cfg.dist_url == "env://" and cfg.rank == -1:
            cfg.rank = int(os.environ["RANK"])
        dist.init_process_group(backend=cfg.dist_backend,
                                init_method=cfg.dist_url,
                                world_size=cfg.world_size,
                                rank=cfg.rank)

    # Data
    print('==> Preparing data..')

    # Load vocabulary wrapper for image caption.
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only point cfg.vocab_path at a trusted vocabulary file.
    with open(cfg.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Image preprocessing, normalization for the pretrained resnet
    if cfg.task == 'cifar_cls':
        # cifar cls: random crop with padding
        transform = transforms.Compose([
            transforms.RandomCrop(cfg.crop_size, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225))
        ])
    elif cfg.task in ('imagenet_cls', 'coco_det'):
        # imagenet cls and coco det share one pipeline.
        # same as MoCo v1's aug: the same as InstDisc
        # https://arxiv.org/abs/1805.01978
        transform = transforms.Compose([
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225))
        ])
    else:
        # Without this branch ``transform`` stays undefined and the failure
        # surfaces later as an unrelated NameError.
        raise ValueError("Unsupported cfg.task: {!r}".format(cfg.task))

    # COCO caption dataset
    coco = CocoDataset(root=cfg.image_dir,
                       json=cfg.caption_path,
                       vocab=vocab,
                       transform=transform)

    # Build data loader for image caption training
    if cfg.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(coco)
    else:
        train_sampler = None

    # Data loader for COCO dataset
    # This will return (images, captions, lengths) for each iteration.
    # images: a tensor of shape (batch_size, 3, 224, 224).
    # captions: a tensor of shape (batch_size, padded_length).
    # lengths: a list indicating valid length for each caption. length is (batch_size).
    data_loader = torch.utils.data.DataLoader(
        dataset=coco,
        batch_size=cfg.batch_size,
        shuffle=(train_sampler is None),
        num_workers=cfg.num_workers,
        collate_fn=collate_fn,
        pin_memory=True,
        sampler=train_sampler)

    # Build the Decoder models
    decoder = DecoderRNN(cfg.model['embed_size'], cfg.model['hidden_size'],
                         len(vocab), cfg.model['num_layers'])

    # Backbone plus the input width of the linear head that follows it.
    net_name = cfg.model['net']
    if net_name == 'densenet121':
        feature_dim = 1024
        net = DenseNet121()
    elif net_name == 'densenet169':
        feature_dim = 4096
        net = DenseNet169()
    elif net_name == 'resnet34':
        feature_dim = 512
        net = ResNet34()
    elif net_name == 'resnet50':
        feature_dim = 2048
        net = ResNet50()
    elif net_name == 'resnet101':
        feature_dim = 2048
        net = ResNet101()
    else:
        raise ValueError(
            "Unsupported cfg.model['net']: {!r}".format(net_name))
    linear_ic = nn.Linear(feature_dim, 256)
    bn_ic = nn.BatchNorm1d(256, momentum=0.01)

    print('cfg.distributed:', cfg.distributed)
    if cfg.distributed:
        linear_ic.cuda()
        bn_ic.cuda()
        net.cuda()
        decoder.cuda()
        # DistributedDataParallel will divide and allocate batch_size to all
        # available GPUs if device_ids are not set
        linear_ic = torch.nn.parallel.DistributedDataParallel(linear_ic)
        bn_ic = torch.nn.parallel.DistributedDataParallel(bn_ic)
        net = torch.nn.parallel.DistributedDataParallel(net)
        decoder = torch.nn.parallel.DistributedDataParallel(decoder)
    else:
        # NOTE(review): ``device`` is not defined in this function;
        # presumably a module-level global - confirm.
        torch.cuda.set_device(device)
        linear_ic.cuda(cfg.gpu)
        bn_ic.cuda(cfg.gpu)
        net.cuda(cfg.gpu)
        decoder.cuda(cfg.gpu)

    criterion = nn.CrossEntropyLoss()

    # One optimizer over backbone, head and decoder parameters
    optimizer_ic = optim.Adam(
        list(net.parameters()) + list(linear_ic.parameters()) +
        list(decoder.parameters()) + list(bn_ic.parameters()),
        lr=cfg.lr)
    scheduler = MultiStepLR(optimizer_ic,
                            milestones=[60, 120, 160],
                            gamma=0.1)

    if cfg.loading:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        checkpoint = torch.load(cfg.checkpoint)
        net.load_state_dict(checkpoint)
        # The epoch is parsed from the second '-'-separated field of the
        # checkpoint file name.
        start_epoch = int(cfg.checkpoint.split('/')[-1].split('-')[1])
    else:
        start_epoch = 0

    # Replay the schedule up to the resumed epoch so the milestones stay
    # aligned with the global epoch counter instead of restarting at 0.
    # PyTorch may warn that the scheduler stepped before the optimizer;
    # that is expected for this fast-forward.
    for _ in range(start_epoch):
        scheduler.step()

    log_dir = 'log/' + cfg.config.split('/')[1][:-3]
    os.makedirs(log_dir, exist_ok=True)
    writer = SummaryWriter(log_dir=log_dir)

    # start training
    for epoch in range(start_epoch, cfg.num_epochs):
        if cfg.distributed:
            # Reseed the sampler so each epoch gets a different shuffle.
            train_sampler.set_epoch(epoch)
        net = train_ic(epoch,
                       cfg,
                       net=net,
                       decoder=decoder,
                       linear=linear_ic,
                       bn=bn_ic,
                       optimizer_ic=optimizer_ic,
                       criterion=criterion,
                       data_loader=data_loader,
                       writer=writer)
        scheduler.step()
def train(model, train_dataset, test_dataset=None, model_dir='models',
          lr=1e-04, lr_decay=.1, lr_decay_epochs=None, weight_decay=1e-04,
          gamma1=1., gamma2=1., gamma3=10.,
          batch_size=32, test_size=256, epochs=5,
          eval_log_interval=30, loss_log_interval=30,
          weight_log_interval=500, checkpoint_interval=500,
          resume_best=False, resume_latest=False, cuda=False):
    """Train ``model`` with cross entropy plus its split regularisers.

    Each iteration minimises ``cross_entropy + gamma1 * overlap +
    gamma3 * uniform + gamma2 * split``, where the three regularisation
    terms come from ``model.reg_loss()``. Precision, losses, weights and
    split indicators are sent to visdom at the configured iteration
    intervals, and a checkpoint is written every ``checkpoint_interval``
    iterations.

    Args:
        model: Network to train; must expose ``reg_loss()``, ``name``,
            ``residual_block_groups``, ``split_sizes`` and ``fc``.
        train_dataset: Dataset used for training.
        test_dataset: Dataset used for evaluation; when it is falsy the
            training dataset is used instead.
        model_dir: Directory used for loading and saving checkpoints.
        lr: Initial Adam learning rate.
        lr_decay: Multiplicative LR decay applied at each milestone.
        lr_decay_epochs: Epoch milestones for the decay; ``None`` means no
            decay.
        weight_decay: Adam weight decay.
        gamma1: Weight of the overlap loss.
        gamma2: Weight of the split loss.
        gamma3: Weight of the uniform loss.
        batch_size: Training batch size.
        test_size: Passed to ``utils.validate`` as ``test_size``.
        epochs: Index of the last epoch to run (epochs are 1-based).
        eval_log_interval: Iterations between precision reports.
        loss_log_interval: Iterations between loss reports.
        weight_log_interval: Iterations between weight/indicator dumps.
        checkpoint_interval: Iterations between checkpoints.
        resume_best: Resume from the best checkpoint in ``model_dir``.
        resume_latest: Resume from the latest checkpoint in ``model_dir``.
        cuda: Move batches to the GPU when true.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr,
                           weight_decay=weight_decay)
    # ``None`` means "no milestones"; pass an explicit empty list rather
    # than relying on how a given PyTorch version treats ``None``.
    milestones = [] if lr_decay_epochs is None else lr_decay_epochs
    scheduler = MultiStepLR(optimizer, milestones, gamma=lr_decay)

    # Evaluate on the training data when no test set was supplied, both for
    # the periodic precision report and for checkpoints.
    eval_dataset = test_dataset or train_dataset

    # prepare the model and statistics.
    model.train()
    epoch_start = 1
    best_precision = 0

    # load checkpoint if needed.
    if resume_latest or resume_best:
        epoch_start, best_precision = utils.load_checkpoint(
            model, model_dir, best=resume_best)

    for epoch in range(epoch_start, epochs + 1):
        # adjust learning rate if needed.
        # NOTE(review): the explicit-epoch form is deprecated in recent
        # PyTorch, but it keeps the LR correct when resuming at
        # ``epoch_start > 1``; left unchanged.
        scheduler.step(epoch - 1)

        # prepare a data stream for the epoch.
        data_loader = utils.get_data_loader(train_dataset, batch_size,
                                            cuda=cuda)
        data_stream = tqdm(enumerate(data_loader, 1))

        for batch_index, (data, labels) in data_stream:
            # where are we?
            data_size = len(data)
            dataset_size = len(data_loader.dataset)
            dataset_batches = len(data_loader)
            iteration = ((epoch - 1) *
                         (len(data_loader.dataset) // batch_size) +
                         batch_index + 1)

            # clear the gradients.
            optimizer.zero_grad()

            # run the network.
            x = data.cuda() if cuda else data
            labels = labels.cuda() if cuda else labels
            scores = model(x)
            _, predicted = scores.max(1)
            # ``.item()`` replaces ``.data[0]``, which raises IndexError on
            # 0-dim tensors in current PyTorch.
            precision = (labels == predicted).sum().item() / data_size

            # update the network.
            cross_entropy_loss = criterion(scores, labels)
            overlap_loss, uniform_loss, split_loss = model.reg_loss()
            overlap_loss *= gamma1
            uniform_loss *= gamma3
            split_loss *= gamma2
            reg_loss = overlap_loss + uniform_loss + split_loss
            total_loss = cross_entropy_loss + reg_loss
            total_loss.backward(retain_graph=True)
            optimizer.step()

            # update & display statistics.
            data_stream.set_description(
                ('epoch: {epoch}/{epochs} | '
                 'it: {iteration} | '
                 'progress: [{trained}/{total}] ({progress:.0f}%) | '
                 'prec: {prec:.3} | '
                 'loss => '
                 'ce: {ce_loss:.4} / '
                 'reg: {reg_loss:.4} / '
                 'total: {total_loss:.4}').format(
                     epoch=epoch,
                     epochs=epochs,
                     iteration=iteration,
                     trained=(batch_index + 1) * batch_size,
                     total=dataset_size,
                     progress=(100. * (batch_index + 1) / dataset_batches),
                     prec=precision,
                     ce_loss=(cross_entropy_loss.item() / data_size),
                     reg_loss=(reg_loss.item() / data_size),
                     total_loss=(total_loss.item() / data_size),
                 ))

            # Send test precision to the visdom server.
            if iteration % eval_log_interval == 0:
                visual.visualize_scalar(
                    utils.validate(model, eval_dataset,
                                   test_size=test_size,
                                   cuda=cuda, verbose=False),
                    'precision', iteration, env=model.name)

            # Send losses to the visdom server.
            if iteration % loss_log_interval == 0:
                reg_losses_and_names = ([
                    overlap_loss.data / data_size,
                    uniform_loss.data / data_size,
                    split_loss.data / data_size,
                    reg_loss.data / data_size,
                ], ['overlap', 'uniform', 'split', 'total'])

                visual.visualize_scalar(
                    overlap_loss.data / data_size,
                    'overlap loss', iteration, env=model.name)
                visual.visualize_scalar(
                    uniform_loss.data / data_size,
                    'uniform loss', iteration, env=model.name)
                visual.visualize_scalar(
                    split_loss.data / data_size,
                    'split loss', iteration, env=model.name)
                visual.visualize_scalars(
                    *reg_losses_and_names,
                    'regulaization losses', iteration, env=model.name)

                model_losses_and_names = ([
                    cross_entropy_loss.data / data_size,
                    reg_loss.data / data_size,
                    total_loss.data / data_size,
                ], ['cross entropy', 'regularization', 'total'])

                visual.visualize_scalar(
                    cross_entropy_loss.data / data_size,
                    'cross entropy loss', iteration, env=model.name)
                visual.visualize_scalar(
                    reg_loss.data / data_size,
                    'regularization loss', iteration, env=model.name)
                visual.visualize_scalars(
                    *model_losses_and_names,
                    'model losses', iteration, env=model.name)

            if iteration % weight_log_interval == 0:
                # Send visualized weights to the visdom server. Only block
                # groups with index above
                # ``len(groups) - (len(split_sizes) - 1)`` are included.
                weights = [
                    (w.data, p, q)
                    for i, g in enumerate(model.residual_block_groups)
                    for b in g.residual_blocks
                    for w, p, q in (
                        (b.w1, b.p(), b.r()),
                        (b.w2, b.r(), b.q()),
                        (b.w3, b.p(), b.q()),
                    )
                    if i + 1 > (len(model.residual_block_groups) -
                                (len(model.split_sizes) - 1)) and
                    w is not None
                ] + [(model.fc.linear.weight.data,
                      model.fc.p(), model.fc.q())]

                names = [
                    'g{i}-b{j}-w{k}'.format(i=i + 1, j=j + 1, k=k + 1)
                    for i, g in enumerate(model.residual_block_groups)
                    for j, b in enumerate(g.residual_blocks)
                    for k, w in enumerate((b.w1, b.w2, b.w3))
                    if i + 1 > (len(model.residual_block_groups) -
                                (len(model.split_sizes) - 1)) and
                    w is not None
                ] + ['fc-w']

                for (w, p, q), name in zip(weights, names):
                    visual.visualize_kernel(
                        splits.block_diagonalize_kernel(w, p, q),
                        name,
                        label='epoch{}-{}'.format(epoch, batch_index + 1),
                        update_window_without_label=True,
                        env=model.name,
                    )

                # Send visualized split indicators to the visdom server.
                indicators = [
                    q.data
                    for i, g in enumerate(model.residual_block_groups)
                    for j, b in enumerate(g.residual_blocks)
                    for k, q in enumerate((b.p(), b.r()))
                    if q is not None
                ] + [model.fc.p().data, model.fc.q().data]

                names = [
                    'g{i}-b{j}-{indicator}'.format(
                        i=i + 1, j=j + 1, indicator=ind)
                    for i, g in enumerate(model.residual_block_groups)
                    for j, b in enumerate(g.residual_blocks)
                    for ind, q in zip(('p', 'r'), (b.p(), b.r()))
                    if q is not None
                ] + ['fc-p', 'fc-q']

                for q, name in zip(indicators, names):
                    # Stretch the split indicators before visualization.
                    q_diagonalized = splits.block_diagonalize_indacator(q)
                    q_diagonalized_expanded = q_diagonalized\
                        .view(*q.size(), 1)\
                        .repeat(1, 20, 1)\
                        .view(-1, q.size()[1])
                    visual.visualize_kernel(
                        q_diagonalized_expanded,
                        name,
                        label='epoch{}-{}'.format(epoch, batch_index + 1),
                        update_window_without_label=True,
                        env=model.name,
                        w=100, h=100)

            if iteration % checkpoint_interval == 0:
                # notify that we've reached to a new checkpoint.
                print()
                print()
                print('#############')
                print('# checkpoint!')
                print('#############')
                print()

                # test the model.
                model_precision = utils.validate(
                    model, eval_dataset,
                    test_size=test_size, cuda=cuda, verbose=True)

                # update best precision if needed.
                is_best = model_precision > best_precision
                best_precision = max(model_precision, best_precision)

                # save the checkpoint.
                utils.save_checkpoint(model, model_dir, epoch,
                                      model_precision, best=is_best)
                print()
img_mode='NHWC') val_loss, val_accuracy1, = TrainMethod.test(model, val, 512 // args.batch_size, criterion, device, dtype, img_mode='NHWC') val_writer.add_scalar('epoch_accuracy', val_accuracy1, epoch) val_writer.add_scalar('epoch_loss', val_loss, epoch) train_writer.add_scalar('epoch_loss', train_loss, epoch) train_writer.add_scalar('epoch_accuracy', train_accuracy1, epoch) if isinstance(scheduler, ReduceLROnPlateau): scheduler.step(val_loss) elif isinstance(scheduler, MultiStepLR): scheduler.step() if val_accuracy1 > best_test: best_test = val_accuracy1 # TODO:没输入进去? # logger.info('epoch:{}, val_loss:{:.5f}, val_accuracy:{:.5f}'.format(epoch+1, val_loss, val_accuracy1)) save_checkpoint( is_best=False, filepath=args.save, filename='{}-epoch{}-val_loss{:.4f}.pth'.format( args.model_name, epoch, val_loss), state={ 'epoch': epoch, 'state_dict': model.state_dict(),
def main():
    """Train the face landmark / 3D reconstruction model.

    Freezes the ResNet stem (``conv1``, ``bn1``, ``layer1``), optimises the
    remaining parameters with RMSprop under a MultiStepLR schedule, and for
    every batch combines four loss terms (image-level, coefficient, skin-mask
    and landmark).  Every 100 iterations the running averages are printed,
    side-by-side rendered/target images are written to ``debug_images_dir``
    and the meters are restarted; every 500 iterations a snapshot of the
    model weights is saved under ``WRITE_SNAPSHOT_PATH``.

    Relies on module-level names defined elsewhere in the file (``mean``,
    ``std``, ``device``, ``batch_size``, ``image_size``, ``CROP_SIZE``,
    ``cameras``, ``renderer``, ``SUFFIX`` ...).
    """
    setup_information()
    landmarks_model = _model_init()
    train_loader = get_dataset()

    # Freeze the early ResNet layers; only the rest of the network is trained.
    for frozen in (landmarks_model.resnet.conv1,
                   landmarks_model.resnet.bn1,
                   landmarks_model.resnet.layer1):
        for p in frozen.parameters():
            p.requires_grad = False

    params = [p for p in landmarks_model.parameters() if p.requires_grad]
    optimizer = optim.RMSprop(params, lr=LEARNING_RATE)
    scheduler = MultiStepLR(optimizer, milestones=MILESTONES, gamma=0.2)

    def _fresh_meters():
        """Return new running-average meters keyed by their report label."""
        return {
            'landmark_loss': AverageMeter('landmark_avg_loss', ':.4e'),
            'coeff_loss': AverageMeter('coeff_avg_loss', ':.4e'),
            'skin_mask_loss': AverageMeter('skin_mask_loss', ':.4e'),
            'image_leve_loss': AverageMeter('pixel_avg_loss', ':.4e'),
            'loss': AverageMeter('loss', ':.4e'),
        }

    meters = _fresh_meters()

    # The debug directory never changes, so create it once up front.
    target_images_dir = "debug_images_dir"
    os.makedirs(target_images_dir, exist_ok=True)

    for epoch in range(MAX_EPOCH):
        count = 0
        for i_batch, _sample_batched in enumerate(train_loader):
            # Original author's notes on the batch layout:
            #   [0] batch_size x 3 x 224 x 224, bgr, 0..255
            #   [1] batch_size x 224 x 224 x 3, bgr, [0, 255]
            _inputs = _sample_batched[0]
            _bgr_images = _sample_batched[1].float()
            _mask_images = _sample_batched[2].ge(200)
            _gt_pts = _sample_batched[3]

            _inputs = _inputs / 255.
            _inputs = _inputs.sub_(mean[:, None, None]).div_(
                std[:, None, None])
            _gt_pts = _gt_pts / CROP_SIZE * image_size
            _gt_pts = _gt_pts.view(batch_size, -1, 3)

            _inputs = _inputs.to(device)
            _bgr_images = _bgr_images.to(device)
            _mask_images = _mask_images.to(device)
            _gt_pts = _gt_pts.to(device)
            _render_images = _bgr_images.clone()

            coeff = landmarks_model(_inputs)

            # ----- rendering -----
            (id_coeff, ex_coeff, tex_coeff, angles, gamma,
             translation) = landmarks_model.Split_coeff(coeff)
            coeff_loss = landmarks_model.get_coeff_loss(
                id_coeff, ex_coeff, tex_coeff)
            face_shape = landmarks_model.Shape_formation(
                id_coeff, ex_coeff, batch_size)
            face_norm = landmarks_model.Compute_norm(face_shape, batch_size)
            rotation = landmarks_model.Compute_rotation_matrix(angles)
            # Rotate the vertices, then translate them.
            face_shape = torch.matmul(face_shape, rotation)
            face_shape = face_shape + translation.view(-1, 1, 3).repeat(
                1, face_shape.size()[1], 1)
            norm_r = torch.matmul(face_norm, rotation)
            face_texture = landmarks_model.Texture_formation(
                tex_coeff, batch_size)
            face_color, _ = landmarks_model.Illumination_layer(
                face_texture, norm_r, gamma)
            # Convert to the pytorch3d Textures data format.
            face_color = Textures(verts_rgb=face_color.to(device))
            skin_mask_color = face_texture[:, landmarks_model.skinmask, :]
            skin_mask_loss = landmarks_model.get_skin_mask_loss(
                skin_mask_color)
            mesh = Meshes(face_shape.to(device), landmarks_model.face_index,
                          face_color)

            # ----- landmarks -----
            transformed_face_shape = cameras.transform_points(face_shape)
            landmarks = transformed_face_shape[
                :, landmarks_model.facemodel.keypoints, :]
            landmarks = ((landmarks + 1) * image_size - 1) / 2.
            # Both the x and y coordinates need to be flipped.
            landmarks[:, :, :2] = image_size - landmarks[:, :, :2]
            landmark_loss = landmarks_model.get_landmark_loss(
                _gt_pts[:, :, :2], landmarks[:, :, :2])

            # ----- rendered images -----
            images = renderer(mesh)
            images = images[:, :, :, :3]
            images = images[:, :, :, [2, 1, 0]]  # rgb to bgr
            # Paste rendered pixels over the copy of the input image.
            index = (images > 0)
            _render_images[index] = images[index]

            image_leve_loss = landmarks_model.get_image_level_loss(
                _render_images, _bgr_images, _mask_images)

            # Loss weighting.
            landmark_loss = 0.5 * landmark_loss
            image_leve_loss = 0.1 * image_leve_loss
            loss = (image_leve_loss + coeff_loss + skin_mask_loss +
                    landmark_loss)

            meters['loss'].update(loss.detach().item())
            meters['landmark_loss'].update(landmark_loss.detach().item())
            meters['skin_mask_loss'].update(skin_mask_loss.detach().item())
            meters['coeff_loss'].update(coeff_loss.detach().item())
            meters['image_leve_loss'].update(image_leve_loss.detach().item())

            if count % 100 == 0:
                print('Iter: [%d, %5d]' % (epoch, i_batch))
                for label, meter in meters.items():
                    print(' Iter: [%d, %5d]' % (epoch, i_batch) + ' ' +
                          label + ': %.3e' % meter.avg)
                print('\n')

                # Dump rendered | target image pairs for visual inspection.
                rendered_np = _render_images.cpu().detach().numpy()
                target_np = _bgr_images.cpu().detach().numpy()
                for i in range(batch_size):
                    pair = np.concatenate((rendered_np[i], target_np[i]),
                                          axis=1)
                    cv2.imwrite(
                        os.path.join(target_images_dir, str(i) + '.jpg'),
                        pair)

                # Restart every reported meter.  The original code rebound
                # the `skin_mask_loss` tensor name here instead of the
                # skin-mask meter and never reset the coefficient meter.
                meters = _fresh_meters()

            if count % 500 == 0:
                a_save_name = "_".join([
                    SUFFIX, 'iter', 'epoch',
                    '%d' % epoch, 'i_batch',
                    '%d' % i_batch
                ]) + '.pth'
                a_save_path = os.path.join(WRITE_SNAPSHOT_PATH, a_save_name)
                torch.save(landmarks_model.state_dict(), a_save_path)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            count += 1

        # Advance the LR schedule after the epoch's optimizer updates, as
        # PyTorch (>= 1.1) expects; stepping first skips the schedule's
        # first value.
        scheduler.step()
def main():
    """Train a classifier from the global ``args`` and return it.

    Builds the data loaders and model, puts BN/bias parameters in a
    no-weight-decay group, then runs ``args.epochs`` epochs of train/test.
    The best-scoring weights and periodic snapshots are written under
    ``args.save_path``.

    Returns:
        tuple: ``(model, best_acc)``.
    """
    print("***** Running training *****")
    print(f" Task = {args.dataset}")
    print(f" Num Epochs = {args.epochs}")
    print(f" Total train batch size = {args.train_batch}")

    trainset, testset, _, _, num_classes = init_data()
    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=args.train_batch,
        shuffle=True,
        num_workers=args.workers,
        drop_last=True,
        pin_memory=True,
    )
    testloader = torch.utils.data.DataLoader(
        testset,
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        drop_last=False,
        pin_memory=True,
    )

    model = gen_model(args.model, args.depth, args.widen_factor, num_classes,
                      '', False, 0.1)
    model = model.cuda()
    n_params = sum(p.numel() for p in model.parameters())
    print('Total params: %.2fM' % (n_params / 1000000.0))

    def _exempt_from_decay(param_name):
        # bn.weight, bn.bias and classifier.bias, conv2d.bias
        return 'bn' in param_name or 'bias' in param_name

    named = list(model.named_parameters())
    non_wd_params = [p for n, p in named if _exempt_from_decay(n)]
    wd_params = [p for n, p in named if not _exempt_from_decay(n)]
    param_list = [
        {'params': wd_params, 'weight_decay': args.weight_decay},
        {'params': non_wd_params, 'weight_decay': 0},
    ]
    optimizer = optim.SGD(param_list,
                          lr=args.lr,
                          momentum=args.momentum,
                          nesterov=args.nesterov)
    lr_schedule = MultiStepLR(optimizer, args.decay_step, args.lr_decay)

    best_acc = 0
    title = args.task_name
    log_names = [
        'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.', 'Train Top5',
        'Valid Top5'
    ]
    log_path = os.path.join(args.save_path, 'log.txt')

    if not args.resume:
        logger = Logger(log_path, title=title)
        logger.set_names(log_names)
    else:
        # Restore model weights only; the epoch counter restarts at zero.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(
            args.resume), 'Error: no checkpoint directory found!'
        ckpt = torch.load(args.resume)
        model.load_state_dict(ckpt)
        logger = Logger(log_path, title=title, resume=True)

    last_epoch = args.epochs - 1
    for epoch in range(args.epochs):
        current_lr = optimizer.param_groups[0]['lr']
        print('\nEpoch: [%d | %d] LR: %f' %
              (epoch + 1, args.epochs, current_lr))

        train_loss, train_acc, train_top5 = train(args, trainloader, model,
                                                  optimizer)
        test_loss, test_acc, test_top5 = test(testloader, model)

        log_vals = [
            train_loss, test_loss, train_acc, test_acc, train_top5, test_top5
        ]
        logger.append(log_vals)

        # Keep the best-so-far weights, plus periodic/final snapshots.
        if test_acc > best_acc:
            best_path = os.path.join(args.save_path,
                                     f'{args.net_name}_best.pth')
            torch.save(model.state_dict(), best_path)
        if epoch % args.num_save_epoch == 0 or epoch == last_epoch:
            snap_path = os.path.join(args.save_path,
                                     f'{args.net_name}_{epoch}.pth')
            torch.save(model.state_dict(), snap_path)

        lr_schedule.step()
        best_acc = max(test_acc, best_acc)

        summary = f"Epoch: {epoch}," + "".join(
            f"{k}: {v}," for k, v in zip(log_names, log_vals))
        print(summary)

    print('Best test acc:', best_acc)
    return model, best_acc
def main():
    """Train and evaluate RetinaNet on VOC or COCO.

    Parses options from ``sys.argv``, builds datasets/loaders, the model
    (wrapped in ``DataParallel``), focal loss, SGD optimizer and MultiStepLR
    scheduler, optionally resumes from the checkpoint of epoch
    ``opts.start_epoch - 1``, then alternates ``train`` and ``test`` for each
    epoch and steps the scheduler.

    Raises:
        ValueError: if ``opts.data_type`` is neither ``'voc'`` nor ``'coco'``.
    """
    # 1. argparser
    opts = parse(sys.argv[1:])
    print(opts)

    # 3. visdom
    vis = visdom.Visdom(port=opts.port)

    # 4. data set
    if opts.data_type == 'voc':
        train_set = VOC_Dataset(root=opts.data_root, split='train',
                                resize=opts.resize)
        test_set = VOC_Dataset(root=opts.data_root, split='test',
                               resize=opts.resize)
        opts.num_classes = 20
    elif opts.data_type == 'coco':
        train_set = COCO_Dataset(root=opts.data_root, set_name='train2017',
                                 split='train', resize=opts.resize)
        test_set = COCO_Dataset(root=opts.data_root, set_name='val2017',
                                split='test', resize=opts.resize)
        opts.num_classes = 80
    else:
        # Previously fell through with train_set = None and crashed later
        # with an AttributeError on `train_set.collate_fn`.
        raise ValueError(
            "unsupported data_type {!r}; expected 'voc' or 'coco'".format(
                opts.data_type))

    # 5. data loader
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=opts.batch_size,
                                               collate_fn=train_set.collate_fn,
                                               shuffle=True,
                                               num_workers=4,
                                               pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=1,
                                              collate_fn=test_set.collate_fn,
                                              shuffle=False,
                                              num_workers=2,
                                              pin_memory=True)

    # 6. network
    model = RetinaNet(num_classes=opts.num_classes).to(device)
    model = torch.nn.DataParallel(module=model, device_ids=device_ids)
    coder = RETINA_Coder(opts=opts)  # there is center_anchor in coder.

    # 7. loss
    criterion = Focal_Loss(coder=coder)

    # 8. optimizer
    optimizer = torch.optim.SGD(params=model.parameters(),
                                lr=opts.lr,
                                momentum=opts.momentum,
                                weight_decay=opts.weight_decay)

    # 9. scheduler
    scheduler = MultiStepLR(optimizer=optimizer, milestones=[30, 45],
                            gamma=0.1)

    # 10. resume
    if opts.start_epoch != 0:
        # Load the checkpoint written at the end of the previous epoch and
        # continue training from there.
        checkpoint_path = (
            os.path.join(opts.save_path, opts.save_file_name) +
            '.{}.pth.tar'.format(opts.start_epoch - 1))
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        print('\nLoaded checkpoint from epoch %d.\n' %
              (int(opts.start_epoch) - 1))
    else:
        print('\nNo check point to resume.. train from scratch.\n')

    for epoch in range(opts.start_epoch, opts.epoch):
        # 11. train
        train(epoch=epoch,
              vis=vis,
              train_loader=train_loader,
              model=model,
              criterion=criterion,
              optimizer=optimizer,
              scheduler=scheduler,
              opts=opts)

        # 12. test
        test(epoch=epoch,
             vis=vis,
             test_loader=test_loader,
             model=model,
             criterion=criterion,
             coder=coder,
             opts=opts)

        scheduler.step()
def train_validate(learningRate, patience, momentum=0.5):
    """Train ``LetterCNN`` with early stopping, then plot accuracy and loss.

    Uses the module-level ``train_loader``, ``validation_loader`` and
    ``epoches``.  After training (or an early stop) the weights stored in
    ``checkpoint.pt`` are loaded back into the model and the per-epoch
    curves are shown with matplotlib.

    Args:
        learningRate: initial learning rate for SGD.
        patience: passed to ``EarlyStopping``.
        momentum: SGD momentum.

    Fixes relative to the previous version:
      * the per-epoch training-accuracy buffer is now actually cleared each
        epoch (a misspelt variable name left it accumulating forever);
      * validation runs under ``torch.no_grad()``;
      * validation accuracy is recorded once per epoch over all validation
        samples, instead of indexing a per-batch list by epoch number
        (identical when the validation loader yields a single batch).
    """
    model = LetterCNN()
    model.apply(Init_weights)  # weights initialization

    # Training statistics, one entry per epoch.
    avg_loss_per_epoch_list = []   # mean loss over the epoch's batches
    loss_list = []                 # loss of the epoch's last batch
    accuracy_per_epoch_list = []   # mean accuracy over the epoch's batches
    accuracy_list = []             # accuracy of the epoch's last batch

    # Validation statistics, one entry per epoch.
    val_accuracy_list = []
    val_avg_loss = []

    # Loss function and optimizer.
    lossCriterion = nn.CrossEntropyLoss(weight=torch.FloatTensor(
        [1.4, 1.4, 0.8, 0.8, 1, 0.9, 0.9, 1]))
    optimizer = optim.SGD(model.parameters(), lr=learningRate,
                          momentum=momentum)
    scheduler = MultiStepLR(optimizer, milestones=[6, 15, 20, 30], gamma=0.7)
    early_stopping = EarlyStopping(patience=patience, verbose=True)

    for epoch in range(epoches):
        # ---------------- training ----------------
        model.train()
        loss_temp = []       # per-batch losses of this epoch
        accuracy_temp = []   # per-batch accuracies of this epoch
        for iteration, (images, labels) in enumerate(train_loader):
            outputs = model(images)
            loss = lossCriterion(outputs, labels)
            loss1 = loss.item()
            loss_temp.append(loss1)

            optimizer.zero_grad()  # avoid accumulating gradients
            loss.backward()
            optimizer.step()

            total = labels.size(0)
            pro8 = F.softmax(outputs, dim=1).detach()
            _, predicted = torch.max(pro8, 1)
            correct = (predicted == labels).sum().item()
            acc = correct / total
            accuracy_temp.append(acc)

        accuracy_list.append(acc)
        accuracy_per_epoch_list.append(np.average(accuracy_temp))
        loss_list.append(loss1)
        avg_loss_per_epoch_list.append(np.average(loss_temp))
        print('Trainmodel Epoch[{}/{}], Loss:{:.4f}, Accuracy:{:.2f}%'.
              format(epoch + 1, epoches, loss1, acc * 100))

        # ---------------- validation ----------------
        model.eval()
        val_loss_list = []
        correct_val = 0
        total_val = 0
        with torch.no_grad():
            for images, labels in validation_loader:
                outputs = model(images)
                _, predicted = torch.max(F.softmax(outputs, dim=1), 1)
                correct_val += (predicted == labels).sum().item()
                total_val += labels.size(0)
                val_loss_list.append(lossCriterion(outputs, labels).item())

        epoch_val_loss = np.average(val_loss_list)
        val_avg_loss.append(epoch_val_loss)
        val_accuracy_list.append(correct_val / total_val)
        print('Validation Epoch[{}/{}]:, Loss:{:.4f}, Accuracy:{:.2f}%\n'.
              format(epoch + 1, epoches, val_avg_loss[-1],
                     val_accuracy_list[-1] * 100))

        # Early stopping on the average validation loss; it is handed the
        # model so it can checkpoint it.
        early_stopping(epoch_val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break

        scheduler.step()

    # Restore the weights from the checkpoint file.
    model.load_state_dict(torch.load('checkpoint.pt'))

    # Plot the accuracy and loss curves.
    fig = plt.figure(num=2, figsize=(15, 8), dpi=80)
    ax1 = fig.add_subplot(2, 1, 1)
    ax2 = fig.add_subplot(2, 1, 2)
    ax1.plot(range(len(accuracy_list)), accuracy_list, color='g',
             label='Train_Accuracy')
    ax1.plot(range(len(val_accuracy_list)), val_accuracy_list, color='r',
             label='Validation_Accuracy')
    ax2.plot(range(len(loss_list)), avg_loss_per_epoch_list, color='g',
             label='Train_Loss')
    ax2.plot(range(len(val_avg_loss)), val_avg_loss, color='r',
             label='validation_Loss')
    ax1.set_xlabel('Epochs')
    ax2.set_xlabel('Epochs')
    ax1.set_ylabel('Accuracy')
    ax2.set_ylabel('Loss')
    ax1.set_title('Accuracy')
    ax2.set_title('Loss')
    ax1.legend()
    ax2.legend()
    plt.show()
def main():
    """Train (optionally) and test the SGN model configured by ``args``.

    When ``args.train == 1`` the model is trained for epochs
    ``args.start_epoch .. args.max_epochs - 1``; the checkpoint with the best
    monitored value (``args.monitor`` is ``'val_acc'`` or ``'val_loss'``) is
    saved and a CSV log of per-epoch metrics is written.  Afterwards a fresh
    model is built and ``test`` is run against the saved checkpoint path.

    Raises:
        ValueError: if ``args.monitor`` is not ``'val_acc'`` or ``'val_loss'``.
    """
    args.num_classes = get_num_classes(args.dataset)
    model = SGN(args.num_classes, args.dataset, args.seg, args)

    total = get_n_params(model)
    print(model)
    print('The number of parameters: ', total)
    print('The modes is:', args.network)

    if torch.cuda.is_available():
        print('It is using GPU!')
        model = model.cuda()

    criterion = LabelSmoothingLoss(args.num_classes, smoothing=0.1).cuda()
    optimizer = optim.Adam(model.parameters(), lr=args.lr,
                           weight_decay=args.weight_decay)

    # `np.inf` replaces the `np.Inf` alias, which NumPy 2.0 removed.
    if args.monitor == 'val_acc':
        mode = 'max'
        monitor_op = np.greater
        best = -np.inf
        str_op = 'improve'
    elif args.monitor == 'val_loss':
        mode = 'min'
        monitor_op = np.less
        best = np.inf
        str_op = 'reduce'
    else:
        # Previously left `mode`/`best` undefined and failed with a
        # NameError in the middle of the first epoch.
        raise ValueError(
            "args.monitor must be 'val_acc' or 'val_loss', got %r" %
            (args.monitor,))

    scheduler = MultiStepLR(optimizer, milestones=[60, 90, 110], gamma=0.1)

    # Data loading
    ntu_loaders = NTUDataLoaders(args.dataset, args.case, seg=args.seg)
    train_loader = ntu_loaders.get_train_loader(args.batch_size, args.workers)
    val_loader = ntu_loaders.get_val_loader(args.batch_size, args.workers)
    train_size = ntu_loaders.get_train_size()
    val_size = ntu_loaders.get_val_size()
    test_loader = ntu_loaders.get_test_loader(32, args.workers)
    print('Train on %d samples, validate on %d samples' %
          (train_size, val_size))

    best_epoch = 0
    output_dir = make_dir(args.dataset)
    save_path = os.path.join(output_dir, args.network)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    checkpoint = osp.join(save_path, '%s_best.pth' % args.case)
    csv_file = osp.join(save_path, '%s_log.csv' % args.case)
    log_res = list()
    lable_path = osp.join(save_path, '%s_lable.txt' % args.case)
    pred_path = osp.join(save_path, '%s_score.npy' % args.case)

    # Training
    if args.train == 1:
        for epoch in range(args.start_epoch, args.max_epochs):
            print(epoch, optimizer.param_groups[0]['lr'])
            t_start = time.time()
            train_loss, train_acc = train(train_loader, model, criterion,
                                          optimizer, epoch)
            val_loss, val_acc = validate(val_loader, model, criterion)
            log_res += [[train_loss, train_acc.cpu().numpy(),
                         val_loss, val_acc.cpu().numpy()]]

            print(
                'Epoch-{:<3d} {:.1f}s\t'
                'Train: loss {:.4f}\taccu {:.4f}\tValid: loss {:.4f}\taccu {:.4f}'
                .format(epoch + 1,
                        time.time() - t_start, train_loss, train_acc,
                        val_loss, val_acc))

            current = val_loss if mode == 'min' else val_acc
            # Bring tensors to the CPU for the NumPy comparison.  The loss is
            # logged above as a plain number, so only call .cpu() on tensors.
            if torch.is_tensor(current):
                current = current.cpu()

            if monitor_op(current, best):
                print('Epoch %d: %s %sd from %.4f to %.4f, '
                      'saving model to %s' %
                      (epoch + 1, args.monitor, str_op, best, current,
                       checkpoint))
                best = current
                best_epoch = epoch + 1
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'best': best,
                        'monitor': args.monitor,
                        'optimizer': optimizer.state_dict(),
                    }, checkpoint)
            else:
                print('Epoch %d: %s did not %s' %
                      (epoch + 1, args.monitor, str_op))

            scheduler.step()

        print('Best %s: %.4f from epoch-%d' %
              (args.monitor, best, best_epoch))
        # newline='' is what the csv module requires to avoid blank rows on
        # platforms that translate line endings.
        with open(csv_file, 'w', newline='') as fw:
            cw = csv.writer(fw)
            cw.writerow(['loss', 'acc', 'val_loss', 'val_acc'])
            cw.writerows(log_res)
        print('Save train and validation log into into %s' % csv_file)

    # Test
    args.train = 0
    model = SGN(args.num_classes, args.dataset, args.seg, args)
    model = model.cuda()
    test(test_loader, model, checkpoint, lable_path, pred_path)
class Trainer(object):
    """Train/evaluate a classifier with optional FP16 mixed precision.

    In FP16 mode the network is converted to half precision (BatchNorm kept
    in FP32), an FP32 "master" copy of the parameters is optimised, and the
    updated master weights are copied back into the FP16 model after every
    step.  Optional static loss scaling multiplies the loss by
    ``_LOSS_SCALE`` before backward and divides the master gradients by the
    same factor before the update.
    """

    def __init__(self,
                 model_name,
                 model,
                 lr,
                 train_on_gpu=False,
                 fp16=False,
                 loss_scaling=False):
        """Store settings, prepare the model and build optimizer/scheduler.

        Args:
            model_name: name used in log lines and checkpoint paths.
            model: the ``nn.Module`` to train.
            lr: base learning rate.
            train_on_gpu: move model and batches to CUDA.
            fp16: request mixed precision (honoured only when training on
                GPU with cuDNN enabled).
            loss_scaling: request static loss scaling.
        """
        self.model = model
        self.lr = lr
        self.model_name = model_name
        self.train_on_gpu = train_on_gpu
        self.loss_scaling = loss_scaling
        if train_on_gpu and torch.backends.cudnn.enabled:
            self.fp16_mode = fp16
        else:
            self.fp16_mode = False
            self.loss_scaling = False
            print("CuDNN backend not available. Can't train with FP16.")

        self.best_acc = 0
        self.best_epoch = 0
        self._LOSS_SCALE = 128.0

        if self.train_on_gpu:
            self.model = self.model.cuda()

        if self.fp16_mode:
            self.model = self.network_to_half(self.model)
            self.model_params, self.master_params = self.prep_param_list(
                self.model)

        # Declare optimizer (unless one was already attached to the instance).
        if not hasattr(self, 'optimizer'):
            if self.fp16_mode:
                self.optimizer = optim.SGD(
                    self.master_params,
                    self.lr,
                    momentum=0.9,
                    weight_decay=5e-4)
            else:
                self.optimizer = optim.SGD(
                    self.model.parameters(),
                    self.lr,
                    momentum=0.9,
                    weight_decay=5e-4)
        self.scheduler = MultiStepLR(
            self.optimizer, milestones=[10, 20, 50, 100, 180], gamma=0.1)

        print('\n Model: {} | Training on GPU: {} | Mixed Precision: {} |'
              'Loss Scaling: {}'.format(self.model_name, self.train_on_gpu,
                                        self.fp16_mode, self.loss_scaling))

    def prep_param_list(self, model):
        """Create two sets of parameters: the model's own and an FP32 copy.

        Gradient updates can fall outside the FP16 range, so the weights are
        updated in FP32 and converted back to FP16.

        Returns:
            tuple: ``(model_params, master_params)`` where ``master_params``
            are detached float clones with ``requires_grad`` enabled.
        """
        model_params = [p for p in model.parameters() if p.requires_grad]
        master_params = [p.detach().clone().float() for p in model_params]

        for p in master_params:
            p.requires_grad = True

        return model_params, master_params

    def master_params_to_model_params(self, model_params, master_params):
        """Copy the FP32 master params into the model params, in place."""
        for model_p, master_p in zip(model_params, master_params):
            model_p.data.copy_(master_p.data)

    def model_grads_to_master_grads(self, model_params, master_params):
        """Copy the model's gradients onto the matching master params."""
        for model_p, master_p in zip(model_params, master_params):
            if master_p.grad is None:
                # Allocate a gradient buffer with the master's shape/dtype;
                # it is fully overwritten by the copy below.
                master_p.grad = torch.empty_like(master_p.data)
            master_p.grad.data.copy_(model_p.grad.data)

    def BN_convert_float(self, module):
        """Recursively convert BatchNorm layers back to single precision.

        Designed to work with ``network_to_half``.  This is done by walking
        ``children()`` rather than with ``.apply`` so the float conversion
        can be guarded on the module type.

        Returns:
            The same ``module`` instance.
        """
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
            module.float()
        for child in module.children():
            self.BN_convert_float(child)
        return module

    class tofp16(nn.Module):
        """Layer that converts its input to FP16 (``input.half()``)."""

        def __init__(self):
            super(Trainer.tofp16, self).__init__()

        def forward(self, input):
            return input.half()

    def network_to_half(self, network):
        """Convert model to half precision in a batchnorm-safe way."""
        return nn.Sequential(self.tofp16(),
                             self.BN_convert_float(network.half()))

    def warmup_learning_rate(self, init_lr, no_of_steps, epoch, len_epoch):
        """Return the warm-up learning rate used for the first 5 epochs.

        Computes ``init_lr * 0.1 ** (no_of_steps // 30)`` and multiplies it by
        ``(1 + epoch + no_of_steps * len_epoch) / (5 * len_epoch)``.
        """
        factor = no_of_steps // 30
        lr = init_lr * (0.1**factor)
        # Warmup
        lr = lr * float(1 + epoch + no_of_steps * len_epoch) / (5. * len_epoch)
        return lr

    def train(self, epoch, no_of_steps, trainloader):
        """Run one training epoch.

        Epochs 0-4 use the warm-up learning rate, epoch 5 restores the base
        learning rate, and from epoch 5 on the MultiStepLR scheduler is
        stepped once after the epoch's updates.

        Loss scaling is applied only in FP16 mode, because that is the only
        path that divides the gradients back by the scale factor; applying
        it in FP32 mode silently multiplied every update by the scale.  The
        reported running loss is always the unscaled loss.
        """
        self.model.train()

        train_loss, correct, total = 0, 0, 0

        # If epoch less than 5 use warmup, else use scheduler.
        if epoch < 5:
            lr = self.warmup_learning_rate(self.lr, no_of_steps, epoch,
                                           len(trainloader))
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = lr
        elif epoch == 5:
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.lr

        print('Learning Rate: %g' % self.optimizer.param_groups[0]['lr'])

        criterion = nn.CrossEntropyLoss()
        use_loss_scale = self.loss_scaling and self.fp16_mode

        for idx, (inputs, targets) in enumerate(trainloader):
            if self.train_on_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()
            self.model.zero_grad()
            outputs = self.model(inputs)
            loss = criterion(outputs, targets)

            # Small losses may underflow in FP16, so scale by a power of two
            # (2**7) before backward.
            if use_loss_scale:
                backward_loss = loss * self._LOSS_SCALE
            else:
                backward_loss = loss

            # Calculate the gradients
            backward_loss.backward()

            if self.fp16_mode:
                # Move the gradients to the master params so the update is
                # applied in FP32.
                self.model_grads_to_master_grads(self.model_params,
                                                 self.master_params)
                if use_loss_scale:
                    # Undo the loss scaling now that the gradients are FP32.
                    for params in self.master_params:
                        params.grad.data = (params.grad.data /
                                            self._LOSS_SCALE)
                # Apply weight update in FP32.
                self.optimizer.step()
                # Copy the updated weights back into the FP16 model weights.
                self.master_params_to_model_params(self.model_params,
                                                   self.master_params)
            else:
                self.optimizer.step()

            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += (targets == predicted).sum().item()

            progress_bar(
                idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' %
                (train_loss / (idx + 1), 100. * correct / total, correct,
                 total))

        if epoch >= 5:
            self.scheduler.step()

    def evaluate(self, epoch, testloader):
        """Evaluate on ``testloader``; save the model if accuracy improved."""
        self.model.eval()

        test_loss = 0
        correct = 0
        total = 0

        criterion = nn.CrossEntropyLoss()

        with torch.no_grad():
            for idx, (test_x, test_y) in enumerate(testloader):
                if self.train_on_gpu:
                    test_x, test_y = test_x.cuda(), test_y.cuda()
                outputs = self.model(test_x)
                loss = criterion(outputs, test_y)

                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += test_y.size(0)
                correct += (predicted == test_y).sum().item()

                # Report the running mean loss (previously the last batch's
                # loss was divided by the batch count).
                progress_bar(
                    idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' %
                    (test_loss / (idx + 1), 100. * correct / total, correct,
                     total))

        acc = 100.0 * correct / total
        if acc > self.best_acc:
            self.save_model(self.model, self.model_name, acc, epoch)

    def _checkpoint_dir(self, model_name):
        """Return the directory checkpoints are stored in for this mode."""
        suffix = '_fp16' if self.fp16_mode else ''
        return os.path.join('weights', model_name + suffix)

    def save_model(self, model, model_name, acc, epoch):
        """Save weights/accuracy/epoch and record them as the best so far."""
        state = {
            'net': model.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }

        save_name = os.path.join(self._checkpoint_dir(model_name),
                                 'weights.%03d.%.03f.pt' % (epoch, acc))

        if not os.path.exists(os.path.dirname(save_name)):
            os.makedirs(os.path.dirname(save_name))

        torch.save(state, save_name)
        print("\nSaved state at %.03f%% accuracy. Prev accuracy: %.03f%%" %
              (acc, self.best_acc))
        self.best_acc = acc
        self.best_epoch = epoch

    def load_model(self, path=None):
        """Load a previously saved model; precision type is not checked.

        Args:
            path: checkpoint file to load.  When ``None`` the best checkpoint
                recorded by ``save_model`` is used, looked up in the same
                directory ``save_model`` writes to (the FP32 branch used to
                look in the ``_fp16`` directory and never found its file).
        """
        if path is not None:
            checkpoint_name = path
        else:
            checkpoint_name = os.path.join(
                self._checkpoint_dir(self.model_name),
                'weights.%03d.%.03f.pt' % (self.best_epoch, self.best_acc))
        if not os.path.exists(checkpoint_name):
            print("Best model not found")
            return
        checkpoint = torch.load(checkpoint_name)
        self.model.load_state_dict(checkpoint['net'])
        self.best_acc = checkpoint['acc']
        self.best_epoch = checkpoint['epoch']
        print("Loaded Model with accuracy: %.3f%%, from epoch: %d" %
              (checkpoint['acc'], checkpoint['epoch'] + 1))

    def train_and_evaluate(self, traindataloader, testdataloader, no_of_steps):
        """Run ``no_of_steps`` epochs of train + evaluate."""
        self.best_acc = 0.0
        for i in range(no_of_steps):
            print('\nEpoch: %d' % (i + 1))
            self.train(i, no_of_steps, traindataloader)
            self.evaluate(i, testdataloader)
def train_and_evaluate(model, train_loader, test_loader, optimizer, criterion,
                       criterion_T, accuracy, model_dir, args):
    """Train and evaluate for ``args.num_epochs`` epochs with checkpointing.

    After each epoch the latest metrics and a ``last.pth`` checkpoint (model,
    optimizer, epoch, accuracies) are written to ``model_dir``; when the test
    accuracy is at least the best seen so far, the metrics are also saved to
    ``test_best_metrics.json`` and the checkpoint is copied to ``best.pth``.
    With ``args.resume`` set, training continues from ``last.pth`` in that
    directory.

    Changes relative to the previous version:
      * the LR scheduler is stepped after each epoch's optimizer updates
        (stepping it first skips the schedule's first value on
        PyTorch >= 1.1);
      * on resume the schedule is rebuilt by replaying ``scheduler.step()``
        from the base learning rates rather than the deprecated
        ``scheduler.step(epoch)`` call;
      * the TensorBoard writer is closed even when training raises.
    """
    start_epoch = 0
    best_acc = 0.

    # Learning rate schedule.
    scheduler = MultiStepLR(optimizer, milestones=args.schedule, gamma=0.1)

    # TensorboardX setup
    writer = SummaryWriter(log_dir=model_dir)

    # Selects which accuracy decides "best": the ensemble accTop1 (True) or
    # the mean accTop1 (False).
    choose_E = False
    best_key = 'test_accTop1' if choose_E else 'mean_test_accTop1'

    # Per-epoch metric slots, exported after every epoch.
    result_train_metrics = list(range(args.num_epochs))
    result_test_metrics = list(range(args.num_epochs))

    try:
        # If the training was interrupted, pick up from the last checkpoint.
        if args.resume:
            logging.info('Resuming from checkpoint..')
            resumePath = os.path.join(args.resume, 'last.pth')
            assert os.path.isfile(
                resumePath), 'Error: no checkpoint directory found!'
            checkpoint = torch.load(resumePath)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optim_dict'])

            # Resume from the last epoch.  The loaded optimizer state may
            # carry an already-decayed LR, so restart from the base LRs and
            # replay the schedule for the epochs that were completed.
            start_epoch = checkpoint['epoch']
            for group, base_lr in zip(optimizer.param_groups,
                                      scheduler.base_lrs):
                group['lr'] = base_lr
            for _ in range(start_epoch):
                scheduler.step()

            best_acc = checkpoint[best_key]
            result_train_metrics = torch.load(
                os.path.join(args.resume, 'train_metrics'))
            result_test_metrics = torch.load(
                os.path.join(args.resume, 'test_metrics'))

        for epoch in range(start_epoch, args.num_epochs):
            # Run one epoch
            logging.info("Epoch {}/{}".format(epoch + 1, args.num_epochs))

            train_metrics = train(train_loader, model, optimizer, criterion,
                                  criterion_T, accuracy, args)
            writer.add_scalar('Train/Loss', train_metrics['train_loss'],
                              epoch + 1)
            writer.add_scalar('Train/AccTop1', train_metrics['train_accTop1'],
                              epoch + 1)

            # Evaluate for one epoch on the validation set
            test_metrics = evaluate(test_loader, model, criterion,
                                    criterion_T, accuracy, args)
            test_acc = test_metrics[best_key]
            writer.add_scalar('Test/Loss', test_metrics['test_loss'],
                              epoch + 1)
            writer.add_scalar('Test/AccTop1', test_metrics['test_accTop1'],
                              epoch + 1)

            result_train_metrics[epoch] = train_metrics
            result_test_metrics[epoch] = test_metrics

            # Save latest train/test metrics
            torch.save(result_train_metrics,
                       os.path.join(model_dir, 'train_metrics'))
            torch.save(result_test_metrics,
                       os.path.join(model_dir, 'test_metrics'))

            # Save latest model weights, optimizer and accuracy
            last_path = os.path.join(model_dir, 'last.pth')
            torch.save(
                {
                    'state_dict': model.state_dict(),
                    'epoch': epoch + 1,
                    'optim_dict': optimizer.state_dict(),
                    'test_accTop1': test_metrics['test_accTop1'],
                    'mean_test_accTop1': test_metrics['mean_test_accTop1']
                }, last_path)

            if test_acc >= best_acc:
                logging.info("- Found better accuracy")
                best_acc = test_acc
                # Save best metrics in a json file in the model directory
                test_metrics['epoch'] = epoch + 1
                utils.save_dict_to_json(
                    test_metrics,
                    os.path.join(model_dir, "test_best_metrics.json"))
                # Keep a copy of the checkpoint as the best model
                shutil.copyfile(last_path,
                                os.path.join(model_dir, 'best.pth'))

            # Advance the LR schedule for the next epoch.
            scheduler.step()
    finally:
        writer.close()
def main(args):
    """Train a UNet according to the model and dataset configuration files.

    Reads the model config (``args.model``) and dataset config
    (``args.dataset``), optionally resumes from a checkpoint file named by
    ``args.resume`` inside the configured checkpoint directory, then trains
    and validates for the configured number of epochs. After every epoch a
    history plot and a ``checkpoint-<epoch>-of-<total>.pth`` state dict are
    written to the checkpoint directory.

    Args:
        args: Namespace with ``model`` and ``dataset`` (passed to
            ``load_config``) and ``resume`` (checkpoint file name relative to
            the checkpoint directory, or falsy to start from scratch).

    Exits the process via ``sys.exit`` when CUDA is requested in the config
    but not available.
    """
    model = load_config(args.model)
    dataset = load_config(args.dataset)

    use_cuda = model['common']['cuda']
    checkpoint_dir = model['common']['checkpoint']

    device = torch.device('cuda' if use_cuda else 'cpu')

    if use_cuda and not torch.cuda.is_available():
        sys.exit('Error: CUDA requested but not available')

    os.makedirs(checkpoint_dir, exist_ok=True)

    num_classes = len(dataset['common']['classes'])
    net = UNet(num_classes).to(device)

    if args.resume:
        path = os.path.join(checkpoint_dir, args.resume)

        def map_location(storage, _):
            return storage.cuda() if use_cuda else storage.cpu()

        chkpt = torch.load(path, map_location=map_location)

        # Checkpoints are saved from the (possibly DataParallel-wrapped) net
        # at the end of each epoch, so their keys may carry a "module."
        # prefix. The net is still unwrapped here, so drop the prefix to keep
        # the strict key match in load_state_dict from failing.
        prefix = 'module.'
        if chkpt and all(key.startswith(prefix) for key in chkpt):
            chkpt = collections.OrderedDict(
                (key[len(prefix):], value) for key, value in chkpt.items())

        net.load_state_dict(chkpt)

        # File names follow 'checkpoint-{:05d}-of-{:05d}.pth' (see the save at
        # the end of the epoch loop): 'checkpoint-' is 11 characters, followed
        # by the five-digit number of completed epochs.
        resume_at_epoch = int(args.resume[11:16])
    else:
        resume_at_epoch = 0

    if use_cuda:
        torch.backends.cudnn.benchmark = True
        net = DataParallel(net)

    optimizer = SGD(net.parameters(), lr=model['opt']['lr'],
                    momentum=model['opt']['momentum'])
    scheduler = MultiStepLR(optimizer, milestones=model['opt']['milestones'],
                            gamma=model['opt']['gamma'])

    weight = torch.Tensor(dataset['weights']['values'])

    # Fast-forward the LR schedule over the epochs that already ran.
    for _ in range(resume_at_epoch):
        scheduler.step()

    criterion = CrossEntropyLoss2d(weight=weight).to(device)

    train_loader, val_loader = get_dataset_loaders(model, dataset)

    num_epochs = model['opt']['epochs']

    # Maps 'train <metric>' / 'val <metric>' to the list of per-epoch values.
    history = collections.defaultdict(list)

    for epoch in range(resume_at_epoch, num_epochs):
        print('Epoch: {}/{}'.format(epoch + 1, num_epochs))

        train_hist = train(train_loader, num_classes, device, net, optimizer,
                           scheduler, criterion)
        print('Train loss: {:.4f}, mean IoU: {:.4f}'.format(
            train_hist['loss'], train_hist['iou']))

        for k, v in train_hist.items():
            history['train ' + k].append(v)

        val_hist = validate(val_loader, num_classes, device, net, criterion)
        print('Validate loss: {:.4f}, mean IoU: {:.4f}'.format(
            val_hist['loss'], val_hist['iou']))

        for k, v in val_hist.items():
            history['val ' + k].append(v)

        visual = 'history-{:05d}-of-{:05d}.png'.format(epoch + 1, num_epochs)
        plot(os.path.join(checkpoint_dir, visual), history)

        checkpoint = 'checkpoint-{:05d}-of-{:05d}.pth'.format(
            epoch + 1, num_epochs)
        torch.save(net.state_dict(), os.path.join(checkpoint_dir, checkpoint))