def val(model, dataset):
    '''
    Compute the model's accuracy on the validation set.
    Returns the top-3 and top-1 accuracies.
    '''
    model.eval()
    dataset.val()
    acc_meter = meter.AverageValueMeter()
    top1_meter = meter.AverageValueMeter()
    dataloader = t.utils.data.DataLoader(dataset, opt.batch_size, opt.shuffle,
                                         num_workers=opt.workers, pin_memory=True)
    for ii, data in enumerate(tqdm.tqdm(dataloader)):
        input, label, _ = data
        val_input = Variable(input, volatile=True).cuda()
        val_label = Variable(label.type(t.LongTensor), volatile=True).cuda()
        score = model(val_input)
        acc = topk_acc(score.data, label.cuda())
        top1 = topk_acc(score.data, label.cuda(), k=1)
        acc_meter.add(acc)
        top1_meter.add(top1)
    # restore training mode before returning
    model.train()
    dataset.train()
    print(acc_meter.value()[0], top1_meter.value()[0])
    return acc_meter.value()[0], top1_meter.value()[0]
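# topk_acc is used throughout but not defined in this file. A minimal sketch of
# what it might look like, assuming it returns the fraction of samples whose
# true label appears among the top-k scores (default k=3, matching the
# docstring above); the default and the exact return convention are
# assumptions, not the project's actual implementation:
def topk_acc(score, label, k=3):
    # score: (N, C) class scores; label: (N,) ground-truth class indices
    _, pred = score.topk(k, dim=1)         # (N, k) indices of the k best scores
    correct = pred.eq(label.view(-1, 1))   # broadcast-compare against the labels
    return float(correct.sum()) / label.size(0)  # fraction of top-k hits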
def val(model, dataset):
    '''
    Compute the model's accuracy on the validation set.
    Returns the top-3 and top-1 accuracies.
    '''
    model.eval()
    dataset.val()
    acc_meter = meter.AverageValueMeter()
    top1_meter = meter.AverageValueMeter()
    # halve the batch size at validation time; DataLoader expects an int,
    # so use integer division
    dataloader = t.utils.data.DataLoader(dataset, opt.batch_size // 2, opt.shuffle,
                                         num_workers=8, pin_memory=True)
    for ii, data in tqdm.tqdm(enumerate(dataloader)):
        input, label, _ = data
        # ten-crop variant, kept for reference:
        # bs, ncrops, c, h, w = input.size()
        # val_input = Variable(input.view(-1, c, h, w), volatile=True).cuda()
        val_input = Variable(input, volatile=True).cuda()
        # val_label = Variable(label.type(t.LongTensor), volatile=True).cuda()
        # scores = model(val_input)  # fuse batch size and ncrops: (bs * ncrops, 80)
        # prob = t.nn.functional.softmax(scores)
        # prob_avg = prob.view(bs, ncrops, -1).mean(1)
        prob_avg = model(val_input)
        acc = topk_acc(prob_avg.data, label.cuda())
        top1 = topk_acc(prob_avg.data, label.cuda(), k=1)
        acc_meter.add(acc)
        top1_meter.add(top1)
    # restore training mode before returning
    model.train()
    dataset.train()
    print(acc_meter.value()[0], top1_meter.value()[0])
    return acc_meter.value()[0], top1_meter.value()[0]
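# Note on both val() variants above: Variable(..., volatile=True) is pre-0.4
# PyTorch and has since been removed. On a modern build, the equivalent
# inference-only forward pass would be, as a sketch:
#     with t.no_grad():
#         prob_avg = model(input.cuda())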
def train(epoch, warm_up=True):
    model.train()
    train_loss = 0
    total, top1_correct, top5_correct = 0, 0, 0
    top1_acc, top5_acc = 0, 0
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # learning rate warm-up: step the scheduler once per batch
        if warm_up and epoch <= WARM_UP_EPOCHS:
            warm_up_scheduler.step()
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        if OBJ_FUNC == 'FL':
            outputs = F.softmax(outputs, dim=1)
        optimizer.zero_grad()
        # loss_param = {'y_hat': outputs, 'y': targets}; loss = primary_criterion(**loss_param)
        loss = primary_criterion(outputs, targets)
        # guard against both NaN and infinite losses before backprop
        if torch.isnan(loss) or torch.isinf(loss):
            print('[ERROR] invalid loss (%s), stop training.' % loss)
            exit(1)
        # retain the graph so the complement loss (COT) can backpropagate too
        loss.backward(retain_graph=True)
        if OBJ_FUNC == 'COT':
            entropy = complement_criterion(outputs, targets)
            entropy.backward()
        optimizer.step()

        train_loss += loss.item()
        # _, predicted = outputs.max(1)
        total += targets.size()[0]
        top_acc_list = utils.topk_acc(outputs, targets, topk=(1, 5))
        # top1_correct += predicted.eq(targets).sum().item()
        top1_correct += top_acc_list[0]
        top5_correct += top_acc_list[1]
        top1_acc = 100.0 * (top1_correct / total)
        top5_acc = 100.0 * (top5_correct / total)
        utils.progress_bar(
            'Train', epoch + 1, batch_idx, len(train_loader),
            msg='Loss: %.3f | Acc: [top-1] %.3f%% (%d/%d), [top-5] %.3f%% (%d/%d)'
            % (train_loss / (batch_idx + 1), top1_acc, top1_correct, total,
               top5_acc, top5_correct, total))
    train_loss /= len(train_loader)
    tensor_board_writer.add_scalars('train/acc', {'acc': top1_acc}, epoch)
    tensor_board_writer.add_scalars('train/loss', {'loss': train_loss}, epoch)
    train_csv_writer.writerow({
        'epoch': epoch,
        'loss': train_loss,
        'acc': top1_acc.item(),
        'top5acc': top5_acc.item(),
        'lr': optimizer.state_dict()['param_groups'][0]['lr']
    })
    train_csv_file.flush()
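# warm_up_scheduler is constructed elsewhere in the script. One common way to
# build it, as a sketch: ramp the learning rate linearly from near zero to the
# base value over the first WARM_UP_EPOCHS worth of batches. WARM_UP_ITERS and
# the LambdaLR schedule below are assumptions, not this repo's exact setup.
from torch.optim.lr_scheduler import LambdaLR

WARM_UP_ITERS = WARM_UP_EPOCHS * len(train_loader)  # total warm-up steps
warm_up_scheduler = LambdaLR(
    optimizer,
    # scale factor grows from 1/WARM_UP_ITERS to 1.0, one step per batch
    lr_lambda=lambda step: min(1.0, (step + 1) / WARM_UP_ITERS))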
def valid(epoch):
    global best_top1_valid_acc
    model.eval()
    valid_loss = 0
    total, top1_correct, top5_correct = 0, 0, 0
    top1_acc, top5_acc = 0, 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(valid_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = cross_entropy(outputs, targets)
            valid_loss += loss.item()
            total += targets.size()[0]
            top_acc_list = utils.topk_acc(outputs, targets, topk=(1, 5))
            top1_correct += top_acc_list[0]
            top5_correct += top_acc_list[1]
            top1_acc = 100.0 * (top1_correct / total)
            top5_acc = 100.0 * (top5_correct / total)
            utils.progress_bar(
                'Valid', epoch + 1, batch_idx, len(valid_loader),
                msg='Loss: %.3f | Acc: [top-1] %.3f%% (%d/%d), [top-5] %.3f%% (%d/%d)'
                % (valid_loss / (batch_idx + 1), top1_acc, top1_correct, total,
                   top5_acc, top5_correct, total))
    # checkpoint whenever the validation top-1 accuracy improves
    if top1_acc > best_top1_valid_acc:
        print('Saving current parameters of the model (checkpoint).')
        state = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'acc': top1_acc,
            'top5acc': top5_acc,
            'loss': valid_loss,
        }
        # torch.save(state, log_dir + '/' + str(top1_acc.item()) + '.pth')
        torch.save(state, log_dir + '/' + 'model' + '.pth')
        best_top1_valid_acc = top1_acc
    valid_loss /= len(valid_loader)
    tensor_board_writer.add_scalars('valid/acc', {'acc': top1_acc}, epoch)
    tensor_board_writer.add_scalars('valid/loss', {'loss': valid_loss}, epoch)
    valid_csv_writer.writerow({
        'epoch': epoch,
        'loss': valid_loss,
        'acc': top1_acc.item(),
        'top5acc': top5_acc.item(),
        'lr': optimizer.state_dict()['param_groups'][0]['lr']
    })
    valid_csv_file.flush()
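# utils.topk_acc as called in train()/valid() has a different contract from
# the standalone topk_acc sketched earlier: it returns *counts* of correctly
# classified samples, one per requested k, and the counts behave like tensors
# (callers invoke .item() on values derived from them). A plausible sketch,
# assumptions included:
def topk_acc(outputs, targets, topk=(1,)):
    maxk = max(topk)
    _, pred = outputs.topk(maxk, dim=1)     # (N, maxk) predicted class indices
    correct = pred.eq(targets.view(-1, 1))  # (N, maxk) hit mask
    # number of samples whose true label falls within the top-k predictions
    return [correct[:, :k].any(dim=1).sum() for k in topk]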
def train(**kwargs):
    '''
    Train the model.
    '''
    opt.parse(kwargs)
    lr1, lr2 = opt.lr1, opt.lr2
    vis.vis.env = opt.env

    # model
    model = getattr(models, opt.model)(opt)
    if opt.load_path:
        model.load(opt.load_path)
    print(model)
    model.cuda()
    optimizer = model.get_optimizer(lr1, lr2)
    criterion = getattr(models, opt.loss)()

    # metrics: running averages
    loss_meter = meter.AverageValueMeter()
    acc_meter = meter.AverageValueMeter()
    top1_meter = meter.AverageValueMeter()
    step = 0
    max_acc = 0
    vis.vis.texts = ''

    # data
    dataset = ClsDataset(opt)
    dataloader = t.utils.data.DataLoader(dataset, opt.batch_size, opt.shuffle,
                                         num_workers=opt.workers, pin_memory=True)

    # training
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        acc_meter.reset()
        top1_meter.reset()
        for ii, data in tqdm.tqdm(enumerate(dataloader, 0)):
            # one optimization step
            optimizer.zero_grad()
            input, label, _ = data
            input = Variable(input.cuda())
            label = Variable(label.cuda())
            output = model(input).squeeze()
            error = criterion(output, label)
            error.backward()
            optimizer.step()

            # running mean of the loss and of the training accuracy
            loss_meter.add(error.data[0])
            acc = topk_acc(output.data, label.data)
            acc_meter.add(acc)
            top1_acc = topk_acc(output.data, label.data, k=1)
            top1_meter.add(top1_acc)

            # visualization
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()
                log_values = dict(loss=loss_meter.value()[0],
                                  train_acc=acc_meter.value()[0],
                                  epoch=epoch,
                                  ii=ii,
                                  train_top1_acc=top1_meter.value()[0])
                vis.plot_many(log_values)

        # after each full pass over the data, score on the validation set
        accuracy, top1_accuracy = val(model, dataset)
        vis.plot('val_acc', accuracy)
        vis.plot('val_top1', top1_accuracy)
        info = time.strftime('[%m%d_%H%M%S]') + 'epoch:{epoch},val_acc:{val_acc},lr:{lr},val_top1:{val_top1},train_acc:{train_acc}<br>'.format(
            epoch=epoch,
            lr=lr1,
            val_acc=accuracy,
            val_top1=top1_accuracy,
            train_acc=acc_meter.value())
        vis.vis.texts += info
        vis.vis.text(vis.vis.texts, win=u'log')

        # adjust the learning rate:
        # if validation accuracy dropped, decay the learning rates and reload the previous best model;
        # otherwise save the model and remember its save path
        if accuracy > max_acc:
            max_acc = accuracy
            best_path = model.save(accuracy)
        else:
            if lr1 == 0:
                lr1 = lr2
            model.load(best_path)
            lr1, lr2 = lr1 * opt.lr_decay, lr2 * opt.lr_decay
            optimizer = model.get_optimizer(lr1, lr2)

        vis.vis.save([opt.env])
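# get_optimizer is defined on the model class elsewhere. The two learning
# rates suggest per-parameter-group optimization (e.g. a small lr1 for a
# pretrained backbone and a larger lr2 for a freshly initialized classifier).
# A sketch under that assumption; the attribute names 'features' and
# 'classifier' are hypothetical:
def get_optimizer(self, lr1, lr2):
    return t.optim.SGD([
        {'params': self.features.parameters(), 'lr': lr1},    # backbone
        {'params': self.classifier.parameters(), 'lr': lr2},  # new head
    ], momentum=0.9)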
checkpoint = torch.load(MODEL_CHECKPOINT_DIR + '/' + 'model.pth')
checkpoint = modify_checkpoint_keys(checkpoint)
model.load_state_dict(checkpoint['model_state_dict'])

# 5. Load the model to CPU or GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# 6. (If GPU,) Turn on 'Data Parallel' mode.
if device == 'cuda':
    model = torch.nn.DataParallel(model)
    cudnn.benchmark = True

# 7. Turn on the test (evaluation) mode.
model.eval()

# 8. Test (evaluate).
y_hats, ys = get_all_y_hats_and_ys(model, test_loader, device)

# 9. Result (1): accuracy.
top_acc_list = utils.topk_acc(y_hats, ys, topk=(1, 5))
total_sample_sizes = utils.get_total_sample_sizes(TEST_DIR)
print('[RESULT] Acc: [top-1]', 100 * top_acc_list[0].item() / total_sample_sizes, end=' ')
print('[top-5]', 100 * top_acc_list[1].item() / total_sample_sizes)

# 10. Result (2): confusion matrix.
conf_matrix = confusion_matrix(ys, y_hats.argmax(dim=1))
plot_confusion_matrix(conf_matrix, CLASSES, file_format='svg')
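# modify_checkpoint_keys is project code not shown here. Checkpoints saved from
# a DataParallel-wrapped model prefix every state-dict key with 'module.', so a
# plausible sketch (the helper's name and exact behavior are assumptions) is to
# strip that prefix before load_state_dict:
from collections import OrderedDict

def modify_checkpoint_keys(checkpoint):
    cleaned = OrderedDict()
    for key, value in checkpoint['model_state_dict'].items():
        # drop a leading 'module.' left over from torch.nn.DataParallel
        cleaned[key[len('module.'):] if key.startswith('module.') else key] = value
    checkpoint['model_state_dict'] = cleaned
    return checkpoint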
def train(**kwargs):
    opt.parse(kwargs)
    lr1, lr2 = opt.lr1, opt.lr2
    lr3 = opt.lr3
    vis.vis.env = opt.env
    max_acc = 0

    # model
    model = getattr(models, opt.model)(opt)
    optimizer = model.get_optimizer(opt.model, lr1, lr2, lr3)
    if opt.load_path:
        # load optimizer + model
        # checkpoint = t.load(opt.load_path, lambda storage, loc: storage)
        checkpoint = t.load(opt.load_path)
        model.load_state_dict(checkpoint['d'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        max_acc = checkpoint['acc']
        print('using checkpoint:{}'.format(opt.load_path))
        print('old config:')
        print(checkpoint['opt'])
    print(model)
    model.cuda()
    criterion = getattr(models, opt.loss)()

    # metrics: running averages
    loss_meter = meter.AverageValueMeter()
    acc_meter = meter.AverageValueMeter()
    top1_meter = meter.AverageValueMeter()
    vis.vis.texts = ''

    # data
    dataset = ClsDataset()
    dataloader = t.utils.data.DataLoader(dataset, opt.batch_size, opt.shuffle,
                                         num_workers=opt.workers, pin_memory=True)

    time_begin = time.time()
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        acc_meter.reset()
        top1_meter.reset()
        for ii, data in tqdm.tqdm(enumerate(dataloader, 0)):
            # one optimization step
            optimizer.zero_grad()
            input, label, _ = data
            input = Variable(input.cuda())
            label = Variable(label.cuda())
            output = model(input).squeeze()
            error = criterion(output, label)
            error.backward()
            optimizer.step()

            # running mean of the loss and of the training accuracy
            loss_meter.add(error.data[0])
            acc = topk_acc(output.data, label.data)
            acc_meter.add(acc)
            top1_acc = topk_acc(output.data, label.data, k=1)
            top1_meter.add(top1_acc)

            # visualization
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()
                log_values = dict(loss=loss_meter.value()[0],
                                  train_acc=acc_meter.value()[0],
                                  epoch=epoch,
                                  ii=ii,
                                  train_top1_acc=top1_meter.value()[0])
                vis.plot_many(log_values)

        # after each full pass over the data, score on the validation set
        accuracy, top1_accuracy = val(model, dataset)
        vis.plot('val_acc', accuracy)
        vis.plot('val_top1', top1_accuracy)
        info = time.strftime('[%m%d_%H%M%S]') + 'epoch:{epoch},train_acc:{train_acc},max_acc:{max_acc},val_acc:{val_acc},lr:{lr}<br>'.format(
            epoch=epoch,
            lr=lr1,
            train_acc=acc_meter.value(),
            val_acc=accuracy,
            max_acc=max_acc
            # val_top1=top1_accuracy
        )
        vis.vis.texts += info

        # adjust the learning rate:
        # if validation accuracy dropped, decay the learning rates and reload the previous best model;
        # otherwise save the model and remember its save path
        if accuracy > max_acc:
            max_acc = accuracy
            best_path = model.save(accuracy)
        else:
            if lr1 == 0:
                lr1 = lr2
            if lr3:
                lr3 = lr1
            lr3 = lr3 * opt.lr_decay
            model.load(best_path)
            lr1, lr2 = lr1 * opt.lr_decay, lr2 * opt.lr_decay
            optimizer = model.get_optimizer(opt.model, lr1, lr2, lr3)
            vis.vis.texts += 'change learning_rate'

        # for param_group in optimizer.param_groups:
        #     lr = init_lr * (0.5 ** (epoch // lr_decay_epoch))
        #     param_group['lr'] = lr
        #     param_group['weight_decay'] = weight_decay

        vis.vis.text(vis.vis.texts, win=u'log')
        vis.vis.save([opt.env])

    time_all = time.time() - time_begin
    print(time_all)
    print('Training complete in {:.0f}hour {:.0f}min'.format(
        time_all // 3600, (time_all % 3600) // 60))
    print('Best val Acc: {:4f}'.format(max_acc))
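# The resume logic above expects a checkpoint with the keys 'd' (model state),
# 'optimizer', 'acc', and 'opt'. model.save() is project code that is not
# shown; a minimal sketch of a save that would round-trip with that loader
# (the function name and the use of vars(opt) for the config snapshot are
# assumptions):
def save_checkpoint(model, optimizer, acc, path):
    t.save({
        'd': model.state_dict(),              # weights, read back via checkpoint['d']
        'optimizer': optimizer.state_dict(),  # optimizer state for exact resumption
        'acc': acc,                           # best validation accuracy so far
        'opt': vars(opt),                     # config snapshot, printed as 'old config'
    }, path)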