def train():
    """Fine-tune a pretrained ResNet-101 for multi-label attribute
    classification with BCE-with-logits loss.

    The backbone is frozen except for a few late layer4 tensors plus a new
    2048->2048 fully-connected head.  Trains for 32 epochs on the loader
    from ``get_dataloader()`` and saves the whole model object (not just
    the state_dict) under a timestamped path.
    """
    model = tv.models.resnet101(pretrained=True)
    # Freeze the entire backbone first ...
    for param in model.parameters():
        param.requires_grad = False
    # ... replace the classifier head (new layers are trainable by default) ...
    model.fc = t.nn.Linear(2048, 2048)
    # ... then selectively unfreeze a handful of layer4 tensors.
    for name, param in model.named_parameters():
        if name == 'layer4.2.conv2.weight':
            param.requires_grad = True
        if name == 'layer4.2.bn2.weight':
            param.requires_grad = True
        if name == 'layer4.2.bn2.bias':
            param.requires_grad = True
    # Sanity check: print everything that will receive gradients.
    for name, param in model.named_parameters():
        if param.requires_grad == True:
            print(name)
    model.cuda()
    criterion = t.nn.BCEWithLogitsLoss()
    # Frozen parameters have requires_grad=False, so Adam skips them even
    # though all parameters are handed over.
    optimizer = t.optim.Adam(model.parameters(), lr=1e-3)
    dataloader = get_dataloader()
    # word_att: index -> attribute-word lookup taken from the dataset.
    word_att = dataloader.dataset.word_att
    loss_meter = meter.AverageValueMeter()
    # NOTE(review): `epoch` is immediately shadowed by the loop variable.
    epoch = 32
    for epoch in range(epoch):
        loss_meter.reset()
        for ii, (imgs, caps, indexes) in tqdm.tqdm(enumerate(dataloader)):
            optimizer.zero_grad()
            imgs = imgs.cuda()
            # caps: multi-hot attribute targets (assumed 0/1 — TODO confirm)
            caps = caps.cuda()
            # Raw logits; BCEWithLogitsLoss applies the sigmoid internally.
            labels = model(imgs)
            loss = criterion(labels, caps)
            loss.backward()
            optimizer.step()
            loss_meter.add(loss.item())
            if (ii + 1) % 50 == 0:
                print('epoch:', epoch, 'loss:', loss_meter.value()[0])
            if (ii + 1) % 1000 == 0:
                # Periodically eyeball sample #6 of the current batch:
                # ground-truth attribute words vs. predicted ones.
                ture_words = []  # sic: "true" words (ground truth)
                print('真实属性词:')
                ture_pic_att = [(ix, item) for ix, item in enumerate(caps[6])]
                for item in ture_pic_att:
                    if item[1] == 1:
                        ture_words.append(word_att[item[0]])
                print(ture_words)
                gen_words = []
                print('预测属性词:')
                m = t.nn.Sigmoid()
                labels_sigmoid = m(labels)
                result_pic_att = [(ix, item)
                                  for ix, item in enumerate(labels_sigmoid[6])]
                for item in result_pic_att:
                    # 0.5 probability threshold per attribute.
                    if item[1] >= 0.5:
                        gen_words.append(word_att[item[0]])
                print(gen_words)
    # Save the full module (architecture + weights) under a timestamped name.
    # NOTE(review): prefix is misspelled but it is a runtime path, kept as-is.
    prefix = 'muti_labei_classification'
    path = '{prefix}_{time}'.format(prefix=prefix,
                                    time=time.strftime('%m%d_%H%M'))
    t.save(model, path)
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.0) misfit = nn.CrossEntropyLoss(weight=weights) softmax = nn.Softmax2d() print(bcolors.BOLD + 'Batches=%d' % (N // batch_size) + bcolors.ENDC) best_val_loss = np.Inf hist_val_loss = [] hist_train_loss = [] train_time = [] val_time = [] for epoch in range(num_epochs): print(bcolors.BOLD + '\n=> Training Epoch #%d' % (epoch + 1) + bcolors.ENDC) running_loss = tnt.AverageValueMeter() running_acc = tnt.AverageValueMeter() start_time = time.time() needs_header = True # Training Loop count = 0 for batch_idx, (images, labels) in enumerate(train_loader): if use_gpu: images = images.cuda() labels = labels.cuda() # Forward Pass optimizer.zero_grad() if is_unet:
num_layers=args.num_layers, dim=args.dim, hidden_dim=args.hidden_dim, num_heads=8, dropout_prob=0.1, max_length=args.seq_length) model.train() optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=1e-4, nesterov=True) annealer = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.num_epochs) loss_meter = meter.AverageValueMeter() time_meter = meter.TimeMeter(unit=False) train_losses = [] for epoch in range(args.num_epochs): for i, (x, y) in enumerate(train_loader): x, y = x.to(args.device), y.to(args.device) loss = model.loss(x, y).mean() optimizer.zero_grad() loss.backward() optimizer.step() loss_meter.add(loss.cpu().data.numpy(), n=1)
def train(train_loader, model, criterion, optimizer, epoch,
          compression_scheduler, loggers, print_freq, log_params_hist):
    """Training loop for one epoch.

    Args:
        train_loader: iterable of (inputs, target) mini-batches.
        model: network to train (switched to train mode here).
        criterion: objective loss function.
        optimizer: wraps the parameters to update.
        epoch: current epoch index (forwarded to the scheduler/logger).
        compression_scheduler: optional distiller scheduler; may add a
            regularization term to the loss around the backward pass.
        loggers: sinks for distiller.log_training_progress.
        print_freq: log statistics every `print_freq` steps.
        log_params_hist: when True, also log parameter histograms.
    """
    losses = {'objective_loss': tnt.AverageValueMeter(),
              'regularizer_loss': tnt.AverageValueMeter()}
    if compression_scheduler is None:
        # Initialize the regularizer loss to zero
        losses['regularizer_loss'].add(0)
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    batch_time = tnt.AverageValueMeter()
    data_time = tnt.AverageValueMeter()

    total_samples = len(train_loader.sampler)
    batch_size = train_loader.batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)
    msglogger.info('Training epoch: %d samples (%d per mini-batch)',
                   total_samples, batch_size)

    # Switch to train mode
    model.train()
    end = time.time()
    for train_step, (inputs, target) in enumerate(train_loader):
        # Measure data loading time
        data_time.add(time.time() - end)
        # BUG FIX: `async` became a reserved keyword in Python 3.7, making
        # `target.cuda(async=True)` a SyntaxError; `non_blocking` is the
        # supported spelling (same semantics).
        target = target.cuda(non_blocking=True)
        input_var = torch.autograd.Variable(inputs)
        target_var = torch.autograd.Variable(target)

        # Execute the forward phase, compute the output and measure loss
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, train_step,
                                                     steps_per_epoch)
        output = model(input_var)
        loss = criterion(output, target_var)

        # Measure accuracy and record loss
        classerr.add(output.data, target)
        # BUG FIX: indexing a 0-dim tensor (`loss.data[0]`) raises on
        # PyTorch >= 0.5; `.item()` is the supported scalar accessor.
        losses['objective_loss'].add(loss.item())

        if compression_scheduler:
            # Before running the backward phase, we add any regularization
            # loss computed by the scheduler
            regularizer_loss = compression_scheduler.before_backward_pass(
                epoch, train_step, steps_per_epoch, loss)
            loss += regularizer_loss
            losses['regularizer_loss'].add(regularizer_loss.item())

        # Compute the gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, train_step,
                                                   steps_per_epoch)

        # measure elapsed time
        batch_time.add(time.time() - end)
        steps_completed = (train_step + 1)
        if steps_completed % print_freq == 0:
            # Log some statistics
            lr = optimizer.param_groups[0]['lr']
            stats = ('Peformance/Training/',
                     OrderedDict([('Loss', losses['objective_loss'].mean),
                                  ('Reg Loss',
                                   losses['regularizer_loss'].mean),
                                  ('Top1', classerr.value(1)),
                                  ('Top5', classerr.value(5)),
                                  ('LR', lr),
                                  ('Time', batch_time.mean)]))
            distiller.log_training_progress(
                stats,
                model.named_parameters() if log_params_hist else None,
                epoch, steps_completed, steps_per_epoch, print_freq, loggers)
        end = time.time()
def train(**kwargs):
    """Train a 2-class DogCat classifier configured through ``opt``.

    Loads the model named by ``opt.model`` (optionally from a checkpoint),
    trains with cross-entropy + Adam, tracks a smoothed loss and confusion
    matrix, validates each epoch, and decays the LR when the mean epoch
    loss stops improving.
    """
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    # Model: built by name, optionally restored, optionally on GPU.
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # Data.
    # BUG FIX: the training set was built from opt.load_model_path (the
    # checkpoint path) instead of the data root.
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True,
                                  num_workers=opt.num_workers)
    # BUG FIX: the second loader overwrote `train_dataloader` and referenced
    # an undefined `test_data`; it must be the validation loader that the
    # per-epoch `val(...)` call below consumes.
    val_dataloader = DataLoader(val_data, opt.batch_size, shuffle=False,
                                num_workers=opt.num_workers)

    # Objective and optimizer.
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    # BUG FIX: Adam must receive an iterable of parameters (and a learning
    # rate), not the module object itself.
    optimizer = t.optim.Adam(model.parameters(), lr=lr)

    # Meters: smoothed loss and a 2-class confusion matrix.
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100

    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, label) in tqdm(enumerate(train_dataloader)):
            input = Variable(data)
            target = Variable(label)
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            # BUG FIX: `optimizer.stop()` does not exist; the parameter
            # update is `optimizer.step()`.
            optimizer.step()

            # Update running statistics and visualization.
            # NOTE(review): `.data[0]` is the pre-0.4 scalar accessor,
            # consistent with the rest of this file's era.
            loss_meter.add(loss.data[0])
            # BUG FIX: dropped the bogus `confusion_matrix.add(loss.data[0])`
            # call — the meter takes (scores, targets), not a scalar loss.
            confusion_matrix.add(score.data, target.data)
            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])
                # BUG FIX: os.path.exist -> os.path.exists.
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        model.save()

        # Validation metrics and their visualization.
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            'epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}'
            .format(epoch=epoch,
                    loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()),
                    lr=lr))

        # Decay the LR when the mean epoch loss stopped improving.
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]
def train(args, train_dataset, test_dataset, model, optimizer, writer, device):
    """Train a molecular-property regressor (MGCN/SchNet style) with MSE.

    Batches per-molecule DGL graphs, logs MSE/MAE to tensorboard when
    ``args.use_tb``, evaluates on the test loader every epoch, halves the
    learning rate every 100 epochs, and returns a history dict of
    per-epoch train/test loss and MAE.
    """
    print("start")
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batchsize,
                              collate_fn=batcher,
                              shuffle=args.shuffle,
                              num_workers=args.workers)
    # Double batch size for evaluation (no gradients retained there).
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=args.batchsize * 2,
                             collate_fn=batcher,
                             shuffle=args.shuffle,
                             num_workers=args.workers)
    print(model)
    print(train_dataset.mean.item(), train_dataset.std.item())
    # Hand the dataset statistics to the model (DataParallel wraps the real
    # module under .module).
    # if model.name in ["MGCN", "SchNet"]:
    if args.multi_gpu:
        model.module.set_mean_std(train_dataset.mean, train_dataset.std)
    else:
        model.set_mean_std(train_dataset.mean, train_dataset.std)
    model.to(device)

    loss_fn = nn.MSELoss()
    MAE_fn = nn.L1Loss()
    mse_meter = meter.AverageValueMeter()
    mae_meter = meter.AverageValueMeter()
    init_lr = args.lr
    info = {'train_loss': [], 'train_mae': [], 'test_loss': [],
            'test_mae': []}
    for epoch in range(args.epochs):
        mse_meter.reset()
        mae_meter.reset()
        model.train()
        for idx, (mols, label) in enumerate(train_loader):
            # Merge the per-molecule graphs into one batched DGL graph.
            g = dgl.batch([mol.ful_g for mol in mols])
            # NOTE(review): on recent DGL versions Graph.to() is NOT
            # in-place (would need `g = g.to(device)`) — confirm the DGL
            # version this targets.
            g.to(device)
            label = label.to(device)
            res = model(g).squeeze()
            loss = loss_fn(res, label)
            mae = MAE_fn(res, label)
            # if loss>1e3:
            #     print('loss more than 1e3')
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            mae_meter.add(mae.detach().item())
            mse_meter.add(loss.detach().item())
            if idx % 50 == 0 and args.use_tb:
                # x-axis: one point per 50 training iterations.
                writer.add_scalar(
                    'training_loss', mse_meter.value()[0],
                    int((idx + 1 + epoch * len(train_loader)) / 50))
                writer.add_scalar(
                    'training_mae', mae_meter.value()[0],
                    int((idx + 1 + epoch * len(train_loader)) / 50))
                print('training loss {} mae {}'.format(
                    mse_meter.value()[0], mae_meter.value()[0]))
        loss_test, mae_test = test(args, test_loader, model, device)
        print(
            "Epoch {:2d}, training: loss: {:.7f}, mae: {:.7f} test: loss{:.7f}, mae:{:.7f}"
            .format(epoch, mse_meter.value()[0], mae_meter.value()[0],
                    loss_test, mae_test))
        # Step decay: halve the learning rate every 100 epochs.
        if (epoch + 1) % 100 == 0:
            init_lr = init_lr / 2
            for param_group in optimizer.param_groups:
                param_group['lr'] = init_lr
            print('current learning rate: {}'.format(init_lr))
        info['train_loss'].append(mse_meter.value()[0])
        info['train_mae'].append(mae_meter.value()[0])
        info['test_loss'].append(loss_test)
        info['test_mae'].append(mae_test)
        if args.use_tb:
            writer.add_scalar('testing_loss', loss_test, epoch)
            writer.add_scalar('testing_mae', mae_test, epoch)
    return info
def train(**kwargs):
    """Train a defect classifier on the XueLang dataset.

    Builds the network named by ``opt.model`` (optionally restored from a
    checkpoint), trains with cross-entropy, plots the loss via visdom,
    decays the LR when the epoch loss stops improving, and keeps the
    checkpoint with the best validation AUC.
    """
    print("开始训练")
    # Build the model object from its name in the config.
    netWork = getattr(models, opt.model)()
    print('当前使用的模型为' + opt.model)
    vis = Visualizer(opt.env + opt.model)
    # Load checkpoints onto the CPU first; move to GPU afterwards if asked.
    map_location = lambda storage, loc: storage
    if opt.load_model_path:
        netWork.load_state_dict(
            t.load(opt.load_model_path, map_location=map_location))
    if opt.use_gpu:
        netWork.cuda()

    # step2: data — same root, split by the train flag
    # (train=False here selects the validation split).
    train_data = XueLangDataSet(opt.data_root, train=True)
    val_data = XueLangDataSet(opt.data_root, train=False)
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.val_batch_size, shuffle=True,
                                num_workers=opt.num_workers)

    # Loss and optimizer.
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    if opt.fixed_weight:
        # Freeze the backbone: only the classification head is optimized.
        # BUG FIX: the original compared strings with `is` (object identity —
        # only true by accident of interning) and used a chain of independent
        # `if` statements whose final `else` bound to the DenseNet test, so
        # every non-DenseNet model silently fell through to a full-parameter
        # optimizer.  Use `==` and a proper if/elif/else chain.
        if opt.model == 'ResNet18_bo' or opt.model == 'ResNet152_bo':
            print(opt.model + '网络只更新最后的全连接层')
            optimizer = t.optim.Adam(netWork.model_bo.fc.parameters(),
                                     lr=opt.lr,
                                     weight_decay=opt.weight_decay)
        elif opt.model == 'VGG16_bo' or opt.model == 'VGG19_bo':
            print(opt.model + '网络只更新分类层')
            optimizer = t.optim.Adam(netWork.classifier.parameters(),
                                     lr=opt.lr,
                                     weight_decay=opt.weight_decay)
        elif opt.model == 'DenseNet_bo':
            print(opt.model + '网络只更新最后的全连接层')
            optimizer = t.optim.Adam(netWork.classifier.parameters(),
                                     lr=opt.lr,
                                     weight_decay=opt.weight_decay)
        else:
            # Unknown model name: fall back to optimizing all parameters so
            # `optimizer` is always defined (matches the old accidental
            # behavior for unlisted models).
            print(opt.model + '网络更新全部参数')
            optimizer = t.optim.Adam(netWork.parameters(), lr=opt.lr,
                                     weight_decay=opt.weight_decay)
    else:
        # Update every parameter.
        print(opt.model + '网络更新全部参数')
        optimizer = t.optim.Adam(netWork.parameters(), lr=opt.lr,
                                 weight_decay=opt.weight_decay)

    # Smoothed per-epoch loss meter.
    loss_meter = meter.AverageValueMeter()
    previous_loss = 1e100
    best_val_auc = 0

    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        for ii, (data_origin, label) in enumerate(train_dataloader):
            input_img = Variable(data_origin)
            label_img = Variable(label)
            if opt.use_gpu:
                input_img = input_img.cuda()
                label_img = label_img.cuda()
            optimizer.zero_grad()
            label_output = netWork(input_img)
            loss = criterion(label_output, label_img)
            loss.backward()
            optimizer.step()
            # NOTE(review): `.data[0]` is the pre-0.4 scalar accessor, kept
            # for consistency with `volatile=True` below (same torch era).
            loss_meter.add(loss.data[0])
            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('训练集loss', loss_meter.value()[0])

        # Checkpoint once per epoch.
        t.save(netWork, opt.checkpoint_root + opt.model + '.pth')
        print("第" + str(epoch) +
              "次epoch完成==============================================")
        vis.log("epoch:{epoch},lr:{lr},loss:{loss}".format(
            epoch=epoch, loss=loss_meter.value()[0], lr=lr))
        # Decay the LR in place (keeps optimizer momentum state) whenever
        # the mean epoch loss went up.
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]

        # Validation: whole-split AUC; keep the best-AUC checkpoint.
        netWork.eval()
        predict_label = []
        real_label = []
        for ii, (val_data_origin, val_label) in enumerate(val_dataloader):
            val_input_img = Variable(val_data_origin, volatile=True)
            val_label_img = val_label
            if opt.use_gpu:
                val_input_img = val_input_img.cuda()
            val_label_output = netWork(val_input_img)
            # Positive-class probability per sample.
            val_predict_score = t.nn.functional.softmax(
                val_label_output, dim=1)[:, 1].cpu().data.numpy().flatten()
            val_label_img = val_label_img.numpy().flatten()
            for i in range(len(val_label_img)):
                predict_label.append(val_predict_score[i])
                real_label.append(val_label_img[i])
        validation_auc_sklearn = roc_auc_score(real_label, predict_label)
        vis.plot('验证集的auc', validation_auc_sklearn)
        netWork.train()
        if best_val_auc < validation_auc_sklearn:
            best_val_auc = validation_auc_sklearn
            print('当前得到最好的验证集的AUC为 %.5f' % best_val_auc)
            # NOTE(review): nn.Module has no two-argument save(); presumably
            # a custom BasicModule.save — confirm its signature.
            netWork.save(
                netWork, opt.checkpoint_root + 'auc' +
                str(validation_auc_sklearn) + '.pth')
    print("============训练完毕=============")
def train(model):
    """Train the 4-character captcha recognizer.

    Each sample has four independent character targets; the total loss is
    the sum of four cross-entropies (one per output head).  Periodically
    prints/logs the running loss and evaluates on the test loader, keeping
    the best-accuracy checkpoint in ``./weights/captcha``.
    """
    avgLoss = 0.0
    best_acc = 0.0
    save_path = './weights/captcha'
    os.makedirs(save_path, exist_ok=True)
    if t.cuda.is_available():
        model = model.cuda()
    # data loading
    trainDataset = Captcha("../captcha/train/", train=True)
    testDataset = Captcha("../captcha/test/", train=False)
    trainDataLoader = DataLoader(trainDataset, batch_size=batchSize,
                                 shuffle=True, num_workers=4)
    testDataLoader = DataLoader(testDataset, batch_size=batchSize,
                                shuffle=True, num_workers=4)
    # BUG FIX: len(DataLoader) already counts mini-batches, so the original
    # `len(trainDataLoader) // batchSize` divided by the batch size twice
    # and skewed the tensorboard global-step axis.
    circles_per_epoch = len(trainDataLoader)
    # max_iters = circles_per_epoch * circles_per_epoch
    # loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learningRate)
    loss_meter = meter.AverageValueMeter()
    # training procedure
    for epoch in range(totalEpoch):
        for circle, input in tqdm.tqdm(enumerate(trainDataLoader, 0)):
            x, label = input
            if t.cuda.is_available():
                x = x.cuda()
                label = label.cuda()
            label = label.long()
            # One target column per captcha character position.
            label1, label2, label3, label4 = \
                label[:, 0], label[:, 1], label[:, 2], label[:, 3]
            optimizer.zero_grad()
            y1, y2, y3, y4 = model(x)
            loss1, loss2, loss3, loss4 = criterion(y1, label1), criterion(y2, label2) \
                , criterion(y3, label3), criterion(y4, label4)
            # Total loss: sum over the four character heads.
            loss = loss1 + loss2 + loss3 + loss4
            loss_meter.add(loss.item())
            writer.add_scalar('train/loss', loss.item(),
                              circle + epoch * circles_per_epoch)
            avgLoss += loss.item()
            loss.backward()
            optimizer.step()
            # evaluation
            if circle % printCircle == 1:
                print("Epoch %d : after %d circle,the train loss is %.5f" %
                      (epoch, circle, avgLoss / printCircle))
                writeFile("Epoch %d : after %d circle,the train loss is %.5f" %
                          (epoch, circle, avgLoss / printCircle))
                avgLoss = 0
            if circle % testCircle == 1:
                accuracy = test(model, testDataLoader)
                if accuracy > best_acc:
                    best_acc = accuracy
                    model.save(save_path)
                print('current acc is : {}, the best acc is : {}'.format(
                    accuracy, best_acc))
                writeFile("current acc is : %.5f, the best acc is : %.5f" %
                          (accuracy, best_acc))
                writer.add_scalar('test/acc', accuracy,
                                  circle + epoch * circles_per_epoch)
    # if circle % saveCircle == 1:
    #     model.save(str(epoch)+"_"+str(saveCircle))
    writer.close()
def main():
    """Train/evaluate an (optionally scattering-preprocessed) model with
    torchnet's Engine, resuming from .pt7 checkpoints when requested.

    NOTE(review): this function mixes Python-2 (`dict.iteritems`,
    torch.save into text-mode files) and Python-3 idioms; it will not run
    unmodified on Python 3 — confirm the target interpreter.
    """
    if not os.path.exists(opt.save):
        os.mkdir(opt.save)
    # Model factory by name; scattering variants need input size N, scale J.
    if opt.scat > 0:
        model, params, stats = models.__dict__[opt.model](N=opt.N,
                                                          J=opt.scat)
    else:
        model, params, stats = models.__dict__[opt.model]()

    def create_optimizer(opt, lr):
        # SGD over an explicit parameter dict (functional-style model).
        print('creating optimizer with lr = %f' % lr)
        return torch.optim.SGD(params.values(), lr, opt.momentum,
                               weight_decay=opt.weightDecay)

    def get_iterator(mode):
        # mode=True -> training set (shuffled); False -> test set.
        ds = create_dataset(opt, mode)
        return ds.parallel(batch_size=opt.batchSize, shuffle=mode,
                           num_workers=opt.nthread, pin_memory=False)

    optimizer = create_optimizer(opt, opt.lr)
    iter_test = get_iterator(False)
    iter_train = get_iterator(True)
    if opt.scat > 0:
        scat = Scattering(M=opt.N, N=opt.N, J=opt.scat,
                          pre_pad=False).cuda()
    epoch = 0
    if opt.resume != '':
        # opt.resume is either a .pt7 file or a directory with latest.pt7.
        resumeFile = opt.resume
        if not resumeFile.endswith('pt7'):
            resumeFile = torch.load(opt.resume +
                                    '/latest.pt7')['latest_file']
        state_dict = torch.load(resumeFile)
        epoch = state_dict['epoch']
        params_tensors, stats = state_dict['params'], state_dict['stats']
        # NOTE(review): iteritems() is Python-2 only (items() elsewhere).
        for k, v in params.iteritems():
            v.data.copy_(params_tensors[k])
        optimizer.load_state_dict(state_dict['optimizer'])
        print('model was restored from epoch:', epoch)

    print('\nParameters:')
    print(
        pd.DataFrame([(key, v.size(), torch.typename(v.data))
                      for key, v in params.items()]))
    print('\nAdditional buffers:')
    print(
        pd.DataFrame([(key, v.size(), torch.typename(v))
                      for key, v in stats.items()]))
    n_parameters = sum(
        [p.numel() for p in list(params.values()) + list(stats.values())])
    print('\nTotal number of parameters: %f' % n_parameters)

    meter_loss = meter.AverageValueMeter()
    classacc = meter.ClassErrorMeter(topk=[1, 5], accuracy=False)
    timer_data = meter.TimeMeter('s')
    timer_sample = meter.TimeMeter('s')
    timer_train = meter.TimeMeter('s')
    timer_test = meter.TimeMeter('s')

    def h(sample):
        # Engine forward closure: returns (loss, output).
        inputs = sample[0].cuda()
        if opt.scat > 0:
            inputs = scat(inputs)
        inputs = Variable(inputs)
        targets = Variable(sample[1].cuda().long())
        # sample[2] is the train/test flag appended by on_sample().
        if sample[2]:
            model.train()
        else:
            model.eval()
        y = torch.nn.parallel.data_parallel(model, inputs,
                                            np.arange(opt.ngpu).tolist())
        return F.cross_entropy(y, targets), y

    def log(t, state):
        if (t['epoch'] > 0 and t['epoch'] % opt.frequency_save == 0):
            # NOTE(review): torch.save into files opened in text mode 'w'
            # and iteritems() are Python-2 remnants.
            torch.save(
                dict(params={k: v.data.cpu()
                             for k, v in params.iteritems()},
                     stats=stats,
                     optimizer=state['optimizer'].state_dict(),
                     epoch=t['epoch']),
                open(
                    os.path.join(opt.save,
                                 'epoch_%i_model.pt7' % t['epoch']), 'w'))
            torch.save(
                dict(latest_file=os.path.join(
                    opt.save, 'epoch_%i_model.pt7' % t['epoch'])),
                open(os.path.join(opt.save, 'latest.pt7'), 'w'))
        # Append one JSON line per epoch to log.txt.
        z = vars(opt).copy()
        z.update(t)
        logname = os.path.join(opt.save, 'log.txt')
        with open(logname, 'a') as f:
            f.write('json_stats: ' + json.dumps(z) + '\n')
        print(z)

    def on_sample(state):
        global data_time
        data_time = timer_data.value()
        timer_sample.reset()
        # Append the train/test flag so h() can switch the model mode.
        state['sample'].append(state['train'])

    def on_forward(state):
        # Track per-batch top-1/top-5 deltas for progress printing.
        prev_sum5 = classacc.sum[5]
        prev_sum1 = classacc.sum[1]
        classacc.add(state['output'].data,
                     torch.LongTensor(state['sample'][1]))
        # NOTE(review): `.data[0]` is the pre-0.4 scalar accessor.
        meter_loss.add(state['loss'].data[0])
        next_sum5 = classacc.sum[5]
        next_sum1 = classacc.sum[1]
        n = state['output'].data.size(0)
        curr_top5 = 100.0 * (next_sum5 - prev_sum5) / n
        curr_top1 = 100.0 * (next_sum1 - prev_sum1) / n
        sample_time = timer_sample.value()
        timer_data.reset()
        if (state['train']):
            txt = 'Train:'
        else:
            txt = 'Test'
        if (state['t'] % opt.frequency_print == 0 and state['t'] > 0):
            print(
                '%s [%i,%i/%i] ; loss: %.3f (%.3f) ; acc5: %.2f (%.2f) ; acc1: %.2f (%.2f) ; data %.3f ; time %.3f'
                % (txt, state['epoch'], state['t'] % len(state['iterator']),
                   len(state['iterator']), state['loss'].data[0],
                   meter_loss.value()[0], curr_top5, classacc.value(5),
                   curr_top1, classacc.value(1), data_time, sample_time))

    def on_start(state):
        # Resume support: start counting from the restored epoch.
        state['epoch'] = epoch

    def on_start_epoch(state):
        classacc.reset()
        meter_loss.reset()
        timer_train.reset()
        state['iterator'] = iter_train
        # Manual LR schedule: rebuild the optimizer at the listed epochs.
        epoch = state['epoch'] + 1
        if epoch in epoch_step:
            print('changing LR')
            lr = state['optimizer'].param_groups[0]['lr']
            state['optimizer'] = create_optimizer(opt,
                                                  lr * opt.lr_decay_ratio)

    def on_end_epoch(state):
        if (state['t'] % opt.frequency_test == 0 and state['t'] > 0):
            # Snapshot train metrics, then reuse the meters for the test
            # pass run by engine.test().
            train_loss = meter_loss.value()
            train_acc = classacc.value()
            train_time = timer_train.value()
            meter_loss.reset()
            classacc.reset()
            timer_test.reset()
            engine.test(h, iter_test)
            log(
                {
                    "train_loss": train_loss[0],
                    "train_acc": 100 - train_acc[0],
                    "test_loss": meter_loss.value()[0],
                    "test_acc": 100 - classacc.value()[0],
                    "epoch": state['epoch'],
                    "n_parameters": n_parameters,
                    "train_time": train_time,
                    "test_time": timer_test.value(),
                }, state)

    engine = Engine()
    engine.hooks['on_sample'] = on_sample
    engine.hooks['on_forward'] = on_forward
    engine.hooks['on_start_epoch'] = on_start_epoch
    engine.hooks['on_end_epoch'] = on_end_epoch
    engine.hooks['on_start'] = on_start
    engine.train(h, iter_train, opt.epochs, optimizer)
import warnings

warnings.filterwarnings("ignore")

from metric import metric_results, printMetricResults
from tensorboardX import SummaryWriter
from torchnet import meter

# Pick the GPU when available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'


class Config(object):
    # Directory where tensorboardX event files for this solution are written.
    tensorboardX_path = './tensorboardXDir/solution1'


config = Config()

# Running mean/std of the train and validation losses.
loss_train_meter = meter.AverageValueMeter()
loss_valid_meter = meter.AverageValueMeter()


def get_logger(filename='logtest'):
    """Build a logger that echoes messages to stderr and to ``<filename>.log``.

    Args:
        filename: basename (without extension) of the log file.

    Returns:
        logging.Logger: the configured module logger.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # BUG FIX: repeated calls used to stack duplicate handlers (every message
    # then printed/written once per call); attach handlers only once.
    if not logger.handlers:
        handler1 = StreamHandler()
        handler1.setFormatter(Formatter("%(message)s"))
        # BUG FIX: the log path was hard-coded, silently ignoring the
        # `filename` argument; honor the parameter as the signature promises.
        handler2 = FileHandler(filename=f"{filename}.log")
        handler2.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler1)
        logger.addHandler(handler2)
    return logger
print('Called with args:') print(args) if args.use_tfboard: from model.utils.logger import Logger # Set the logger logger = Logger('./logs') if args.use_visdom: # visdom from visual_loss import Visualizer from torchnet import meter featurename= 'fpn' visname= featurename+args.net+"_"+args.dataset+"_"+str(args.session) vis = Visualizer(env=visname) loss_meter = meter.AverageValueMeter() loss_rpn_cls_meter = meter.AverageValueMeter() loss_rpn_box_meter = meter.AverageValueMeter() loss_rcnn_cls_meter = meter.AverageValueMeter() loss_rcnn_box_meter = meter.AverageValueMeter() logging.basicConfig(filename="logs/"+args.net+"_"+args.dataset+"_"+str(args.session)+".log", filemode='w', level=logging.DEBUG) logging.info(str(datetime.now())) if args.dataset == "pascal_voc": args.imdb_name = "voc_2007_trainval" args.imdbval_name = "voc_2007_test" args.set_cfgs = ['FPN_ANCHOR_SCALES', '[32, 64, 128, 256, 512]', 'FPN_FEAT_STRIDES', '[4, 8, 16, 32, 64]', 'MAX_NUM_GT_BOXES', '20'] elif args.dataset == "pascal_voc_0712": args.imdb_name = "voc_2007_trainval+voc_2012_trainval"
def train(**kwargs):
    """Fine-tune a pretrained ResNet-34 as a 2-class (dog/cat) classifier.

    CLI kwargs override the global config ``opt``.  Trains with SGD and
    cross-entropy, tracks a confusion matrix, checkpoints every epoch, and
    decays the learning rate whenever the mean epoch loss increases.
    """
    # Update the configuration from command-line arguments.
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    # (1) Load the network (pretrained backbone + new 2-class head).
    # model = getattr(models, opt.model)()
    model = models.resnet34(pretrained=True)
    model.fc = nn.Linear(512, 2)
    # if opt.load_model_path:
    #     model.load(opt.load_model_path)
    if opt.use_gpu:  # GPU
        model.cuda()

    # (2) Data: same root, split by the train flag.
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data,
                                  opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data,
                                opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers)

    # (3) Loss function and optimizer.
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.SGD(model.parameters(),
                            lr=opt.lr,
                            weight_decay=opt.weight_decay)

    # (4) Meters: smoothed loss and a 2-class confusion matrix.
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e10

    # (5) Training loop.
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, label) in enumerate(train_dataloader):
            # BUG FIX: `print "ii:", ii` (and the epoch summary below) were
            # Python-2 print statements — a SyntaxError on Python 3, which
            # the rest of this function (print(), loss.item(), detach())
            # targets.  print() emits the same space-separated output.
            print("ii:", ii)
            input = Variable(data)
            target = Variable(label)
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            # Zero gradients, forward, backward, update.
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()
            # Update running statistics and visualization.
            loss_meter.add(loss.item())
            confusion_matrix.add(score.detach(), target.detach())
            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        # Checkpoint the weights once per epoch.
        # model.save()
        name = time.strftime('model' + '%m%d_%H:%M:%S.pth')
        t.save(model.state_dict(), 'checkpoints/' + name)

        # Validation metrics and their visualization.
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch,
                    loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()),
                    lr=lr))
        print("epoch:", epoch, "loss:", loss_meter.value()[0],
              "accuracy:", val_accuracy)

        # Decay the learning rate if the loss stopped decreasing.
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr
        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train the word-vector language model and evaluate after each epoch.

    CLI kwargs override fields on the global ``opt``.  The corpus from
    ``opt.parsed_data_path`` is shuffled and split 80/20 into train/test;
    pretrained vectors are copied into the fixed embedding table before
    training.  A checkpoint is written after every epoch.
    """
    # Apply command-line overrides onto the global config.
    for k, v in kwargs.items():
        setattr(opt, k, v.strip("'"))
    # Select the device.
    opt.device = t.device('cuda') if opt.use_gpu else t.device('cpu')
    device = opt.device
    vis = Visualizer(env=opt.env)

    # Load the index sequences plus both vocabulary mappings.
    data, word2ix_train, ix2word_train, word2ix_fix, ix2word_fix = load_data(
        opt.parsed_data_path)
    random.shuffle(data)
    # 80/20 train/test split, then wrap each side in a dataloader.
    # NOTE(review): `devision + 1` drops one sample at the split boundary.
    devision = int(len(data) * 8 / 10)
    train_data = data[:devision]
    test_data = data[devision + 1:]
    train_data = t.from_numpy(train_data)
    test_data = t.from_numpy(test_data)
    dataloader = t.utils.data.DataLoader(train_data,
                                         batch_size=opt.batch_size,
                                         shuffle=True,
                                         num_workers=1)
    dataloader_fortest = t.utils.data.DataLoader(test_data,
                                                 batch_size=opt.batch_size,
                                                 shuffle=True,
                                                 num_workers=1)

    # Model; only parameters with requires_grad=True are optimized (the
    # fixed embedding stays frozen).
    model = TrainingModel_Vec(len(word2ix_train), len(word2ix_fix), 200, 400)
    optimizer = t.optim.Adam(filter(lambda p: p.requires_grad,
                                    model.parameters()),
                             lr=opt.lr)
    criterion = nn.CrossEntropyLoss()
    loss_meter = meter.AverageValueMeter()

    # Load the pretrained word vectors (index order) into the fixed
    # embedding table.
    pretrained_weight = form_matrix(ix2word_fix, opt.pathforvec)
    pretrained_weight = np.array(pretrained_weight)
    model.embeddingsfix.weight.data.copy_(t.from_numpy(pretrained_weight))
    i = 0  # NOTE(review): never used afterwards
    for epoch in range(opt.epoch):
        loss_meter.reset()
        for ii, data_ in tqdm.tqdm(enumerate(dataloader)):
            # (seq, batch) layout; next-token prediction shifted by one row.
            data_ = data_.long().transpose(1, 0).contiguous()
            data_ = data_.to(device)
            optimizer.zero_grad()
            input_, target = data_[:-1, :], data_[1:, :]
            output, _ = model(input_)
            loss = criterion(output, target.view(-1))
            loss.backward()
            optimizer.step()
            loss_meter.add(loss.item())
            # Plot the (smoothed and instantaneous) loss.
            if (1 + ii) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()
                vis.plot('lossintrain', loss_meter.value()[0])
                # NOTE(review): plots twice to the same visdom window —
                # the second call overwrites the smoothed value above.
                vis.plot('lossintrain', loss.item())

        # ---- evaluation on the held-out split ----
        loss_meter.reset()
        model.eval()  # switch to eval mode
        test_loss = 0  # accumulated test loss
        correct = 0  # count of correct next-token predictions
        total = 0
        for iii, datatest in enumerate(dataloader_fortest):
            datatest = datatest.long().transpose(1, 0).contiguous()
            datatest = datatest.to(device)
            optimizer.zero_grad()
            # Input drops the last row; target drops the first row.
            input_test, target_test = datatest[:-1, :], datatest[1:, :]
            output_test, _ = model(input_test)
            test_loss += criterion(output_test, target_test.view(-1))
            # Index of the max log-probability per position.
            pred = output_test.data.max(1, keepdim=True)[1]
            # Score only the third quarter of the flattened positions
            # (presumably mid-sequence tokens — TODO confirm intent).
            target_test = target_test.data.view_as(
                pred)[int(pred.size()[0] / 4 * 2):int(pred.size()[0] / 4 *
                                                      3)]
            pred = pred[int(pred.size()[0] / 4 * 2):int(pred.size()[0] / 4 *
                                                        3)]
            correct += pred.eq(target_test).cpu().sum()
            total += target_test.size()[0]
        # NOTE(review): divides by the last batch index, not the batch
        # count (off by one; ZeroDivisionError with exactly one batch).
        test_loss /= iii
        print(epoch)
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.
              format(test_loss, correct, total, 100. * correct / total))
        model.train()
        t.save(model.state_dict(), '%s_%s.pth' % ("testtestingfix", epoch))
def train():
    """Train the Kesci sequence classifier with class-weighted cross-entropy.

    Builds train/val/test loaders from fixed JSON paths, plots loss/F1 to
    visdom, decays the LR whenever the mean epoch loss increases, and every
    10 epochs checkpoints the model and logs test-set metrics.
    """
    vis = Visualizer("Kesci")
    train_data = AppData("data/train_23d_1p_ap.json", iflabel=True)
    val_data = AppData("data/val_23d_1p_ap.json", iflabel=True)
    train_dataloader = DataLoader(train_data, 256, shuffle=True,
                                  num_workers=4)
    val_dataloader = DataLoader(val_data, 256, shuffle=False, num_workers=2)
    test_data = AppData("data/test_23d_1p_ap.json", iflabel=True)
    test_dataloader = DataLoader(test_data, 256, shuffle=False,
                                 num_workers=2)
    # Positive class weighted 1.2x, presumably for class imbalance — TODO
    # confirm against the dataset statistics.
    criterion = t.nn.CrossEntropyLoss(weight=t.Tensor([1, 1.2])).cuda()
    learning_rate = 0.0005
    weight_decay = 0.0002
    model = Sequence(15, 128, 1).cuda()
    optimizer = t.optim.Adam(model.parameters(),
                             lr=learning_rate,
                             weight_decay=weight_decay)
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100
    for epoch in range(500):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, property, label) in tqdm(enumerate(train_dataloader)):
            input = Variable(data).cuda()
            # Auxiliary per-sample features fed as a second input.
            input2 = Variable(property).cuda()
            target = Variable(label).cuda().view(-1)
            output = model(input, input2)
            optimizer.zero_grad()
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            # NOTE(review): `.data[0]` is the pre-0.4 scalar accessor.
            loss_meter.add(loss.data[0])
            confusion_matrix.add(output.data, target.data)
            if ii % 100 == 99:
                vis.plot('loss', loss_meter.value()[0])
        if epoch % 3 == 2:
            # Periodic F1 on the train and validation splits.
            train_cm, train_f1 = val(model, train_dataloader)
            vis.plot('train_f1', train_f1)
            val_cm, val_f1 = val(model, val_dataloader)
            vis.plot_many({'val_f1': val_f1,
                           'learning_rate': learning_rate})
        # Decay the LR when the mean epoch loss stopped improving;
        # mutating param_groups in place keeps the optimizer's moment state.
        if loss_meter.value()[0] > previous_loss:
            learning_rate = learning_rate * 0.95
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate
        previous_loss = loss_meter.value()[0]
        if epoch % 10 == 9:
            model.save()
            # NOTE(review): train_f1/val_f1 logged here come from the most
            # recent epoch%3 evaluation, not necessarily this epoch.
            test_cm, test_f1 = val(model, test_dataloader)
            vis.plot('test_f1', test_f1)
            vis.log(
                "{train_f1}, {val_f1}, {test_f1}, model:{model}, {train_cm}, {val_cm}, {test_cm}"
                .format(train_f1=train_f1,
                        val_f1=val_f1,
                        test_f1=test_f1,
                        model=time.strftime('%m%d %H:%M:%S'),
                        train_cm=str(train_cm.value()),
                        val_cm=str(val_cm.value()),
                        test_cm=str(test_cm.value())))
def train(args): """ Implements the training loop for the MultiTaskResnet3dClassifier. Args: args (Namespace) : Program arguments """ # Get model and loss function model = MTClassifier3D(args).to(args.device) # Initialize losses for each head loss_wrapper = MultiTaskLoss(args) loss_fn = nn.BCEWithLogitsLoss() # TODO: Get train and validation dataloaders train_dataset = ClassifierDataset(args.csv_dir, 'train', args.features, resample=( args.num_slices, args.slice_size, args.slice_size)) train_loader = DataLoader( train_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True, pin_memory=True ) peds_validation_dataset = ClassifierDataset(args.peds_csv_dir, 'val', args.peds_features, resample=( args.num_slices, args.slice_size, args.slice_size)) peds_validation_loader = DataLoader( peds_validation_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False, pin_memory=True ) adult_validation_dataset = ClassifierDataset(args.adult_csv_dir, 'val', args.adult_features, resample=( args.num_slices, args.slice_size, args.slice_size)) adult_validation_loader = DataLoader( adult_validation_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False, pin_memory=True ) # Get optimizer and scheduler optimizer = optim.Adam(model.parameters(), args.lr) warmup_iters = args.lr_warmup_epochs * len(train_loader) lr_milestones = [len(train_loader) * m for m in args.lr_milestones] lr_scheduler = WarmupMultiStepLR( optimizer, milestones=lr_milestones, gamma=args.lr_gamma, warmup_iters=warmup_iters, warmup_factor=1e-5) # Get saver, logger, and evaluator saver = ModelSaver(args, max_ckpts=args.max_ckpts, metric_name=args.best_ckpt_metric, maximize_metric=args.maximize_metric) # evaluator = ModelEvaluator(args, validation_loader, cls_loss_fn) # Load model from checkpoint is applicable if args.continue_train: saver.load_model(model, args.name, ckpt_path=args.load_path, optimizer=optimizer, scheduler=lr_scheduler) logger = 
TrainLogger(args, len(train_loader.dataset)) # Multi GPU training if applicable if len(args.gpu_ids) > 1: print("Using", len(args.gpu_ids), "GPUs.") model = nn.DataParallel(model) loss_meter = meter.AverageValueMeter() # Train model logger.log_hparams(args) while not logger.is_finished_training(): logger.start_epoch() for inputs, targets in tqdm(train_loader): logger.start_iter() with torch.set_grad_enabled(True): inputs = inputs.to(args.device) targets = targets.to(args.device) head_preds = model(inputs) loss = loss_wrapper(head_preds, targets) loss_meter.add(loss.item()) optimizer.zero_grad() loss.backward() optimizer.step() # Log all train losses if logger.iter % args.steps_per_print == 0 and logger.iter != 0: logger.log_metrics({'train_loss': loss_meter.value()[0]}) loss_meter.reset() logger.end_iter() # Evaluate model and save model ckpt if logger.epoch % args.epochs_per_eval == 0: peds_metrics = evaluate(args, model, loss_wrapper, peds_validation_loader, "validation", args.device, 'peds') logger.log_metrics(peds_metrics) adult_metrics = evaluate(args, model, loss_wrapper, adult_validation_loader, "validation", args.device, 'adult') logger.log_metrics(adult_metrics) if logger.epoch % args.epochs_per_save == 0: saver.save(logger.epoch, model, optimizer, lr_scheduler, args.device, args.name) lr_scheduler.step() logger.end_epoch()
def train(**kwargs):
    """Train an anti-spoofing classifier with FocalLoss + SGD.

    Parses CLI overrides into ``opt``, builds model/data/optimizer,
    trains for ``opt.max_epoch`` epochs under a StepLR schedule,
    validates every epoch, checkpoints every epoch, and tracks the epoch
    with the best TPR@1%.  Per-epoch metrics are echoed to stdout and
    appended to a log file.

    Fixes:
      * ``best_tpr_epoch`` is initialised up front — previously the final
        summary print raised NameError when no epoch ever improved the
        TPR (or when ``opt.max_epoch == 0``).
      * the metrics file is closed via ``with`` instead of being leaked
        by ``print(..., file=open(...))``.
      * checkpoint directory creation uses ``os.makedirs(exist_ok=True)``
        instead of shelling out to ``mkdir -p`` (portable — the log path
        used here is a Windows path).
    """
    # Update configuration from command-line arguments.
    opt.parse(kwargs)

    # step1: model
    model = getattr(mymodels, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()
    summary(model, (3, 224, 224))
    print(opt)

    # step2: data
    train_data = myData(filelists=opt.train_filelists,
                        scale=opt.cropscale,
                        transform=None,
                        test=False,
                        data_source='none')
    val_data = myData(filelists=opt.test_filelists,
                      transform=None,
                      scale=opt.cropscale,
                      test=False,
                      data_source='none')
    train_loader = DataLoader(dataset=train_data,
                              batch_size=opt.batch_size,
                              shuffle=True)
    print(train_loader)
    val_loader = DataLoader(dataset=val_data,
                            batch_size=opt.batch_size,
                            shuffle=False)
    dataset_sizes = {'train': len(train_data), 'val': len(val_data)}

    # step3: loss and optimizer
    criterion = FocalLoss(2)
    lr = opt.lr
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=opt.lr,
                                momentum=0.5,
                                weight_decay=opt.weight_decay)
    # Halve the learning rate every `opt.lr_stepsize` epochs.
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                           step_size=opt.lr_stepsize,
                                           gamma=0.5)

    # step4: meters for smoothed losses and accuracies
    confusion_matrix = meter.ConfusionMeter(2)
    train_loss = meter.AverageValueMeter()
    val_loss = meter.AverageValueMeter()
    train_acc = meter.AverageValueMeter()
    val_acc = meter.AverageValueMeter()
    previous_loss = 1e100
    best_tpr = 0.0
    best_tpr_epoch = 0  # FIX: defined even if the TPR never improves

    for epoch in range(opt.max_epoch):
        print('Epoch {}/{}'.format(epoch, opt.max_epoch - 1))
        print('-' * 10)
        train_loss.reset()
        train_acc.reset()
        running_loss = 0.0
        running_corrects = 0
        exp_lr_scheduler.step()

        for step, batch in enumerate(tqdm(train_loader,
                                          desc='Train %s On Anti-spoofing' % (opt.model),
                                          unit='batch')):
            inputs, labels = batch
            if opt.use_gpu:
                inputs = Variable(inputs.cuda())
                labels = Variable(labels.cuda())
            else:
                inputs = Variable(inputs)
                labels = Variable(labels)

            optimizer.zero_grad()  # zero the parameter gradients
            with torch.set_grad_enabled(True):
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            # Accumulate sample-weighted loss / correct counts for the epoch.
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)

        epoch_loss = running_loss / dataset_sizes['train']
        epoch_acc = running_corrects.double() / float(dataset_sizes['train'])
        print('Train Loss: {:.8f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))
        train_loss.add(epoch_loss)
        train_acc.add(epoch_acc)

        # Validate once per epoch.
        val_loss.reset()
        val_acc.reset()
        val_cm, v_loss, v_accuracy, metric = val(model, val_loader, dataset_sizes['val'])
        print('Val Loss: {:.8f} Acc: {:.4f}'.format(v_loss, v_accuracy))
        val_loss.add(v_loss)
        val_acc.add(v_accuracy)

        # metric = (EER, TPR-at-FPR dict, AUC, roc curve points)
        eer = metric[0]
        tprs = metric[1]
        auc = metric[2]
        tpr1 = tprs['TPR(1.%)']
        previous_loss = val_loss.value()[0]

        if tpr1 > best_tpr:
            best_tpr = tpr1
            best_tpr_epoch = epoch

        # Checkpoint every epoch.
        os.makedirs(os.path.join('checkpoints', opt.model), exist_ok=True)
        model.save(name='checkpoints/' + opt.model + '/' + str(epoch) + '.pth')

        line = 'Epoch: {:d} Val Loss: {:.8f} Acc: {:.4f} EER: {:.6f} TPR(1.0%): {:.6f} TPR(.5%): {:.6f} AUC: {:.8f}'.format(
            epoch, v_loss, v_accuracy, eer, tprs["TPR(1.%)"], tprs["TPR(.5%)"], auc)
        # FIX: use a context manager so the log file handle is closed.
        with open('D:\\dingding\\xiazai\\test\\val.txt', 'a') as log_file:
            print(line, file=log_file)
        print(line)

    print('Best val Epoch: {},Best val TPR: {:4f}'.format(best_tpr_epoch, best_tpr))
# torchnet meters accumulate loss values; install via: conda install torchnet
#
# Typical wiring before training:
#
#     vis = Visualizer(env='my_wind')          # visualization helper
#     loss_meter = meter.AverageValueMeter()   # running-loss accumulator
#     for epoch in range(10):
#         loss_meter.reset()                   # start each epoch fresh
#         model.train()
#         for ii, (data, label) in enumerate(trainloader):
#             ...
#             out = model(input)
#             loss = ...
#             loss_meter.add(loss.data[0])     # record the batch loss
#         # loss_meter.value()[0] is the mean of everything added so far
#         vis.plot_many_stack({'train_loss': loss_meter.value()[0]})
#
# Demo: plot a synthetic "loss" curve.
vis = Visualizer(env='my_wind')
loss_meter = meter.AverageValueMeter()
for epoch in range(103):
    time.sleep(.1)
    # loss_meter.reset()  # deliberately skipped: the plot shows a running mean
    fake_loss = epoch * random.random()  # pretend the loss tracks the epoch index
    loss_meter.add(fake_loss)
    vis.plot_many_stack({'train_loss': loss_meter.value()[0]})
# To also show a test loss (or train/test accuracy), add keys to the dict:
# vis.plot_many_stack({'train_loss': loss_meter.value()[0],
#                      'test_loss': test_loss_meter.value()[0]})
def train(**kwargs):
    """Train the OCR segmentation model.

    Builds concatenated train/valid OCRDatasets from the space-separated
    entries of ``opt.train_txt``/``opt.valid_txt``, trains with Adam for
    ``opt.epoches`` epochs with an exponential LR warmup followed by
    plateau/milestone decay, logs to TensorBoard and a text logger, and
    saves the best-validation and final weights.
    """
    opt.parse(kwargs)
    if not os.path.exists(opt.save_folder):
        os.mkdir(opt.save_folder)
    tb_logger = SummaryWriter(opt.save_folder)
    logger = create_logger('global_logger', opt.save_folder + '/log.txt')
    # Windowed meters (window size 10) for timing/loss display.
    batch_time = AverageMeter(10)
    data_time = AverageMeter(10)
    losses = AverageMeter(10)
    loss_meter = meter.AverageValueMeter()
    # Each opt.train_txt entry is "data_root gt_root list_file".
    train_sets = []
    for data_txt in opt.train_txt:
        data_root, gt_root, list_file = data_txt.split(' ')
        train_sets.append(
            OCRDataset(data_root, gt_root, list_file, opt.input_size, 'train',
                       opt.chars_list, opt.max_seq))
    train_data = ConcatDataset(train_sets)
    train_loader = DataLoader(train_data,
                              batch_size=opt.batch_size,
                              shuffle=True,
                              num_workers=opt.num_works)
    valid_sets = []
    for valid_txt in opt.valid_txt:
        data_root, gt_root, list_file = valid_txt.split(' ')
        valid_sets.append(
            OCRDataset(data_root, gt_root, list_file, opt.input_size, 'valid',
                       opt.chars_list, opt.max_seq))
    valid_data = ConcatDataset(valid_sets)
    valid_loader = DataLoader(valid_data,
                              batch_size=opt.batch_size,
                              shuffle=False,
                              num_workers=opt.num_works)
    model = getattr(models, opt.model)(opt.basenet, opt.input_size,
                                       opt.max_seq, opt.num_classes,
                                       mode='train', attn=opt.attn)
    if opt.load_model_path is not None:
        load_state(model, opt.load_model_path, 'cuda:%d' % opt.gpus[0])
    if len(opt.gpus) > 1:
        model = torch.nn.DataParallel(model, device_ids=opt.gpus)
    model = gpu(model, opt)
    # With DataParallel, optimize the wrapped module's parameters.
    if len(opt.gpus) > 1:
        optimizer = torch.optim.Adam(model.module.parameters(), lr=opt.lr,
                                     betas=opt.betas,
                                     weight_decay=opt.weight_decay)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr,
                                     betas=opt.betas,
                                     weight_decay=opt.weight_decay)
    curr_step = 0
    total_step = int(len(train_data) / opt.batch_size * opt.epoches)
    best_val_error = 1e10
    previous_loss = 1e10
    # warmup: grow the LR by a constant factor each of the first 10% of
    # epochs so it rises 100x overall.
    # NOTE(review): opt.epoches < 10 makes warmup_epoches == 0 and the
    # division below raises ZeroDivisionError — confirm expected range.
    warmup_epoches = opt.epoches // 10
    warmup_rate = math.pow(100, 1 / warmup_epoches)
    for epoch in range(opt.epoches):
        model.train()
        end = time.time()
        # loss_meter.reset()
        # NOTE(review): with reset() commented out, loss_meter.value()[0]
        # below is the mean over ALL epochs so far, not this epoch — the
        # plateau-decay comparison may be intentional or a leftover; confirm.
        for i, (imgs, gt_chars_seg, gt_order_seg, gt_pos_seg) in enumerate(train_loader):
            # measure data loading time
            data_time.update(time.time() - end)
            # zero the parameter gradients
            optimizer.zero_grad()
            imgs = gpu(imgs, opt)
            gt_chars_seg = gpu(gt_chars_seg, opt)
            gt_order_seg = gpu(gt_order_seg, opt)
            gt_pos_seg = gpu(gt_pos_seg, opt)
            # Three heads: character class, reading order, position maps.
            chars_seg, ord_seg, pos_seg = model(imgs)
            loss = get_loss(chars_seg, ord_seg, pos_seg, gt_chars_seg,
                            gt_order_seg, gt_pos_seg, opt)
            loss.backward()
            optimizer.step()
            losses.update(loss.item())
            loss_meter.add(loss.item())
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            curr_step += 1
            current_lr = optimizer.param_groups[0]['lr']
            if curr_step % opt.print_freq == 0:
                tb_logger.add_scalar('loss_train', losses.avg, curr_step)
                tb_logger.add_scalar('lr', current_lr, curr_step)
                logger.info(
                    'Iter: [{0}/{1}]\t'
                    'Epoch: {2}\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'LR {lr:.4f}'.format(curr_step, total_step, epoch,
                                         batch_time=batch_time,
                                         data_time=data_time,
                                         loss=losses,
                                         lr=current_lr))
        # val: evaluate once per epoch and keep the best-error weights.
        model.eval()
        val_error = val(model, valid_loader, opt)
        logger.info('Mean error: {0}\t'.format(val_error))
        if not tb_logger is None:
            tb_logger.add_scalar('error_val', val_error, curr_step)
        if val_error < best_val_error:
            best_val_error = val_error
            if len(opt.gpus) > 1:
                torch.save(model.module.state_dict(),
                           os.path.join(opt.save_folder, "best_val_error.pth"))
            else:
                torch.save(model.state_dict(),
                           os.path.join(opt.save_folder, "best_val_error.pth"))
        # warmup phase: multiply the LR up each epoch
        if epoch < warmup_epoches:
            for param_group in optimizer.param_groups:
                param_group["lr"] *= warmup_rate
        # decay lr if loss no longer decrease
        else:
            if opt.lr_immediate_decay and loss_meter.value(
            )[0] > previous_loss:
                for param_group in optimizer.param_groups:
                    param_group["lr"] *= opt.lr_decay
            # fixed milestone decay at 60% and 90% of training
            if epoch == int(opt.epoches * 0.6) or epoch == int(
                    opt.epoches * 0.9):
                for param_group in optimizer.param_groups:
                    param_group["lr"] *= opt.lr_decay
        previous_loss = loss_meter.value()[0]
    # save last pth
    if len(opt.gpus) > 1:
        torch.save(model.module.state_dict(),
                   os.path.join(opt.save_folder, "last.pth"))
    else:
        torch.save(model.state_dict(),
                   os.path.join(opt.save_folder, "last.pth"))
def train():
    """Train a DenseNet-121 cat/dog classifier with checkpoint resume.

    Adam + CrossEntropyLoss; validates each epoch, saves a checkpoint
    (flagging the best accuracy so far), and decays the LR whenever the
    epoch-mean loss stops improving.

    Fixes:
      * ``loss.data[0]`` -> ``loss.item()`` — indexing a 0-dim tensor
        raises on PyTorch >= 0.5.
      * tqdm ``total`` was the number of *samples*; the loop iterates
        *batches*, so the bar now uses ``len(train_dataloader)``.
    """
    vis = visualizer.Visualizer(config.visdom_env)

    # step1: configure model
    model = torchvision.models.densenet121(pretrained=False, num_classes=2)
    if config.use_gpu:
        model = torch.nn.DataParallel(model).cuda()

    # step2: data
    train_data = DogCat(config.train_data_root, train=True)
    val_data = DogCat(config.train_data_root, train=False)
    train_dataloader = DataLoader(train_data, config.batch_size, shuffle=True,
                                  num_workers=config.num_workers)
    val_dataloader = DataLoader(val_data, config.batch_size, shuffle=False,
                                num_workers=config.num_workers)

    # step3: criterion and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    lr = config.lr
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=config.weight_decay)

    # step4: meters
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100

    # train
    model.train()
    best_acc = -1.0
    start_epoch = -1

    # optionally resume from a checkpoint
    state = dict()
    if config.load_model_path:
        logging.info('Loading checkpoint from {path}'.format(
            path=config.load_model_path))
        state = model_util.load(config.load_model_path)
        start_epoch = state['epoch']
        best_acc = state['accuracy']
        model.load_state_dict(state['state_dic'])
        optimizer.load_state_dict(state['optimizer'])
        logging.info('Loaded checkpoint from {path}'.format(
            path=config.load_model_path))

    for epoch in range(start_epoch + 1, config.max_epoch):
        logging.info('epoch = %d' % epoch)
        loss_meter.reset()
        confusion_matrix.reset()
        # FIX: total counts batches, not samples.
        for ii, (data, label) in tqdm(enumerate(train_dataloader),
                                      total=len(train_dataloader)):
            # train model
            input_var = Variable(data)
            target_var = Variable(label)
            if config.use_gpu:
                input_var = input_var.cuda()
                target_var = target_var.cuda()
            optimizer.zero_grad()
            score = model(input_var)
            loss = criterion(score, target_var)
            loss.backward()
            optimizer.step()

            # meters update and visualize
            loss_meter.add(loss.item())  # FIX: was loss.data[0]
            confusion_matrix.add(score.data, target_var.data)
            if ii % config.print_freq == config.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])

        # validate and visualize
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch,
                    loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()),
                    lr=lr))
        is_best = val_accuracy > best_acc
        best_acc = max(val_accuracy, best_acc)
        logging.info(
            "epoch:{epoch},lr:{lr},loss:{loss},acc:{acc} train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch,
                    loss=loss_meter.value()[0],
                    acc=val_accuracy,
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()),
                    lr=lr))

        # Checkpoint (model_util flags the best-accuracy checkpoint).
        state['epoch'] = epoch
        state['model'] = config.model
        state['state_dic'] = model.state_dict()
        state['accuracy'] = val_accuracy
        state['optimizer'] = optimizer.state_dict()
        model_util.save(state, config.checkpoint_dir, is_best)

        # update learning rate: decay when the epoch-mean loss worsens.
        # Writing into param_groups keeps Adam's moment state.
        if loss_meter.value()[0] > previous_loss:
            lr = lr * config.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]
def train(train_loader, model, criterion, optimizer, epoch, compression_scheduler, loggers, args):
    """Training loop for one epoch.

    Runs one pass over `train_loader`, invoking the distiller
    compression-scheduler hooks around forward/backward, optionally
    forwarding through a knowledge-distillation policy, and logging
    loss/accuracy meters every `args.print_freq` steps.  Supports the
    Early Exit configuration via `args.earlyexit_lossweights`.
    """
    # Overall loss = objective loss + any scheduler-added components
    # (e.g. regularization); tracked separately for logging.
    losses = OrderedDict([(OVERALL_LOSS_KEY, tnt.AverageValueMeter()),
                          (OBJECTIVE_LOSS_KEY, tnt.AverageValueMeter())])
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    batch_time = tnt.AverageValueMeter()
    data_time = tnt.AverageValueMeter()

    # For Early Exit, we define statistics for each exit
    # So exiterrors is analogous to classerr for the non-Early Exit case
    if args.earlyexit_lossweights:
        args.exiterrors = []
        for exitnum in range(args.num_exits):
            args.exiterrors.append(tnt.ClassErrorMeter(accuracy=True, topk=(1, 5)))

    total_samples = len(train_loader.sampler)
    batch_size = train_loader.batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)
    msglogger.info('Training epoch: %d samples (%d per mini-batch)',
                   total_samples, batch_size)

    # Switch to train mode
    model.train()
    end = time.time()

    for train_step, (inputs, target) in enumerate(train_loader):
        # Measure data loading time
        data_time.add(time.time() - end)
        inputs, target = inputs.to('cuda'), target.to('cuda')

        # Execute the forward phase, compute the output and measure loss
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, train_step,
                                                     steps_per_epoch, optimizer)

        # Forward either directly or through the knowledge-distillation policy.
        if not hasattr(args, 'kd_policy') or args.kd_policy is None:
            output = model(inputs)
        else:
            output = args.kd_policy.forward(inputs)

        if not args.earlyexit_lossweights:
            loss = criterion(output, target)
            # Measure accuracy and record loss
            classerr.add(output.data, target)
        else:
            # Measure accuracy and record loss
            loss = earlyexit_loss(output, target, criterion, args)
        losses[OBJECTIVE_LOSS_KEY].add(loss.item())

        if compression_scheduler:
            # Before running the backward phase, we allow the scheduler to
            # modify the loss (e.g. add regularization loss)
            agg_loss = compression_scheduler.before_backward_pass(
                epoch, train_step, steps_per_epoch, loss,
                optimizer=optimizer, return_loss_components=True)
            loss = agg_loss.overall_loss
            losses[OVERALL_LOSS_KEY].add(loss.item())
            # Track each named loss component with its own meter.
            for lc in agg_loss.loss_components:
                if lc.name not in losses:
                    losses[lc.name] = tnt.AverageValueMeter()
                losses[lc.name].add(lc.value.item())

        # Compute the gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, train_step,
                                                   steps_per_epoch, optimizer)

        # measure elapsed time
        batch_time.add(time.time() - end)
        steps_completed = (train_step+1)

        if steps_completed % args.print_freq == 0:
            # Log some statistics
            errs = OrderedDict()
            if not args.earlyexit_lossweights:
                errs['Top1'] = classerr.value(1)
                errs['Top5'] = classerr.value(5)
            else:
                # for Early Exit case, the Top1 and Top5 stats are computed
                # for each exit.
                for exitnum in range(args.num_exits):
                    errs['Top1_exit' + str(exitnum)] = args.exiterrors[exitnum].value(1)
                    errs['Top5_exit' + str(exitnum)] = args.exiterrors[exitnum].value(5)

            stats_dict = OrderedDict()
            for loss_name, meter in losses.items():
                stats_dict[loss_name] = meter.mean
            stats_dict.update(errs)
            stats_dict['LR'] = optimizer.param_groups[0]['lr']
            stats_dict['Time'] = batch_time.mean
            # NOTE(review): 'Peformance' is a typo, but it is a runtime log
            # category — renaming it would change downstream log keys.
            stats = ('Peformance/Training/', stats_dict)
            params = model.named_parameters() if args.log_params_histograms else None
            distiller.log_training_progress(stats, params, epoch,
                                            steps_completed, steps_per_epoch,
                                            args.print_freq, loggers)
        end = time.time()
def train(args, settings, train_datset, model, optimizer, writer, device):
    """Self-supervised pre-training with masked-node prediction plus
    self-clustering pseudo-labels assigned by optimal transport (Sinkhorn).

    Each epoch: optionally re-solve the transport plan to refresh the
    soft cluster tags, then for every batch mask a random subset of node
    features, predict masked node labels (n_loss) and cluster
    assignments (c_loss), and update the transport cost matrix from the
    model's cluster logits.  Returns a dict of per-epoch loss/accuracy
    histories.
    """
    print("start")
    train_loader = DataLoader(dataset=train_datset,
                              batch_size=args.batchsize,
                              collate_fn=batcher_g,
                              shuffle=args.shuffle,
                              num_workers=args.workers)
    print(model)
    model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    n_loss_meter = meter.AverageValueMeter()
    c_loss_meter = meter.AverageValueMeter()
    n_acc_meter = meter.ConfusionMeter(
        100)  # clustering num might be too big, do not use confusion matrix
    c_acc_meter = AccMeter(settings['cls_num'])
    init_lr = args.lr
    info = {'n_loss': [], 'n_acc': [], 'c_loss': [], 'c_acc': []}
    cls_tags = 0
    K = settings['cls_num']   # number of clusters
    N = len(train_datset)     # number of instances
    # Uniform marginals for the optimal-transport problem.
    cls_distr = torch.ones(K) / K
    inst_distr = torch.ones(N) / N
    cls_log_prob = np.ones([N, K
                            ]) * np.log(K) / N  # prob_tensor (cost function)
    cls_tags = np.ones([N, K]) / (K * N)  # the tag is a prob distribution
    for epoch in range(args.epochs):
        n_loss_meter.reset()
        c_loss_meter.reset()
        n_acc_meter.reset()
        c_acc_meter.reset()
        model.train()
        # Refresh pseudo labels every settings['cls_epochs'] epochs
        # (k-means alternative kept commented out below).
        if epoch % settings['cls_epochs'] == 1:
            # feats_all = get_preds(args, model, train_datset, device)
            # if epoch == 0:
            #     cls_tags = k_means(feats_all.cpu(), settings['cls_num'], settings['iters'],
            #                        inits=settings['init_method'], show_stats=True)
            # else:
            #     cls_tags = k_means(feats_all.cpu(), settings['cls_num'], settings['iters'], inits='random',
            #                        show_stats=True)
            # perform optimal transport
            time0 = time.time()
            cls_tags = ot.sinkhorn(
                inst_distr, cls_distr, cls_log_prob,
                0.04)  # shape dataset_num*cls_num ...takes 40s~250s on cpu
            print('optimal transport solved: {}'.format(time.time() - time0))
            # model.re_init_head()
        for idx, (mols, n_label, ids) in enumerate(train_loader):
            g = dgl.batch([mol.ful_g for mol in mols])
            # NOTE(review): the return value of g.to(device) is discarded —
            # on DGL versions where .to() is not in-place this leaves the
            # graph on CPU; confirm against the pinned DGL version.
            g.to(device)
            n_label = n_label.to(device)
            # Mask node features: zero out a random subset of nodes.
            mask = torch.randint(
                0, g.number_of_nodes(),
                [int(args.mask_n_ratio * g.number_of_nodes())])
            g.ndata['nodes'][mask] = 0
            # make pesudo labels vis optimal transport
            cls_labels = N * torch.tensor(
                cls_tags[list(ids)], requires_grad=False).to(device).float()
            atom_preds, cls_preds = model(g)
            cls_logits = torch.log(F.softmax(cls_preds, dim=1))
            n_pred_cls = torch.argmax(atom_preds, dim=1)
            # Node-level loss only on the masked nodes.
            n_loss = loss_fn(atom_preds[mask], n_label[mask])
            # compute c loss: soft cross-entropy against the transport tags.
            c_loss = torch.sum(-cls_labels * cls_logits, dim=1).mean()
            loss = c_loss + n_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Update the transport cost rows for this batch.
            # NOTE(review): rows are indexed by batch position while tags are
            # looked up via `ids` — these only line up if the loader preserves
            # dataset order (shuffle False); confirm.
            cls_log_prob[idx * args.batchsize:idx * args.batchsize +
                         len(mols)] = -cls_logits.detach().cpu().numpy()
            n_loss_meter.add(n_loss.detach().item())
            c_loss_meter.add(c_loss.detach().item())
            n_acc_meter.add(n_pred_cls, n_label)
            # c_acc_meter.add(c_pred_cls, cls_labels)
            if idx % 50 == 0 and args.use_tb:
                # Accuracy from the confusion-matrix diagonal (first 10 classes).
                acc = 100 * sum(
                    n_acc_meter.value()[i, i]
                    for i in range(10)) / n_acc_meter.value().sum()
                writer.add_scalar(
                    'n_train_loss',
                    n_loss_meter.value()[0],
                    int((idx + 1 + epoch * len(train_loader)) / 50))
                writer.add_scalar(
                    'n_train_acc', acc,
                    int((idx + 1 + epoch * len(train_loader)) / 50))
                print('training loss {} acc {}'.format(n_loss_meter.value()[0],
                                                       acc))
        # n_loss_test, n_acc_test= test(args,test_loader,model,device)
        acc = 100 * sum(n_acc_meter.value()[i, i]
                        for i in range(10)) / n_acc_meter.value().sum()
        print(
            "Epoch {:2d}, training: loss: {:.7f}, acc: {:.7f} self-clustering: loss: {:.7f}"
            .format(epoch, n_loss_meter.value()[0], acc,
                    c_loss_meter.value()[0]))
        if (epoch + 1) % 100 == 0:
            # NOTE(review): dividing by 1 is a no-op — the LR never actually
            # decays; presumably a disabled decay factor. Confirm intent.
            init_lr = init_lr / 1
            for param_group in optimizer.param_groups:
                param_group['lr'] = init_lr
            print('current learning rate: {}'.format(init_lr))
        info['n_loss'].append(n_loss_meter.value()[0])
        info['n_acc'].append(acc)
        info['c_loss'].append(c_loss_meter.value()[0])
        info['c_acc'].append(100 * c_acc_meter.value())
    return info
def _validate(data_loader, model, criterion, loggers, args, epoch=-1):
    """Execute the validation/test loop.

    Evaluates `model` over `data_loader` without gradients, tracking
    loss and Top1/Top5 error (optionally a confusion matrix).  With
    `args.earlyexit_thresholds`, per-exit losses/errors are accumulated
    instead and aggregated by `earlyexit_validate_stats`.

    Returns:
        (top1, top5, loss) — for Early Exit, the stats of the last exit.
    """
    losses = {'objective_loss': tnt.AverageValueMeter()}
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))

    if args.earlyexit_thresholds:
        # for Early Exit, we have a list of errors and losses for each of
        # the exits.
        args.exiterrors = []
        args.losses_exits = []
        for exitnum in range(args.num_exits):
            args.exiterrors.append(tnt.ClassErrorMeter(accuracy=True, topk=(1, 5)))
            args.losses_exits.append(tnt.AverageValueMeter())
        args.exit_taken = [0] * args.num_exits

    batch_time = tnt.AverageValueMeter()
    total_samples = len(data_loader.sampler)
    batch_size = data_loader.batch_size
    if args.display_confusion:
        confusion = tnt.ConfusionMeter(args.num_classes)
    total_steps = total_samples / batch_size
    msglogger.info('%d samples (%d per mini-batch)', total_samples, batch_size)

    # Switch to evaluation mode
    model.eval()

    end = time.time()
    for validation_step, (inputs, target) in enumerate(data_loader):
        with torch.no_grad():
            inputs, target = inputs.to('cuda'), target.to('cuda')
            # compute output from model
            output = model(inputs)

            if not args.earlyexit_thresholds:
                # compute loss
                loss = criterion(output, target)
                # measure accuracy and record loss
                losses['objective_loss'].add(loss.item())
                classerr.add(output.data, target)
                if args.display_confusion:
                    confusion.add(output.data, target)
            else:
                earlyexit_validate_loss(output, target, criterion, args)

            # measure elapsed time
            batch_time.add(time.time() - end)
            end = time.time()

            steps_completed = (validation_step+1)
            if steps_completed % args.print_freq == 0:
                if not args.earlyexit_thresholds:
                    stats = ('',
                             OrderedDict([('Loss', losses['objective_loss'].mean),
                                          ('Top1', classerr.value(1)),
                                          ('Top5', classerr.value(5))]))
                else:
                    stats_dict = OrderedDict()
                    stats_dict['Test'] = validation_step
                    for exitnum in range(args.num_exits):
                        la_string = 'LossAvg' + str(exitnum)
                        stats_dict[la_string] = args.losses_exits[exitnum].mean
                        # Because of the nature of ClassErrorMeter, if an exit
                        # is never taken during the batch, then accessing the
                        # value(k) will cause a divide by zero. So we'll build
                        # the OrderedDict accordingly and we will not print for
                        # an exit error when that exit is never taken.
                        if args.exit_taken[exitnum]:
                            t1 = 'Top1_exit' + str(exitnum)
                            t5 = 'Top5_exit' + str(exitnum)
                            stats_dict[t1] = args.exiterrors[exitnum].value(1)
                            stats_dict[t5] = args.exiterrors[exitnum].value(5)
                    stats = ('Performance/Validation/', stats_dict)
                distiller.log_training_progress(stats, None, epoch,
                                                steps_completed, total_steps,
                                                args.print_freq, loggers)

    if not args.earlyexit_thresholds:
        msglogger.info('==> Top1: %.3f Top5: %.3f Loss: %.3f\n',
                       classerr.value()[0], classerr.value()[1],
                       losses['objective_loss'].mean)
        if args.display_confusion:
            msglogger.info('==> Confusion:\n%s\n', str(confusion.value()))
        return classerr.value(1), classerr.value(5), losses['objective_loss'].mean
    else:
        total_top1, total_top5, losses_exits_stats = earlyexit_validate_stats(args)
        # Report the stats of the final (last) exit.
        return total_top1, total_top5, losses_exits_stats[args.num_exits-1]
def train(**kwargs):
    """Train a document-pair matching model.

    Builds train/validate DocumentPair datasets (validate reuses the
    train vocabulary), optimizes with Adam + CrossEntropyLoss, plots the
    running loss, validates and checkpoints every epoch, and decays the
    LR when the epoch-mean loss stops improving.

    Fix: ``loss.data[0]`` -> ``loss.item()`` — indexing a 0-dim tensor
    raises on PyTorch >= 0.5.
    """
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    # step1: configure model
    model = getattr(models, opt.model)(opt)
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # step2: data — the validation split shares the training vocabulary.
    train_data = DocumentPair(opt.train_data_root,
                              doc_type='train',
                              suffix='txt',
                              load=lambda x: x.strip().split(','))
    train_data.initialize(vocab_size=opt.vocab_size)
    val_data = DocumentPair(opt.validate_data_root,
                            doc_type='validate',
                            suffix='txt',
                            load=lambda x: x.strip().split(','),
                            vocab=train_data.vocab)
    val_data.initialize()
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=False,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size, shuffle=False,
                                num_workers=opt.num_workers)

    # step3: criterion and optimizer
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(model.parameters(), lr=lr,
                             weight_decay=opt.weight_decay)

    # step4: meters
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100

    # train
    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        for ii, batch in enumerate(train_dataloader):
            data_left, data_right, label, num_pos = load_data(
                batch, opt, train_data.vocab)

            # train model
            input_data_left, input_data_right = Variable(
                t.from_numpy(data_left)), Variable(t.from_numpy(data_right))
            target = Variable(t.from_numpy(label))
            if opt.use_gpu:
                input_data_left, input_data_right = input_data_left.cuda(
                ), input_data_right.cuda()
                target = target.cuda()

            optimizer.zero_grad()
            scores, predictions = model((input_data_left, input_data_right))
            # target is one-hot; max(1)[1] recovers the class index.
            loss = criterion(scores, target.max(1)[1])
            loss.backward()
            optimizer.step()

            # meters update and visualize
            loss_meter.add(loss.item())  # FIX: was loss.data[0]
            confusion_matrix.add(predictions.data, target.max(1)[1].data)

            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])
                # drop into the debugger when the flag file exists
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        model.save()

        # validate and visualize
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch,
                    loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()),
                    lr=lr))

        # update learning rate: writing into param_groups keeps Adam's
        # moment state intact.
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train the logo-localization network.

    Predicts a per-pixel mask locating a query logo inside a target
    image, trained with BCEWithLogitsLoss (positive-class weighted) and
    SGD.  Periodically pushes input/prediction images to visdom and
    checkpoints every ``opt.save_model_epoch`` epochs.
    """
    opt._parse(kwargs)
    vis = Visualizer(opt.env,port = opt.vis_port)
    device = t.device('cuda') if opt.use_gpu else t.device('cpu')
    # data loading
    train_data = FLogo(opt.data_root,train=True)
    train_dataloader = DataLoader(train_data,opt.batch_size,shuffle=True,num_workers=opt.num_workers)
    '''
    # 以下内容是可视化dataloader的数据的 一 检查dataset是否合理 二 为了写论文凑图
    dataiter = iter(train_dataloader)
    img1,img2,lable=dataiter.next()
    img1 = tv.utils.make_grid((img1+1)/2,nrow=6,padding=2).numpy()
    img2 = tv.utils.make_grid((img2+1)/2,nrow=6,padding=2).numpy()
    plt.figure()
    plt.imshow(np.transpose(img1, (1, 2, 0)))
    plt.figure()
    plt.imshow(np.transpose(img2, (1, 2, 0)))
    plt.figure()
    lables = label.unsqueeze(1)
    # lables
    mask = tv.utils.make_grid(lables,nrow=6,padding=2).numpy()
    plt.imshow(np.transpose(mask, (1, 2, 0)))
    plt.show()
    from torchvision.transforms import ToPILImage
    import numpy as np
    import matplotlib.pylab as plt
    train()
    '''
    # network
    net = Net()
    net.train()
    # load pretrained weights if a path is configured; otherwise
    # Xavier-initialize conv/linear layers
    if opt.load_model_path:
        net.load_state_dict(t.load(opt.load_model_path,map_location = lambda storage,loc:storage),False)
        print('已加载完。。')
    else:
        # model weight initialization
        for m in net.modules():
            if isinstance(m, (nn.Conv2d, nn.Linear)):
                nn.init.xavier_normal_(m.weight)
        print('模型参数完成初始化。。')
    net.to(device)
    # loss function and optimizer; pos_weight counteracts the heavy
    # background/foreground pixel imbalance
    criterion = nn.BCEWithLogitsLoss(pos_weight=opt.pos_weight.to(device))
    optimizer = t.optim.SGD(net.parameters(),lr=opt.lr, momentum=opt.momentum,weight_decay=opt.weight_decay)
    # use the torchnet meter module for the running loss
    loss_meter = meter.AverageValueMeter()
    # LR schedule (disabled)
    # scheduler = StepLR(optimizer, step_size=1000, gamma=0.5)
    for epoch in range(opt.epoches):
        loss_meter.reset()  # start each epoch's loss average fresh
        for ii,(target_img,query_logo,mask) in tqdm.tqdm(enumerate(train_dataloader)):
            # NOTE(review): debug leftover — prints the batch shape every
            # iteration; consider removing.
            print(target_img.shape)
            # training step
            target_img = target_img.to(device)
            query_logo = query_logo.to(device)
            mask = mask.to(device)
            optimizer.zero_grad()
            output = net(query_logo,target_img)
            output = output.squeeze()
            # sigmoid only for visualization; the loss consumes raw logits
            predict = t.sigmoid(output)
            # predict_mask = t.sigmoid(output) # true output should be sigmoid
            # ipdb.set_trace()
            # scale the 0-255 mask to [0, 1] targets
            true_mask = mask/255
            # predict = output.view(output.size(0),-1)
            # target = true_mask.view(true_mask.size(0),-1)
            # ipdb.set_trace()
            # print(predict.size(),target.size())
            # loss = criterion(F.softmax(output,dim=2),true_mask)
            loss = criterion(output,true_mask)
            # print(loss.item())
            loss.backward()
            optimizer.step()
            # meter update and visualize
            loss_meter.add(loss.item())
            if (ii+1)%opt.plot_every == 0:
                # inputs are in [-1, 1]; rescale to [0, 1] for display
                vis.img('target_img', ((target_img + 1) / 2).data[0])
                vis.img('query_logo', ((query_logo + 1) / 2).data[0])
                vis.img('truth groud', (true_mask.data[0]))
                vis.img('predict', predict.data[0])
                # binarize at 0.5 for a hard mask preview
                pre_judgement = predict.data[0]
                pre_judgement[pre_judgement > 0.5] = 1  # maybe try 0.7?
                pre_judgement[pre_judgement <= 0.5] = 0
                vis.img('pre_judge(>0.5)', pre_judgement)
                # vis.img('pre_judge', pre_judgement)
                # vis.log({'predicted':output.data[0].cpu().numpy()})
                # vis.log({'truth groud':true_mask.data[0].cpu().numpy()})
        print('finish epoch:',epoch)
        # vis.log({'predicted':output.data[0].cpu().numpy()})
        vis.plot('loss',loss_meter.value()[0])
        if (epoch+1) %opt.save_model_epoch == 0:
            vis.save([opt.env])
            t.save(net.state_dict(),'checkpoints/%s_localize_v6.pth' % epoch)
def train(**kwargs):
    """Train a DogCat classifier and track train/val/test metrics.

    Each epoch first evaluates on the validation and test splits (so the
    metrics logged for epoch ``e`` describe the model *before* that epoch's
    gradient updates), then runs one pass over the training set and
    checkpoints at epoch 0 and every 10th epoch.
    """
    # opt.parse(kwargs)
    vis = Visualizer(opt.env)
    savingData = []  # kept for the (currently disabled) save_training_data path

    # step1: configure model
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # step2: data — val split comes from the train root, test from the test root
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    test_data = DogCat(opt.test_data_root, test=True)
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size, shuffle=False,
                                num_workers=opt.num_workers)
    test_dataloader = DataLoader(test_data, opt.batch_size, shuffle=False,
                                 num_workers=opt.num_workers)

    # step3: criterion and optimizer
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = t.optim.Adam(model.parameters(), lr=lr,
                             weight_decay=opt.weight_decay)

    # step4: meters
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e100

    # train
    for epoch in range(opt.max_epoch + 1):
        # validate and visualize. NOTE(review): on the very first epoch
        # loss_meter is still empty, so the logged loss is NaN — pre-existing
        # behaviour, left unchanged.
        val_cm, val_accuracy = val(model, val_dataloader)
        test_cm, test_accuracy = val(model, test_dataloader)
        vis.plot('test_accuracy', test_accuracy)
        vis.plot('lr', lr)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm},test_cm:{test_cm}"
            .format(epoch=epoch, loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()),
                    test_cm=str(test_cm.value()), lr=lr))
        print("epoch = ", epoch, " loss = ", loss_meter.value()[0],
              " lr = ", lr)
        batch_results = [(epoch, loss_meter.value()[0], lr,
                          str(val_cm.value()), str(confusion_matrix.value()),
                          str(test_cm.value()), val_accuracy, test_accuracy)]
        # savingData += batch_results
        # save_training_data(savingData, opt.traingData_file)

        # update learning rate when the epoch loss stopped improving
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            # FIX: this loop was commented out while its body line survived,
            # so reaching this branch raised NameError on `param_group`.
            # Updating param_groups in place preserves optimizer moment state.
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        if epoch == opt.max_epoch:
            return
        previous_loss = loss_meter.value()[0]

        loss_meter.reset()
        confusion_matrix.reset()
        for ii, (data, label) in tqdm(enumerate(train_dataloader),
                                      total=len(train_data) / opt.batch_size):
            # train model
            input = data
            target = label
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            score = model(input)
            loss = criterion(score, target)
            loss.backward()
            optimizer.step()

            # meters update and visualize
            loss_meter.add(loss.item())
            confusion_matrix.add(score.data, target.data)
            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])
                # debug hook: create opt.debug_file on disk to drop into ipdb
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        prefix = 'checkpoints/'
        name = time.strftime(prefix + '%m%d_%H:%M:%S_' + str(epoch + 1) + '.pth')
        if epoch == 0:
            model.save(name)
        if np.mod(epoch + 1, 10) == 0:
            model.save(name)
def train(opt):
    """Train the attention seq2seq poetry model, one poem per optimizer step.

    The encoder consumes the first ``opt.input_len`` characters of a poem;
    the decoder (with teacher forcing) reconstructs the whole poem shifted by
    one character. Progress is plotted to Visdom when ``opt.use_env`` is set,
    and encoder/decoder checkpoints are written every ``opt.plot_every``
    poems.
    """
    # The Visdom server must already be running:  python -m visdom.server
    if opt.use_env:
        vis = Visdom(env=opt.env)

    # Data: each row is one poem as a fixed-length sequence of token ids.
    data, ix2word, word2ix = get_data(opt)
    data = t.from_numpy(data)
    dataloader = DataLoader(data, batch_size=opt.batch_size, shuffle=True,
                            num_workers=1)

    # Models and one optimizer per module.
    encoder = Encoder(len(word2ix), opt.hidden_dim)
    decoder = AttentionDecoder(opt.hidden_dim, len(word2ix),
                               opt.dropout_rate, opt.input_len)
    en_optimizer = t.optim.SGD(encoder.parameters(), lr=opt.lr)
    de_optimizer = t.optim.SGD(decoder.parameters(), lr=opt.lr)
    criterion = nn.CrossEntropyLoss()
    if opt.use_gpu:
        encoder = encoder.cuda()
        decoder = decoder.cuda()
        criterion = criterion.cuda()

    loss_meter = meter.AverageValueMeter()
    for epoch in range(opt.epoch):
        loss_meter.reset()
        count = 0
        for i, data_ in enumerate(dataloader):
            data_ = data_.long().contiguous()
            if opt.use_gpu:
                data_ = data_.cuda()

            for poetry in data_:  # one poem per optimization step
                loss = 0
                encoder_hidden = encoder.initHidden(opt.use_gpu)
                en_optimizer.zero_grad()
                de_optimizer.zero_grad()
                count += 1

                # Input: first input_len characters; target: whole poem
                # shifted by one (next-character prediction).
                input_, target_ = (poetry[:opt.input_len].view(-1, 1),
                                   poetry[1:].view(-1, 1))
                input_len = input_.size(0)
                output_len = target_.size(0)

                device = 'cuda' if opt.use_gpu else 'cpu'
                encoder_outputs = t.zeros(opt.input_len, encoder.hidden_size,
                                          device=device)
                # Encoder: feed one character per step, collect outputs for
                # the attention mechanism.
                for ei in range(input_len):
                    encoder_output, encoder_hidden = encoder(input_[ei],
                                                             encoder_hidden)
                    encoder_outputs[ei] = encoder_output[0]

                # Decoder with (currently always-on) teacher forcing.
                # use_teacher_forcing = random.random() < opt.teacher_forcing_ratio
                use_teacher_forcing = True
                decoder_input = t.tensor([[word2ix['<START>']]], device=device)
                decoder_hidden = encoder_hidden
                if use_teacher_forcing:
                    for di in range(output_len):
                        decoder_output, decoder_hidden, decoder_attention = decoder(
                            decoder_input, decoder_hidden, encoder_outputs)
                        decoder_input = target_[di]  # feed the ground truth
                        loss += criterion(decoder_output, target_[di])
                else:
                    for di in range(output_len):
                        decoder_output, decoder_hidden, decoder_attention = decoder(
                            decoder_input, decoder_hidden, encoder_outputs)
                        topv, topi = decoder_output.topk(1)
                        # detach: do not backprop through the sampled token
                        decoder_input = topi.squeeze().detach()
                        loss += criterion(decoder_output, target_[di])
                        if decoder_input.item() == word2ix['<END>']:
                            break

                loss.backward()
                en_optimizer.step()
                de_optimizer.step()

                # Visualization / checkpointing.
                if count % opt.plot_every == 0:
                    if os.path.exists(opt.debug_file):
                        ipdb.set_trace()  # on-demand breakpoint
                    # FIX: all vis.* calls are now guarded — `vis` only
                    # exists when opt.use_env is set.
                    if opt.use_env:
                        # FIX: pass update=None (not the string 'None') for
                        # the first point; visdom only accepts None/'append'.
                        vis.line(
                            X=np.array([(count // opt.plot_every)
                                        / opt.plot_every]),
                            Y=np.array([loss.item() / output_len]),
                            win='loss',
                            update=None if count // opt.plot_every == 0
                            else 'append')
                        # Show the original poem text.
                        p = [ix2word[k] for k in poetry.tolist()]
                        vis.text(' '.join(p), win=u'origin_poem')
                    # Show a freshly generated poem.
                    start_words = '床前明月光,疑似地上霜'
                    gen_poetry = ''.join(
                        generate(opt, encoder, decoder, start_words,
                                 ix2word, word2ix))
                    print(i, ':', gen_poetry)
                    if opt.use_env:
                        vis.text(''.join(gen_poetry), win=u'generate_poem')
                    # FIX: encoder and decoder were both saved to the SAME
                    # path, so the decoder overwrote the encoder checkpoint.
                    t.save(encoder.state_dict(),
                           '%s/seq2seq/encoder_1_%s.pth'
                           % (opt.model_prefix, epoch))
                    t.save(decoder.state_dict(),
                           '%s/seq2seq/decoder_1_%s.pth'
                           % (opt.model_prefix, epoch))
def validate(net, K, L, W, misfit, val_loader, use_gpu, epoch, fig_path,
             save_fig, nbatches=1, is_unet=False):
    """Run one pass over the validation loader and return the mean loss.

    Logs running loss/accuracy to the module-level summary writer and,
    when ``save_fig`` is set, dumps prediction figures under ``fig_path``.
    """
    running_loss = tnt.AverageValueMeter()
    running_acc = tnt.AverageValueMeter()
    count = 0
    for batch_idx, (images, labels) in enumerate(val_loader):
        if use_gpu:
            images, labels = images.cuda(), labels.cuda()

        # Forward pass only — no gradients needed during validation.
        with torch.no_grad():
            if is_unet:
                outputs = net(images)
            else:
                outputs = conv1x1(net(images, K, L), W)
            probs = softmax(outputs)
            loss = misfit(outputs, labels)

        _, preds = torch.max(outputs, 1)
        running_loss.add(loss.item())
        running_acc.add(getAccuracy(preds, labels))

        # Fractional x-axis so scalars interleave nicely across epochs.
        step = epoch + (batch_idx / nbatches)
        summary_writer.add_scalar('Val Loss', running_loss.mean, step)
        summary_writer.add_scalar('Val Acc', running_acc.mean, step)

        # Every 24th epoch: save every val image of the batch.
        if save_fig and (epoch + 1) % 24 == 0:
            for i in range(images.shape[0]):
                plot_probs(
                    images[i], labels[i], preds[i], probs[i],
                    os.path.join(fig_path, 'final_preds/%06d_%04d.png'
                                 % (epoch, count)))
                count += 1
        # Every epoch: save a single representative val image.
        if save_fig and epoch % 1 == 0:
            plot_probs(images[0], labels[0], preds[0], probs[0],
                       os.path.join(fig_path, 'validating/%06d.png' % epoch))

    print('\n Validation Loss: %6.4f, Acc: %6.4f'
          % (running_loss.mean, running_acc.mean * 100))
    return running_loss.mean
def train(**kwargs):
    """Standard classifier training loop: fit, checkpoint, validate, decay LR.

    Keyword arguments override fields on the global ``opt`` config.
    """
    opt._parse(kwargs)
    vis = Visualizer(opt.env, port=opt.vis_port)

    # step1: configure model
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    model.to(opt.device)

    # step2: data (val split is drawn from the training root)
    train_data = DogCat(opt.train_data_root, train=True)
    val_data = DogCat(opt.train_data_root, train=False)
    train_dataloader = DataLoader(train_data, opt.batch_size,
                                  shuffle=True, num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size,
                                shuffle=False, num_workers=opt.num_workers)

    # step3: criterion and optimizer
    criterion = t.nn.CrossEntropyLoss()
    lr = opt.lr
    optimizer = model.get_optimizer(lr, opt.weight_decay)

    # step4: meters
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(2)
    previous_loss = 1e10

    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()

        for ii, (data, label) in tqdm(enumerate(train_dataloader)):
            inputs = data.to(opt.device)
            targets = label.to(opt.device)

            optimizer.zero_grad()
            scores = model(inputs)
            batch_loss = criterion(scores, targets)
            batch_loss.backward()
            optimizer.step()

            # Meters: detach so the confusion matrix never holds the graph.
            loss_meter.add(batch_loss.item())
            confusion_matrix.add(scores.detach(), targets.detach())

            if (ii + 1) % opt.print_freq == 0:
                vis.plot('loss', loss_meter.value()[0])
                # Debug hook: create opt.debug_file on disk to enter ipdb.
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        model.save()

        # validate and visualize
        val_cm, val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
        vis.log(
            "epoch:{epoch},lr:{lr},loss:{loss},train_cm:{train_cm},val_cm:{val_cm}"
            .format(epoch=epoch, loss=loss_meter.value()[0],
                    val_cm=str(val_cm.value()),
                    train_cm=str(confusion_matrix.value()), lr=lr))

        # Decay LR when the epoch loss got worse; writing param_group['lr']
        # directly keeps the optimizer's moment state intact.
        if loss_meter.value()[0] > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]
def train(**kwargs):
    """Train the character-level PoetryModel; checkpoint after every epoch.

    Keyword arguments are copied onto the global ``opt`` config.
    """
    for k, v in kwargs.items():
        setattr(opt, k, v)
    vis = Visualizer(env=opt.env)

    # Data: one padded poem (token-id sequence) per row.
    data, word2ix, ix2word = get_data(opt)
    data = t.from_numpy(data)
    dataloader = t.utils.data.DataLoader(data,
                                         batch_size=opt.batch_size,
                                         shuffle=True,
                                         num_workers=1)

    # Model definition.
    model = PoetryModel(len(word2ix), 128, 256)
    optimizer = t.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.CrossEntropyLoss()
    if opt.model_path:
        model.load_state_dict(t.load(opt.model_path))
    if opt.use_gpu:
        model.cuda()
        criterion.cuda()
    loss_meter = meter.AverageValueMeter()

    for epoch in range(opt.epoch):
        loss_meter.reset()
        for ii, data_ in tqdm.tqdm(enumerate(dataloader)):
            # (batch, seq_len) -> (seq_len, batch), as the RNN model expects.
            data_ = data_.long().transpose(1, 0).contiguous()
            if opt.use_gpu:
                data_ = data_.cuda()
            optimizer.zero_grad()
            # FIX: dropped the deprecated Variable wrapper — plain tensors
            # carry autograd state since PyTorch 0.4 (matches the other
            # training loops in this file).
            input_, target = data_[:-1, :], data_[1:, :]
            output, _ = model(input_)
            # Next-character prediction: target is the input shifted by one.
            loss = criterion(output, target.view(-1))
            loss.backward()
            optimizer.step()

            # FIX: loss.data[0] raises IndexError on 0-dim tensors in modern
            # PyTorch; use .item() like the rest of this file.
            loss_meter.add(loss.item())
            # (A commented-out Visdom visualization block — original poems
            #  plus sample generations every plot_every batches — was removed
            #  here; see VCS history to restore it.)

        t.save(model.state_dict(), '%s_%s.pth' % (opt.model_prefix, epoch))
def train(**kwargs):
    """Train the HRNet-based demoiré model with gradient accumulation.

    Supports resuming from ``opt.model_path``: epoch counter, optimizer state
    and learning rate are restored from the checkpoint dict (the network
    weights themselves are presumably loaded by ``get_pose_net`` via its
    ``pretrained`` argument — TODO confirm). Validates once before training
    and after every epoch, logs per-epoch losses to a text file, and saves
    timestamped checkpoints.
    """
    # init: keyword args override the global config
    for k_, v_ in kwargs.items():
        setattr(opt, k_, v_)
    if opt.vis:
        vis = Visualizer(opt.env)
        vis_val = Visualizer('valdemoire')

    # dataset
    # FIX: removed two transform pipelines (FiveCrop / ToTensor compositions)
    # that were constructed but never passed to MoireData — dead code.
    train_data = MoireData(opt.train_path)
    test_data = MoireData(opt.test_path, is_val=True)
    train_dataloader = DataLoader(train_data,
                                  batch_size=opt.train_batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers,
                                  drop_last=True)
    test_dataloader = DataLoader(test_data,
                                 batch_size=opt.val_batch_size,
                                 shuffle=True,
                                 num_workers=opt.num_workers,
                                 drop_last=True)

    last_epoch = 0
    # model init
    cfg.merge_from_file("config/cfg.yaml")
    model = get_pose_net(cfg, pretrained=opt.model_path)  # init weight
    model = model.to(opt.device)

    # Baseline validation before any training.
    if opt.vis:
        val_loss, val_psnr = val(model, test_dataloader, vis_val)
        print(val_loss, val_psnr)
    else:
        val_loss, val_psnr = val(model, test_dataloader)
        print(val_loss, val_psnr)

    criterion_c = L1_Charbonnier_loss()
    criterion_s = L1_Sobel_Loss()
    lr = opt.lr
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr,
                                 weight_decay=0.01)  # 0.005

    if opt.model_path:
        # Resume: restore epoch counter, optimizer state and lr.
        map_location = lambda storage, loc: storage
        checkpoint = torch.load(opt.model_path, map_location=map_location)
        last_epoch = checkpoint["epoch"]
        optimizer.load_state_dict(checkpoint["optimizer"])
        lr = checkpoint["lr"]
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

    loss_meter = meter.AverageValueMeter()
    psnr_meter = meter.AverageValueMeter()
    previous_loss = 1e100
    accumulation_steps = opt.accumulation_steps

    for epoch in range(opt.max_epoch):
        if epoch < last_epoch:
            continue  # fast-forward already-completed epochs when resuming
        loss_meter.reset()
        psnr_meter.reset()
        torch.cuda.empty_cache()
        loss_list = []

        for ii, (moires, clear_list) in tqdm(enumerate(train_dataloader)):
            moires = moires.to(opt.device)
            clears = clear_list[0].to(opt.device)
            output_list, edge_output_list = model(moires)
            outputs, edge_X = output_list[0], edge_output_list[0]

            # Loss-mixing schedule: the Charbonnier term's weight grows with
            # training progress (before epoch 20 the configured value stays).
            if epoch < 20:
                pass
            elif epoch >= 20 and epoch < 40:
                opt.loss_alpha = 0.9
            else:
                opt.loss_alpha = 1.0

            c_loss = criterion_c(outputs, clears)
            s_loss = criterion_s(edge_X, clears)
            loss = opt.loss_alpha * c_loss + (1 - opt.loss_alpha) * s_loss

            # Gradient accumulation: scale down per-batch loss, step every
            # accumulation_steps batches.
            loss = loss / accumulation_steps
            loss.backward()
            if (ii + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            loss_meter.add(loss.item() * accumulation_steps)

            # Convert to images for PSNR and visualization.
            moires = tensor2im(moires)
            outputs = tensor2im(outputs)
            clears = tensor2im(clears)
            psnr = colour.utilities.metric_psnr(outputs, clears)
            psnr_meter.add(psnr)

            if opt.vis and (ii + 1) % opt.plot_every == 0:  # plot every N batches
                vis.images(moires, win='moire_image')
                vis.images(outputs, win='output_image')
                vis.text(
                    "current outputs_size:{outputs_size},<br/> outputs:{outputs}<br/>"
                    .format(outputs_size=outputs.shape, outputs=outputs),
                    win="size")
                vis.images(clears, win='clear_image')
                # record the train loss; meter.value() returns (mean, std)
                vis.plot('train_loss', loss_meter.value()[0])
                vis.log(
                    "epoch:{epoch}, lr:{lr}, train_loss:{loss}, train_psnr:{train_psnr}"
                    .format(epoch=epoch + 1, loss=loss_meter.value()[0],
                            lr=lr, train_psnr=psnr_meter.value()[0]))
                # NOTE(review): losses are only recorded to the text log when
                # plotting fires — confirm this matches intent.
                loss_list.append(str(loss_meter.value()[0]))

        torch.cuda.empty_cache()
        if opt.vis:
            val_loss, val_psnr = val(model, test_dataloader, vis_val)
            vis.plot('val_loss', val_loss)
            vis.log(
                "epoch:{epoch}, average val_loss:{val_loss}, average val_psnr:{val_psnr}"
                .format(epoch=epoch + 1, val_loss=val_loss,
                        val_psnr=val_psnr))
        else:
            val_loss, val_psnr = val(model, test_dataloader)

        # Append this epoch's recorded losses to a text log file.
        with open(opt.save_prefix + "loss_list.txt", 'a') as f:
            f.write("\nepoch_{}\n".format(epoch + 1))
            f.write('\n'.join(loss_list))

        # Checkpoint every save_every epochs (and after the first epoch).
        if (epoch + 1) % opt.save_every == 0 or epoch == 0:
            prefix = opt.save_prefix + 'HRnet_epoch{}_'.format(epoch + 1)
            file_name = time.strftime(prefix + '%m%d_%H_%M_%S.pth')
            checkpoint = {
                'epoch': epoch + 1,
                "optimizer": optimizer.state_dict(),
                "model": model.state_dict(),
                "lr": lr
            }
            torch.save(checkpoint, file_name)

        # Decay LR when the loss regressed, and additionally every 10 epochs.
        if (loss_meter.value()[0] > previous_loss) or ((epoch + 1) % 10) == 0:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        previous_loss = loss_meter.value()[0]

    # Final checkpoint after the last epoch.
    prefix = opt.save_prefix + 'HRnet_final_'
    file_name = time.strftime(prefix + '%m%d_%H_%M_%S.pth')
    checkpoint = {
        'epoch': epoch + 1,
        "optimizer": optimizer.state_dict(),
        "model": model.state_dict(),
        "lr": lr
    }
    torch.save(checkpoint, file_name)