import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter  # older setups: from tensorboardX import SummaryWriter

# SkeletonFeeder, Classification, train, valid and the params dict come from
# this project's own modules.


def main():
    # Define the train/valid dataloaders.
    train_loader = DataLoader(SkeletonFeeder(mode='train', debug=False),
                              batch_size=params['batchsize'], shuffle=True,
                              num_workers=params['numworkers'])
    val_loader = DataLoader(SkeletonFeeder(mode='valid', debug=False),
                            batch_size=params['batchsize'], shuffle=False,
                            num_workers=params['numworkers'])
    n_class = params['n_class']
    cur_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))

    # Build the model and transfer it to the GPU.
    model = Classification(n_class=n_class)  # add model cfg
    model = model.cuda(params['gpu'][0])
    # TODO: model = nn.DataParallel(model, device_ids=params['gpu'])

    # Optionally warm-start from a trained checkpoint, keeping only the
    # weights whose names still exist in the current model.
    if params['retrain']:
        trained_dict = torch.load(params['retrain'], map_location='cpu')
        model_dict = model.state_dict()
        trained_dict = {k: v for k, v in trained_dict.items() if k in model_dict}
        model_dict.update(trained_dict)
        model.load_state_dict(model_dict)
        print('loaded pretrained weights')

    model_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(model_params, lr=params['lr'],
                           weight_decay=params['weight_decay'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                                     factor=0.333, patience=2,
                                                     verbose=True)
    writer = SummaryWriter()
    criterion = nn.CrossEntropyLoss()
    min_loss = 1000  # best (lowest) validation loss seen so far

    print('-------------------start training----------------------')
    print('lr:', optimizer.param_groups[0]['lr'])
    for i in range(params['epoch']):
        train_loss, train_top1, train_top5, batch_time, data_time = train(
            model, train_loader, optimizer, criterion)
        valid_loss, val_top1, val_top5 = valid(model, val_loader, optimizer, criterion)
        scheduler.step(valid_loss)

        print('epoch:', str(i + 1) + '/' + str(params['epoch']))
        print('data time:%0.3f' % data_time.avg,
              'batch time:%0.3f' % batch_time.avg,
              'epoch time:%0.3f' % batch_time.sum)
        print('train loss:%0.8f' % train_loss, 'top1:%0.2f%%' % train_top1,
              'top5:%0.2f%%' % train_top5, 'lr:', optimizer.param_groups[0]['lr'])
        print('valid loss:%0.8f' % valid_loss, 'top1:%0.2f%%' % val_top1,
              'top5:%0.2f%%' % val_top5)

        with open(params['log'] + 'bert_classifylog_' + cur_time + '.txt', 'a') as f:
            f.write('epoch:' + str(i + 1) + '/' + str(params['epoch']) + '\n')
            f.write('data time:%0.3f ' % data_time.avg +
                    'batch time:%0.3f ' % batch_time.avg +
                    'epoch time:%0.3f' % batch_time.sum + '\n')
            f.write('train loss:%0.8f ' % train_loss +
                    'top1:%0.2f%% ' % train_top1 +
                    'top5:%0.2f%% ' % train_top5 +
                    'lr:' + str(optimizer.param_groups[0]['lr']) + '\n')
            f.write('valid loss:%0.8f ' % valid_loss +
                    'top1:%0.2f%% ' % val_top1 +
                    'top5:%0.2f%%' % val_top5 + '\n')
            f.write('************************************\n')

        writer.add_scalar('train loss', train_loss, i)
        writer.add_scalar('valid loss', valid_loss, i)
        writer.add_scalar('train top1', train_top1, i)
        writer.add_scalar('valid top1', val_top1, i)
        writer.add_scalar('train top5', train_top5, i)
        writer.add_scalar('valid top5', val_top5, i)

        # Keep only the checkpoint with the lowest validation loss.
        if valid_loss < min_loss:
            torch.save(model.state_dict(),
                       params['save_path'] + 'bert_classifymodel_' + cur_time + '.pth')
            print('saved best model to --->', params['save_path'])
            min_loss = valid_loss
    writer.close()
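
# main() above unpacks train() into (loss, top1, top5, batch_time, data_time)
# and valid() into (loss, top1, top5), and reads .avg/.sum off the timing
# meters. A minimal sketch of helpers with matching signatures follows; the
# AverageMeter class, the accuracy() helper and the exact metric computation
# are assumptions, not part of the original file.


class AverageMeter:
    """Tracks a running sum and average, used here for losses and timings."""
    def __init__(self):
        self.sum, self.count, self.avg = 0.0, 0, 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy(output, target, topk=(1,)):
    """Top-k accuracy in percent for a batch of logits."""
    maxk = max(topk)
    _, pred = output.topk(maxk, dim=1)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [correct[:k].reshape(-1).float().sum().item() * 100.0 / target.size(0)
            for k in topk]


def train(model, loader, optimizer, criterion):
    model.train()
    losses, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter()
    batch_time, data_time = AverageMeter(), AverageMeter()
    end = time.time()
    for data, label in loader:
        data_time.update(time.time() - end)
        data = data.cuda(params['gpu'][0])
        label = label.cuda(params['gpu'][0])
        output = model(data)
        loss = criterion(output, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        acc1, acc5 = accuracy(output, label, topk=(1, 5))
        losses.update(loss.item(), data.size(0))
        top1.update(acc1, data.size(0))
        top5.update(acc5, data.size(0))
        batch_time.update(time.time() - end)
        end = time.time()
    return losses.avg, top1.avg, top5.avg, batch_time, data_time


def valid(model, loader, optimizer, criterion):
    # optimizer is unused; the parameter is kept to match the call in main().
    model.eval()
    losses, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter()
    with torch.no_grad():
        for data, label in loader:
            data = data.cuda(params['gpu'][0])
            label = label.cuda(params['gpu'][0])
            output = model(data)
            loss = criterion(output, label)
            acc1, acc5 = accuracy(output, label, topk=(1, 5))
            losses.update(loss.item(), data.size(0))
            top1.update(acc1, data.size(0))
            top5.update(acc5, data.size(0))
    return losses.avg, top1.avg, top5.avg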
# The excerpt below began mid-statement; the loop headers and the definition
# of nums are reconstructed from context ('args.epochs' is an assumed name).
nums = len(train_dataloader)  # batches per epoch
for epoch in range(args.epochs):
    for step, (images, labels) in enumerate(train_dataloader):
        images = Variable(images.cuda(), requires_grad=False)
        labels = Variable(labels.cuda(), requires_grad=False)
        # Zero the gradients: the gradient of a batch's loss w.r.t. a weight
        # is the sum of the per-sample gradients, so it must be cleared
        # before each backward pass.
        optimizer.zero_grad()
        losses = model(images, labels)
        losses.backward()
        # Update all parameters.
        optimizer.step()
        print("epochs:{} step:{},losses:{:.4f},lr:{:.6f}".format(
            epoch + 1, epoch * nums + step, losses.item(),
            optimizer.state_dict()['param_groups'][0]['lr']))
        logs.write(str(epoch * nums + step) + " " + str(losses.item()) + " " +
                   str(optimizer.state_dict()['param_groups'][0]['lr']) + "\n")
    if epoch % args.log == 0:
        torch.save({
            'models': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epochs': epoch,
        }, "./models/classes_{}.pth".format(epoch))
    if epoch % 4 == 0:
        # Periodic validation; no_grad avoids building autograd graphs here.
        model.eval()
        with torch.no_grad():
            for s, (images, labels) in enumerate(valid_dataloader):
                images = Variable(images.cuda(), requires_grad=False)
                labels = Variable(labels.cuda(), requires_grad=False)
                losses = model(images, labels)
                valid_logs.write(str(epoch + 1) + " " + str(losses.item()) + "\n")
        model.train()
logs.close()
valid_logs.close()
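
# A minimal sketch of resuming from a checkpoint written by the loop above.
# The 'models'/'optimizer'/'epochs' keys mirror the torch.save call; the file
# name is illustrative.
ckpt = torch.load("./models/classes_4.pth", map_location='cpu')
model.load_state_dict(ckpt['models'])
optimizer.load_state_dict(ckpt['optimizer'])
start_epoch = ckpt['epochs'] + 1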