import os
import sys
import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn

# net, resume, save_path, model_name, use_cuda and train_loader are
# defined earlier in the full script; only this excerpt is shown.

if resume:
    # Load checkpoint.
    print('==> Resuming from checkpoint..')
    assert os.path.isdir(save_path), 'Error: no checkpoint directory found!'
    checkpoint = torch.load(save_path + '/%s_ckpt.t7' % model_name)
    net.load_state_dict(checkpoint['net'])

if use_cuda:
    Device = int(sys.argv[3])  # Device = 0
    net.cuda(Device)
    cudnn.benchmark = True

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    net.parameters(),
    lr=0.1,
    momentum=0.9,
    weight_decay=1e-4,
    nesterov=True,
)


def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    a = time.time()  # epoch start time
    for batch_idx, (data, target) in enumerate(train_loader):
        if use_cuda:
            data, target = data.cuda(Device), target.cuda(Device)
        # (assumed continuation: the excerpt is truncated here, so a
        # standard SGD training step is reconstructed below)
        optimizer.zero_grad()
        output = net(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = output.max(1)
        correct += predicted.eq(target).sum().item()
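# A minimal sketch (an assumption, not from the source) of the save-side
# counterpart that the resume branch above expects: a dict holding the model
# weights under the 'net' key, written to '<save_path>/<model_name>_ckpt.t7'.
def save_checkpoint(net, save_path, model_name):
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    torch.save({'net': net.state_dict()}, save_path + '/%s_ckpt.t7' % model_name)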
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# (excerpt begins after "step 1/5 data": RMBDataset, LeNet, train_data,
# valid_dir, valid_transform, BATCH_SIZE, LR and MAX_EPOCH are defined
# earlier in the full script)

valid_data = RMBDataset(data_dir=valid_dir, transform=valid_transform)

# build the DataLoaders
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=BATCH_SIZE)

# ============================ step 2/5 model ============================
net = LeNet(classes=2)
net.initialize_weights()

# ============================ step 3/5 loss function ============================
criterion = nn.CrossEntropyLoss()  # choose the loss function

# ============================ step 4/5 optimizer ============================
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)  # choose the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)  # learning-rate decay policy

# ============================ step 5/5 training ============================
train_curve = list()
valid_curve = list()

for epoch in range(MAX_EPOCH):

    loss_mean = 0.
    correct = 0.
    total = 0.

    net.train()
    for i, data in enumerate(train_loader):
        # (assumed continuation: the excerpt is truncated here, so a
        # standard training step is reconstructed below)
        inputs, labels = data
        outputs = net(inputs)

        optimizer.zero_grad()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # track running statistics
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        loss_mean += loss.item()

    train_curve.append(loss_mean / len(train_loader))
    scheduler.step()  # decay the learning rate once per epoch
    # (a validation pass over valid_loader, feeding valid_curve, would follow here)
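# The loader above relies on an RMBDataset class that is not shown. As a
# rough sketch of what such a Dataset could look like, here is a hypothetical
# directory-per-class implementation (the name SimpleImageFolderDataset and
# the layout assumption are mine, not the source's):
import os
from PIL import Image
from torch.utils.data import Dataset

class SimpleImageFolderDataset(Dataset):
    def __init__(self, data_dir, transform=None):
        self.transform = transform
        self.samples = []
        # treat each subdirectory of data_dir as one class
        for label, cls in enumerate(sorted(os.listdir(data_dir))):
            cls_dir = os.path.join(data_dir, cls)
            for fname in sorted(os.listdir(cls_dir)):
                self.samples.append((os.path.join(cls_dir, fname), label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        path, label = self.samples[index]
        img = Image.open(path).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        return img, label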
from datetime import datetime

import torch

# apply_cuda, apply_var and load_data are project helpers defined elsewhere.


def train(batch_size=50, lr=0.01, data_folder='data', dataset_name='mnist',
          model_name='lenet', max_epochs=10, log_freq=100):
    # model definition
    if model_name == 'lenet':
        from model.lenet import LeNet
        model = LeNet()
    else:
        from model.modelzoo import create_model
        model, input_size = create_model(model_name, n_classes=120)
    model = apply_cuda(model)

    # data source
    if dataset_name == 'mnist':
        train_loader = load_data('train', batch_size, data_folder, dataset_name)
        eval_loader = load_data('test', batch_size, data_folder, dataset_name)
    else:
        train_loader = load_data('train', batch_size, data_folder, dataset_name, input_size)
        eval_loader = load_data('test', batch_size, data_folder, dataset_name, input_size)
    n_batches_train = len(train_loader)
    n_batches_eval = len(eval_loader)
    print(
        datetime.now(),
        'batch size = {}'.format(batch_size),
        'number of batches for training = {}'.format(n_batches_train),
        'number of batches for evaluation = {}'.format(n_batches_eval))

    # optimizer and loss definition
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

    for epoch in range(max_epochs):
        print(datetime.now(), 'epoch: {}/{}'.format(epoch + 1, max_epochs))

        # training set
        print('==== training phase ====')
        avg_loss = float(0)
        avg_acc = float(0)
        model.train()
        for step, (images, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            images, labels = apply_cuda(images), apply_cuda(labels)
            images, labels = apply_var(images), apply_var(labels)

            # forward pass; inception_v3 also returns auxiliary logits in
            # training mode, which contribute a weighted term to the loss
            if model_name == 'inception_v3':
                logits, aux_logits = model(images)
                loss1 = criterion(logits, labels)
                loss2 = criterion(aux_logits, labels)
                loss = loss1 + 0.4 * loss2
            else:
                logits = model(images)
                loss = criterion(logits, labels)

            _, pred = torch.max(logits.data, 1)
            bs_ = labels.data.size()[0]
            match_count = (pred == labels.data).sum()
            accuracy = float(match_count) / float(bs_)
            avg_loss += loss.item() / float(n_batches_train)
            avg_acc += accuracy / float(n_batches_train)

            # backward pass
            loss.backward()
            optimizer.step()

            if (step + 1) % log_freq == 0:
                print(
                    datetime.now(),
                    'training step: {}/{}'.format(step + 1, n_batches_train),
                    'loss={:.5f}'.format(loss.item()),
                    'acc={:.4f}'.format(accuracy))
        print(
            datetime.now(),
            'training ends with avg loss={:.5f}'.format(avg_loss),
            'and avg acc={:.4f}'.format(avg_acc))

        # validation set
        print('==== validation phase ====')
        avg_acc = float(0)
        model.eval()
        with torch.no_grad():  # no gradients needed during evaluation
            for images, labels in eval_loader:
                images, labels = apply_cuda(images), apply_cuda(labels)
                logits = model(images)
                _, pred = torch.max(logits.data, 1)
                bs_ = labels.data.size()[0]
                match_count = (pred == labels.data).sum()
                accuracy = float(match_count) / float(bs_)
                avg_acc += accuracy / float(n_batches_eval)
        print(
            datetime.now(),
            'evaluation results: acc={:.4f}'.format(avg_acc))

        # save the model for every epoch
        ckpt_path = '{}_{}_bs{}_lr{}_ep{}.pth'.format(
            model_name, dataset_name, batch_size, lr, epoch)
        torch.save({
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'avg_loss': avg_loss,
            'avg_acc': avg_acc}, ckpt_path)
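# apply_cuda and apply_var are referenced above but not shown. A plausible
# sketch (an assumption; the project's real helpers may differ): move
# tensors/modules to the GPU when one is available, and pass tensors through
# unchanged, since Variable was merged into Tensor in PyTorch 0.4.
def apply_cuda(x):
    return x.cuda() if torch.cuda.is_available() else x

def apply_var(x):
    return x  # no-op on PyTorch >= 0.4; kept for backward compatibility

# example invocation: LeNet on MNIST for a short run
if __name__ == '__main__':
    train(batch_size=50, lr=0.01, max_epochs=2)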