def train():
    conf = Config()
    # Print the model configuration
    conf.dump()

    parser = argparse.ArgumentParser(description='Image classification model training')
    parser.add_argument(
        '--resume_checkpoint', action='store', type=str,
        default='model/checkpoint.pth',
        help='Resume the model from this checkpoint and continue training. '
             'If --resume_checkpoint is provided, the --arch, --learning_rate, '
             '--hidden_units, and --drop_p arguments are ignored.')
    args = parser.parse_args()

    # Load the data
    dataloaders, class_to_idx = load_data(conf.data_directory)

    # Resume from the checkpoint if the model file exists
    if args.resume_checkpoint and os.path.exists(args.resume_checkpoint):
        # Load the checkpoint
        print('resume_checkpoint exists, loading the model')
        model, optimizer, epoch, history = load_checkpoint(
            checkpoint_path=args.resume_checkpoint,
            load_optimizer=True,
            gpu=conf.cuda)
        start_epoch = epoch + 1
    else:
        # Create a new model and optimizer
        print('resume_checkpoint not set or the model file does not exist, '
              'creating a new model')
        model = create_model(
            arch=conf.arch,
            class_to_idx=class_to_idx,
            hidden_units=conf.hidden_units,
            drop_p=conf.dropout)
        optimizer = create_optimizer(model=model, lr=conf.learning_rate)
        start_epoch = 1
        history = None

    # Train the model
    history, best_epoch = train_model(
        dataloaders=dataloaders,
        model=model,
        optimizer=optimizer,
        gpu=conf.cuda,
        start_epoch=start_epoch,
        epochs=conf.epochs,
        train_history=history)

    # Evaluate the model on the test set
    test_acc = test_model(dataloader=dataloaders['test'], model=model, gpu=conf.cuda)
    print(f'Accuracy on the test set: {(test_acc * 100):.2f}%')

    # Save the model
    save_checkpoint(
        save_path=conf.save_path + conf.save_name,
        epoch=best_epoch,
        model=model,
        optimizer=optimizer,
        history=history)

    # Plot the training history
    plot_history(history)
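# A minimal entry-point sketch for the train() function above; running the
# module directly this way is an assumption, not part of the original script.
if __name__ == '__main__':
    train()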
    # train_recalls.append(avg_train_recall)
    print("Train loss: ", avg_train_loss)
    print("Train recall: ", avg_train_recall)
    writer.add_scalar("Loss/train", avg_train_loss, ep)
    writer.add_scalar("Recall/train", avg_train_recall, ep)

    avg_val_loss, avg_val_recall = model_utils.validate_model(
        rec_sys_model, loss_func, valid_loader, ep, top_k, val_display_step)
    # val_losses.append(avg_val_loss)
    # val_recalls.append(avg_val_recall)
    print("Val loss: ", avg_val_loss)
    print("Val recall: ", avg_val_recall)
    writer.add_scalar("Loss/val", avg_val_loss, ep)
    writer.add_scalar("Recall/val", avg_val_recall, ep)

    avg_test_loss, avg_test_recall = model_utils.test_model(
        rec_sys_model, loss_func, test_loader, ep, top_k, test_display_step)
    # test_losses.append(avg_test_loss)
    # test_recalls.append(avg_test_recall)
    writer.add_scalar("Loss/test", avg_test_loss, ep)
    writer.add_scalar("Recall/test", avg_test_recall, ep)

    if avg_test_recall > recall_max:
        print('Test loss changed from {:.6f} to {:.6f}'.format(
            loss_min, avg_test_loss))
        print('Test recall increased from {:.6f} to {:.6f}'.format(
            recall_max, avg_test_recall))
        print('Saving best model')
        # check_point.save_ckpt(checkpoint, True, model_name, checkpoint_dir, best_model_dir, ep)
        check_point.save_config_param(best_model_dir, model_name, config_param)
        torch.save(rec_sys_model, best_model_dir + model_name + '.pt')
        # Update the tracked best metrics; without this the model would be
        # re-saved on every epoch after the first improvement
        recall_max = avg_test_recall
        loss_min = avg_test_loss
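# The fragment above assumes a SummaryWriter and best-metric bookkeeping that
# are created before the epoch loop. A minimal setup sketch, reusing the same
# variable names (the log_dir choice is an assumption, not the original setup):
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='runs/' + model_name)
recall_max = float('-inf')  # best test recall seen so far
loss_min = float('inf')     # test loss recorded at the best-recall epoch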
def run(rank, model, train_pics, train_bsz):
    workers = [int(v) for v in str(args.learners).split('-')]
    # Note: list.append() returns None, so the group members must be built
    # with concatenation, not with .append()
    _group = [w for w in workers] + [rank]
    group = dist.new_group(_group)

    # Broadcast the initial parameters to all workers
    for p in model.parameters():
        scatter_p_list = [p.data for _ in range(len(workers) + 1)]
        dist.scatter(tensor=p.data, scatter_list=scatter_p_list, group=group)
    print('Model Sent Finished!')

    print('Begin!')
    transform = transforms.Compose([
        transforms.Resize(128),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    tmp = [
        (0, 0)
        for _ in range(int(math.ceil(train_pics / (len(workers) * train_bsz))))
    ]
    pre_time = datetime.datetime.now()
    for epoch in range(args.epochs):
        for batch_idx, (_, _) in enumerate(tmp):
            for param in model.parameters():
                tensor = torch.zeros_like(param.data)
                # FIXME FIXED: every tensor in gather_list must be a fresh
                # object, otherwise gather misbehaves
                gather_list = [
                    torch.zeros_like(param.data)
                    for _ in range(len(workers) + 1)
                ]
                dist.gather(tensor=tensor, gather_list=gather_list, group=group)
                # Average the workers' deltas and apply the update
                tensor = sum(gather_list) / len(workers)
                param.data -= tensor
                scatter_list = [param.data for _ in range(len(workers) + 1)]
                dist.scatter(tensor=tensor, scatter_list=scatter_list, group=group)
            print('Done {}/{}!'.format(batch_idx, len(tmp)))
        print('Done Epoch {}/{}!'.format(epoch + 1, args.epochs))
    end_time = datetime.datetime.now()

    # Evaluate the accuracy of the parameter-server-side model
    h, remainder = divmod((end_time - pre_time).seconds, 3600)
    m, s = divmod(remainder, 60)
    time_str = "Time %02d:%02d:%02d" % (h, m, s)
    test_dataset = datasets.CIFAR10(args.data_dir, train=False, download=False,
                                    transform=transform)
    criterion = torch.nn.CrossEntropyLoss()
    test_data = DataLoader(test_dataset, batch_size=128, shuffle=True)
    test_loss, acc = test_model(dist.get_rank(), model, test_data,
                                criterion=criterion)
    print('total time ' + str(time_str))
    with open('./result_' + str(rank) + '_' + args.model + '.txt', 'a') as f:
        f.write('Rank: ' + str(rank) + ', \tEpoch: ' + str(args.epochs) +
                ', \tTestLoss: ' + str(test_loss) + ', \tTestAcc: ' + str(acc) +
                ', \tTotalTime: ' + str(time_str) + '\n')
def run(rank, workers, model, save_path, train_data, test_data):
    # Receive the initial model parameters from the parameter server.
    # Note: list.append() returns None, so the group members must be built
    # with concatenation, not with .append()
    _group = [w for w in workers] + [0]
    group = dist.new_group(_group)

    for p in model.parameters():
        tmp_p = torch.zeros_like(p)
        dist.scatter(tensor=tmp_p, src=0, group=group)
        p.data = tmp_p
    print('Model recved successfully!')

    optimizer = MySGD(model.parameters(), lr=0.01, momentum=0.5)
    criterion = torch.nn.CrossEntropyLoss()

    print('Begin!')
    for epoch in range(args.epochs):
        pre_time = datetime.datetime.now()
        model.train()

        # For AlexNet, decay the learning rate at the specified epochs
        if args.model == 'AlexNet':
            if epoch + 1 in [40, 60]:
                for param_group in optimizer.param_groups:
                    param_group['lr'] *= 0.1
                    print('LR Decreased! Now: {}'.format(param_group['lr']))

        epoch_train_loss = 0
        for batch_idx, (data, target) in enumerate(train_data):
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizer.get_delta_w()

            # Synchronization: send the local deltas to the PS and receive
            # the averaged update back
            for idx, param in enumerate(model.parameters()):
                dist.gather(tensor=delta_ws[idx], dst=0, group=group)
                recv = torch.zeros_like(delta_ws[idx])
                dist.scatter(tensor=recv, src=0, group=group)
                param.data = recv

            epoch_train_loss += loss.data.item()
            print('Rank {}, Epoch {}, Batch {}/{}, Loss: {}'.format(
                rank, epoch, batch_idx, len(train_data), loss.data.item()))

        end_time = datetime.datetime.now()
        h, remainder = divmod((end_time - pre_time).seconds, 3600)
        m, s = divmod(remainder, 60)
        time_str = "Time %02d:%02d:%02d" % (h, m, s)
        epoch_train_loss /= len(train_data)
        epoch_train_loss = format(epoch_train_loss, '.4f')

        # Evaluate on the test set after each epoch
        test_loss, acc = test_model(rank, model, test_data, criterion=criterion)
        print('total time ' + str(time_str))
        with open('./result_' + str(rank) + '_' + args.model + '.txt', 'a') as f:
            f.write('Rank: ' + str(rank) + ', \tEpoch: ' + str(epoch + 1) +
                    ', \tTrainLoss: ' + str(epoch_train_loss) +
                    ', \tTestLoss: ' + str(test_loss) +
                    ', \tTestAcc: ' + str(acc) +
                    ', \tTime: ' + str(time_str) + '\n')

        if (epoch + 1) % 5 == 0:
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            torch.save(
                model.state_dict(),
                save_path + '/' + args.model + '_' + str(epoch + 1) + '.pkl')
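# Both run() functions above assume torch.distributed is already initialized
# before they are called. A minimal launch sketch, assuming a TCP rendezvous
# and the gloo backend (the address, port, and world size are placeholders,
# not the original setup):
import torch.distributed as dist

def init_processes(rank, world_size):
    dist.init_process_group(backend='gloo',
                            init_method='tcp://127.0.0.1:29500',
                            rank=rank,
                            world_size=world_size)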
def run(rank, workers, model, save_path, train_data, test_data):
    # Build the two communication groups used for the chunked synchronization
    level_0 = [int(v) for v in str(args.level_0).split('-')]
    level_1 = [int(v) for v in str(args.level_1).split('-')]
    _level_0_group = [w for w in level_0]
    level_0_group = dist.new_group(_level_0_group)
    _level_1_group = [w for w in level_1]
    level_1_group = dist.new_group(_level_1_group)

    optimizer = MySGD(model.parameters(), lr=0.01, momentum=0.5)
    criterion = torch.nn.CrossEntropyLoss()
    print('The model was successfully initialized!')

    print('Begin!')
    for epoch in range(args.epochs):
        pre_time = datetime.datetime.now()
        model.train()

        # For AlexNet, decay the learning rate at the specified epochs
        if args.model == 'AlexNet':
            if epoch + 1 in [40, 60]:
                for param_group in optimizer.param_groups:
                    param_group['lr'] *= 0.1
                    print('LR Decreased! Now: {}'.format(param_group['lr']))

        epoch_train_loss = 0
        epoch_train_acc = 0
        for batch_idx, (data, target) in enumerate(train_data):
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            delta_ws = optimizer.get_delta_w()

            # Synchronization: split each delta into two chunks and exchange
            # them over the two groups, one round per (chunk, group) pairing
            for idx, param in enumerate(model.parameters()):
                global split_tensor
                split_tensor = list(torch.chunk(delta_ws[idx], 2, 0))

                thread1 = MyThread(0, len(_level_0_group), level_0_group)
                thread2 = MyThread(1, len(_level_1_group), level_1_group)
                thread1.start()
                thread2.start()
                thread1.join()
                thread2.join()

                thread1 = MyThread(0, len(_level_1_group), level_1_group)
                thread2 = MyThread(1, len(_level_0_group), level_0_group)
                thread1.start()
                thread2.start()
                thread1.join()
                thread2.join()

                param.data -= torch.cat((split_tensor[0], split_tensor[1]), 0)

            epoch_train_loss += loss.data.item()
            epoch_train_acc += get_acc(output, target)
            print('Rank {}, Epoch {}, Batch {}/{}, Loss: {}'.format(
                rank, epoch, batch_idx, len(train_data), loss.data.item()))

        end_time = datetime.datetime.now()
        h, remainder = divmod((end_time - pre_time).seconds, 3600)
        m, s = divmod(remainder, 60)
        time_str = "Time %02d:%02d:%02d" % (h, m, s)
        epoch_train_loss /= len(train_data)
        epoch_train_loss = format(epoch_train_loss, '.4f')

        # Evaluate on the test set after each epoch
        test_loss, acc = test_model(rank, model, test_data, criterion=criterion)
        print('total time ' + str(time_str))
        with open('./result_' + str(rank) + '_' + args.model + '.txt', 'a') as f:
            f.write('Rank: ' + str(rank) + ', \tEpoch: ' + str(epoch + 1) +
                    ', \tTrainLoss: ' + str(epoch_train_loss) +
                    ', \tTrainAcc: ' + str(epoch_train_acc / len(train_data)) +
                    ', \tTestLoss: ' + str(test_loss) +
                    ', \tTestAcc: ' + str(acc) +
                    ', \tTime: ' + str(time_str) + '\n')

        if (epoch + 1) % 5 == 0:
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            torch.save(model.state_dict(),
                       save_path + '/' + args.model + '_' + str(epoch + 1) + '.pkl')
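# MyThread is not shown in this snippet. A hypothetical sketch of what it
# might do, assuming each thread synchronizes one chunk of the global
# split_tensor over its process group and averages it (this is an assumption,
# not the author's actual implementation):
import threading

class MyThread(threading.Thread):
    def __init__(self, chunk_idx, group_size, group):
        super().__init__()
        self.chunk_idx = chunk_idx
        self.group_size = group_size
        self.group = group

    def run(self):
        # Sum the assigned chunk across the group, then average it in place
        dist.all_reduce(split_tensor[self.chunk_idx], op=dist.ReduceOp.SUM,
                        group=self.group)
        split_tensor[self.chunk_idx] /= self.group_size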
if pre_tr_model == 'vgg16':
    input_units = model.classifier[0].in_features
    model.name = 'vgg16'
elif pre_tr_model == 'vgg19':
    input_units = model.classifier[0].in_features
    model.name = 'vgg19'
elif pre_tr_model == 'densenet':
    input_units = model.classifier.in_features
    model.name = 'densenet'
elif pre_tr_model == 'alexnet':
    input_units = model.classifier[1].in_features
    model.name = 'alexnet'

# Build the classifier head of the model
model = build_classifier(model, input_units, hidden_units, dropout)
print(model)

# Set criterion and optimizer
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.classifier.parameters(), learning_rate)
model.to(device)

# Train the model
model = train_model(model, epochs, trainloader, validloader, criterion,
                    optimizer, device)

# Test the model
test_model(model, testloader, device)

# Save the model
save_model(model, train_data, save_dir)
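# build_classifier is defined elsewhere. A minimal sketch of what such a head
# could look like, assuming a single hidden layer and a LogSoftmax output to
# match the NLLLoss criterion above; the frozen features, layer sizes, and
# class count are assumptions, not the original code:
from torch import nn

OUTPUT_UNITS = 102  # placeholder class count; the original value is not shown here

def build_classifier(model, input_units, hidden_units, dropout):
    for param in model.parameters():
        param.requires_grad = False  # freeze the pretrained feature extractor
    model.classifier = nn.Sequential(
        nn.Linear(input_units, hidden_units),
        nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(hidden_units, OUTPUT_UNITS),
        nn.LogSoftmax(dim=1))
    return model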
from model_utils import train_model, test_model
from cnn_model_with_output_conv import CNNModel

model = CNNModel()
train_model(model, "cnn_model_with_output_conv", 20000)
test_model(model, "cnn_model_with_output_conv")
def test_model(model, ds_config):
    datasets = ds_config_to_datasets(ds_config)
    model_utils.test_model(model, datasets["test"], MIN_CLASS_LABEL,
                           MAX_CLASS_LABEL, "muh_confusion.png")
# Constants
# output_cats = 102  # number of flower classifications (could be made a
# command-line input for other training runs)
args = get_args_train()

if args.device == 'gpu' and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    # Fall back to the CPU so that `device` is always defined
    print("Model should be trained on GPU; enable and select --gpu gpu for training")
    device = torch.device('cpu')

train_data, test_data, validation_data, trainloader, testloader, validationloader = load_data(
    args.data_directory)
pretrain_model, arch_inFeatures = pretrained_model(args.arch)
model, criterion = create_classifier(pretrain_model, arch_inFeatures,
                                     args.hidden_units, args.output_cats)
optimizer = optim.Adam(model.classifier.parameters(), lr=args.lr)
trained_model = train_model(model, args.epochs, trainloader, validationloader,
                            device, optimizer, criterion)
tested_model = test_model(trained_model, testloader, device, optimizer, criterion)
save_checkpoint(trained_model, args.save_directory, args.arch, train_data,
                optimizer, args.epochs, args.hidden_units)
flat_dim = get_flat_dim(INPUT_DIM, N_CONV, CONV_FILTERS, K_SIZES, P_KERNELS,
                        STRIDES, P_STRIDES, PADDINGS)
model = ConvNet(N_CONV, N_POOL, N_FC, CONV_FILTERS, K_SIZES, P_KERNELS,
                STRIDES, P_STRIDES, PADDINGS, FC_DIMS, N_MLP, MLP_DIMS,
                BATCH_TILE, INPUT_DIM[0], flat_dim, DEVICE).to(DEVICE)

# --- IHC pretrained weights ---
if os.path.exists(cnn_file):
    weights = torch.load(cnn_file, map_location=DEVICE)
    model.load_state_dict(weights['state_dict'], strict=False)
    model.to(DEVICE)
else:
    print('WARNING: model file does not exist!')

if mode == 'inference':
    predictions = inference(DEVICE, model, AGGREGATION, data_path,
                            BATCH_TILE=BATCH_TILE)
elif mode == 'test':
    criterion = nn.CrossEntropyLoss()
    ACC, F1, precision, recall = test_model(DEVICE, model, AGGREGATION,
                                            criterion, data_path,
                                            BATCH_TILE=BATCH_TILE)
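# Note: with strict=False, load_state_dict silently skips mismatched keys.
# A small sketch that surfaces them for debugging (not part of the original
# script); load_state_dict returns a named tuple of the incompatible keys:
incompatible = model.load_state_dict(weights['state_dict'], strict=False)
if incompatible.missing_keys or incompatible.unexpected_keys:
    print('missing keys:', incompatible.missing_keys)
    print('unexpected keys:', incompatible.unexpected_keys)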