def lenet(root, path_train, path_test):
    dataset_train = dataset_LeNet(root + path_train, train=True)
    dataset_test = dataset_LeNet(root + path_test, train=False)
    trainloader = DataLoader(dataset_train, batch_size=250, shuffle=True)
    testloader = DataLoader(dataset_test, batch_size=1000)

    model = LeNet5()
    criterion = t.nn.CrossEntropyLoss()
    lr = 0.01
    optimizer = t.optim.SGD(model.parameters(), lr, momentum=0.4)

    for epoch in range(40):
        for _, (data, label) in enumerate(trainloader):
            model.train()
            optimizer.zero_grad()
            score = model(data)
            loss = criterion(score, label)
            loss.backward()
            optimizer.step()
        print("Epoch:%d loss:%f" % (epoch, loss.mean()))

    res = []
    for _, data in enumerate(testloader):
        model.eval()
        predict = model(data)
        predict = predict.detach().numpy().tolist()
        res += predict
    res = np.array(res)
    ans = np.argmax(res, axis=1)
    dataset_test.save_res(ans, "./images/res_MLP.csv")
def run(args):
    ms.context.set_context(mode=ms.context.GRAPH_MODE, device_target=args.device)
    dataset_sink_mode = False
    download_dataset(args.data_dir)
    # define the loss function
    net_loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    # create the network
    network = LeNet5()
    # define the optimizer
    net_opt = build_optimizer(args, network)
    config_ck = CheckpointConfig(save_checkpoint_steps=1875, keep_checkpoint_max=10)
    # save the network model and parameters for subsequent fine-tuning
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", config=config_ck)
    # group layers into an object with training and evaluation features
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
    if args.init_ckpt:
        load_ckpt(network, args.init_ckpt)
    train_net(network, model, args, ckpoint_cb, dataset_sink_mode)
def main():
    from model import LeNet5
    from datautils import MyMNIST
    from torchvision.datasets import MNIST
    from torchvision import transforms

    logging.basicConfig(level=logging.INFO)

    net = LeNet5(mnist=True)
    transform = transforms.ToTensor()
    dataset_train = MyMNIST("./dataset/mnist/", train=True, transform=transform, download=False)
    dataset_test = MyMNIST("./dataset/mnist/", train=False, transform=transform, download=False)

    params = {
        "lr": 0.0001,
        "momentum": 0.9,
        "weight_decay": 0.01,
        "gamma": 0.1,
        "decay_delay": None,
        "batch_size": 128,
        "epochs": 5,
        "early_stop": None,
        "num_workers": 4,
        "optimizer": "adam",
        "device": "cuda:0",
        "display_step": 100,
        "model_root": None,
        "load_latest": False,
    }
    train(net, dataset_train, dataset_test, **params)
def main():
    global args
    args = parser.parse_args()

    # load dataset
    if args.dataset == 'sign_mnist':
        loader = dataset_loader(True)
        if args.model == 'LeNet5':
            train_loader, test_loader = loader.load_sign_mnist(28, isGrayScale=args.gray_scale)
        elif args.model == 'ResNet34':
            train_loader, test_loader = loader.load_sign_mnist(224, isGrayScale=args.gray_scale)
        else:
            raise RuntimeError('unrecognized model name ' + repr(args.model))
    elif args.dataset == 'kinect_leap':
        loader = dataset_loader(False)
        if args.model == 'LeNet5':
            train_loader, test_loader = loader.load_kinect_leap(img_size=28, isGrayScale=args.gray_scale)
        elif args.model == 'ResNet34':
            train_loader, test_loader = loader.load_kinect_leap(img_size=224, isGrayScale=args.gray_scale)
        else:
            raise RuntimeError('unrecognized model name ' + repr(args.model))
    else:
        raise RuntimeError('unrecognized dataset name ' + repr(args.dataset))

    # load model
    if args.model == 'LeNet5':
        model = LeNet5(class_num=loader.class_num, is_gray_scale=args.gray_scale).cuda()
    elif args.model == 'ResNet34':
        model = ResNet34(class_num=loader.class_num, is_gray_scale=args.gray_scale).cuda()
    else:
        raise RuntimeError('unrecognized model name ' + repr(args.model))
    print(model)

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    # time counting
    start_time = time.time()
    for epoch in range(1, args.epoch + 1):
        train(model, train_loader, criterion, optimizer, epoch)
        test(model, test_loader, epoch)
    end_time = time.time()
    print('training process using ', end_time - start_time)

    # save model
    if args.save_model:
        saving_path = './trained_model/' + args.model + '.pth'
        torch.save(model, saving_path)
    return
def main(): """ Main function Here, you should instantiate 1) Dataset objects for training and test datasets 2) DataLoaders for training and testing 3) model 4) optimizer: SGD with initial learning rate 0.01 and momentum 0.9 5) cost function: use torch.nn.CrossEntropyLoss """ # write your codes here USE_CUDA = torch.cuda.is_available() DEVICE = torch.device("cuda" if USE_CUDA else "cpu") EPOCHS = 10 BATCH_SIZE = 64 model = LeNet5().to(DEVICE) #model = CustomMLP().to(DEVICE) #optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum = 0.5) optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5, weight_decay=0.001) d = "C:/Users/inolab/Desktop/DNN/CNN_homework/data" transform_train = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, ))]) transform_test = transforms.Compose([transforms.ToTensor()]) train_set = MNIST(data_dir=d, folder='train', transform=transform_train) train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=0) test_set = MNIST(data_dir=d, folder='test', transform=transform_test) tst_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=0) for epoch in range(1, EPOCHS + 1): trn_loss, tr_acc = train(model, train_loader, DEVICE, optimizer, epoch) tst_loss, acc = test(model, tst_loader, DEVICE) print('[{}] Test Loss : {:.4f}, Test Acc: {:.2f}%'.format( epoch, tst_loss, acc))
def evaluate_one_image():
    # directory of cat/dog test images downloaded from Baidu
    train = 'D:/workspace/python_dir/tersonflowdemo/dogvscat/dataset/data/test1/'
    image_array = get_one_image(train)

    with tf.Graph().as_default():
        BATCH_SIZE = 1   # only one image is read, so the batch size is set to 1
        N_CLASSES = 2    # two output neurons: the probabilities of cat and dog, [1, 0] or [0, 1]

        # convert the image format
        image = tf.cast(image_array, tf.float32)
        # image standardization
        # image = tf.image.per_image_standardization(image)  # do not add this line: the training images were not standardized
        # the image is originally 3-D [208, 208, 3]; reshape it into a 4-D tensor
        image = tf.reshape(image, [1, 208, 208, 3])
        logit = model.inference(image, BATCH_SIZE, N_CLASSES)
        # inference() returns raw scores without an activation, so apply softmax here
        logit = tf.nn.softmax(logit)

        # feed data into the model in the most basic way: through a placeholder
        x = tf.placeholder(tf.float32, shape=[208, 208, 3])

        # the directory where the trained model is stored
        logs_train_dir = 'D:/workspace/python_dir/tersonflowdemo/dogvscat/result/'
        # define the saver
        saver = tf.train.Saver()

        with tf.Session() as sess:
            print("Loading the model from the given path ...")
            # load the model into the session
            ckpt = tf.train.get_checkpoint_state(logs_train_dir)
            if ckpt and ckpt.model_checkpoint_path:
                global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
                saver.restore(sess, ckpt.model_checkpoint_path)
                print('Model loaded successfully, trained for %s steps' % global_step)
            else:
                print('Failed to load the model: checkpoint file not found')

            # feed the image into the model
            prediction = sess.run(logit, feed_dict={x: image_array})
            # index of the highest-probability class in the output
            max_index = np.argmax(prediction)
            print('Probability of cat: %.6f' % prediction[:, 0])
            print('Probability of dog: %.6f' % prediction[:, 1])
def main():
    args = parser.parse_args()

    model = torch.nn.DataParallel(LeNet5())
    if os.path.isfile(args.model):
        print("=> loading model '{}'".format(args.model))
        checkpoint = torch.load(args.model)
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded model '{}'".format(args.model))
    else:
        print("=> no checkpoint found at '{}'".format(args.model))

    sample_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.data, train=False,
                       transform=transforms.Compose([transforms.ToTensor()])),
        batch_size=1, shuffle=True, num_workers=0, pin_memory=True)

    # plot convolution layer kernels
    filter1 = model.module.conv1.weight.data.numpy()
    filter2 = model.module.conv2.weight.data.numpy()
    plot_conv_kernels(normalize(filter1), 'conv1_filters', args)
    plot_conv_kernels(normalize(filter2), 'conv2_filters', args)

    # plot feature maps
    for idx, (data, _) in enumerate(sample_loader):
        if idx == args.num_samples:
            break
        feature1 = model.module.feature_conv1(data)
        feature2 = model.module.feature_conv2(data)
        plot_feature_map(data, feature1, 'sample_#{}_conv1_feature_maps'.format(idx + 1), args)
        plot_feature_map(data, feature2, 'sample_#{}_conv2_feature_maps'.format(idx + 1), args)

    if not args.output:
        plt.show()
def build_model():
    global yan_chi, images, labels
    global acc, loss, c_loss
    global train_op, lr, global_step
    global x_train, y_train
    global x_test, y_test

    yan_chi = tf.placeholder(tf.float32)
    images = tf.placeholder(tf.float32, [None, 24, 24, 3])
    labels = tf.placeholder(tf.float32, [None, 10])

    x_train, y_train = LeNet5.train_input()
    x_test, y_test = LeNet5.test_input()

    logits = LeNet5.inference(images)
    acc = LeNet5.get_acc(logits, labels)
    c_loss, loss = LeNet5.get_loss(logits, labels)
    train_op, lr, global_step = LeNet5.get_op(yan_chi, loss)
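# Usage sketch (not from the original repository): build_model() only wires the
# graph into module-level globals, so a TF1 session is needed to actually run it.
# The meaning of the `yan_chi` placeholder, the value fed to it, and the exact
# shapes/encoding returned by LeNet5.train_input() are assumptions here.
def _sketch_run_training(num_steps=100):
    build_model()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        for step in range(num_steps):
            # pull one training batch from the queue-based input pipeline
            batch_x, batch_y = sess.run([x_train, y_train])
            # batch_y may need one-hot encoding to match the [None, 10] labels placeholder
            _, step_loss, step_acc = sess.run(
                [train_op, loss, acc],
                feed_dict={images: batch_x, labels: batch_y, yan_chi: 0.0})
            if step % 10 == 0:
                print("step %d loss %.4f acc %.4f" % (step, step_loss, step_acc))
        coord.request_stop()
        coord.join(threads)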
    [transforms.Scale((32, 32)), transforms.ToTensor()])
testset = torchvision.datasets.MNIST(root=r'./data', train=False, download=True,
                                     transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=1024, shuffle=False,
                                         num_workers=8)

# Model
print('==> Building model..')
net = LeNet5()
net = net.to(device)

# Load checkpoint.
print('==> Resuming from checkpoint..')
assert os.path.isdir(os.path.join(__input_dir__, 'checkpoint')), \
    'Error: no checkpoint directory found!'
checkpoint = torch.load(os.path.join(__input_dir__, 'checkpoint/ckpt_lbi_group_resume.t7'))
net.load_state_dict(checkpoint['net'])

criterion = nn.CrossEntropyLoss(size_average=True)


def test():
def main(): """ Main function Here, you should instantiate 1) Dataset objects for training and test datasets 2) DataLoaders for training and testing 3) model 4) optimizer: SGD with initial learning rate 0.01 and momentum 0.9 5) cost function: use torch.nn.CrossEntropyLoss """ # write your codes here if torch.cuda.is_available(): device = torch.device('cuda') else: device = torch.device('cpu') batch_size=64 trn_dataset = dataset.MNIST(data_dir='../data/train') trn_loader = torch.utils.data.DataLoader(trn_dataset, batch_size=batch_size, shuffle=True) tst_dataset = dataset.MNIST(data_dir='../data/test') tst_loader = torch.utils.data.DataLoader(tst_dataset, batch_size=batch_size, shuffle=True) LeNet = LeNet5().to(device) Custom = CustomMLP().to(device) # loss criterion = nn.CrossEntropyLoss() optimizer1 = optim.SGD(LeNet.parameters(), lr=0.01, momentum=0.9) optimizer2 = optim.SGD(Custom.parameters(), lr=0.01, momentum=0.9) # hyper-parameters num_epochs = 20 trn_loss_list1 = [] tst_loss_list1 = [] trn_acc1 = [] tst_acc1 = [] trn_loss_list2 = [] tst_loss_list2 = [] trn_acc2 = [] tst_acc2 = [] for epoch in range(num_epochs): print("epoch : ", (epoch+1)) print("LeNet") trn_loss, trn_acc = train(LeNet, trn_loader, device, criterion, optimizer1) trn_loss_list1.append(trn_loss / len(trn_loader)) trn_acc1.append(trn_acc) tst_loss, tst_acc = test(LeNet, tst_loader, device, criterion) tst_loss_list1.append(tst_loss / len(tst_loader)) tst_acc1.append(tst_acc) print("CustomMLP") trn_loss, trn_acc = train(Custom, trn_loader, device, criterion, optimizer2) trn_loss_list2.append(trn_loss / len(trn_loader)) trn_acc2.append(trn_acc) tst_loss, tst_acc = test(Custom, tst_loader, device, criterion) tst_loss_list2.append(tst_loss / len(tst_loader)) tst_acc2.append(tst_acc) plt.figure(figsize=(5,4)) x_range = range(len(trn_loss_list1)) plt.plot(x_range, trn_loss_list1, label="trn") plt.plot(x_range, tst_loss_list1, label="tst") plt.legend() plt.ylim(0, 1) plt.title('LeNet-5 Loss') plt.xlabel("training steps") plt.ylabel("loss") plt.grid() plt.figure(figsize=(5,4)) x_range = range(len(trn_acc1)) plt.plot(x_range, trn_acc1, label="trn") plt.plot(x_range, tst_acc1, label="tst") plt.legend() plt.ylim(0, 100) plt.title('LeNet-5 Accuracy') plt.xlabel("training steps") plt.ylabel("loss") plt.grid() plt.figure(figsize=(5,4)) x_range = range(len(trn_loss_list2)) plt.plot(x_range, trn_loss_list2, label="trn") plt.plot(x_range, tst_loss_list2, label="tst") plt.legend() plt.ylim(0, 1) plt.title('CustomMLP Loss') plt.xlabel("training steps") plt.ylabel("loss") plt.grid() plt.figure(figsize=(5,4)) x_range = range(len(trn_acc2)) plt.plot(x_range, trn_acc2, label="trn") plt.plot(x_range, tst_acc2, label="tst") plt.legend() plt.ylim(0, 100) plt.title('CustomMLP Accuracy') plt.xlabel("training steps") plt.ylabel("acc") plt.grid() # val acc with torch.no_grad(): corr_num = 0 total_num = 0 for j, val in enumerate(tst_loader): val_x, val_label = val if device: val_x = val_x.cuda() val_label =val_label.cuda() val_output = LeNet(val_x) model_label = val_output.argmax(dim=1) corr = val_label[val_label == model_label].size(0) corr_num += corr total_num += val_label.size(0) print("LeNet5 acc: {:.2f}".format(corr_num / total_num * 100)) with torch.no_grad(): corr_num = 0 total_num = 0 for j, val in enumerate(tst_loader): val_x, val_label = val if device: val_x = val_x.cuda() val_label =val_label.cuda() val_output = Custom(val_x) model_label = val_output.argmax(dim=1) corr = val_label[val_label == model_label].size(0) corr_num += corr 
total_num += val_label.size(0) print("CustomMLP acc: {:.2f}".format(corr_num / total_num * 100))
def get_model():
    model = LeNet5()
    model.to(device)
    return model, optim.SGD(model.parameters(), lr=BASE_LR)
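# Usage sketch (not part of the original script): get_model() returns the network
# together with a plain SGD optimizer at BASE_LR. The loop below is illustrative
# only and assumes `device`, `BASE_LR`, `nn`, and a `train_loader` already exist
# in the surrounding code.
def _sketch_train_one_epoch(train_loader):
    model, optimizer = get_model()
    criterion = nn.CrossEntropyLoss()
    model.train()
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()                    # clear accumulated gradients
        loss = criterion(model(data), target)    # forward pass + loss
        loss.backward()                          # backpropagation
        optimizer.step()                         # SGD update with lr=BASE_LR
    return model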
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ]))

index = np.random.randint(len(data_test))  # get random test image
real_img_label = data_test[index][1].numpy()
TIME_ID = time.time()
save_real_img_path = "real_img_label=%s_%s.jpg" % (real_img_label, TIME_ID)
test_img_tensor = data_test[index][0][0]
vutils.save_image(test_img_tensor.data.cpu().float(), save_real_img_path)
real_img_path = dilate(erode(save_real_img_path))
real_img = Image.open(real_img_path).convert("L")

# Set up models
BE_path = "train_baseline_lenet5/trained_weights2/weights/SERVER12-20190222-1834_E17S0_acc=0.9919.pth"
SE_path = "../Experiments/20190311-1249_improve_LT_BD-no-dropout-no-leak-info_add-DA-img-lossx0.1/weights/SERVER12-20190311-1249_SE_E5S0_testacc=0.4003.pth"
BE = LeNet5(BE_path).cuda()
SE = SmallLeNet5(SE_path).cuda()
# BE = SE

num_test = 100
fake_pred = []
for i in range(num_test):
    fake_pred.append(
        BE(transforms.ToTensor()(randomaffine(fake_img)).unsqueeze(0).cuda()).argmax().data.cpu().item())
fake_pred = np.array(fake_pred)
print("\nfake label = %s\n" % fake_img_label, fake_pred, np.sum(fake_pred == fake_img_label))

real_pred = []
for i in range(num_test):
weights_path = pjoin(project_path, "weights")  # to save torch model
if not args.resume:
    if os.path.exists(project_path):
        respond = "Y"  # input("The appointed project name has existed. Do you want to overwrite it (everything inside will be removed)? (y/n) ")
        if str.upper(respond) in ["Y", "YES"]:
            shutil.rmtree(project_path)
        else:
            exit(1)
if not os.path.exists(weights_path):
    os.makedirs(weights_path)
TIME_ID = "SERVER" + os.environ["SERVER"] + time.strftime("-%Y%m%d-%H%M")
log_path = pjoin(weights_path, "log_" + TIME_ID + ".txt")
log = sys.stdout if args.debug else open(log_path, "w+")

# Set up model
net = LeNet5(args.model)
net.cuda()

# Prepare data
data_train = datasets.MNIST('../data', train=True, download=True,
                            transform=transforms.Compose([
                                transforms.Resize((32, 32)),
                                transforms.ToTensor(),
                                transforms.Normalize((0.1307,), (0.3081,))
                            ]))
data_test = datasets.MNIST('../data', train=False, download=True,
def main():
    # parse arguments
    parser = argparse.ArgumentParser(description='MNIST experiment (training)')
    parser.add_argument('-s', '--imsize', type=int, default=28, help='input image size (resolution)')
    parser.add_argument('-l', '--logdir', type=str, help='log directory path')
    parser.add_argument('-d', '--datadir', type=str, default="./data/polyMNIST", help='data directory path')
    parser.add_argument('-n', '--epoch', type=int, default=20, help='number of epochs to train')
    parser.add_argument('-j', '--jobs', type=int, default=-1, help='number of threads to use')
    parser.add_argument('-b', '--batch', type=int, default=64, help='batch size to use')
    parser.add_argument('-c', '--cache_root', type=str, default='./cache', help='path to directory to cache')
    parser.add_argument('--lr', type=float, default=1e-2, help='learning rate')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M', help='SGD momentum (default: 0.5)')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    parser.add_argument('--no_cuda', action='store_true', default=False, help='do not use GPU')
    parser.add_argument('--optim', type=str, default='sgd', choices=['sgd', 'adam'], metavar='OPT', help='optimizer')
    parser.add_argument('--log_interval', type=int, default=100, help='logging interval')
    parser.add_argument('--no_decay', action='store_true', help='do not decay learning rate')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.logdir is None:
        dt = datetime.datetime.now().strftime("%m-%d-%Y_%H-%M-%S")
        args.logdir = "logs/log-" + dt

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if args.jobs < 0:
        args.jobs = multiprocessing.cpu_count()

    # logger and snapshot current code
    logger = init_logger(args)
    logger.info("%s", repr(args))

    # dataloader
    trainset = PMNISTDataSet(args.datadir, 'train', imsize=args.imsize, cache_root=args.cache_root)
    trainloader = DataLoader(trainset, batch_size=args.batch, shuffle=True, num_workers=args.jobs)
    testset = PMNISTDataSet(args.datadir, 'test', imsize=args.imsize, cache_root=args.cache_root)
    testloader = DataLoader(testset, batch_size=args.batch, shuffle=True, num_workers=args.jobs)

    model = LeNet5((args.imsize, args.imsize), MEAN, STD)
    model = model.double()

    if args.optim == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # LR decay scheduler
    if not args.no_decay:
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

    if use_cuda:
        model = nn.DataParallel(model)
    model.to(device)

    for epoch in range(args.epoch):  # loop over the dataset
        train(args, model, optimizer, epoch, device, trainloader, logger)
        test(args, model, epoch, device, testloader, logger)

    # save checkpoint
    torch.save(
        {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, os.path.join(args.logdir, 'checkpoint_final.pth.tar'))
                          train=False)

# get some random training images
dataiter = iter(train_loader)
sample = dataiter.next()

# create grid of images
img_grid = torchvision.utils.make_grid(sample['image'])

# show images
matplotlib_imshow(img_grid, one_channel=True)

# write to tensorboard
writer.add_image('four_digit_images', img_grid)

model = LeNet5()
optimizer = torch.optim.SGD(model.parameters(), lr=initial_learning_rate, momentum=sgd_momentum)
scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[2, 5, 8, 12], gamma=0.1)
# loss_fn = torch.nn.CrossEntropyLoss(size_average=True)
loss_fn = torch.nn.MSELoss()

if torch.cuda.is_available():
    print("Using GPU")
    model.cuda()

writer.add_graph(model, sample['image'])

running_loss = 0.0
def main(): """ Main function Here, you should instantiate 1) Dataset objects for training and test datasets 2) DataLoaders for training and testing 3) model 4) optimizer: SGD with initial learning rate 0.01 and momentum 0.9 5) cost function: use torch.nn.CrossEntropyLoss """ n_cpu = multiprocessing.cpu_count() data_dir_dict = {'train': '../data/train', 'test': '../data/test'} for _, data_dir in data_dir_dict.items(): if not os.path.isdir(data_dir): tar = tarfile.open('%s.tar' % data_dir, mode='r') tar.extractall(path='./data/') device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') num_epochs = 50 batch_size = 512 criterion = nn.CrossEntropyLoss() train_dataset = dataset.MNIST(data_dir_dict['train']) train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=n_cpu) test_dataset = dataset.MNIST(data_dir_dict['test']) test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=n_cpu) net = LeNet5().to(device) net_optimizer = SGD(net.parameters(), lr=.01, momentum=.9) net_trn_loss_list, net_trn_acc_list, net_tst_loss_list, net_tst_acc_list= [], [], [], [] print('--------------LeNet5--------------') for epoch in range(num_epochs): net_time = time.time() trn_loss, trn_acc = train(net, train_loader, device, criterion, net_optimizer) tst_loss, tst_acc = test(net, test_loader, device, criterion) print('%s epoch || %.5f time' % ((epoch + 1), time.time() - net_time)) print('avg train loss: %.5f | avg train acc: %.5f ||| avg test loss: %.5f | avg test acc: %.5f'\ %(trn_loss, trn_acc, tst_loss, tst_acc)) print('') net_trn_loss_list.append(trn_loss) net_trn_acc_list.append(trn_acc) net_tst_loss_list.append(tst_loss) net_tst_acc_list.append(tst_acc) mlp = CustomMLP().to(device) mlp_optimizer = SGD(mlp.parameters(), lr=.01, momentum=.9) mlp_trn_loss_list, mlp_trn_acc_list, mlp_tst_loss_list, mlp_tst_acc_list= [], [], [], [] print('--------------CustomMLP--------------') for epoch in range(num_epochs): mlp_time = time.time() trn_loss, trn_acc = train(mlp, train_loader, device, criterion, mlp_optimizer) tst_loss, tst_acc = test(mlp, test_loader, device, criterion) print('%s epoch || %.5f time' % ((epoch + 1), time.time() - mlp_time)) print('avg train loss: %.5f | avg train acc: %.5f ||| avg test loss: %.5f | avg test acc: %.5f'\ %(trn_loss, trn_acc, tst_loss, tst_acc)) print('') mlp_trn_loss_list.append(trn_loss) mlp_trn_acc_list.append(trn_acc) mlp_tst_loss_list.append(tst_loss) mlp_tst_acc_list.append(tst_acc) del train_dataset del train_loader train_dataset_aug = dataset.MNIST(data_dir_dict['train'], aug_option=True) train_loader_aug = DataLoader(dataset=train_dataset_aug, batch_size=batch_size, shuffle=True, num_workers=n_cpu) net_aug = LeNet5().to(device) net_optimizer_aug = SGD(net_aug.parameters(), lr=.01, momentum=.9) net_aug_trn_loss_list, net_aug_trn_acc_list, net_aug_tst_loss_list, net_aug_tst_acc_list= [], [], [], [] print('--------------LeNet5 with augmentation--------------') for epoch in range(num_epochs): net_time = time.time() trn_loss, trn_acc = train(net_aug, train_loader_aug, device, criterion, net_optimizer_aug) tst_loss, tst_acc = test(net_aug, test_loader, device, criterion) print('%s epoch || %.5f time' % ((epoch + 1), time.time() - net_time)) print('avg train loss: %.5f | avg train acc: %.5f ||| avg test loss: %.5f | avg test acc: %.5f'\ %(trn_loss, trn_acc, tst_loss, tst_acc)) print('') net_aug_trn_loss_list.append(trn_loss) net_aug_trn_acc_list.append(trn_acc) 
net_aug_tst_loss_list.append(tst_loss) net_aug_tst_acc_list.append(tst_acc) mlp_aug = CustomMLP().to(device) mlp_optimizer_aug = SGD(mlp_aug.parameters(), lr=.01, momentum=.9) mlp_aug_trn_loss_list, mlp_aug_trn_acc_list, mlp_aug_tst_loss_list, mlp_aug_tst_acc_list= [], [], [], [] print('--------------CustomMLP with augmentation--------------') for epoch in range(num_epochs): mlp_time = time.time() trn_loss, trn_acc = train(mlp_aug, train_loader_aug, device, criterion, mlp_optimizer_aug) tst_loss, tst_acc = test(mlp_aug, test_loader, device, criterion) print('%s epoch || %.5f time' % ((epoch + 1), time.time() - mlp_time)) print('avg train loss: %.5f | avg train acc: %.5f ||| avg test loss: %.5f | avg test acc: %.5f'\ %(trn_loss, trn_acc, tst_loss, tst_acc)) print('') mlp_aug_trn_loss_list.append(trn_loss) mlp_aug_trn_acc_list.append(trn_acc) mlp_aug_tst_loss_list.append(tst_loss) mlp_aug_tst_acc_list.append(tst_acc) net_result, net_aug_result, mlp_result, mlp_aug_result = {}, {}, {}, {} net_result['train_loss'] = net_trn_loss_list net_result['train_acc'] = net_trn_acc_list net_result['test_loss'] = net_tst_loss_list net_result['test_acc'] = net_tst_acc_list net_aug_result['train_loss'] = net_aug_trn_loss_list net_aug_result['train_acc'] = net_aug_trn_acc_list net_aug_result['test_loss'] = net_aug_tst_loss_list net_aug_result['test_acc'] = net_aug_tst_acc_list mlp_result['train_loss'] = mlp_trn_loss_list mlp_result['train_acc'] = mlp_trn_acc_list mlp_result['test_loss'] = mlp_tst_loss_list mlp_result['test_acc'] = mlp_tst_acc_list mlp_aug_result['train_loss'] = mlp_aug_trn_loss_list mlp_aug_result['train_acc'] = mlp_aug_trn_acc_list mlp_aug_result['test_loss'] = mlp_aug_tst_loss_list mlp_aug_result['test_acc'] = mlp_aug_tst_acc_list net_result = pd.DataFrame( net_result, columns=['train_loss', 'train_acc', 'test_loss', 'test_acc']) net_aug_result = pd.DataFrame( net_aug_result, columns=['train_loss', 'train_acc', 'test_loss', 'test_acc']) mlp_result = pd.DataFrame( mlp_result, columns=['train_loss', 'train_acc', 'test_loss', 'test_acc']) mlp_aug_result = pd.DataFrame( mlp_aug_result, columns=['train_loss', 'train_acc', 'test_loss', 'test_acc']) if not os.path.exists('../results/'): os.makedirs('../results/') net_result.to_csv('../results/LeNet5.csv', index=False) net_aug_result.to_csv('../results/LeNet5_aug.csv', index=False) mlp_result.to_csv('../results/MLP.csv', index=False) mlp_aug_result.to_csv('../results/MLP_aug.csv', index=False)
def main(net="lenet5", augmentation=False): device = torch.device("cpu") if net == "lenet5" : net = LeNet5() elif net == "mlp" : net = CustomMLP() else : print("Not support Networks") loss = torch.nn.CrossEntropyLoss() optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9) if os.path.isdir("../data/train_mnist") == True : pass else : print("Unzip train mnist") trn_dataset = '../data/train.tar' trn_dst_path = '../data/train_mnist' os.makedirs(trn_dst_path) with tarfile.TarFile(trn_dataset, 'r') as file: file.extractall(trn_dst_path) if os.path.isdir("../data/test_mnist") == True : pass else : print("Unzip test mnist") tst_dataset = '../data/test.tar' tst_dst_path = '../data/test_mnist' os.makedirs(tst_dst_path) with tarfile.TarFile(tst_dataset, 'r') as file: file.extractall(tst_dst_path) train_folder = "../data/train_mnist/train" test_folder = "../data/test_mnist/test" train_set = dataset.MNIST(train_folder) test_set = dataset.MNIST(test_folder) if augmentation == True : aug_train_set = dataset.MNIST(train_folder, aug=True) aug_test_Set = dataset.MNIST(test_folder) trn_loader = DataLoader(train_set, batch_size=128) tst_loader = DataLoader(test_set, batch_size=128) if augmentation == True : aug_train_loader = DataLoader(aug_train_set, batch_size=128) aug_test_loader = DataLoader(aug_test_Set, batch_size=128) train_logger = [] test_logger = [] for epoch in range(5) : if augmentation == False : training = train(net.to(device), trn_loader, device, loss, optimizer, epoch) tests = test(net.to(device), tst_loader, device, loss, epoch) train_logger.append(training) test_logger.append(tests) else : training = train(net.to(device), aug_train_loader, device, loss, optimizer, epoch) tests = test(net.to(device), tst_loader, device, loss, epoch) train_logger.append(training) test_logger.append(tests) return train_logger, test_logger
def train(self):
    """
    Train the model. If a saved model file already exists, skip training and load it directly.
    :return:
    """
    self.model = LeNet5(dropout_prob=self.dropout_prob, halve_conv_kernels=self.halve_conv_kernels)
    if self.has_cuda:
        self.model.cuda()

    # the model file already exists: load it and return
    if os.path.exists(self.model_path):
        print('Train: model file exists, skip training.')
        print('Train: loading model state from file [%s] ...' % self.model_path)
        self.model.load_state_dict(torch.load(self.model_path))
        return

    criterion = nn.CrossEntropyLoss(reduction='sum')
    optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-4, betas=(0.9, 0.99))

    idx = 0                     # batch counter
    is_stop = False             # whether to stop training
    best_loss = float('inf')    # best evaluation loss
    best_acc = 0.               # best evaluation accuracy
    best_batch_idx = 0          # batch index of the best result
    best_model_state = None     # best model state

    print('Train: start training model ...')
    self.model.train()
    for epoch in range(self.epochs):
        for i, (x, y) in enumerate(self.train_loader):
            idx += 1
            if self.has_cuda:
                x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()          # clear gradients
            output = self.model(x)         # forward pass to get predictions
            loss = criterion(output, y)    # compute loss
            loss.backward()                # backward pass to get gradients
            optimizer.step()               # update parameters

            # report progress every 100 batches
            if idx % 100 == 0:
                y = y.cpu()
                y_pred = output.argmax(dim=1).cpu()
                acc = metrics.accuracy_score(y, y_pred)  # training accuracy of this batch

                # evaluate accuracy and loss on the test set
                eval_acc, eval_loss = self.eval(self.test_loader)
                # switch back to train mode, since eval() put the model in eval mode
                self.model.train()

                suffix = ''
                # update the best state
                if eval_loss < best_loss or eval_acc > best_acc:
                    suffix = ' *'
                    best_batch_idx = idx
                    best_loss = min(best_loss, eval_loss)
                    best_acc = max(best_acc, eval_acc)
                    best_model_state = self.model.state_dict()

                msg = 'Train [Epoch {:>3}]: \tTrain Loss: {:7.3f}\t' + \
                      'Train Acc: {:>5.2%}\t' + \
                      'Eval Loss: {:7.3f}\tEval Acc: {:>5.2%}{}'
                print(msg.format(epoch + 1, loss.item(), acc, eval_loss, eval_acc, suffix))

                # stop training after more than 1000 consecutive batches without improvement
                if idx - best_batch_idx > 1000:
                    print('no optimization for more than 1000 batches, '
                          'auto stop training.')
                    is_stop = True
                    break
        if is_stop:
            break

    print('Train: end training model, best loss {:.3f}, best acc {:.2%}'.format(best_loss, best_acc))
    if not os.path.exists(self.output_dir):
        os.mkdir(self.output_dir)
    print('Train: saving model [%s] ...' % self.model_path)
    # save the best model state
    torch.save(best_model_state, self.model_path)
    # load the best model state
    self.model.load_state_dict(best_model_state)
def main(): """ Main function Here, you should instantiate 1) Dataset objects for training and test datasets 2) DataLoaders for training and testing 3) model 4) optimizer: SGD with initial learning rate 0.01 and momentum 0.9 5) cost function: use torch.nn.CrossEntropyLoss """ lenet = LeNet5() mlp = CustomMLP() mnist_Dataset_trn = MNIST(train_dir) mnist_Dataset_tst = MNIST(test_dir) trn_loader = DataLoader(mnist_Dataset_trn, batch_size=128, shuffle=True) tst_loader = DataLoader(mnist_Dataset_tst, batch_size=128, shuffle=True) criterion = nn.CrossEntropyLoss() optimizer_lenet = optim.SGD(lenet.parameters(), lr=0.001, momentum=0.9) optimizer_mlp = optim.SGD(mlp.parameters(), lr=0.001, momentum=0.9) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") trn_loss = [] train_acc = [] tst_loss = [] test_acc = [] test_loss, acc_ = test(lenet.to(device), tst_loader, device, criterion) for epoch in range(20): training_loss, acc = train(lenet.to(device), trn_loader, device, criterion, optimizer_lenet, epoch) trn_loss.append(training_loss) train_acc.append(acc) test_loss, acc_ = test(lenet, tst_loader, device, criterion) tst_loss.append(test_loss) test_acc.append(acc_) trn_loss_mlp = [] train_acc_mlp = [] tst_loss_mlp = [] test_acc_mlp = [] test_loss, acc_ = test(mlp.to(device), tst_loader, device, criterion) for epoch in range(20): training_loss, acc = train(mlp.to(device), trn_loader, device, criterion, optimizer_mlp, epoch) trn_loss_mlp.append(training_loss) train_acc_mlp.append(acc) test_loss, acc_ = test(mlp, tst_loader, device, criterion) tst_loss_mlp.append(test_loss) test_acc_mlp.append(acc_) fig = plt.figure() plt.plot(range(20), trn_loss, color='blue') plt.plot(range(20), trn_loss_mlp, color='red') plt.legend(['Train Loss Lent', 'Train Loss MLP'], loc='upper right') plt.plot(range(20), tst_loss, color='blue') plt.plot(range(20), tst_loss_mlp, color='red') plt.legend(['Test Loss Lent', 'Test Loss MLP'], loc='upper right') plt.plot(range(20), train_acc, color='blue') plt.plot(range(20), train_acc_mlp, color='red') plt.legend(['Train Accuracy Lent', 'Train Accuracy MLP'], loc='upper right') plt.plot(range(20), test_acc, color='blue') plt.plot(range(20), test_acc_mlp, color='red') plt.legend(['Test Accuracy Lent', 'Test Accuracy MLP'], loc='upper right')
def main(): """ Main function Here, you should instantiate 1) Dataset objects for training and test datasets 2) DataLoaders for training and testing 3) model 4) optimizer: SGD with initial learning rate 0.01 and momentum 0.9 5) cost function: use torch.nn.CrossEntropyLoss """ # ========== 1. data load ========== transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.1307], [0.3081])]) train_dataset = dataset.MNIST(data_dir = 'data/train.tar', transform=transform) test_dataset = dataset.MNIST(data_dir = 'data/test.tar', transform=transform) train_data = DataLoader(train_dataset, batch_size=64) test_data = DataLoader(test_dataset, batch_size=64) # ========== 2. Lenet 5 model ========== device = 'cuda' if torch.cuda.is_available() else 'cpu' training_epochs = 10 lenet_model = LeNet5().to(device) lenet_optimizer = torch.optim.SGD(lenet_model.parameters(), lr=0.01, momentum=0.9) lenet_cost_function = torch.nn.CrossEntropyLoss().to(device) print('Lenet 5 training start ') lenet_time = time.time() lenet_trn_loss, lenet_trn_acc, lenet_tst_loss, lenet_tst_acc = [],[],[],[] for epoch in range(training_epochs) : lenet_train_loss, lenet_train_acc = train(model=lenet_model, trn_loader=train_data, device=device, criterion=lenet_cost_function, optimizer=lenet_optimizer) lenet_test_loss, lenet_test_acc = test(model=lenet_model, tst_loader=test_data, device=device, criterion=lenet_cost_function) lenet_trn_loss.append(lenet_train_loss); lenet_trn_acc.append(lenet_train_acc) lenet_tst_loss.append(lenet_test_loss); lenet_tst_acc.append(lenet_test_acc) print('epochs {} training loss {} training accuracy {} validation loss {} validation accuracy {}'.format(epoch, lenet_train_loss, lenet_train_acc, lenet_test_loss, lenet_test_acc)) if epoch+1 == 10 : print('lenet execution time : {}'.format(time.time() - lenet_time)) # ========== 3. Regularized Lenet 5 model ========== regularized_lenet_model = regularized_LeNet5().to(device) regularized_lenet_optimizer = torch.optim.SGD(regularized_lenet_model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.001) regularized_lenet_cost_function = torch.nn.CrossEntropyLoss().to(device) print('Regularized Lenet 5 training start ') r_lenet_time = time.time() r_lenet_trn_loss, r_lenet_trn_acc, r_lenet_tst_loss, r_lenet_tst_acc = [],[],[],[] for epoch in range(training_epochs) : regularized_lenet_train_loss, regularized_lenet_train_acc = train(model=regularized_lenet_model, trn_loader=train_data, device=device, criterion=regularized_lenet_cost_function, optimizer=regularized_lenet_optimizer) regularized_lenet_test_loss, regularized_lenet_test_acc = test(model=regularized_lenet_model, tst_loader=test_data, device=device, criterion=regularized_lenet_cost_function) r_lenet_trn_loss.append(regularized_lenet_train_loss); r_lenet_trn_acc.append(regularized_lenet_train_acc) r_lenet_tst_loss.append(regularized_lenet_test_loss); r_lenet_tst_acc.append(regularized_lenet_test_acc) print('epochs {} training loss {} training accuracy {} validation loss {} validation accuracy {}'.format(epoch, regularized_lenet_train_loss, regularized_lenet_train_acc, regularized_lenet_test_loss, regularized_lenet_test_acc)) if epoch+1 == 10 : print('regularized execution time : {}'.format(time.time() - r_lenet_time)) # ========== 4. 
Custom model Load ========== custom_model = CustomMLP().to(device) custom_optimizer = torch.optim.SGD(custom_model.parameters(), lr=0.01, momentum=0.9) custom_cost_function = torch.nn.CrossEntropyLoss().to(device) print('Custom model training start') custom_time = time.time() custom_trn_loss, custom_trn_acc, custom_tst_loss, custom_tst_acc = [],[],[],[] for epoch in range(training_epochs) : custom_train_loss, custom_train_acc = train(model=custom_model, trn_loader=train_data, device=device, criterion=custom_cost_function, optimizer=custom_optimizer) custom_test_loss, custom_test_acc = test(model=custom_model, tst_loader=test_data, device=device, criterion=custom_cost_function) custom_trn_loss.append(custom_train_loss); custom_trn_acc.append(custom_train_acc) custom_tst_loss.append(custom_test_loss); custom_tst_acc.append(custom_test_acc) print('epochs {} training loss {} training accuracy {} validation loss {} validation accuracy {}'.format(epoch, custom_train_loss, custom_train_acc, custom_test_loss, custom_test_acc)) if epoch+1 == 10 : print('custom model execution time : {}'.format(time.time() - custom_time)) # ========== 5. visualization ========== # make loss and acc list for visualization trn_loss = [lenet_trn_loss, r_lenet_trn_loss, custom_trn_loss] trn_acc = [lenet_trn_acc, r_lenet_trn_acc, custom_trn_acc] tst_loss = [lenet_tst_loss, r_lenet_tst_loss, custom_tst_loss] tst_acc = [lenet_tst_acc, r_lenet_tst_acc, custom_tst_acc] # draw plot draw_plot(trn_loss, trn_acc, tst_loss, tst_acc)
MAX_STEP = 8000
learning_rate = 0.0001  # should be smaller than 0.001
print("I'm OK")

train_dir = 'D:/workspace/python_dir/tersonflowdemo/dogvscat/dataset/data/train/'  # folder with the training images
logs_train_dir = 'D:/workspace/python_dir/tersonflowdemo/dogvscat/result/'         # folder for the training results

train, train_label = test.get_files(train_dir)
train_batch, train_label_batch = test.get_batch(train, train_label, IMG_W, IMG_H, BATCH_SIZE, CAPACITY)

# define the training ops
sess = tf.Session()
train_logits = model.inference(train_batch, BATCH_SIZE, N_CLASSES)  # network structure
train_loss = model.losses(train_logits, train_label_batch)          # loss used for back-propagation
train_op = model.trainning(train_loss, learning_rate)
train_acc = model.evaluation(train_logits, train_label_batch)
# train_label_batch = tf.one_hot(train_label_batch, 2, 1, 0)

# define the test ops
summary_op = tf.summary.merge_all()
# create a writer for the log files
train_writer = tf.summary.FileWriter(logs_train_dir, sess.graph)
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())
coord = tf.train.Coordinator()
                      train=False, transform=transforms)

# define the data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
print("Data is loaded")

torch.manual_seed(RANDOM_SEED)
model = LeNet5(N_CLASSES).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

model, optimizer, _ = training_loop(model, criterion, optimizer, train_loader, valid_loader, N_EPOCHS, DEVICE)
torch.save(model, "/home/murugesh/PycharmProjects/LeNeT5_implementation_using_PyTorch/model/model.pt")

# Plotting Results
ROW_IMG = 10
N_ROWS = 5
        # compute the gradient of the loss w.r.t. all graph leaves (inputs)
        loss.backward()
        # perform a parameter update using calculated gradients
        optimizer.step()
        # bookkeeping
        epoch_loss += loss.item()
    return epoch_loss


if __name__ == '__main__':
    ### Instantiate model before optimizer if moving to cuda
    model = LeNet5(data_sets_dict['num_classes']).to(device)
    optimizer = optimizer_func(model.parameters(), **optimizer_args)

    ### Train
    # create up here since we use it to save our models
    EM = ExperimentManager(EXPERIMENTS_ROOT_DIR)

    # bookkeeping
    loss = []
    training_accuracy = []
    testing_accuracy = []
    best_training_accuracy = 0

    # Train the model
    for epoch in range(num_epochs):
def main(): """ Main function Here, you should instantiate 1) Dataset objects for training and test datasets 2) DataLoaders for training and testing 3) model 4) optimizer: SGD with initial learning rate 0.01 and momentum 0.9 5) cost function: use torch.nn.CrossEntropyLoss """ # roottrain='E:/document/programing/mnisttest/data/test' # roottest ='E:/document/programing/mnisttest/data/train' roottrain = 'data/train' roottest = 'data/test' trainloader = DataLoader( dataset=Dataset( root=roottrain), ################################################# batch_size=10, shuffle=True) testloader = DataLoader( dataset=Dataset( root=roottest), ################################################ batch_size=10, shuffle=False) device = torch.device("cuda:0") # model = CustomMLP() model = LeNet5() criterionLeNet = torch.nn.CrossEntropyLoss() optimizerLeNet = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) lenet5trnloss, lenet5trnacc = train(model=model, trn_loader=trainloader, device=device, criterion=criterionLeNet, optimizer=optimizerLeNet) lenet5tstloss, lenet5tstacc = test(model=model, tst_loader=testloader, device=device, criterion=criterionLeNet) model = CustomMLP() criterionCustomMLP = torch.nn.CrossEntropyLoss() optimizerCustomMLP = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) CustomMLPtrnloss, CustomMLPtrnacc = train(model=model, trn_loader=trainloader, device=device, criterion=criterionCustomMLP, optimizer=optimizerCustomMLP) CustomMLPtstloss, CustomMLPtstacc = test(model=model, tst_loader=testloader, device=device, criterion=criterionCustomMLP) # device = torch.device("cuda:0") # # model = CustomMLP() # model = LeNet5() # # criterion = torch.nn.CrossEntropyLoss() # optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) # lenet5trnloss,lenet5trnacc=train(model = model, trn_loader=trainloader, device=device, criterion=criterion, optimizer=optimizer) # lenet5tstloss,lenet5tstacc=test(model=model, tst_loader=testloader, device=device, criterion=criterion) # model = CustomMLP() # CustomMLPtrnloss,CustomMLPtrnacc=train(model = model, trn_loader=trainloader, device=device, criterion=criterion, optimizer=optimizer) # CustomMLPtstloss,CustomMLPtstacc=test(model=model, tst_loader=testloader, device=device, criterion=criterion) fig = plt.figure() lossplt = fig.add_subplot(2, 2, 1) plt.plot(range(int((trainloader.__len__()) / 100)), lenet5trnloss, color='g', label='LeNet5 train loss') plt.plot(range(int((testloader.__len__()) / 100)), lenet5tstloss, color='r', label='LeNet5 test loss') plt.plot(range(int((trainloader.__len__()) / 100)), CustomMLPtrnloss, color='b', label='Custom MLP train loss') plt.plot(range(int((testloader.__len__()) / 100)), CustomMLPtstloss, color='m', label='Custom MLP test loss') plt.legend(loc='upper right', bbox_to_anchor=(1.0, 1.0)) plt.xlabel('epoch (x100)') plt.ylabel('loss') plt.title('Loss') accplt = fig.add_subplot(2, 2, 2) plt.plot(range(int((trainloader.__len__()) / 100)), lenet5trnacc, color='g', label='LeNet5 train accuracy') plt.plot(range(int((testloader.__len__()) / 100)), lenet5tstacc, color='r', label='LeNet5 test accuracy') plt.plot(range(int((trainloader.__len__()) / 100)), CustomMLPtrnacc, color='b', label='Custom MLP train accuracy') plt.plot(range(int((testloader.__len__()) / 100)), CustomMLPtstacc, color='m', label='Custom MLP test accuracy') plt.legend(loc='upper right', bbox_to_anchor=(1.0, 1.0)) plt.xlabel('epoch (x100)') plt.ylabel('acc') plt.title('Accuracy') lenetplt = fig.add_subplot(2, 2, 3) 
plt.plot(range(int((trainloader.__len__()) / 100)), lenet5trnloss, color='g', label='train loss') plt.plot(range(int((testloader.__len__()) / 100)), lenet5tstloss, color='r', label='test loss') plt.plot(range(int((trainloader.__len__()) / 100)), lenet5trnacc, color='b', label='train accuracy') plt.plot(range(int((testloader.__len__()) / 100)), lenet5tstacc, color='m', label='test accuracy') plt.legend(loc='upper right', bbox_to_anchor=(1.0, 1.0)) plt.xlabel('epoch (x100)') plt.title('Loss and Accuracy of LeNet5') customplt = fig.add_subplot(2, 2, 4) plt.plot(range(int((trainloader.__len__()) / 100)), CustomMLPtrnloss, color='g', label='train loss') plt.plot(range(int((testloader.__len__()) / 100)), CustomMLPtstloss, color='r', label='test loss') plt.plot(range(int((trainloader.__len__()) / 100)), CustomMLPtrnacc, color='b', label='train accuracy') plt.plot(range(int((testloader.__len__()) / 100)), CustomMLPtstacc, color='m', label='test accuracy') plt.legend(loc='upper right', bbox_to_anchor=(1.0, 1.0)) plt.xlabel('epoch (x100)') plt.title('Loss and Accuracy of Custom MLP') plt.show()
def worker(gpu, ngpus_per_node, args):
    args.gpu = gpu
    device = torch.device("cpu")

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
        device = torch.device("cuda:" + str(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    model = LeNet5().to(device)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        model = torch.nn.DataParallel(model)

    train_dataset = datasets.MNIST(args.data, train=True, download=False,
                                   transform=transforms.Compose([
                                       transforms.ToTensor(),
                                       transforms.Normalize((0.1307,), (0.3081,))
                                   ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers, pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.data, train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, device, args)

        # evaluate on validation set
        acc = validate(val_loader, model, criterion, device, args)