helper.imshow(image[0, :])

# %% [markdown]
# # Train a network
#
# To make things more concise here, I moved the model architecture and training
# code from the last part to a file called `fc_model`. Importing this, we can
# easily create a fully-connected network with `fc_model.Network`, and train the
# network using `fc_model.train`. I'll use this model (once it's trained) to
# demonstrate how we can save and load models.

# %%
# Build the classifier: 784 inputs (28x28 images), 10 classes, three hidden layers.
hidden_layers = [512, 256, 128]
model = fc_model.Network(784, 10, hidden_layers)

# NLL loss pairs with the log-softmax output of fc_model.Network.
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# %%
fc_model.train(model, trainloader, testloader, criterion, optimizer, epochs=2)

# %% [markdown]
# ## Saving and loading networks
#
# As you can imagine, it's impractical to train a network every time you need
# to use it. Instead, we can save trained networks then load them later to
# train more or use them for predictions.
#
# The parameters for PyTorch networks are stored in a model's `state_dict`.
# We can see the state dict contains the weight and bias matrices for each of
# our layers.

# %%
print("Our model: \n\n", model, '\n')
print("The state dict keys: \n\n", model.state_dict().keys())

# %% [markdown]
# The simplest thing to do is simply save the state dict with `torch.save`.
# For example, we can save it to a file `'checkpoint.pth'`.
def main():
    """Train a fully-connected MNIST classifier and save it as a checkpoint.

    Command-line flags control batch sizes, epoch count, learning rate, the
    checkpoint path, and whether to reload the saved checkpoint afterwards to
    verify it (`--verify-model`).  Exits early (without training) if the
    checkpoint file already exists.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--checkpoint', type=str,
                        default="./model_checkpoint/checkpoint.pth",
                        help='Path to save Check point')
    parser.add_argument('--verify-model', action="store_true", default=False,
                        help='use for to verify model file')
    parser.add_argument('--debug', action="store_true", default=False,
                        help='use for to print debug logs')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    print("\n\n----------------\nDevice Used to process:", device.type)

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    # MNIST images are single-channel, so Normalize needs 1-element tuples.
    # (Three-channel stats raise a runtime error on this dataset.)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=True,
                       transform=transform),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False, download=True,
                       transform=transform),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    # Resolve the checkpoint path once and use it for check, save, and load.
    checkpoint_path = os.path.join(os.getcwd(), args.checkpoint)
    if args.debug:
        print("\n\nPath for checkpoint file :{}\n\n".format(checkpoint_path))
    if os.path.isfile(checkpoint_path):
        # Honor the message: do not silently overwrite an existing checkpoint.
        print(
            "File Check Status: File is already present \nRetry with different file name\n----------------\n"
        )
        return
    print(
        "\n----------------\nFile Check Status: File is not present!!!\nCreating new with name:{}\n----------------\n\n"
        .format(checkpoint_path))
    # Make sure the target directory exists before torch.save is called.
    os.makedirs(os.path.dirname(checkpoint_path) or ".", exist_ok=True)

    model = fc_model.Network(784, 10)
    model.to(device)  # harmless if fc_model.train also moves it
    criterion = nn.NLLLoss()
    # Use the parsed learning rate (default matches the old hard-coded 0.001).
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Train and validation
    fc_model.train(model, train_loader, test_loader, criterion, optimizer,
                   device, args.epochs)

    print("\n----------------\nOur model: \n\n", model, "\n----------------\n")
    if args.debug:
        print("The state dict keys: \n\n", model.state_dict().keys(),
              "\n----------------\n")

    checkpoint = {
        'input_size': 784,
        'output_size': 10,
        'state_dict': model.state_dict()
    }
    torch.save(checkpoint, checkpoint_path)

    if args.verify_model:
        model1 = load_checkpoint(checkpoint_path)
        model1.to(device)
        print("\n\nloaded model\n\n", model1)

        # Test out your network on one batch of test data.
        model1.eval()
        # `DataLoader` iterators no longer expose `.next()`; use built-in next().
        images, labels = next(iter(test_loader))
        img, labels = images.to(device), labels.to(device)

        # Calculate the class probabilities (softmax) for img; the network
        # outputs log-probabilities, so exp() recovers the probabilities.
        with torch.no_grad():
            # Call the module itself (not .forward) so hooks are honored.
            output = model1(img)
        ps = torch.exp(output)
        equality = (labels.data == ps.max(1)[1])
        print(ps)
        print(equality)
help="Set number of hidden_units", default=512, type=int) parser.add_argument("--epochs", help="Set number of epochs to train for", default=20, type=int) parser.add_argument("--gpu", help="Use GPU for inference", action="store_true") args = parser.parse_args() trainloader, validloader, testloader, class_to_idx = load_data( args.data_directory) model, criterion, optimizer = create_model(arch=args.arch, hidden_units=args.hidden_units, lr=args.learning_rate) model.class_to_idx = class_to_idx device = torch.device("cuda" if ( torch.cuda.is_available() and args.gpu) else "cpu") train(model, criterion, optimizer, trainloader, validloader, device=device, epochs=args.epochs, save_dir=args.save_dir)
## train model print('\n*** Trainning model ***\n\n') if gpu and torch.cuda.is_available(): device = 'cuda' else: device = 'cpu' criterion = nn.NLLLoss() if arch == 'vgg11': optimizer = optim.Adam(model.classifier.parameters(), lr=learnrate) elif arch == 'resnet50': optimizer = optim.Adam(model.fc.parameters(), lr=learnrate) model, train_losses, valid_losses = fc_model.train(model, criterion, optimizer, dataloaders['train'], dataloaders['valid'], epochs, device) ## save the checkpoint print('\n*** saving model checkpoint ***') model.class_to_idx = class_to_idx checkpoint = { 'arch': arch, 'input_size': input_size, 'output_size': output_size, 'hidden_layers': hidden_layers, 'dropout': dropout, 'state_dict': model.state_dict(), 'class_to_idx': model.class_to_idx, 'epochs': epochs,