Example #1
0
def run_feed_forward_back_propagation(model, epochs, dataloaders, criterion,
                                      device, optimizer):
    """Train, validate, and test `model` for `epochs` epochs, printing
    per-epoch average loss/accuracy figures.

    The actual passes are delegated to `update_from_training_data`,
    `evaluate_model_on_validation`, and `evaluate_model_on_testing`;
    this function only orchestrates them and reports averages.
    """
    model.to(device)
    train_losses = []
    validation_losses = []

    # Loader lengths are loop-invariant; hoist them for the per-batch averages.
    n_train = len(dataloaders['training'])
    n_valid = len(dataloaders['validation'])
    n_test = len(dataloaders['testing'])

    for epoch_idx in keep_awake(range(epochs)):
        running_loss = update_from_training_data(model, dataloaders, device,
                                                 criterion, optimizer,
                                                 train_losses)
        validation_loss, validation_accuracy = evaluate_model_on_validation(
            model, dataloaders, device, criterion, validation_losses)
        test_loss, test_accuracy = evaluate_model_on_testing(
            model, dataloaders, device, criterion)

        print("Epoch: {}/{}".format(epoch_idx + 1, epochs))
        print("Training Loss: {:.3f}..".format(running_loss / n_train))
        print("Validation Loss: {:.3f}..".format(validation_loss / n_valid))
        print("Validation Accuracy: {:.3f}..".format(
            validation_accuracy / n_valid))
        print("Testing Loss: {:.3f}..".format(test_loss / n_test))
        print("Test Accuracy: {:.3f}..\n".format(test_accuracy / n_test))
def train(model, trainloader, validloader, device, epochs=5):
    """Train `model`'s classifier layers, validating every `print_every`
    training batches, and return the trained model.

    Uses the optimizer and criterion attached to the model object.

    Fixes vs. original:
    - `running_loss` is reset after each report, so the printed
      "Train loss" is a true average over the last `print_every` batches
      (previously it kept accumulating within the epoch).
    - `model.train()` is restored immediately after each validation pass;
      previously the model stayed in eval mode (dropout off) for the rest
      of the epoch after the first report.
    """
    print_every = 25  # report/validate every 25 training batches
    steps = 0
    optimizer = model.optimizer
    criterion = model.criterion

    for e in keep_awake(range(epochs)):
        running_loss = 0
        # turn on dropout for training
        model.train()
        for images, labels in trainloader:
            steps += 1

            # move images and labels to same device as model
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()

            logps = model.forward(images)
            loss = criterion(logps, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # print progress periodically
            if steps % print_every == 0:
                # turn off dropout for validation
                model.eval()

                accuracy = 0
                valid_loss = 0

                with torch.no_grad():
                    for images, labels in validloader:
                        # move images and labels to same device as model
                        images, labels = images.to(device), labels.to(device)

                        logps = model.forward(images)
                        batch_loss = criterion(logps, labels)

                        valid_loss += batch_loss.item()

                        # top-1 accuracy: does the highest-probability class
                        # match the label?
                        ps = torch.exp(logps)
                        top_p, top_class = ps.topk(1, dim=1)
                        equals = top_class == labels.view(*top_class.shape)
                        accuracy += torch.mean(equals.type(
                            torch.FloatTensor)).item()

                # print output
                print(f"Epoch {e+1}/{epochs}.. "
                      f"Train loss: {running_loss/print_every:.3f}.. "
                      f"Validation loss: {valid_loss/len(validloader):.3f}.. "
                      f"Validation accuracy: {accuracy/len(validloader):.3f}")

                # Reset the reporting window and re-enable dropout.
                running_loss = 0
                model.train()

    return model
Example #3
0
    def train_and_validate(self):
        """Run a full training pass for each epoch, then evaluate on the
        validation split, printing batch progress and per-epoch totals.

        Fix vs. original: batch accuracy is accumulated as a Python float
        via ``.item()``; the original summed 0-dim tensors (despite the
        ``: float`` annotation), so the final accuracy printed as
        ``tensor(...)`` instead of a plain number.
        """
        optimizer: NNModule = self.optimizer_class(
            self.classifier.parameters(), lr=self.learning_rate)
        self.arch.to(self.device)
        for ep in keep_awake(range(self.epochs)):
            print(f"\nStarting epoch # {ep + 1} of {self.epochs}")
            print("Batch progress", end="...")
            # set model for training (enables dropout/batch-norm updates)
            self.arch.train()
            training_loss: float = 0.0

            for count, (images,
                        labels) in enumerate(self.dataloaders["train"]):
                optimizer.zero_grad()
                images: torch.Tensor = images.to(self.device)
                labels: torch.Tensor = labels.to(self.device)

                # lightweight progress indicator, one tick per 10 batches
                if count % 10 == 0:
                    print(count, end="...")
                log_ps: torch.Tensor = self.arch.forward(images)
                loss: torch.Tensor = self.criterion(log_ps, labels)
                training_loss += loss.item()
                loss.backward()
                optimizer.step()
            print(f"\nTotal training loss: {training_loss}")

            print(f"\nBeginning evaluation for epoch #{ep + 1}")
            print("Batch progress", end="...")
            # set model for evaluation
            self.arch.eval()
            accuracy: float = 0.0
            validation_loss: float = 0.0
            with torch.no_grad():

                for count, (images, labels) in enumerate(
                        self.dataloaders["validation"]):
                    images: torch.Tensor = images.to(self.device)
                    labels: torch.Tensor = labels.to(self.device)

                    if count % 10 == 0:
                        print(count, end="...")
                    log_ps: torch.Tensor = self.arch.forward(images)
                    loss: torch.Tensor = self.criterion(log_ps, labels)
                    validation_loss += loss.item()
                    # top-1 accuracy for this batch
                    ps: torch.Tensor = torch.exp(log_ps)
                    top_class: torch.Tensor = ps.topk(1, dim=1)[1]
                    equals: torch.Tensor = torch.eq(
                        top_class, labels.view(*top_class.shape))
                    # .item() converts the 0-dim mean tensor to a float
                    batch_acc: float = equals.type(
                        torch.FloatTensor).mean().item()
                    accuracy += batch_acc

                print(
                    f"\n\tTotal validation loss: {validation_loss}"
                    f"\n\tAccuracy: {accuracy / len(self.dataloaders['validation'])}"
                )
Example #4
0
def train_nn_model(nn_model,
                   optimizer,
                   trainloader,
                   validloader,
                   device,
                   epochs,
                   report_every=20):
    """Run training loop."""
    criterion = nn.NLLLoss()
    nn_model.to(device)
    running_loss = 0.0
    start_time = time.time()
    val_size = len(validloader)

    for epoch in keep_awake(range(epochs)):
        # `steps` restarts from 1 every epoch.
        for steps, (inputs, labels) in enumerate(trainloader, start=1):
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            loss = criterion(nn_model(inputs), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            if steps % report_every != 0:
                continue

            # Periodic validation pass with dropout off and no gradients.
            nn_model.eval()
            val_loss = 0
            accuracy = 0
            # Keep the averaging tensor on the same device family.
            float_type = (torch.cuda.FloatTensor
                          if device == torch.device('cuda')
                          else torch.FloatTensor)
            with torch.no_grad():
                for inputs, labels in validloader:
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    logps = nn_model(inputs)
                    val_loss += criterion(logps, labels).item()

                    ps = torch.exp(logps)  # inverse of log
                    top_p, top_class = ps.topk(1, dim=1)
                    equals = top_class == labels.view(*top_class.shape)
                    accuracy += torch.mean(equals.type(float_type)).item()

            logging.info(
                f"Epoch {epoch+1}/{epochs}.. "
                f"Steps: {steps}.. "
                f"Time: {(time.time() - start_time):.3f}s.. "
                f"Running loss: {running_loss/report_every:.3f}.. "
                f"Validation loss: {val_loss/val_size:.3f}.. "
                f"Validation accuracy: {100 * accuracy/val_size:.3f}%")
            running_loss = 0
            nn_model.train()
    logging.info(f"Total training time: {(time.time() - start_time):.3f}s")
Example #5
0
def train_model(model, trainloader, validloader, device, optimizer, epochs):
    """Train `model` on `trainloader`, validating every `print_every`
    steps, and return the trained model.

    Prints the windowed training loss plus validation loss/accuracy at
    each report.
    """
    criterion = nn.NLLLoss()
    print_every = 5
    step = 0
    train_loss = 0

    for epoch in keep_awake(range(epochs)):

        for batch, targets in trainloader:
            step += 1
            batch, targets = batch.to(device), targets.to(device)

            optimizer.zero_grad()
            loss = criterion(model(batch), targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            if step % print_every != 0:
                continue

            # Validation pass: dropout off, no gradient tracking.
            valid_loss = 0
            accuracy = 0
            model.eval()
            with torch.no_grad():
                for batch, targets in validloader:
                    batch, targets = batch.to(device), targets.to(device)

                    log_ps = model(batch)
                    valid_loss += criterion(log_ps, targets).item()

                    # Top-1 accuracy for this validation batch.
                    ps = torch.exp(log_ps)
                    top_p, top_class = ps.topk(1, dim=1)
                    matches = top_class == targets.view(*top_class.shape)
                    accuracy += torch.mean(matches.type(
                        torch.FloatTensor)).item()

            model.train()

            print(f'Epoch: {epoch + 1}/{epochs}',
                  f'Training Loss: {train_loss/print_every:.3f}',
                  f'Valid Loss: {valid_loss/len(validloader):.3f}',
                  f'Valid Accuracy: {accuracy/len(validloader):.3f}')
            train_loss = 0
    return model
Example #6
0
def train_model(epochs, dropout, model, criterion, optimizer, device,
                train_dataloader, valid_dataloader):
    """Train `model` on `train_dataloader`, validating every `print_every`
    steps, and return (model, optimizer).

    `dropout` is accepted for interface compatibility but is not used in
    this body — presumably consumed when the model is built; confirm with
    callers.

    Fix vs. original: corrected the "Vaildation accuracy" typo in the
    progress output.
    """
    steps = 0
    running_loss = 0
    print_every = 5

    for epoch in keep_awake(range(epochs)):
        for images, labels in train_dataloader:
            steps += 1

            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()

            logps = model.forward(images)
            loss = criterion(logps, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            if steps % print_every == 0:
                test_loss = 0
                accuracy = 0
                model.eval()
                with torch.no_grad():
                    for images, labels in valid_dataloader:
                        images, labels = images.to(device), labels.to(device)
                        logps = model.forward(images)
                        batch_loss = criterion(logps, labels)

                        test_loss += batch_loss.item()

                        # Calculate top-1 accuracy
                        ps = torch.exp(logps)
                        top_p, top_class = ps.topk(1, dim=1)
                        equals = top_class == labels.view(*top_class.shape)
                        accuracy += torch.mean(equals.type(
                            torch.FloatTensor)).item()

                print(
                    f"Epoch {epoch+1}/{epochs}.. "
                    f"Train loss: {running_loss/print_every:.3f}.. "
                    f"Validation loss: {test_loss/len(valid_dataloader):.3f}.. "
                    f"Validation accuracy: {accuracy/len(valid_dataloader):.3f}"
                )
                running_loss = 0
                model.train()

    print('Training done')
    return model, optimizer
Example #7
0
def train_network(model, device, optimizer, train_loader, validation_loader,
                  epochs):
    """Train `model` for `epochs` epochs, printing averaged training and
    validation metrics after each epoch, and return (model, optimizer).

    Per-epoch validation loss/accuracy come from `test_network` (defined
    elsewhere) and are averaged over `len(validation_loader)`.

    NOTE(review): `criterion` is neither a parameter nor defined locally —
    it must be a module-level global, otherwise the first batch raises
    NameError. Confirm where it is defined.
    """
    running_loss = 0
    for epoch in keep_awake(range(epochs)):
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            logps = model.forward(inputs)
            # Set gradients to zero.
            optimizer.zero_grad()
            loss = criterion(logps, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        else:
            # for/else: this runs after the loader is exhausted (there is
            # no break above), i.e. exactly once per epoch.
            validation_loss, validation_accuracy = test_network(
                model, validation_loader, device)
            print(
                f"Epoch {epoch+1}/{epochs}.. "
                f"Train loss: {running_loss/len(train_loader):.3f}.. "
                f"Validation loss: {validation_loss/len(validation_loader):.3f}.. "
                f"Validation accuracy: {validation_accuracy/len(validation_loader):.3f}"
            )
            running_loss = 0
    return model, optimizer
Example #8
0
# Validation/test datasets use their own transforms — presumably the
# eval-time (no augmentation) pipelines; confirm against data_transforms.
valid_data = datasets.ImageFolder(valid_dir, transform=data_transforms[1])
test_data = datasets.ImageFolder(test_dir, transform=data_transforms[2])
image_datasets = [train_data, valid_data, test_data]

# TODO: Using the image datasets and the trainforms, define the dataloaders
# Train/validation loaders shuffle each epoch; the test loader keeps a
# fixed order.
trainloader = torch.utils.data.DataLoader(train_data,
                                          batch_size=64,
                                          shuffle=True)
validloader = torch.utils.data.DataLoader(valid_data,
                                          batch_size=64,
                                          shuffle=True)
testloader = torch.utils.data.DataLoader(test_data, batch_size=64)
# Index order matches image_datasets: [train, valid, test].
dataloaders = [trainloader, validloader, testloader]

############################################TODO: Build and train your network
for i in keep_awake(range(5)):
    #Loading the pre-trained net
    if args.arch == "vgg":
        model = models.vgg16(pretrained=True)
        mod_classifier_input = 25088
        # Freeze parameters so we don't backprop through them
        for parameter in model.parameters():
            parameter.requires_grad = False
    elif args.arch == "densenet":
        model = models.densenet161(pretrained=True)
        mod_classifier_input = 2208
        # Freeze parameters so we don't backprop through them
        for parameter in model.parameters():
            parameter.requires_grad = False
    else:
        print(
Example #9
0
def main():
    """CLI entry point: confirm training options with the user, load and
    transform the data, build the classifier, train it, and save a
    checkpoint, logging progress throughout.

    Fixes vs. original:
    - The Y/N confirmation used ``yn == 'y' or 'Y' or 'YES' or 'yes'``,
      which is always true because bare non-empty string literals are
      truthy; it now tests membership, so a "no" answer actually exits.
    - ``debug`` is a real boolean instead of the string 'true'.
    """
    debug = True
    l = 1
    while l == 1:
        in_args = get_input_args()
        print("***************Training Starting***************")
        print(" data_dir:       ", in_args.data_dir)
        print(" arch:           = {!r}".format(in_args.arch))
        print(" learning_rate:  = {!r}".format(in_args.lr))
        print(" scheduler:      = {!r}".format(in_args.schdlr))
        print(" dropout:        = {!r}".format(in_args.dropout))
        print(" hidden_layers:  = {!r}".format(in_args.hidden_layers))
        print(" epochs:         = {!r}".format(in_args.epochs))
        print(" batch_size      = {!r}".format(in_args.batch_size))
        print(" gpu:            = {!r}".format(in_args.gpu))
        print(" checkpoint:     = {!r}".format(in_args.save_dir))
        print(" log:            = {!r}".format(in_args.log))
        if in_args.batch_size > 64 or in_args.batch_size < 1:
            print("--batch_size: must range from 1 to 64.")
            sys.exit(1)

        yn = str(
            input(
                "Would you like to continue training with these choices Y/N?  "
            ))
        # Accept only an explicit yes answer; anything else aborts.
        if yn.lower() in ('y', 'yes'):
            l = 0
        else:
            sys.exit(1)
    print("\nSet the directory, Path and Name for the Checkpoint-------------")
    if in_args.save_dir:
        # Create save directory if required
        if not os.path.exists(in_args.save_dir):
            os.makedirs(in_args.save_dir)
        # Save checkpoint in save directory
        chkpoint_filepath = in_args.save_dir + '/' + in_args.arch + '_checkpoint.pth'
    else:
        # Save checkpoint in current directory
        chkpoint_filepath = in_args.arch + '_checkpoint.pth'

    # create logger (appends to the file named by --log)
    logfile = in_args.log
    logging.basicConfig(filename=logfile,
                        filemode='a',
                        level=logging.INFO,
                        format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.info('train logging started')
    # log selected arguments
    logging.info('%s %s %s %s', "architecture:", in_args.arch, "checkpoint:",
                 chkpoint_filepath)

    # check gpu and set a flag for it.
    device = setupmodel.gpu_check()
    print(device)
    if torch.cuda.is_available() and in_args.gpu:
        gpu = True
        print("gpu is ENABLED and set, Training on GPU")
    else:
        gpu = False
    print("device:= ", device, "and gpu is:", gpu, "in_args.gpu:", in_args.gpu)

    # Load the data and do the transforms for the training, validation, and testing sets
    print("\ncall load_and_transform-----------------------")
    # Set the default top level folder for the data
    data_dir = in_args.data_dir
    dataloaders, trainloader, vloader, testloader, class_to_idx, dataset_sizes = datasetprep.load_and_transform(
        in_args.data_dir, in_args.batch_size)

    # create a mapping from the label number and the actual flower name.
    cat_to_name = datasetprep.map_catalog_to_name()

    if debug:
        # Explore the current batch, ids and labels
        print("Now show only data batch tensor, ids, and labels")
        inputs, labels = next(iter(dataloaders['training']))
        print(inputs.size())  # gets the batch tensor info
        print(labels)

    print(
        "\nData load_and_transforms completed-----------------------------------"
    )
    logging.info(
        '%s', "Data load_and_transforms completed-------------------------")

    print("\nget the model and features sizes-----------------------",
          in_args.arch)
    model, input_size = setupmodel.get_model(in_args.arch)
    # print out the model information
    # NOTE(review): `output_size` and `learning_rate` are only assigned
    # inside this debug block but are used unconditionally below; they
    # would raise NameError if debug were ever turned off.
    if debug:
        print("architecture:", in_args.arch)
        output_size = len(class_to_idx)
        print("output_size:= ", output_size)
        print("input_size:= ", input_size)
        print("output_size= ", len(class_to_idx))
        print("hidden_layers:  = {!r}".format(in_args.hidden_layers))
        learning_rate = in_args.lr
        print("hyperparameters:")
        print("batch_size: =", in_args.batch_size, "epochs: =", in_args.epochs,
              "dropout: =", in_args.dropout, "learning_rate= ", learning_rate)
    logging.info('%s %s %s %s %s %s %s %s', "architecture:", in_args.arch,
                 "input_size:= ", input_size, "output_size:= ", output_size,
                 "hidden_layers: ", in_args.hidden_layers)
    logging.info('%s %s %s %s %s %s %s %s', "batch_size: =",
                 in_args.batch_size, "epochs: =", in_args.epochs, "dropout: =",
                 in_args.dropout, "learning_rate= ", learning_rate)
    print(
        "\n apply hyperparameters and run the classifier-----------------------"
    )
    # Create the classifier
    print("\nSetting Neural Network / create Classifer------")
    print('class_to_idx: ', class_to_idx)
    model, criterion, optimizer = setupmodel.create_classifier(
        model, input_size, in_args.hidden_layers, output_size, learning_rate,
        in_args.dropout, class_to_idx)
    print('criterion=', criterion)
    logging.info('%s %s %s %s %s %s %s %s', "learning_rate: =", in_args.lr,
                 "hidden_layers: =", in_args.hidden_layers, " batch_size: =",
                 in_args.batch_size, "checkpoint:", chkpoint_filepath)
    print("model", model)

    print("\nTraining Neural Network------------------------")
    # Initialize with the hyperparameters
    start_time = time()
    epochs = in_args.epochs
    print("Network architecture:", in_args.arch)
    print("Number of epochs:    ",
          in_args.epochs)  # Number of epochs  to train for
    print('Learning rate:       ', in_args.lr)
    print("dropout:=            ", in_args.dropout)
    print('device=              ', device)
    logging.info('%s %s %s %s', "architecture:", in_args.arch,
                 "Number of epochs: ", in_args.epochs)
    # Train the network
    print("\nTrain the network---")
    logging.info('%s', " Training Starting-----")

    from workspace_utils import keep_awake
    for i in keep_awake(range(1)):
        print("active session started")
        # The training loss, validation loss, and validation accuracy are
        # printed out as the network trains.
        model, criterion, optimizer = setupmodel.train_model(
            model, criterion, optimizer, epochs, trainloader, vloader)
        logging.info('%s', " Training Completed-----")

        # Save trained model
        print("\n Save the checkpoint -------------------")
        setupmodel.save_checkpoint(model, chkpoint_filepath, in_args.arch,
                                   in_args.epochs, criterion, optimizer)
        print('Model saved at {}'.format(chkpoint_filepath))
        logging.info('%s %s %s %s  %s', " checkpoint saved-----", in_args.arch,
                     model, chkpoint_filepath, in_args.epochs)
        print("\n checkpoint saved-------------------")

    # Calculate and print overall runtime
    time_elapsed = time() - start_time
    print('Training time_elapsed: {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    logging.info(
        '%s', 'Training time_elapsed: {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
Example #10
0
def train(data_dir,
          save_dir=os.path.dirname(os.path.abspath(__file__)),
          arch='resnet50',
          learning_rate=0.003,
          hidden_units=512,
          epochs=3,
          device=None):
    """Build a `flower_model` and train it on the data under `data_dir`,
    validating every `print_every` steps. Returns the trained model.

    `device` is treated as a flag: any truthy value selects 'cuda',
    otherwise 'cpu' — TODO confirm callers expect flag semantics rather
    than an actual device string.

    Fix vs. original: the printed validation loss and accuracy were
    divided by ``len(testloader)``; they are now averaged over
    ``len(validloader)``, the loader that is actually evaluated.
    """
    if device:
        device = 'cuda'
    else:
        device = 'cpu'

    model = flower_model(save_dir=save_dir,
                         arch=arch,
                         learning_rate=learning_rate,
                         hidden_units=hidden_units)
    ts, tl = build_dataloaders(data_dir)
    train_dataset = ts[0]
    trainloader, validloader, testloader = tl

    # Setup variables
    model.model.to(device)
    steps = 0  # training batches seen; triggers periodic validation
    print_every = 3  # number of steps between validation passes
    running_loss = 0  # accumulated train loss, averaged at each report

    # Training Loop
    for e in keep_awake(range(epochs)):
        for inputs, labels in trainloader:
            steps += 1
            # Move the batch to the same device as the model.
            inputs, labels = inputs.to(device), labels.to(device)
            # Clear gradients accumulated by the previous step.
            model.optimizer.zero_grad()
            # Forward pass; the model outputs log-probabilities (LogSoftmax).
            logps = model.model.forward(inputs)
            loss = model.criterion(logps, labels)
            # Backprop, then update the classifier parameters.
            loss.backward()
            model.optimizer.step()

            running_loss += loss.item()

            # Validation pass every print_every steps.
            if steps % print_every == 0:
                # Disable dropout for evaluation.
                model.model.eval()
                accuracy = 0
                valid_loss = 0
                # No gradient tracking needed during evaluation.
                with torch.no_grad():
                    for inputs, labels in validloader:
                        inputs, labels = inputs.to(device), labels.to(device)
                        logps = model.model.forward(inputs)
                        batch_loss = model.criterion(logps, labels)
                        valid_loss += batch_loss.item()

                        # Top-1 accuracy: probs = e^log(probs), then compare
                        # the highest-probability class against the label.
                        ps = torch.exp(logps)
                        top_p, top_class = ps.topk(1, dim=1)
                        equals = top_class == labels.view(*top_class.shape)
                        accuracy += torch.mean(equals.type(
                            torch.FloatTensor)).item()
                # Averages over the validation loader (bug fix: previously
                # divided by len(testloader)).
                print(
                    "Epoch {}/{}".format(e + 1, epochs),
                    "Train Loss: {:.3f}".format(running_loss / print_every),
                    "Validation Loss: {:.3f}".format(valid_loss /
                                                     len(validloader)),
                    "Validation Accuracy: {:.3f}".format(accuracy /
                                                         len(validloader)))
                # Reset running_loss and re-enable training mode.
                running_loss = 0
                model.model.train()
    return model
Example #11
0
def main():
    """CLI entry point: parse args, build datasets/dataloaders, load a
    pretrained backbone, attach a new classifier, train with periodic
    validation, and save a checkpoint.

    Fixes vs. original:
    - The architecture dispatch tested ``arch == 'vgg13' or arch ==
      'vgg16'`` in its first branch and loaded vgg13 for BOTH, leaving
      the vgg16 branch unreachable; each arch now loads its own model.
    - ``datetime.now().timestamp`` was missing its call parentheses
      (bound the method instead of the value).
    """
    global model, device, data_dir, train_dir, valid_dir, test_dir
    global batch_size
    # TODO 0: Measures total program runtime by collecting start time
    start_time = time()
    print("start time {}".format(start_time))

    in_arg = get_cmd_args()
    if (in_arg.data_dir != default_data_dir):
        print("ERROR ERROR only allowed data_dir is 'flowers'")

    data_dir = in_arg.data_dir
    train_dir = data_dir + '/train'
    valid_dir = data_dir + '/valid'
    test_dir = data_dir + '/test'

    gpu = in_arg.gpu
    arch = in_arg.arch
    save_dir = in_arg.save_dir  ## to save checkpoint path
    lr = in_arg.learning_rate
    save_dir = in_arg.save_dir
    epochs = in_arg.epochs
    batch_size = in_arg.batch_size
    hidden_layers = in_arg.hidden_units
    ## create save_dir
    save_dir = os.path.join("/home/workspace/ImageClassifier/", save_dir)
    if not os.path.isdir(save_dir):
        os.mkdir(save_dir, mode=0o755)
    ## create checkpoint file suffix (currently unused)
    time_suffix = datetime.now().timestamp()
    print("Running train.py with:", "\n    data_dir = ", in_arg.data_dir,
          "\n    gpu =", in_arg.gpu, "\n    arch =", in_arg.arch,
          "  learning_rate =", in_arg.learning_rate, "\n    save_dir=",
          in_arg.save_dir, "\n    epochs =", in_arg.epochs,
          "\n    batch_size =", in_arg.batch_size)

    with open('cat_to_name.json', 'r') as f:
        cat_to_name = json.load(f)

    create_datasets()
    create_dataloaders()

    # Default to CPU; use GPU only when requested AND available.
    device = torch.device("cpu")
    cuda = torch.cuda.is_available()
    if (gpu and cuda):
        device = torch.device("cuda:0")
    print("CUDA:{}".format(cuda))

    # Load the requested pretrained backbone; no_input_layer is the size
    # of the classifier's input features for that backbone.
    if (arch == 'vgg13'):
        model = models.vgg13(pretrained=True)
        no_input_layer = 25088
    elif (arch == 'vgg16'):
        model = models.vgg16(pretrained=True)
        no_input_layer = 25088
    elif (arch == 'alexnet'):
        model = models.alexnet(pretrained=True)
        no_input_layer = 9216
    else:
        print("train.py does not support model:{}".format(arch))
        print("train.py supports only vgg13 , vgg16,alexnet")
        print("Defaulting to vgg16")
        model = models.vgg16(pretrained=True)
        no_input_layer = 25088

    print("Model's state_dict:")
    for param_tensor in model.state_dict():
        print(param_tensor, "\t", model.state_dict()[param_tensor].size())
    print("\nOur model:\n\n", model, '\n')
    print("State dict keys:\n\n", model.state_dict().keys())

    print("DEVICE being used is:", device)

    # NOTE(review): num_of_classes is a module-level global — confirm
    # where it is assigned.
    model.classifier.out_features = num_of_classes

    model_classifier = create_classifier(model, hidden_layers)

    model.classifier = model_classifier
    print("DEVICE:{}".format(device))
    criterion = nn.NLLLoss()
    # Only the (replaced) classifier's parameters are optimized.
    optimizer = optim.Adam(model.classifier.parameters(), lr=lr)

    ## start Training
    print("Start training")
    steps = 0
    running_loss = 0
    print_every = 40
    for i in keep_awake(range(5)):
        for e in range(epochs):
            model.train()
            for images, labels in train_loader:
                steps += 1
                images, labels = images.to(device), labels.to(device)

                optimizer.zero_grad()

                output = model.forward(images)
                output = output.to(device)
                loss = criterion(output, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

                if steps % print_every == 0:
                    # Make sure network is in eval mode for inference
                    model.eval()

                    # Turn off gradients for validation, saves memory and computations
                    with torch.no_grad():
                        test_loss, accuracy = validation(
                            model, valid_loader, criterion, device)

                    print(
                        "Epoch: {}/{}.. ".format(e + 1, epochs),
                        "Training Loss: {:.3f}.. ".format(running_loss /
                                                          print_every),
                        "Test Loss: {:.3f}.. ".format(test_loss /
                                                      len(valid_loader)),
                        "Test Accuracy: {:.3f}".format(accuracy /
                                                       len(valid_loader)))

                    running_loss = 0

                    # Make sure training is back on
                    model.train()
    end_time = time()
    print("start time:{};end time:{}".format(start_time, end_time))
    print("End training")
    ## end training
    # TODO: Save the checkpoint
    model.class_to_idx = train_dataset.class_to_idx

    checkpoint = {
        'arch': arch,
        'input_size': no_input_layer,
        'output_size': 102,
        'class_to_idx': model.class_to_idx,
        'state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epochs': epochs,
        'classifier': model.classifier
    }

    torch.save(
        checkpoint,
        '/home/workspace/ImageClassifier/' + save_dir + '/checkpoint.pth')
    print("END saving checkpoint")
Example #12
0
# Set the liquidation time (trading horizon, in periods)
lqt = 60

# Set the number of trades
n_trades = 60

# Set trader's risk aversion — presumably the lambda parameter passed to
# env.reset below; confirm against the environment implementation.
tr = 1e-6

# Set the number of episodes to run the simulation
episodes = 10000

# Shortfall tracking: full history plus a 100-episode rolling window.
shortfall_hist = np.array([])
shortfall_deque = deque(maxlen=100)

for episode in keep_awake(range(episodes)):
    # Reset the enviroment
    cur_state = env.reset(seed=episode,
                          liquid_time=lqt,
                          num_trades=n_trades,
                          lamb=tr)

    # set the environment to make transactions
    env.start_transactions()

    for i in range(n_trades + 1):

        # Predict the best action for the current state.
        action = agent.act(cur_state, add_noise=True)

        # Action is performed and new state, reward, info are received.
Example #13
0
def _evaluate(model, dataloader, criterion, device):
    # Helper: run `model` over `dataloader` with gradients disabled and
    # return (summed loss, summed top-1 accuracy).  Divide each by
    # len(dataloader) for per-batch averages.  Restores train mode on exit.
    total_loss = 0
    total_accuracy = 0
    with torch.no_grad():
        model.eval()
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            log_ps = model.forward(images)
            # .item() keeps the running totals as plain Python floats rather
            # than accumulating tensors.
            total_loss += criterion(log_ps, labels).item()
            # Top-1 accuracy: fraction of batches where argmax matches label.
            ps = torch.exp(log_ps)
            top_p, top_class = ps.topk(1, dim=1)
            equals = top_class == labels.view(*top_class.shape)
            total_accuracy += torch.mean(equals.type(torch.FloatTensor)).item()
    model.train()
    return total_loss, total_accuracy


def main():
    """Parse command-line options, train an image classifier, evaluate it on
    the held-out test set, and save a checkpoint.

    Expects a data directory containing train/, valid/ and test/ subfolders
    laid out for torchvision's ImageFolder.  Relies on the project helpers
    `create_model`, `keep_awake` and `active_session` defined elsewhere.
    """
    argparser = argparse.ArgumentParser(
        description='Train the Classifier')
    argparser.add_argument(
        'data_directory',
        help='The directory of training data.')
    argparser.add_argument(
        '--save_dir',
        default='checkpoint.pth',
        help='The directory for saving checkpoints.')
    argparser.add_argument(
        '--arch',
        default='densenet121',
        help='The model name.')
    argparser.add_argument(
        '--hidden_units',
        type=int,
        help='The number of hidden units.')
    argparser.add_argument(
        '--epochs',
        type=int,
        default=15,
        help='The number of traning epochs.')
    argparser.add_argument(
        '--learning_rate',
        type=float,
        default=0.003,
        help='The learning rate.')
    argparser.add_argument(
        '--gpu',
        action='store_true',
        help='Enable gpu')
    args = argparser.parse_args()
    checkpoint_file = args.save_dir
    data_dir = args.data_directory
    model_name = args.arch
    # May be None (no default) — create_model is assumed to handle that.
    hidden_units = args.hidden_units
    epochs = args.epochs
    learning_rate = args.learning_rate
    gpu_enabled = args.gpu

    # Load training data
    train_dir = data_dir + '/train'
    valid_dir = data_dir + '/valid'
    test_dir = data_dir + '/test'
    # Augment the training set; validation/test share one deterministic
    # resize/crop pipeline (the original's separate valid_transforms was
    # defined but never used).
    train_transforms = transforms.Compose(
        [transforms.RandomRotation(30),
         transforms.RandomResizedCrop(224),
         transforms.RandomHorizontalFlip(),
         transforms.ToTensor(),
         transforms.Normalize([0.485, 0.456, 0.406],
                              [0.229, 0.224, 0.225])])
    test_transforms = transforms.Compose(
        [transforms.Resize(255),
         transforms.CenterCrop(224),
         transforms.ToTensor(),
         transforms.Normalize([0.485, 0.456, 0.406],
                              [0.229, 0.224, 0.225])])

    train_data = datasets.ImageFolder(train_dir, transform=train_transforms)
    valid_data = datasets.ImageFolder(valid_dir, transform=test_transforms)
    test_data = datasets.ImageFolder(test_dir, transform=test_transforms)

    trainloader = torch.utils.data.DataLoader(train_data, batch_size=64,
                                              shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_data, batch_size=64)
    testloader = torch.utils.data.DataLoader(test_data, batch_size=64)

    # Build training model
    model, classifier_name, hidden_layers = create_model(model_name,
                                                         hidden_units)
    criterion = nn.NLLLoss()
    # The trainable head is model.classifier or model.fc depending on the
    # architecture create_model returned; only its parameters are optimized.
    if classifier_name == 'classifier':
        optimizer = optim.Adam(model.classifier.parameters(),
                               lr=learning_rate)
    elif classifier_name == 'fc':
        optimizer = optim.Adam(model.fc.parameters(), lr=learning_rate)
    device = 'cpu'
    if gpu_enabled:
        if torch.cuda.is_available():
            device = 'cuda'
        else:
            print("gpu is not available")
    model.to(device)

    # Start training; keep_awake stops the workspace from disconnecting.
    for e in keep_awake(range(epochs)):
        running_loss = 0
        for images, labels in trainloader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            log_ps = model.forward(images)
            loss = criterion(log_ps, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Prints out training loss, validation loss, and validation accuracy
        # as the network trains.
        test_loss, accuracy = _evaluate(model, validloader, criterion, device)
        print("Epoch: {}/{}.. ".format(e + 1, epochs),
              "Training Loss: {:.3f}.. ".format(running_loss /
                                                len(trainloader)),
              "validation Loss: {:.3f}.. ".format(test_loss /
                                                  len(validloader)),
              "validation Accuracy: {:.3f}".format(accuracy /
                                                   len(validloader)))

    # Do validation on the test set
    with active_session():
        test_loss, accuracy = _evaluate(model, testloader, criterion, device)
    print("Test Loss: {:.3f}.. ".format(test_loss / len(testloader)),
          "Test Accuracy: {:.3f}".format(accuracy / len(testloader)))

    # Save the check point
    model_classifier_state_dict = None
    if classifier_name == 'classifier':
        model_classifier_state_dict = model.classifier.state_dict()
    elif classifier_name == 'fc':
        model_classifier_state_dict = model.fc.state_dict()
    checkpoint = {'class_to_idx': train_data.class_to_idx,
                  # Fix: call state_dict() — the original stored the bound
                  # method object, not the serializable optimizer state.
                  'optimizer_state_dict': optimizer.state_dict(),
                  'learning_rate': learning_rate,
                  'device': device,
                  'model_name': model_name,
                  'hidden_layers': hidden_layers,
                  'model_classifier_state_dict': model_classifier_state_dict}
    torch.save(checkpoint, checkpoint_file)
Example #14
0
def train_image_classifier(parameters, device, model, criterion, optimizer,
                           train_loader, valid_loader, train_losses,
                           valid_losses, valid_accuracies):
    """
    Train and validate image classification model

    :param parameters: dictionary providing model parameters
    :param device: torch.device indicating if GPU or CPU will be used
    :param model: the neural network model
    :param criterion: the loss / error function
    :param optimizer: the optimizer for backpropagation
    :param train_loader: training torch.utils.data.DataLoader
    :param valid_loader: validation torch.utils.data.DataLoader
    :param train_losses: list to store training loss calculated at a certain interval
    :param valid_losses: list to store validation loss calculated at a certain interval
    :param valid_accuracies: list to store validation accuracy calculated at a certain interval
    :return:
    """

    # How often (in batches) to run validation and report, and how many epochs
    report_interval = parameters.get('train_loss_accuracy_batch_interval', 5)
    epochs = parameters.get('epochs', 5)

    # keep_awake (Udacity helper) keeps the workspace active during the loop
    for e in keep_awake(range(epochs)):

        # Dropout is active only in training mode
        model.train()

        print("***************************")
        print("Epoch {}/{}".format(e + 1, epochs))

        loss_total = 0
        images_seen = 0
        # enumerate from 1 so batch_no doubles as the number of batches done
        for batch_no, (batch_images, batch_labels) in enumerate(train_loader,
                                                                start=1):

            images_seen += len(batch_images)

            # Move input and label tensors to GPU (if available) or CPU
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            # Clear gradients accumulated by the previous iteration
            optimizer.zero_grad()

            # Forward pass, loss, backpropagation, weight update
            predictions = model(batch_images)
            batch_loss = criterion(predictions, batch_labels)
            batch_loss.backward()
            optimizer.step()

            # Accumulate scalar loss over the epoch so far
            loss_total += batch_loss.item()

            # Every report_interval batches: validate and report progress
            if batch_no % report_interval == 0:

                # Validate the model (appends to valid_losses / accuracies)
                test_image_classifier(device, model, criterion, valid_loader,
                                      valid_losses, valid_accuracies, False)

                # Mean training loss over all batches processed this epoch
                train_losses.append(loss_total / batch_no)

                print("----")
                print("Processed {} training batches with {} processed images".
                      format(batch_no, images_seen))
                print("Training loss: {}".format(train_losses[-1]))
                print("Validation loss: {}".format(valid_losses[-1]))
                print("Validation accuracy: {}".format(valid_accuracies[-1]))
Example #15
0
def main():
    """Train three MADDPG agents in parallel multi-agent environments.

    Runs `number_of_episodes` episodes across `parallel_envs` simultaneous
    environments, pushing transitions into a shared replay buffer, updating
    actor/critic networks every `episode_per_update` episodes, logging mean
    rewards to TensorBoard, and periodically saving checkpoints and GIFs.
    Relies on project helpers: seeding, envs, ReplayBuffer, MADDPG,
    transpose_list, transpose_to_tensor, keep_awake, imageio.
    """
    seeding()
    parallel_envs = 4
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    save_interval = 1000
    # global step counter across all environments (incremented below but not
    # otherwise read in this function)
    t = 0

    # amplitude of OU noise, which slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    """
    `env` controls three agents, two blue, one red.
    env.observation_space: [Box(14,), Box(14,), Box(14,)]
    env.action_sapce: [Box(2,), Box(2,), Box(2,)]
    Box(14,) can be broken down into 2+3*2+3*2=14
    (2) location coordinates of the target landmark
    (3*2) the three agents' positions w.r.t. the target landmark
    (3*2) the three agents' velocities w.r.t. the target landmark
    """
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    # note: episode advances by parallel_envs per iteration, since each
    # iteration steps parallel_envs environments at once
    for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):

        timer.update(episode)

        # per-environment reward accumulator: one row per env, one column
        # per agent
        reward_this_episode = np.zeros((parallel_envs, 3))
        # Consult `env_wrapper.py` line 19.
        all_obs = env.reset()
        """
        `all_abs` is a list of size `parallel_envs`,
        each item in the list is another list of size two,
        first is env.observation_space: [Box(14,), Box(14,), Box(14,)],
        second is [Box(14,)], which is added to faciliate training
        https://goo.gl/Xtr6sF
        `obs` and `obs_full` are both lists of size `parallel_envs`,
        `obs` has the default observation space [Box(14,), Box(14,), Box(14,)]
        `obs_full` has the compounded observation space [Box(14,)]
        """
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for one episode - addition of all time steps

        # save info or not
        save_info = ((episode) % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            t += parallel_envs

            # explore = only explore for a certain number of steps
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            # noise decays once per time step, across all episodes
            noise *= noise_reduction

            # `actions_array` has shape (3, parallel_envs, 2)
            actions_array = torch.stack(actions).detach().numpy()
            # `actions_for_env` has shape (parallel_envs, 3, 2), because
            # input to `step` requires the first index to be `parallel_envs`
            actions_for_env = np.rollaxis(actions_array, axis=1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = \
                env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update the target network `parallel_envs`=4 times
        # after every `episode_per_update`=2*4
        if len(buffer
               ) > batchsize and episode % episode_per_update < parallel_envs:
            # update the local network for all agents, `a_i` refers to agent no.
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target network towards the actual networks
            maddpg.update_targets()

        # flatten per-env rewards into the per-agent running histories
        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        # every 100 episodes: log mean rewards and reset the histories
        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # Saves the model.
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

                # NOTE(review): this save runs once per agent, rewriting the
                # same file with the partial list each time; the final write
                # contains all three agents, but moving it after the loop
                # would avoid the redundant writes — confirm before changing.
                torch.save(
                    save_dict_list,
                    os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # Save gif files.
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
Example #16
0
def main():
    """Train an image classifier chosen on the command line and save it.

    Reads options via get_input_args() (data dir, architecture, hidden/output
    node counts, learning rate, epoch count, save path), builds dataloaders
    from module-level transform globals, trains the classifier head of a
    frozen pretrained backbone, and saves a checkpoint to in_arg.save_dir.
    """

    ### Get input from user
    in_arg = get_input_args()
    print(in_arg)

    train_dir = in_arg.dir + '/train'
    valid_dir = in_arg.dir + '/valid'
    test_dir = in_arg.dir + '/test'

    # train_transforms / valid_transforms / test_transforms are module-level
    # globals defined elsewhere in this file.
    train_data = datasets.ImageFolder(train_dir, transform=train_transforms)
    valid_data = datasets.ImageFolder(valid_dir, transform=valid_transforms)
    test_data = datasets.ImageFolder(test_dir, transform=test_transforms)

    # Define the dataloaders; only training/validation are shuffled.
    trainloader = torch.utils.data.DataLoader(train_data,
                                              batch_size=64,
                                              shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_data,
                                              batch_size=64,
                                              shuffle=True)
    testloader = torch.utils.data.DataLoader(test_data, batch_size=64)

    #### Define the model: instantiate only the requested architecture
    #### (the original built all four, downloading every set of pretrained
    #### weights up front).
    model_builders = {
        'resnet': models.resnet18,
        'alexnet': models.alexnet,
        'vgg': models.vgg16,
        'densenet': models.densenet121
    }

    model_name = in_arg.arch

    ### Load the user-selected pretrained model
    model = model_builders[model_name](pretrained=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Freeze parameters so we don't backprop through them
    for param in model.parameters():
        param.requires_grad = False

    # NOTE(review): the 1024-feature input matches densenet121's output;
    # other backbones expose different feature sizes — confirm before using
    # them with this classifier head.
    classifier = nn.Sequential(
        OrderedDict([('fc1', nn.Linear(1024, in_arg.hidden_nodes)),
                     ('relu', nn.ReLU()),
                     ('fc2', nn.Linear(in_arg.hidden_nodes,
                                       in_arg.output_nodes)),
                     ('output', nn.LogSoftmax(dim=1))]))

    model.classifier = classifier

    criterion = nn.NLLLoss()

    # Only train the classifier parameters, feature parameters are frozen
    optimizer = optim.Adam(model.classifier.parameters(),
                           lr=in_arg.learning_rate)

    model.to(device)

    epochs = in_arg.epocs
    steps = 0
    running_loss = 0
    print_every = 5

    # NOTE(review): keep_awake(range(5)) repeats the full epoch schedule five
    # times (workspace keep-alive idiom) — confirm the 5x multiplier is
    # intended; behavior kept as in the original.
    for i in keep_awake(range(5)):
        for epoch in range(epochs):
            for inputs, labels in trainloader:
                steps += 1
                # Move input and label tensors to the default device
                inputs, labels = inputs.to(device), labels.to(device)

                optimizer.zero_grad()

                logps = model.forward(inputs)
                loss = criterion(logps, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

                # Periodic validation pass
                if steps % print_every == 0:
                    valid_loss = 0
                    accuracy = 0
                    model.eval()
                    with torch.no_grad():
                        for inputs, labels in validloader:
                            inputs, labels = inputs.to(device), labels.to(
                                device)
                            logps = model.forward(inputs)
                            batch_loss = criterion(logps, labels)

                            valid_loss += batch_loss.item()

                            # Calculate top-1 accuracy
                            ps = torch.exp(logps)
                            top_p, top_class = ps.topk(1, dim=1)
                            equals = top_class == labels.view(*top_class.shape)
                            accuracy += torch.mean(
                                equals.type(torch.FloatTensor)).item()

                    # Fix: validation averages previously divided by
                    # len(testloader) although the loop ran over validloader.
                    print(
                        f"Epoch {epoch+1}/{epochs}.. "
                        f"Train loss: {running_loss/print_every:.3f}.. "
                        f"Validation loss: {valid_loss/len(validloader):.3f}.. "
                        f"Validation accuracy: {accuracy/len(validloader):.3f}")
                    running_loss = 0
                    model.train()

    ######## Save the model
    model.class_to_idx = train_data.class_to_idx

    checkpoint = {
        'input_size': 1024,
        'output_size': in_arg.output_nodes,
        'hidden_layers': [each for each in model.classifier],
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'class_to_idx': model.class_to_idx
    }

    torch.save(checkpoint, in_arg.save_dir)
Example #17
0
def build_and_train_model(data_path, checkpoint_dir, base_model='vgg16',
                          learning_rate=0.05, hidden_units=4096, epochs=1,
                          device='cuda'):
    """Build a VGG-based classifier, train it, and save a checkpoint.

    :param data_path: root directory of the training/validation data
    :param checkpoint_dir: directory where checkpoint_terminal.pth is written
    :param base_model: one of 'vgg11', 'vgg13', 'vgg16'
    :param learning_rate: Adam learning rate for the classifier head
    :param hidden_units: width of the hidden layer in the classifier head
    :param epochs: training epochs per outer keep-awake iteration
    :param device: requested device (overridden below by CUDA availability,
        as in the original)
    """

    # loading_data returns (train_loader, valid_loader, ..., train_dataset);
    # call it once instead of once per element.
    loaded = loading_data(data_path)
    train_loader = loaded[0]
    valid_loader = loaded[1]

    # Instantiate only the requested backbone — the original built all three,
    # downloading every set of pretrained weights.
    models_options = {'vgg11': models.vgg11,
                      'vgg13': models.vgg13,
                      'vgg16': models.vgg16}
    model = models_options[base_model](pretrained=True)

    l_rate = learning_rate

    # Use GPU if it's available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # do not backpropagate through the parameters of the loaded model
    for param in model.parameters():
        param.requires_grad = False

    # define my feed forward
    model.classifier = nn.Sequential(nn.Linear(25088, hidden_units),
                                     nn.Dropout(0.3),
                                     nn.ReLU(),
                                     nn.Linear(hidden_units, 102),
                                     nn.LogSoftmax(dim=1))
    # define the criterion
    criterion = nn.NLLLoss()

    # define optimizer of parameters - only for the classifier, not the imported model
    optimizer = optim.Adam(model.classifier.parameters(), lr=l_rate)

    model.to(device)

    steps = 0
    running_loss = 0
    print_every = 10

    # Train the NW
    # NOTE(review): keep_awake(range(5)) repeats the whole epoch schedule
    # five times (workspace keep-alive idiom) — behavior kept as original.
    for i in keep_awake(range(5)):
        with active_session():
            # do long-running work here
            for epoch in range(epochs):
                for images, labels in train_loader:
                    steps += 1
                    # Move input and label tensors to the default device
                    images, labels = images.to(device), labels.to(device)

                    # reset gradient to 0
                    optimizer.zero_grad()

                    # forward step, loss, backpropagation, weight update
                    result = model.forward(images)
                    loss = criterion(result, labels)
                    loss.backward()
                    optimizer.step()

                    running_loss += loss.item()

                    if steps % print_every == 0:
                        eval_loss = 0
                        accuracy = 0

                        # Set the model in validation mode
                        model.eval()

                        # turn off gradient during validation
                        with torch.no_grad():
                            for images, labels in valid_loader:
                                images, labels = images.to(device), labels.to(device)
                                result = model.forward(images)
                                batch_loss = criterion(result, labels)

                                eval_loss += batch_loss.item()

                                # Calculate accuracy
                                ps = torch.exp(result)
                                top_p, top_class = ps.topk(1, dim=1)
                                equals = top_class == labels.view(*top_class.shape)
                                accuracy += torch.mean(equals.type(torch.FloatTensor)).item()

                        # Fix: this report belongs inside the periodic
                        # validation branch.  The original hung it off a
                        # for/else, so it ran (and reset running_loss) only
                        # once per epoch loop and could reference eval_loss
                        # before it was ever assigned.
                        print(f"Epoch {epoch+1}/{epochs}.. "
                              f"Train loss: {running_loss/print_every:.3f}.. "
                              f"Eval loss: {eval_loss/len(valid_loader):.3f}.. "
                              f"Eval accuracy: {accuracy/len(valid_loader):.3f}")
                        running_loss = 0
                        model.train()

    # EXPORT THE CHECKPOINT!
    model.class_to_idx = loaded[3].class_to_idx

    checkpoint = {'network': base_model,
                  'input_size': 25088,
                  'output_size': 102,
                  'learning_rate': learning_rate,
                  'batch_size': 64,
                  'classifier': model.classifier,
                  'epochs': epochs,
                  'optimizer': optimizer.state_dict(),
                  'state_dict': model.state_dict(),
                  'class_to_idx': model.class_to_idx}

    torch.save(checkpoint, checkpoint_dir + '/checkpoint_terminal.pth')

    print('Your model has been trained and saved as checkpoint in the folder you indicated.', checkpoint_dir)
def train_model(arch="vgg16",
                hidden_units=4069,
                checkpoint=" ",
                epochs=3,
                learning_rate=0.003):
    """Fine-tune a pretrained classifier and save a checkpoint.

    Relies on module-level globals `mode` (device string) and `data_dir`,
    and on the project helpers `loadData` and `load_checkpoint`.  Returns
    the trained model.
    """
    # keep_awake keeps the workspace session alive for the single iteration
    for _ in keep_awake(range(1)):
        device = torch.device(mode)

        # loadData yields dict-of-datasets and dict-of-dataloaders
        dataset, dataloader = loadData(data_dir)
        trainloader = dataloader['train']
        train_dataset = dataset['train']
        validloader = dataloader['valid']
        valid_dataset = dataset['valid']

        classes_num = len(train_dataset.classes)

        # Build (or restore) the model for this architecture / head size
        model = load_checkpoint(hidden_units, arch, classes_num)

        optimizer = optim.Adam(model.classifier.parameters(), learning_rate)
        criterion = nn.NLLLoss()

        model.to(device)
        step = 0
        running_loss = 0
        print_every = 5

        for epoch in range(epochs):
            for images, labels in trainloader:
                step += 1
                images, labels = images.to(device), labels.to(device)

                # forward pass, loss, backprop, weight update
                optimizer.zero_grad()
                logps = model(images)
                batch_train_loss = criterion(logps, labels)
                batch_train_loss.backward()
                optimizer.step()
                running_loss += batch_train_loss.item()

                # periodic validation + progress report
                if step % print_every == 0:
                    valid_loss = 0
                    accuracy = 0
                    model.eval()
                    with torch.no_grad():
                        # Validation loop
                        for images, labels in validloader:
                            images, labels = images.to(device), labels.to(
                                device)
                            logps = model(images)
                            valid_loss += criterion(logps, labels).item()

                            # top-1 accuracy for this batch
                            ps = torch.exp(logps)
                            top_ps, top_class = ps.topk(1, dim=1)
                            equality = top_class == labels.view(
                                *top_class.shape)
                            accuracy += torch.mean(
                                equality.type(torch.FloatTensor)).item()

                    print(
                        f"Epoch {epoch+1}/{epochs}.."
                        f"Train loss: {running_loss/print_every:.3f}.. "
                        f"Validation loss: {valid_loss/len(validloader):.3f}.. "
                        f"Validation accuracy: {accuracy/len(validloader):.3f}"
                    )
                    running_loss = 0
                    model.train()

        # persist the class mapping and trained weights
        model.class_to_idx = train_dataset.class_to_idx
        checkpoint_dictionary = {
            'hidden_units': hidden_units,
            'arch': arch,
            'class_to_idx': model.class_to_idx,
            'state_dict': model.state_dict()
        }
        torch.save(checkpoint_dictionary, checkpoint)
    return model
Example #19
0
def main():
    """Train three MADDPG agents on parallel environments.

    Runs `number_of_episodes` episodes across `parallel_envs` simultaneous
    environment copies, pushing transitions into a replay buffer, updating
    the actor/critic networks every `episode_per_update` episodes, logging
    mean rewards to TensorBoard, and periodically checkpointing the agent
    parameters and a rendered gif to `model_dir`.

    No parameters; all hyperparameters are set locally below.
    """
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how many episodes to save policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"

    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ',
        pb.Counter(), '/',
        str(number_of_episodes), ' ',
        pb.Percentage(), ' ',
        pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]

    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep workspace from disconnecting
    for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):

        timer.update(episode)

        # per-agent reward accumulator for this episode, one row per env copy
        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # checkpoint/gif on the save interval and on the final batch of episodes
        save_info = ((episode) % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of list
            # flip the first two indices
            # input to step requires the first index to correspond to number of parallel agents
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(
                actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards, next_obs,
                          next_obs_full, dones)

            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))

        # update once after every episode_per_update
        if len(buffer
               ) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets(
            )  #soft update the target network towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            # reset accumulators so each log point reflects only the last window
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        #saving model
        save_dict_list = []
        if save_info:
            for i in range(3):

                save_dict = {
                    'actor_params':
                    maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                    maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                    maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                    maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            # save once after collecting all three agents' state dicts
            # (the original saved inside the loop, rewriting the file three
            # times per checkpoint with partial lists; the final file
            # contents are identical)
            torch.save(
                save_dict_list,
                os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir,
                                         'episode-{}.gif'.format(episode)),
                            frames,
                            duration=.04)

    env.close()
    logger.close()
    timer.finish()
Example #20
0
                 ('relu1', nn.ReLU()),
                 ('hidden_layer1', nn.Linear(hidden_units, 90)),
                 ('relu2', nn.ReLU()), ('hidden_layer2', nn.Linear(90, 80)),
                 ('relu3', nn.ReLU()), ('hidden_layer3', nn.Linear(80, 102)),
                 ('output', nn.LogSoftmax(dim=1))]))
model.classifier = classifier
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.classifier.parameters(),
                      learning_rate,
                      momentum=0.9)
if gpu:
    model.cuda()

# Train the network
print_every = 10
for epoch in keep_awake(range(1, num_epochs + 1)):
    t_loss = 0.0
    for i, (t_image, t_label) in enumerate(trainloader, 1):
        if gpu:
            t_image = t_image.cuda()
            t_label = t_label.cuda()
        #Reset gradients
        optimizer.zero_grad()
        #Forward
        output = model.forward(t_image)
        loss = criterion(output, t_label)
        #Backword
        loss.backward()
        #parameter update
        optimizer.step()
        t_loss += loss.item()