def setup(self):
    """Build the model, data loader, loss and optimizer from ``self.config``.

    A checkpoint named ``'setup'`` is saved first. The hyper-parameters are
    then pulled out of ``vars(self.config)`` using the externally defined
    ``flags`` sequence, which must yield exactly ten values in the order
    unpacked below.

    Sets ``self.model``, ``self.data_loader``, ``self.criterion`` and
    ``self.optimizer``.
    """
    self.save_checkpoint(name='setup')

    settings = vars(self.config)
    (kind, seq_len, in_dim, n_classes, n_hidden, batch, lr, _train_steps,
     _max_norm, device_name) = itemgetter(*flags)(settings)

    assert kind in ('RNN', 'LSTM')

    # Device the model runs on.
    # TODO: debug CUDA issues
    device = torch.device(device_name)

    # Both model classes are constructed with the same positional arguments.
    if kind == 'LSTM':
        model_cls = LSTM
    else:
        model_cls = VanillaRNN
    self.model = model_cls(seq_len, in_dim, n_hidden, n_classes, batch, device)
    self.model.to(device)

    # The dataset is built with one more than the model's input length
    # (note the +1).
    palindromes = PalindromeDataset(seq_len + 1)
    self.data_loader = DataLoader(palindromes, batch, num_workers=1)

    # Loss and optimizer.
    self.criterion = torch.nn.CrossEntropyLoss()
    self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=lr)
Beispiel #2
0
def train(config):
    """Plot the values returned by ``get_grads`` for a VanillaRNN and an LSTM.

    Both models are built on the CPU from the same ``config`` fields
    (input_length, input_dim, num_hidden, num_classes) and share one
    PalindromeDataset / DataLoader with batch size 1. The two result series
    are drawn as points on a log-scaled y axis and shown interactively.
    """
    # Everything in this comparison runs on the CPU.
    device = torch.device("cpu")

    # Dataset and loader (note the +1 on the sequence length).
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, 1, num_workers=1)

    # Positional arguments shared by both model constructors.
    shared_args = (config.input_length, config.input_dim, config.num_hidden,
                   config.num_classes, device, True, False)

    rnn = VanillaRNN(*shared_args)
    grads_vanilla = get_grads(rnn, device, data_loader, dataset,
                              config.num_classes)

    lstm = LSTM(*shared_args)
    grads_lstm = get_grads(lstm, device, data_loader, dataset,
                           config.num_classes)

    import matplotlib.pyplot as plt
    for series, label in ((grads_vanilla, 'VanillaRNN'), (grads_lstm, 'LSTM')):
        plt.plot(series, "o", label=label)
    plt.yscale("log")
    plt.xlabel("Time step")
    plt.ylabel("Gradient magnitude (log)")
    plt.title(
        "Comparison of gradient backprop in types of RNN\nInitialization: xavier_uniform, LSTM forget gate bias: 2"
    )
    plt.legend()
    plt.show()
Beispiel #3
0
def train(config):
    """Train a VanillaRNN or LSTM on PalindromeDataset.

    Reads from ``config``: model_type, tensorboard, summary, record_plot, csv,
    device, input_length, input_dim, num_hidden, num_classes, batch_size,
    gradient_check, learning_rate, max_norm and train_steps.

    Every 10 steps the metrics are printed and, additionally, either sent to a
    SummaryWriter (when ``config.tensorboard`` is set) or appended as a row to
    the CSV file at ``config.csv`` (when ``config.record_plot`` is set).

    Training stops when ``step == config.train_steps``, or earlier once the
    loss is at most 1e-3 and the five most recent batch accuracies are all 1.0.

    Returns:
        The accuracy tensor of the last processed batch.
    """
    assert config.model_type in ('RNN', 'LSTM')

    tb_writer = None
    csv_path = None
    if config.tensorboard:
        tb_writer = SummaryWriter(config.summary +
                                  datetime.now().strftime("%Y%m%d-%H%M%S"))
    elif config.record_plot:
        csv_path = config.csv
        # Write the header only once; later runs append to the same file.
        if not os.path.isfile(csv_path):
            with open(csv_path, 'w', newline='') as f:
                csv.writer(f).writerow(
                    ['model_type', 'step', 'input_length', 'accuracy', 'loss'])

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use
    model_cls = VanillaRNN if config.model_type == 'RNN' else LSTM
    model = model_cls(config.input_length,
                      config.input_dim,
                      config.num_hidden,
                      config.num_classes,
                      config.batch_size,
                      config.gradient_check,
                      device=device)
    model.to(device)

    optimizer = torch.optim.RMSprop(model.parameters(),
                                    lr=config.learning_rate)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss
    criterion = torch.nn.CrossEntropyLoss()

    # Accuracies of the most recent steps, used by the early-stopping test.
    recent_accuracies = []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        out = model.forward(batch_inputs)
        loss = criterion(out, batch_targets)
        loss.backward()

        # Rescale the gradients so their total norm does not exceed
        # config.max_norm before the optimizer applies them.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)
        optimizer.step()

        predictions = out.argmax(dim=-1)
        accuracy = (predictions == batch_targets).float().mean()
        loss_value = loss.item()
        accuracy_value = accuracy.item()

        recent_accuracies.append(accuracy_value)
        del recent_accuracies[:-5]  # keep only the last five entries

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 10 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Model type {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, config.model_type,
                    examples_per_second, accuracy_value, loss_value))
            if tb_writer is not None:
                tb_writer.add_scalar('training_loss', loss_value, step)
                tb_writer.add_scalar('accuracy', accuracy_value, step)
            elif csv_path is not None:
                with open(csv_path, 'a', newline='') as f:
                    csv.writer(f).writerow([
                        config.model_type, step, config.input_length,
                        accuracy_value, loss_value
                    ])

        # Early stop on convergence. The previous test compared tensors to
        # 1.0 with `is`, which is never true, so it did not look at the
        # accuracies at all.
        converged = (loss_value <= 1e-3 and len(recent_accuracies) >= 5
                     and all(a == 1.0 for a in recent_accuracies))
        if converged:
            break
        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    if tb_writer is not None:
        tb_writer.close()

    return accuracy
Beispiel #4
0
def train(config):
    """Train one model per palindrome length and plot the test accuracies.

    For every length in a fixed list, a fresh model is trained and evaluated
    every 10 steps on a batch of 4000 samples drawn once from the dataset.
    After a warm-up (2500 steps for ``in_len < 9``, 4000 otherwise) training
    for that length stops as soon as the test accuracy is 1, is lower than
    one of the last five recorded accuracies, or the variance of the last
    five recorded accuracies is below 0.001. It also stops when
    ``step == config.train_steps``. One accuracy per length is collected and
    plotted to ``figs/LSTM_Acc_basic.png``.

    Mutates ``config.model_type`` and ``config.input_length``.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # NOTE(review): the model type is forced to LSTM here, overriding the
    # caller's setting and making the RNN branch below unreachable. Left in
    # place because the output file name also hard-codes LSTM.
    config.model_type = 'LSTM'

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    p_len_list = [4, 9, 14, 15, 16, 17, 18, 19, 24, 29, 34, 39, 44, 49]
    p_acc = []
    for in_len in p_len_list:
        print("The Palendrom length is: " + str(in_len+1))
        config.input_length = in_len

        np.random.seed(42)

        if config.model_type == 'RNN':
            model = VanillaRNN(config.input_length, config.input_dim, config.num_hidden, config.num_classes, device)
        elif config.model_type == 'LSTM':
            model = LSTM(config.input_length, config.input_dim, config.num_hidden, config.num_classes, device)

        # Initialize the dataset (note the +1) and draw a fixed test batch of
        # 4000 samples before building the training loader.
        dataset = PalindromeDataset(config.input_length+1)
        test_loader = DataLoader(dataset, 4000, num_workers=1)
        (test_inputs, test_targets) = next(iter(test_loader))

        data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

        # Setup the loss and optimizer
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.RMSprop(model.parameters(), lr=config.learning_rate)

        # Test accuracies recorded so far for this palindrome length.
        test_acc = []
        for step, (batch_inputs, batch_targets) in enumerate(data_loader):

            model_out = model.forward(batch_inputs)
            loss = criterion(model_out, batch_targets)
            optimizer.zero_grad()
            loss.backward()

            # Cap the total gradient norm at config.max_norm so exploding
            # gradients cannot produce an oversized update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)

            optimizer.step()

            if step % 10 == 0:
                test_out = model.forward(test_inputs)
                accuracy = np.average((torch.max(test_out, 1)[1] == test_targets))

                past_warmup = (step > 2500 and in_len < 9) or (step > 4000 and in_len >= 9)
                if past_warmup:
                    if accuracy == 1:
                        print(str(step))
                        print("We have convergence due to 1,  accuracy is: " + str(accuracy))
                        p_acc.append(accuracy)
                        break
                    elif not all(x <= accuracy for x in test_acc[-5:]):
                        print(str(step))
                        print("We have convergence due to being worse than last 5, accuracy is: " + str(accuracy) + ". Best is:  " + str(max(test_acc)))
                        p_acc.append(max(test_acc))
                        break
                    elif np.var(test_acc[-5:]) < 0.001:
                        print(str(step))
                        print("We have convergence due to variance low, accuracy is: " + str(accuracy) + ". Best is:  " +  str(max(test_acc)))
                        p_acc.append(max(test_acc))
                        break
                    else:
                        test_acc.append(accuracy)
                else:
                    test_acc.append(accuracy)

            if step == config.train_steps:
                # Out of budget: record the best test accuracy seen so far.
                # (This previously appended to a misspelled name and raised
                # NameError.)
                # If you receive a PyTorch data-loader error, check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                print("We havent converged, but we ran out of time")
                p_acc.append(max(test_acc))
                break

    print('Done training.')
    plt.plot(p_len_list, p_acc)
    plt.title("Accuracy for Different Lengths of Palindrome for Test Set")
    plt.xlabel("Palindrome Length")
    plt.ylabel("Accuracy")
    plt.savefig('figs/LSTM_Acc_basic.png')
    plt.show()
Beispiel #5
0
def train(config):
    """Train a VanillaRNN or LSTM and log progress to a text file and a plotter.

    Reads from ``config``: model_type, input_length, optimizer, learning_rate,
    input_dim, num_hidden, num_classes, batch_size, max_norm and train_steps.
    ``config.optimizer == "adam"`` selects Adam; any other value selects
    RMSprop.

    Every 100 steps a summary line is written to ``MODEL_FOLDER + filename``,
    the values are added to a LossAccPlotter that saves under
    ``IMAGES_FOLDER + filename``, and the same information is printed.
    Training stops when ``step == config.train_steps``.
    """
    assert config.model_type in ('RNN', 'LSTM')

    for folder in (MODEL_FOLDER, IMAGES_FOLDER):
        if not os.path.isdir(folder):
            os.mkdir(folder)

    filename = config.model_type + '_nods' + '_length_input=' + str(
        config.input_length) + '_optimizer=' + config.optimizer + '_lr=' + str(
            config.learning_rate).replace('.', ',')
    print("Training " + config.model_type + " " + str(config.input_length) +
          " optimizer " + config.optimizer + ' lr ' +
          str(config.learning_rate))

    plotter = LossAccPlotter(config.model_type + ' input length ' +
                             str(config.input_length) + ' optimizer ' +
                             config.optimizer,
                             IMAGES_FOLDER + filename,
                             x_label="Steps",
                             show_regressions=False)

    # Initialize the device which to run the model on
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model = VanillaRNN(config.input_length, config.input_dim,
                           config.num_hidden, config.num_classes,
                           config.batch_size)
    elif config.model_type == 'LSTM':
        model = LSTM(config.input_length, config.input_dim, config.num_hidden,
                     config.num_classes, config.batch_size)
    # The inputs are moved to `device` below, so the model must live there too.
    model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = nn.CrossEntropyLoss()

    if config.optimizer == "adam":
        optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    else:
        optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate)

    # The context manager closes the log file even if training raises.
    with open(MODEL_FOLDER + filename, 'w') as log_file:
        for step, (batch_inputs, batch_targets) in enumerate(data_loader):
            # Only for time measurement of step through network
            t1 = time.time()

            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            predictions = model(batch_inputs)
            loss = criterion(predictions, batch_targets)
            accuracy = get_accuracy(predictions, batch_targets)

            optimizer.zero_grad()
            loss.backward()
            # Clip after backward() and before step(): only then do the
            # freshly computed gradients get rescaled before they are applied.
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_norm=config.max_norm)
            optimizer.step()

            # Just for time measurement
            t2 = time.time()
            examples_per_second = config.batch_size / float(t2 - t1)

            if step % 100 == 0:
                loss_value = loss.item()
                info = "Train Step {:04d}/{:04d}: Accuracy = {:.2f}, Loss = {:.3f}".format(
                    step, config.train_steps, accuracy, loss_value)
                log_file.write(info + '\n')

                plotter.add_values(step,
                                   loss_train=loss_value,
                                   acc_train=accuracy,
                                   redraw=False)

                print(
                    "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                    "Accuracy = {:.2f}, Loss = {:.3f}".format(
                        datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                        config.train_steps, config.batch_size,
                        examples_per_second, accuracy, loss_value))

            if step == config.train_steps:
                # If you receive a PyTorch data-loader error, check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                break

    plotter.redraw(plot=False)
    print('Done training.')
Beispiel #6
0
def train(config):
    """Train a VanillaRNN or LSTM and record the training accuracy to a CSV file.

    Reads from ``config``: model_type, device, input_length, input_dim,
    num_hidden, num_classes, batch_size, optim ("RMSprop" or "Adam"),
    learning_rate, max_norm, print_every and train_steps.

    Every ``config.print_every`` steps the batch accuracy is printed and
    appended as ``step;accuracy`` to
    ``results/<model>_len<length>_<optim>_batch<batch>.csv``. Training stops
    when ``step == config.train_steps``; afterwards the 95th percentile of
    the recorded accuracies is printed.

    Raises:
        ValueError: if ``config.optim`` is neither "RMSprop" nor "Adam".
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Honour the requested device, falling back to the CPU when CUDA was
    # requested but is not available on this machine.
    device = torch.device(config.device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        device = torch.device('cpu')

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model = VanillaRNN(config.input_length, config.input_dim,
                           config.num_hidden, config.num_classes,
                           config.batch_size, device)
    elif config.model_type == 'LSTM':
        model = LSTM(config.input_length, config.input_dim, config.num_hidden,
                     config.num_classes, config.batch_size, device)

    model = model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer_classes = {
        "RMSprop": torch.optim.RMSprop,
        "Adam": torch.optim.Adam,
    }
    if config.optim not in optimizer_classes:
        raise ValueError("Unsupported optimizer: {!r}".format(config.optim))
    optimizer = optimizer_classes[config.optim](model.parameters(),
                                                lr=config.learning_rate)

    output_file = "results/{}_len{}_{}_batch{}.csv".format(
        config.model_type, config.input_length, config.optim,
        config.batch_size)
    with open(output_file, "w+") as f:
        f.write("step;accuracy\n")

    accuracies = []

    # Timing covers all steps since the previous report.
    t1 = time.time()
    last_report_step = -1

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Move the batch to the target device with the dtypes used below.
        x = batch_inputs.to(device=device, dtype=torch.float32)
        y = batch_targets.to(device=device, dtype=torch.long)

        # Forward pass
        predictions = model(x)
        loss = criterion(predictions, y)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # Rescale the gradients so their total norm does not exceed
        # config.max_norm before the optimizer applies them.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        if step % config.print_every == 0:
            # Average throughput over the steps since the last report.
            t2 = time.time()
            steps_timed = step - last_report_step
            examples_per_second = (steps_timed *
                                   config.batch_size) / float(t2 - t1)

            correct = torch.sum(predictions.argmax(dim=1) == y).item()
            accuracy = correct / len(batch_inputs)
            accuracies.append(accuracy)
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, examples_per_second,
                    accuracy, loss.item()))

            with open(output_file, "a+") as f:
                f.write("%d;%f\n" % (step, accuracy))

            # Restart the timing window.
            last_report_step = step
            t1 = time.time()

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
    print(np.percentile(np.array(accuracies), 95))
def _hidden_grad_norms(model, data_loader, criterion):
    """Backpropagate one batch and return the gradient norm of each ``model.h_list`` entry."""
    (batch_inputs, batch_targets) = next(iter(data_loader))
    model.zero_grad()
    model_out = model.forward(batch_inputs)
    loss = criterion(model_out, batch_targets)
    loss.backward()
    # Plain floats, so plotting does not rely on tensor-to-NumPy conversion.
    return [torch.norm(h.grad).item() for h in model.h_list]


def train(config):
    """Plot per-entry gradient norms of ``h_list`` for a VanillaRNN and an LSTM.

    ``config.input_length`` is overwritten with 24. Each model processes a
    single batch; the loss is backpropagated once and the norm of ``h.grad``
    is taken for every ``h`` in ``model.h_list``. Both curves are drawn on a
    log-scaled y axis, saved to ``figs/Gradient_Norms.png`` and shown.
    """
    assert config.model_type in ('RNN', 'LSTM')
    plt.figure(0)

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    in_len = 24
    print("The Palendrom length is: " + str(in_len + 1))
    config.input_length = in_len

    np.random.seed(42)

    rnn = VanillaRNN(config.input_length, config.input_dim,
                     config.num_hidden, config.num_classes, device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    criterion = torch.nn.CrossEntropyLoss()

    RNN_grad_norms = _hidden_grad_norms(rnn, data_loader, criterion)
    plt.plot(np.arange(0, len(RNN_grad_norms)), RNN_grad_norms, label='RNN')

    lstm = LSTM(config.input_length, config.input_dim, config.num_hidden,
                config.num_classes, device)
    LSTM_grad_norms = _hidden_grad_norms(lstm, data_loader, criterion)

    print('Done training.')
    plt.plot(np.arange(0, len(LSTM_grad_norms)), LSTM_grad_norms, label='LSTM')
    plt.title("Gradient Norms through 50 Time Steps in the RNN and LSTM")
    plt.xlabel("Time Steps")
    plt.ylabel("Gradient Norms")
    plt.yscale('log')
    plt.legend()
    plt.savefig('figs/Gradient_Norms.png')
    plt.show()
Beispiel #8
0
def train(config, acc_th=0.99, epsilon=0.01):
    """Train a VanillaRNN or LSTM and return running means of loss and accuracy.

    Reads from ``config``: learning_rate, input_length, model_type, device,
    batch_size, max_norm and train_steps. The whole ``config.__dict__`` is
    also forwarded to the model constructor as keyword arguments.

    Every 50 steps the mean of the last (up to) 50 losses and accuracies is
    recorded and a progress line is printed. Training stops when a recorded
    mean loss drops below ``epsilon`` or once ``config.train_steps`` steps
    have been taken.

    Args:
        config: namespace holding the hyper-parameters listed above.
        acc_th: currently unused; kept for interface compatibility.
        epsilon: mean-loss threshold below which training stops early.

    Returns:
        Tuple ``(mean_loss_list, mean_accuracy_list)``.
    """
    learning_rate = config.learning_rate
    seq_length = config.input_length

    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use; both constructors
    # receive the full config as keyword arguments.
    if config.model_type == 'RNN':
        model = VanillaRNN(seq_length, **config.__dict__)
    else:
        model = LSTM(seq_length, **config.__dict__)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss (cross entropy) and optimizer (RMSprop)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)

    loss_list = list()
    accuracy_list = list()

    mean_loss_list = list()
    mean_accuracy_list = list()

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):
        X = batch_inputs.to(device)
        y = batch_targets.to(device)

        # Add a trailing feature dimension when the LSTM expects one more
        # dimension than the batch provides.
        if config.model_type == 'LSTM' and X.dim() != len(model.X_dimensions):
            X = X.view(X.size()[0], X.size()[1], 1)

        # Only for time measurement of step through network
        t1 = time.time()

        optimizer.zero_grad()
        outputs = model.forward(X)

        loss_current = criterion(outputs, y)
        # NOTE(review): retain_graph=True is kept from the original code;
        # confirm whether the model really needs its graph kept alive.
        loss_current.backward(retain_graph=True)

        # Rescale the gradients so their total norm does not exceed
        # config.max_norm. This must happen before optimizer.step(),
        # otherwise the clipped gradients are never used.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)
        optimizer.step()

        loss = loss_current.item()
        accuracy = (outputs.argmax(dim=1) == y.long()).sum().item() / float(
            y.shape[0])

        loss_list.append(loss)
        accuracy_list.append(accuracy)

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        converged = False
        if step % 50 == 0:

            mean_loss_list.append(np.mean(loss_list[-50:]))
            mean_accuracy_list.append(np.mean(accuracy_list[-50:]))

            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, examples_per_second,
                    accuracy, loss))

            converged = mean_loss_list[-1] < epsilon

        # The step limit is checked on every step so that a limit which is
        # not a multiple of 50 still terminates the loop.
        if converged or step >= config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            print(mean_loss_list[-1])
            print(mean_accuracy_list[-1])
            break

    print('Done training.')
    return mean_loss_list, mean_accuracy_list
Beispiel #9
0
def train(config):
    """Train ``config.avg_over`` fresh models and print their mean test accuracy.

    Reads from ``config``: model_type, device, input_length, batch_size,
    train_log, avg_over, input_dim, num_hidden, num_classes, linear,
    learning_rate, max_norm and train_steps.

    Each run trains a new VanillaRNN or LSTM until
    ``step == config.train_steps`` and is then evaluated with ``acc`` on one
    batch of 10000 samples. Progress lines are emitted every 10 steps, to
    stdout when ``config.train_log == "STDOUT"`` and otherwise to the file
    named by ``config.train_log``. The average of the per-run test
    accuracies is printed without a trailing newline.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)
    test_loader = DataLoader(dataset, 10000, num_workers=1)

    outfile = None
    if config.train_log != "STDOUT":
        outfile = open(config.train_log, 'w')

    # try/finally guarantees the log file is closed even if training raises.
    try:
        accuracy_avg = 0
        model_cls = VanillaRNN if config.model_type == 'RNN' else LSTM

        for i in range(config.avg_over):
            # Initialize the model that we are going to use
            model = model_cls(config.input_length,
                              config.input_dim,
                              config.num_hidden,
                              config.num_classes,
                              device,
                              linear=config.linear)
            model.to(device)
            optimizer = optim.RMSprop(model.parameters(), config.learning_rate)

            # Setup the loss
            criterion = nn.CrossEntropyLoss()

            for step, (batch_inputs, batch_targets) in enumerate(data_loader):

                # Only for time measurement of step through network
                t1 = time.time()

                batch_inputs = batch_inputs.to(device)
                batch_targets = batch_targets.to(device)

                optimizer.zero_grad()
                pred = model.forward(batch_inputs)

                loss = criterion(pred, batch_targets)
                accuracy = acc(
                    pred,
                    F.one_hot(batch_targets,
                              num_classes=config.num_classes).float(),
                    config.num_classes)
                loss.backward()

                # Gradient clipping: repeated application of the same
                # operation can make gradients very large, so their total
                # norm is capped at config.max_norm to bound the size of
                # the update step.
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_norm=config.max_norm)

                optimizer.step()

                # Just for time measurement
                t2 = time.time()
                examples_per_second = config.batch_size / float(t2 - t1)

                if step % 10 == 0:
                    message = (
                        "[{}] Averaging Step: {} Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                        "Accuracy = {:.2f}, Loss = {:.3f}".format(
                            datetime.now().strftime("%Y-%m-%d %H:%M"), i, step,
                            config.train_steps, config.batch_size,
                            examples_per_second, accuracy, loss))
                    if outfile is not None:
                        outfile.write(message + "\n")
                    else:
                        print(message)

                if step == config.train_steps:
                    # If you receive a PyTorch data-loader error, check this bug report:
                    # https://github.com/pytorch/pytorch/pull/9655
                    break

            # Evaluate the trained model on one large batch.
            test_inputs, test_targets = next(iter(test_loader))
            test_inputs = test_inputs.to(device)
            test_targets = test_targets.to(device)

            with torch.no_grad():
                pred = model.forward(test_inputs)

            accuracy = acc(
                pred,
                F.one_hot(test_targets,
                          num_classes=config.num_classes).float(),
                config.num_classes)
            accuracy_avg += accuracy

        print(accuracy_avg / config.avg_over, end='')
    finally:
        if outfile is not None:
            outfile.close()
Beispiel #10
0
def train(config):
    """Train a VanillaRNN or LSTM on palindrome batches and return the model.

    Training iterates over a ``DataLoader`` built on ``PalindromeDataset`` and
    stops when ``step == config.train_steps`` or when the loss is judged to
    have converged. Convergence is only tested on every 100th step, and only
    once ``step`` exceeds a minimum step count derived from the sequence
    length and model type.

    Args:
        config: Object exposing ``model_type`` ('RNN' or 'LSTM'), ``device``,
            ``input_length``, ``input_dim``, ``num_hidden``, ``num_classes``,
            ``batch_size``, ``learning_rate``, ``max_norm`` and
            ``train_steps``.

    Returns:
        The model after training.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize params for models
    seq_length = config.input_length
    input_dim = config.input_dim
    num_hidden = config.num_hidden
    num_classes = config.num_classes

    print(seq_length, input_dim, num_classes, num_hidden)

    # Convergence threshold: |mean(recent losses) - latest loss| < epsilon
    epsilon = 5e-4
    # Minimum number of steps before convergence may be declared; the LSTM
    # trains slower, so it gets more iterations.
    if seq_length < 30:
        if config.model_type == 'RNN':
            min_steps = 3000 if seq_length > 15 else 1000
        else:
            min_steps = 5000 if seq_length > 15 else 1500
    else:
        min_steps = 6500

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model = VanillaRNN(seq_length, input_dim, num_hidden, num_classes,
                           device)
    else:
        model = LSTM(seq_length, input_dim, num_hidden, num_classes, device)

    model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate)

    # Train losses and accuracies for debugging purposes
    accuracies, losses = [], []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        # Convert to a one-hot representation along a new last dimension.
        # NOTE(review): assumes batch_inputs is (batch, seq) of class indices
        # and that the model expects num_classes-wide inputs -- confirm.
        batch_inputs = torch.scatter(
            torch.zeros(*batch_inputs.size(), num_classes), 2,
            batch_inputs[..., None].to(torch.int64), 1).to(device)

        batch_targets = batch_targets.to(device)

        train_output = model.forward(batch_inputs)
        loss = criterion(train_output, batch_targets)

        optimizer.zero_grad()
        loss.backward()

        # Clip exploding gradients. This has to happen after backward() has
        # produced the gradients and before step() consumes them; clipping
        # ahead of zero_grad()/backward() has no effect on the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        accuracy = torch.sum(
            torch.eq(torch.argmax(train_output, dim=1),
                     batch_targets)).item() / train_output.size(0)
        accuracies.append(accuracy)
        losses.append(loss.item())

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 100 == 0:

            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, examples_per_second,
                    accuracy, loss))

            # Compare the latest loss with the mean of a window of up to 100
            # earlier losses.
            if step > min_steps and (
                    np.absolute(np.mean(losses[-102:-2]) - losses[-1]) <
                    epsilon):
                print("Convergence reached after {} steps".format(step))
                break

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
    return model
Beispiel #11
0
def train(config):
    """Train a VanillaRNN or LSTM and return the recorded accuracy/loss curves.

    ``config.measure_type`` selects what is recorded:

    * ``0`` - accuracy and loss of every step;
    * ``2`` - on every 10th step, the mean over the steps since the previous
      log point;
    * any other value - accuracy and loss of every 10th step only.

    Unless ``config.overview_length`` is truthy, both curves are plotted with
    matplotlib after training.

    Args:
        config: Object exposing ``model_type`` ('RNN' or 'LSTM'), ``device``,
            ``input_length``, ``input_dim``, ``num_hidden``, ``num_classes``,
            ``batch_size``, ``learning_rate``, ``max_norm``, ``train_steps``,
            ``measure_type`` and ``overview_length``.

    Returns:
        Tuple ``(list_train_acc, list_train_loss)`` of recorded values.
    """

    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use
    if config.model_type == "RNN":
        model = VanillaRNN(config.input_length, config.input_dim,
                           config.num_hidden, config.num_classes,
                           config.batch_size, config.device)
    else:
        model = LSTM(config.input_length, config.input_dim, config.num_hidden,
                     config.num_classes, config.batch_size, config.device)

    # The model's own device attribute takes precedence over the one above.
    device = model.device
    model = model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(),
                                    lr=config.learning_rate)

    list_train_acc = []
    list_train_loss = []
    # Per-step values collected since the last log point (measure_type == 2).
    acc_average = []
    loss_average = []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        output = model(batch_inputs)
        loss = criterion(output, batch_targets)
        model.zero_grad()
        loss.backward()
        # Only keep the Python float; holding the tensor would keep the
        # autograd graph alive.
        loss = loss.item()

        # Rescale the gradients so that their total norm does not exceed
        # max_norm. This guards against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        number_predictions = torch.argmax(output, dim=1)
        result = number_predictions == batch_targets
        accuracy = result.sum().item() / len(batch_targets)

        if config.measure_type == 2:
            acc_average.append(accuracy)
            loss_average.append(loss)

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        # Add accuracy and loss to list
        if step % 10 == 0:

            # Average accuracy and loss over the steps since the last log
            # point. Divide by the real sample count: the first window
            # (step 0) holds a single sample, not ten.
            if config.measure_type == 2:
                accuracy = sum(acc_average) / len(acc_average)
                loss = sum(loss_average) / len(loss_average)
                acc_average = []
                loss_average = []

            # Either the values of this 10th step or the window average.
            list_train_acc.append(accuracy)
            list_train_loss.append(loss)

            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, examples_per_second,
                    accuracy, loss))
        elif config.measure_type == 0:
            # Track accuracy and loss for every step, not only each 10th.
            list_train_acc.append(accuracy)
            list_train_loss.append(loss)

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')

    if not config.overview_length:
        # x-axis matching the recording scheme used above
        if config.measure_type == 0:
            eval_steps = list(range(config.train_steps + 1))  # every step
        else:
            eval_steps = list(range(0, config.train_steps + 10, 10))

        print(len(list_train_acc))
        plt.plot(eval_steps, list_train_acc, label="Train accuracy")
        plt.xlabel("Step")
        plt.ylabel("Accuracy")
        plt.title("Training accuracies", fontsize=18, fontweight="bold")
        plt.legend()
        plt.show()

        plt.plot(eval_steps, list_train_loss, label="Train loss")
        plt.xlabel("Step")
        plt.ylabel("Loss")
        plt.title("Train loss", fontsize=18, fontweight="bold")
        plt.legend()
        plt.show()

    return (list_train_acc, list_train_loss)
Beispiel #12
0
def run(model_type, input_length, input_dim, num_classes, num_hidden,
        batch_size, learning_rate, train_steps, max_norm, device):
    """Train a VanillaRNN or LSTM and return the accuracy of the last batch.

    Args:
        model_type: 'RNN' or 'LSTM'.
        input_length: Sequence length handed to the model; the dataset is
            built with ``input_length + 1``.
        input_dim: Forwarded to the model constructor.
        num_classes: Forwarded to the model constructor.
        num_hidden: Forwarded to the model constructor.
        batch_size: Batch size for the data loader and the model.
        learning_rate: Learning rate of the RMSprop optimizer.
        train_steps: Training stops once ``step == train_steps``.
        max_norm: Maximum total gradient norm used for clipping.
        device: Anything accepted by ``torch.device``.

    Returns:
        Accuracy of the final processed batch as a Python float.
    """
    assert model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(device)

    # Initialize the model that we are going to use
    model_pars = [
        input_length, input_dim, num_hidden, num_classes, batch_size, device
    ]
    model = LSTM(*model_pars) \
        if model_type == 'LSTM' \
        else VanillaRNN(*model_pars)
    model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(input_length + 1)
    data_loader = DataLoader(dataset, batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        # The model lives on `device`, so the batch has to be there as well.
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        ys = model.forward(batch_inputs)
        loss = criterion(ys, batch_targets)
        loss.backward()

        # Clip the total gradient norm to guard against exploding gradients.
        # This must sit between backward() and step(); clipping before
        # backward() acts on stale (or missing) gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)

        optimizer.step()

        predictions = ys.argmax(dim=-1)
        accuracy = (batch_targets == predictions).float().mean()

        # Just for time measurement
        t2 = time.time()
        examples_per_second = batch_size / float(t2 - t1)

        if step % 10 == 0:

            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    train_steps, batch_size, examples_per_second, accuracy,
                    loss))

        if step == train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
    return (accuracy.item())
Beispiel #13
0
def train(config):
    """Train a VanillaRNN or LSTM and pickle the config and learning curves.

    A fresh directory ``config.summary_path + config_to_str(config) + '/'`` is
    created (this raises if it already exists). The config is pickled into
    ``config.pkl`` before training, and the ``(losses, accuracies)`` curves,
    sampled every 10th step, are pickled into ``learning_curves.pkl`` after
    training.

    When ``config.predict_half`` is truthy the loss is computed over a slice
    of the model output covering several positions; otherwise only over the
    single target per sequence.
    """

    assert config.model_type in ('RNN', 'LSTM')

    model_dir = config.summary_path + config_to_str(config) + '/'
    os.makedirs(model_dir)  # deliberately fails when the run already exists

    with open(model_dir + 'config.pkl', 'wb+') as config_file:
        pickle.dump(config, config_file)

    # Device that the batches are moved to
    device = torch.device(config.device)

    # Both model classes take the same constructor arguments
    network_cls = VanillaRNN if config.model_type == 'RNN' else LSTM
    model = network_cls(config.input_length, config.input_dim, config.num_hidden,
                        config.num_classes, config.batch_size, config.device, config.predict_half)

    # Dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length+1, config.batch_size, config.train_steps)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Loss and optimizer
    criterion = torch.nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=config.learning_rate)

    loss_curve, accuracy_curve = [], []
    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()
        batch_inputs = Variable(batch_inputs.to(device))
        batch_targets = Variable(batch_targets.to(device))

        optimizer.zero_grad()
        batch_output = model.forward(batch_inputs)
        if not config.predict_half:
            # One prediction per sequence
            loss = criterion(batch_output, batch_targets)
            hits = torch.sum(batch_output.argmax(1) == batch_targets)
            accuracy = float(hits) / batch_output.shape[0]
        else:
            # Many predictions per sequence: the targets are the inputs after
            # the midpoint followed by the final target.
            half = int(config.input_length/2)
            tail_inputs = batch_inputs[:, half+1:]
            last_target = batch_targets.unsqueeze(-1).float()
            long_target = torch.cat((tail_inputs, last_target), 1)
            long_predictions = batch_output[:, half:]
            flat_predictions = long_predictions.contiguous().view(-1, long_predictions.size()[-1])
            flat_target = long_target.contiguous().view(-1).long()
            loss = criterion(flat_predictions, flat_target)
            hits = torch.sum(long_predictions.argmax(2) == long_target.long())
            accuracy = float(hits) / (long_predictions.shape[0] * long_predictions.shape[1])
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)
        optimizer.step()

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size/float(t2-t1)

        # Curves are sampled every 10th step, progress printed every 200th
        if step % 10 != 0:
            continue
        loss_curve.append(loss.item())
        accuracy_curve.append(accuracy)
        if step % 200 == 0:
            print("[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                  "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, examples_per_second,
                    accuracy, loss
            ))

    with open(model_dir + 'learning_curves.pkl', 'wb+') as curves_file:
        pickle.dump((loss_curve, accuracy_curve), curves_file)
    print('Done training.')
Beispiel #14
0
def train(config):
    """Train fresh models for a range of input lengths and log the outcomes.

    For every length ``T`` in ``range(config.min_len, config.max_len)`` a new
    model is trained four times. Each run stops when ``step`` reaches
    ``config.train_steps`` or when a logging step (every 10th) reports an
    accuracy of exactly 1. The accuracy, loss and step count at the stopping
    point are collected per ``T``, printed, and written to
    ``lstm_xavier_{min_len}_{max_len}.txt`` in the working directory.

    Note that ``config.input_length`` is overwritten with each ``T``.

    Args:
        config: Object exposing ``model_type`` ('RNN' or 'LSTM'), ``min_len``,
            ``max_len``, ``input_dim``, ``num_hidden``, ``num_classes``,
            ``batch_size``, ``learning_rate``, ``max_norm`` and
            ``train_steps``.
    """

    #set variables
    T_options = list(range(config.min_len, config.max_len))
    print("model:", config.model_type)
    print("min input length:", config.min_len)
    print("max input length:", config.max_len)

    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def acc(predictions, targets):
        """Return the fraction of rows whose argmax equals the target."""
        return (predictions.argmax(dim=1) == targets).float().mean().item()

    # Number of independent training runs per input length
    num_repeats = 4

    all_accuracies = []
    all_losses = []
    all_train_steps = []

    for T in T_options:

        # Outcome of each repeated run for this input length
        final_accs = []
        final_losses = []
        final_train_steps = []

        config.input_length = T

        for i in range(num_repeats):

            print("Iteration", i, "with T:", T, "learning rate:",
                  config.learning_rate)

            # Initialize the dataset and data loader (note the +1)
            dataset = PalindromeDataset(config.input_length + 1)
            data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

            # Initialize the model that we are going to use
            if config.model_type == "RNN":
                model = VanillaRNN(config.input_length, config.input_dim,
                                   config.num_hidden, config.num_classes,
                                   device)
            elif config.model_type == "LSTM":
                model = LSTM(config.input_length, config.input_dim,
                             config.num_hidden, config.num_classes, device)
            model.to(device)

            # Setup the loss and optimizer
            criterion = torch.nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=config.learning_rate)

            for step, (batch_inputs, batch_targets) in enumerate(data_loader):

                # Only for time measurement of step through network
                t1 = time.time()

                batch_inputs = batch_inputs.to(device)
                batch_targets = batch_targets.to(device)

                p = model.forward(batch_inputs)

                loss = criterion(p, batch_targets)
                accuracy = acc(p, batch_targets)

                optimizer.zero_grad()

                loss.backward()

                # Clip the total gradient norm to prevent exploding gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_norm=config.max_norm)

                optimizer.step()

                # Just for time measurement
                t2 = time.time()
                examples_per_second = config.batch_size / float(t2 - t1)

                if step % 10 == 0:
                    print(
                        "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                        "Accuracy = {:.2f}, Loss = {:.3f}".format(
                            datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                            config.train_steps, config.batch_size,
                            examples_per_second, accuracy, loss))

                # Stop at the step budget, or early once a logging step
                # reaches perfect accuracy.
                if step == config.train_steps or (step % 10 == 0
                                                  and accuracy == 1):
                    # If you receive a PyTorch data-loader error, check this bug report:
                    # https://github.com/pytorch/pytorch/pull/9655

                    final_accs.append(accuracy)
                    final_losses.append(loss.item())
                    final_train_steps.append(step)

                    break

        all_accuracies.append(final_accs)
        all_losses.append(final_losses)
        all_train_steps.append(final_train_steps)

    print('Done training.')
    print(all_accuracies)
    print(all_losses)
    print(all_train_steps)

    with open("lstm_xavier_{}_{}.txt".format(config.min_len, config.max_len),
              "w") as output:
        output.write("accuracies \n")
        output.write(str(all_accuracies) + "\n")
        output.write("losses \n")
        output.write(str(all_losses) + "\n")
        output.write("train steps \n")
        output.write(str(all_train_steps) + "\n")
def grads_over_time(config):
    """Return gradient norms read from ``model.grad_hidden_list``.

    A single sample (``dataset[0]``) is pushed through a freshly built model,
    the cross-entropy loss is back-propagated, and the L2 norm of ``.grad`` is
    collected for every entry of ``model.grad_hidden_list``.
    NOTE(review): presumably those entries are the per-time-step hidden states
    with retained gradients -- confirm against the model classes.

    Args:
        config: Object exposing ``model_type`` ('RNN' or 'LSTM'),
            ``tensorboard``, ``summary``, ``record_plot``, ``csv``,
            ``device``, ``input_length``, ``input_dim``, ``num_hidden``,
            ``num_classes``, ``batch_size``, ``gradient_check``,
            ``learning_rate`` and ``max_norm``.

    Returns:
        List of floats, one gradient norm per entry of
        ``model.grad_hidden_list``.
    """

    assert config.model_type in ('RNN', 'LSTM')
    if config.tensorboard:
        # NOTE(review): this writer is not used further in this function.
        writer = SummaryWriter(config.summary + datetime.now().strftime("%Y%m%d-%H%M%S"))
    elif config.record_plot:
        CSV_DIR = config.csv
        if not os.path.isfile(CSV_DIR):
            # Create the CSV file with its header row. The context manager
            # closes the file even if the write fails, and newline='' is what
            # the csv module asks for on files it writes to.
            with open(CSV_DIR, 'w', newline='') as f:
                csv.writer(f).writerow(
                    ['model_type', 'step', 'input_length', 'accuracy', 'loss'])

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use; both classes take the
    # same constructor arguments.
    model_cls = VanillaRNN if config.model_type == 'RNN' else LSTM
    model = model_cls(config.input_length,
                      config.input_dim,
                      config.num_hidden,
                      config.num_classes,
                      config.batch_size,
                      config.gradient_check,
                      device=device)
    optimizer = torch.optim.RMSprop(model.parameters(), lr=config.learning_rate)

    model.to(device)

    # Initialize the dataset (note the +1)
    dataset = PalindromeDataset(config.input_length+1)
    # Setup the loss
    criterion = torch.nn.CrossEntropyLoss()

    # Build a batch of size one from the first dataset sample
    batch_inputs, batch_targets = dataset[0]
    batch_inputs = torch.from_numpy(batch_inputs).unsqueeze(0).to(device)
    batch_targets = torch.from_numpy(np.array([batch_targets])).to(device)

    # Standard update order: clear old gradients, back-propagate, clip, step.
    # Calling zero_grad() between backward() and step() would discard the
    # gradients that were just computed and clipped.
    optimizer.zero_grad()
    out = model.forward(batch_inputs)
    loss = criterion(out, batch_targets)
    loss.backward()
    # Bound the total parameter-gradient norm (exploding gradients)
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.max_norm)
    optimizer.step()

    # Collect the L2 norm of each recorded gradient
    gradient_list = [
        torch.norm(hidden_grad.grad, p=2).item()
        for hidden_grad in model.grad_hidden_list
    ]

    return  gradient_list
Beispiel #16
0
def train(config):
    """Train a VanillaRNN or LSTM on palindrome batches, printing progress.

    Runs on CUDA when available, otherwise on the CPU (``config.device`` is
    not consulted). Training stops once ``step == config.train_steps``.

    Args:
        config: Object exposing ``model_type`` ('RNN' or 'LSTM'),
            ``input_length``, ``input_dim``, ``num_hidden``, ``num_classes``,
            ``batch_size``, ``learning_rate``, ``max_norm`` and
            ``train_steps``.
    """

    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize the model that we are going to use. Strings are compared
    # with ==; identity (`is`) is not guaranteed to hold for equal strings
    # and could leave `model` unbound.
    if config.model_type == 'RNN':
        model = VanillaRNN(seq_length=config.input_length,
                           input_dim=config.input_dim,
                           num_hidden=config.num_hidden,
                           num_classes=config.num_classes,
                           batch_size=config.batch_size,
                           device=device)
    else:
        model = LSTM(seq_length=config.input_length,
                     input_dim=config.input_dim,
                     num_hidden=config.num_hidden,
                     num_classes=config.num_classes,
                     batch_size=config.batch_size,
                     device=device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), config.learning_rate)

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()

        y = model.forward(batch_inputs.to(device))
        loss = criterion(y, batch_targets.to(device))
        loss.backward()

        # Limit the size of the parameter update by scaling the gradients
        # down to a total norm of at most max_norm. This belongs after
        # loss.backward() and before optimizer.step().
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        loss = loss.item()

        # Fraction of samples whose highest-scoring class matches the target
        acc_in = np.argmax(y.cpu().detach().numpy(),
                           axis=1) == batch_targets.cpu().detach().numpy()
        accuracy = np.sum(acc_in) / batch_targets.shape[0]

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 10 == 0:

            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, examples_per_second,
                    accuracy, loss))

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training. :)')
Beispiel #17
0
def train(config):
    """Plot gradient norms of an LSTM and a VanillaRNN for one batch.

    Both networks perform a single RMSprop update on the first batch of the
    data loader. Afterwards the norm of ``.grad`` of every entry in each
    network's ``all_h`` is plotted against the positions
    ``1..config.input_length`` and the figure is shown.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Device handed to the VanillaRNN constructor
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def acc(predictions, targets):
        hit_rate = (predictions.argmax(dim=1) == targets).float().mean().item()
        return hit_rate

    # Dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Loss, the two networks, and one optimizer per network
    criterion = torch.nn.CrossEntropyLoss()
    lstm = LSTM(config.input_length, config.input_dim, config.num_hidden,
                config.num_classes)
    rnn = VanillaRNN(config.input_length, config.input_dim, config.num_hidden,
                     config.num_classes, device)

    optimizer_lstm = torch.optim.RMSprop(lstm.parameters(),
                                         lr=config.learning_rate)
    optimizer_rnn = torch.optim.RMSprop(rnn.parameters(),
                                        lr=config.learning_rate)

    def single_update(network, optimizer, inputs, targets):
        # One forward/backward/step cycle for the given network
        outputs = network.forward(inputs)
        optimizer.zero_grad()
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        print("step", step)

        # LSTM first, then the RNN
        single_update(lstm, optimizer_lstm, batch_inputs, batch_targets)
        single_update(rnn, optimizer_rnn, batch_inputs, batch_targets)

        # Gradient norm for every entry of all_h
        lstm_norms = [state.grad.norm().item() for state in lstm.all_h]
        rnn_norms = [state.grad.norm().item() for state in rnn.all_h]

        positions = list(range(1, config.input_length + 1))
        plt.figure(figsize=(15, 6))
        plt.plot(positions, rnn_norms, label="rnn")
        plt.plot(positions, lstm_norms, label="lstm")
        plt.legend()
        plt.xlabel("sequence value")
        plt.ylabel("gradient norm")

        plt.show()

        # Only the first batch is inspected
        break

    print('Done training.')
Beispiel #18
0
def train(config):
    """Train a VanillaRNN or LSTM until the loss converges or steps run out.

    Uses CUDA when available. Training stops as soon as the batch loss drops
    to ``0.001`` or below, or when ``step == config.train_steps``. Progress is
    printed every 100th step.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Pick the device and the matching legacy tensor types
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        device = torch.device('cuda')
        l_type, f_type = torch.cuda.LongTensor, torch.cuda.FloatTensor
    else:
        device = torch.device('cpu')
        l_type, f_type = torch.LongTensor, torch.FloatTensor

    # Both model classes take the same constructor arguments
    network_cls = VanillaRNN if config.model_type == "RNN" else LSTM
    model = network_cls(config.input_length, config.input_dim,
                        config.num_hidden, config.num_classes,
                        config.batch_size, device)

    # Dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=0)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate)

    if use_cuda:
        criterion.cuda()
        model.cuda()

    loss_history = []
    accuracy_history = []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        model.zero_grad()
        optimizer.zero_grad()

        # Cast to float inputs / long targets of the selected tensor type
        inputs = batch_inputs.type(f_type)
        targets = batch_targets.type(l_type)

        logits = model(inputs)
        loss_tensor = criterion(logits, targets)
        loss_tensor.backward()

        # Clip the total gradient norm before the optimizer uses it
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        loss = loss_tensor.item()
        num_correct = (logits.argmax(dim=1) == targets).sum().item()
        accuracy = float(num_correct) / config.batch_size

        loss_history.append(loss)
        accuracy_history.append(accuracy)

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 100 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.4f}, Loss = {:.4f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, examples_per_second,
                    accuracy, loss))

        converged = loss <= 0.001
        if converged:
            print('Model converged in {:04d} steps at loss {}'.format(
                step, loss))
            break

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
Beispiel #19
0
def train(config):
    """Train a VanillaRNN or LSTM on the palindrome task.

    Reads from ``config``: ``model_type`` ('RNN' or 'LSTM'), ``device``,
    ``input_length``, ``input_dim``, ``num_hidden``, ``num_classes``,
    ``batch_size``, ``learning_rate``, ``max_norm`` and ``train_steps``.

    Per-step accuracy is logged through ``writer``, which is not defined in
    this function.
    NOTE(review): ``writer`` is assumed to be a module-level summary writer --
    confirm it exists before this function is called.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model = VanillaRNN(config.input_length, config.input_dim,
                           config.num_hidden, config.num_classes,
                           config.batch_size, device)
    else:
        model = LSTM(config.input_length,
                     config.input_dim,
                     config.num_hidden,
                     config.num_classes,
                     config.batch_size,
                     device=device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = CrossEntropyLoss()
    optimizer = RMSprop(model.parameters(), lr=config.learning_rate)

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        optimizer.zero_grad()
        prob = model(batch_inputs)
        loss = criterion(prob, batch_targets)
        loss.backward()

        # Rescale the gradients so their global norm does not exceed
        # max_norm; this guards against *exploding* gradients. It has to run
        # after backward() (when the gradients exist) and before step()
        # (when they are consumed), otherwise it has no effect on the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)
        optimizer.step()

        accuracy = float(
            torch.sum(prob.argmax(dim=1) == batch_targets)) / config.batch_size

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)
        writer.add_scalar('Train/Accuracy', accuracy, step)

        if step % 10 == 0:

            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, examples_per_second,
                    accuracy, loss.item()))

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break
    print('Done training.')
Beispiel #20
0
def train(config):
    """Train a VanillaRNN or LSTM on the palindrome task and report accuracy.

    Reads from ``config``: ``model_type`` ('RNN' or 'LSTM'), ``device``,
    ``input_length``, ``input_dim``, ``num_hidden``, ``num_classes``,
    ``batch_size``, ``learning_rate``, ``max_norm`` and ``train_steps``.

    Prints progress every 10 steps and, at the end, the maximum batch
    accuracy observed during training.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)
    print('Running on {}'.format(device))

    # Initialize the model that we are going to use.
    # Compare by value: `is` tests object identity, which is not guaranteed
    # for equal strings, so an 'RNN' request could silently build an LSTM.
    if config.model_type == 'RNN':
        model = VanillaRNN(config.input_length, config.input_dim,
                           config.num_hidden, config.num_classes,
                           config.batch_size, device)
    else:
        model = LSTM(config.input_length, config.input_dim, config.num_hidden,
                     config.num_classes, config.batch_size, device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    optimizer = torch.optim.RMSprop(model.parameters(), config.learning_rate)
    criterion = torch.nn.CrossEntropyLoss()

    accuracies = []
    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()
        y_pred_batch = model(batch_inputs.to(device))
        loss = criterion(y_pred_batch, batch_targets.to(device))
        loss.backward()

        # Rescale the gradients so their global norm is at most max_norm,
        # limiting the update size when gradients explode.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()
        optimizer.zero_grad()

        loss = loss.item()
        accuracy = np.sum(
            np.argmax(y_pred_batch.cpu().detach().numpy(), axis=1) ==
            batch_targets.cpu().detach().numpy()) / batch_targets.shape[0]

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        accuracies.append(accuracy)
        if step % 10 == 0:

            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, examples_per_second,
                    accuracy, loss))
        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')
    print("Length {} max. accuracy: {}".format(config.input_length,
                                               max(accuracies)))
Beispiel #21
0
def train(config):
    """Train a VanillaRNN or LSTM on the palindrome task, then evaluate it.

    Reads from ``config``: ``model_type`` ('RNN' or 'LSTM'), ``device``,
    ``input_length``, ``input_dim``, ``num_hidden``, ``num_classes``,
    ``batch_size``, ``learning_rate``, ``max_norm`` and ``train_steps``.

    After training, one fresh batch of 1000 palindromes is drawn and the
    accuracy on it is printed as "Final accuracy".
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    # Initialize the model that we are going to use
    if config.model_type == 'LSTM':
        model = LSTM(
            config.input_length,
            config.input_dim,
            config.num_hidden,
            config.num_classes,
            config.device,
        )
    elif config.model_type == 'RNN':
        model = VanillaRNN(
            config.input_length,
            config.input_dim,
            config.num_hidden,
            config.num_classes,
            config.device,
        )
    else:
        # Only reachable when asserts are disabled (python -O).
        print('Your model type input is neither \'RNN\' or \'LSTM\'')
        return

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(),
                                    lr=config.learning_rate)

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        # Only for time measurement of step through network
        t1 = time.time()

        optimizer.zero_grad()
        output = model(batch_inputs)

        loss = criterion(output, batch_targets)
        loss.backward()

        # Clip the global gradient norm: all gradients are scaled down by the
        # same factor so the norm stays below max_norm, which in practice
        # limits the size of the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        with torch.no_grad():
            # The predicted class is the arg-max over the class axis (dim 1).
            # No softmax is needed for that, and a softmax over dim 0 (the
            # batch axis) would rescale each class column differently per
            # sample and could change which class wins.
            pred = output.argmax(dim=1)
            accuracy = (pred == batch_targets).float().mean().item()

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 10 == 0:

            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size,
                    examples_per_second, accuracy, loss.item()))

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    print('Done training.')

    # Test memory capacity on a single fresh batch of 1000 samples.
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, 1000, num_workers=1)
    batch_inputs, batch_targets = next(iter(data_loader))
    batch_inputs = batch_inputs.to(device)
    batch_targets = batch_targets.to(device)

    # Evaluation only: no autograd graph is needed.
    with torch.no_grad():
        output = model(batch_inputs)
        pred = output.argmax(dim=1)
        accuracy = (pred == batch_targets).float().mean().item()

    print('T:', config.input_length + 1)
    print("Final accuracy:", accuracy)
def train(config):
    """Train a VanillaRNN or LSTM on the palindrome task with TensorBoard logs.

    Reads from ``config``: ``model_type`` ('RNN' or 'LSTM'), ``device``,
    ``input_length``, ``input_dim`` (1 for raw digits, 10 for one-hot),
    ``num_hidden``, ``num_classes``, ``batch_size``, ``learning_rate``,
    ``max_norm`` and ``train_steps``.

    Side effects: writes scalars to a ``SummaryWriter`` under ``runs/...`` and
    appends the last batch accuracy to ``logs.txt`` when ``train_steps`` is
    reached.

    Raises:
        ValueError: if ``model_type`` is neither 'RNN' nor 'LSTM' (only
            reachable when asserts are disabled).
    """
    assert config.model_type in ('RNN', 'LSTM')

    exp_name = 'runs/{}_batch{}_dim{}_len{}_{}'.format(
        config.model_type, config.batch_size, config.input_dim,
        config.input_length,
        datetime.now().strftime("%Y-%m-%d %H:%M"))
    print(exp_name)
    print(config)

    writer = SummaryWriter(exp_name)

    # Convert to labels: index of the largest entry along the last axis
    def to_label(tensor):
        return tensor.argmax(dim=-1)

    # Output accuracy given predictions and targets: fraction of matching
    # labels, computed on whatever device the tensors already live on
    def get_accuracy(predictions, targets):
        return (to_label(predictions) == targets).float().mean().item()

    # Initialize the device which to run the model on
    device = torch.device(config.device)
    print(device)

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model = VanillaRNN(config.input_length, config.input_dim,
                           config.num_hidden, config.num_classes,
                           config.batch_size, device)
    elif config.model_type == 'LSTM':
        model = LSTM(config.input_length, config.input_dim, config.num_hidden,
                     config.num_classes, config.batch_size, device)
    else:
        raise ValueError
    model.to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), config.learning_rate)

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        model.train()
        optimizer.zero_grad()

        batch_targets = batch_targets.to(device)

        if config.input_dim == 1:
            batch_inputs = batch_inputs.unsqueeze(-1).to(device)
        elif config.input_dim == 10:
            # One-hot encode the digits. The buffer is sized from the actual
            # batch so a batch that differs from config.batch_size still works.
            digits = batch_inputs.to(device=device, dtype=torch.long)
            batch_inputs = torch.zeros(*digits.shape, config.input_dim,
                                       device=device)
            batch_inputs.scatter_(2, digits.unsqueeze(-1), 1)

        batch_outputs = model(batch_inputs)

        loss = criterion(batch_outputs, batch_targets)
        accuracy = get_accuracy(batch_outputs, batch_targets)

        loss.backward()

        # The gradients are rescaled so their global norm does not exceed
        # max_norm, which avoids the exploding gradient problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        # Print and save data to Tensorboard
        if step % 10 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, examples_per_second,
                    accuracy, loss.item()))
            writer.add_scalar('Accuracy', accuracy, step)
            writer.add_scalar('Loss', loss.item(), step)

        # Output final scores
        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            with open("logs.txt", "a") as f:
                f.write("{}   Accuracy: {}\n".format(exp_name, accuracy))

            break

    # Flush pending events and release the event file.
    writer.close()

    print('Done training.')
Beispiel #23
0
def train(config):
    """Train a VanillaRNN or LSTM on the palindrome task until convergence.

    Reads from ``config``: ``model_type`` ('RNN' or 'LSTM'), ``input_length``,
    ``input_dim``, ``num_hidden``, ``num_classes``, ``batch_size``,
    ``learning_rate``, ``max_norm`` and ``train_steps``. The device is chosen
    automatically (CUDA when available, otherwise CPU).

    Training stops once ``train_steps`` is reached or, after the first ten
    steps, when the mean of the last five batch accuracies is at least 0.98.

    Returns:
        A tuple ``(accuracy, loss)`` holding the accuracy of the last
        processed batch (as returned by ``calculate_accuracy``) and the last
        loss as a Python float.
    """
    assert config.model_type in ('RNN', 'LSTM')

    # Initialize the device which to run the model on
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Arguments shared by both model constructors
    model_kwargs = dict(seq_length=config.input_length,
                        input_dim=config.input_dim,
                        num_hidden=config.num_hidden,
                        num_classes=config.num_classes,
                        batch_size=config.batch_size,
                        device=device)

    # Initialize the model that we are going to use; the LSTM additionally
    # gets momentum and weight decay in its optimizer.
    if config.model_type == "RNN":
        model = VanillaRNN(**model_kwargs)
        optimizer = torch.optim.RMSprop(model.parameters(),
                                        lr=config.learning_rate)

    elif config.model_type == "LSTM":
        model = LSTM(**model_kwargs)
        optimizer = torch.optim.RMSprop(model.parameters(),
                                        lr=config.learning_rate,
                                        momentum=0.8,
                                        weight_decay=1e-4)

    model.to(device)
    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss (the optimizer was created above)
    criterion = torch.nn.CrossEntropyLoss()

    # for intermediate reporting and convergence checks
    intermediate_accuracies = []

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        # Only for time measurement of step through network
        t1 = time.time()

        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        out = model(batch_inputs)
        loss = criterion(out, batch_targets)

        optimizer.zero_grad()
        loss.backward()

        # The gradients are rescaled so that their global norm is at most
        # max_norm before the optimizer consumes them.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)
        optimizer.step()

        accuracy = calculate_accuracy(out, batch_targets)

        intermediate_accuracies.append(accuracy)

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 10000 == 0:
            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, examples_per_second,
                    accuracy, loss.item()))

        if step > 10:
            # Check for convergence: if the mean of the last 5 measured
            # accuracies is at least .98, we say the model has converged.
            # The slice must be [-5:]; [-5:-1] would drop the newest value
            # and average only four.
            if step == config.train_steps or np.mean(
                    intermediate_accuracies[-5:]) >= 0.98:
                # If you receive a PyTorch data-loader error, check this bug report:
                # https://github.com/pytorch/pytorch/pull/9655
                break

    print('Done training.')
    print('finally accuracy:')
    print(accuracy)
    return accuracy, loss.item()
Beispiel #24
0
def train(config):
    """Train a VanillaRNN or LSTM on the palindrome task and pickle accuracies.

    Reads from ``config``: ``model_type`` ('RNN' or 'LSTM'), ``device``,
    ``input_length``, ``input_dim`` (1 or 10), ``num_hidden``,
    ``num_classes``, ``batch_size``, ``learning_rate``, ``max_norm`` and
    ``train_steps``.

    Side effects: the list of per-step accuracies is pickled to
    ``<model_type>_<input_length>.pkl`` in the current working directory.
    """
    assert config.model_type in ('RNN', 'LSTM')

    assert config.input_dim in (1, 10)

    # Initialize the device which to run the model on
    device = torch.device(config.device)

    accuracies = []

    filename = f'{config.model_type}_{config.input_length}.pkl'

    # Initialize the model that we are going to use
    if config.model_type == 'RNN':
        model = VanillaRNN(seq_length=config.input_length,
                           input_dim=config.input_dim,
                           num_hidden=config.num_hidden,
                           num_classes=config.num_classes,
                           batch_size=config.batch_size,
                           device=config.device)
    else:
        model = LSTM(seq_length=config.input_length,
                     input_dim=config.input_dim,
                     num_hidden=config.num_hidden,
                     num_classes=config.num_classes,
                     batch_size=config.batch_size,
                     device=config.device)

    model = torch.nn.DataParallel(model).to(device)

    # Initialize the dataset and data loader (note the +1)
    dataset = PalindromeDataset(config.input_length + 1)
    data_loader = DataLoader(dataset, config.batch_size, num_workers=1)

    # Setup the loss and optimizer
    loss_criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(),
                                    lr=config.learning_rate)

    model.train()

    for step, (batch_inputs, batch_targets) in enumerate(data_loader):

        batch_targets = batch_targets.to(device)

        # if input_dim == 10 - convert to one-hot, otherwise - use as it is
        if config.input_dim == 10 and len(batch_inputs.shape) < 3:
            batch_inputs = to_one_hot(batch_inputs).to(device)
        else:
            batch_inputs = batch_inputs.unsqueeze(2).to(device)

        # Only for time measurement of step through network
        t1 = time.time()

        optimizer.zero_grad()

        outputs = model(batch_inputs)

        loss = loss_criterion(outputs, batch_targets)
        accuracy = calculate_accuracy(outputs, batch_targets)

        accuracies.append(accuracy.item())

        loss.backward()

        # Rescale the gradients so their global norm is at most max_norm
        # (protection against exploding gradients). This must sit between
        # backward() and step(): before backward() the gradients have just
        # been zeroed, so clipping there changes nothing.
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       max_norm=config.max_norm)

        optimizer.step()

        # Just for time measurement
        t2 = time.time()
        examples_per_second = config.batch_size / float(t2 - t1)

        if step % 10 == 0:

            print(
                "[{}] Train Step {:04d}/{:04d}, Batch Size = {}, Examples/Sec = {:.2f}, "
                "Accuracy = {:.2f}, Loss = {:.3f}".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M"), step,
                    config.train_steps, config.batch_size, examples_per_second,
                    accuracy, loss))

        if step == config.train_steps:
            # If you receive a PyTorch data-loader error, check this bug report:
            # https://github.com/pytorch/pytorch/pull/9655
            break

    # Use a context manager so the file is flushed and closed deterministically.
    with open(filename, 'wb') as f:
        pickle.dump(accuracies, f)
    print('Done training.')
Beispiel #25
0
def analyze_grads_over_time(config, pretrain_model=False):
    """Plot the L2 norm of hidden-state gradients over time for RNN and LSTM.

    For each of the two model types a single batch of palindromes is pushed
    through ``model.analyze_hs_gradients``, the loss is back-propagated, and
    the norm of the batch-averaged gradient of every entry in
    ``model.h_states`` is collected in reverse order. Both curves are drawn
    in one figure, which is saved under ``part1/figures/`` and then shown.

    Args:
        config: run configuration. ``config.input_length`` is overwritten
            with 150 as a side effect.
        pretrain_model: when true, the model is obtained from
            ``train(config)`` instead of being freshly constructed.
    """
    device = torch.device(config.device)
    config.input_length = 150

    # Fixed seeds so the comparison is reproducible.
    seed = 42
    torch.manual_seed(seed)
    np.random.seed(seed)

    total_norms = []

    for model_name in ("RNN", "LSTM"):

        if pretrain_model:
            # NOTE(review): train(config) does not receive model_name, so
            # both iterations presumably train config.model_type -- confirm.
            model = train(config)
        else:
            # Build an untrained model of the requested type.
            model_cls = VanillaRNN if model_name == 'RNN' else LSTM
            model = model_cls(config.input_length, config.input_dim,
                              config.num_hidden, config.num_classes, device)
            model.to(device)

        # Initialize the dataset and data loader (note the +1)
        dataset = PalindromeDataset(config.input_length + 1)
        data_loader = DataLoader(dataset,
                                 batch_size=config.batch_size,
                                 num_workers=1)

        # Setup the loss and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.RMSprop(model.parameters(), lr=config.learning_rate)

        # A single batch is enough for this analysis.
        batch_inputs, batch_targets = next(iter(data_loader))

        # One-hot encode the inputs along a new trailing axis.
        one_hot_buffer = torch.zeros(*batch_inputs.size(), config.num_classes)
        digit_index = batch_inputs.unsqueeze(-1).to(torch.int64)
        batch_inputs = torch.scatter(one_hot_buffer, 2, digit_index,
                                     1).to(device)
        batch_targets = batch_targets.to(device)

        train_output = model.analyze_hs_gradients(batch_inputs)
        loss = criterion(train_output, batch_targets)

        optimizer.zero_grad()
        loss.backward()

        # Walk the stored hidden states last-to-first; each gradient is
        # averaged over the batch axis before taking the norm to get a more
        # stable estimate.
        gradient_norms = [
            hidden.grad.mean(dim=0).norm(2).item()
            for _, hidden in reversed(model.h_states)
        ]

        print(len(gradient_norms))
        total_norms.append(gradient_norms)

    time_steps = np.arange(150)
    print(time_steps)

    rnn_norms, lstm_norms = total_norms[0], total_norms[1]

    fig = plt.figure(figsize=(15, 10), dpi=150)
    fig.suptitle('L2-norm of Gradients across Time Steps', fontsize=36)
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(rnn_norms, linewidth=2, color="tomato", label="RNN")
    ax.plot(lstm_norms, linewidth=2, color="darkblue", label="LSTM")
    ax.tick_params(labelsize=16)

    # One tick every ten backpropagation steps.
    tick_positions = time_steps[::10]
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_positions)

    ax.set_xlabel('Backpropagation Step', fontsize=24)
    ax.set_ylabel('Gradient Norm (L2)', fontsize=24)
    ax.legend(prop={'size': 16})

    figure_dir = 'part1/figures/'
    if not os.path.exists(figure_dir):
        os.makedirs(figure_dir)

    figure_path = "part1/figures/Analyze_gradients_pt_{}.png".format(
        str(pretrain_model))
    plt.savefig(figure_path)
    plt.show()