def model(X, Y, layers_dims, optimizer, learning_rate=0.0007, mini_batch_size=64, beta=0.9, beta1=0.9, beta2=0.999, epsilon=1e-8, num_epochs=10000, print_cost=True): """ 3-layer neural network model which can be run in different optimizer modes. Arguments: X -- input data, of shape (2, number of examples) Y -- true "label" vector (1 for blue dot / 0 for red dot), of shape (1, number of examples) layers_dims -- python list, containing the size of each layer learning_rate -- the learning rate, scalar. mini_batch_size -- the size of a mini batch beta -- Momentum hyperparameter beta1 -- Exponential decay hyperparameter for the past gradients estimates beta2 -- Exponential decay hyperparameter for the past squared gradients estimates epsilon -- hyperparameter preventing division by zero in Adam updates num_epochs -- number of epochs print_cost -- True to print the cost every 1000 epochs Returns: parameters -- python dictionary containing your updated parameters """ L = len(layers_dims) # number of layers in the neural networks costs = [] # to keep track of the cost t = 0 # initializing the counter required for Adam update seed = 10 # For grading purposes, so that your "random" minibatches are the same as ours m = X.shape[1] # number of training examples # Initialize parameters parameters = initialize_parameters(layers_dims) # Initialize the optimizer if optimizer == "gd": pass # no initialization required for gradient descent elif optimizer == "momentum": v = initialize_velocity(parameters) elif optimizer == "adam": v, s = initialize_adam(parameters) # Optimization loop for i in range(num_epochs): # Define the random minibatches. We increment the seed to reshuffle differently the dataset after each epoch seed = seed + 1 minibatches = random_mini_batches(X, Y, mini_batch_size, seed) cost_total = 0 for minibatch in minibatches: # Select a minibatch (minibatch_X, minibatch_Y) = minibatch # Forward propagation a3, caches = forward_propagation(minibatch_X, parameters) # Compute cost and add to the cost total cost_total += compute_cost(a3, minibatch_Y) # Backward propagation grads = backward_propagation(minibatch_X, minibatch_Y, caches) # Update parameters if optimizer == "gd": parameters = update_parameters_with_gd(parameters, grads, learning_rate) elif optimizer == "momentum": parameters, v = update_parameters_with_momentum( parameters, grads, v, beta, learning_rate) elif optimizer == "adam": t = t + 1 # Adam counter parameters, v, s = update_parameters_with_adam( parameters, grads, v, s, t, learning_rate, beta1, beta2, epsilon) cost_avg = cost_total / m # Print the cost every 1000 epoch if print_cost and i % 1000 == 0: print("Cost after epoch %i: %f" % (i, cost_avg)) if print_cost and i % 100 == 0: costs.append(cost_avg) # plot the cost plt.plot(costs) plt.ylabel('cost') plt.xlabel('epochs (per 100)') plt.title("Learning rate = " + str(learning_rate)) plt.show() return parameters
[[ 0.32171798 -0.25467393 1.46902454] [-2.05617317 -0.31554548 -0.3756023 ] [ 1.1404819 -1.09976462 -0.1612551 ]] b2 = [[-0.88020257] [ 0.02561572] [ 0.57539477]] A variant of this is Stochastic Gradient Descent (SGD), which is equivalent to mini-batch gradient descent where each mini-batch has just 1 example. The update rule that you have just implemented does not change. What changes is that you would be computing gradients on just one training example at a time, rather than on the whole training set. The code examples below illustrate the difference between stochastic gradient descent and (batch) gradient descent. (Batch) Gradient Descent: X = data_input Y = labels parameters = initialize_parameters(layers_dims) for i in range(0, num_iterations): # Forward propagation a, caches = forward_propagation(X, parameters) # Compute cost. cost += compute_cost(a, Y) # Backward propagation. grads = backward_propagation(a, caches, parameters) # Update parameters. parameters = update_parameters(parameters, grads) Stochastic Gradient Descent: X = data_input Y = labels parameters = initialize_parameters(layers_dims) for i in range(0, num_iterations): for j in range(0, m): # Forward propagation a, caches = forward_propagation(X[:,j], parameters) # Compute cost
import numpy as np import matplotlib.pyplot as plt import scipy.io import math import sklearn import sklearn.datasets from opt_utils_v1a import load_params_and_grads, initialize_parameters, forward_propagation, backward_propagation from opt_utils_v1a import compute_cost, predict, predict_dec, plot_decision_boundary, load_dataset from testCases import * """Batch Gradient Descent""" X = data_input Y = labels parameters = initialize_parameters(layers_dims) for i in range(0, num_iterations): # Forward propagation a, caches = forward_propagation(X, parameters) # Compute cost. cost += compute_cost(a, Y) # Backward propagation. grads = backward_propagation(a, caches, parameters) # Update parameters. parameters = update_parameters(parameters, grads)
def model(X, Y, layers_dims, optimizer, learning_rate = 0.0007, mini_batch_size = 64, beta = 0.9, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8, num_epochs = 10000, print_cost = True): L = len(layers_dims) # number of layers in the neural networks costs = [] # to keep track of the cost t = 0 # initializing the counter required for Adam update seed = 10 # For grading purposes, so that your "random" minibatches are the same as ours m = X.shape[1] # number of training examples # Initialize parameters parameters = initialize_parameters(layers_dims) # Initialize the optimizer if optimizer == "gd": pass # no initialization required for gradient descent elif optimizer == "momentum": v = initialize_velocity(parameters) elif optimizer == "adam": v, s = initialize_adam(parameters) # Optimization loop for i in range(num_epochs): # Define the random minibatches. We increment the seed to reshuffle differently the dataset after each epoch seed = seed + 1 minibatches = random_mini_batches(X, Y, mini_batch_size, seed) cost_total = 0 for minibatch in minibatches: # Select a minibatch (minibatch_X, minibatch_Y) = minibatch # Forward propagation a3, caches = forward_propagation(minibatch_X, parameters) # Compute cost and add to the cost total cost_total += compute_cost(a3, minibatch_Y) # Backward propagation grads = backward_propagation(minibatch_X, minibatch_Y, caches) # Update parameters if optimizer == "gd": parameters = update_parameters_with_gd(parameters, grads, learning_rate) elif optimizer == "momentum": parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate) elif optimizer == "adam": t = t + 1 # Adam counter parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t, learning_rate, beta1, beta2, epsilon) cost_avg = cost_total / m # Print the cost every 1000 epoch if print_cost and i % 1000 == 0: print ("Cost after epoch %i: %f" %(i, cost_avg)) if print_cost and i % 100 == 0: costs.append(cost_avg) # plot the cost plt.plot(costs) plt.ylabel('cost') plt.xlabel('epochs (per 100)') plt.title("Learning rate = " + str(learning_rate)) plt.show() return parameters