def model_using_sgd(X, Y, layers_dims, learning_rate=0.01, initialization='random',
                    _lambda=0, keep_prob=1, init_const=0.01, num_of_iterations=10000,
                    print_cost=True, print_cost_after=1000, seed=None):
    L = len(layers_dims) - 1  # number of layers
    m = X.shape[1]            # number of training examples

    # Initialize parameters
    parameters = initialize_parameters(layers_dims, initialization, init_const, seed)

    # Stochastic gradient descent: update the parameters after every single example
    for i in range(num_of_iterations):
        for j in range(m):
            # Slice one example, keeping a 2-D column shape so the matrix operations still work
            x_j = X[:, j:j + 1]
            y_j = Y[:, j:j + 1]

            # Forward propagation
            if keep_prob == 1:
                AL, caches = forward_propagation(x_j, parameters, L)
            elif keep_prob < 1:
                AL, caches = forward_propagation_with_dropout(x_j, parameters, L, keep_prob)

            # Compute cost
            if _lambda == 0:
                cost = compute_cost(AL, y_j)
            else:
                cost = compute_cost_with_regularization(AL, y_j, parameters, _lambda, L)

            # Backward propagation
            if _lambda == 0 and keep_prob == 1:
                grads = backward_propagation(AL, y_j, caches)
            elif _lambda != 0:
                grads = backward_propagation_with_regularization(AL, y_j, caches, _lambda)
            elif keep_prob < 1:
                grads = backward_propagation_with_dropout(AL, y_j, caches, keep_prob)

            # Update parameters
            parameters = update_parameters_using_gd(parameters, grads, learning_rate, L)

        # Print the cost after the given number of iterations
        if print_cost and i % print_cost_after == 0:
            print("Cost after iteration %i: %f" % (i, cost))

    # Gradient checking
    gradient_checking(parameters, grads, X, Y, layers_dims, _lambda=_lambda)

    return parameters
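# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal, hedged example of calling model_using_sgd on a toy binary-classification
# problem. It assumes the helpers referenced above (initialize_parameters,
# forward_propagation, compute_cost, ...) are importable from this repo; the toy data,
# layer sizes, and hyperparameter values below are illustrative only.
import numpy as np

np.random.seed(0)
X_toy = np.random.randn(5, 200)                               # 5 features, 200 examples
Y_toy = (X_toy.sum(axis=0, keepdims=True) > 0).astype(float)  # labels, shape (1, 200)

params_sgd = model_using_sgd(X_toy, Y_toy, layers_dims=[5, 4, 1],
                             learning_rate=0.01,
                             num_of_iterations=100,
                             print_cost_after=10)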
def optimize_with_adam(hyperparameters, X, Y, parameters, print_cost):
    (
        layer_dims,
        learning_rate,
        num_epochs,
        mini_batch_size,
        lambd,
        beta1,
        beta2,
        epsilon,
    ) = hyperparameters

    L = len(layer_dims) - 1
    m = X.shape[1]
    costs = []
    t = 0  # Adam time step; it must start from 1 so the bias correction never divides by zero

    v, s = initialize_adam(L, parameters)

    for i in range(num_epochs):
        # np.random.seed() returns None, so pass the integer seed itself to the shuffler
        # (requires `from time import time` at module level)
        seed = int(time())
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
        cost_total = 0

        for minibatch in minibatches:
            (minibatch_X, minibatch_Y) = minibatch

            # Forward propagation
            AL, caches = L_model_forward(L, minibatch_X, parameters)

            # Compute cost
            cost_total += compute_cost_with_regularization(
                layer_dims, AL, minibatch_Y, parameters, lambd)

            # Backward propagation
            grads = L_model_backward_with_regularization(
                AL, minibatch_Y, caches, lambd)

            # Update parameters
            t += 1
            parameters, v, s = update_parameters_with_adam(
                parameters, grads, v, s, t, learning_rate,
            )

        # Average the cost over the number of minibatches in this epoch
        cost_avg = cost_total / (int(m / mini_batch_size) + int(m % mini_batch_size != 0))

        if i % 100 == 0 and print_cost:
            print(f"Cost after Epoch {i} : {cost_avg}")
            costs.append(cost_avg)

    return parameters, grads, costs
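# --- Usage sketch (illustrative, not part of the original module) ---
# A hedged example of calling optimize_with_adam. Because `hyperparameters` is unpacked
# positionally, the tuple must follow the order
# (layer_dims, learning_rate, num_epochs, mini_batch_size, lambd, beta1, beta2, epsilon).
# The initializer used below is an assumption about this repo's API; swap in whatever
# routine actually produces the parameters dictionary expected by L_model_forward.
import numpy as np

np.random.seed(1)
X_adam = np.random.randn(5, 256)
Y_adam = (X_adam[0:1, :] > 0).astype(float)

layer_dims_adam = [5, 4, 1]
hyperparameters = (layer_dims_adam, 0.0007, 300, 64, 0.1, 0.9, 0.999, 1e-8)

parameters = initialize_parameters_random(layer_dims_adam)  # assumed initializer from this repo
parameters, grads, costs = optimize_with_adam(hyperparameters, X_adam, Y_adam,
                                              parameters, print_cost=True)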
def model_with_regularization(X_train, Y_train, X_test, Y_test, layers_dims,
                              learning_rate=0.0075, num_iterations=3000,
                              lambd=0.7, print_cost=False):
    costs = []  # keep track of cost

    # Parameters initialization
    parameters = initialize_parameters_random(layers_dims)

    # Loop (gradient descent)
    for i in range(num_iterations):
        # Forward propagation
        a3, cache = forward_propagation(X_train, parameters)

        # Compute cost with L2 regularization
        cost = compute_cost_with_regularization(a3, Y_train, parameters, lambd)

        # Backward propagation
        grads = backward_propagation_with_regularization(X_train, Y_train, cache, lambd)

        # Update parameters
        parameters = update_parameters(parameters, grads, learning_rate)

        # Print and record the cost every 100 iterations
        if print_cost and i % 100 == 0:
            print("Cost after iteration %i: %f" % (i, cost))
            costs.append(cost)

    # Predictions on the train and test sets
    Y_prediction_train = predict(X_train, Y_train, parameters)
    Y_prediction_test = predict(X_test, Y_test, parameters)

    d = {
        "Y_prediction_test": Y_prediction_test,
        "Y_prediction_train": Y_prediction_train,
        "parameters": parameters,
        "learning_rate": learning_rate,
        "num_iterations": num_iterations,
    }

    return d
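# --- Usage sketch (illustrative, not part of the original module) ---
# A brief, hedged example of calling model_with_regularization on a toy train/test split.
# Data shapes follow the (features, examples) layout used throughout, and the helpers
# (forward_propagation, predict, ...) are assumed to be the repo's own.
import numpy as np

np.random.seed(3)
X_all = np.random.randn(4, 300)
Y_all = (X_all[0:1, :] + X_all[1:2, :] > 0).astype(float)
X_tr, X_te = X_all[:, :250], X_all[:, 250:]
Y_tr, Y_te = Y_all[:, :250], Y_all[:, 250:]

results = model_with_regularization(X_tr, Y_tr, X_te, Y_te,
                                    layers_dims=[4, 5, 1],
                                    learning_rate=0.0075,
                                    num_iterations=1500,
                                    lambd=0.7,
                                    print_cost=True)
trained = results["parameters"]  # the returned dict also holds the train/test predictions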
def gradient_checking(parameters, gradients, X, Y, layers_dims,
                      _lambda=0, keep_prob=1, epsilon=1e-7):
    # Set up variables
    parameters_values = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients, len(layers_dims))
    num_of_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_of_parameters, 1))
    J_minus = np.zeros((num_of_parameters, 1))
    grad_approx = np.zeros((num_of_parameters, 1))
    num_of_layers = len(layers_dims) - 1

    # Compute grad_approx
    for i in range(num_of_parameters):
        # Compute J_plus[i]
        theta_plus = np.copy(parameters_values)
        theta_plus[i][0] = theta_plus[i][0] + epsilon
        perturbed = vector_to_dictionary(theta_plus, layers_dims)
        if keep_prob == 1:
            AL, _ = forward_propagation(X, perturbed, num_of_layers)
        elif keep_prob < 1:
            # Note: dropout makes the numerical check unreliable unless the mask is fixed
            AL, _ = forward_propagation_with_dropout(X, perturbed, num_of_layers, keep_prob)
        if _lambda == 0:
            J_plus[i] = compute_cost(AL, Y)
        else:
            # Use the perturbed parameters so the L2 term is also nudged by epsilon
            J_plus[i] = compute_cost_with_regularization(
                AL, Y, perturbed, _lambda, num_of_layers)

        # Compute J_minus[i]
        theta_minus = np.copy(parameters_values)
        theta_minus[i][0] = theta_minus[i][0] - epsilon
        perturbed = vector_to_dictionary(theta_minus, layers_dims)
        if keep_prob == 1:
            AL, _ = forward_propagation(X, perturbed, num_of_layers)
        elif keep_prob < 1:
            AL, _ = forward_propagation_with_dropout(X, perturbed, num_of_layers, keep_prob)
        if _lambda == 0:
            J_minus[i] = compute_cost(AL, Y)
        else:
            J_minus[i] = compute_cost_with_regularization(
                AL, Y, perturbed, _lambda, num_of_layers)

        # Compute grad_approx[i] with the centered difference
        grad_approx[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    # Compare grad_approx to the backpropagation gradients by computing the relative difference
    numerator = np.linalg.norm(grad - grad_approx)
    denominator = np.linalg.norm(grad) + np.linalg.norm(grad_approx)
    difference = numerator / denominator

    if difference > 2e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = "
              + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = "
              + str(difference) + "\033[0m")

    return difference
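# --- Standalone illustration (not part of the original module) ---
# A self-contained demo of the centered-difference estimate used above, checked against
# the analytic gradient of a toy quadratic cost J(theta) = sum(theta**2). It uses only
# NumPy, so it runs without any of this repo's helpers; names prefixed with _toy are
# hypothetical and exist only for this sketch.
import numpy as np

def _toy_cost(theta):
    return float(np.sum(theta ** 2))

def _toy_grad(theta):
    return 2 * theta  # analytic gradient of the quadratic cost

_theta = np.random.randn(4, 1)
_eps = 1e-7
_approx = np.zeros_like(_theta)
for _k in range(_theta.shape[0]):
    _t_plus, _t_minus = _theta.copy(), _theta.copy()
    _t_plus[_k, 0] += _eps
    _t_minus[_k, 0] -= _eps
    _approx[_k, 0] = (_toy_cost(_t_plus) - _toy_cost(_t_minus)) / (2 * _eps)

_diff = np.linalg.norm(_toy_grad(_theta) - _approx) / (
    np.linalg.norm(_toy_grad(_theta)) + np.linalg.norm(_approx))
print(_diff)  # should be well below 2e-7 for a correct analytic gradient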
def model(X, Y, layers_dims, learning_rate=0.01, optimizer='adam', beta=0.9,
          beta1=0.9, beta2=0.999, epsilon=1e-8, mini_batch_size=64,
          initialization='random', _lambda=0, keep_prob=1, init_const=0.01,
          num_of_iterations=10000, print_cost=True, print_cost_after=1000):
    L = len(layers_dims) - 1  # number of layers
    costs = []                # to keep track of the total cost
    seed = 10                 # fixed seed so the "random" minibatches are reproducible
    t = 0                     # counter required for the Adam update
    m = X.shape[1]            # number of training examples

    # Initialize parameters
    parameters = initialize_parameters(layers_dims, initialization, init_const, seed)

    # Initialize the optimizer
    if optimizer == 'gd':
        pass  # no initialization required for gradient descent
    elif optimizer == 'momentum':
        v = initialize_velocity(parameters, L)
    elif optimizer == 'adam':
        v, s = initialize_adam(parameters, L)

    # Optimization loop
    for i in range(num_of_iterations):
        # Define the random minibatches; increment the seed to reshuffle the dataset
        # differently after each epoch
        seed = seed + 1
        mini_batches = random_mini_batches(X, Y, mini_batch_size, seed)
        cost_total = 0

        for mini_batch in mini_batches:
            # Select a minibatch
            (minibatch_X, minibatch_Y) = mini_batch

            # Forward propagation
            if keep_prob == 1:
                AL, caches = forward_propagation(minibatch_X, parameters, L)
            elif keep_prob < 1:
                AL, caches = forward_propagation_with_dropout(
                    minibatch_X, parameters, L, keep_prob)

            # Compute the cost and add it to the total
            if _lambda == 0:
                cost_total += compute_cost(AL, minibatch_Y)
            else:
                cost_total += compute_cost_with_regularization(
                    AL, minibatch_Y, parameters, _lambda, L)

            # Backward propagation
            if _lambda == 0 and keep_prob == 1:
                grads = backward_propagation(AL, minibatch_Y, caches)
            elif _lambda != 0:
                grads = backward_propagation_with_regularization(
                    AL, minibatch_Y, caches, _lambda)
            elif keep_prob < 1:
                grads = backward_propagation_with_dropout(
                    AL, minibatch_Y, caches, keep_prob)

            # Update parameters
            if optimizer == 'gd':
                parameters = update_parameters_using_gd(
                    parameters, grads, learning_rate, L)
            elif optimizer == 'momentum':
                parameters, v = update_parameters_using_momentum(
                    parameters, grads, v, beta, learning_rate, L)
            elif optimizer == 'adam':
                t += 1  # Adam counter
                parameters, v, s = update_parameters_using_adam(
                    parameters, grads, v, s, t, learning_rate, L,
                    beta1, beta2, epsilon)

        cost_avg = cost_total / m

        # Print the cost every given number of epochs
        if print_cost and i % print_cost_after == 0:
            print("Cost after epoch %i: %f" % (i, cost_avg))
        if print_cost and i % 100 == 0:
            costs.append(cost_avg)

    # Gradient checking
    gradient_checking(parameters, grads, X, Y, layers_dims, _lambda=_lambda)

    return parameters
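# --- Usage sketch (illustrative, not part of the original module) ---
# A hedged end-to-end example: train the mini-batch model above with Adam plus L2
# regularization on toy data. The architecture and hyperparameter values are
# illustrative, and every helper the function calls is assumed to come from this repo.
import numpy as np

np.random.seed(2)
X_full = np.random.randn(5, 512)
Y_full = (X_full[0:1, :] * X_full[1:2, :] > 0).astype(float)

trained_parameters = model(X_full, Y_full, layers_dims=[5, 8, 4, 1],
                           learning_rate=0.0007,
                           optimizer='adam',
                           mini_batch_size=64,
                           _lambda=0.1,
                           num_of_iterations=1000,
                           print_cost_after=100)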