def train2(X, Y, nn_architecture, epochs, learning_rate, method, n_batches,
           batch_size, cost_type, N_nn, kernel_a, alpha_init, alpha_rate,
           beta, gamma, verbose, var_epsilon, dispersion_factor=6):

    if method == "sgd":
        N_nn = 1

    # initiation of neural net parameters
    params = [
        init_layers(nn_architecture, i, dispersion_factor) for i in range(N_nn)
    ]
    alpha = alpha_init

    # initiation of lists storing the history
    cost_history = []
    cost_history_mean = []
    accuracy_history = []
    elapsed_epochs = 0

    # performing calculations for subsequent iterations
    for i in range(epochs):
        for batch in range(n_batches):

            Y_hat = []
            costs = []
            cache = []
            grads = []

            start = batch * batch_size
            end = start + batch_size

            for j in range(N_nn):
                # step forward
                Y_hat_temp, cache_temp = full_forward_propagation(
                    X[:, start:end], params[j], nn_architecture)
                Y_hat.append(Y_hat_temp)
                cache.append(cache_temp)

                # calculating cost and saving it to history
                costj = get_cost_value(Y_hat[j], Y[:, start:end], cost_type)
                costs.append(costj)

                # step backward - calculating gradient
                if method in ["gradient", "gradient_old", "sgd"]:
                    gradsj = full_backward_propagation(
                        Y_hat[j], Y[:, start:end], cache[j], params[j],
                        nn_architecture)
                    grads.append(gradsj)

            if method == "gradient":
                params, var = update_nn_weights(params, grads, N_nn,
                                                learning_rate, kernel_a,
                                                alpha, beta, gamma)
            elif method == "gradient_old":
                params, var = update_nn_weights_old(params, grads, N_nn,
                                                    learning_rate, kernel_a,
                                                    alpha, beta, gamma)
            elif method == "nogradient":
                params, var = update_nn_weights_derivative_free(
                    params, costs, learning_rate, N_nn, kernel_a, alpha,
                    beta, gamma)
            elif method == "nogradient_old":
                params, var = update_nn_weights_derivative_free_old(
                    params, costs, learning_rate, N_nn, kernel_a, alpha,
                    beta, gamma)
            elif method == "sgd":
                params, var = update_sgd(params[0], grads[0],
                                         nn_architecture, learning_rate)
            else:
                raise Exception("No method found")

            # end of iteration
            cost_history.append(costs)

            # mean position
            mean_param = get_mean(params)
            Y_hat_mean, _ = full_forward_propagation(X[:, start:end],
                                                     mean_param,
                                                     nn_architecture)
            cost_mean = get_cost_value(Y_hat_mean, Y[:, start:end], cost_type)
            cost_history_mean.append(cost_mean)

        # end of epoch ----------------
        var_mean = np.mean(var)  # mean of variances along dimensions of parameter space

        if verbose:
            print("Iteration: {:05} - mean cost: {:.5f} - particle variance: {:.5f}"
                  .format(i, np.mean(costs), var_mean))

        alpha += alpha_rate
        elapsed_epochs += 1

        if var_mean < var_epsilon:
            print("Convergence achieved - Particles are localized")
            break

    if method != "sgd":
        plot_cost(cost_history, cost_history_mean, 'Training Cost Function')
        plot_list(np.mean(cost_history, axis=1), 'Mean Cost Function')
        plot_distance_matrix(params, N_nn)
    else:
        plot_list(cost_history, 'Training Cost Function')

    print("Cost Function evaluated {:01} times".format(
        int(n_batches * elapsed_epochs * N_nn)))

    return params, mean_param

def train_experimental(X, Y, nn_architecture, epochs, learning_rate, method,
                       n_batches, batch_size, cost_type, N_nn, kernel_a,
                       alpha_init, alpha_rate, beta, gamma, verbose,
                       var_epsilon, dispersion_factor=6):

    from optimizers import update_cloud_derivative_free
    from utils import flatten_weights, unflatten_weights

    if method == "sgd":
        N_nn = 1

    # initiation of neural net parameters
    params = [
        init_layers(nn_architecture, i, dispersion_factor) for i in range(N_nn)
    ]
    alpha = alpha_init

    # initiation of lists storing the history
    cost_history = []
    cost_history_mean = []
    accuracy_history = []
    elapsed_epochs = 0

    # find optimal kernel_a
    if kernel_a == "auto":
        print("Finding kernel constant...")
        paramsf, _, _ = flatten_weights(params, N_nn)
        params_diff_matrix = paramsf[:, np.newaxis] - paramsf
        norm = np.sum(params_diff_matrix**2, axis=2)
        for kernel_a in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5,
                         1, 5, 10]:
            if np.mean(np.einsum('ij -> i', np.exp(-kernel_a * norm)) / N_nn) < 0.5:
                break
        print("Kernel constant found: " + str(kernel_a))

    if learning_rate == "auto":
        learning_rate = 1
        lr_decay = True
    else:
        lr_decay = False

    # performing calculations for subsequent iterations
    for i in range(epochs):
        for batch in range(n_batches):

            Y_hat = []
            costs = []
            cache = []
            grads = []

            start = batch * batch_size
            end = start + batch_size

            for j in range(N_nn):
                # step forward
                Y_hat_temp, cache_temp = full_forward_propagation(
                    X[:, start:end], params[j], nn_architecture)
                Y_hat.append(Y_hat_temp)
                cache.append(cache_temp)

                # calculating cost and saving it to history
                costj = get_cost_value(Y_hat[j], Y[:, start:end], cost_type)
                costs.append(costj)

            # get cloud (flattened weights), gradients, nn shape and weight names
            paramsf, nn_shape, weight_names = flatten_weights(params, N_nn)

            # get updated cloud and its variance
            paramsf, var = update_cloud_derivative_free(
                paramsf, costs, learning_rate, N_nn, kernel_a, alpha, beta,
                gamma)

            # restore NN weight shapes
            params = unflatten_weights(paramsf, nn_shape, weight_names, N_nn)

            if lr_decay:
                if i == 0:
                    paramsf_previous = paramsf
                    gt = 0
                delta = paramsf_previous - paramsf
                gt = gt + np.absolute(delta)
                learning_rate = 1 / np.sqrt(1 + gt)
                paramsf_previous = paramsf
                # print(np.mean(learning_rate))

            # end of iteration
            cost_history.append(costs)

            # mean position
            mean_param = get_mean(params)
            Y_hat_mean, _ = full_forward_propagation(X[:, start:end],
                                                     mean_param,
                                                     nn_architecture)
            cost_mean = get_cost_value(Y_hat_mean, Y[:, start:end], cost_type)
            cost_history_mean.append(cost_mean)

        # end of epoch ----------------
        var_mean = np.mean(var)  # mean of variances along dimensions of parameter space

        if verbose:
            print("Iteration: {:05} - mean cost: {:.5f} - particle variance: {:.5f}"
                  .format(i, np.mean(costs), var_mean))

        alpha = alpha + alpha_rate
        elapsed_epochs += 1

        if var_mean < var_epsilon:
            print("Convergence achieved - Particles are localized")
            break

    if method != "sgd":
        plot_cost(cost_history, cost_history_mean, 'Training Cost Function')
        plot_list(np.mean(cost_history, axis=1), 'Mean Cost Function')
        plot_distance_matrix(params, N_nn)
    else:
        plot_list(cost_history, 'Training Cost Function')

    print("Cost Function evaluated {:01} times".format(
        int(n_batches * elapsed_epochs * N_nn)))

    return params, mean_param

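# The "auto" branch in train_experimental picks the smallest kernel constant
# from a fixed grid for which the particles stop looking like immediate
# neighbours under the Gaussian kernel (mean normalized kernel row-sum < 0.5).
# Below is a minimal standalone sketch of that heuristic; `find_kernel_constant`
# and its `candidates` argument are illustrative names, not part of the original
# API, and it assumes `paramsf` is the (N_nn, D) array returned by flatten_weights.

import numpy as np


def find_kernel_constant(paramsf, candidates=(0.0001, 0.0005, 0.001, 0.005,
                                              0.01, 0.05, 0.1, 0.5, 1, 5, 10)):
    """Return the first candidate kernel constant for which the mean
    normalized kernel row-sum drops below 0.5 (hypothetical helper that
    mirrors the loop in train_experimental)."""
    n_particles = paramsf.shape[0]

    # pairwise squared Euclidean distances between flattened particles
    diff = paramsf[:, np.newaxis] - paramsf
    norm = np.sum(diff**2, axis=2)

    for kernel_a in candidates:
        # mean over particles i of (1/N) * sum_j exp(-a * ||x_i - x_j||^2)
        if np.mean(np.exp(-kernel_a * norm).sum(axis=1) / n_particles) < 0.5:
            break
    return kernel_a
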
def train_with_profiling(X, Y, nn_architecture, epochs, learning_rate, method,
                         n_batches, batch_size, cost_type, N, kernel_a,
                         alpha_init, alpha_rate, beta, gamma, verbose,
                         var_epsilon, dispersion_factor=6):

    import time
    from optimizers import update_nn_weights_profiled, update_nn_weights_derivative_free_profiled

    if method == "sgd":
        N = 1

    profiling_time = {
        "full_forward_propagation": 0.0,
        "get_cost_value": 0.0,
        "full_backward_propagation": 0.0,
        "weights_update": 0.0,
        "weights_update_without_flattening": 0.0
    }

    # initiation of neural net parameters
    params = [
        init_layers(nn_architecture, i, dispersion_factor) for i in range(N)
    ]
    alpha = alpha_init

    # initiation of lists storing the history
    cost_history = []
    cost_history_mean = []
    accuracy_history = []
    elapsed_epochs = 0

    # performing calculations for subsequent iterations
    for i in range(epochs):
        for batch in range(n_batches):

            Y_hat = []
            costs = []
            cache = []
            grads = []

            start = batch * batch_size
            end = start + batch_size

            for j in range(N):
                # step forward (timed)
                time_temp = time.process_time()
                Y_hat_temp, cache_temp = full_forward_propagation(
                    X[:, start:end], params[j], nn_architecture)
                profiling_time["full_forward_propagation"] += (
                    time.process_time() - time_temp)
                Y_hat.append(Y_hat_temp)
                cache.append(cache_temp)

                # calculating cost and saving it to history (timed)
                time_temp = time.process_time()
                costj = get_cost_value(Y_hat[j], Y[:, start:end], cost_type)
                profiling_time["get_cost_value"] += (time.process_time() -
                                                     time_temp)
                costs.append(costj)

                # step backward - calculating gradient (timed)
                if method in ["gradient", "sgd"]:
                    time_temp = time.process_time()
                    gradsj = full_backward_propagation(
                        Y_hat[j], Y[:, start:end], cache[j], params[j],
                        nn_architecture)
                    profiling_time["full_backward_propagation"] += (
                        time.process_time() - time_temp)
                    grads.append(gradsj)

            time_temp = time.process_time()
            if method == "gradient":
                params, var, cputime = update_nn_weights_profiled(
                    params, grads, N, learning_rate, kernel_a, alpha, beta,
                    gamma)
            elif method == "nogradient":
                params, var, cputime = update_nn_weights_derivative_free_profiled(
                    params, costs, learning_rate, N, kernel_a, alpha, beta,
                    gamma)
            elif method == "sgd":
                params, var = update_sgd(params[0], grads[0], nn_architecture,
                                         learning_rate)
            else:
                raise Exception("No method found")
            profiling_time["weights_update"] += (time.process_time() -
                                                 time_temp)

            if method in ["gradient", "nogradient"]:
                profiling_time["weights_update_without_flattening"] += cputime

            # end of iteration
            cost_history.append(costs)

            # mean position
            mean_param = get_mean(params)
            Y_hat_mean, _ = full_forward_propagation(X[:, start:end],
                                                     mean_param,
                                                     nn_architecture)
            cost_mean = get_cost_value(Y_hat_mean, Y[:, start:end], cost_type)
            cost_history_mean.append(cost_mean)

        # end of epoch ----------------
        var_mean = np.mean(var)  # mean of variances along dimensions of parameter space

        if verbose:
            print("Iteration: {:05} - mean cost: {:.5f} - particle variance: {:.5f}"
                  .format(i, np.mean(costs), var_mean))

        alpha = alpha + alpha_rate
        elapsed_epochs += 1

        if var_mean < var_epsilon:
            print("Convergence achieved - Particles are localized")
            break

    if method != "sgd":
        plot_cost(cost_history, cost_history_mean, 'Training Cost Function')
        plot_list(np.mean(cost_history, axis=1), 'Mean Cost Function')
        plot_distance_matrix(params, N)
    else:
        plot_list(cost_history, 'Training Cost Function')

    print("Cost Function evaluated {:01} times".format(
        int(n_batches * elapsed_epochs * N)))

    print("")
    print("CPU TIME --------------------------------------")
    for key, value in profiling_time.items():
        print(key, value)
    print("")

    return params, mean_param

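# train_with_profiling wraps each hot call in a pair of time.process_time()
# reads and accumulates the difference into the profiling_time dict. Below is a
# minimal sketch of the same bookkeeping as a reusable context manager;
# `accumulate_time` is a hypothetical helper, not part of the original code.

import time
from contextlib import contextmanager


@contextmanager
def accumulate_time(timings, key):
    """Add the CPU time spent inside the `with` block to timings[key]."""
    t0 = time.process_time()
    try:
        yield
    finally:
        timings[key] = timings.get(key, 0.0) + (time.process_time() - t0)


# Usage sketch (equivalent to the explicit time_temp pattern above):
# with accumulate_time(profiling_time, "full_forward_propagation"):
#     Y_hat_temp, cache_temp = full_forward_propagation(
#         X[:, start:end], params[j], nn_architecture)
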
def train_nn(X, Y, cloud, nn_architecture, method, max_epochs, n_batches,
             batch_size, learning_rate, cost_type, N, kernel_a, alpha_init,
             alpha_rate, beta, gamma, verbose, var_epsilon):

    # initiation of lists storing the cost history
    cost_history = []
    cost_history_mean = []

    alpha = alpha_init
    elapsed_epochs = 0

    print("\nTraining started...")

    # performing calculations for subsequent iterations
    for i in range(max_epochs):
        for batch in range(n_batches):

            start = batch * batch_size
            end = start + batch_size

            Y_hat = []
            costs = []
            cache = []
            grads = []

            for j in range(N):
                # step forward
                Y_hat_temp, cache_temp = full_forward_propagation(
                    X[:, start:end], cloud[j], nn_architecture)
                Y_hat.append(Y_hat_temp)
                cache.append(cache_temp)

                # calculating cost and saving it to history
                costj = get_cost_value(Y_hat[j], Y[:, start:end], cost_type)
                costs.append(costj)

                # step backward - calculating gradient
                if method in ["gradient_descent", "swarm"]:
                    gradsj = full_backward_propagation(
                        Y_hat[j], Y[:, start:end], cache[j], cloud[j],
                        nn_architecture)
                    grads.append(gradsj)

            if method == "swarm":
                cloud, cloud_var = update_nn_weights(cloud, grads,
                                                     learning_rate, N,
                                                     kernel_a, alpha, beta,
                                                     gamma)
            elif method == "swarm_derivfree":
                cloud, cloud_var = update_nn_weights_derivative_free(
                    cloud, costs, learning_rate, N, kernel_a, alpha, beta,
                    gamma)
            elif method == "gradient_descent":
                cloud, cloud_var = update_gd(cloud[0], grads[0],
                                             nn_architecture, learning_rate)
            else:
                raise Exception("No method found")

            # end of iteration
            cost_history.append(costs)

            # mean particle position and its cost
            cloud_mean = get_mean(cloud)
            Y_hat_mean, _ = full_forward_propagation(X[:, start:end],
                                                     cloud_mean,
                                                     nn_architecture)
            cost_mean = get_cost_value(Y_hat_mean, Y[:, start:end], cost_type)
            cost_history_mean.append(cost_mean)

        # end of epoch ----------------
        cloud_var = np.mean(cloud_var)  # mean of variances along dimensions of parameter space

        if verbose:
            print("Iteration: {:05} - Cloud mean cost: {:.5f} - Cloud variance: {:.5f}"
                  .format(i, cost_mean, cloud_var))

        alpha += alpha_rate
        elapsed_epochs += 1

        if cloud_var < var_epsilon:
            print("Convergence achieved - particles are localized")
            break

        if i == (max_epochs - 1):
            print("Maximum amount of epochs reached")

    print("\nFunction value at cloud mean: " + str(cost_mean))
    print("Cost function evaluated {:01} times".format(
        int(n_batches * elapsed_epochs * N)))

    return cloud, cloud_mean, cloud_var, cost_history, cost_history_mean

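# A hedged usage sketch for train_nn. It assumes the module exposes init_layers
# (used above in train2/train_experimental to build the particle cloud) and
# that X and Y are laid out feature-major, i.e. (n_features, n_samples) and
# (n_outputs, n_samples), since batches are sliced as X[:, start:end]. The
# hyperparameter values, the "mse" cost label, and the helper name
# example_train_nn are illustrative assumptions, not tuned defaults from the
# original project.

def example_train_nn(X, Y, nn_architecture, N=20, batch_size=32):
    n_batches = X.shape[1] // batch_size

    # one parameter set per particle, as in train2 above
    cloud = [init_layers(nn_architecture, seed, 6) for seed in range(N)]

    cloud, cloud_mean, cloud_var, cost_history, cost_history_mean = train_nn(
        X, Y, cloud, nn_architecture,
        method="swarm_derivfree",   # derivative-free cloud update
        max_epochs=100,
        n_batches=n_batches,
        batch_size=batch_size,
        learning_rate=0.1,
        cost_type="mse",            # assumption: a label understood by get_cost_value
        N=N,
        kernel_a=0.01,
        alpha_init=0.0,
        alpha_rate=0.01,
        beta=0.9,
        gamma=0.9,
        verbose=True,
        var_epsilon=1e-5)

    return cloud_mean, cost_history_mean
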
def forward_propagation(self, X, cloud):
    Y, _ = full_forward_propagation(X.T, cloud, self.architecture)
    return Y.T