def update_nn_weights_derivative_free_old(cloud, cost, lr, N, kernel_a, alpha, beta, gamma):
    #get flattened weights, nn shape and weight names
    cloudf, nn_shape, weight_names = flatten_weights(cloud, N)

    #compute kernels and kernel gradients between all pairs of particles
    kernels = [[kernel(cloudf[i], cloudf[j], kernel_a) for j in range(N)] for i in range(N)]
    gkernels = [[gkernel(cloudf[i], cloudf[j], kernel_a) for j in range(N)] for i in range(N)]

    #compute mean and standard deviation of the cloud
    cloud_mean = np.mean(cloudf, axis=0)
    cloud_var = get_var(cloudf, cloud_mean)

    #compute gradient flows
    updates = []
    for nn in range(N):
        R = 0
        P = 0
        S = 0

        Q = [gkernels[nn][j] * cost[j]
             + kernels[nn][j] * cost[j] * np.divide((cloudf[j] - cloud_mean), cloud_var)
             for j in range(N)]
        Q = np.mean(Q, axis=0)

        if alpha > 0:
            R = [[kernels[nn][j] * (cloudf[j] - cloudf[k]) for j in range(N)] for k in range(N)]
            R = [item for sublist in R for item in sublist]  #flatten list of lists
            R = np.sum(R, axis=0) * float(1 / N**2)

        if beta > 0:
            P = [gkernels[nn][j] for j in range(N)]
            P = np.mean(P, axis=0)

        if gamma > 0:
            S = [kernels[nn][j] * np.divide((cloudf[j] - cloud_mean), cloud_var) for j in range(N)]
            S = np.mean(S, axis=0)

        updates.append(-lr * (Q + alpha * R + beta * P + gamma * S))

    #update flattened tensors
    for nn in range(N):
        cloudf[nn] = cloudf[nn] + updates[nn]

    #restore NN weight shapes
    new_nn_weights = unflatten_weights(cloudf, nn_shape, weight_names, N)

    return new_nn_weights, cloud_var
def train(self, X, Y, method, max_epochs, n_batches, batch_size, learning_rate, cost_type,
          kernel_a="auto", alpha_init=0, alpha_rate=1, beta=0, gamma=1, verbose=True, var_epsilon=0):

    #sgd uses a single network, so collapse the cloud if needed
    if (method == "sgd") and (self.N > 1):
        self.init_cloud(1)

    #transpose input and output data
    X = X.T
    Y = Y.T

    #find optimal kernel_a
    if kernel_a == "auto":
        cloudf, _, _ = flatten_weights(self.cloud, self.N)
        kernel_a = kernel_a_finder(cloudf, self.N)

    self.train_method = method

    self.cloud, self.cloud_mean, self.cloud_var, self.cost_history, self.cost_history_mean = train_nn(
        X, Y, self.cloud, self.architecture, method, max_epochs, n_batches, batch_size,
        learning_rate, cost_type, self.N, kernel_a, alpha_init, alpha_rate, beta, gamma,
        verbose, var_epsilon)
def init_cloud(self, N, dispersion_factor=6):
    self.N = N
    self.cloud = [init_layers(self.architecture, i, dispersion_factor) for i in range(N)]
    self.cloud_mean = get_mean(self.cloud)
    cloudf, _, _ = flatten_weights(self.cloud, self.N)
    self.cloud_var = get_var(cloudf)
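# Usage sketch (illustrative, not part of the module): how init_cloud and train are
# meant to be combined. The class name "ParticleCloudNN" and the method/cost strings
# below are assumptions; substitute the actual class and options from this repository.
def _example_cloud_training(X_train, Y_train, architecture):
    model = ParticleCloudNN(architecture)        # assumed class exposing init_cloud/train
    model.init_cloud(N=20, dispersion_factor=6)  # 20 networks (particles) in the cloud
    model.train(X_train, Y_train, method="swarm", max_epochs=100, n_batches=10,
                batch_size=32, learning_rate=0.05, cost_type="mse",
                kernel_a="auto", alpha_init=0, alpha_rate=1, beta=0, gamma=1,
                verbose=True, var_epsilon=1e-4)
    return model.cloud_mean, model.cost_history_mean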
def update_nn_weights_derivative_free(cloud, cost, lr, N, kernel_a, alpha, beta, gamma):
    #get cloud (flattened weights), nn shape and weight names
    cloudf, nn_shape, weight_names = flatten_weights(cloud, N)

    #get updated cloud and its variance
    cloudf, cloud_var = update_cloud_derivative_free(cloudf, cost, lr, N, kernel_a, alpha, beta, gamma)

    #restore NN weight shapes
    new_nn_weights = unflatten_weights(cloudf, nn_shape, weight_names, N)

    return new_nn_weights, cloud_var
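# Usage sketch: one derivative-free update step for a cloud of N networks. The forward
# pass and cost evaluation mirror the loop in train_experimental below; the
# hyperparameter values are illustrative only, not recommended settings.
def _example_update_step(cloud, X_batch, Y_batch, nn_architecture, cost_type):
    N = len(cloud)
    costs = []
    for j in range(N):
        #evaluate each particle on the batch and record its cost
        Y_hat_j, _ = full_forward_propagation(X_batch, cloud[j], nn_architecture)
        costs.append(get_cost_value(Y_hat_j, Y_batch, cost_type))
    #apply one derivative-free update to the whole cloud
    cloud, cloud_var = update_nn_weights_derivative_free(
        cloud, costs, lr=0.05, N=N, kernel_a=0.01, alpha=0, beta=0, gamma=1)
    return cloud, cloud_var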
def update_nn_weights_derivative_free_profiled(cloud, cost, lr, N_nn, kernel_a, alpha, beta, gamma):
    #get cloud (flattened weights), nn shape and weight names
    cloudf, nn_shape, weight_names = flatten_weights(cloud, N_nn)

    #time the cloud update
    time_temp = time.process_time()

    #get updated cloud and its variance
    cloudf, cloud_var = update_cloud_derivative_free(cloudf, cost, lr, N_nn, kernel_a, alpha, beta, gamma)
    update_time = time.process_time() - time_temp

    #restore NN weight shapes
    new_nn_weights = unflatten_weights(cloudf, nn_shape, weight_names, N_nn)

    return new_nn_weights, cloud_var, update_time
def funObj(self, weights_flat, X, y):
    weights = unflatten_weights(weights_flat, self.layer_sizes)

    #forward pass with sigmoid activations
    activations = [X]
    for W, b in weights:
        Z = X @ W.T + b
        X = 1 / (1 + np.exp(-Z))
        activations.append(X)

    yhat = Z  #the output layer is linear: use the last pre-activation

    if self.classification:  #softmax cross-entropy
        tmp = np.sum(np.exp(yhat), axis=1)
        f = -np.sum(yhat[y.astype(bool)] - log_sum_exp(yhat))
        grad = np.exp(yhat) / tmp[:, None] - y
    else:  #L2 loss
        f = 0.5 * np.sum((yhat - y)**2)
        grad = yhat - y  #gradient of the L2 loss

    #backpropagate through the output layer
    grad_W = grad.T @ activations[-2]
    grad_b = np.sum(grad, axis=0)
    g = [(grad_W, grad_b)]

    #backpropagate through the hidden layers
    for i in range(len(self.layer_sizes) - 2, 0, -1):
        W, b = weights[i]
        grad = grad @ W
        grad = grad * (activations[i] * (1 - activations[i]))  #derivative of the sigmoid
        grad_W = grad.T @ activations[i - 1]
        grad_b = np.sum(grad, axis=0)
        g = [(grad_W, grad_b)] + g  #insert at the start of the list

    g = flatten_weights(g)

    #add L2 regularization
    f += 0.5 * self.lammy * np.sum(weights_flat**2)
    g += self.lammy * weights_flat

    return f, g
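# Optional sanity check (not part of the original class): compare the analytic gradient
# returned by funObj against a central finite-difference estimate on a few randomly
# chosen coordinates. `model` is assumed to be an instance of the network class above
# with layer_sizes, lammy and classification already set.
def _check_funObj_gradient(model, weights_flat, X, y, n_checks=5, eps=1e-6):
    _, g = model.funObj(weights_flat, X, y)
    rng = np.random.default_rng(0)
    for idx in rng.choice(weights_flat.size, size=n_checks, replace=False):
        e = np.zeros_like(weights_flat)
        e[idx] = eps
        f_plus, _ = model.funObj(weights_flat + e, X, y)
        f_minus, _ = model.funObj(weights_flat - e, X, y)
        numeric = (f_plus - f_minus) / (2 * eps)
        print("coord {}: analytic {:.6f}, numeric {:.6f}".format(idx, g[idx], numeric))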
def fit(self, X, y):
    if y.ndim == 1:
        y = y[:, None]

    self.layer_sizes = [X.shape[1]] + self.hidden_layer_sizes + [y.shape[1]]
    self.classification = y.shape[1] > 1  #assume it's classification iff y has more than 1 column

    #random initialization
    scale = 0.01
    weights = list()
    for i in range(len(self.layer_sizes) - 1):
        W = scale * np.random.randn(self.layer_sizes[i + 1], self.layer_sizes[i])
        b = scale * np.random.randn(1, self.layer_sizes[i + 1])
        weights.append((W, b))
    weights_flat = flatten_weights(weights)

    weights_flat_new, f = findMin(self.funObj, weights_flat, self.max_iter, X, y, verbose=True)
    self.weights = unflatten_weights(weights_flat_new, self.layer_sizes)
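# Usage sketch: fitting the network on a small synthetic regression problem. The class
# name "NeuralNet" and its constructor arguments are assumptions for illustration; only
# fit and funObj are defined above.
def _example_fit_regression():
    X = np.random.randn(100, 5)
    y = X @ np.random.randn(5) + 0.1 * np.random.randn(100)
    model = NeuralNet(hidden_layer_sizes=[10], lammy=1.0, max_iter=100)  #assumed signature
    model.fit(X, y)
    return model.weights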
def train_experimental(X, Y, nn_architecture, epochs, learning_rate, method, n_batches, batch_size,
                       cost_type, N_nn, kernel_a, alpha_init, alpha_rate, beta, gamma, verbose,
                       var_epsilon, dispersion_factor=6):
    from optimizers import update_cloud_derivative_free
    from utils import flatten_weights, unflatten_weights

    if method == "sgd":
        N_nn = 1

    #initialization of neural net parameters
    params = [init_layers(nn_architecture, i, dispersion_factor) for i in range(N_nn)]
    alpha = alpha_init

    #initialization of lists storing the history
    cost_history = []
    cost_history_mean = []
    accuracy_history = []
    elapsed_epochs = 0

    #find optimal kernel_a
    if kernel_a == "auto":
        print("Finding kernel constant...")
        paramsf, _, _ = flatten_weights(params, N_nn)
        params_diff_matrix = paramsf[:, np.newaxis] - paramsf
        norm = np.sum(params_diff_matrix**2, axis=2)
        for kernel_a in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]:
            if np.mean(np.einsum('ij -> i', np.exp(-kernel_a * norm)) / N_nn) < 0.5:
                break
        print("Kernel constant found: " + str(kernel_a))

    if learning_rate == "auto":
        learning_rate = 1
        lr_decay = True
    else:
        lr_decay = False

    #performing calculations for subsequent epochs
    for i in range(epochs):
        for batch in range(n_batches):

            Y_hat = []
            costs = []
            cache = []
            grads = []

            start = batch * batch_size
            end = start + batch_size

            for j in range(N_nn):
                #step forward
                Y_hat_temp, cache_temp = full_forward_propagation(X[:, start:end], params[j], nn_architecture)
                Y_hat.append(Y_hat_temp)
                cache.append(cache_temp)

                #calculating cost and saving it to history
                costj = get_cost_value(Y_hat[j], Y[:, start:end], cost_type)
                costs.append(costj)

            #get cloud (flattened weights), nn shape and weight names
            paramsf, nn_shape, weight_names = flatten_weights(params, N_nn)

            #get updated cloud and its variance
            paramsf, var = update_cloud_derivative_free(paramsf, costs, learning_rate, N_nn,
                                                        kernel_a, alpha, beta, gamma)

            #restore NN weight shapes
            params = unflatten_weights(paramsf, nn_shape, weight_names, N_nn)

            #adaptive learning-rate decay based on accumulated parameter movement
            if lr_decay:
                if i == 0:
                    paramsf_previous = paramsf
                    gt = 0
                delta = paramsf_previous - paramsf
                gt = gt + np.absolute(delta)
                learning_rate = 1 / np.sqrt(1 + gt)
                paramsf_previous = paramsf

            #end of iteration
            cost_history.append(costs)

            #cost at the mean position of the cloud
            mean_param = get_mean(params)
            Y_hat_mean, _ = full_forward_propagation(X[:, start:end], mean_param, nn_architecture)
            cost_mean = get_cost_value(Y_hat_mean, Y[:, start:end], cost_type)
            cost_history_mean.append(cost_mean)

        #end of epoch
        var_mean = np.mean(var)  #mean of variances along dimensions of parameter space

        if verbose:
            print("Iteration: {:05} - mean cost: {:.5f} - particle variance: {:.5f}".format(
                i, np.mean(costs), var_mean))

        alpha = alpha + alpha_rate
        elapsed_epochs += 1

        if var_mean < var_epsilon:
            print("Convergence achieved - Particles are localized")
            break

    if not method == "sgd":
        plot_cost(cost_history, cost_history_mean, 'Training Cost Function')
        plot_list(np.mean(cost_history, axis=1), 'Mean Cost Function')
        plot_distance_matrix(params, N_nn)
    else:
        plot_list(cost_history, 'Training Cost Function')

    print("Cost Function evaluated {:01} times".format(int(n_batches * elapsed_epochs * N_nn)))

    return params, mean_param
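# Usage sketch: calling train_experimental directly. The architecture format below
# (a list of layer dicts) is an assumption about how init_layers and
# full_forward_propagation are driven in this codebase; inputs are expected with shape
# (features, samples), matching the X[:, start:end] slicing above.
def _example_train_experimental(X, Y):
    nn_architecture = [
        {"input_dim": X.shape[0], "output_dim": 16, "activation": "relu"},      #assumed format
        {"input_dim": 16, "output_dim": Y.shape[0], "activation": "sigmoid"},
    ]
    params, mean_param = train_experimental(
        X, Y, nn_architecture, epochs=50, learning_rate="auto", method="swarm",
        n_batches=10, batch_size=X.shape[1] // 10, cost_type="mse", N_nn=20,
        kernel_a="auto", alpha_init=0, alpha_rate=1, beta=0, gamma=1,
        verbose=True, var_epsilon=1e-4)
    return params, mean_param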
if args.save_interval and (epoch + 1) % args.save_interval == 0:
    torch.save(
        {
            'epoch': epoch + 1,
            'netG': netG.state_dict(),
            'netD': netD.state_dict()
        }, '{:s}/checkpoint.pth'.format(args.saveDir))
    print('Saved at checkpoint!')

if args.visdom:
    netG.eval()
    with torch.no_grad():
        z_pred = netG(fixed_x.unsqueeze(0)).squeeze()
    viz_3D = torch.stack((fixed_x[0::2], fixed_x[1::2], z_pred), dim=1)
    visdom.create_scatter(viz_3D, title='Sample Prediction (epoch {:d})'.format(epoch + 1))

if args.visdom:
    opts_D = dict(xlabel='Weight', ylabel='Freq', title='Weight Histogram (D)')
    opts_G = dict(xlabel='Weight', ylabel='Freq', title='Weight Histogram (G)')
    visdom.create_hist(utils.flatten_weights(netD), opts=opts_D)
    visdom.create_hist(utils.flatten_weights(netG), opts=opts_G)

# export model
if args.export:
    torch.onnx.export(netG, fixed_x.expand(2, -1), '{:s}/posenet.proto'.format(args.saveDir))

print('Finish')
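# Resuming helper (sketch, not in the original script): restore the generator and
# discriminator from the checkpoint written above. Assumes netG and netD are already
# constructed with the same architectures used during training.
def load_checkpoint(path, netG, netD):
    checkpoint = torch.load(path)
    netG.load_state_dict(checkpoint['netG'])
    netD.load_state_dict(checkpoint['netD'])
    return checkpoint['epoch']  #epoch to resume from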