def update_nn_weights_derivative_free_old(cloud, cost, lr, N, kernel_a, alpha,
                                          beta, gamma):

    #get flattened weights, nn shape and weight names
    cloudf, nn_shape, weight_names = flatten_weights(cloud, N)

    #compute kernels
    kernels = [[kernel(cloudf[i], cloudf[j], kernel_a) for j in range(N)]
               for i in range(N)]
    gkernels = [[gkernel(cloudf[i], cloudf[j], kernel_a) for j in range(N)]
                for i in range(N)]

    #plt.imshow(kernels,vmin=0,vmax=1)
    #plt.colorbar()

    #compute mean and variance of the cloud
    cloud_mean = np.mean(cloudf, axis=0)
    cloud_var = get_var(cloudf, cloud_mean)

    #compute gradient flows
    updates = []
    for nn in range(N):
        R = 0
        P = 0
        S = 0

        Q = [
            gkernels[nn][j] * cost[j] + kernels[nn][j] * cost[j] * np.divide(
                (cloudf[j] - cloud_mean), cloud_var) for j in range(N)
        ]
        Q = np.mean(Q, axis=0)

        if alpha > 0:
            R = [[kernels[nn][j] * (cloudf[j] - cloudf[k]) for j in range(N)]
                 for k in range(N)]
            R = [item for sublist in R
                 for item in sublist]  #Flatten list of lists
            R = np.sum(R, axis=0) * float(1 / N**2)

        if beta > 0:
            P = [gkernels[nn][j] for j in range(N)]
            P = np.mean(P, axis=0)

        if gamma > 0:
            S = [
                kernels[nn][j] * np.divide((cloudf[j] - cloud_mean), cloud_var)
                for j in range(N)
            ]
            S = np.mean(S, axis=0)

        updates.append(-lr * (Q + alpha * R + beta * P + gamma * S))

    #update flattened tensors
    for nn in range(N):
        cloudf[nn] = cloudf[nn] + updates[nn]

    #restore NN weight shapes
    new_nn_weights = unflatten_weights(cloudf, nn_shape, weight_names, N)

    return new_nn_weights, cloud_var
def update_nn_weights(cloud, gradients, lr, N, kernel_a, alpha, beta, gamma):

    #get cloud (flattened weights), gradients, nn shape and weight names
    cloudf, gradientsf, nn_shape, weight_names = flatten_weights_gradients(
        cloud, gradients, N)

    #get updated cloud and its variance
    cloudf, cloud_var = update_cloud(cloudf, gradientsf, lr, N, kernel_a,
                                     alpha, beta, gamma)

    #restore NN weight shapes
    new_nn_weights = unflatten_weights(cloudf, nn_shape, weight_names, N)

    return new_nn_weights, cloud_var
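
The flatten_weights / unflatten_weights helpers used by the particle-cloud examples above are not shown on this page. Below is a minimal sketch of what they might look like, assuming each particle's weights are stored as a dict of named numpy arrays; the project's actual utils module may differ (note that the single-network examples further down use a different flatten_weights with another signature).

import numpy as np

def flatten_weights(cloud, N):
    #record tensor names and shapes once (all particles share the same layout)
    weight_names = list(cloud[0].keys())
    nn_shape = [cloud[0][name].shape for name in weight_names]
    #concatenate each particle's tensors into one flat vector
    cloudf = np.stack([
        np.concatenate([cloud[i][name].ravel() for name in weight_names])
        for i in range(N)
    ])
    return cloudf, nn_shape, weight_names

def unflatten_weights(cloudf, nn_shape, weight_names, N):
    #split each flat vector back into the original named tensors
    sizes = [int(np.prod(shape)) for shape in nn_shape]
    restored = []
    for i in range(N):
        chunks = np.split(cloudf[i], np.cumsum(sizes)[:-1])
        restored.append({
            name: chunk.reshape(shape)
            for name, chunk, shape in zip(weight_names, chunks, nn_shape)
        })
    return restored
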
Example #3
def update_nn_weights_derivative_free_profiled(cloud, cost, lr, N_nn, kernel_a,
                                               alpha, beta, gamma):

    #get cloud (flattened weights), gradients, nn shape and weight names
    cloudf, nn_shape, weight_names = flatten_weights(cloud, N_nn)

    #time needed for the update
    time_temp = time.process_time()

    #get updated cloud and its variance
    cloudf, cloud_var = update_cloud_derivative_free(cloudf, cost, lr,
                                                     kernel_a, alpha, beta,
                                                     gamma)
    update_time = time.process_time() - time_temp

    #restore NN weight shapes
    new_nn_weights = unflatten_weights(cloudf, nn_shape, weight_names, N_nn)

    return new_nn_weights, cloud_var, update_time
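
A hypothetical usage of the profiled variant, assuming cloud, cost and the hyperparameter values are already defined; the returned update_time can be accumulated to compare the cost of the derivative-free update across iterations.

total_update_time = 0.0
#values below are illustrative only
cloud, cloud_var, update_time = update_nn_weights_derivative_free_profiled(
    cloud, cost, lr=0.05, N_nn=10, kernel_a=0.01, alpha=0.0, beta=0.0, gamma=0.0)
total_update_time += update_time
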
Example #4
    def funObj(self, weights_flat, X, y):
        weights = unflatten_weights(weights_flat, self.layer_sizes)
        activations = [X]
        for W, b in weights:
            Z = X @ W.T + b
            X = 1 / (1 + np.exp(-Z))
            activations.append(X)

        yhat = Z

        if self.classification:  # softmax
            tmp = np.sum(np.exp(yhat), axis=1)
            f = -np.sum(yhat[y.astype(bool)] - log_sum_exp(yhat))
            grad = np.exp(yhat) / tmp[:, None] - y
        else:  # L2 loss
            f = 0.5 * np.sum((yhat - y)**2)
            grad = yhat - y  # gradient for L2 loss

        grad_W = grad.T @ activations[-2]
        grad_b = np.sum(grad, axis=0)

        g = [(grad_W, grad_b)]

        for i in range(len(self.layer_sizes) - 2, 0, -1):
            W, b = weights[i]
            grad = grad @ W
            grad = grad * (activations[i] *
                           (1 - activations[i]))  # derivative of the sigmoid activation
            grad_W = grad.T @ activations[i - 1]
            grad_b = np.sum(grad, axis=0)

            g = [(grad_W, grad_b)] + g  # insert to start of list

        g = flatten_weights(g)

        # add L2 regularization
        f += 0.5 * self.lammy * np.sum(weights_flat**2)
        g += self.lammy * weights_flat

        return f, g
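
The softmax branch of funObj above calls a log_sum_exp helper that is not shown in this excerpt. A numerically stable sketch of such a helper is given below (an assumption, not necessarily the original implementation).

import numpy as np

def log_sum_exp(Z):
    #subtract the row-wise max before exponentiating to avoid overflow
    Z_max = np.max(Z, axis=1)
    return Z_max + np.log(np.sum(np.exp(Z - Z_max[:, None]), axis=1))  # shape (n,)
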
Example #5
    def fit(self, X, y):
        if y.ndim == 1:
            y = y[:, None]

        self.layer_sizes = [X.shape[1]] + self.hidden_layer_sizes + [y.shape[1]]
        self.classification = y.shape[1] > 1  # assume it's classification iff y has more than 1 column

        # random init
        scale = 0.01
        weights = list()
        for i in range(len(self.layer_sizes) - 1):
            W = scale * np.random.randn(self.layer_sizes[i + 1],
                                        self.layer_sizes[i])
            b = scale * np.random.randn(1, self.layer_sizes[i + 1])
            weights.append((W, b))
        weights_flat = flatten_weights(weights)

        weights_flat_new, f = findMin(self.funObj,
                                      weights_flat,
                                      self.max_iter,
                                      X,
                                      y,
                                      verbose=True)
        self.weights = unflatten_weights(weights_flat_new, self.layer_sizes)
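
A hypothetical usage of this fit method; the class name NeuralNet and its constructor arguments are assumptions inferred from the attributes referenced in fit and funObj (hidden_layer_sizes, lammy, max_iter).

model = NeuralNet(hidden_layer_sizes=[64], lammy=1.0, max_iter=100)
model.fit(X_train, Y_train)  # Y_train one-hot for classification, single column for regression
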
Example #6
def train_experimental(X,
                       Y,
                       nn_architecture,
                       epochs,
                       learning_rate,
                       method,
                       n_batches,
                       batch_size,
                       cost_type,
                       N_nn,
                       kernel_a,
                       alpha_init,
                       alpha_rate,
                       beta,
                       gamma,
                       verbose,
                       var_epsilon,
                       dispersion_factor=6):

    from optimizers import update_cloud_derivative_free
    from utils import flatten_weights, unflatten_weights

    if method == "sgd":
        N_nn = 1

    # initialization of neural net parameters
    params = [
        init_layers(nn_architecture, i, dispersion_factor) for i in range(N_nn)
    ]
    alpha = alpha_init

    # initialization of lists storing the history
    cost_history = []
    cost_history_mean = []
    accuracy_history = []

    elapsed_epochs = 0

    #find a suitable kernel_a by coarse grid search
    if kernel_a == "auto":
        print("Finding kernel constant...")
        paramsf, _, _ = flatten_weights(params, N_nn)
        params_diff_matrix = paramsf[:, np.newaxis] - paramsf
        norm = np.sum(params_diff_matrix**2, axis=2)
        for kernel_a in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]:
            if np.mean(np.einsum('ij -> i', np.exp(-kernel_a * norm)) / N_nn) < 0.5:
                break

        print("Kernel constant found: " + str(kernel_a))

    if learning_rate == "auto":
        learning_rate = 1
        lr_decay = True
    else:
        lr_decay = False

    # main training loop over epochs
    for i in range(epochs):

        for batch in range(n_batches):

            Y_hat = []
            costs = []
            cache = []
            grads = []

            start = batch * batch_size
            end = start + batch_size

            for j in range(N_nn):

                # step forward
                Y_hat_temp, cache_temp = full_forward_propagation(
                    X[:, start:end], params[j], nn_architecture)
                Y_hat.append(Y_hat_temp)
                cache.append(cache_temp)

                # calculating cost and saving it to history
                costj = get_cost_value(Y_hat[j], Y[:, start:end], cost_type)
                costs.append(costj)

            #get cloud (flattened weights), gradients, nn shape and weight names
            paramsf, nn_shape, weight_names = flatten_weights(params, N_nn)

            #get updated cloud and its variance
            paramsf, var = update_cloud_derivative_free(
                paramsf, costs, learning_rate, N_nn, kernel_a, alpha, beta,
                gamma)

            #restore NN weight shapes
            params = unflatten_weights(paramsf, nn_shape, weight_names, N_nn)

            if (lr_decay):
                if i == 0:
                    paramsf_previous = paramsf
                    gt = 0

                delta = paramsf_previous - paramsf
                gt = gt + np.absolute(delta)
                learning_rate = 1 / np.sqrt(1 + gt)

                paramsf_previous = paramsf
                #print(np.mean(learning_rate))

            #end of iteration
            cost_history.append(costs)

            #mean position
            mean_param = get_mean(params)
            Y_hat_mean, _ = full_forward_propagation(X[:, start:end],
                                                     mean_param, nn_architecture)
            cost_mean = get_cost_value(Y_hat_mean, Y[:, start:end], cost_type)
            cost_history_mean.append(cost_mean)

        #end of epoch----------------
        #mean of variances along dimensions of parameter space
        var_mean = np.mean(var)

        if (verbose):
            print(
                "Iteration: {:05} - mean cost: {:.5f} - particle variance: {:.5f}"
                .format(i, np.mean(costs), var_mean))

        alpha = alpha + alpha_rate
        elapsed_epochs += 1

        if var_mean < var_epsilon:
            print("Convergence achieved - Particles are localized")
            break

    if not method == "sgd":
        plot_cost(cost_history, cost_history_mean, 'Training Cost Function')
        plot_list(np.mean(cost_history, axis=1), 'Mean Cost Function')
        plot_distance_matrix(params, N_nn)
    else:
        plot_list(cost_history, 'Training Cost Function')

    print("Cost Function evaluated {:01} times".format(
        int(n_batches * elapsed_epochs * N_nn)))

    return params, mean_param
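
A sketch of how train_experimental might be called. The nn_architecture layout (list of layer dicts), the method name "swarm" for the non-SGD branch, and all numeric values are assumptions for illustration; X is expected as features x samples and Y as outputs x samples, matching the slicing X[:, start:end] above.

nn_architecture = [
    {"input_dim": 2, "output_dim": 16, "activation": "relu"},
    {"input_dim": 16, "output_dim": 1, "activation": "sigmoid"},
]

params, mean_param = train_experimental(
    X, Y, nn_architecture,
    epochs=200, learning_rate="auto", method="swarm",
    n_batches=10, batch_size=X.shape[1] // 10, cost_type="mse",
    N_nn=20, kernel_a="auto", alpha_init=0.0, alpha_rate=0.0,
    beta=0.0, gamma=0.0, verbose=True, var_epsilon=1e-5)
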