Example #1
    def backpropagation(self, train_set_y):
#        (train_set_x, train_set_y) = train_xy

        # assuming linear output and square error cost function
        observation_error = self.final_layer_output - train_set_y

        self.W_grads = []
        self.b_grads = []
        current_error = observation_error
        current_activation = self.activations[-1]
        current_W_grad = gnp.dot(current_activation.T, observation_error)
        current_b_grad = gnp.dot(gnp.ones((1, observation_error.shape[0])), observation_error)
        self.W_grads.append(current_W_grad)
        self.b_grads.append(current_b_grad)

        propagate_error = gnp.dot(observation_error, self.W_params[self.n_layers].T) # final layer is linear output, gradient is one
        for i in reversed(list(range(self.n_layers))):
            current_activation = self.activations[i]
            current_gradient = 1.0 - current_activation ** 2
            current_W_grad = gnp.dot(current_activation.T, propagate_error)
            current_b_grad = gnp.dot(gnp.ones((1, propagate_error.shape[0])), propagate_error)
            propagate_error = gnp.dot(propagate_error, self.W_params[i].T) * current_gradient

            self.W_grads.insert(0, current_W_grad)
            self.b_grads.insert(0, current_b_grad)
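Note on the example above: gnp.dot(gnp.ones((1, N)), error) is just a column sum of the error over the batch, i.e. the bias gradient for a linear layer. A minimal NumPy sketch of the equivalence (illustrative only, not taken from this codebase; gnumpy mirrors the NumPy API):

import numpy as np

rng = np.random.default_rng(0)
error = rng.standard_normal((8, 3))            # batch of 8, 3 output units

b_grad_ones = np.dot(np.ones((1, error.shape[0])), error)  # 1x3 via the ones trick
b_grad_sums = error.sum(axis=0, keepdims=True)             # equivalent column sum

assert np.allclose(b_grad_ones, b_grad_sums)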
Example #2
    def forward(self, X, test=False):
        """
        Feed-forward pass through the model
        X: ('batchsize' x 'context') matrix of word indices
        """
        batchsize = X.shape[0]
        R = self.R
        C = self.C
        bw = self.bw

        # Obtain word features
        tmp = R.as_numpy_array()[:, X.flatten()].flatten(order='F')
        tmp = tmp.reshape((batchsize, self.K * self.context))
        words = np.zeros((batchsize, self.K, self.context))
        for i in range(batchsize):
            words[i, :, :] = tmp[i, :].reshape((self.K, self.context),
                                               order='F')
        words = gpu.garray(words)

        # Compute the hidden layer (predicted next word representation)
        acts = gpu.zeros((batchsize, self.K))
        for i in range(self.context):
            acts = acts + gpu.dot(words[:, :, i], C[i, :, :])
        acts = gpu.concatenate((acts, gpu.ones((batchsize, 1))), 1)

        # Compute softmax
        preds = gpu.dot(acts, gpu.concatenate((R, bw)))
        preds = gpu.exp(preds - preds.max(1).reshape(batchsize, 1))
        denom = preds.sum(1).reshape(batchsize, 1)
        preds = gpu.concatenate((preds / denom, gpu.ones((batchsize, 1))), 1)

        return (words, acts, preds.as_numpy_array())
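Note on the example above: subtracting the per-row maximum before exponentiating is the standard numerically stable softmax; it leaves the probabilities unchanged because softmax is invariant to adding a constant per row. A minimal NumPy sketch (illustrative, not from this codebase):

import numpy as np

def stable_softmax(scores):
    # Shifting by the per-row max avoids overflow in exp() and does not
    # change the result, because softmax is invariant to row-wise shifts.
    e = np.exp(scores - scores.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

probs = stable_softmax(np.array([[1000.0, 1001.0], [0.0, 1.0]]))
assert np.allclose(probs.sum(axis=1), 1.0)
assert np.allclose(probs[0], probs[1])         # shifted rows give the same result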
Example #3
 def init_samples(self, num):
     """Generate exact samples from the model assuming the weights are all zero, i.e.
     all units are independent."""
     assert np.allclose(self.weights.as_numpy_array(), 0.)
     vis = rbm_utils.sample_units(gnp.outer(gnp.ones(num), self.vbias))
     hid = rbm_utils.sample_units(gnp.outer(gnp.ones(num), self.hbias))
     return RBMState(vis, hid)
Example #4
def feedforward(theta, data):
    nData = shape(data)[1]
    x = gpu.concatenate((gpu.ones((1,nData)), data), axis = 0)
    hidden_sum = gpu.dot(theta, x)
    relu_mask_hidden = gpu.ones(shape(hidden_sum)) * (hidden_sum>0)
    hidden_activation = hidden_sum*relu_mask_hidden
    return hidden_activation
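Note on the example above: building a mask with gpu.ones(shape) * (hidden_sum > 0) and multiplying it into hidden_sum is an elementwise ReLU. A NumPy sketch of the same trick (illustrative only):

import numpy as np

hidden_sum = np.array([[-1.5, 0.0, 2.0], [3.0, -0.2, 0.7]])
relu_mask_hidden = np.ones(hidden_sum.shape) * (hidden_sum > 0)   # 1 where positive
hidden_activation = hidden_sum * relu_mask_hidden

assert np.allclose(hidden_activation, np.maximum(hidden_sum, 0.0))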
Example #5
def costfunc_gpu_ReLU(x, *args):
    num_input,num_hidden,num_output,inputs,lambda_val,sparsityParam,beta = args
    num_weights1 = (num_input+1)*num_hidden
    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    #weights1 = gpu.garray(reshape(x[0:num_weights1],(num_hidden,num_input+1)))
    weights1 = x[0:num_weights1].reshape((num_hidden,num_input+1))
    #weights2 = gpu.garray(reshape(x[num_weights1:shape(x)[0]], (num_output,num_hidden+1)))
    weights2 = x[num_weights1:shape(x)[0]].reshape((num_output,num_hidden+1))
    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1,nData)), inputs), axis = 0)
    hidden_sum = gpu.dot(weights1, data)
    hidden_activation = gpu.log(1+hidden_sum.exp())
    p_avg = gpu.sum(hidden_activation,axis=1)/nData
    hidden_activation = gpu.concatenate((gpu.ones((1,nData)), hidden_activation), axis = 0)
    output = gpu.dot(weights2, hidden_activation)
    regularized_penalty1 = weights1[:,1:shape(weights1)[1]]
    regularized_penalty2 = weights2[:,1:shape(weights2)[1]]
    regularized_penalty1 = regularized_penalty1 * regularized_penalty1
    regularized_penalty2 = regularized_penalty2 * regularized_penalty2
    output_target_diff = (output - inputs)*(output - inputs)
    KL = gpu.sum(sparsityParam*gpu.log(sparsityParam/p_avg) + (1-sparsityParam)*gpu.log((1-sparsityParam)/(1-p_avg)))
    cost = gpu.sum(output_target_diff)/(2*nData) + 0.5 * lambda_val * (gpu.sum(regularized_penalty1) + gpu.sum(regularized_penalty2)) + beta*KL
    print 'ReLU Linear Decoder Cost: ', cost
    return cost
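Note on the example above: despite the ReLU in its name, this cost uses the softplus activation gpu.log(1 + hidden_sum.exp()), a smooth approximation of ReLU. A small NumPy sketch of how close the two are (illustrative only):

import numpy as np

x = np.linspace(-4.0, 4.0, 9)
softplus = np.log(1.0 + np.exp(x))             # smooth approximation of ReLU
relu = np.maximum(x, 0.0)
print(np.abs(softplus - relu).max())           # largest gap is log(2) at x = 0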
Example #6
def mlpSingleOutput1Layer_costfunc(x, *args):
    inputSize, l1Size, lambda_hidden, inputs, targets = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    inputs = gpu.garray(inputs)
    targets = gpu.garray(targets)
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_output = gpu.garray(reshape(x[num_weights_L1:shape(x)[0]], (1, l1Size+1)))
    inputs = gpu.concatenate((gpu.ones((1,numCases)), inputs), axis = 0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L1), axis = 0)
    #hidden_activation_L1 = hidden_activation_L1 * dropout_prob
    hidden_sum_output = gpu.dot(theta_output, hidden_activation_L1)
    outputs = hidden_sum_output.logistic()
    output_target_diff = (outputs - targets)**2
    regularized_penalty_output = theta_output[:,1:shape(theta_output)[1]]
    regularized_penalty_output = regularized_penalty_output * regularized_penalty_output
    regularized_penalty_L1 = theta_L1[:,1:shape(theta_L1)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    cost = gpu.sum(output_target_diff)/(2*numCases) + 0.5 * lambda_hidden*(gpu.sum(regularized_penalty_L1)+gpu.sum(regularized_penalty_output))
    print 'Multilayer Perceptron Cost:', cost
    del inputs
    del theta_L1
    del hidden_sum_L1
    del hidden_activation_L1
    del regularized_penalty_output
    del regularized_penalty_L1
    gpu.free_reuse_cache()
    return cost
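Note on the example above: concatenating a row of ones on top of the inputs lets the first column of each weight matrix act as a bias term, so a single gpu.dot computes weights plus intercept. A NumPy sketch of the equivalence (illustrative only):

import numpy as np

rng = np.random.default_rng(1)
inputs = rng.standard_normal((4, 10))          # 4 features x 10 cases
theta = rng.standard_normal((3, 4 + 1))        # 3 units; column 0 holds the biases

data = np.concatenate((np.ones((1, inputs.shape[1])), inputs), axis=0)
with_ones_row = np.dot(theta, data)
explicit_bias = np.dot(theta[:, 1:], inputs) + theta[:, :1]

assert np.allclose(with_ones_row, explicit_bias)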
Example #7
    def __init__(self, layer_sizes, scale=0.05, verbose=1, l2=0.0001,
                 momentum=0.9, epochs=20, batch_size=256,dropouts=0.0,
                 learning_rate=0.01, learning_rate_decays=0.9):

        self.layer_sizes = layer_sizes
        self.scale = scale
        self.verbose = 1
        self.l2 = l2
        self.momentum = momentum

        self.epochs = epochs
        self.batch_size = batch_size
        self.dropouts = [dropouts for l in range(len(layer_sizes)-1)]

        self.learning_rate = learning_rate
        self.learning_rate_decays = learning_rate_decays

        shapes = [(layer_sizes[i-1], layer_sizes[i])
                  for i in range(1, len(layer_sizes))]

        self.biases = init_biases_matrix(layer_sizes)
        self.weights = init_weights_matrix(shapes, scale)
        self.rms_limits = [None for i in range(len(self.weights))]

        self.hidden_functions = [self.hidden_function for i in range(len(self.weights) - 1)]

        self.weight_grads_l2_norm = [gnp.ones(weight.shape) for weight in self.weights]
        self.bias_gradis_l2_norm = [gnp.ones(bias.shape) for bias in self.biases]
        self.weight_grads = [gnp.zeros(weight.shape) for weight in self.weights]
        self.bias_grads = [gnp.zeros(bias.shape) for bias in self.biases]
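Note on the example above: the gradient-norm accumulators are initialized with gnp.ones rather than zeros, so the first adaptive-scaling division is well defined. A minimal sketch of an RMSProp-style update that such an accumulator could feed, assuming that usage; the function and names below are illustrative and not taken from this class:

import numpy as np

def rmsprop_like_step(w, grad, norm_acc, lr=0.01, decay=0.9, eps=1e-8):
    # Accumulate a running mean of squared gradients; starting the
    # accumulator at ones keeps the very first division well behaved.
    norm_acc = decay * norm_acc + (1.0 - decay) * grad ** 2
    w = w - lr * grad / (np.sqrt(norm_acc) + eps)
    return w, norm_acc

w = np.zeros((2, 3))
acc = np.ones_like(w)                          # mirrors gnp.ones(weight.shape) above
grad = np.full((2, 3), 0.5)
w, acc = rmsprop_like_step(w, grad, acc)
print(w[0, 0], acc[0, 0])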
Example #8
    def backpropagation(self, train_set_y):
        #        (train_set_x, train_set_y) = train_xy

        # assuming linear output and square error cost function
        observation_error = self.final_layer_output - train_set_y

        self.W_grads = []
        self.b_grads = []
        current_error = observation_error
        current_activation = self.activations[-1]
        current_W_grad = gnp.dot(current_activation.T, observation_error)
        current_b_grad = gnp.dot(gnp.ones((1, observation_error.shape[0])),
                                 observation_error)
        self.W_grads.append(current_W_grad)
        self.b_grads.append(current_b_grad)

        propagate_error = gnp.dot(observation_error, self.W_params[
            self.n_layers].T)  # final layer is linear output, gradient is one
        for i in reversed(range(self.n_layers)):
            current_activation = self.activations[i]
            current_gradient = 1.0 - current_activation**2
            current_W_grad = gnp.dot(current_activation.T, propagate_error)
            current_b_grad = gnp.dot(gnp.ones((1, propagate_error.shape[0])),
                                     propagate_error)
            propagate_error = gnp.dot(propagate_error,
                                      self.W_params[i].T) * current_gradient

            self.W_grads.insert(0, current_W_grad)
            self.b_grads.insert(0, current_b_grad)
Example #9
 def init_samples(self, num):
     """Generate exact samples from the model assuming the weights are all zero, i.e.
     all units are independent."""
     assert np.allclose(self.weights.as_numpy_array(), 0.)
     vis = rbm_utils.sample_units(gnp.outer(gnp.ones(num), self.vbias))
     hid = rbm_utils.sample_units(gnp.outer(gnp.ones(num), self.hbias))
     return RBMState(vis, hid)
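Note on the example above: gnp.outer(gnp.ones(num), bias) simply tiles the bias vector into num identical rows, so every sample row sees the same independent-unit probabilities. A NumPy sketch, assuming sample_units draws Bernoulli samples from logistic probabilities (the usual convention; not taken from rbm_utils):

import numpy as np

num = 5
vbias = np.array([-0.3, 0.8, 0.0])
tiled = np.outer(np.ones(num), vbias)          # num rows, each equal to vbias
assert np.allclose(tiled, np.tile(vbias, (num, 1)))

# With zero weights, each unit is an independent Bernoulli(sigmoid(bias)) draw.
probs = 1.0 / (1.0 + np.exp(-tiled))
samples = (np.random.rand(*tiled.shape) < probs).astype(float)
print(samples)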
Example #10
    def forward(self, X, test=False):
        """
        Feed-forward pass through the model
        X: ('batchsize' x 'context') matrix of word indices
        """
        batchsize = X.shape[0]
        R = self.R
        C = self.C
        bw = self.bw

        # Obtain word features
        tmp = R.as_numpy_array()[:,X.flatten()].flatten(order='F')  # flatten() defaults to row-major (C) order; order='F' flattens in Fortran (column-major) order
        tmp = tmp.reshape((batchsize, self.K * self.context))   # reshape() in row-major order
        words = np.zeros((batchsize, self.K, self.context))
        for i in range(batchsize):
            words[i,:,:] = tmp[i,:].reshape((self.K, self.context), order='F')
        words = gpu.garray(words)

        # Compute the hidden layer (predicted next word representation)
        acts = gpu.zeros((batchsize, self.K))
        for i in range(self.context):
            acts = acts + gpu.dot(words[:,:,i], C[i,:,:]) # for 2-D arrays, dot() is matrix multiplication
        acts = gpu.concatenate((acts, gpu.ones((batchsize, 1))), 1)

        # Compute softmax
        preds = gpu.dot(acts, gpu.concatenate((R, bw)))
        preds = gpu.exp(preds - preds.max(1).reshape(batchsize, 1))
        denom = preds.sum(1).reshape(batchsize, 1)
        preds = gpu.concatenate((preds / denom, gpu.ones((batchsize, 1))), 1)

        return (words, acts, preds.as_numpy_array())
Example #11
def grad_costfunc_gpu_ReLU(x, *args):
    num_input, num_hidden, num_output, inputs, lambda_val, sparsityParam, beta = args
    num_weights1 = (num_input + 1) * num_hidden
    num_weights2 = (num_hidden + 1) * num_output
    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))
    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)
    hidden_sum = gpu.dot(weights1, data)
    #hidden_activation = gpu.log(1+hidden_sum.exp())
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum)) * (hidden_sum > 0)
    hidden_activation = hidden_sum * relu_mask_hidden1
    #hidden_derivative = hidden_sum.logistic()
    hidden_derivative = relu_mask_hidden1
    hidden_activation = gpu.concatenate((gpu.ones(
        (1, nData)), hidden_activation),
                                        axis=0)
    hidden_derivative = gpu.concatenate((gpu.ones(
        (1, nData)), hidden_derivative),
                                        axis=0)
    outputs = gpu.dot(weights2, hidden_activation)
    weights1_grad = gpu.zeros(shape(weights1))
    weights2_grad = gpu.zeros(shape(weights2))
    p = outputs - inputs
    weights2_grad += gpu.dot(
        p, gpu.garray(transpose(hidden_activation.as_numpy_array())))
    q_temp = gpu.dot(gpu.garray(transpose(weights2.as_numpy_array())), p)
    #q = multiply(multiply(q_temp,hidden_activation),(1-hidden_activation))
    q = q_temp * hidden_derivative
    delta2 = gpu.dot(q, gpu.garray(transpose(data.as_numpy_array())))
    weights1_grad += delta2[1:shape(delta2)[0], :]
    weights1_grad = weights1_grad / nData
    weights2_grad = weights2_grad / nData
    weights1_grad[:, 1:shape(weights1_grad)[1]] = weights1_grad[:, 1:shape(
        weights1_grad)[1]] + weights1[:, 1:shape(weights1)[1]] * lambda_val
    weights2_grad[:, 1:shape(weights2_grad)[1]] = weights2_grad[:, 1:shape(
        weights2_grad)[1]] + weights2[:, 1:shape(weights2)[1]] * lambda_val
    #weights1_grad = reshape(weights1_grad, num_weights1)
    weights1_grad = weights1_grad.reshape(num_weights1)
    #weights2_grad = reshape(weights2_grad, num_weights2)
    weights2_grad = weights2_grad.reshape(num_weights2)
    del x
    del inputs
    del data
    del p
    del q_temp
    del q
    del delta2
    del hidden_sum
    del hidden_activation
    del weights1
    del weights2
    gpu.free_reuse_cache()
    return hstack(
        (weights1_grad.as_numpy_array(), weights2_grad.as_numpy_array()))
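Note on the example above: the same ones-based mask serves both the ReLU forward pass and its derivative, since dReLU(x)/dx is 1 for x > 0 and 0 otherwise. A tiny NumPy sketch (illustrative only):

import numpy as np

hidden_sum = np.array([-2.0, -0.1, 0.0, 0.3, 5.0])
relu_mask = np.ones(hidden_sum.shape) * (hidden_sum > 0)

activation = hidden_sum * relu_mask            # forward: ReLU(hidden_sum)
derivative = relu_mask                         # backward: dReLU/dx (0 for x <= 0)
print(activation, derivative)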
Example #12
 def smooth(self, eps=0.001):
     moments = (1. - eps)**2 * self
     moments += eps * (1. - eps) * self.__class__.from_independent(
         0.5 * gnp.ones(moments.expect_vis.size), self.expect_hid)
     moments += eps * (1. - eps) * self.__class__.from_independent(
         self.expect_vis, 0.5 * gnp.ones(moments.expect_hid.size))
     moments += eps**2 * self.__class__.uniform(moments.expect_vis.size,
                                                moments.expect_hid.size)
     return moments
Example #13
def mlpSoftmax1Layer_grad(x, *args):
    numClasses, inputSize, l1Size, lambda_softmax, lambda_hidden, inputs, groundTruth = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_softmax = numClasses * l1Size
    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1],
                                  (l1Size, inputSize + 1)))
    theta_softmax = gpu.garray(
        reshape(x[num_weights_L1:shape(x)[0]], (numClasses, l1Size)))
    theta_L1_grad = gpu.zeros(shape(theta_L1))
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    #hidden_activation_L1 = gpu.log(1+hidden_sum_L1.exp())
    #hidden_derivative_L1 = hidden_sum_L1.logistic()
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum_L1)) * (hidden_sum_L1 > 0)
    hidden_activation_L1 = hidden_sum_L1 * relu_mask_hidden1
    #hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_derivative_L1 = relu_mask_hidden1
    hidden_sum_softmax_imd = gpu.dot(theta_softmax, hidden_activation_L1)
    hidden_sum_softmax = hidden_sum_softmax_imd - hidden_sum_softmax_imd.max(
        axis=0)
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions, axis=0)
    softmax_imd = groundTruth - predictions
    theta_softmax_grad = -1 * gpu.dot(
        softmax_imd,
        gpu.garray(transpose(hidden_activation_L1.as_numpy_array()))
    ) / numCases + lambda_softmax * theta_softmax
    deltaOut = -softmax_imd
    delta_L1_imd = gpu.dot(
        gpu.garray(transpose(theta_softmax.as_numpy_array())), deltaOut)
    delta_L1_imd2 = delta_L1_imd * hidden_derivative_L1
    #delta_L1_imd2 = (delta_L1_imd*hidden_activation_L1)*(1-hidden_activation_L1)
    delta_L1 = gpu.dot(delta_L1_imd2,
                       gpu.garray(transpose(inputs.as_numpy_array())))
    theta_L1_grad += delta_L1
    theta_L1_grad = theta_L1_grad / numCases
    theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] = theta_L1_grad[:, 1:shape(
        theta_L1_grad)[1]] + theta_L1[:, 1:shape(theta_L1)[1]] * lambda_hidden
    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)
    theta_softmax_grad = reshape(theta_softmax_grad.as_numpy_array(),
                                 num_weights_softmax)
    del inputs
    del theta_L1
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_softmax
    del predictions
    del softmax_imd
    del deltaOut
    del delta_L1_imd
    del delta_L1_imd2
    del delta_L1
    gpu.free_reuse_cache()
    return hstack((theta_L1_grad, theta_softmax_grad))
Example #14
def forwardProp(X, theta1, theta2):

    a1 = gpu.concatenate((X, gpu.ones((np.size(X[:, 0]), 1))), axis=1)

    a2 = sigmoid(theta1.dot(a1.T))
    a2 = gpu.concatenate((a2, gpu.ones((1, np.size(a2[0, :])))), axis=0)

    a3 = sigmoid(theta2.dot(a2))

    return a1, a2, a3
Example #15
def mlpSoftmax_costfunc(x, *args):
    numClasses, inputSize, l1Size, l2Size, lambda_softmax, lambda_hidden, inputs, labels, groundTruth = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)
    #x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1],
                                  (l1Size, inputSize + 1)))
    #theta_L1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1))
    #print numClasses, l2Size
    theta_L2 = gpu.garray(
        reshape(x[num_weights_L1:num_weights_L2 + num_weights_L1],
                (l2Size, l1Size + 1)))
    #theta_L2 = x[num_weights_L1:num_weights_L2+num_weights_L1].reshape((l2Size, l1Size + 1))
    theta_softmax = gpu.garray(
        reshape(x[num_weights_L2 + num_weights_L1:shape(x)[0]],
                (numClasses, l2Size)))
    #theta_softmax = x[num_weights_L2+num_weights_L1:shape(x)[0]].reshape((numClasses, l2Size))
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate((gpu.ones(
        (1, numCases)), hidden_activation_L1),
                                           axis=0)
    hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1)
    hidden_activation_L2 = hidden_sum_L2.logistic()
    hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L2)
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0)
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions, axis=0)
    temp = groundTruth * gpu.log(predictions)
    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L2 = theta_L2[:, 1:shape(theta_L2)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2
    cost = -1 * gpu.sum(temp) / numCases + 0.5 * lambda_hidden * (
        gpu.sum(regularized_penalty_L1) + gpu.sum(regularized_penalty_L2)
    ) + 0.5 * lambda_softmax * gpu.sum(theta_softmax * theta_softmax)
    print 'Multilayer Softmax Cost:', cost
    del inputs
    del theta_L1
    del theta_L2
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_L2
    del hidden_activation_L2
    del hidden_sum_softmax
    del predictions
    del temp
    del regularized_penalty_L1
    del regularized_penalty_L2
    gpu.free_reuse_cache()
    return cost
Example #16
def costfunc_gpu(x, *args):
    num_input, num_hidden, num_output, inputs, noNoiseData, lambda_val, sparsityParam, beta = args
    num_weights1 = (num_input + 1) * num_hidden
    x = gpu.garray(x)
    #    randomNoise = random.random_sample(shape(inputs))
    #    criteriaTable = randomNoise > 0.32
    #    inputs = inputs * criteriaTable
    inputs = gpu.garray(inputs)
    noNoiseData = gpu.garray(noNoiseData)
    #weights1 = gpu.garray(reshape(x[0:num_weights1],(num_hidden,num_input+1)))
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    #weights2 = gpu.garray(reshape(x[num_weights1:shape(x)[0]], (num_output,num_hidden+1)))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))
    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)
    hidden_sum = gpu.dot(weights1, data)
    hidden_activation = hidden_sum.logistic()
    p_avg = gpu.sum(hidden_activation, axis=1) / nData
    hidden_activation = gpu.concatenate((gpu.ones(
        (1, nData)), hidden_activation),
                                        axis=0)
    output = gpu.dot(weights2, hidden_activation)
    regularized_penalty1 = weights1[:, 1:shape(weights1)[1]]
    regularized_penalty2 = weights2[:, 1:shape(weights2)[1]]
    regularized_penalty1 = regularized_penalty1 * regularized_penalty1
    regularized_penalty2 = regularized_penalty2 * regularized_penalty2
    output_target_diff = (output - noNoiseData) * (output - noNoiseData)
    KL = gpu.sum(sparsityParam * gpu.log(sparsityParam / p_avg) +
                 (1 - sparsityParam) * gpu.log((1 - sparsityParam) /
                                               (1 - p_avg)))
    cost = gpu.sum(output_target_diff) / (2 * nData) + 0.5 * lambda_val * (
        gpu.sum(regularized_penalty1) +
        gpu.sum(regularized_penalty2)) + beta * KL
    print 'GPU Linear Denoising Decoder Cost: ', cost
    del x
    del inputs
    del noNoiseData
    del data
    del hidden_sum
    del hidden_activation
    del p_avg
    del output
    del regularized_penalty1
    del regularized_penalty2
    del weights1
    del weights2
    del output_target_diff
    gpu.free_reuse_cache()
    return cost
Example #17
def grad_costfunc_gpu(x, *args):
    num_input,num_hidden,num_output,inputs,lambda_val,sparsityParam,beta = args
    num_weights1 = (num_input+1)*num_hidden
    num_weights2 = (num_hidden+1)*num_output
    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights1].reshape((num_hidden,num_input+1))
    weights2 = x[num_weights1:shape(x)[0]].reshape((num_output,num_hidden+1))
    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1,nData)), inputs), axis = 0)
    hidden_sum = gpu.dot(weights1, data)
    hidden_activation = hidden_sum.logistic()
    p_avg = gpu.sum(hidden_activation,axis=1)/nData
    grad_sparse = -1*sparsityParam/p_avg.as_numpy_array() + (1-sparsityParam)/(1-p_avg.as_numpy_array())
    grad_sparse = append(0,grad_sparse)
    grad_sparse = tile(grad_sparse, (nData, 1))
    grad_sparse = gpu.garray(transpose(grad_sparse))
    hidden_activation = gpu.concatenate((gpu.ones((1,nData)), hidden_activation), axis = 0)
    outputs = gpu.dot(weights2, hidden_activation)
    weights1_grad = gpu.zeros(shape(weights1))
    weights2_grad = gpu.zeros(shape(weights2))
    p = outputs-inputs
    weights2_grad += gpu.dot(p, gpu.garray(transpose(hidden_activation.as_numpy_array())))
    q_temp = gpu.dot(gpu.garray(transpose(weights2.as_numpy_array())),p) + beta*grad_sparse
    #q = multiply(multiply(q_temp,hidden_activation),(1-hidden_activation))
    q = (q_temp*hidden_activation)*(1-hidden_activation)
    delta2 = gpu.dot(q, gpu.garray(transpose(data.as_numpy_array())))
    weights1_grad += delta2[1:shape(delta2)[0], :]
    weights1_grad = weights1_grad/nData
    weights2_grad = weights2_grad/nData
    weights1_grad[:,1:shape(weights1_grad)[1]] = weights1_grad[:,1:shape(weights1_grad)[1]] + weights1[:,1:shape(weights1)[1]] * lambda_val
    weights2_grad[:,1:shape(weights2_grad)[1]] = weights2_grad[:,1:shape(weights2_grad)[1]] + weights2[:,1:shape(weights2)[1]] * lambda_val
    #weights1_grad = reshape(weights1_grad, num_weights1)
    weights1_grad = weights1_grad.reshape(num_weights1)
    #weights2_grad = reshape(weights2_grad, num_weights2)
    weights2_grad = weights2_grad.reshape(num_weights2)
    del x
    del inputs
    del data
    del grad_sparse
    del p
    del q_temp
    del q
    del delta2
    del hidden_sum
    del hidden_activation
    del weights1
    del weights2
    gpu.free_reuse_cache()
    return hstack((weights1_grad.as_numpy_array(),weights2_grad.as_numpy_array()))
Example #18
def clip(a, a_min, a_max):
    """Clip (limit) the values in an array.

    Given an interval, values outside the interval are clipped to the interval 
    edges. For example, if an interval of [0, 1] is specified, values smaller 
    than 0 become 0, and values larger than 1 become 1."""
    if isinstance(a, gp.garray):
        max_mask = (a > a_max)
        max_tar = gp.ones(a.shape) * a_max
        min_mask = (a < a_min)
        min_tar = gp.ones(a.shape) * a_min
        a_clipped = a*(1-max_mask-min_mask) + max_tar*max_mask + min_tar*min_mask
        return a_clipped
    else:
        return np.clip(a, a_min, a_max)
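Note on the example above: the mask arithmetic reproduces np.clip using only elementwise operations, which is convenient on garrays. A NumPy sketch checking the equivalence (illustrative only):

import numpy as np

a = np.array([-2.0, 0.3, 0.9, 5.0])
a_min, a_max = 0.0, 1.0

max_mask = (a > a_max).astype(float)
min_mask = (a < a_min).astype(float)
a_clipped = a * (1 - max_mask - min_mask) + a_max * max_mask + a_min * min_mask

assert np.allclose(a_clipped, np.clip(a, a_min, a_max))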
Example #19
 def KL(rho, rho_target, KL_flat):
     y = rho.copy()
     if KL_flat:
         y[gp.where(y < rho_target)] = rho_target * gp.ones(
             y[gp.where(y < rho_target)].shape)
     return rho_target * gp.log(rho_target / y) + (1 - rho_target) * gp.log(
         (1 - rho_target) / (1 - y))
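Note on the example above: this is the KL divergence between Bernoulli distributions with means rho_target and rho, the usual sparsity penalty for autoencoders; the KL_flat branch floors y at rho_target so units already sparser than the target are not penalized. A NumPy sketch of the penalty itself (illustrative only):

import numpy as np

def bernoulli_kl(rho, rho_target):
    # KL(Bernoulli(rho_target) || Bernoulli(rho)), the usual sparsity penalty.
    return (rho_target * np.log(rho_target / rho)
            + (1.0 - rho_target) * np.log((1.0 - rho_target) / (1.0 - rho)))

rho = np.array([0.02, 0.05, 0.4])
print(bernoulli_kl(rho, 0.05))                 # zero at the target, positive elsewhere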
Example #20
    def build_layer(self,
                    in_dim,
                    out_dim,
                    nonlin,
                    dropout=0,
                    sparsity=0,
                    sparsity_weight=0,
                    init_scale=1,
                    loss=None,
                    params=None,
                    loss_after_nonlin=False,
                    init_bias=0,
                    use_batch_normalization=False):
        self.nonlin = nonlin
        self.set_params(params if params is not None else \
                LayerParams(in_dim, out_dim, init_scale, dropout, init_bias=init_bias))

        self.sparsity = sparsity
        self.sparsity_weight = sparsity_weight
        if self.sparsity_weight > 0:
            self._sparsity_current = gnp.ones(out_dim) * sparsity
            self._sparsity_smoothing = 0.9
            self._sparsity_objective = 0

        self.loss = loss
        self.loss_value = 0
        self.noise_added = False
        self.loss_computed = False
        self.loss_after_nonlin = loss_after_nonlin

        self.use_batch_normalization = use_batch_normalization
        if use_batch_normalization:
            self.bn_layer = BatchNormalizationLayer(out_dim,
                                                    init_bias=init_bias)
            self._bn_layer_param_id = self.bn_layer._param_id
Example #21
def dbn_supervised_predict_sample(ws_vh, ws_v, ws_h, x, k=20):
    """
    Predict the class label of input x from supervised DBN
    WARNING: THIS IS PRETTY SLOW AND LESS RELIABLE THAN THE EXACT METHOD
    Uses the sampling method mentioned in section 6.2 of Hinton, Osindero, Teh 2006
    
    x: Input data. (NxD matrix)
    k: Number of Gibbs steps
    """
    L = len(ws_vh)
    N = x.shape[0]

    # make a forward pass to get from input layer to visible layer of top level
    # RBM
    h_prev = x.T

    # forward (bottom-up) pass, (use deterministic (we pass the activations, not
    # the stochastically sampled steps) forward pass)
    for l in range(L - 1):
        ah = gnp.dot(ws_vh[l].T, h_prev) + ws_h[l]
        h_prev = gnp.logistic(ah)

    H = ws_vh[-1].shape[0]  # number of visible units top level RBM
    Hx = h_prev.shape[0]  # number of hidden units in the penultimate layer
    K = H - Hx
    # (H - Hx) is the number of supervised inputs to top level RBM
    # we give random values to the supervised portion of the input
    v = gnp.concatenate((gnp.ones((K, N)) / K, h_prev))
    # we keep the visible units clamped while sampling
    h, v = rbm_sample(ws_vh[-1], ws_v[-1], ws_h[-1], v, k, clamped=(K, H))

    # sample visible units of top level RBM given
    return v[0:K, :].T
Example #22
    def backprop(self):
        self.timer_logger('backprop', time.time())
        self.results['grads'] = []
        self.results['bias_grads'] = []
        if self.problem == 'classification':
            #assumes softmax + cross entropy so that both gradients cancel out to give: error = y-t
            self.results['error'] = self.results['current'] - gpu.garray(
                self.util.create_t_dataset(self.batch_y))
        else:
            #assumes linear unit + squared error cost function so that both gradients cancel out to give: error = y-t
            self.results['error'] = (self.results['current'] -
                                     gpu.garray(self.batch_y))

        for pair in self.results['activations']:
            activation = pair[0]
            weight = pair[1]

            gradient = self.activation_gradient(activation)
            self.results['grads'].insert(
                0, gpu.dot(activation.T, self.results['error']))
            self.results['bias_grads'].insert(
                0,
                gpu.dot(gpu.ones((1, self.results['error'].shape[0])),
                        self.results['error']))
            self.results['error'] = gpu.dot(self.results['error'],
                                            weight.T) * gradient

        self.timer_logger('backprop', time.time())
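Note on the comment in the example above: for softmax outputs with a cross-entropy loss (and likewise linear outputs with squared error), the output-layer delta collapses to y - t. A NumPy finite-difference check of that identity (illustrative only):

import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

rng = np.random.default_rng(2)
z = rng.standard_normal((1, 4))
t = np.array([[0.0, 1.0, 0.0, 0.0]])

y = softmax(z)
analytic = y - t                               # the "error = y - t" shortcut

# finite-difference check of d(cross-entropy)/dz
eps, numeric = 1e-6, np.zeros_like(z)
for j in range(z.shape[1]):
    zp, zm = z.copy(), z.copy()
    zp[0, j] += eps
    zm[0, j] -= eps
    numeric[0, j] = (-np.sum(t * np.log(softmax(zp)))
                     + np.sum(t * np.log(softmax(zm)))) / (2 * eps)

assert np.allclose(analytic, numeric, atol=1e-5)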
Example #23
def clip(a, a_min, a_max):
    """Clip (limit) the values in an array.

    Given an interval, values outside the interval are clipped to the interval
    edges. For example, if an interval of [0, 1] is specified, values smaller
    than 0 become 0, and values larger than 1 become 1."""
    if not isinstance(a, np.ndarray):
        max_mask = (a > a_max)
        max_tar = gp.ones(a.shape) * a_max
        min_mask = (a < a_min)
        min_tar = gp.ones(a.shape) * a_min
        a_clipped = (a * (1 - max_mask - min_mask) + max_tar * max_mask +
                     min_tar * min_mask)
        return a_clipped
    else:
        return np.clip(a, a_min, a_max)
Example #24
    def forward(self, X, Im, test=False):
        """
        Feed-forward pass through the model
        X: ('batchsize' x 'context') matrix of word indices
        """
        batchsize = X.shape[0]
        Im = gpu.garray(Im)
        C = self.C
        M = self.M
        bw = self.bw
        J = self.J
        bj = self.bj
        Wfx = self.Wfx
        Whf = self.Whf
        Wfv = self.Wfv

        # Forwardprop images
        Im = gpu.concatenate((Im, gpu.ones((batchsize, 1))), 1)
        IF = gpu.dot(Im, gpu.concatenate((J, bj)))
        IF = IF * (IF > 0)

        # Obtain word features
        R = gpu.dot(Wfx, Whf)
        tmp = R.as_numpy_array()[:,X.flatten()].flatten(order='F')
        tmp = tmp.reshape((batchsize, self.K * self.context))
        words = np.zeros((batchsize, self.K, self.context))
        for i in range(batchsize):
            words[i,:,:] = tmp[i,:].reshape((self.K, self.context), order='F')
        words = gpu.garray(words)

        # Compute the hidden layer (predicted next word representation)
        acts = gpu.zeros((batchsize, self.K))
        for i in range(self.context):
            acts = acts + gpu.dot(words[:,:,i], C[i,:,:])
        acts = acts + gpu.dot(IF, M)

        # Multiplicative interaction
        F = gpu.dot(acts, Wfx) * gpu.dot(IF, Wfv)
        F = gpu.concatenate((F, gpu.ones((batchsize, 1))), 1)

        # Compute softmax
        preds = gpu.dot(F, gpu.concatenate((Whf, bw)))
        preds = gpu.exp(preds - preds.max(1).reshape(batchsize, 1))
        denom = preds.sum(1).reshape(batchsize, 1)
        preds = gpu.concatenate((preds / denom, gpu.ones((batchsize, 1))), 1)

        return (words, acts, IF, F, preds.as_numpy_array())
Example #25
def mlpSoftmax_costfunc(x, *args):
    numClasses, inputSize, l1Size, l2Size, lambda_softmax, lambda_hidden, inputs, labels, groundTruth = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)
    #x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    #theta_L1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1))
    #print numClasses, l2Size
    theta_L2 = gpu.garray(reshape(x[num_weights_L1:num_weights_L2+num_weights_L1], (l2Size, l1Size + 1)))
    #theta_L2 = x[num_weights_L1:num_weights_L2+num_weights_L1].reshape((l2Size, l1Size + 1))
    theta_softmax = gpu.garray(reshape(x[num_weights_L2+num_weights_L1:shape(x)[0]], (numClasses, l2Size)))
    #theta_softmax = x[num_weights_L2+num_weights_L1:shape(x)[0]].reshape((numClasses, l2Size))
    inputs = gpu.concatenate((gpu.ones((1,numCases)), inputs), axis = 0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L1), axis=0)
    hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1)
    hidden_activation_L2 = hidden_sum_L2.logistic()
    hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L2)
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis = 0)
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions,axis = 0)
    temp = groundTruth*gpu.log(predictions)
    regularized_penalty_L1 = theta_L1[:,1:shape(theta_L1)[1]]
    regularized_penalty_L2 = theta_L2[:,1:shape(theta_L2)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2
    cost = -1*gpu.sum(temp)/numCases + 0.5 * lambda_hidden*(gpu.sum(regularized_penalty_L1) + gpu.sum(regularized_penalty_L2)) + 0.5 * lambda_softmax * gpu.sum(theta_softmax*theta_softmax)
    print 'Multilayer Softmax Cost:', cost
    del inputs
    del theta_L1
    del theta_L2
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_L2
    del hidden_activation_L2
    del hidden_sum_softmax
    del predictions
    del temp
    del regularized_penalty_L1
    del regularized_penalty_L2
    gpu.free_reuse_cache()
    return cost
Example #26
def mlpSingleOutput1Layer_grad(x, *args):
    inputSize, l1Size, lambda_hidden, inputs, targets = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_output = 1 * (l1Size + 1)
    inputs = gpu.garray(inputs)
    targets = gpu.garray(targets)
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1],
                                  (l1Size, inputSize + 1)))
    theta_output = gpu.garray(
        reshape(x[num_weights_L1:shape(x)[0]], (1, l1Size + 1)))
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate((gpu.ones(
        (1, numCases)), hidden_activation_L1),
                                           axis=0)
    #hidden_activation_L1 = hidden_activation_L1 * dropout_prob
    hidden_sum_output = gpu.dot(theta_output, hidden_activation_L1)
    outputs = hidden_sum_output.logistic()
    theta_L1_grad = gpu.zeros(shape(theta_L1))
    theta_output_grad = gpu.zeros(shape(theta_output))
    a = (outputs - targets) * outputs * (1 - outputs)
    theta_output_grad += gpu.dot(
        a, gpu.garray(transpose(hidden_activation_L1.as_numpy_array())))
    b_temp = gpu.dot(gpu.garray(transpose(theta_output.as_numpy_array())), a)
    b = (b_temp * hidden_activation_L1) * (1 - hidden_activation_L1)
    delta2 = gpu.dot(b, gpu.garray(transpose(inputs.as_numpy_array())))
    theta_L1_grad += delta2[1:shape(delta2)[0], :]
    theta_L1_grad = theta_L1_grad / numCases
    theta_output_grad = theta_output_grad / numCases
    theta_output_grad[:, 1:shape(
        theta_output_grad)[1]] = theta_output_grad[:, 1:shape(
            theta_output_grad
        )[1]] + theta_output[:, 1:shape(theta_output)[1]] * lambda_hidden
    theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] = theta_L1_grad[:, 1:shape(
        theta_L1_grad)[1]] + theta_L1[:, 1:shape(theta_L1)[1]] * lambda_hidden
    theta_output_grad = reshape(theta_output_grad.as_numpy_array(),
                                num_weights_output)
    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)
    del inputs
    del theta_L1
    del hidden_sum_L1
    del hidden_activation_L1
    gpu.free_reuse_cache()
    return hstack((theta_L1_grad, theta_output_grad))
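Note on the example above: the backpropagated term (b_temp * a) * (1 - a) uses the logistic derivative s'(x) = s(x)(1 - s(x)). A NumPy sketch verifying that identity numerically (illustrative only):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

x = np.linspace(-3.0, 3.0, 7)
s = sigmoid(x)
analytic = s * (1.0 - s)                       # the s*(1-s) form used above
numeric = (sigmoid(x + 1e-6) - sigmoid(x - 1e-6)) / 2e-6
assert np.allclose(analytic, numeric, atol=1e-6)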
Example #27
    def __init__(self, layer_dim=None, init_bias=0, mean_std_update_rate=0.01):
        if layer_dim is None:
            return

        self.gamma = gnp.ones(layer_dim)
        self.beta = gnp.ones(layer_dim) * init_bias

        # mu and sigma keep a moving average of mean and standard deviation
        self.mu = None
        self.sigma = None
        self.mean_std_update_rate  = mean_std_update_rate

        self.gamma_grad = gnp.zeros(layer_dim)
        self.beta_grad = gnp.zeros(layer_dim)

        self.param_size = self.gamma.size + self.beta.size
        
        self._param_id = LayerParams._param_count
        LayerParams._param_count += 1
Example #28
    def __init__(self, layer_dim=None, init_bias=0, mean_std_update_rate=0.01):
        if layer_dim is None:
            return

        self.gamma = gnp.ones(layer_dim)
        self.beta = gnp.ones(layer_dim) * init_bias

        # mu and sigma keep a moving average of mean and standard deviation
        self.mu = None
        self.sigma = None
        self.mean_std_update_rate = mean_std_update_rate

        self.gamma_grad = gnp.zeros(layer_dim)
        self.beta_grad = gnp.zeros(layer_dim)

        self.param_size = self.gamma.size + self.beta.size

        self._param_id = LayerParams._param_count
        LayerParams._param_count += 1
Example #29
def costfunc_gpu_ReLU(x, *args):
    num_input, num_hidden, num_output, inputs, lambda_val, sparsityParam, beta = args
    num_weights1 = (num_input + 1) * num_hidden
    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    #weights1 = gpu.garray(reshape(x[0:num_weights1],(num_hidden,num_input+1)))
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    #weights2 = gpu.garray(reshape(x[num_weights1:shape(x)[0]], (num_output,num_hidden+1)))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))
    nData = shape(inputs)[1]
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)
    hidden_sum = gpu.dot(weights1, data)
    #hidden_activation = gpu.log(1+hidden_sum.exp())
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum)) * (hidden_sum > 0)
    hidden_activation = hidden_sum * relu_mask_hidden1
    hidden_activation = gpu.concatenate((gpu.ones(
        (1, nData)), hidden_activation),
                                        axis=0)
    output = gpu.dot(weights2, hidden_activation)
    regularized_penalty1 = weights1[:, 1:shape(weights1)[1]]
    regularized_penalty2 = weights2[:, 1:shape(weights2)[1]]
    regularized_penalty1 = regularized_penalty1 * regularized_penalty1
    regularized_penalty2 = regularized_penalty2 * regularized_penalty2
    output_target_diff = (output - inputs) * (output - inputs)
    cost = gpu.sum(output_target_diff) / (2 * nData) + 0.5 * lambda_val * (
        gpu.sum(regularized_penalty1) + gpu.sum(regularized_penalty2))
    print 'GPU ReLU Linear Decoder Cost: ', cost
    del x
    del inputs
    del data
    del hidden_sum
    del hidden_activation
    del output
    del regularized_penalty1
    del regularized_penalty2
    del weights1
    del weights2
    del output_target_diff
    gpu.free_reuse_cache()
    return cost
Example #30
def mlpSoftmax1Layer_costfunc(x, *args):
    numClasses, inputSize, l1Size, lambda_softmax, lambda_hidden, inputs, groundTruth = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1],
                                  (l1Size, inputSize + 1)))
    theta_softmax = gpu.garray(
        reshape(x[num_weights_L1:shape(x)[0]], (numClasses, l1Size)))
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    #hidden_activation_L1 = gpu.log(1+hidden_sum_L1.exp())
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum_L1)) * (hidden_sum_L1 > 0)
    hidden_activation_L1 = hidden_sum_L1 * relu_mask_hidden1
    #hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L1)
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0)
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions, axis=0)
    temp = groundTruth * gpu.log(predictions)
    temp = temp.as_numpy_array()
    temp[temp == -inf] = -200.0
    temp = nan_to_num(temp)
    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    cost = -1 * sum(temp) / numCases + 0.5 * lambda_hidden * (
        gpu.sum(regularized_penalty_L1)) + 0.5 * lambda_softmax * gpu.sum(
            theta_softmax * theta_softmax)
    print 'Multilayer Softmax Cost:', cost
    del inputs
    del theta_L1
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_softmax
    del predictions
    del temp
    del regularized_penalty_L1
    gpu.free_reuse_cache()
    return cost
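Note on the example above: when a predicted probability underflows to 0, groundTruth * log(predictions) produces -inf (for the true class) or nan (for the other classes); the example caps the former at -200 and zeroes the latter with nan_to_num. A NumPy sketch of that guard (illustrative only):

import numpy as np

ground_truth = np.array([1.0, 0.0, 0.0])
predictions = np.array([0.0, 0.0, 0.5])        # first probability underflowed to 0
with np.errstate(divide='ignore', invalid='ignore'):
    temp = ground_truth * np.log(predictions)  # -inf where t=1, p=0; nan where t=0, p=0
temp[np.isneginf(temp)] = -200.0               # cap -inf, as the example does
temp = np.nan_to_num(temp)                     # replace nan with 0
print(temp)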
Example #31
    def backward(self, Y, preds, F, IF, acts, words, X, Im):
        """
        Backward pass through the network
        """
        batchsize = preds.shape[0]
        Im = gpu.garray(Im)

        # Compute part of df/dR
        Ix = gpu.garray(preds[:,:-1] - Y) / batchsize
        delta = gpu.dot(F.T, Ix)
        dWhf = delta[:-1,:] + self.gamma_r * self.Whf
        db = delta[-1,:]

        # Compute df/Wfv and part of df/Wfx
        Ix = gpu.dot(Ix, self.Whf.T)
        dWfv = gpu.dot(IF.T, Ix * gpu.dot(acts, self.Wfx)) + self.gamma_r * self.Wfv
        dWfx = gpu.dot(acts.T, Ix * gpu.dot(IF, self.Wfv)) + self.gamma_r * self.Wfx
        
        # Compute df/dC and word inputs for df/dR
        Ix_word = gpu.dot(Ix * gpu.dot(IF, self.Wfv), self.Wfx.T)
        dC = gpu.zeros(np.shape(self.C))
        dR = np.zeros((self.K, self.V))
        for i in range(self.context):
            delta = gpu.dot(words[:,:,i].T, Ix_word)
            dC[i,:,:] = delta + self.gamma_c * self.C[i,:,:]
            delta = gpu.dot(Ix_word, self.C[i,:,:].T)
            delta = delta.as_numpy_array()
            for j in range(X.shape[0]):
                dR[:,X[j,i]] = dR[:,X[j,i]] + delta.T[:,j]

        dR = gpu.garray(dR)
        dWfx = dWfx + gpu.dot(dR, self.Whf.T)
        dWhf = dWhf + gpu.dot(self.Wfx.T, dR)

        # Compute df/dM
        dM = gpu.dot(IF.T, Ix_word) + self.gamma_c * self.M

        # Compute df/dJ
        Ix = gpu.dot(Ix * gpu.dot(acts, self.Wfx), self.Wfv.T) * (IF > 0) + gpu.dot(Ix_word, self.M.T) * (IF > 0)
        Im = gpu.concatenate((Im, gpu.ones((batchsize, 1))), 1)
        delta = gpu.dot(Im.T, Ix)
        dJ = delta[:-1,:] + self.gamma_c * self.J
        dBj = delta[-1,:]

        self.db = db
        self.dC = dC
        self.dM = dM
        self.dJ = dJ
        self.dBj = dBj
        self.dWhf = dWhf
        self.dWfv = dWfv
        self.dWfx = dWfx
Example #32
    def __init__(self, in_dim=1, out_dim=1, init_scale=1e-1, dropout=0, init_bias=0):
        self.W = gnp.randn(in_dim, out_dim) * init_scale
        self.b = gnp.ones(out_dim) * init_bias

        self.W_grad = self.W * 0
        self.b_grad = self.b * 0

        self.param_size = self.W.size + self.b.size
        self.dropout = dropout

        # get an ID for this param variable.
        self._param_id = LayerParams._param_count
        LayerParams._param_count += 1
Example #33
    def __init__(self, in_dim=[1], out_dim=1, init_scale=1.0, dropout=[0], init_bias=0):
        self.n_inputs = len(in_dim)
        self.W = [gnp.randn(in_dim[i], out_dim) * math.sqrt(float(init_scale) / in_dim[i]) for i in xrange(self.n_inputs)]
        self.b = gnp.ones(out_dim) * init_bias

        self.W_grad = [self.W[i] * 0 for i in xrange(self.n_inputs)]
        self.b_grad = self.b * 0

        self.param_size = sum([W.size for W in self.W]) + self.b.size
        self.dropout = dropout if len(dropout) == self.n_inputs else dropout[:1] * self.n_inputs

        # get an ID for this param variable.
        self._param_id = LayerParams._param_count
        LayerParams._param_count += 1
Example #34
    def __init__(self, layer_dim=None):
        if layer_dim is None:
            return

        self.gamma = gnp.ones(layer_dim)
        self.beta = gnp.zeros(layer_dim)

        self.gamma_grad = gnp.zeros(layer_dim)
        self.beta_grad = gnp.zeros(layer_dim)

        self.param_size = self.gamma.size + self.beta.size
        
        self._param_id = LayerParams._param_count
        LayerParams._param_count += 1
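Note on the example above: with gamma initialized to ones and beta to zeros, a batch-normalization layer initially just standardizes each feature. A minimal NumPy sketch of the forward pass under that assumption (illustrative; not this project's BatchNormalizationLayer):

import numpy as np

def batch_norm_forward(x, gamma, beta, eps=1e-5):
    # Standardize each feature over the batch, then scale and shift.
    x_hat = (x - x.mean(axis=0)) / np.sqrt(x.var(axis=0) + eps)
    return gamma * x_hat + beta

x = np.random.default_rng(3).standard_normal((32, 4))
gamma, beta = np.ones(4), np.zeros(4)          # the initial values used above
out = batch_norm_forward(x, gamma, beta)
print(out.mean(axis=0).round(6), out.std(axis=0).round(3))   # ~0 and ~1 per feature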
Example #35
def mlpSingleOutput1Layer_costfunc(x, *args):
    inputSize, l1Size, lambda_hidden, inputs, targets = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    inputs = gpu.garray(inputs)
    targets = gpu.garray(targets)
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1],
                                  (l1Size, inputSize + 1)))
    theta_output = gpu.garray(
        reshape(x[num_weights_L1:shape(x)[0]], (1, l1Size + 1)))
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate((gpu.ones(
        (1, numCases)), hidden_activation_L1),
                                           axis=0)
    #hidden_activation_L1 = hidden_activation_L1 * dropout_prob
    hidden_sum_output = gpu.dot(theta_output, hidden_activation_L1)
    outputs = hidden_sum_output.logistic()
    output_target_diff = (outputs - targets)**2
    regularized_penalty_output = theta_output[:, 1:shape(theta_output)[1]]
    regularized_penalty_output = regularized_penalty_output * regularized_penalty_output
    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    cost = gpu.sum(output_target_diff) / (
        2 *
        numCases) + 0.5 * lambda_hidden * (gpu.sum(regularized_penalty_L1) +
                                           gpu.sum(regularized_penalty_output))
    print 'Multilayer Perceptron Cost:', cost
    del inputs
    del theta_L1
    del hidden_sum_L1
    del hidden_activation_L1
    del regularized_penalty_output
    del regularized_penalty_L1
    gpu.free_reuse_cache()
    return cost
Example #36
def mlpSingleOutput1Layer_grad(x, *args):
    inputSize, l1Size, lambda_hidden, inputs, targets = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_output = 1 * (l1Size+1)
    inputs = gpu.garray(inputs)
    targets = gpu.garray(targets)
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    theta_output = gpu.garray(reshape(x[num_weights_L1:shape(x)[0]], (1, l1Size+1)))
    inputs = gpu.concatenate((gpu.ones((1,numCases)), inputs), axis = 0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    hidden_activation_L1 = hidden_sum_L1.logistic()
    hidden_activation_L1 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L1), axis = 0)
    #hidden_activation_L1 = hidden_activation_L1 * dropout_prob
    hidden_sum_output = gpu.dot(theta_output, hidden_activation_L1)
    outputs = hidden_sum_output.logistic()
    theta_L1_grad = gpu.zeros(shape(theta_L1))
    theta_output_grad = gpu.zeros(shape(theta_output))
    a = (outputs - targets) * outputs * (1-outputs)
    theta_output_grad += gpu.dot(a, gpu.garray(transpose(hidden_activation_L1.as_numpy_array())))
    b_temp = gpu.dot(gpu.garray(transpose(theta_output.as_numpy_array())),a)
    b = (b_temp*hidden_activation_L1)*(1-hidden_activation_L1)
    delta2 = gpu.dot(b, gpu.garray(transpose(inputs.as_numpy_array())))
    theta_L1_grad += delta2[1:shape(delta2)[0], :]
    theta_L1_grad = theta_L1_grad/numCases
    theta_output_grad = theta_output_grad/numCases
    theta_output_grad[:,1:shape(theta_output_grad)[1]] = theta_output_grad[:,1:shape(theta_output_grad)[1]] + theta_output[:,1:shape(theta_output)[1]] * lambda_hidden
    theta_L1_grad[:,1:shape(theta_L1_grad)[1]] = theta_L1_grad[:,1:shape(theta_L1_grad)[1]] + theta_L1[:,1:shape(theta_L1)[1]] * lambda_hidden
    theta_output_grad = reshape(theta_output_grad.as_numpy_array(), num_weights_output)
    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)
    del inputs
    del theta_L1
    del hidden_sum_L1
    del hidden_activation_L1
    gpu.free_reuse_cache()
    return hstack((theta_L1_grad,theta_output_grad))
Example #37
def test_gnumpy(dat, num_epochs):
    import gnumpy as gpu
    import numpy
    import time
    # load data. <dat> is 2 dimensional: 60000 X 784
    #dat = gpu.garray(load('mnist_cudaTest').T/255.)
    # training parameters
    epsilon = 0.1
    momentum = 0.9
    batch_size = 128
    num_batches = dat.shape[0] / batch_size
    # model parameters
    num_vis = dat.shape[1]
    num_hid = 4096
    # initialize weights
    w_vh = 0.1 * gpu.randn(num_vis, num_hid)
    w_v = gpu.zeros(num_vis)
    w_h = -4. * gpu.ones(num_hid)
    # initialize weight updates
    wu_vh = gpu.zeros((num_vis, num_hid))
    wu_v = gpu.zeros(num_vis)
    wu_h = gpu.zeros(num_hid)
    for epoch in range(num_epochs):
        err = []
        tic = time.clock()
        for batch in range(num_batches):
            # positive phase
            v1 = dat[batch * batch_size:(batch + 1) * batch_size]
            h1 = (gpu.dot(v1, w_vh) + w_h).logistic()
            # sample hiddens
            hSampled = h1.rand() < h1
            # negative phase
            v2 = (gpu.dot(hSampled, w_vh.T) + w_v).logistic()
            h2 = (gpu.dot(v2, w_vh) + w_h).logistic()
            # update weights
            wu_vh = wu_vh * momentum + gpu.dot(v1.T, h1) - gpu.dot(v2.T, h2)
            wu_v = wu_v * momentum + v1.sum(0) - v2.sum(0)
            wu_h = wu_h * momentum + h1.sum(0) - h2.sum(0)

            w_vh += wu_vh * (epsilon / batch_size)
            w_v += wu_v * (epsilon / batch_size)
            w_h += wu_h * (epsilon / batch_size)
            # calculate reconstruction error
            err.append((v2 - v1).euclid_norm()**2 / (num_vis * batch_size))
        toc = time.clock()
        print "Mean squared error: %.4f, takes time: %d" % (numpy.mean(err),
                                                            toc - tic)
    return w_vh, w_v, w_h
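Note on the example above: hSampled = h1.rand() < h1 draws Bernoulli samples from the hidden probabilities, since a uniform random number falls below p with probability p. A one-line NumPy sketch (illustrative only):

import numpy as np

h1 = np.array([[0.1, 0.9, 0.5]])                            # hidden probabilities
h_sampled = (np.random.rand(*h1.shape) < h1).astype(float)  # Bernoulli(h1) draws
print(h_sampled)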
Example #38
def test_gnumpy(dat, num_epochs):
  import gnumpy as gpu
  import numpy 
  import time
  # load data. <dat> is 2 dimensional: 60000 X 784
  #dat = gpu.garray(load('mnist_cudaTest').T/255.) 
  # training parameters
  epsilon = 0.1
  momentum = 0.9
  batch_size = 128
  num_batches = dat.shape[0]/batch_size
  # model parameters
  num_vis = dat.shape[1]
  num_hid = 4096
  # initialize weights
  w_vh = 0.1 * gpu.randn(num_vis, num_hid)
  w_v = gpu.zeros(num_vis)
  w_h = -4. * gpu.ones(num_hid)
  # initialize weight updates
  wu_vh = gpu.zeros((num_vis, num_hid))
  wu_v = gpu.zeros(num_vis)
  wu_h = gpu.zeros(num_hid)
  for epoch in range(num_epochs):
    err = []
    tic = time.clock()
    for batch in range(num_batches):
      # positive phase
      v1 = dat[batch*batch_size : (batch + 1)*batch_size]
      h1 = (gpu.dot(v1, w_vh) + w_h).logistic()
      # sample hiddens
      hSampled = h1.rand() < h1
      # negative phase
      v2 = (gpu.dot(hSampled, w_vh.T) + w_v).logistic()
      h2 = (gpu.dot(v2, w_vh) + w_h).logistic()
      # update weights
      wu_vh = wu_vh * momentum + gpu.dot(v1.T, h1) - gpu.dot(v2.T, h2)
      wu_v = wu_v * momentum + v1.sum(0) - v2.sum(0)
      wu_h = wu_h * momentum + h1.sum(0) - h2.sum(0)
      
      w_vh += wu_vh * (epsilon/batch_size)
      w_v += wu_v * (epsilon/batch_size)
      w_h += wu_h * (epsilon/batch_size)
      # calculate reconstruction error
      err.append((v2-v1).euclid_norm()**2/(num_vis*batch_size))
    toc = time.clock()
    print "Mean squared error: %.4f, takes time: %d" % (numpy.mean(err), toc-tic)
  return w_vh, w_v, w_h
Example #39
def fine_tuning_cost_gpu(x, *args):
    inputSize, l1Size, l2Size, l3Size, l4Size, l5Size, lambda_val, inputs = args
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)
    num_weights_L3 = l3Size * (l2Size + 1)
    num_weights_L4 = l4Size * (l3Size + 1)
    num_weights_L5 = l5Size * (l4Size + 1)
    #num_weights_L6 = inputSize * (l5Size + 1)
    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    #weights1 = reshape(x[0:num_weights_L1], (l1Size, inputSize + 1))
    weights1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1))
    #weights2 = reshape(x[num_weights_L1:num_weights_L1+num_weights_L2], (l2Size, l1Size + 1))
    weights2 = x[num_weights_L1:num_weights_L1+num_weights_L2].reshape((l2Size, l1Size + 1))
    #weights3 = reshape(x[num_weights_L1+num_weights_L2:num_weights_L1+num_weights_L2+num_weights_L3], (l3Size, l2Size + 1))
    weights3 = x[num_weights_L1+num_weights_L2:num_weights_L1+num_weights_L2+num_weights_L3].reshape((l3Size, l2Size + 1))
    #weights4 = reshape(x[num_weights_L1+num_weights_L2+num_weights_L3:num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4], (l4Size, l3Size + 1))
    weights4 = x[num_weights_L1+num_weights_L2+num_weights_L3:num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4].reshape((l4Size, l3Size + 1))
    #weights5 = reshape(x[num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4:num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4+num_weights_L5], (l5Size, l4Size + 1))
    weights5 = x[num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4:num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4+num_weights_L5].reshape((l5Size, l4Size + 1))
    #weights6 = reshape(x[num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4+num_weights_L5:shape(x)[0]], (inputSize, l5Size+1))
    weights6 = x[num_weights_L1+num_weights_L2+num_weights_L3+num_weights_L4+num_weights_L5:shape(x)[0]].reshape((inputSize, l5Size+1))
    nData = shape(inputs)[1]
    x = gpu.concatenate((gpu.ones((1,nData)), inputs), axis = 0)
    hidden1_sum = gpu.dot(weights1, x)
    hidden1_activation = hidden1_sum.logistic()
    hidden1_activation = gpu.concatenate((gpu.ones((1,nData)), hidden1_activation), axis = 0)
    hidden2_sum = gpu.dot(weights2, hidden1_activation)
    hidden2_activation = hidden2_sum.logistic()
    hidden2_activation = gpu.concatenate((gpu.ones((1,nData)), hidden2_activation), axis = 0)
    hidden3_sum = gpu.dot(weights3, hidden2_activation)
    hidden3_activation = hidden3_sum.logistic()
    hidden3_activation = gpu.concatenate((gpu.ones((1,nData)), hidden3_activation), axis = 0)
    hidden4_sum = gpu.dot(weights4, hidden3_activation)
    hidden4_activation = hidden4_sum.logistic()
    hidden4_activation = gpu.concatenate((gpu.ones((1,nData)), hidden4_activation), axis = 0)
    hidden5_sum = gpu.dot(weights5, hidden4_activation)
    hidden5_activation = hidden5_sum.logistic()
    hidden5_activation = gpu.concatenate((gpu.ones((1,nData)), hidden5_activation), axis = 0)
    output_sum = gpu.dot(weights6, hidden5_activation)
    outputs = output_sum.logistic()
    regularized_penalty4 = weights4[:,1:shape(weights4)[1]]
    regularized_penalty5 = weights5[:,1:shape(weights5)[1]]
    regularized_penalty6 = weights6[:,1:shape(weights6)[1]]
    regularized_penalty4 = regularized_penalty4 ** 2
    regularized_penalty5 = regularized_penalty5 ** 2
    regularized_penalty6 = regularized_penalty6 ** 2
    output_target_diff = (outputs - inputs)**2
    cost = gpu.sum(output_target_diff)/(2*nData) + 0.5 * lambda_val * (gpu.sum(regularized_penalty4) + gpu.sum(regularized_penalty5) + gpu.sum(regularized_penalty6))
    print 'Fine Tuning Cost: ', cost
    return cost
Example #40
    def __init__(self,
                 in_dim=1,
                 out_dim=1,
                 init_scale=1.0,
                 dropout=0,
                 init_bias=0):
        self.W = gnp.randn(in_dim, out_dim) * math.sqrt(
            float(init_scale) / in_dim)
        self.b = gnp.ones(out_dim) * init_bias

        self.W_grad = self.W * 0
        self.b_grad = self.b * 0

        self.param_size = self.W.size + self.b.size
        self.dropout = dropout

        # get an ID for this param variable.
        self._param_id = LayerParams._param_count
        LayerParams._param_count += 1
Example #41
    def backward(self, Y, preds, IF, acts, words, X, Im):
        """
        Backward pass through the network
        """
        batchsize = preds.shape[0]
        Im = gpu.garray(Im)

        # Compute part of df/dR
        Ix = gpu.garray(preds[:,:-1] - Y) / batchsize
        delta = gpu.dot(acts.T, Ix)
        dR = delta[:-1,:] + self.gamma_r * self.R
        db = delta[-1,:]
        dR = dR.as_numpy_array()

        # Compute df/dC and word inputs for df/dR
        Ix = gpu.dot(Ix, self.R.T)
        dC = gpu.zeros(np.shape(self.C))
        for i in range(self.context):
            delta = gpu.dot(words[:,:,i].T, Ix)
            dC[i,:,:] = delta + self.gamma_c * self.C[i,:,:]
            delta = gpu.dot(Ix, self.C[i,:,:].T)
            delta = delta.as_numpy_array()
            for j in range(X.shape[0]):
                dR[:,X[j,i]] = dR[:,X[j,i]] + delta.T[:,j]

        # Compute df/dM
        dM = gpu.dot(IF.T, Ix) + self.gamma_c * self.M

        # Compute df/dJ
        Ix = gpu.dot(Ix, self.M.T) * (IF > 0)
        Im = gpu.concatenate((Im, gpu.ones((batchsize, 1))), 1)
        delta = gpu.dot(Im.T, Ix)
        dJ = delta[:-1,:] + self.gamma_c * self.J
        dBj = delta[-1,:]

        self.dR = gpu.garray(dR)
        self.dM = dM
        self.db = db
        self.dC = dC
        self.dJ = dJ
        self.dBj = dBj
Example #42
0
 def backprop(self):            
     self.timer_logger('backprop', time.time())   
     self.results['grads'] = []
     self.results['bias_grads'] = []   
     if self.problem == 'classification':   
         #assumes softmax + cross entropy so that both gradients cancel out to give: error = y-t   
         self.results['error'] = self.results['current'] - gpu.garray(self.util.create_t_dataset(self.batch_y))     
     else:
         #assumes linear unit + squared error cost function so that both gradients cancel out to give: error = y-t  
         self.results['error'] = (self.results['current'] - gpu.garray(self.batch_y)) 
         
     for pair in self.results['activations']:
         activation = pair[0]
         weight = pair[1] 
         
         gradient = self.activation_gradient(activation)             
         self.results['grads'].insert(0,gpu.dot(activation.T,self.results['error']))     
         self.results['bias_grads'].insert(0,gpu.dot(gpu.ones((1,self.results['error'].shape[0])),self.results['error']))         
         self.results['error'] = gpu.dot(self.results['error'],weight.T)*gradient
         
     self.timer_logger('backprop', time.time())   
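
A quick sanity check of the "error = y - t" shortcut described in the comments above: for softmax outputs with a cross-entropy cost, the gradient of the loss with respect to the pre-softmax activations reduces to y - t. A minimal NumPy-only sketch (hypothetical small arrays, not from the original project) comparing the analytic gradient against finite differences:

import numpy as np

def softmax(z):
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

z = np.random.randn(4, 3)                      # hypothetical logits, batch of 4
t = np.eye(3)[np.random.randint(0, 3, 4)]      # one-hot targets

def loss(z):
    return -np.sum(t * np.log(softmax(z)))

analytic = softmax(z) - t                      # the y - t shortcut
numeric = np.zeros_like(z)
eps = 1e-6
for idx in np.ndindex(*z.shape):
    zp = z.copy(); zp[idx] += eps
    zm = z.copy(); zm[idx] -= eps
    numeric[idx] = (loss(zp) - loss(zm)) / (2 * eps)

assert np.allclose(analytic, numeric, atol=1e-5)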
Example #43
0
 def feedforward(self, X, return_on_gpu=False):
     """Perform feedforward through this layer.
     """
     # Cleanup debris from any previous feedforward
     self._cleanup()
     # Record (a pointer to) the passed input
     self.X = gp.garray(X)
     # Generate and apply a dropout mask to the input
     if (self.drop_rate > 1e-4):
         drop_mask = self.drop_scale * \
                 (gp.rand((self.X.shape[0], self.X.shape[1])) > self.drop_rate)
     else:
         drop_mask = gp.ones((self.X.shape[0], self.X.shape[1]))
     self.dYdX = drop_mask
     if (self.fuzz_scale > 1e-4):
         fuzz_bump = (self.fuzz_scale / self.drop_scale) * \
                 gp.randn((self.X.shape[0], self.X.shape[1]))
         self.Y = drop_mask * (self.X + fuzz_bump)
     else:
         self.Y = drop_mask * self.X
     if not return_on_gpu:
         self.Y = gp.as_numpy_array(self.Y)
     return self.Y
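
The drop_mask above scales the surviving inputs by drop_scale so that the expected activation stays unchanged between training and test time (inverted dropout). A minimal NumPy sketch of the same masking idea, assuming drop_scale = 1 / (1 - drop_rate), a convention not shown in this snippet:

import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((256, 64))
drop_rate = 0.2
drop_scale = 1.0 / (1.0 - drop_rate)                    # assumed convention

mask = drop_scale * (rng.random(X.shape) > drop_rate)   # inverted-dropout mask
Y = mask * X                                            # training-time output

# The rescaling keeps the expected mask value at 1, so E[Y] == E[X]
print(abs(mask.mean() - 1.0) < 0.05)                    # True (up to sampling noise)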
Example #44
0
def l1svm_x(z, targets, predict=False, error=False, addon=0):
    """
    l1-SVM (hinge) loss for mutually exclusive (one-of-K) classes.
    `addon` is an extra term (e.g. a weight penalty) added to the returned loss.
    Note: the internal _targets are coded as (1, -1), while `targets`
    holds the integer class label of each sample.
    """
    if predict:
        # argmax(z)
        return gpu.argmax(z, axis=1)

    n, m = z.shape
    _targets = -1 * gpu.ones((n, m))
    _targets[np.arange(n), targets] += 2
    _value = (1 - z * _targets)
    indicator = _value > 0
    maximum = indicator * _value
    xhl = gpu.sum(maximum)
    if error:
        err = -_targets * indicator
        return xhl + addon, err
    else:
        return xhl + addon
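
A minimal usage sketch with plain NumPy arrays standing in for the gnumpy garrays (hypothetical scores and labels), showing how the (1, -1) target matrix and the hinge terms are formed:

import numpy as np

z = np.array([[ 2.0, -1.0,  0.5],       # hypothetical class scores, 2 samples
              [-0.5,  1.5, -2.0]])
targets = np.array([0, 1])              # integer class labels

n, m = z.shape
_targets = -np.ones((n, m))
_targets[np.arange(n), targets] = 1     # +1 for the true class, -1 elsewhere
value = 1 - z * _targets
hinge = np.maximum(0, value)            # per-class hinge terms
print(hinge.sum())                      # the quantity l1svm_x returns as xhl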
Example #45
0
def l2svm_x(z, targets, predict=False, error=False, addon=0):
    """
    l2-SVM (squared hinge) loss for mutually exclusive (one-of-K) classes.
    `addon` is an extra term (e.g. a weight penalty) added to the returned loss.
    Note: the internal _targets are coded as (1, -1), while `targets`
    holds the integer class label of each sample.
    """
    if predict:
        # argmax(z)
        return gpu.argmax(z, axis=1)

    n, m = z.shape
    # _targets (1, -1)
    _targets = -1 * gpu.ones((n, m))
    # targets only has one label for one data
    _targets[np.arange(n), targets] += 2
    _value = (1 - z * _targets)
    maximum = (_value > 0) * _value
    xhl = gpu.sum(maximum**2)
    if error:
        err = -2 * _targets * maximum
        return xhl + addon, err
    else:
        return xhl + addon
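
The only difference from l1svm_x is the squared hinge, whose gradient with respect to z is -2 * _targets * max(0, 1 - z * _targets), i.e. exactly the err term above. A short finite-difference check under the same NumPy stand-in assumptions:

import numpy as np

rng = np.random.default_rng(1)
z = rng.standard_normal((3, 4))         # hypothetical scores
targets = np.array([0, 2, 3])

def l2_hinge(z):
    _t = -np.ones(z.shape)
    _t[np.arange(z.shape[0]), targets] = 1
    return np.sum(np.maximum(0, 1 - z * _t) ** 2)

_t = -np.ones(z.shape)
_t[np.arange(z.shape[0]), targets] = 1
err = -2 * _t * np.maximum(0, 1 - z * _t)   # analytic gradient (the err above)

eps = 1e-6
num = np.zeros_like(z)
for idx in np.ndindex(*z.shape):
    zp = z.copy(); zp[idx] += eps
    zm = z.copy(); zm[idx] -= eps
    num[idx] = (l2_hinge(zp) - l2_hinge(zm)) / (2 * eps)

print(np.allclose(err, num, atol=1e-5))     # True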
Example #46
0
    def __init__(self,
                 in_dim=[1],
                 out_dim=1,
                 init_scale=1.0,
                 dropout=[0],
                 init_bias=0):
        self.n_inputs = len(in_dim)
        self.W = [
            gnp.randn(in_dim[i], out_dim) *
            math.sqrt(float(init_scale) / in_dim[i])
            for i in xrange(self.n_inputs)
        ]
        self.b = gnp.ones(out_dim) * init_bias

        self.W_grad = [self.W[i] * 0 for i in xrange(self.n_inputs)]
        self.b_grad = self.b * 0

        self.param_size = sum([W.size for W in self.W]) + self.b.size
        self.dropout = dropout if len(
            dropout) == self.n_inputs else dropout[:1] * self.n_inputs

        # get an ID for this param variable.
        self._param_id = LayerParams._param_count
        LayerParams._param_count += 1
Example #47
0
    def build_layer(self, in_dim, out_dim, nonlin, dropout=0, sparsity=0, sparsity_weight=0,
            init_scale=1e-1, loss=None, params=None, loss_after_nonlin=False, init_bias=0,
            use_batch_normalization=False):
        self.nonlin = nonlin
        self.set_params(params if params is not None else \
                LayerParams(in_dim, out_dim, init_scale, dropout, init_bias=init_bias))

        self.sparsity = sparsity
        self.sparsity_weight = sparsity_weight
        if self.sparsity_weight > 0:
            self._sparsity_current = gnp.ones(out_dim) * sparsity
            self._sparsity_smoothing = 0.9
            self._sparsity_objective = 0

        self.loss = loss
        self.loss_value = 0
        self.noise_added = False
        self.loss_computed = False
        self.loss_after_nonlin = loss_after_nonlin

        self.use_batch_normalization = use_batch_normalization
        if use_batch_normalization:
            self.bn_layer = BatchNormalizationLayer(out_dim)
            self._bn_layer_param_id = self.bn_layer._param_id
Example #48
0
def fine_tuning_cost_gpu(x, *args):
    inputSize, l1Size, l2Size, l3Size, lambda_val, inputs = args
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)
    num_weights_L3 = l3Size * (l2Size + 1)
    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1))
    weights2 = x[num_weights_L1:num_weights_L1+num_weights_L2].reshape((l2Size, l1Size + 1))
    weights3 = x[num_weights_L1+num_weights_L2:num_weights_L1+num_weights_L2+num_weights_L3].reshape((l3Size, l2Size + 1))
    weights4 = x[num_weights_L1+num_weights_L2+num_weights_L3:shape(x)[0]].reshape((inputSize, l3Size + 1))
    nData = shape(inputs)[1]
    x = gpu.concatenate((gpu.ones((1,nData)), inputs), axis = 0)
    hidden1_sum = gpu.dot(weights1, x)
    #hidden1_activation = gpu.log(1+hidden1_sum.exp())
    relu_mask_hidden1 = gpu.ones(shape(hidden1_sum)) * (hidden1_sum>0)
    hidden1_activation = hidden1_sum*relu_mask_hidden1
    hidden1_activation = gpu.concatenate((gpu.ones((1,nData)), hidden1_activation), axis = 0)
    hidden2_sum = gpu.dot(weights2, hidden1_activation)
    #hidden2_activation = gpu.log(1+hidden2_sum.exp())
    relu_mask_hidden2 = gpu.ones(shape(hidden2_sum)) * (hidden2_sum>0)
    hidden2_activation = hidden2_sum*relu_mask_hidden2
    hidden2_activation = gpu.concatenate((gpu.ones((1,nData)), hidden2_activation), axis = 0)
    hidden3_sum = gpu.dot(weights3, hidden2_activation)
    hidden3_activation = hidden3_sum
    hidden3_activation = gpu.concatenate((gpu.ones((1,nData)), hidden3_activation), axis = 0)
    output_sum = gpu.dot(weights4, hidden3_activation)
    outputs = output_sum
    regularized_penalty3 = weights3[:,1:shape(weights3)[1]]
    regularized_penalty4 = weights4[:,1:shape(weights4)[1]]
    regularized_penalty3 = regularized_penalty3 ** 2
    regularized_penalty4 = regularized_penalty4 ** 2
    output_target_diff = (outputs - inputs)**2
    cost = gpu.sum(output_target_diff)/(2*nData) + 0.5 * lambda_val * (gpu.sum(regularized_penalty3) + gpu.sum(regularized_penalty4))
    print 'Fine Tuning Cost: ', cost
    return cost
Example #49
0
mb = gpu.zeros((1,10))  

alpha = 0.1
momentum = 0.5
momentum_type = 1

for epoch in xrange(200):
    for i in xrange(X.shape[0]):
        
        if momentum_type == 1:
            '''Use nesterov momentum to train the weights
            '''
            n = w + (m*momentum)
            nb = b + (mb*momentum)
            out = gpu.softmax(gpu.dot(X[i],n)+nb)
            gradb = gpu.dot(gpu.ones((1,batch_size)),out - t[i]) 
            grad = gpu.dot(X[i].T,out - t[i])
            
            m = m*momentum - (alpha*grad/128.)
            mb = mb*momentum - (alpha*gradb/128.)
            w += m
            b += mb
        elif momentum_type == 2:            
            '''Use classic momentum to train the weights
            '''
            out = gpu.softmax(gpu.dot(X[i],w)+b)
            gradb = gpu.dot(gpu.ones((1,batch_size)),out - t[i]) 
            grad = gpu.dot(X[i].T,out - t[i])
            
                       
            m = m*momentum - (alpha*grad/128.)
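
The two branches differ only in where the gradient is evaluated: Nesterov momentum (momentum_type == 1) takes the gradient at the look-ahead point w + momentum*m, while classical momentum (momentum_type == 2) takes it at the current weights. A toy NumPy sketch of both update rules on a simple quadratic (illustrative names, not from the snippet):

import numpy as np

def grad(w):                 # gradient of the toy objective 0.5 * ||w||^2
    return w

w_cls = w_nes = np.array([5.0])
m_cls = m_nes = np.array([0.0])
alpha, momentum = 0.1, 0.9

for _ in range(50):
    # classical momentum: gradient at the current point
    m_cls = momentum * m_cls - alpha * grad(w_cls)
    w_cls = w_cls + m_cls
    # Nesterov momentum: gradient at the look-ahead point
    n = w_nes + momentum * m_nes
    m_nes = momentum * m_nes - alpha * grad(n)
    w_nes = w_nes + m_nes

print(w_cls, w_nes)          # both approach 0; the Nesterov iterate shrinks faster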
Example #50
0
def bias(X, bias_val=1.0):
    """Append a bias columns of magnitude bias_val to X."""
    Xb = gp.concatenate((X, bias_val * gp.ones((X.shape[0], 1))), axis=1)
    return Xb
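
A minimal usage sketch of the same idea in plain NumPy (bias_np is an illustrative stand-in for environments without gnumpy):

import numpy as np

def bias_np(X, bias_val=1.0):
    """NumPy equivalent of bias(): append a constant column of magnitude bias_val."""
    return np.concatenate((X, bias_val * np.ones((X.shape[0], 1))), axis=1)

X = np.arange(6.0).reshape(3, 2)
print(bias_np(X))
# [[0. 1. 1.]
#  [2. 3. 1.]
#  [4. 5. 1.]]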
Example #51
0
 def __init__(self, config, name):
     super(PAE, self).__init__(config, name)
     self.factor = gp.ones(1000)
     for i in range(1, self.factor.size):
         self.factor[i] = self.factor[i - 1] * i
     self.N = None
Example #52
0
    def backprop_gradient(self, v, network, X, targets, weights):
        '''
        Calculates the value of the cost function and the gradient for CG 
        optimization.

        args:
            array v:            the 1d vector of weights
            list[obj] network:  the network
            array X:            training data
            array targets:      the training targets
            array weights:      the backprop weights
        returns:
            array cost:         the value of the cost function
            array grad:         the value of the gradient

        This function is called by scipy's minimize function during optimization
        '''
        if len(v.shape) == 1:
            v = v.reshape((v.shape[0],1))
        # initialize variables
        n = X.shape[0]
        numHiddenLayers = len(network)

        # put the v weights back into the network
        ind = 0
        for i in range(numHiddenLayers):
            h,w = network[i].W.shape
            network[i].W = gp.garray((v[ind:(ind+h*w)]).reshape((h,w)))
            ind += h*w
            b = network[i].hbias.shape[0]
            network[i].hbias = gp.garray(v[ind:(ind+b)]).reshape((b,1))
            ind += b

        # Run data through the network, keeping activations of each layer
        acts = [X] # a list of numpy arrays
        hid = X
        for layer in network:
            vis = gp.garray(hid)
            hid = self.get_activation(layer, vis) 
            acts.append(hid)
            gp.free_reuse_cache()

        # store the gradients
        dW = []
        db = []

        # Compute the value of the cost function
        if self.targetCost == 'crossEntropy':
            # see www.stanford.edu/group/pdplab/pdphandbook/handbookch6.html
            cost = (-1.0/n) * np.sum(np.sum(targets * np.log(acts[-1]) + \
                    (1.0 - targets) * np.log(1.0 - acts[-1]), axis=1) * weights.T)
            Ix = (acts[-1] - targets) / n
        else: #self.targetCost == 'linSquaredErr':
            cost = 0.5 * np.sum(np.sum(np.square(acts[-1] - targets), axis=1) * \
                    weights.T)
            Ix = (acts[-1] - targets)
        Ix *= np.tile(weights, (1, Ix.shape[1])).reshape((Ix.shape[0],Ix.shape[1]))
        Ix = gp.garray(Ix)

        # Compute the gradients
        for i in range(numHiddenLayers-1,-1,-1):
            # augment activations with ones
            acts[i] = gp.garray(acts[i])
            acts[i] = gp.concatenate((acts[i], gp.ones((n,1))), axis=1)

            # compute delta in next layer
            delta = gp.dot(acts[i].T, Ix)

            # split delta into weights and bias parts
            dW.append(delta[:-1,:].T)
            db.append(delta[-1,:].T)

            # backpropagate the error
            if i > 0:
                if network[i-1].hidtype == 'sigmoid':
                    Ix = gp.dot(Ix,gp.concatenate((network[i].W,network[i].hbias),
                        axis=1)) * acts[i] * (1.0 - acts[i])
                elif network[i-1].hidtype == 'gaussian':
                    Ix = gp.dot(Ix,gp.concatenate((network[i].W,network[i].hbias),
                        axis=1))
                Ix = Ix[:,:-1]
            gp.free_reuse_cache()
        dW.reverse()
        db.reverse()

        # Convert gradient information
        grad = np.zeros_like(v)
        ind = 0
        for i in range(numHiddenLayers):
            grad[ind:(ind+dW[i].size)] = \
                 (dW[i].reshape((dW[i].shape[0]*dW[i].shape[1],1))).as_numpy_array()
            ind += dW[i].size
            grad[ind:(ind+db[i].size),0] = db[i].as_numpy_array()
            ind += db[i].size
        grad = grad.reshape((grad.shape[0],))
        return cost, grad  
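
Because backprop_gradient returns a (cost, grad) pair, it can be handed to scipy's minimize with jac=True, as the docstring suggests. A self-contained sketch of that calling convention on a toy objective (a stand-in, not the actual network):

import numpy as np
from scipy.optimize import minimize

def cost_and_grad(v):
    """Toy stand-in for backprop_gradient: returns (cost, gradient)."""
    cost = 0.5 * np.sum(v ** 2)
    grad = v
    return cost, grad

v0 = np.random.randn(10)
result = minimize(cost_and_grad, v0, method='CG', jac=True)   # jac=True: fun returns (f, g)
print(result.fun)   # ~0 at the minimum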
Example #53
0
        n1 = w1+(m1*momentum)#nesterov updates 2.2 sec
        n2 = w2+(m2*momentum)
        nb1 = b1+(mb1*momentum)
        nb2 = b2+(mb2*momentum)
  
        z0 = X[i]*d02[rng.randint(0,75)]
        z1 = (gpu.dot(z0,n1)+nb1).logistic()*d05[rng.randint(0,75)]#dropout and activations 7.1 sec 
        t0 = time.time()            
        feedforward = gpu.softmax(gpu.dot(z1,n2)+nb2)
        time_softmax += time.time() - t0      
        #softmax 0.48 sec
        #gradients
        e1 = (feedforward - t[i])
        grad2 = gpu.dot(z1.T,e1) 
        grad1 = gpu.dot(X[i].T,(gpu.dot(e1,n2.T)* z1*(1-z1)))#grads 6 sec
        gradb2 = gpu.dot(gpu.ones((1, batch_size)),e1)
        gradb1= gpu.dot(gpu.ones((1, batch_size)),(gpu.dot(e1,n2.T)* z1*(1-z1)))
        #momentum and weight updates
        m1 = (momentum*m1) - ((grad1 + n1*L2)*alpha/(batch_size*1.0))#momentum und weight updates 7.4 sec    
        m2 = (momentum*m2) - ((grad2 + n2*L2)*alpha/(batch_size*1.0)) 
        mb1 = (momentum*mb1) - ((gradb1 + nb1*L2)*alpha/(batch_size*1.0))
        mb2 = (momentum*mb2) - ((gradb2 + nb2*L2)*alpha/(batch_size*1.0))
      
        w1 = w1 + m1
        w2 = w2 + m2    
        b1 = b1 + mb1
        b2 = b2 + mb2

    momentum = momentum + 0.001
    
    if momentum > 0.95: momentum = 0.95    
Example #54
0
 def uniform(cls, nvis, nhid):
     return cls.from_independent(0.5 * gnp.ones(nvis), 0.5 * gnp.ones(nhid))
Example #55
0
def mlpSoftmax_costfunc(x, *args):
    numClasses, inputSize, l1Size, l2Size, l3Size, lambda_softmax, lambda_hidden, inputs, labels, groundTruth, dropout_probability = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)
    num_weights_L3 = l3Size * (l2Size + 1)
    num_weights_softmax = numClasses * l3Size
    #x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1],
                                  (l1Size, inputSize + 1)))
    #theta_L1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1))
    #print numClasses, l2Size
    theta_L2 = gpu.garray(
        reshape(x[num_weights_L1:num_weights_L2 + num_weights_L1],
                (l2Size, l1Size + 1)))
    #theta_L2 = x[num_weights_L1:num_weights_L2+num_weights_L1].reshape((l2Size, l1Size + 1))
    theta_L3 = gpu.garray(
        reshape(
            x[num_weights_L2 + num_weights_L1:num_weights_L2 + num_weights_L1 +
              num_weights_L3], (l3Size, l2Size + 1)))
    theta_softmax = gpu.garray(
        reshape(
            x[num_weights_L2 + num_weights_L1 + num_weights_L3:shape(x)[0]],
            (numClasses, l3Size)))
    #theta_softmax = x[num_weights_L2+num_weights_L1:shape(x)[0]].reshape((numClasses, l2Size))
    theta_L1_grad = gpu.zeros(shape(theta_L1))
    theta_L2_grad = gpu.zeros(shape(theta_L2))
    theta_L3_grad = gpu.zeros(shape(theta_L3))
    dropout_l1 = gpu.garray(
        bernoulli.rvs(dropout_probability, size=(l1Size + 1, numCases)))
    dropout_l2 = gpu.garray(
        bernoulli.rvs(dropout_probability, size=(l2Size + 1, numCases)))
    dropout_l3 = gpu.garray(
        bernoulli.rvs(dropout_probability, size=(l3Size, numCases)))
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    #hidden_activation_L1 = gpu.log(1+hidden_sum_L1.exp())
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum_L1)) * (hidden_sum_L1 > 0)
    hidden_activation_L1 = hidden_sum_L1 * relu_mask_hidden1
    hidden_derivative_L1 = relu_mask_hidden1
    #hidden_activation_L1 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L1), axis=0)
    hidden_derivative_L1 = gpu.concatenate((gpu.ones(
        (1, numCases)), hidden_derivative_L1),
                                           axis=0)
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0) * dropout_l1
    hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1)
    #hidden_activation_L2 = gpu.log(1+hidden_sum_L2.exp())
    relu_mask_hidden2 = gpu.ones(shape(hidden_sum_L2)) * (hidden_sum_L2 > 0)
    hidden_activation_L2 = hidden_sum_L2 * relu_mask_hidden2
    hidden_derivative_L2 = relu_mask_hidden2
    #hidden_activation_L2 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L2), axis=0)
    hidden_derivative_L2 = gpu.concatenate((gpu.ones(
        (1, numCases)), hidden_derivative_L2),
                                           axis=0)
    hidden_activation_L2 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L2), axis=0) * dropout_l2
    hidden_sum_L3 = gpu.dot(theta_L3, hidden_activation_L2)
    #hidden_activation_L3 = gpu.log(1+hidden_sum_L3.exp())
    relu_mask_hidden3 = gpu.ones(shape(hidden_sum_L3)) * (hidden_sum_L3 > 0)
    #hidden_activation_L3 = hidden_sum_L3*relu_mask_hidden3
    hidden_derivative_L3 = relu_mask_hidden3
    hidden_activation_L3 = hidden_sum_L3 * relu_mask_hidden3 * dropout_l3
    #hidden_activation_L3 = hidden_sum_L3.logistic() * dropout_l3
    hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L3)
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0)
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions, axis=0)
    pred = predictions.argmax(axis=0) + 1
    accuracy = mean(pred == labels) * 100
    temp = groundTruth * gpu.log(predictions)
    temp = temp.as_numpy_array()
    temp[temp == -inf] = -200.0
    temp = nan_to_num(temp)
    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L2 = theta_L2[:, 1:shape(theta_L2)[1]]
    regularized_penalty_L3 = theta_L3[:, 1:shape(theta_L3)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2
    regularized_penalty_L3 = regularized_penalty_L3 * regularized_penalty_L3
    pred_cost = -1 * sum(temp) / numCases
    l2norm_cost = 0.5 * lambda_hidden * (
        gpu.sum(regularized_penalty_L3) + gpu.sum(regularized_penalty_L2) +
        gpu.sum(regularized_penalty_L1)) + 0.5 * lambda_softmax * gpu.sum(
            theta_softmax * theta_softmax)
    #l2norm_cost = 0
    cost = pred_cost + l2norm_cost
    print 'Prediction Accuracy:                       ', accuracy, '%'
    print 'Multilayer Softmax Prediction Cost:        ', pred_cost
    print 'Multilayer Softmax L2 Normalisation Cost:  ', l2norm_cost
    print 'Multilayer Softmax Cost:                   ', cost
    print '--------------------------------------------------------------------'
    softmax_imd = groundTruth - predictions
    #theta_softmax_grad = -1*gpu.dot(softmax_imd, gpu.garray(transpose(hidden_activation_L3.as_numpy_array())))/numCases
    theta_softmax_grad = -1 * gpu.dot(
        softmax_imd,
        gpu.garray(transpose(hidden_activation_L3.as_numpy_array()))
    ) / numCases + lambda_softmax * theta_softmax
    deltaOut = -softmax_imd
    delta_L3_imd = gpu.dot(
        gpu.garray(transpose(theta_softmax.as_numpy_array())), deltaOut)
    delta_L3_imd2 = delta_L3_imd * hidden_derivative_L3
    #delta_L3_imd2 = (delta_L3_imd * hidden_activation_L3) * (1-hidden_activation_L3)
    delta_L3 = gpu.dot(
        delta_L3_imd2,
        gpu.garray(transpose(hidden_activation_L2.as_numpy_array())))
    theta_L3_grad += delta_L3
    delta_L2_imd = gpu.dot(gpu.garray(transpose(theta_L3.as_numpy_array())),
                           delta_L3_imd2)
    delta_L2_imd2 = delta_L2_imd * hidden_derivative_L2
    delta_L2_imd2 = delta_L2_imd2[1:shape(delta_L2_imd2)[0] + 1, :]
    delta_L2 = gpu.dot(
        delta_L2_imd2,
        gpu.garray(transpose(hidden_activation_L1.as_numpy_array())))
    theta_L2_grad += delta_L2
    delta_L1_imd = gpu.dot(gpu.garray(transpose(theta_L2.as_numpy_array())),
                           delta_L2_imd2)
    delta_L1_imd2 = delta_L1_imd * hidden_derivative_L1
    delta_L1_imd2 = delta_L1_imd2[1:shape(delta_L1_imd2)[0] + 1, :]
    delta_L1 = gpu.dot(delta_L1_imd2,
                       gpu.garray(transpose(inputs.as_numpy_array())))
    theta_L1_grad += delta_L1
    theta_L1_grad = theta_L1_grad / numCases
    theta_L2_grad = theta_L2_grad / numCases
    theta_L3_grad = theta_L3_grad / numCases
    theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] = theta_L1_grad[:, 1:shape(
        theta_L1_grad)[1]] + theta_L1[:, 1:shape(theta_L1)[1]] * lambda_hidden
    theta_L2_grad[:, 1:shape(theta_L2_grad)[1]] = theta_L2_grad[:, 1:shape(
        theta_L2_grad)[1]] + theta_L2[:, 1:shape(theta_L2)[1]] * lambda_hidden
    theta_L3_grad[:, 1:shape(theta_L3_grad)[1]] = theta_L3_grad[:, 1:shape(
        theta_L3_grad)[1]] + theta_L3[:, 1:shape(theta_L3)[1]] * lambda_hidden
    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)
    theta_L2_grad = reshape(theta_L2_grad.as_numpy_array(), num_weights_L2)
    theta_L3_grad = reshape(theta_L3_grad.as_numpy_array(), num_weights_L3)
    theta_softmax_grad = reshape(theta_softmax_grad.as_numpy_array(),
                                 num_weights_softmax)
    del inputs
    del theta_L1
    del theta_L2
    del theta_L3
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_L2
    del hidden_activation_L2
    del hidden_activation_L3
    del hidden_sum_L3
    del hidden_sum_softmax
    del predictions
    del temp
    del softmax_imd
    del deltaOut
    del delta_L3_imd
    del delta_L3_imd2
    del delta_L3
    del delta_L2_imd
    del delta_L2_imd2
    del delta_L2
    del delta_L1_imd
    del delta_L1_imd2
    del delta_L1
    #del regularized_penalty_L1
    #del regularized_penalty_L2
    gpu.free_reuse_cache()
    return cost, hstack(
        (theta_L1_grad, theta_L2_grad, theta_L3_grad, theta_softmax_grad))
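
The softmax block above subtracts the per-column maximum from hidden_sum_softmax before exponentiating, which avoids overflow without changing the probabilities. A minimal NumPy check (columns are samples, matching the classes-by-cases layout used here):

import numpy as np

logits = np.array([[1000.0, -5.0],
                   [1001.0,  0.0],
                   [ 999.0,  5.0]])           # classes x samples

with np.errstate(over='ignore'):
    naive = np.exp(logits)                    # overflows to inf in column 0

shifted = np.exp(logits - logits.max(axis=0))
probs = shifted / shifted.sum(axis=0)
print(np.isinf(naive).any())                  # True: why the max-shift is needed
print(probs[:, 0], probs[:, 0].sum())         # finite probabilities summing to 1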
Example #56
0
File: xp.py Project: surban/mlutils
def ones(shape):
    if gpu.GPU:
        return gp.ones(shape)
    else:
        return np.ones(shape)
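
The same dispatch keeps calling code agnostic to the backend. A hypothetical zeros() counterpart, assuming xp.py's existing imports (np, gp) and the gpu.GPU flag:

def zeros(shape):
    # hypothetical counterpart to ones(), same backend dispatch
    if gpu.GPU:
        return gp.zeros(shape)
    else:
        return np.zeros(shape)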