Example #1
def feedforward_autoencoder(theta, hidden_size, visible_size, data):
    """Accepts theta variable, hidden and visible size integers,
        and the data.
            Theta has shape (hidden_size*visible_size*2 
                                + hidden_size+visible_size,))
            data has shape (visible_size, num_examples).
        Returns activations on the hidden state.
            activations has shape (hidden_size, num_examples)
    """
    hv = hidden_size*visible_size
    assert theta.shape == (2*hv + hidden_size + visible_size,)
    W1 = theta[:hv].reshape(hidden_size, visible_size)
    b1 = theta[2*hv:2*hv+hidden_size]

    return sigmoid(np.dot(W1, data) + T(b1))
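
Both examples rely on module-level helpers (np, sigmoid, T) that are not shown on this page. A minimal sketch of what they would need to look like for the code above to run, assuming T simply reshapes a 1-D bias vector into a column so it broadcasts across the examples:

import numpy as np

def sigmoid(x):
    # Elementwise logistic function 1 / (1 + exp(-x)).
    return 1.0 / (1.0 + np.exp(-x))

def T(b):
    # Turn a length-n bias vector into an (n, 1) column so that adding it to
    # an (n, num_examples) activation matrix broadcasts across examples.
    return b.reshape(-1, 1)
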
Example #2
def cost(theta, visible_size, hidden_size, weight_decay, sparsity_param, beta,
         data):
    """
    % visible_size: the number of input units (probably 64) 
    % hidden_size: the number of hidden units (probably 25) 
    % lambda: weight decay parameter
    % sparsityParam: The desired average activation for the hidden units (denoted in the lecture
    %                           notes by the greek alphabet rho, which looks like a lower-case "p").
    % beta: weight of sparsity penalty term
    % data: Our 64x10000 matrix containing the training data.  So, data(:,i) is the i-th training example. 
    
    % The input theta is a vector (because minFunc expects the parameters to be a vector). 
    % We first convert theta to the (W1, W2, b1, b2) matrix/vector format, so that this 
    % follows the notation convention of the lecture notes. 
    """
    sparsity_param = float(sparsity_param)
    W1, W2, b1, b2 = unflatten_params(theta, hidden_size, visible_size)
    num_data = data.shape[1]

    # do a feed forward pass
    a2 = sigmoid(np.dot(W1, data) + T(b1))
    a3 = sigmoid(np.dot(W2, a2) + T(b2))
    assert a2.shape == (hidden_size, num_data)
    assert a3.shape == (visible_size, num_data)

    cost = 1.0 / num_data * (0.5 * np.sum((a3 - data)**2))
    # add in weight decay
    cost += (0.5 * weight_decay) * (np.sum(W1**2) + np.sum(W2**2))
    # add in the sparsity penalty (sparsity = mean activation of each hidden unit)
    sparsity = np.sum(a2, axis=1) / float(num_data)
    assert sparsity.shape == (hidden_size, )
    s = np.sum(binary_KL_divergence(sparsity_param, sparsity))
    cost += beta * s

    # delta3: error term at the output layer (squared-error derivative times sigmoid derivative)
    delta3 = -(data - a3) * a3 * (1 - a3)
    assert delta3.shape == (visible_size, num_data)
    # delta2: error term at the hidden layer
    # 1. back-propagate delta3 through W2
    delta2 = np.dot(W2.T, delta3)
    # 2. add the derivative of the sparsity penalty
    delta2 += T(beta * ((-sparsity_param / sparsity) + ((1 - sparsity_param) /
                                                        (1 - sparsity))))
    # 3. multiply by the sigmoid derivative a2 * (1 - a2)
    delta2 *= a2 * (1 - a2)
    assert delta2.shape == (hidden_size, num_data)

    # compute final gradient
    W1grad = np.dot(delta2, data.T) / float(num_data)
    W2grad = np.dot(delta3, a2.T) / float(num_data)
    # add weight decay
    W1grad += weight_decay * W1
    W2grad += weight_decay * W2

    b1grad = np.sum(delta2, axis=1) / float(num_data)
    b2grad = np.sum(delta3, axis=1) / float(num_data)
    assert W1grad.shape == W1.shape
    assert W2grad.shape == W2.shape
    assert b1grad.shape == b1.shape
    assert b2grad.shape == b2.shape

    grad = neurolib.flatten_params(W1grad, W2grad, b1grad, b2grad)
    return cost, grad
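
The sparsity penalty above calls binary_KL_divergence, which is not defined in this example. Its derivative with respect to the mean activation, -rho/rho_hat + (1 - rho)/(1 - rho_hat), is exactly what the delta2 update adds, so the function is presumably the elementwise KL divergence between Bernoulli distributions; a sketch under that assumption:

def binary_KL_divergence(p, q):
    # Elementwise KL divergence between Bernoulli(p) and Bernoulli(q):
    # p*log(p/q) + (1-p)*log((1-p)/(1-q)).  Here p is the target sparsity
    # and q is the vector of average hidden-unit activations.
    return p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))
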
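
Example #2 also depends on unflatten_params and neurolib.flatten_params to convert between the flat theta vector and the (W1, W2, b1, b2) representation. The slicing in Example #1 (W1 = theta[:hv], b1 = theta[2*hv:2*hv+hidden_size]) suggests the layout [W1, W2, b1, b2]; a sketch assuming that convention, with the gradient flattened in the same order:

def unflatten_params(theta, hidden_size, visible_size):
    # Split the flat parameter vector into W1 (hidden x visible),
    # W2 (visible x hidden), b1 (hidden,), b2 (visible,).
    hv = hidden_size * visible_size
    W1 = theta[:hv].reshape(hidden_size, visible_size)
    W2 = theta[hv:2 * hv].reshape(visible_size, hidden_size)
    b1 = theta[2 * hv:2 * hv + hidden_size]
    b2 = theta[2 * hv + hidden_size:]
    return W1, W2, b1, b2


def flatten_params(W1, W2, b1, b2):
    # Inverse of unflatten_params: concatenate the parameters back into one
    # flat vector in the same [W1, W2, b1, b2] order.
    return np.concatenate([W1.ravel(), W2.ravel(), b1.ravel(), b2.ravel()])

Because cost returns a (cost, grad) pair with grad flattened this way, it can be handed directly to an optimizer such as scipy.optimize.minimize with jac=True.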