Example #1
def initialize_params(esize, vsize, cat_size, dictionary_length):
    """
        Accepts esize, an integer size of the embedded representation;
            vsize, an integer size of the input to the autoencoder
                (typically a multiple of esize);
            cat_size, an integer dimensionality of the multinomial categories;
            dictionary_length, an integer number of words in the vocabulary.
        Returns one flat array of parameters initialized randomly.

        Initialize parameters randomly based on layer sizes.
    """
    #We'll choose weights uniformly from the interval [-r, r]
    r = np.sqrt(6) / np.sqrt(esize + vsize + 1)

    W1 = np.random.rand(esize, vsize) * 2 * r - r
    W2 = np.random.rand(esize, vsize) * 2 * r - r
    W3 = np.random.rand(vsize, esize) * 2 * r - r
    W4 = np.random.rand(vsize, esize) * 2 * r - r

    We = 1e-3 * (np.random.rand(esize, dictionary_length) * 2 * r - r)

    Wcat = np.random.rand(cat_size, esize) * 2 * r - r

    b1 = np.zeros((esize, 1))
    b2 = np.zeros((vsize, 1))
    b3 = np.zeros((vsize, 1))
    bcat = np.zeros((cat_size, 1))

    # Convert weights and bias gradients to the vector form.
    # This step will "unroll" (flatten and concatenate together) all
    # your parameters into a vector, which can then be used with minFunc.
    return neurolib.flatten_params(W1, W2, W3, W4, b1, b2, b3, Wcat, bcat, We)
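
neurolib.flatten_params is not shown on this page; below is a minimal sketch of what a flatten/unflatten pair for these parameters could look like (an illustration under that assumption, not the library's actual API — the _sketch names are hypothetical):

import numpy as np

def flatten_params_sketch(*arrays):
    # ravel every parameter array and concatenate them into one flat vector
    return np.concatenate([a.ravel() for a in arrays])

def unflatten_params_sketch(theta, shapes):
    # cut the flat vector back into arrays of the given shapes, in order
    arrays, offset = [], 0
    for shape in shapes:
        size = int(np.prod(shape))
        arrays.append(theta[offset:offset + size].reshape(shape))
        offset += size
    return arrays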
Example #2
def initialize_params(esize, vsize, cat_size, dictionary_length):
    """
        Accepts esize, an integer size of the embedded representation;
            vsize, an integer size of the input to the autoencoder
                (typically a multiple of esize);
            cat_size, an integer dimensionality of the multinomial categories;
            dictionary_length, an integer number of words in the vocabulary.
        Returns one flat array of parameters initialized randomly.

        Initialize parameters randomly based on layer sizes.
    """
    #We'll choose weights uniformly from the interval [-r, r]
    r = np.sqrt(6) / np.sqrt(esize + vsize + 1)

    W1 = np.random.rand(esize, vsize) * 2 * r - r
    W2 = np.random.rand(esize, vsize) * 2 * r - r
    W3 = np.random.rand(vsize, esize) * 2 * r - r
    W4 = np.random.rand(vsize, esize) * 2 * r - r

    We = 1e-3 * (np.random.rand(esize, dictionary_length) * 2 * r - r)

    Wcat = np.random.rand(cat_size, esize) * 2 * r - r

    b1 = np.zeros((esize, 1))
    b2 = np.zeros((vsize, 1))
    b3 = np.zeros((vsize, 1))
    bcat = np.zeros((cat_size, 1))

    # Convert weights and bias gradients to the vector form.
    # This step will "unroll" (flatten and concatenate together) all
    # your parameters into a vector, which can then be used with minFunc.
    return neurolib.flatten_params(W1, W2, W3, W4, b1, b2, b3, Wcat, bcat, We)
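
To make the "[-r, r]" comment concrete: np.random.rand draws uniformly from [0, 1), so w * 2 * r - r maps each draw into [-r, r). A quick self-check (the sizes here are only illustrative):

import numpy as np

esize, vsize = 50, 100
r = np.sqrt(6) / np.sqrt(esize + vsize + 1)
W1 = np.random.rand(esize, vsize) * 2 * r - r
assert np.all((W1 >= -r) & (W1 < r))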
Example #3
def initialize_params(hidden_size, visible_size):
    """Accepts number of hidde states in sparse encoder,
            and number of input states in sparse encoder..
       Initialize parameters randomly based on layer sizes.
       Returns a new flat array of size 2*visisble_size + hidden_size
    """
    assert hidden_size <= visible_size

    #we'll choose weights uniformly from the interval [-r, r]
    r = np.sqrt(6) / np.sqrt(hidden_size + visible_size + 1)
    W1 = np.random.rand(hidden_size, visible_size) * 2 * r - r
    W2 = np.random.rand(visible_size, hidden_size) * 2 * r - r

    b1 = np.zeros(hidden_size)
    b2 = np.zeros(visible_size)
    """
    % Convert weights and bias gradients to the vector form.
    % This step will "unroll" (flatten and concatenate together) all 
    % your parameters into a vector, which can then be used with minFunc. 
    """
    #TODO: jperla: make this a function
    return neurolib.flatten_params(W1, W2, b1, b2)
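
A usage sketch for this sparse-autoencoder version: W1 has shape (hidden_size, visible_size), W2 has shape (visible_size, hidden_size), and the two biases have hidden_size and visible_size entries, so the flat vector should hold 2*hidden_size*visible_size + hidden_size + visible_size values (assuming neurolib.flatten_params simply concatenates the raveled arrays):

hidden_size, visible_size = 25, 64
theta = initialize_params(hidden_size, visible_size)
assert theta.size == (2 * hidden_size * visible_size
                      + hidden_size + visible_size)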
Example #5
def cost(theta, visible_size, hidden_size, weight_decay, sparsity_param, beta,
         data):
    """
    visible_size: the number of input units (probably 64)
    hidden_size: the number of hidden units (probably 25)
    weight_decay: the weight decay parameter (lambda in the lecture notes)
    sparsity_param: the desired average activation for the hidden units
        (denoted in the lecture notes by the Greek letter rho, which looks
        like a lower-case "p")
    beta: weight of the sparsity penalty term
    data: our 64x10000 matrix containing the training data, so data[:, i]
        is the i-th training example

    The input theta is a vector (because minFunc expects the parameters to
    be a vector). We first convert theta to the (W1, W2, b1, b2)
    matrix/vector format, so that it follows the notation convention of the
    lecture notes.
    """
    sparsity_param = float(sparsity_param)
    W1, W2, b1, b2 = unflatten_params(theta, hidden_size, visible_size)
    num_data = data.shape[1]

    # do a feed forward pass
    a2 = sigmoid(np.dot(W1, data) + T(b1))
    a3 = sigmoid(np.dot(W2, a2) + T(b2))
    assert a2.shape == (hidden_size, num_data)
    assert a3.shape == (visible_size, num_data)

    cost = 1.0 / num_data * (0.5 * np.sum((a3 - data)**2))
    # add in weight decay
    cost += (0.5 * weight_decay) * (np.sum(W1**2) + np.sum(W2**2))
    # add in sparsity parameter
    sparsity = np.sum(a2, axis=1) / float(num_data)
    assert sparsity.shape == (hidden_size, )
    s = np.sum(binary_KL_divergence(sparsity_param, sparsity))
    cost += beta * s

    # delta3: error at the output layer (chain rule through the sigmoid)
    delta3 = -(data - a3) * a3 * (1 - a3)
    assert delta3.shape == (visible_size, num_data)
    # delta2: error at the hidden layer
    # 1. propagate delta3 back through W2
    delta2 = np.dot(W2.T, delta3)
    # 2. add in sparsity parameter
    delta2 += T(beta * ((-sparsity_param / sparsity) + ((1 - sparsity_param) /
                                                        (1 - sparsity))))
    # 3. multiply by outer derivative
    delta2 *= a2 * (1 - a2)
    assert delta2.shape == (hidden_size, num_data)

    # compute final gradient
    W1grad = np.dot(delta2, data.T) / float(num_data)
    W2grad = np.dot(delta3, a2.T) / float(num_data)
    # add weight decay
    W1grad += weight_decay * W1
    W2grad += weight_decay * W2

    b1grad = np.sum(delta2, axis=1) / float(num_data)
    b2grad = np.sum(delta3, axis=1) / float(num_data)
    assert W1grad.shape == W1.shape
    assert W2grad.shape == W2.shape
    assert b1grad.shape == b1.shape
    assert b2grad.shape == b2.shape

    grad = neurolib.flatten_params(W1grad, W2grad, b1grad, b2grad)
    return cost, grad
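
cost() relies on three helpers that are not shown on this page: sigmoid, T, and binary_KL_divergence. A minimal sketch of plausible definitions, inferred from how they are used above (these are assumptions, not the project's actual implementations):

import numpy as np

def sigmoid(x):
    # elementwise logistic function
    return 1.0 / (1.0 + np.exp(-x))

def T(v):
    # turn a length-n vector into an (n, 1) column so it broadcasts
    # across the columns of an (n, num_data) activation matrix
    return np.asarray(v).reshape(-1, 1)

def binary_KL_divergence(p, q):
    # elementwise KL divergence between Bernoulli(p) and Bernoulli(q),
    # i.e. the sparsity penalty from the sparse-autoencoder lecture notes
    return p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))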
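
A common way to sanity-check cost() is a numerical gradient check on a tiny random problem. A hedged usage sketch, reusing the sparse-autoencoder initialize_params above (the sizes and hyperparameters are only illustrative):

import numpy as np

visible_size, hidden_size = 8, 3
data = np.random.rand(visible_size, 20)   # 20 toy examples in [0, 1)
theta = initialize_params(hidden_size, visible_size)

c, grad = cost(theta, visible_size, hidden_size,
               weight_decay=1e-4, sparsity_param=0.1, beta=3.0, data=data)

# central-difference approximation of the gradient
eps = 1e-5
num_grad = np.zeros_like(theta)
for i in range(theta.size):
    e = np.zeros_like(theta)
    e[i] = eps
    c_plus, _ = cost(theta + e, visible_size, hidden_size, 1e-4, 0.1, 3.0, data)
    c_minus, _ = cost(theta - e, visible_size, hidden_size, 1e-4, 0.1, 3.0, data)
    num_grad[i] = (c_plus - c_minus) / (2 * eps)

# the two should agree to within roughly 1e-8 if the analytic gradient is right
print(np.max(np.abs(num_grad - grad)))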