def initialize_params(esize, vsize, cat_size, dictionary_length):
    """Randomly initialize all model parameters and return them flattened.

    Accepts
        esize              integer size of the embedded representation.
        vsize              integer size of the input into the autoencoder
                           (typically a multiple of esize).
        cat_size           integer dimensionality of the multinomial categories.
        dictionary_length  integer number of words in the vocabulary.

    Returns the result of neurolib.flatten_params applied to
    (W1, W2, W3, W4, b1, b2, b3, Wcat, bcat, We), in that order
    (presumably a single flat parameter vector -- confirm against neurolib).

    Weight matrices are drawn uniformly from [-r, r] with
    r = sqrt(6) / sqrt(esize + vsize + 1); We is additionally scaled by 1e-3.
    Biases start at zero.
    """
    # We'll choose weights uniformly from the interval [-r, r].
    r = np.sqrt(6) / np.sqrt(esize + vsize + 1)

    # The draw order below is kept fixed so seeded runs stay reproducible.
    W1 = np.random.rand(esize, vsize) * 2 * r - r
    W2 = np.random.rand(esize, vsize) * 2 * r - r
    W3 = np.random.rand(vsize, esize) * 2 * r - r
    W4 = np.random.rand(vsize, esize) * 2 * r - r
    We = 1e-3 * (np.random.rand(esize, dictionary_length) * 2 * r - r)
    Wcat = np.random.rand(cat_size, esize) * 2 * r - r

    # np.zeros takes the shape as ONE argument; a second positional argument
    # is interpreted as the dtype, so np.zeros(n, 1) raises TypeError.
    b1 = np.zeros((esize, 1))
    b2 = np.zeros((vsize, 1))
    b3 = np.zeros((vsize, 1))
    bcat = np.zeros((cat_size, 1))

    # "Unroll" (flatten and concatenate) all parameters so they can be handed
    # to an optimizer that works on a single vector.
    return neurolib.flatten_params(W1, W2, W3, W4, b1, b2, b3, Wcat, bcat, We)
def initialize_params(esize, vsize, cat_size, dictionary_length):
    """Randomly initialize all model parameters and return them flattened.

    Accepts
        esize              integer size of the embedded representation.
        vsize              integer size of the input into the autoencoder
                           (typically a multiple of esize).
        cat_size           integer dimensionality of the multinomial categories.
        dictionary_length  integer number of words in the vocabulary.

    Returns the result of neurolib.flatten_params applied to
    (W1, W2, W3, W4, b1, b2, b3, Wcat, bcat, We), in that order
    (presumably a single flat parameter vector -- confirm against neurolib).
    """
    # Weights are chosen uniformly from the interval [-r, r].
    r = np.sqrt(6) / np.sqrt(esize + vsize + 1)

    def uniform(rows, cols):
        # One (rows, cols) matrix of samples from [-r, r].
        return np.random.rand(rows, cols) * 2 * r - r

    # Matrices are drawn in a fixed order so seeded runs stay reproducible.
    W1 = uniform(esize, vsize)
    W2 = uniform(esize, vsize)
    W3 = uniform(vsize, esize)
    W4 = uniform(vsize, esize)
    We = 1e-3 * uniform(esize, dictionary_length)  # embeddings start tiny
    Wcat = uniform(cat_size, esize)

    # The shape must be passed as a tuple: np.zeros(n, 1) would treat the 1
    # as a dtype and raise TypeError.
    b1 = np.zeros((esize, 1))
    b2 = np.zeros((vsize, 1))
    b3 = np.zeros((vsize, 1))
    bcat = np.zeros((cat_size, 1))

    # "Unroll" (flatten and concatenate) all parameters so they can be handed
    # to an optimizer that works on a single vector.
    return neurolib.flatten_params(W1, W2, W3, W4, b1, b2, b3, Wcat, bcat, We)
def initialize_params(hidden_size, visible_size):
    """Randomly initialize sparse-autoencoder parameters and flatten them.

    Accepts
        hidden_size   number of hidden units in the sparse encoder.
        visible_size  number of input units in the sparse encoder; the
                      function asserts hidden_size <= visible_size.

    Returns the result of neurolib.flatten_params(W1, W2, b1, b2), where
    W1 is (hidden_size, visible_size), W2 is (visible_size, hidden_size),
    b1 has hidden_size zeros and b2 has visible_size zeros (presumably one
    flat vector holding all of those entries -- confirm against neurolib).
    """
    assert hidden_size <= visible_size

    # Weights are sampled uniformly from the interval [-r, r].
    r = np.sqrt(6) / np.sqrt(hidden_size + visible_size + 1)

    def draw(rows, cols):
        return np.random.rand(rows, cols) * 2 * r - r

    encoder_weights = draw(hidden_size, visible_size)
    decoder_weights = draw(visible_size, hidden_size)
    encoder_bias = np.zeros(hidden_size)
    decoder_bias = np.zeros(visible_size)

    # "Unroll" (flatten and concatenate) every parameter into one vector so
    # it can be passed to an optimizer that expects a flat argument.
    # TODO: jperla: make this a function
    return neurolib.flatten_params(
        encoder_weights, decoder_weights, encoder_bias, decoder_bias
    )
def cost(theta, visible_size, hidden_size, weight_decay, sparsity_param, beta, data):
    """Evaluate the sparse-autoencoder objective and its gradient at theta.

    theta           flat parameter vector; unpacked into (W1, W2, b1, b2)
                    by unflatten_params.
    visible_size    number of input units (rows of data).
    hidden_size     number of hidden units.
    weight_decay    weight decay coefficient (lambda in the lecture notes).
    sparsity_param  desired average activation of the hidden units (rho).
    beta            weight of the sparsity penalty term.
    data            training matrix with one example per column, so
                    data[:, i] is the i-th training example.

    Returns (cost, grad) where grad is
    neurolib.flatten_params(W1grad, W2grad, b1grad, b2grad).
    """
    rho = float(sparsity_param)
    W1, W2, b1, b2 = unflatten_params(theta, hidden_size, visible_size)
    num_data = data.shape[1]
    m = float(num_data)

    # ---- forward pass -----------------------------------------------------
    # T(...) presumably turns a bias vector into a column so it broadcasts
    # across examples -- confirm against its definition.
    hidden = sigmoid(np.dot(W1, data) + T(b1))
    output = sigmoid(np.dot(W2, hidden) + T(b2))
    assert hidden.shape == (hidden_size, num_data)
    assert output.shape == (visible_size, num_data)

    # ---- objective: reconstruction + weight decay + sparsity penalty -------
    reconstruction = 1.0 / num_data * (0.5 * np.sum((output - data)**2))
    decay = (0.5 * weight_decay) * (np.sum(W1**2) + np.sum(W2**2))

    # Mean activation of each hidden unit over the data set (rho-hat).
    rho_hat = np.sum(hidden, axis=1) / m
    assert rho_hat.shape == (hidden_size,)
    penalty = beta * np.sum(binary_KL_divergence(rho, rho_hat))

    total = reconstruction + decay + penalty

    # ---- backward pass ----------------------------------------------------
    # Output layer: error times the sigmoid derivative.
    delta_out = -(data - output) * output * (1 - output)
    assert delta_out.shape == (visible_size, num_data)

    # Hidden layer: back-propagated error, plus the sparsity term, all
    # scaled by the sigmoid derivative of the hidden activations.
    delta_hidden = np.dot(W2.T, delta_out)
    delta_hidden += T(beta * ((-rho / rho_hat) + ((1 - rho) / (1 - rho_hat))))
    delta_hidden *= hidden * (1 - hidden)
    assert delta_hidden.shape == (hidden_size, num_data)

    # ---- gradients (averaged over examples, weights also get decay) -------
    W1grad = np.dot(delta_hidden, data.T) / m
    W1grad += weight_decay * W1
    W2grad = np.dot(delta_out, hidden.T) / m
    W2grad += weight_decay * W2
    b1grad = np.sum(delta_hidden, axis=1) / m
    b2grad = np.sum(delta_out, axis=1) / m

    for gradient, parameter in ((W1grad, W1), (W2grad, W2), (b1grad, b1), (b2grad, b2)):
        assert gradient.shape == parameter.shape

    return total, neurolib.flatten_params(W1grad, W2grad, b1grad, b2grad)
def cost(theta, visible_size, hidden_size, weight_decay, sparsity_param, beta, data):
    """Compute the sparse-autoencoder cost and gradient for parameters theta.

    Arguments
        theta           flat parameter vector (the optimizer works on a
                        vector); unflatten_params splits it into W1, W2, b1, b2.
        visible_size    number of input units.
        hidden_size     number of hidden units.
        weight_decay    weight decay parameter (lambda).
        sparsity_param  desired average hidden activation (rho).
        beta            weight of the sparsity penalty.
        data            matrix of training examples, one per column.

    Returns a (cost, grad) pair; grad is the gradient of the cost packed by
    neurolib.flatten_params in the order W1, W2, b1, b2.
    """
    target_sparsity = float(sparsity_param)
    W1, W2, b1, b2 = unflatten_params(theta, hidden_size, visible_size)
    num_data = data.shape[1]

    # Feed forward through the two sigmoid layers.
    act_hidden = sigmoid(np.dot(W1, data) + T(b1))
    act_out = sigmoid(np.dot(W2, act_hidden) + T(b2))
    assert act_hidden.shape == (hidden_size, num_data)
    assert act_out.shape == (visible_size, num_data)

    # Average squared reconstruction error.
    squared_error = np.sum((act_out - data)**2)
    objective = 1.0 / num_data * (0.5 * squared_error)

    # Weight decay on both weight matrices (biases are not regularised).
    weight_norm = np.sum(W1**2) + np.sum(W2**2)
    objective += (0.5 * weight_decay) * weight_norm

    # Sparsity penalty on the average activation of each hidden unit.
    mean_activation = np.sum(act_hidden, axis=1) / float(num_data)
    assert mean_activation.shape == (hidden_size,)
    kl_total = np.sum(binary_KL_divergence(target_sparsity, mean_activation))
    objective += beta * kl_total

    # Error signal at the output layer.
    err_out = -(data - act_out) * act_out * (1 - act_out)
    assert err_out.shape == (visible_size, num_data)

    # Error signal at the hidden layer:
    #   1. propagate the output error back through W2,
    #   2. add the derivative of the sparsity penalty,
    #   3. multiply by the sigmoid derivative.
    sparsity_grad = beta * (
        (-target_sparsity / mean_activation)
        + ((1 - target_sparsity) / (1 - mean_activation))
    )
    err_hidden = np.dot(W2.T, err_out)
    err_hidden += T(sparsity_grad)
    err_hidden *= act_hidden * (1 - act_hidden)
    assert err_hidden.shape == (hidden_size, num_data)

    # Average the per-example gradients, then add weight decay to the weights.
    W1grad = np.dot(err_hidden, data.T) / float(num_data)
    W2grad = np.dot(err_out, act_hidden.T) / float(num_data)
    W1grad += weight_decay * W1
    W2grad += weight_decay * W2
    b1grad = np.sum(err_hidden, axis=1) / float(num_data)
    b2grad = np.sum(err_out, axis=1) / float(num_data)

    # Every gradient must line up with the parameter it belongs to.
    assert W1grad.shape == W1.shape
    assert W2grad.shape == W2.shape
    assert b1grad.shape == b1.shape
    assert b2grad.shape == b2.shape

    packed_gradient = neurolib.flatten_params(W1grad, W2grad, b1grad, b2grad)
    return objective, packed_gradient