def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models """ # Implement the cost and gradients for one predicted word vector # and one target word vector as a building block for word2vec # models, using the negative sampling technique. K is the sample # size. You might want to use dataset.sampleTokenIdx() to sample # a random word index. # # Note: See test_word2vec below for dataset's initialization. # # Input/Output Specifications: same as softmaxCostAndGradient # We will not provide starter code for this function, but feel # free to reference the code you previously wrote for this # assignment! ### YOUR CODE HERE grad = np.zeros(outputVectors.shape) gradPred = np.zeros(predicted.shape) # indices = [target] # for k in xrange(K): # newidx = dataset.sampleTokenIdx() # while newidx == target: # newidx = dataset.sampleTokenIdx() # indices += [newidx] # # labels = np.array([1] + [-1 for k in xrange(K)]) # vecs = outputVectors[indices,:] # # t = sigmoid(vecs.dot(predicted) * labels) # cost = -np.sum(np.log(t)) # # delta = labels * (t - 1) # gradPred = delta.reshape((1,K+1)).dot(vecs).flatten() # gradtemp = delta.reshape((K+1,1)).dot(predicted.reshape( # (1,predicted.shape[0]))) # for k in xrange(K+1): # grad[indices[k]] += gradtemp[k,:] t = sigmoid(predicted.dot(outputVectors[target,:])) cost = -np.log(t) delta = t - 1 gradPred += delta * outputVectors[target, :] grad[target, :] += delta * predicted for k in xrange(K): idx = dataset.sampleTokenIdx() t = sigmoid(-predicted.dot(outputVectors[idx,:])) cost += -np.log(t) delta = 1 - t gradPred += delta * outputVectors[idx, :] grad[idx, :] += delta * predicted ### END YOUR CODE return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10):
    # Requires: from collections import Counter
    klist = []
    for k in range(K):
        randomId = dataset.sampleTokenIdx()
        while randomId == target:
            randomId = dataset.sampleTokenIdx()
        klist.append(randomId)

    u0 = outputVectors[target]
    uks = -outputVectors[klist]
    vc = predicted
    U = np.vstack((u0, uks))
    dot = U.dot(vc)
    sigmoid_value = sigmoid(dot)
    cost = -np.sum(np.log(sigmoid_value))
    # print("cost is %f" % (cost,))

    gradPred = np.zeros(predicted.shape)
    grad = np.zeros(outputVectors.shape)
    temp = sigmoid(u0.dot(vc)) - 1
    intermediate = (sigmoid_value - 1).reshape(-1, 1) * np.vstack((u0, -uks))
    gradPred += intermediate[0] - np.sum(intermediate[1:], axis=0)
    grad[target] += temp * vc

    # Collapse repeated negative samples with a Counter and scale by frequency.
    counter_dictionary = Counter(klist)
    unique_ks = list(counter_dictionary.keys())
    frequency_count = np.array(list(counter_dictionary.values()))
    grad[unique_ks] += (sigmoid(-outputVectors[unique_ks].dot(vc)) - 1).reshape(-1, 1) * -vc
    grad[unique_ks] *= frequency_count.reshape(-1, 1)

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models """ # Implement the cost and gradients for one predicted word vector # and one target word vector as a building block for word2vec # models, using the negative sampling technique. K is the sample # size. # # Input/Output Specifications: same as softmaxCostAndGradient u_o = outputVectors[target,:] sigmoid_o = sigmoid(np.dot(u_o, predicted)) cost = - np.log(sigmoid_o) gradPred = -u_o*(1-sigmoid_o) grad = np.zeros_like(outputVectors) grad[target,:] = - predicted*(1-sigmoid_o) for _ in range(K): k = dataset.sampleTokenIdx() while k == target: k = dataset.sampleTokenIdx() sigmoid_k = sigmoid(-np.dot(outputVectors[k,:],predicted)) cost += - np.log(sigmoid_k) gradPred += outputVectors[k,:] * (1-sigmoid_k) grad[k,:] += predicted * (1-sigmoid_k) return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models """ # Implement the cost and gradients for one predicted word vector # and one target word vector as a building block for word2vec # models, using the negative sampling technique. K is the sample # size. You might want to use dataset.sampleTokenIdx() to sample # a random word index. # # Note: See test_word2vec below for dataset's initialization. # # Input/Output Specifications: same as softmaxCostAndGradient # We will not provide starter code for this function, but feel # free to reference the code you previously wrote for this # assignment! ### YOUR CODE HERE negativeSamples = [dataset.sampleTokenIdx() for i in range(K)] sigmoidTargetPred = sigmoid(outputVectors[target,:].transpose().dot(predicted)) cost = -np.log(sigmoidTargetPred) gradPred = (sigmoidTargetPred - 1.0)*outputVectors[target,:] grad = np.zeros(outputVectors.shape) grad[target,:] = predicted * (sigmoidTargetPred - 1.0) for sample in negativeSamples: sigmoidSamplePredicted = sigmoid(-outputVectors[sample,:].transpose().dot(predicted)) cost -= np.log(sigmoidSamplePredicted) gradPred += (1.0 - sigmoidSamplePredicted)*outputVectors[sample,:] grad[sample,:] += (1.0 - sigmoidSamplePredicted)*predicted.transpose() ### END YOUR CODE return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models """ # Implement the cost and gradients for one predicted word vector # and one target word vector as a building block for word2vec # models, using the negative sampling technique. K is the sample # size. You might want to use dataset.sampleTokenIdx() to sample # a random word index. # # Note: See test_word2vec below for dataset's initialization. # # Input/Output Specifications: same as softmaxCostAndGradient # We will not provide starter code for this function, but feel # free to reference the code you previously wrote for this # assignment! ### YOUR CODE HERE grad = np.zeros(outputVectors.shape) s = sigmoid(np.dot(outputVectors[target,:], predicted)) cost = -np.log(s) gradPred = - sigmoid_grad(s)/s*outputVectors[target,:] grad[target,:] = - sigmoid_grad(s)/s*predicted for k in range(K): i = dataset.sampleTokenIdx() s = sigmoid( - np.dot(outputVectors[i,:], predicted)) cost -= np.log(s) gradPred += sigmoid_grad(s)/s*outputVectors[i,:] grad[i,:] += sigmoid_grad(s)/s*predicted ### END YOUR CODE return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models """ # Implement the cost and gradients for one predicted word vector # and one target word vector as a building block for word2vec # models, using the negative sampling technique. K is the sample # size. You might want to use dataset.sampleTokenIdx() to sample # a random word index. # # Note: See test_word2vec below for dataset's initialization. # # Input/Output Specifications: same as softmaxCostAndGradient # We will not provide starter code for this function, but feel # free to reference the code you previously wrote for this # assignment! ### YOUR CODE HERE ''' Keep track of dims: D - dim of word vector V - number of words predicted : (D, ) target : integer outputVectors : (V, D) cost : float gradPred : (D, ) grad : (V, D) ''' predicted = predicted.reshape(-1, 1) # (D ,1) sampledIndices = [dataset.sampleTokenIdx() for i in xrange(K)] sampledVectors = outputVectors[sampledIndices, :] # (K, D) outputVec = outputVectors[target, :] # (D, ) prob_out = sigmoid(outputVec.dot(predicted)) # float probs_negative = sigmoid(-sampledVectors.dot(predicted)) # (K, 1) cost = - np.log(prob_out) - np.sum(np.log(probs_negative)) gradPred = (prob_out - 1) * outputVec - np.sum((probs_negative-1) * sampledVectors, axis=0) # (D, ) grad = np.zeros_like(outputVectors) # (V, D) grad[target, :] = (prob_out - 1) * predicted.reshape(-1) # Note that sampledIndices may have repeated indices, we may loop over all K samples. # And target should not be appeared in sampledIndices, but it's ok in gradient check, # because we use += to update each output vector, and no grads will be missed. for i in xrange(K): grad[sampledIndices[i], :] += (1 - probs_negative[i]) * predicted.reshape(-1) ### END YOUR CODE return cost, gradPred, grad
def test_sigmoid_permutation_axis1(dim_1):
    a1 = np.random.normal(size=(1, dim_1))
    s1 = sigmoid(a1)

    permutation = np.random.permutation(dim_1)
    inverse_permutation = np.argsort(permutation)

    s1_perm = sigmoid(a1.ravel()[permutation])
    assert rel_error(s1_perm.ravel()[inverse_permutation], s1) <= 1e-8
def test_sigmoid_gradient(dim_1, dim_2):
    a1 = np.random.normal(loc=0., scale=20., size=(dim_1, dim_2))
    shift = np.random.uniform(low=1e-9, high=1e-5, size=(dim_1, dim_2))
    ap = a1 + shift
    am = a1 - shift

    dsigmoid = (sigmoid(ap) - sigmoid(am)) / (2 * shift)
    assert np.abs(np.max(dsigmoid - sigmoid_grad(sigmoid(a1)))) <= 1e-7
    assert np.abs(np.min(dsigmoid - sigmoid_grad(sigmoid(a1)))) <= 1e-7
def test_sigmoid_shape(dim):
    testing_shape = []
    for y in range(0, dim):
        testing_shape.append(np.random.randint(3, 8))
    shape = tuple(testing_shape)

    # z = np.random.randn(*testing_shape)
    x = np.random.standard_normal(shape)
    y = np.copy(x)
    assert x.shape == sigmoid(y).shape
    assert x.shape == sigmoid_grad(sigmoid(y)).shape
def test_sigmoid_permutation_axis0(dim_1, execution_number):
    """ sigmoid needs to be applied element-wise; """
    a1 = np.random.normal(size=(dim_1, 1))
    s1 = sigmoid(a1)

    permutation = np.random.permutation(dim_1)
    inverse_permutation = np.argsort(permutation)

    s1_perm = sigmoid(a1[permutation])
    assert rel_error(s1_perm[inverse_permutation], s1) <= 1e-8
def sigmoid_forward(x):
    """
    Computes the forward pass for a sigmoid activation.

    Inputs:
    - x: Input data, numpy array of arbitrary shape;

    Returns a tuple (out, cache)
    - out: output of the same shape as x
    - cache: identical to out; required for backpropagation
    """
    out = sigmoid(x)
    return out, out
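# Since sigmoid_forward caches its output for backpropagation, a matching backward
# pass is implied but not shown in this section. A minimal sketch under that
# assumption; the names `sigmoid_backward` and `dout` are illustrative, not from
# the original code.
def sigmoid_backward(dout, cache):
    """
    Backward pass for a sigmoid activation.

    Inputs:
    - dout: upstream gradient, same shape as the forward output
    - cache: the sigmoid output saved by sigmoid_forward

    Returns:
    - dx: gradient with respect to the input x
    """
    out = cache
    # d(sigmoid(x))/dx = sigmoid(x) * (1 - sigmoid(x)), evaluated from the cached output.
    dx = dout * out * (1.0 - out)
    return dx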
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models """ # Implement the cost and gradients for one predicted word vector # and one target word vector as a building block for word2vec # models, using the negative sampling technique. K is the sample # size. You might want to use dataset.sampleTokenIdx() to sample # a random word index. # # Note: See test_word2vec below for dataset's initialization. # # Input/Output Specifications: same as softmaxCostAndGradient # We will not provide starter code for this function, but feel # free to reference the code you previously wrote for this # assignment! ### YOUR CODE HERE v_c = predicted u_o = outputVectors[target] suv = sigmoid(u_o.dot(v_c)) pos = np.log(suv) # positive sample # sample w/ dataset.sampleTokenIdx() method iteratively n = [] # indexes of negative samples while len(n) < K: x = dataset.sampleTokenIdx() if x != target: n.append(x) neg_samples = outputVectors[n] skv = sigmoid((neg_samples.dot(v_c))) neg = np.sum(np.log(1-skv)) cost = -pos - neg # neg_samples: K x d, skv: 1 x K gradPred = -(1 - suv) * u_o + (neg_samples.T * (skv)).sum(axis=1) grad = np.zeros(outputVectors.shape) grad[target] += -(1-suv) * v_c negGrad = np.outer(skv, v_c) # sum grads together when they have been sampled with replacement for i,x in enumerate(n): grad[x] += negGrad[i] ### END YOUR CODE return cost, gradPred, grad
def test_sigmoidgrad():
    """ Original sigmoid gradient test defined in q2_sigmoid.py; """
    x = np.array([[1, 2], [-1, -2]])
    f = sigmoid(x)
    g = sigmoid_grad(f)
    assert rel_error(g, np.array([[0.19661193, 0.10499359],
                                  [0.19661193, 0.10499359]])) <= 1e-7
def forward_backward_prop(data, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, and backward propagation for the gradients for all parameters. Arguments: data -- M x Dx matrix, where each row is a training example. labels -- M x Dy matrix, where each row is a one-hot vector. params -- Model parameters, these are unpacked for you. dimensions -- A tuple of input dimension, number of hidden units and output dimension """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) T = data.shape[0] ### YOUR CODE HERE: forward propagation z1 = np.dot(data, W1) + b1 # MxH + 1xH = MxH h = sigmoid(z1) z2 = np.dot(h, W2) + b2 # MxDy + 1xDy = MxDy y_ = softmax(z2) # MxDy cost = -1*np.sum(np.log(y_)*labels)/T #raise NotImplementedError ### END YOUR CODE ### YOUR CODE HERE: backward propagation dz2 = (y_ - labels)/T # MxDy db2 = np.sum(dz2, axis=0) # 1xDy dh = np.dot(dz2, W2.T) # MxH dW2 = np.dot(h.T, dz2) # HxDy dz1 = h*(1-h)*dh # MxH db1 = np.sum(dz1, axis=0) # 1xH dW1 = np.dot(data.T, dz1) # Dx x H gradb2 = db2 gradW2 = dW2 gradb1 = db1 gradW1 = dW1 ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) #print "Cost: %f \t grad[0] %f, grad[1] %f" % (cost, grad[0], grad[1]) return cost, grad
def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    the backward propagation for the gradients for all parameters.

    Notice the gradients computed here are different from the gradients in
    the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation
    h = sigmoid(np.dot(X, W1) + b1)         # (M, H)
    yhat = softmax(np.dot(h, W2) + b2)      # (M, Dy)
    cross_entropy = -np.log(yhat)[labels == 1]
    cost = np.sum(cross_entropy)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    # Gradient of the summed cross-entropy loss w.r.t. the pre-softmax scores.
    dscores = yhat - labels                 # (M, Dy)

    gradW2 = np.dot(h.T, dscores)           # (H, Dy)
    gradb2 = np.sum(dscores, axis=0, keepdims=True)   # (1, Dy)

    dh = np.dot(dscores, W2.T)              # (M, H)
    dz1 = dh * sigmoid_grad(h)              # (M, H)

    gradW1 = np.dot(X.T, dz1)               # (Dx, H)
    gradb1 = np.sum(dz1, axis=0, keepdims=True)       # (1, H)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
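# A numerical gradient check is the usual way to validate implementations like the
# one above. A minimal sketch, assuming `forward_backward_prop` as defined here
# together with sigmoid/softmax helpers; the checker function, dimensions, random
# data, and tolerance are all illustrative, not from the original code.
import numpy as np

def numeric_grad_check(f, params, eps=1e-5, tol=1e-5):
    """Compare the analytic gradient returned by f with central differences."""
    _, analytic = f(params)
    numeric = np.zeros_like(params)
    for i in range(params.size):
        old = params[i]
        params[i] = old + eps
        cost_plus, _ = f(params)
        params[i] = old - eps
        cost_minus, _ = f(params)
        params[i] = old
        numeric[i] = (cost_plus - cost_minus) / (2 * eps)
    assert np.max(np.abs(numeric - analytic)) < tol

# Illustrative usage with small random data and one-hot labels.
dimensions = (10, 5, 10)
M = 20
X = np.random.randn(M, dimensions[0])
labels = np.zeros((M, dimensions[2]))
labels[np.arange(M), np.random.randint(0, dimensions[2], M)] = 1
n_params = (dimensions[0] + 1) * dimensions[1] + (dimensions[1] + 1) * dimensions[2]
params = np.random.randn(n_params)

numeric_grad_check(lambda p: forward_backward_prop(X, labels, p, dimensions), params)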
def test_sigmoid():
    """ Original sigmoid test defined in q2_sigmoid.py; """
    x = np.array([[1, 2], [-1, -2]])
    f = sigmoid(x)
    assert rel_error(f, np.array([[0.73105858, 0.88079708],
                                  [0.26894142, 0.11920292]])) <= 1e-7
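# The tests above assume `sigmoid` and `sigmoid_grad` from q2_sigmoid.py, which are
# not reproduced in this section. A minimal sketch of what the tests expect: an
# element-wise sigmoid, and a gradient expressed in terms of the sigmoid *output*,
# matching the `sigmoid_grad(sigmoid(x))` call pattern used throughout.
import numpy as np

def sigmoid(x):
    """Element-wise sigmoid; works on scalars and arrays of any shape."""
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_grad(s):
    """Sigmoid gradient, given s = sigmoid(x) rather than x itself."""
    return s * (1.0 - s)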
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models Implement the cost and gradients for one predicted word vector and one target word vector as a building block for word2vec models, using the negative sampling technique. K is the sample size. Note: See test_word2vec below for dataset's initialization. Arguments/Return Specifications: same as softmaxCostAndGradient """ # Sampling of indices is done for you. Do not modify this if you # wish to match the autograder and receive points! indices = [target] indices.extend(getNegativeSamples(target, dataset, K)) ### YOUR CODE HERE u_o, v_c = outputVectors[target], outputVectors[indices[1:]] loss = -np.log(sigmoid(np.matmul(u_o, predicted))) print(u_o) print(v_c) ### END YOUR CODE return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, and backward propagation for the gradients for all parameters. Arguments: data -- M x Dx matrix, where each row is a training example. labels -- M x Dy matrix, where each row is a one-hot vector. params -- Model parameters, these are unpacked for you. dimensions -- A tuple of input dimension, number of hidden units and output dimension """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) ### YOUR CODE HERE: forward propagation layer1 = data.dot(W1) + b1 activation1 = sigmoid(layer1) layer2 = activation1.dot(W2) + b2 predictions = softmax(layer2) softmaxs = np.sum(predictions * labels, axis=1) cost = -np.log(softmaxs) num_train = data.shape[0] cost = np.sum(cost) / num_train ### END YOUR CODE ### YOUR CODE HERE: backward propagation dC = 1.0 dLayer2 = dC / num_train * predictions - dC / num_train * labels gradb2 = np.sum(dLayer2, axis=0) gradW2 = activation1.T.dot(dLayer2) dActivation1 = dLayer2.dot(W2.T) dLayer1 = dActivation1 * sigmoid_grad(activation1) gradb1 = np.sum(dLayer1, axis=0) gradW1 = data.T.dot(dLayer1) ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) return cost, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10):
    grad = np.zeros(outputVectors.shape)
    gradPred = np.zeros(predicted.shape)

    indices = [target]
    for k in range(K):
        newidx = dataset.sampleTokenIdx()
        while newidx == target:
            newidx = dataset.sampleTokenIdx()
        indices += [newidx]

    labels = np.array([1] + [-1 for k in range(K)])
    vecs = outputVectors[indices, :]

    t = sigmoid(vecs.dot(predicted) * labels)
    cost = -np.sum(np.log(t))

    delta = labels * (t - 1)
    gradPred = delta.reshape((1, K + 1)).dot(vecs).flatten()
    gradtemp = delta.reshape((K + 1, 1)).dot(predicted.reshape((1, predicted.shape[0])))
    for k in range(K + 1):
        grad[indices[k]] += gradtemp[k, :]

    return cost, gradPred, grad
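# The negative-sampling implementations above all expect a `dataset` object exposing
# sampleTokenIdx(). A minimal smoke test that checks output shapes with a tiny dummy
# dataset; the class name, vocabulary size, and seeds are illustrative, not from the
# original code.
import random
import numpy as np

class DummyDataset(object):
    """Tiny stand-in for the assignment's dataset object."""
    def __init__(self, vocab_size):
        self.vocab_size = vocab_size

    def sampleTokenIdx(self):
        return random.randint(0, self.vocab_size - 1)

random.seed(42)
np.random.seed(42)

V, D = 7, 3
dataset = DummyDataset(V)
outputVectors = np.random.randn(V, D)
predicted = np.random.randn(D)

cost, gradPred, grad = negSamplingCostAndGradient(predicted, 2, outputVectors, dataset, K=5)
print(cost, gradPred.shape, grad.shape)   # scalar cost, (D,) and (V, D) gradients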
def forward_backward_prop(data, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, and backward propagation for the gradients for all parameters. Arguments: data -- M x Dx matrix, where each row is a training example. labels -- M x Dy matrix, where each row is a one-hot vector. params -- Model parameters, these are unpacked for you. dimensions -- A tuple of input dimension, number of hidden units and output dimension """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) ### YOUR CODE HERE: forward propagation N = data.shape[0] a1 = data z2 = np.matmul(a1, W1) + b1 a2 = sigmoid(z2) z3 = np.matmul(a2, W2) + b2 a3 = softmax(z3) ycap = a3 cost = -np.sum(labels * np.log(ycap)) #raise NotImplementedError ### END YOUR CODE ### YOUR CODE HERE: backward propagation a3grad = a3 - labels # this is the grad for softmax gradW2 = np.dot(a2.T, a3grad) gradb2 = np.sum(a3grad, axis=0, keepdims=True) t = np.dot(W2, a3grad.T) * sigmoid_grad(a2).T gradW1 = np.dot(t, a1).T gradb1 = np.sum(np.dot(a3grad, W2.T) * sigmoid_grad(a2), axis=0, keepdims=True) #raise NotImplementedError ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) return cost, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models Implement the cost and gradients for one predicted word vector and one target word vector as a building block for word2vec models, using the negative sampling technique. K is the sample size. Note: See test_word2vec below for dataset's initialization. Arguments/Return Specifications: same as softmaxCostAndGradient """ # Sampling of indices is done for you. Do not modify this if you # wish to match the autograder and receive points! indices = [target] indices.extend(getNegativeSamples(target, dataset, K)) ### YOUR CODE HERE prod = sigmoid(np.dot(outputVectors[target], predicted)) cost = -np.log(prod) - \ sum([np.log(sigmoid(-np.dot(outputVectors[i], predicted))) for i in indices[1:]]) gradPred = (prod - 1) * outputVectors[target] - \ sum([(sigmoid(-np.dot(outputVectors[i], predicted)) - 1) * outputVectors[i] for i in indices[1:]]) grad = np.zeros(outputVectors.shape) grad[target] = (prod - 1) * predicted ### IMPORTANT: GRADIENTS FOR SAMPLED WORDS SHOULD BE ACCUMULATED BECAUSE THEY CAN APPEAR SEVERAL TIMES => -= for i in indices[1:]: grad[i] -= (sigmoid(-np.dot(outputVectors[i], predicted)) - 1) * predicted assert gradPred.shape == predicted.shape assert grad.shape == outputVectors.shape ### END YOUR CODE return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models Implement the cost and gradients for one predicted word vector and one target word vector as a building block for word2vec models, using the negative sampling technique. K is the sample size. Note: See test_word2vec below for dataset's initialization. Arguments/Return Specifications: same as softmaxCostAndGradient """ # Sampling of indices is done for you. Do not modify this if you # wish to match the autograder and receive points! indices = [target] indices.extend(getNegativeSamples(target, dataset, K)) gradPred = np.zeros_like(predicted) grad = np.zeros_like(outputVectors) ### YOUR CODE HERE V = outputVectors.shape[0] #vocabulary size y = np.zeros(V) y[target] = 1 similarity = np.dot(predicted, outputVectors[target].T) #Ut . Vc probability = sigmoid(similarity) # cost = -np.log(probability) cost = -np.sum(y * np.log(probability)) # context_word_vec = np.zeros((len(indices), outputVectors.shape[1])) gradPred = (probability - 1) * outputVectors[target] grad[target] = np.dot(probability - 1, predicted) i = 0 for i in indices[1:]: neg_similarity = np.dot(predicted, outputVectors[i].T) neg_probability = sigmoid(-neg_similarity) # cost += -np.log(neg_probability) cost += -np.sum(y * np.log(neg_probability)) gradPred += (1 - neg_probability) * outputVectors[i] grad[i] += np.dot((1 - neg_probability), predicted) ### END YOUR CODE return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models """ ### YOUR CODE HERE V, D = outputVectors.shape # get the k random indices k_indicies = [] for i in range(K): rand_index = dataset.sampleTokenIdx() # 有没有可能随机到正确的样本?有,但概率很小 k_indicies.append(rand_index) # loss function neg_sample_vector = outputVectors[k_indicies, :] # KxD assert neg_sample_vector.shape == (K, D) sigm_neg = sigmoid(-1.0 * np.dot(neg_sample_vector, predicted.reshape( (D, 1)))) # KxD Dx1 = Kx1 cost_neg = np.sum(np.log(sigm_neg), axis=0) sigm_cor = sigmoid(np.dot(outputVectors[target], predicted.reshape( (D, 1)))) cost = -1.0 * np.log(sigm_cor) - cost_neg # gradient on output vectors grad = np.zeros(outputVectors.shape) # V, D grad[target] = predicted * (sigm_cor - 1.0) # 1xD for k in k_indicies: grad[k, :] += -1.0 * predicted.reshape( (D, )) * (sigmoid(np.dot(-1.0 * predicted, outputVectors[k])) - 1.0) # gradient on input vector # 这里第一项减一跟公式有点不一样啊。。但是结果是对的。。1 - sigm_neg.reshape((1,K)) gradPred_neg = np.dot(1 - sigm_neg.reshape((1, K)), neg_sample_vector).reshape((1, D)) # 1xK KxD = 1xD gradPred_cor = (sigm_cor - 1) * outputVectors[target].reshape((1, D)) gradPred = gradPred_neg + gradPred_cor ### END YOUR CODE return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models Implement the cost and gradients for one predicted word vector and one target word vector as a building block for word2vec models, using the negative sampling technique. K is the sample size. Note: See test_word2vec below for dataset's initialization. Arguments/Return Specifications: same as softmaxCostAndGradient """ # Sampling of indices is done for you. Do not modify this if you # wish to match the autograder and receive points! indices = [target] indices.extend(getNegativeSamples(target, dataset, K)) ### YOUR CODE HERE #initialize variables cost = 0 gradPred = np.zeros(predicted.shape) grad = np.zeros(outputVectors.shape) output_word = outputVectors[target] target_sigmoid = sigmoid(np.dot(output_word, predicted)) cost = -np.log(target_sigmoid) gradPred = (target_sigmoid - 1.0) * output_word grad[target] = (target_sigmoid - 1.0) * predicted for index in indices: word = outputVectors[index] k_sigmoid = sigmoid(np.dot(-word, predicted)) cost -= np.log(k_sigmoid) gradPred += ((1.0 - k_sigmoid) * word) grad[index] += -((k_sigmoid - 1.0) * predicted) assert predicted.shape == gradPred.shape assert outputVectors.shape == grad.shape ### END YOUR CODE return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models Implement the cost and gradients for one predicted word vector and one target word vector as a building block for word2vec models, using the negative sampling technique. K is the sample size. Note: See test_word2vec below for dataset's initialization. Arguments/Return Specifications: same as softmaxCostAndGradient """ # Sampling of indices is done for you. Do not modify this if you # wish to match the autograder and receive points! indices = [target] # indices[0]=target, indices[1--K] = not target indices.extend(getNegativeSamples(target, dataset, K)) ### YOUR CODE HERE u_o = outputVectors[indices[0]] v_c = predicted # [dim] z = sigmoid(np.dot(u_o.T, v_c)) cost = -np.log(z) gradPred = np.zeros(np.shape(predicted)) gradPred += (z - 1.0) * u_o grad = np.zeros(np.shape(outputVectors)) # 按位乘 [1,2,3]*[1,2,3] = [1,4,9] # 对u_o求偏导 grad[target] += (z - 1.0) * v_c # for negative samples for k in range(K): u_k = outputVectors[indices[k + 1]] z = sigmoid(np.dot(u_k.T, v_c)) cost -= np.log(1.0 - z) gradPred += z * u_k grad[indices[k + 1]] += z * v_c ### END YOUR CODE return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models Implement the cost and gradients for one predicted word vector and one target word vector as a building block for word2vec models, using the negative sampling technique. K is the sample size. Note: See test_word2vec below for dataset's initialization. Arguments/Return Specifications: same as softmaxCostAndGradient """ # Sampling of indices is done for you. Do not modify this if you # wish to match the autograder and receive points! indices = [target] indices.extend(getNegativeSamples(target, dataset, K)) ### YOUR CODE HERE grad = np.zeros(outputVectors.shape) gradPred = np.zeros(predicted.shape) cost = 0 z = sigmoid(np.dot(outputVectors[target], predicted)) cost -= np.log(z) grad[target] += predicted * (z - 1.0) gradPred += outputVectors[target] * (z - 1.0) grad = np.zeros(outputVectors.shape) grad[target] = predicted * (sigmoid(outputVectors[target].dot(predicted)) - 1) gradPred = outputVectors[target] * ( sigmoid(outputVectors[target].dot(predicted)) - 1) cost = -np.log(sigmoid(outputVectors[target].dot(predicted))) for i in xrange(1, K + 1): negtive_index = indices[i] sig = sigmoid((outputVectors[negtive_index]).dot(predicted)) cost += -np.log(1 - sig) grad[negtive_index] += predicted * sig gradPred += outputVectors[negtive_index] * sig ### END YOUR CODE return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models """ # Implement the cost and gradients for one predicted word vector # and one target word vector as a building block for word2vec # models, using the negative sampling technique. K is the sample # size. You might want to use dataset.sampleTokenIdx() to sample # a random word index. # # Note: See test_word2vec below for dataset's initialization. # # Input/Output Specifications: same as softmaxCostAndGradient # We will not provide starter code for this function, but feel # free to reference the code you previously wrote for this # assignment! ### YOUR CODE HERE grad = np.zeros_like(outputVectors) gradPred = np.zeros_like(predicted) activate = sigmoid(np.dot(predicted.reshape(-1), outputVectors[target].T)) cost = 0 cost -= np.log(activate) grad[target:target + 1] = (activate - 1) * predicted gradPred += (activate - 1) * outputVectors[target] neg_samples = [] for i in range(K): idx = dataset.sampleTokenIdx() if (idx == target) or (idx in neg_samples): i -= 1 continue neg_samples.append(idx) neg_activate = sigmoid( -np.dot(predicted.reshape(-1), outputVectors[idx].T)) cost -= np.log(neg_activate) grad[idx:idx + 1] = -(neg_activate - 1) * predicted gradPred -= (neg_activate - 1) * outputVectors[idx] ### END YOUR CODE return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, and backward propagation for the gradients for all parameters. Arguments: data [input] -- M x Dx matrix, where each row is a training example. labels [expected O/p] -- M x Dy matrix, where each row is a one-hot vector. params -- Model parameters, these are unpacked for you. dimensions -- A tuple of input dimension, number of hidden units and output dimension """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) ### YOUR CODE HERE: forward propagation # M,Dx * (Dx,H) + H = M * H pre_h = data.dot(W1) + b1 h = sigmoid(pre_h) # M,H * H,Dy + Dy = M * Dy pre_Y = np.dot(h, W2) + b2 Y = softmax(pre_Y) # //cross entorpy cost cost = -np.sum(labels * np.log(Y)) # M * Dy ### END YOUR CODE ### YOUR CODE HERE: backward propagation #first grad of cost function delta_cost = Y - labels gradW2 = h.T.dot(delta_cost) gradb2 = np.sum(delta_cost, axis=0) delta_h = delta_cost.dot(W2.T) * sigmoid_grad(h) gradW1 = data.T.dot(delta_h) gradb1 = np.sum(delta_h, axis=0) ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) return cost, grad
def forward_backward_prop(X, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, the backward propagation for the gradients for all parameters. Notice the gradients computed here are different from the gradients in the assignment sheet: they are w.r.t. weights, not inputs. Arguments: X -- M x Dx matrix, where each row is a training example x. labels -- M x Dy matrix, where each row is a one-hot vector. params -- Model parameters, these are unpacked for you. dimensions -- A tuple of input dimension, number of hidden units and output dimension """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) # Note: compute cost based on `sum` not `mean`. ### YOUR CODE HERE: forward propagation #raise NotImplementedError ### END YOUR CODE h = sigmoid(X.dot(W1) + b1) y_h = softmax(h.dot(W2) + b2) ### YOUR CODE HERE: backward propagation cost = np.sum(-np.log(y_h[labels == 1])) / X.shape[0] delta1 = (y_h - labels) / X.shape[0] delta2 = delta1.dot(W2.transpose()) delta3 = sigmoid_grad(h) * delta2 #calculate gradient gradW1 = X.transpose().dot(delta3) gradb1 = np.sum(delta3, 0) gradW2 = h.transpose().dot(delta1) gradb2 = np.sum(delta1, 0, keepdims = True) #raise NotImplementedError ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) return cost, grad
def forward_backward_prop(X, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, the backward propagation for the gradients for all parameters. Notice the gradients computed here are different from the gradients in the assignment sheet: they are w.r.t. weights, not inputs. Arguments: X -- M x Dx matrix, where each row is a training example x. labels -- M x Dy matrix, where each row is a one-hot vector. params -- Model parameters, these are unpacked for you. dimensions -- A tuple of input dimension, number of hidden units and output dimension """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) # Note: compute cost based on `sum` not `mean`. ### YOUR CODE HERE: forward propagation z1 = np.dot(X,W1)+b1 # (M x H) h = sigmoid(z1) # (M x H) z2 = np.dot(h,W2)+b2 # (M x Dy) y_dash = softmax(z2) # (M x Dy) cost = - np.sum(labels* np.log(y_dash)) ### END YOUR CODE ### YOUR CODE HERE: backward propagation cost_theta = y_dash - labels gradW2 = np.dot(h.T, cost_theta) gradb2 = np.reshape(np.sum(cost_theta, axis = 0), b2.shape) cost_h = np.dot(cost_theta, W2.T) # (M x H) cost_z1 = sigmoid_grad(h) * cost_h # (M x H) gradW1 = np.dot(X.T, cost_z1) # (Dx x M)*(M x H) = (Dx x M) gradb1 = np.reshape(np.sum(cost_z1, axis = 0), b1.shape) ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) return cost, grad
def forward_backward_prop(data, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, and backward propagation for the gradients for all parameters. Arguments: data -- M x Dx matrix, where each row is a training example. labels -- M x Dy matrix, where each row is a one-hot vector. params -- Model parameters, these are unpacked for you. dimensions -- A tuple of input dimension, number of hidden units and output dimension """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) ### YOUR CODE HERE: forward propagation h = sigmoid( np.dot(data, W1) + b1 ) # when adding bias +b1 with dimentions(1, H), then we add each element of b1 to its correspond position in dot(data, W1) y_pred = softmax(np.dot(h, W2) + b2) ### END YOUR CODE ### YOUR CODE HERE: backward propagation cost = np.sum(-np.log(y_pred[labels == 1])) / data.shape[ 0] # cross entropy cost - average sum of all elements in y_pred where the correspondent in labels (y) is 1 but not 0 (label: one-hot vector) delta_3 = (y_pred - labels) / data.shape[ 0] # QUESTION: why dividing by data.shape[0] (number of examples) delta_2 = sigmoid_grad(h) * np.dot( delta_3, W2.T ) # compute delta using the dot product between the error (delta_3) and weights of second layer and the Hadamard product with the derivative of the activations gradW2 = np.dot(h.T, delta_3) gradb2 = np.sum( delta_3, 0, keepdims=True) # QUESTION: why summing the values of delta_3 gradW1 = np.dot(data.T, delta_2) gradb1 = np.sum( delta_2, 0, keepdims=True) # QUESTION: why summing the values of delta_2 ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) return cost, grad
def forward_backward_prop(X, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, the backward propagation for the gradients for all parameters. Notice the gradients computed here are different from the gradients in the assignment sheet: they are w.r.t. weights, not inputs. Arguments: X -- M x Dx matrix, where each row is a training example x. labels -- M x Dy matrix, where each row is a one-hot vector. params -- Model parameters, these are unpacked for you. dimensions -- A tuple of input dimension, number of hidden units and output dimension """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) # Note: compute cost based on `sum` not `mean`. ### YOUR CODE HERE: forward propagation hidden = sigmoid(np.dot(X, W1) + b1) # R: M*Dx * Dx*H + 1*H = M*H output = softmax(np.dot(hidden, W2) + b2) # R: M*H * H*Dy + 1*Dy = M*Dy cost = -np.sum(labels * np.log(output)) ### YOUR CODE HERE: backward propagation gradb2 = -labels + output # R: M*Dy - M*Dy = M*Dy gradW2 = hidden[:, :, np. newaxis] * gradb2[:, np. newaxis, :] # R: M*H*1 * M*1*Dy = M*H*Dy gradb1 = np.sum(gradb2[:, np.newaxis, :] * W2[np.newaxis, :, :], axis=2) * sigmoid_grad( hidden) # R: = sum(M*1*Dy * M*Dy*H) * M*H= M*H gradW1 = X[:, :, np.newaxis] * gradb1[:, np.newaxis, :] #R: M*H*1 * M*1*Dx = M*Dx*H gradb2 = np.sum(gradb2, axis=0) # sum by column gradb1 = np.sum(gradb1, axis=0) gradW1 = np.sum(gradW1, axis=0) gradW2 = np.sum(gradW2, axis=0) ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) return cost, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10): """ Negative sampling cost function for word2vec models """ # Implement the cost and gradients for one predicted word vector # and one target word vector as a building block for word2vec # models, using the negative sampling technique. K is the sample # size. You might want to use dataset.sampleTokenIdx() to sample # a random word index. # # Note: See test_word2vec below for dataset's initialization. # # Input/Output Specifications: same as softmaxCostAndGradient # We will not provide starter code for this function, but feel # free to reference the code you previously wrote for this # assignment! ### YOUR CODE HERE indices = [target] for k in xrange(K): sampleTokenIdx = dataset.sampleTokenIdx() while sampleTokenIdx == target: sampleTokenIdx = dataset.sampleTokenIdx() indices += [sampleTokenIdx] signs = np.array([1] + [-1 for k in xrange(K)]) vecs = outputVectors[indices, :] t = sigmoid(vecs.dot(predicted) * signs) delta = (t - 1) * signs cost = np.sum(-np.log(t)) gradPred = delta.reshape(1, K + 1).dot(vecs).flatten() grad = np.zeros(outputVectors.shape) gradtemp = delta.reshape((K+1,1)).dot(predicted.reshape( (1,predicted.shape[0]))) for k in xrange(K+1): grad[indices[k]] += gradtemp[k,:] # naive implementation but not efficient cause it makes |V| computation. # uv = outputVectors.dot(predicted) # negSamplesCost = 0 # negSampleGradPred = np.zeros(predicted.shape[0]) # grad = np.zeros(outputVectors.shape) # # for i in xrange(K): # sampleTokenIdx = dataset.sampleTokenIdx() # while sampleTokenIdx == target: # sampleTokenIdx = dataset.sampleTokenIdx() # negSamplesCost += np.log(sigmoid(-uv[sampleTokenIdx])) # negSampleGradPred += (sigmoid(-uv[sampleTokenIdx]) - 1) * outputVectors[sampleTokenIdx, :] # grad[sampleTokenIdx] += -(sigmoid(-uv[sampleTokenIdx]) - 1) * predicted # # cost = -np.log(sigmoid(uv[target])) - negSamplesCost # gradPred = (sigmoid(uv[target]) - 1) * outputVectors[target, :] - negSampleGradPred # grad[target] = (sigmoid(uv[target]) - 1) * predicted ### END YOUR CODE return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, and backward propagation for the gradients for all parameters. """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) ### YOUR CODE HERE: forward propagation # data: N x Dx, W1: Dx x H, b: 1 x H a = data.dot(W1) + b1 h = sigmoid(a) # h: N x H, W2: H x Dy, b2: 1 x Dy t = h.dot(W2) + b2 y_hat = softmax(t) # y_hat: N x Dy, labels: N x Dy (as int) probs = labels * y_hat cost = np.sum(-np.log(probs.sum(axis=1))) ### END YOUR CODE ### YOUR CODE HERE: backward propagation # obtain the softmax gradient dJdt = (y_hat - labels) # N x Dy # b2 grad is sum along each index of the Dy vectors gradb2 = np.sum(dJdt, 0) # h: N x H, dJdt: N x Dy gradW2 = h.T.dot(dJdt) # H x Dy # dJdt: N x Dy, W2: H x Dy dJdh = dJdt.dot(W2.T) # h: N x H dhda = sigmoid_grad(h) # data: N x Dx, dhda: N x H, DJdh: N x H gradW1 = data.T.dot(dhda * dJdh) # dhda: N x H, DJdh: N x H gradb1 = np.sum(dhda * dJdh, 0) ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) return cost, grad
def forward_backward_prop(data, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, and backward propagation for the gradients for all parameters. Arguments: data -- M x Dx matrix, where each row is a training example. labels -- M x Dy matrix, where each row is a one-hot vector. params -- Model parameters, these are unpacked for you. dimensions -- A tuple of input dimension, number of hidden units and output dimension """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) ### YOUR CODE HERE: forward propagation z = np.dot(data, W1) + b1 h = sigmoid(z) scores = np.dot(h, W2) + b2 probs = softmax(scores) cost = - np.sum(labels * np.log(probs)) ### END YOUR CODE ### YOUR CODE HERE: backward propagation dscores = probs - labels gradW2 = np.dot(h.T, dscores) gradb2 = np.sum(dscores, axis=0, keepdims=True) dh = np.dot(dscores, W2.T) dz = sigmoid_grad(h) * dh gradW1 = np.dot(data.T, dz) gradb1 = np.sum(dz, axis=0, keepdims=True) assert(np.all(gradW2.shape == W2.shape)) assert(np.all(gradb2.shape == b2.shape)) assert(np.all(gradW1.shape == W1.shape)) assert(np.all(gradb1.shape == b1.shape)) ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) return cost, grad
def forward_backward_prop(X, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, the backward propagation for the gradients for all parameters. Notice the gradients computed here are different from the gradients in the assignment sheet: they are w.r.t. weights, not inputs. Arguments: X -- M x Dx matrix, where each row is a training example x. labels -- M x Dy matrix, where each row is a one-hot vector. params -- Model parameters, these are unpacked for you. dimensions -- A tuple of input dimension, number of hidden units and output dimension """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) # Note: compute cost based on `sum` not `mean`. ### YOUR CODE HERE: forward propagation z1 = np.matmul(X, W1) + b1 a1 = sigmoid(z1) a2 = softmax(np.matmul(a1, W2) + b2) cost = (-np.log(a2) * labels).sum() ### END YOUR CODE ### YOUR CODE HERE: backward propagation dz2 = a2 - labels dw2 = np.matmul(a1.T, dz2) assert dw2.shape == (H, Dy) db2 = dz2.sum(0) assert db2.shape == (Dy, ) dz1 = np.matmul(dz2, W2.T) * sigmoid_grad(a1) dw1 = np.matmul(X.T, dz1) assert dw1.shape == (Dx, H) db1 = dz1.sum(0) assert db1.shape == (H, ) ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate( (dw1.flatten(), db1.flatten(), dw2.flatten(), db2.flatten())) return cost, grad
def forward_backward_prop(X, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, the backward propagation for the gradients for all parameters. Notice the gradients computed here are different from the gradients in the assignment sheet: they are w.r.t. weights, not inputs. Arguments: X -- M x Dx matrix, where each row is a training example x. labels -- M x Dy matrix, where each row is a one-hot vector. params -- Model parameters, these are unpacked for you. dimensions -- A tuple of input dimension, number of hidden units and output dimension """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H)) # (10, 5) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) # (1, 5) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) # (5, 10) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) # (1, 10) ### YOUR CODE HERE: forward propagation z1 = np.dot(X, W1) + b1 # (20, 5) a1 = sigmoid(z1) # (20, 5) scores = np.dot(a1, W2) + b2 # (20, 10) y_pred = softmax(scores) cost = -np.sum(labels * np.log(y_pred)) # ### END YOUR CODE # ### YOUR CODE HERE: backward propagation dscores = y_pred - labels # (20, 10) gradW2 = np.dot(a1.T, dscores) # (5, 10) # gradW2 bp step 1: back to d(softmax(theta))/d(theta), this is equal to y-bar - y, which is dscores defined in line 52 # gradW2 bp step 2: back to d(W2*X2)/d(W2), this is equal to X2 # so combine 2 steps together, gradW2 = X2.T * (y-bar - y) gradb2 = np.sum(dscores, axis=0) # (1, 10) da1 = np.dot(dscores, W2.T) dz1 = sigmoid_grad(a1)*da1 gradW1 = np.dot(X.T, dz1) gradb1 = np.sum(dz1, axis=0) ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) return cost, grad # cost is a single number, grad is a nparray
def forward_backward_prop(data, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, and backward propagation for the gradients for all parameters. """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) ### 前向运算 N = Dx # 第一个隐层做内积 a1 = sigmoid(data.dot(W1) + b1) # 第二个隐层做内积 a2 = softmax(a1.dot(W2) + b2) cost = - np.sum(np.log(a2[labels == 1]))/N ### 反向传播 # Calculate analytic gradient for the cross entropy loss function grad_a2 = ( a2 - labels ) / N # Backpropagate through the second latent layer gradW2 = np.dot( a1.T, grad_a2 ) gradb2 = np.sum( grad_a2, axis=0, keepdims=True ) # Backpropagate through the first latent layer grad_a1 = np.dot( grad_a2, W2.T ) * sigmoid_grad(a1) gradW1 = np.dot( data.T, grad_a1 ) gradb1 = np.sum( grad_a1, axis=0, keepdims=True ) # if verbose: # Verbose mode for logging information # print ("W1 shape: {}".format( str(W1.shape) )) # print ("W1 gradient shape: {}".format( str(gradW1.shape) )) # print ("b1 shape: {}".format( str(b1.shape) )) # print ("b1 gradient shape: {}".format( str(gradb1.shape) )) ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) return cost, grad
def forward_backward_prop(data, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, and backward propagation for the gradients for all parameters. Arguments: data -- M x Dx matrix, where each row is a training example. labels -- M x Dy matrix, where each row is a one-hot vector. params -- Model parameters, these are unpacked for you. dimensions -- A tuple of input dimension, number of hidden units and output dimension """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) ### YOUR CODE HERE: forward propagation h = sigmoid(data.dot(W1) + b1) y_hat = softmax(h.dot(W2) + b2) # cost = - np.sum(np.log(y_hat).dot(labels.transpose())) cost = -np.sum(labels * np.log(y_hat)) / data.shape[0] ### END YOUR CODE ### YOUR CODE HERE: backward propagation # gradb2 = y_hat - labels # gradW2 = np.matmul((y_hat - labels), sigmoid(np.matmul(data, W1) + b1)) # gradb1 = np.matmul(np.matmul((y_hat - labels), W2.transpose()), np.matmul(sigmoid(np.matmul(data, W1), + b1), (1- sigmoid(np.matmul(data, W1), + b1)))) # # gradb1 = (y_hat - labels) * W2 * sigmoid(data * W1 + b1) * (1 - sigmoid(data * W1 + b1)) # gradW1 = (y_hat - labels) * W2 * sigmoid(data * W1 + b1) * (1 - sigmoid(data * W1 + b1)) * data gradZ2 = (y_hat - labels) / data.shape[0] gradb2 = np.sum(gradZ2, axis=0, keepdims=True) gradW2 = (h.T).dot(gradZ2) gradH = gradZ2.dot(W2.T) gradZ1 = gradH * sigmoid_grad(h) #相同坐标的元素相乘 gradb1 = np.sum(gradZ1, axis=0, keepdims=True) gradW1 = (data.T).dot(gradZ1) ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) return cost, grad
def forward_backward_prop(data, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, and backward propagation for the gradients for all parameters. """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) ### YOUR CODE HERE: forward propagation #z1 = data.dot(W1) + b1 #hidden = sigmoid(z1) #z2 = hidden.dot(W2) + b2 #print 'z2.shape: ', z2.shape #prediction = softmax(z2) ### END YOUR CODE hidden = sigmoid(data.dot(W1) + b1) prediction = softmax(hidden.dot(W2) + b2) cost = -np.sum(np.log(prediction) * labels) ### YOUR CODE HERE: backward propagation #print 'NN: ', Dx, H, Dy #print 'b1.shape: ', b1.shape #print 'prediction.shape: ', prediction.shape #print 'labels.shape : ', labels.shape #print 'W2.shape: ', W2.shape #print 'hidden.shape: ', hidden.shape #print 'hidden.T.shape: ', hidden.T.shape #print 'delta.shape: ', delta.shape #print 'W1.shape: ', W1.shape #print 'data.shape: ', data.shape #gradW2 = delta * hidden #print 'sigmoid_grad(hidden).shape: ', sigmoid_grad(hidden).shape delta = prediction - labels gradW2 = hidden.T.dot(delta) gradb2 = np.sum(delta, axis = 0) hidden_delta = delta.dot(W2.T) * sigmoid_grad(hidden) gradW1 = data.T.dot(hidden_delta) gradb1 = np.sum(hidden_delta, axis = 0) ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) return cost, grad
def forward_backward_prop(X, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, the backward propagation for the gradients for all parameters. Notice the gradients computed here are different from the gradients in the assignment sheet: they are w.r.t. weights, not inputs. Arguments: X -- M x Dx matrix, where each row is a training example x. labels -- M x Dy matrix, where each row is a one-hot vector. params -- Model parameters, these are unpacked for you. dimensions -- A tuple of input dimension, number of hidden units and output dimension """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) # Note: compute cost based on `sum` not `mean`. ### YOUR CODE HERE: forward propagation f1 = np.dot(X, W1) + b1 h = sigmoid(f1) f2 = np.dot(h, W2) + b2 y_hat = softmax(f2) cost = -np.sum(labels * np.log(y_hat)) ### END YOUR CODE ### YOUR CODE HERE: backward propagation sigma1 = y_hat - labels sigma2 = W2.transpose() sigma3 = sigmoid_grad(h) gradb2 = np.sum(sigma1, axis=0) gradW2 = np.dot(h.transpose(), sigma1) sigma4 = sigma3 * np.dot(sigma1, sigma2) gradb1 = np.sum(sigma4, axis=0) gradW1 = np.dot(X.transpose(), sigma4) # print(gradW1.shape) # print(gradb1.shape) # print(gradW2.shape) # print(gradb2.shape) ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) # print(grad.shape) return cost, grad
def forward_backward_prop(X, labels, params, dimensions): """ Forward and backward propagation for a two-layer sigmoidal network Compute the forward propagation and for the cross entropy cost, the backward propagation for the gradients for all parameters. Notice the gradients computed here are different from the gradients in the assignment sheet: they are w.r.t. weights, not inputs. Arguments: X -- M x Dx matrix, where each row is a training example x. labels -- M x Dy matrix, where each row is a one-hot vector. params -- Model parameters, these are unpacked for you. dimensions -- A tuple of input dimension, number of hidden units and output dimension """ ### Unpack network parameters (do not modify) ofs = 0 Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2]) W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H)) ofs += Dx * H b1 = np.reshape(params[ofs:ofs + H], (1, H)) ofs += H W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy)) ofs += H * Dy b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy)) # Note: compute cost based on `sum` not `mean`. ### YOUR CODE HERE: forward propagation z1 = np.dot(X, W1) + b1 g1 = sigmoid(z1) # M , H z2 = np.dot(g1, W2) + b2 final_scores = softmax(z2) # M , Dy cost = -np.sum(labels * np.log(final_scores)) ### END YOUR CODE ### YOUR CODE HERE: backward propagation gradSoft = final_scores - labels # M , Dy gradW2 = g1.T.dot(gradSoft) gradb2 = np.sum(gradSoft, axis=0) # 1 , Dy gradz1 = gradSoft.dot(W2.T) # M , H gradSig = gradz1 * sigmoid_grad(g1) # M , H gradW1 = X.T.dot(gradSig) # Dx , H gradb1 = np.sum(gradSig, axis=0) # 1 , H ### END YOUR CODE ### Stack gradients (do not modify) grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten())) return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, then the
    backward propagation for the gradients of all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    # data: N x Dx, W1: Dx x H, b1: 1 x H
    a = data.dot(W1) + b1
    h = sigmoid(a)
    # h: N x H, W2: H x Dy, b2: 1 x Dy
    t = h.dot(W2) + b2
    y_hat = softmax(t)
    # y_hat: N x Dy, labels: N x Dy (one-hot)
    probs = labels * y_hat
    cost = np.sum(-np.log(probs.sum(axis=1)))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    # softmax / cross-entropy gradient
    dJdt = (y_hat - labels)            # N x Dy
    # b2 grad is the column-wise sum over the Dy outputs
    gradb2 = np.sum(dJdt, 0)
    # h: N x H, dJdt: N x Dy
    gradW2 = h.T.dot(dJdt)             # H x Dy
    # dJdt: N x Dy, W2: H x Dy
    dJdh = dJdt.dot(W2.T)              # N x H
    dhda = sigmoid_grad(h)             # N x H
    # data: N x Dx, dhda: N x H, dJdh: N x H
    gradW1 = data.T.dot(dhda * dJdh)
    gradb1 = np.sum(dhda * dJdh, 0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """
    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    # Vectorized refactoring of a first draft whose bottleneck was a loop
    # over the whole vocabulary when scattering the sample gradients.
    V, D = outputVectors.shape
    output = outputVectors[indices]     # row 0 is the target word's output vector
    uv = np.dot(output, predicted)
    uv[0] = -uv[0]                      # flip the sign of the positive score
    sig = sigmoid(-uv)                  # sigma(u_o.v_c), then sigma(-u_k.v_c)
    cost = -np.sum(np.log(sig))

    gradTheta = 1 - sig
    gradTheta[0] = -gradTheta[0]
    gradPred = np.dot(output.T, gradTheta)                # (D,)
    samples = np.reshape(gradTheta, (-1, 1)) * predicted  # (K+1, D)
    grad = np.zeros([V, D])
    for i in range(len(indices)):
        grad[indices[i]] += samples[i]
    ### END YOUR CODE

    return cost, gradPred, grad
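The snippet below is a minimal smoke test for whichever negSamplingCostAndGradient variant is in scope. The DummyDataset class and the getNegativeSamples helper are hypothetical stand-ins for the assignment's own versions (only the sampleTokenIdx() interface is assumed); they exist purely so the example is self-contained.

import random
import numpy as np

class DummyDataset(object):
    def __init__(self, vocab_size):
        self.vocab_size = vocab_size

    def sampleTokenIdx(self):
        # Uniform sampling is enough for a shape/sanity smoke test.
        return random.randint(0, self.vocab_size - 1)

def getNegativeSamples(target, dataset, K):
    # Hypothetical helper: draw K indices, rejecting the target,
    # mirroring what several of the solutions here do inline.
    samples = []
    while len(samples) < K:
        idx = dataset.sampleTokenIdx()
        if idx != target:
            samples.append(idx)
    return samples

V, D = 5, 3                              # tiny vocabulary and embedding size
outputVectors = np.random.randn(V, D)
predicted = np.random.randn(D)
cost, gradPred, grad = negSamplingCostAndGradient(
    predicted, target=1, outputVectors=outputVectors,
    dataset=DummyDataset(V), K=4)
assert gradPred.shape == predicted.shape
assert grad.shape == outputVectors.shape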
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, then the
    backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z1 = np.matmul(data, W1) + b1
    h = sigmoid(z1)
    z2 = np.matmul(h, W2) + b2
    y_hat = softmax(z2)
    # This solution averages the cost (and gradients) over the batch.
    cost = np.sum(-np.log(y_hat[labels == 1])) / data.shape[0]
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    d1 = (y_hat - labels) / data.shape[0]
    gradW2 = np.matmul(np.transpose(h), d1)
    gradb2 = np.sum(d1, axis=0, keepdims=True)
    d2 = np.matmul(d1, np.transpose(W2))
    d3 = d2 * sigmoid_grad(h)
    gradW1 = np.matmul(np.transpose(data), d3)
    gradb1 = np.sum(d3, axis=0, keepdims=True)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    # Draw K negative sample indices, rejecting the target index.
    K_set = []
    while len(K_set) < K:
        candidateIndex = dataset.sampleTokenIdx()
        if candidateIndex != target:
            K_set += [candidateIndex]

    cost = 0
    grad = np.zeros(outputVectors.shape)
    gradPred = np.zeros(predicted.shape)

    # Negative samples: J += -log(sigmoid(-u_k . v_c))
    for k in K_set:
        score = sigmoid(-outputVectors[k].dot(predicted))
        cost += -np.log(score)
        grad[k, :] += -(score - 1) * predicted
        gradPred += -(score - 1) * outputVectors[k]

    # Positive sample: J += -log(sigmoid(u_o . v_c))
    score = sigmoid(outputVectors[target].dot(predicted))
    cost += -np.log(score)
    grad[target, :] = (score - 1) * predicted
    gradPred += (score - 1) * outputVectors[target]

    # Return gradPred as a (1, D) row vector to match the caller used here.
    gradPred = gradPred[np.newaxis, :]
    ### END YOUR CODE

    return cost, gradPred, grad
def test_sigmoid(dim_1, dim_2):
    a1 = np.random.normal(loc=0., scale=20., size=(dim_1, dim_2))
    a1_copy = a1.copy()
    s_a1 = sigmoid(a1)
    s_sol_a1 = sigmoid_sol(a1_copy)
    assert rel_error(sigmoid_grad(s_a1), sigmoid_grad_sol(s_sol_a1)) <= 1e-10
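test_sigmoid relies on helpers defined elsewhere in the test suite (sigmoid_sol, sigmoid_grad_sol, rel_error) that are not shown in this section. For context, a rel_error of the usual maximum-relative-difference form would look roughly like the sketch below; this is an assumption about the helper, not its actual definition.

import numpy as np

def rel_error(x, y, eps=1e-8):
    # Hypothetical helper: elementwise |x - y| scaled by max(|x| + |y|, eps),
    # reduced with max over all elements.
    return np.max(np.abs(x - y) / np.maximum(np.abs(x) + np.abs(y), eps))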
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, then the
    backward propagation for the gradients of all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    # data : N x Dx
    # W1   : Dx x H
    # b1   : 1 x H
    # W2   : H x Dy
    # b2   : 1 x Dy
    N = data.shape[0]
    z1 = data.dot(W1) + b1
    a1 = sigmoid(z1)            # N x H
    z2 = a1.dot(W2) + b2
    a2 = softmax(z2)            # N x Dy
    cost = np.sum(-np.log(a2[labels == 1])) / N
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta_score = a2 - labels                # N x Dy
    delta_score /= N
    gradW2 = np.dot(a1.T, delta_score)       # (H x N)(N x Dy) = H x Dy
    gradb2 = np.sum(delta_score, axis=0)
    grad_h = np.dot(delta_score, W2.T)       # (N x Dy)(Dy x H) = N x H
    grad_h = sigmoid_grad(a1) * grad_h
    gradW1 = np.dot(data.T, grad_h)
    gradb1 = np.sum(grad_h, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """
    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    U = outputVectors
    u_o = U[target]
    v_c = predicted
    N = U.shape[0]

    # Positive (target) term.
    sig_o_c = sigmoid(np.dot(u_o, v_c))
    cost = -np.log(sig_o_c)
    gradPred = (sig_o_c - 1) * u_o
    grad = np.zeros((N, len(v_c)))
    grad[target] += (sig_o_c - 1) * v_c

    # Negative samples: skip indices[0], which is the target and was
    # already handled above.
    for k in indices[1:]:
        u_k = U[k]
        sig_k_c = sigmoid(-np.dot(u_k, v_c))
        cost += -np.log(sig_k_c)
        gradPred += -(sig_k_c - 1) * u_k
        grad[k] += -(sig_k_c - 1) * v_c
    ### END YOUR CODE

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    W, D = outputVectors.shape

    # Stack the target vector (row 0) and K negative sample vectors into UK.
    UK = np.zeros((K + 1, D))
    indices = [target]
    for i in xrange(K):
        k = dataset.sampleTokenIdx()
        while k == target:
            k = dataset.sampleTokenIdx()
        indices.append(k)
    for i, ix in enumerate(indices):
        UK[i] = outputVectors[ix]

    u_o = outputVectors[target]    # (D,)
    cost = (-np.log(sigmoid(np.dot(u_o, predicted)))
            - np.sum(np.log(sigmoid(-np.dot(UK[1:], predicted)))))

    # dJ/dv_c, shape (D,)
    gradPred = ((sigmoid(np.dot(u_o, predicted)) - 1) * u_o
                + np.dot(UK[1:].T, sigmoid(np.dot(UK[1:], predicted))))

    # dJ/dU: the label vector y marks the positive sample in row 0.
    y = np.zeros(K + 1)
    y[0] = 1.0
    grad = np.zeros(outputVectors.shape)
    gradK = np.outer(sigmoid(np.dot(UK, predicted)) - y, predicted)
    for i, ix in enumerate(indices):
        grad[ix] += gradK[i]
    ### END YOUR CODE

    return cost, gradPred, grad
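All of the negSamplingCostAndGradient variants in this section implement the same objective; writing it out makes the sign flips in the code (for example gradTheta[0] = -gradTheta[0] earlier, or the y vector with y[0] = 1.0 above) easier to verify. With v_c the predicted (center) vector, u_o the output vector of the target word, and u_1, ..., u_K the sampled negative vectors:

\begin{align*}
J_{\text{neg}}(v_c, o, U)
  &= -\log \sigma(u_o^{\top} v_c) \;-\; \sum_{k=1}^{K} \log \sigma(-u_k^{\top} v_c) \\
\frac{\partial J}{\partial v_c}
  &= \big(\sigma(u_o^{\top} v_c) - 1\big)\,u_o \;+\; \sum_{k=1}^{K} \sigma(u_k^{\top} v_c)\,u_k \\
\frac{\partial J}{\partial u_o}
  &= \big(\sigma(u_o^{\top} v_c) - 1\big)\,v_c,
\qquad
\frac{\partial J}{\partial u_k}
  = \sigma(u_k^{\top} v_c)\,v_c \quad (k = 1,\dots,K)
\end{align*}

The identity 1 - sigma(x) = sigma(-x) is why some solutions write the negative-sample gradient as (1 - sigma(-u_k.v_c)) u_k and others as sigma(u_k.v_c) u_k; both are the same quantity.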
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, then the
    backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    fc_out = np.dot(data, W1) + b1                 # shape (M, H)
    fc_sigmoid_out = sigmoid(fc_out)               # shape (M, H)
    scores = np.dot(fc_sigmoid_out, W2) + b2       # shape (M, Dy)
    y_hat = softmax(scores)                        # shape (M, Dy)
    cost = -np.sum(labels * np.log(y_hat))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dscores = y_hat - labels                       # shape (M, Dy)
    gradW2 = np.dot(fc_sigmoid_out.T, dscores)     # shape (H, Dy)
    gradb2 = np.sum(dscores, axis=0)               # shape (Dy,)
    dfc_sigmoid_out = np.dot(dscores, W2.T)        # shape (M, H)
    dfc_out = dfc_sigmoid_out * sigmoid_grad(fc_sigmoid_out)  # shape (M, H)
    gradW1 = np.dot(data.T, dfc_out)               # shape (Dx, H)
    gradb1 = np.sum(dfc_out, axis=0)               # shape (H,)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
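Every forward_backward_prop variant here expects params to be one flat vector holding W1, b1, W2 and b2 in that order, which is exactly what the unpacking block at the top decodes. A quick shape check, assuming numpy and whichever forward_backward_prop definition is in scope (the sizes below are arbitrary illustration values):

import numpy as np

# Tiny smoke test: random data, one-hot labels, and a flat params vector with
# Dx*H + H + H*Dy + Dy entries, matching the unpacking order above.
M, dimensions = 4, (10, 5, 3)
Dx, H, Dy = dimensions
data = np.random.randn(M, Dx)
labels = np.zeros((M, Dy))
labels[np.arange(M), np.random.randint(0, Dy, size=M)] = 1.0
params = np.random.randn(Dx * H + H + H * Dy + Dy)

cost, grad = forward_backward_prop(data, labels, params, dimensions)
assert np.ndim(cost) == 0            # scalar cost
assert grad.shape == params.shape    # flat gradient, same layout as params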
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.
    """
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    d = predicted.shape[0]
    v = outputVectors.shape[0]

    u_0 = outputVectors[target].reshape(1, d)   # row vector
    v_c = predicted.reshape(d, 1)               # column vector

    predict = sigmoid(np.dot(u_0, v_c))         # reused below
    cost = -np.log(predict)

    grad = np.zeros((v, d))
    grad[target] = ((predict - 1.0) * (v_c.T)).reshape((d,))
    gradPred = (predict - 1.0) * u_0

    for k in indices[1:]:
        u_k = outputVectors[k].reshape((1, d))  # row vector
        predict = sigmoid(np.dot(u_k, v_c))
        cost -= np.log(1 - predict)             # 1 - sigmoid(x) == sigmoid(-x)
        gradPred += predict * u_k
        grad[k] += (predict * v_c.T).reshape((d,))

    gradPred = gradPred.reshape((d,))
    cost = cost.item()                          # collapse the 1x1 array to a scalar
    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    cost = -np.log(sigmoid(np.dot(predicted, outputVectors[target])))
    num_rand = 0
    grad = np.zeros(outputVectors.shape)
    gradPred = -outputVectors[target] * (1 - sigmoid(np.dot(predicted, outputVectors[target])))
    grad[target] = -predicted * (1 - sigmoid(np.dot(predicted, outputVectors[target])))

    while num_rand < K:
        rand = dataset.sampleTokenIdx()
        if rand == target:
            continue
        num_rand += 1
        cost -= np.log(sigmoid(-np.dot(predicted, outputVectors[rand])))
        grad[rand] += predicted * (1 - sigmoid(-np.dot(predicted, outputVectors[rand])))
        gradPred += outputVectors[rand] * (1 - sigmoid(-np.dot(predicted, outputVectors[rand])))

    return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, then the
    backward propagation for the gradients of all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### forward propagation
    N = data.shape[0]
    l1 = data.dot(W1) + b1
    h = sigmoid(l1)
    l2 = h.dot(W2) + b2
    y_hat = softmax(l2)
    cost = -np.sum(labels * np.log(y_hat)) / N   # cross entropy, averaged over the batch

    ### backward propagation
    dl2 = y_hat - labels
    dW2 = np.dot(h.T, dl2)
    db2 = np.sum(dl2, axis=0)
    dh = np.dot(dl2, W2.T)
    dl1 = dh * sigmoid_grad(h)
    dW1 = np.dot(data.T, dl1)
    db1 = np.sum(dl1, axis=0)

    gradW2 = dW2 / N
    gradb2 = db2 / N
    gradW1 = dW1 / N
    gradb1 = db1 / N

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, then the
    backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    p1 = np.dot(data, W1) + b1
    h = sigmoid(p1)                                    # (M, H)
    p2 = np.dot(h, W2) + b2
    y_pred = softmax(p2)                               # (M, Dy)
    cost = np.mean(np.sum(-1 * np.multiply(labels, np.log(y_pred)), axis=1))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    gradp2 = (y_pred - labels) / np.shape(data)[0]     # (M, Dy)
    gradW2 = np.dot(h.T, gradp2)                       # (H, Dy)
    gradb2 = np.sum(gradp2, axis=0).reshape((1, -1))   # (1, Dy)
    gradh = np.dot(gradp2, W2.T)                       # (M, H)
    gradp1 = np.multiply(gradh, h * (1 - h))           # (M, H), elementwise
    gradW1 = np.dot(data.T, gradp1)                    # (Dx, H)
    gradb1 = np.sum(gradp1, axis=0).reshape((1, -1))   # (1, H)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, then the
    backward propagation for the gradients of all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(np.dot(data, W1) + b1)
    yhat = softmax(np.dot(h, W2) + b2)
    cost = np.sum(-np.log(yhat[labels == 1])) / data.shape[0]
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    d3 = (yhat - labels) / data.shape[0]
    gradW2 = np.dot(h.T, d3)
    gradb2 = np.sum(d3, 0, keepdims=True)
    dh = np.dot(d3, W2.T)
    grad_h = sigmoid_grad(h) * dh
    gradW1 = np.dot(data.T, grad_h)
    gradb1 = np.sum(grad_h, 0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """
    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    cost = 0.
    gradPred = np.zeros(predicted.shape)     # shape (N,)
    grad = np.zeros(outputVectors.shape)     # shape (W, N)

    vc = predicted                 # shape (N,)
    uo = outputVectors[target]     # shape (N,)

    # Positive (target) term.
    yo = sigmoid(np.dot(uo, vc))   # scalar
    cost += -np.log(yo)
    gradPred += (yo - 1) * uo      # shape (N,)
    grad[target] += (yo - 1) * vc

    # Negative samples (indices[0] is the target itself, so skip it).
    for k in indices[1:]:
        uk = outputVectors[k]
        y_neg_k = sigmoid(-np.dot(uk, vc))
        cost += -np.log(y_neg_k)
        gradPred += -(y_neg_k - 1) * uk    # shape (N,)
        grad[k] += -(y_neg_k - 1) * vc
    ### END YOUR CODE

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """
    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    grad = np.zeros(outputVectors.shape)
    gradPred = np.zeros(predicted.shape)
    cost = 0

    z = sigmoid(np.dot(outputVectors[target], predicted))
    cost -= np.log(z)
    grad[target] += predicted * (z - 1.0)
    gradPred += outputVectors[target] * (z - 1.0)

    for k in xrange(K):
        samp = indices[k + 1]
        z = sigmoid(np.dot(outputVectors[samp], predicted))
        cost -= np.log(1.0 - z)
        grad[samp] += predicted * z
        gradPred += outputVectors[samp] * z
    ### END YOUR CODE

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """
    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    gradPred = np.zeros_like(predicted)
    grad = np.zeros_like(outputVectors)

    # Positive (target) term.
    product = np.dot(outputVectors[target], predicted)   # intermediate value
    cost = -np.log(sigmoid(product))
    gradPred = (sigmoid(product) - 1) * outputVectors[target]
    grad[target] = (sigmoid(product) - 1) * predicted

    # Negative samples: iterate over indices[1:] only; indices[0] is the
    # target and was already handled above.
    for index in indices[1:]:
        neg_sig = sigmoid(-1 * np.dot(outputVectors[index], predicted))
        cost += -np.log(neg_sig)
        gradPred += -(neg_sig - 1) * outputVectors[index]
        grad[index] += -(neg_sig - 1) * predicted
    ### END YOUR CODE

    return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, then the
    backward propagation for the gradients of all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    N = data.shape[0]
    Z1 = data.dot(W1) + b1         # (N, H)
    A1 = sigmoid(Z1)               # (N, H)
    scores = A1.dot(W2) + b2       # (N, Dy)
    probs = softmax(scores)        # (N, Dy)
    cost = -np.sum(np.log(probs[labels == 1])) / N
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dscores = (probs - labels) / N
    dW2 = A1.T.dot(dscores)
    db2 = np.sum(dscores, axis=0)
    dA1 = dscores.dot(W2.T)
    dZ1 = sigmoid_grad(A1) * dA1
    dW1 = data.T.dot(dZ1)
    db1 = np.sum(dZ1, axis=0)

    gradW1 = dW1
    gradW2 = dW2
    gradb1 = db1
    gradb2 = db2
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
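A central-difference gradient check is the standard way to validate any of these implementations; the assignments these snippets come from typically provide their own checker, but a self-contained sketch looks like the following. The helper name, step size, and tolerance here are conventional choices for illustration, not values taken from the assignment.

import numpy as np

def numeric_grad_check(f, x, h=1e-5, tol=1e-5):
    # f maps a flat parameter vector to (cost, grad); compare grad against
    # central differences at a random subset of coordinates.
    _, analytic = f(x)
    for i in np.random.choice(x.size, size=min(20, x.size), replace=False):
        old = x[i]
        x[i] = old + h
        cost_plus, _ = f(x)
        x[i] = old - h
        cost_minus, _ = f(x)
        x[i] = old                                   # restore the parameter
        numeric = (cost_plus - cost_minus) / (2 * h)
        rel = abs(numeric - analytic[i]) / max(1.0, abs(numeric), abs(analytic[i]))
        assert rel < tol, "gradient mismatch at index %d" % i

# Usage against whichever forward_backward_prop is in scope, with random data
# and one-hot labels.
M, dims = 6, (8, 4, 3)
Dx, H, Dy = dims
data = np.random.randn(M, Dx)
labels = np.zeros((M, Dy))
labels[np.arange(M), np.random.randint(0, Dy, size=M)] = 1.0
params = np.random.randn(Dx * H + H + H * Dy + Dy)
numeric_grad_check(lambda p: forward_backward_prop(data, labels, p, dims), params)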