def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    # predicted     -- v_c
    # target        -- o in the notation
    # outputVectors -- all the u's, stored as rows (not columns) of U
    # dataset       -- only needed for the negative sampling above
    #
    # Returns:
    # cost     -- negative-sampling cost
    # gradPred -- dJ/dv_c
    # grad     -- dJ/dU

    target_pred_dot_sig = sigmoid(np.dot(outputVectors[indices[0]], predicted))    # sigma(u_o^T v_c)
    sample_pred_dot_sig = sigmoid(-np.dot(outputVectors[indices[1:]], predicted))  # sigma(-u_k^T v_c) for all K samples
    log_part = np.log(target_pred_dot_sig)
    sum_part = np.sum(np.log(sample_pred_dot_sig))
    cost = -log_part - sum_part

    # dJ/du_o = (sigma(u_o^T v_c) - 1) * v_c
    # dJ/du_k = (1 - sigma(-u_k^T v_c)) * v_c, accumulated over repeated samples
    probs = outputVectors.dot(predicted)
    grad = np.zeros_like(outputVectors)
    grad[target] = (sigmoid(probs[target]) - 1) * predicted
    for k in indices[1:]:
        grad[k] += (1.0 - sigmoid(-np.dot(outputVectors[k], predicted))) * predicted

    # dJ/dv_c = -(1 - sigma(u_o^T v_c)) * u_o + sum_k (1 - sigma(-u_k^T v_c)) * u_k
    gradPred = -1 * (1 - target_pred_dot_sig) * outputVectors[indices[0]] \
        + np.sum((1 - sample_pred_dot_sig).reshape(-1, 1) * outputVectors[indices[1:]], axis=0)
    ### END YOUR CODE

    return cost, gradPred, grad
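

# Every implementation in this file relies on a getNegativeSamples helper
# provided by the starter code ("Sampling of indices is done for you"). Below
# is a minimal sketch of such a helper, assuming the dataset object exposes
# sampleTokenIdx() as described in test_word2vec: it draws K indices and
# rejects any draw that equals the target.
def getNegativeSamples(target, dataset, K):
    """ Samples K indices which are not the target """
    indices = [None] * K
    for k in xrange(K):
        newidx = dataset.sampleTokenIdx()
        while newidx == target:
            newidx = dataset.sampleTokenIdx()
        indices[k] = newidx
    return indices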
def your_sanity_checks():
    """
    Use this space to add any additional sanity checks by running:
        python q2_gradcheck.py
    This function will not be called by the autograder, nor will
    your additional tests be graded.
    """
    print "Running your sanity checks..."
    from q1d_sigmoid import sigmoid, sigmoid_grad

    sig_f = lambda x: (sigmoid(x), sigmoid_grad(sigmoid(x)))
    gradcheck_naive(sig_f, np.random.randn(1))      # scalar test
    gradcheck_naive(sig_f, np.random.randn(3,))     # 1-D test
    gradcheck_naive(sig_f, np.random.randn(4, 5))   # 2-D test
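

# A minimal sketch (not part of the assignment code) of how one could extend
# the sanity checks above to verify gradPred of negSamplingCostAndGradient
# with central finite differences. It assumes numpy (np) and sigmoid are
# imported as elsewhere in this module, and a dummy `dataset` whose
# sampleTokenIdx() draws via Python's `random` module (as the test dataset in
# test_word2vec does), so resetting the random state makes every call draw
# the same negative samples.
def check_neg_sampling_gradPred(outputVectors, target, dataset, K=10, h=1e-5):
    import random
    predicted = np.random.randn(outputVectors.shape[1])
    state = random.getstate()

    def f(vc):
        random.setstate(state)  # identical negative samples on every call
        cost, gradPred, _ = negSamplingCostAndGradient(vc, target,
                                                       outputVectors, dataset, K)
        return cost, gradPred

    _, analytic = f(predicted)
    for i in xrange(len(predicted)):
        step = np.zeros_like(predicted)
        step[i] = h
        numeric = (f(predicted + step)[0] - f(predicted - step)[0]) / (2 * h)
        assert abs(numeric - analytic[i]) < 1e-5, \
            "gradPred mismatch at coordinate %d" % i
    print "negSamplingCostAndGradient gradPred matches finite differences"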
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    grad = np.zeros(outputVectors.shape)
    gradPred = np.zeros(predicted.shape)

    # outputVectors >> U
    # predicted     >> v_c
    # target        >> o
    v_c = predicted
    U = outputVectors

    dot_prod = np.dot(U, v_c)
    sigmoid_out = sigmoid(dot_prod)
    sigmoid_out_neg = sigmoid(-dot_prod)

    grad[target] += v_c * (sigmoid_out[target] - 1)
    gradPred += U[target] * (sigmoid_out[target] - 1)
    cost = -np.log(sigmoid_out[target])

    for i in indices[1:]:
        cost -= np.log(sigmoid_out_neg[i])
        grad[i] += v_c * (1 - sigmoid_out_neg[i])
        gradPred += U[i] * (1 - sigmoid_out_neg[i])

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    grad = np.zeros(outputVectors.shape)   # V x d
    gradPred = np.zeros(predicted.shape)   # d
    cost = 0

    # sigmoid(u_o . v_c)
    sig_outer = sigmoid(np.dot(outputVectors[target], predicted))
    cost -= np.log(sig_outer)
    grad[target] = predicted * (sig_outer - 1.0)          # derivative w.r.t. u_o
    gradPred = outputVectors[target] * (sig_outer - 1.0)  # derivative w.r.t. v_c

    for sample in indices[1:]:
        # sigmoid(-u_k . v_c)
        sig_val = sigmoid(-1.0 * np.dot(outputVectors[sample], predicted))
        cost -= np.log(sig_val)
        # accumulate (+=) so repeated negative samples are handled correctly
        grad[sample] += (1.0 - sig_val) * predicted
        gradPred += (1.0 - sig_val) * outputVectors[sample]
    ### END YOUR CODE

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    grad = np.zeros(outputVectors.shape)

    activation = sigmoid(np.dot(outputVectors[target], predicted))
    cost = -np.log(activation)
    gradPred = (activation - 1.) * outputVectors[target]
    grad[target] = (activation - 1.) * predicted

    for idx in range(1, K + 1):
        sample_idx = indices[idx]
        sample = outputVectors[sample_idx]
        activation = sigmoid(-np.dot(sample, predicted))
        cost -= np.log(activation)
        gradPred -= (activation - 1.) * sample
        grad[sample_idx] -= (activation - 1.) * predicted
    ### END YOUR CODE

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    grad = np.zeros(outputVectors.shape)
    gradPred = np.zeros(predicted.shape)
    cost = 0

    z = sigmoid(np.dot(outputVectors[target], predicted))
    cost -= np.log(z)
    grad[target] += predicted * (z - 1.0)
    gradPred += outputVectors[target] * (z - 1.0)

    for k in xrange(K):
        samp = indices[k + 1]
        z = sigmoid(np.dot(outputVectors[samp], predicted))
        cost -= np.log(1.0 - z)
        grad[samp] += predicted * z
        gradPred += outputVectors[samp] * z
    ### END YOUR CODE

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    output_products_vector = np.dot(outputVectors, predicted)  # u_o^T v_c for o = 1, 2, ..., W
    output_sigmoid_vector = sigmoid(output_products_vector)    # sig(u_o^T v_c) for o = 1, 2, ..., W
    output_minus_sigmoid_vector = 1 - output_sigmoid_vector    # sig(-u_o^T v_c), since 1 - sig(x) = sig(-x)

    # cost = -log(sig(u_o^T v_c)) - sum_k log(sig(-u_k^T v_c))
    cost = -np.log(output_sigmoid_vector[target]) - np.sum(
        np.log(output_minus_sigmoid_vector[indices[1:]]))

    # gradPred = -(1 - sig(u_o^T v_c)) * u_o + sum_k sig(u_k^T v_c) * u_k
    grad_pred_max_part = -output_minus_sigmoid_vector[target] * outputVectors[target]
    grad_pred_neg_samp_part = outputVectors[indices[1:]] * \
        output_sigmoid_vector[indices[1:]][:, np.newaxis]  # matrix with k rows of sig(u_k^T v_c) * u_k
    grad_pred_sum_neg_samp = np.sum(grad_pred_neg_samp_part, axis=0)  # sum the k vectors
    gradPred = grad_pred_max_part + grad_pred_sum_neg_samp

    grad = np.zeros(outputVectors.shape)  # besides u_o and the u_k's, the gradient of the rest is zero
    grad[target] = (output_sigmoid_vector[target] - 1) * predicted  # grad(u_o) = (sig(u_o^T v_c) - 1) * v_c
    for k in indices[1:]:  # k is small (~10 at most) so this loop is still efficient
        grad[k] += output_sigmoid_vector[k] * predicted  # grad(u_k) += sig(u_k^T v_c) * v_c
    ### END YOUR CODE

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    W = outputVectors.shape[0]  # extracting dictionary size as W
    D = outputVectors.shape[1]  # extracting embedding size as D

    # turn predicted into a row vector if it is not one already
    if predicted.shape[0] != 1:
        predicted = np.expand_dims(predicted, axis=1)
        predicted = np.transpose(predicted)

    # inner products of predicted with the sampled rows of outputVectors
    outputVectorsSampled = outputVectors[indices, :]                        # dim [(K+1) x D]
    inner_prod = np.matmul(predicted, np.transpose(outputVectorsSampled))   # dim [1 x (K+1)]
    samples_sigmoid = sigmoid(inner_prod[0, :])                             # dim [K+1]

    # calculating the cost
    cost = -np.log(samples_sigmoid[0]) - np.sum(np.log(1 - samples_sigmoid[1:K + 1]))

    # calculating gradPred according to our calculations at 2c
    gradPred = -(1 - samples_sigmoid[0]) * outputVectorsSampled[0, :] \
        + np.sum(outputVectorsSampled[1:K + 1, :] *
                 np.tile(np.expand_dims(samples_sigmoid[1:K + 1], axis=1), (1, D)),
                 axis=0)                                                    # dim [D]

    grad = np.zeros([W, D], dtype=np.float32)                               # dim [W x D]
    grad[indices[0], :] = -predicted * (1 - samples_sigmoid[0])
    for idx in range(1, K + 1):  # indices[1:K+1]
        grad[indices[idx]:indices[idx] + 1, :] += predicted * samples_sigmoid[idx]

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    U = outputVectors
    uo = U[target]
    vc = predicted

    sigmoid_uo_dot_vc = sigmoid(uo.dot(vc))
    cost = -np.log(sigmoid_uo_dot_vc)
    gradPred = (sigmoid_uo_dot_vc - 1.0) * uo

    grad = np.zeros(outputVectors.shape)
    grad[target] = (sigmoid_uo_dot_vc - 1.0) * vc

    for k in indices[1:]:
        sigmoid_minus_uk_dot_vc = sigmoid(-U[k].dot(vc))
        cost -= np.log(sigmoid_minus_uk_dot_vc)
        gradPred += (1.0 - sigmoid_minus_uk_dot_vc) * U[k]
        grad[k] += (1.0 - sigmoid_minus_uk_dot_vc) * vc

    return cost, gradPred, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    y_hat = sigmoid(np.matmul(outputVectors, predicted))

    # We use here that sigmoid(-x) = 1 - sigmoid(x), as proved in the PDF
    cost = -np.log(y_hat[target]) - np.sum(np.log(1 - y_hat[indices[1:]]))

    grad_pred = -(1 - y_hat[target]) * outputVectors[target]
    grad_pred += np.sum(
        (y_hat[indices[1:]] * outputVectors[indices[1:]].transpose()).transpose(), 0)

    # for any word that is neither the target nor a negative sample the grad is zero
    grad = np.zeros(shape=outputVectors.shape)
    # for the target
    grad[target, :] = (-1) * (1 - y_hat[target]) * predicted
    # for the negative samples
    for negative_idx in range(1, K + 1):
        grad[indices[negative_idx]] += y_hat[indices[negative_idx]] * predicted

    return cost, grad_pred, grad
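

# A fully vectorised variant (a sketch, not required by the autograder): the
# per-sample loops in the implementations above can be replaced by np.add.at,
# which accumulates rows correctly even when getNegativeSamples returns the
# same index more than once. A sign vector folds the target (+1) and the
# negative samples (-1) into one expression.
def negSamplingCostAndGradientVectorized(predicted, target, outputVectors,
                                         dataset, K=10):
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    signs = np.array([1.0] + [-1.0] * K)       # +1 for the target, -1 for negatives
    sampled = outputVectors[indices]           # (K+1) x d rows u_o, u_k1, ..., u_kK
    probs = sigmoid(signs * sampled.dot(predicted))

    cost = -np.sum(np.log(probs))              # -log s(u_o.v_c) - sum_k log s(-u_k.v_c)
    coeffs = (probs - 1.0) * signs             # d(-log s(sign * u.v_c)) / d(u.v_c)
    gradPred = coeffs.dot(sampled)             # dJ/dv_c
    grad = np.zeros(outputVectors.shape)
    np.add.at(grad, indices, coeffs[:, np.newaxis] * predicted)  # dJ/dU, duplicates accumulated
    return cost, gradPred, grad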
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    grad = np.zeros(outputVectors.shape)

    # YOUR CODE HERE
    sigmoid_result = sigmoid(outputVectors.dot(predicted))

    # J_neg(o) = -log(sigma(u_o . v_c)) - sum_k log(sigma(-u_k . v_c))
    cost = -np.log(sigmoid_result[target]) - np.sum(np.log(1 - sigmoid_result[indices[1:]]))

    # dJ/dv_c = -u_o + sum_{w in {o} and samples} sigma(u_w . v_c) * u_w
    gradPred = -outputVectors[target] + np.sum(
        sigmoid_result[indices][:, np.newaxis] * outputVectors[indices], axis=0)

    # dJ/du_k = sigma(u_k . v_c) * v_c for each sampled word (and, so far, the target)
    for i in indices:
        grad[i] += sigmoid_result[i] * predicted
    # dJ/du_o = (sigma(u_o . v_c) - 1) * v_c, so subtract the extra v_c for the target
    grad[target] -= predicted
    # END YOUR CODE

    return cost, gradPred, grad
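

# A minimal sketch of how the cost function above is used as a building block:
# skipgram accumulates the cost and gradients of one centre word against each
# of its context words. The signature below (a `tokens` dict mapping words to
# row indices, inputVectors / outputVectors as V x d matrices) is assumed from
# the surrounding assignment code rather than defined here.
def skipgram(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
             dataset, word2vecCostAndGradient=negSamplingCostAndGradient):
    cost = 0.0
    gradIn = np.zeros(inputVectors.shape)
    gradOut = np.zeros(outputVectors.shape)

    c_idx = tokens[currentWord]        # row of the centre word in inputVectors
    predicted = inputVectors[c_idx]    # v_c

    for contextWord in contextWords:
        target = tokens[contextWord]
        c, gPred, gOut = word2vecCostAndGradient(predicted, target,
                                                 outputVectors, dataset)
        cost += c
        gradIn[c_idx] += gPred         # dJ/dv_c accumulates into a single row
        gradOut += gOut                # dJ/dU accumulates over context words

    return cost, gradIn, gradOut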