def test_softmax_linearity_rowwise(dim_1, dim_2):
    # softmax should be invariant to adding a constant shift to each row
    shift = np.random.uniform(low=-100, high=100, size=(dim_1, 1))
    #print(shift)
    a1 = np.random.normal(size=(dim_1, dim_2))
    a2 = a1 + shift
    assert rel_error(np.max(a2 - a1), np.max(shift)) < 1e-8
    assert rel_error(softmax(a1), softmax(a2)) < 1e-8
def test_softmax_permutation_axis1(dim_1):
    # permuting the inputs should permute the softmax outputs in the same way
    a1 = np.random.normal(size=(1, dim_1))
    s1 = softmax(a1)
    permutation = np.random.permutation(dim_1)
    inverse_permutation = np.argsort(permutation)
    s1_perm = softmax(a1.ravel()[permutation])
    assert rel_error(s1_perm.ravel()[inverse_permutation], s1) <= 1e-8
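# The tests above assume helper functions `softmax` and `rel_error` that are not
# defined in this file. Below is a minimal sketch of what they might look like
# (a numerically stable, row-wise softmax and a relative-error metric); this is an
# assumption for illustration, not the assignment's reference implementation.
import numpy as np

def softmax(x):
    """Row-wise, numerically stable softmax for 1-D or 2-D arrays."""
    x = np.asarray(x, dtype=float)
    shifted = x - np.max(x, axis=-1, keepdims=True)  # subtract the row max for stability
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=-1, keepdims=True)

def rel_error(x, y):
    """Maximum relative error between two arrays (or scalars)."""
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    return np.max(np.abs(x - y) / np.maximum(1e-8, np.abs(x) + np.abs(y)))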
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """
    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.
    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #   vector
    # - grad: the gradient with respect to all the other word
    #   vectors

    y = np.zeros((outputVectors.shape[0],))
    y[target] = 1.0
    y_hat = softmax(np.dot(outputVectors, predicted))
    cost = -np.dot(y, np.log(y_hat))
    gradPred = -outputVectors[target, :] + np.dot(outputVectors.T, y_hat)
    grad = np.outer(y_hat - y, predicted)
    return cost, gradPred, grad
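# A quick, hedged sanity check for softmaxCostAndGradient above: compare gradPred
# against a centred finite-difference estimate on random data. The vocabulary size
# V and embedding dimension D are made up for illustration, and `softmax` is assumed
# to be the helper sketched earlier; this check is not part of the original assignment.
def _check_softmaxCostAndGradient_gradPred():
    np.random.seed(0)
    V, D = 7, 5
    predicted = np.random.randn(D)
    outputVectors = np.random.randn(V, D)
    target = 2
    _, gradPred, _ = softmaxCostAndGradient(predicted, target, outputVectors, None)

    eps = 1e-6
    numeric = np.zeros(D)
    for i in range(D):
        step = np.zeros(D)
        step[i] = eps
        c_plus, _, _ = softmaxCostAndGradient(predicted + step, target, outputVectors, None)
        c_minus, _, _ = softmaxCostAndGradient(predicted - step, target, outputVectors, None)
        numeric[i] = (c_plus - c_minus) / (2 * eps)
    assert np.max(np.abs(numeric - gradPred)) < 1e-5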
def add_model(self, input_data):
    """Adds a linear-layer plus a softmax transformation

    The core transformation for this model which transforms a batch of input
    data into a batch of predictions. In this case, the mathematical
    transformation effected is

    y = softmax(xW + b)

    Hint: Make sure to create tf.Variables as needed. Also, make sure to use
          tf.name_scope to ensure that your name spaces are clean.
    Hint: For this simple use-case, it's sufficient to initialize both weights W
          and biases b with zeros.

    Args:
      input_data: A tensor of shape (batch_size, n_features).
    Returns:
      out: A tensor of shape (batch_size, n_classes)
    """
    ### YOUR CODE HERE
    # W = tf.Variable(tf.zeros((self.config.n_features, self.config.n_classes)), name="weights")
    # b = tf.Variable(tf.zeros((self.config.n_classes, )), name="biases")
    with tf.variable_scope('softmax'):
        W = tf.get_variable("weights",
                            (self.config.n_features, self.config.n_classes),
                            initializer=tf.constant_initializer(0.0))
        b = tf.get_variable("bias", (self.config.n_classes,),
                            initializer=tf.constant_initializer(0.0))
        out = softmax(tf.matmul(input_data, W) + b)
    ### END YOUR CODE
    return out
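# A rough usage sketch for add_model above, assuming TensorFlow 1.x graph mode, a
# config object exposing n_features and n_classes, and the custom `softmax` op used
# inside the method. The names, batch size, and zero-filled input are illustrative
# assumptions only, not part of the assignment.
def _demo_add_model(model, config):
    input_placeholder = tf.placeholder(tf.float32, shape=(None, config.n_features))
    out = model.add_model(input_placeholder)  # tensor of shape (batch_size, n_classes)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        batch = np.zeros((4, config.n_features), dtype=np.float32)
        probs = sess.run(out, feed_dict={input_placeholder: batch})
        # with zero-initialized W and b, every row should be uniform: 1 / n_classes
        return probs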
def softmaxRegression(features, labels, weights, regularization=0.0,
                      nopredictions=False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.
    # Inputs:
    # - features: feature vectors, each row is a feature vector
    # - labels: labels corresponding to the feature vectors
    # - weights: weights of the regressor
    # - regularization: L2 regularization constant
    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the regressor cost with respect to its
    #   weights
    # - pred: label predictions of the regressor (you might find
    #   np.argmax helpful)

    prob = softmax(features.dot(weights))
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of 1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights ** 2)

    ### YOUR CODE HERE: compute the gradients and predictions
    # (filled in following the completed versions of this function elsewhere in
    # this file: cross-entropy gradient plus the L2 regularization term)
    dscores = prob.copy()
    dscores[range(N), labels] -= 1.0
    grad = features.T.dot(dscores) / N + regularization * weights
    pred = np.argmax(prob, axis=1)
    ### END YOUR CODE

    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred
def add_model(self, input_data):
    """Adds a linear-layer plus a softmax transformation

    The core transformation for this model which transforms a batch of input
    data into a batch of predictions. In this case, the mathematical
    transformation effected is

    y = softmax(xW + b)

    Hint: Make sure to create tf.Variables as needed. Also, make sure to use
          tf.name_scope to ensure that your name spaces are clean.
    Hint: For this simple use-case, it's sufficient to initialize both weights W
          and biases b with zeros.

    Args:
      input_data: A tensor of shape (batch_size, n_features).
    Returns:
      out: A tensor of shape (batch_size, n_classes)
    """
    ### YOUR CODE HERE
    #raise NotImplementedError
    # bias is shaped (n_classes,) so it broadcasts over the batch dimension
    self.W = tf.Variable(tf.zeros([self.config.n_features, self.config.n_classes],
                                  dtype=tf.float32), name="weight")
    self.b = tf.Variable(tf.zeros([self.config.n_classes], dtype=tf.float32),
                         name="bias")
    out = softmax(tf.matmul(input_data, self.W) + self.b)
    ### END YOUR CODE
    return out
def add_model(self, input_data):
    """Adds a linear-layer plus a softmax transformation

    The core transformation for this model which transforms a batch of input
    data into a batch of predictions. In this case, the mathematical
    transformation effected is

    y = softmax(xW + b)

    Hint: Make sure to create tf.Variables as needed. Also, make sure to use
          tf.name_scope to ensure that your name spaces are clean.
    Hint: For this simple use-case, it's sufficient to initialize both weights W
          and biases b with zeros.

    Args:
      input_data: A tensor of shape (batch_size, n_features).
    Returns:
      out: A tensor of shape (batch_size, n_classes)
    """
    ### YOUR CODE HERE
    with tf.variable_scope("model"):
        W = tf.get_variable("W",
                            shape=[self.config.n_features, self.config.n_classes],
                            initializer=tf.random_normal_initializer(0.5, 0.1))
        # W = tf.Variable(tf.random_normal(shape=[self.config.n_features, self.config.n_classes], dtype=tf.float32, name="weights"))
        b = tf.get_variable("b", shape=[self.config.n_classes],
                            initializer=tf.constant_initializer(0.0))
        # transform the input_data argument rather than self.input_placeholder directly
        affine_transformation = tf.matmul(input_data, W) + b
        # tf.constant_initializer(value)
        # tf.random_uniform_initializer(a, b)
        # b = tf.Variable(tf.zeros(shape=[1,self.config.n_classes], dtype=tf.float32), name="bias")
        # affine_transformation = tf.add(tf.matmul(W, self.input_placeholder), b, name="affine")
        out = softmax(affine_transformation)
    ### END YOUR CODE
    return out
def add_model(self, input_data):
    """Adds a linear-layer plus a softmax transformation

    The core transformation for this model which transforms a batch of input
    data into a batch of predictions. In this case, the mathematical
    transformation effected is

    y = softmax(xW + b)

    Hint: Make sure to create tf.Variables as needed. Also, make sure to use
          tf.name_scope to ensure that your name spaces are clean.
    Hint: For this simple use-case, it's sufficient to initialize both weights W
          and biases b with zeros.

    Args:
      input_data: A tensor of shape (batch_size, n_features).
    Returns:
      out: A tensor of shape (batch_size, n_classes)
    """
    ### YOUR CODE HERE
    n_features, n_classes = self.config.n_features, self.config.n_classes
    with tf.name_scope('softmax_linear'):
        weights = tf.Variable(tf.zeros([n_features, n_classes]), name='weights')
        biases = tf.Variable(tf.zeros([n_classes]), name='biases')
        logits = tf.matmul(input_data, weights) + biases
        out = softmax(logits)
    ### END YOUR CODE
    return out
def add_model(self, input_data):
    """Adds a linear-layer plus a softmax transformation

    The core transformation for this model which transforms a batch of input
    data into a batch of predictions. In this case, the mathematical
    transformation effected is

    y = softmax(xW + b)

    Hint: Make sure to create tf.Variables as needed. Also, make sure to use
          tf.name_scope to ensure that your name spaces are clean.
    Hint: For this simple use-case, it's sufficient to initialize both weights W
          and biases b with zeros.

    Args:
      input_data: A tensor of shape (batch_size, n_features).
    Returns:
      out: A tensor of shape (batch_size, n_classes)
    """
    # Create a variable.
    self.w = tf.Variable(tf.zeros([self.config.n_features, self.config.n_classes]),
                         name="w")
    self.b = tf.Variable(tf.zeros([self.config.n_classes]), name="b")
    out = softmax(tf.matmul(input_data, self.w) + self.b)
    #w_hist = tf.histogram_summary("w", self.w)
    return out
def softmaxCostAndGradient(predicted, target, outputVectors, data):
    """ Softmax cost function for word2vec models """
    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.
    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.
    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #   vector
    # - grad: the gradient with respect to all the other word
    #   vectors
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    prods = np.dot(outputVectors, predicted.T)  # 1xV
    probs = softmax(prods)                      # 1xV
    cost = -np.log(probs[target])               # 1x1
    dscore = probs
    dscore[target] -= 1.0
    gradPred = np.dot(dscore, outputVectors)
    grad = np.outer(dscore, predicted)
    return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    hidden = sigmoid(data.dot(W1) + b1)
    prediction = softmax(hidden.dot(W2) + b2)
    cost = -np.sum(np.log(prediction) * labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta = prediction - labels
    gradW2 = hidden.T.dot(delta)
    gradb2 = np.sum(delta, axis=0)
    hidden_delta = delta.dot(W2.T) * sigmoid_grad(hidden)
    gradW1 = data.T.dot(hidden_delta)
    gradb1 = np.sum(hidden_delta, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    # data: N x Dx, W1: Dx x H, b1: 1 x H
    a = data.dot(W1) + b1
    h = sigmoid(a)
    # h: N x H, W2: H x Dy, b2: 1 x Dy
    t = h.dot(W2) + b2
    y_hat = softmax(t)
    # y_hat: N x Dy, labels: N x Dy (as int)
    probs = labels * y_hat
    cost = np.sum(-np.log(probs.sum(axis=1)))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    # obtain the softmax gradient
    dJdt = (y_hat - labels)              # N x Dy
    # b2 grad is sum along each index of the Dy vectors
    gradb2 = np.sum(dJdt, 0)
    # h: N x H, dJdt: N x Dy
    gradW2 = h.T.dot(dJdt)               # H x Dy
    # dJdt: N x Dy, W2: H x Dy
    dJdh = dJdt.dot(W2.T)
    # h: N x H
    dhda = sigmoid_grad(h)
    # data: N x Dx, dhda: N x H, dJdh: N x H
    gradW1 = data.T.dot(dhda * dJdh)
    # dhda: N x H, dJdh: N x H
    gradb1 = np.sum(dhda * dJdh, 0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
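# Hedged sanity check for forward_backward_prop above, in the spirit of the
# assignment's gradient checker: random data, random one-hot labels, and
# gradcheck_naive (the checker referenced in your_sanity_checks further below)
# run over the packed parameter vector. The sizes chosen here are assumptions
# for illustration only.
def _check_forward_backward_prop():
    np.random.seed(1)
    N = 20
    dimensions = [10, 5, 10]
    data = np.random.randn(N, dimensions[0])
    labels = np.zeros((N, dimensions[2]))
    labels[np.arange(N), np.random.randint(0, dimensions[2], size=N)] = 1
    # (Dx + 1) * H parameters in the first layer, (H + 1) * Dy in the second
    n_params = (dimensions[0] + 1) * dimensions[1] + (dimensions[1] + 1) * dimensions[2]
    params = np.random.randn(n_params)
    gradcheck_naive(lambda p: forward_backward_prop(data, labels, p, dimensions), params)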
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """
    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.
    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.
    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #   vector
    # - grad: the gradient with respect to all the other word
    #   vectors
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    '''
    Keep track of dims:
        D - dim of word vector
        V - number of words

        predicted     : (D, )
        target        : integer
        outputVectors : (V, D)

        cost     : float
        gradPred : (D, )
        grad     : (V, D)
    '''
    predicted = predicted.reshape(-1, 1)
    scores = outputVectors.dot(predicted)   # (V, 1)
    probs = softmax(scores.T)               # (1, V)
    targetProb = probs[0, target]
    cost = -np.log(targetProb)

    scores_exp = np.exp(scores)             # (V, 1)
    scores_exp_sum = np.sum(scores_exp)     # float
    gradPred = - outputVectors[target, :] + \
        np.sum(scores_exp * outputVectors, axis=0) / scores_exp_sum   # (D, )
    grad = scores_exp.dot(predicted.T) / scores_exp_sum               # (V, D)
    grad[target, :] -= predicted.reshape(-1)
    ### END YOUR CODE

    return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    # data : N * Dx
    # W1   : Dx * H
    # b1   : 1 * H
    # W2   : H * Dy
    # b2   : 1 * Dy
    N = data.shape[0]
    z1 = data.dot(W1) + b1
    a1 = sigmoid(z1)      # N * H
    z2 = a1.dot(W2) + b2
    a2 = softmax(z2)      # N * Dy
    cost = np.sum(-np.log(a2[labels == 1])) / N
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta_score = a2 - labels                 # 1 * Dy
    delta_score /= N
    gradW2 = np.dot(a1.T, delta_score)        # H * 1 * 1 * Dy = H * Dy
    gradb2 = np.sum(delta_score, axis=0)
    grad_h = np.dot(delta_score, W2.T)        # 1 * Dy * Dy * H = 1 * H
    grad_h = sigmoid_grad(a1) * grad_h
    gradW1 = np.dot(data.T, grad_h)
    gradb1 = np.sum(grad_h, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """
    ### YOUR CODE HERE
    #raise NotImplementedError
    v_c = predicted
    U = outputVectors
    N = U.shape[0]
    #print v_c.shape, U.shape
    theta = np.zeros(N)
    for i in range(N):
        theta[i] = np.dot(U[i], v_c)
    y_hat = softmax(theta)
    #print y_hat.shape
    cost = -np.log(y_hat[target])

    gradPred = -U[target]
    for i in range(N):
        gradPred += U[i] * y_hat[i]

    grad = np.zeros((N, len(v_c)))
    for i in range(N):
        if i == target:
            grad[i] = (y_hat[i] - 1) * v_c
        else:
            grad[i] = y_hat[i] * v_c
    #print grad.shape, gradPred.shape
    ### END YOUR CODE

    return cost, gradPred, grad
def softmaxRegression(features, labels, weights, regularization=0.0,
                      nopredictions=False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.
    # Inputs:
    # - features: feature vectors, each row is a feature vector
    # - labels: labels corresponding to the feature vectors
    # - weights: weights of the regressor
    # - regularization: L2 regularization constant
    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the regressor cost with respect to its
    #   weights
    # - pred: label predictions of the regressor (you might find
    #   np.argmax helpful)

    prob = softmax(features.dot(weights))
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of 1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights ** 2)

    ### YOUR CODE HERE: compute the gradients and predictions
    # NOTE - N is the batch size
    # features is an N x M matrix, M being # features
    # weights is an M X K matrix, K being # classes
    # prob is an N x K matrix (batchSize x classes)
    # labels is a 1-hot (row) vector

    # Get delta, an N x K matrix with CE error signal
    # z = XW, where X = features and W = weights
    # dJ/dz
    delta = np.array(prob)
    delta[range(N), labels] -= 1.

    # dz/dW = 1/N * X * delta
    # dJ/dW = dJ/dz * dz/dW
    grad = features.T.dot(delta) / N
    grad += regularization * weights

    if N > 1:
        pred = np.argmax(prob, axis=1)
    else:
        pred = np.argmax(prob)
    ### END YOUR CODE

    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred
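# A hedged spot check for softmaxRegression above: fix random features and labels,
# wrap the function so the weights are the only free variable, and run it through
# gradcheck_naive. The shapes and the regularization constant are arbitrary choices
# made for illustration.
def _check_softmaxRegression():
    np.random.seed(2)
    N, D, C = 10, 6, 4
    features = np.random.randn(N, D)
    labels = np.random.randint(0, C, size=N)
    weights = np.random.randn(D, C)
    gradcheck_naive(
        lambda w: softmaxRegression(features, labels, w,
                                    regularization=1.0, nopredictions=True),
        weights)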
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    fc_out = np.dot(data, W1) + b1                 # shape (M, H)
    fc_sigmoid_out = sigmoid(fc_out)               # shape (M, H)
    scores = np.dot(fc_sigmoid_out, W2) + b2       # shape (M, Dy)
    y_hat = softmax(scores)                        # shape (M, Dy)
    # M = data.shape[0]
    cost = -np.sum(labels * np.log(y_hat))  # / M
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dscores = y_hat - labels  # / M                # shape (M, Dy)
    gradW2 = np.dot(fc_sigmoid_out.T, dscores)     # shape (H, Dy)
    gradb2 = np.sum(dscores, axis=0)               # shape (Dy,)
    dfc_sigmoid_out = np.dot(dscores, W2.T)        # shape (M, H)
    dfc_out = dfc_sigmoid_out * sigmoid_grad(fc_sigmoid_out)  # shape (M, H)
    gradW1 = np.dot(data.T, dfc_out)               # shape (Dx, H)
    gradb1 = np.sum(dfc_out, axis=0)               # shape (H,)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """
    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.
    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.
    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #   vector
    # - grad: the gradient with respect to all the other word
    #   vectors
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    N, D = outputVectors.shape
    score = softmax(np.sum(outputVectors * predicted, axis=1))
    cost = -np.log(score[target])
    gradPred = np.sum(outputVectors * score.reshape((N, 1)), axis=0) - outputVectors[target]
    new_score = score.copy()
    new_score[target] -= 1
    grad = np.dot(new_score.reshape((N, 1)), predicted.reshape((1, D)))

    # # (2C,D) * (1,D) -> (2C,D), element-wise!, sum -> (2C,)
    # y_hat = softmax(np.sum(outputVectors * predicted, axis=1, keepdims=True))
    # y = np.zeros([len(y_hat), 1])
    # y[target] = 1
    # cost = -np.log(y_hat[target])
    # delta = y_hat - y  # (2C,)
    # gradPred = np.sum(outputVectors * delta, axis=0)
    # gradPred = delta.T.dot(outputVectors).reshape((-1,))  # (,2C) x (2C,D) -> (1,D)
    # # gradPred = np.sum(outputVectors * y_hat, axis=0)
    # grad = delta.dot(np.reshape(predicted, (-1,1)))  # (2C,) * (1,D) -> (2C,D)
    ### END YOUR CODE

    return cost, gradPred, grad
def softmaxRegression(features, labels, weights, regularization=0.0,
                      nopredictions=False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.
    # Inputs:
    # - features: feature vectors, each row is a feature vector (N * D)
    # - labels: labels corresponding to the feature vectors (N,)
    # - weights: weights of the regressor (D * C)
    # - regularization: L2 regularization constant
    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the regressor cost with respect to its
    #   weights
    # - pred: label predictions of the regressor (you might find
    #   np.argmax helpful)

    # calculate the scores
    # scores shape (N, C)
    dot1 = features.dot(weights)
    prob = softmax(dot1)
    # print "prob shape %s" % (prob.shape, )
    # print "weights shape %s" % (weights.shape, )
    # print "features shape %s" % (features.shape, )
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of 1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights ** 2)

    ### YOUR CODE HERE: compute the gradients and predictions
    ddot = prob.copy()
    ddot[range(N), labels] -= 1
    ddot /= N
    # dot1 = features.dot(weights)
    # weights shape D * C
    # feature shape N, D
    # dot shape N * C
    dweights = features.T.dot(ddot)
    grad = dweights
    grad += (regularization * weights)
    ### END YOUR CODE

    pred = np.argmax(prob, axis=1)
    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred
def your_sanity_checks():
    """
    Use this space to add any additional sanity checks by running:
        python q2_gradcheck.py
    This function will not be called by the autograder, nor will
    your additional tests be graded.
    """
    print "Running your sanity checks..."

    print "checking softmax_loss_grad"
    gradcheck_naive(softmax_loss_grad, np.array(123.456))      # scalar test
    gradcheck_naive(softmax_loss_grad, np.random.randn(3,))    # 1-D test
    gradcheck_naive(softmax_loss_grad, np.random.randn(4, 5))  # 2-D test

    print "checking sigmoid_loss_grad"
    gradcheck_naive(sigmoid_loss_grad, np.array(123.456))      # scalar test
    gradcheck_naive(sigmoid_loss_grad, np.random.randn(3,))    # 1-D test
    gradcheck_naive(sigmoid_loss_grad, np.random.randn(4, 5))  # 2-D test

    print "checking cross_category_loss_grad"
    gradcheck_naive(lambda x: cross_category_loss_grad(x, np.array(134.1)),
                    np.array(123.456))                         # scalar test
    l1 = softmax(np.random.randn(3,))
    l2 = softmax(np.random.randn(4, 5))
    gradcheck_naive(lambda x: cross_category_loss_grad(x, l1),
                    softmax(np.random.randn(3,)))              # 1-D test
    gradcheck_naive(lambda x: cross_category_loss_grad(x, l2),
                    softmax(np.random.randn(4, 5)))            # 2-D test

    print "checking score_to_loss_grad"
    l1 = softmax(np.random.randn(3,))
    l2 = softmax(np.random.randn(4, 5))
    gradcheck_naive(lambda x: score_to_loss_grad(x, l1), np.random.randn(3,))    # 1-D test
    gradcheck_naive(lambda x: score_to_loss_grad(x, l2), np.random.randn(4, 5))  # 2-D test
def softmaxRegression(features, labels, weights, regularization=0.0,
                      nopredictions=False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.
    # Inputs:
    # - features: feature vectors, each row is a feature vector
    # - labels: labels corresponding to the feature vectors
    # - weights: weights of the regressor
    # - regularization: L2 regularization constant
    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the regressor cost with respect to its
    #   weights
    # - pred: label predictions of the regressor (you might find
    #   np.argmax helpful)

    '''
    Keep track of dims:
        N - number of sentences
        D - size of sentence feature
        C - number of classes

        features : (N, D)
        weights  : (D, C)
        labels   : (N, )

        grad : (D, C)
        pred : (N, )
    '''
    prob = softmax(features.dot(weights))  # (N, C)
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of 1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights ** 2)

    ### YOUR CODE HERE: compute the gradients and predictions
    pred = np.argmax(prob, axis=1)
    dscores = prob
    dscores[range(N), labels] -= 1
    dscores /= N
    grad = features.T.dot(dscores) + regularization * weights
    ### END YOUR CODE

    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors
    """
    # word number v
    v = outputVectors.shape[0]
    d = predicted.shape[0]
    v_c = predicted

    # calculate y_hat = softmax(U.T @ v_c); the result is still a numpy array
    product = np.dot(outputVectors, v_c)
    y_hat = softmax(product)

    # cost = -log(soft_max[target])
    cost = -np.log(y_hat[target])

    # gradPred = U (y_hat - y)
    # grad = v_c @ (y_hat - y).T, and take the transpose for row vectors
    y_gap = y_hat
    y_gap[target] -= 1.0
    gradPred = np.dot(outputVectors.T, y_gap)
    grad = np.outer(y_gap, v_c)

    return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    p1 = np.dot(data, W1) + b1
    h = sigmoid(p1)                 # (M, H)
    p2 = np.dot(h, W2) + b2
    y_pred = softmax(p2)            # (M, Dy)
    cost = np.mean(np.sum(-1 * np.multiply(labels, np.log(y_pred)), axis=1))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    gradp2 = (y_pred - labels) / np.shape(data)[0]      # (M, Dy)
    gradW2 = np.dot(h.T, gradp2)                        # (H, Dy)
    gradb2 = np.sum(gradp2, axis=0).reshape((1, -1))    # (1, Dy)
    gradh = np.dot(gradp2, W2.T)                        # (M, H)
    gradp1 = np.multiply(gradh, h * (1 - h))            # (M, H), element-wise multiplication
    gradW1 = np.dot(data.T, gradp1)                     # (Dx, H)
    gradb1 = np.sum(gradp1, axis=0).reshape((1, -1))    # (1, H)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### forward propagation
    N = data.shape[0]
    l1 = data.dot(W1) + b1
    h = sigmoid(l1)
    l2 = h.dot(W2) + b2
    y_hat = softmax(l2)
    cost = -np.sum(labels * np.log(y_hat)) / N  # cross entropy

    ### backward propagation
    dl2 = y_hat - labels
    dW2 = np.dot(h.T, dl2)
    db2 = np.sum(dl2, axis=0)
    dh = np.dot(dl2, W2.T)
    dl1 = dh * sigmoid_grad(h)
    dW1 = np.dot(data.T, dl1)
    db1 = np.sum(dl1, axis=0)

    gradW2 = dW2 / N
    gradb2 = db2 / N
    gradW1 = dW1 / N
    gradb1 = db1 / N

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(np.dot(data, W1) + b1)
    yhat = softmax(np.dot(h, W2) + b2)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    cost = np.sum(-np.log(yhat[labels == 1])) / data.shape[0]
    d3 = (yhat - labels) / data.shape[0]
    gradW2 = np.dot(h.T, d3)
    gradb2 = np.sum(d3, 0, keepdims=True)
    dh = np.dot(d3, W2.T)
    grad_h = sigmoid_grad(h) * dh
    gradW1 = np.dot(data.T, grad_h)
    gradb1 = np.sum(grad_h, 0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxRegression(features, labels, weights, regularization=0.0,
                      nopredictions=False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.
    # Inputs:
    # - features: feature vectors, each row is a feature vector
    # - labels: labels corresponding to the feature vectors
    # - weights: weights of the regressor
    # - regularization: L2 regularization constant
    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the regressor cost with respect to its
    #   weights
    # - pred: label predictions of the regressor (you might find
    #   np.argmax helpful)

    prob = softmax(features.dot(weights))
    if len(features.shape) > 1:
        N = features.shape[0]
        C = weights.shape[1]
    else:
        N = 1
        C = weights.shape[0]
    #print "C", C
    # A vectorized implementation of 1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights ** 2) / N
    #print "cost: ", cost
    #print "weights: ", weights

    y = np.zeros((N, C))
    y[range(N), labels] = 1
    grad = features.T.dot(prob - y) / N + weights * regularization / N
    #print "y: ", y
    #print "prob: ", prob.shape
    #print "features: ", features.shape
    #print "labels: ", labels
    #print "W:", weights.shape

    if nopredictions:
        return cost, grad
    else:
        pred = np.argmax(prob, axis=1)
        return cost, grad, pred
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    N = data.shape[0]
    Z1 = data.dot(W1) + b1        # (N, H)
    A1 = sigmoid(Z1)              # (N, H)
    scores = A1.dot(W2) + b2      # (N, Dy)
    probs = softmax(scores)       # (N, Dy)
    cost = -np.sum(np.log(probs[labels == 1])) / N
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dscores = (probs - labels) / N
    dW2 = A1.T.dot(dscores)
    db2 = np.sum(dscores, axis=0)
    dA1 = dscores.dot(W2.T)
    dZ1 = sigmoid_grad(A1) * dA1
    dW1 = data.T.dot(dZ1)
    db1 = np.sum(dZ1, axis=0)

    gradW1 = dW1
    gradW2 = dW2
    gradb1 = db1
    gradb2 = db2
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    N, D = data.shape
    h = sigmoid(data.dot(W1) + b1)
    scores = softmax(h.dot(W2) + b2)
    cost = np.sum(-np.log(scores[labels == 1])) / N
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dscores = scores - labels  # good
    dscores /= N

    gradb2 = np.sum(dscores, axis=0)
    gradW2 = np.dot(h.T, dscores)

    grad_h = np.dot(dscores, W2.T)
    grad_h = sigmoid_grad(h) * grad_h

    gradb1 = np.sum(grad_h, axis=0)
    gradW1 = np.dot(data.T, grad_h)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h_raw = np.dot(data, W1) + b1                   # N x H
    h = sigmoid(h_raw)                              # N x H
    pred_raw = np.dot(h, W2) + b2                   # N x Dy
    pred = softmax(pred_raw)                        # N x Dy
    cost = -np.sum(np.log(pred[labels == 1.0]))     # scalar
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    grad_pred_raw = pred - labels                   # N x Dy
    gradW2 = np.dot(h.T, grad_pred_raw)             # H x Dy
    gradb2 = np.sum(grad_pred_raw, axis=0)          # 1 x Dy
    grad_h = np.dot(grad_pred_raw, W2.T)            # N x H
    grad_h_raw = grad_h * h * (1 - h)               # N x H
    gradW1 = np.dot(data.T, grad_h_raw)             # Dx x H
    gradb1 = np.sum(grad_h_raw, axis=0)             # 1 x H
    grad_data = np.dot(grad_h_raw, W1.T)            # N x Dx
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxRegression(features, labels, weights, regularization=0.0,
                      nopredictions=False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.
    # Inputs:
    # - features: feature vectors, each row is a feature vector
    # - labels: labels corresponding to the feature vectors
    # - weights: weights of the regressor
    # - regularization: L2 regularization constant
    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the regressor cost with respect to its
    #   weights
    # - pred: label predictions of the regressor (you might find
    #   np.argmax helpful)

    prob = softmax(features.dot(weights))
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of 1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights ** 2)

    ### YOUR CODE HERE: compute the gradients and predictions
    if N == 1:
        x = features.dot(weights)[np.newaxis]
    else:
        x = features.dot(weights)
    pred = np.argmax(prob, axis=1)
    y = labels
    probs = np.exp(x - np.max(x, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    grad = features.T.dot(dx) + regularization * weights
    ### END YOUR CODE

    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """
    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.
    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.
    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #   vector
    # - grad: the gradient with respect to all the other word
    #   vectors
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    N = outputVectors.shape[0]  # n_words: vocab size
    y = np.zeros(N)
    y[target] = 1                                   # (n_words)

    score = np.dot(predicted, outputVectors.T)      # (1, n_words)
    out = softmax(score)
    cost = np.sum(-y * np.log(out))

    dout = out - y                                  # (1, n_words)
    gradPred = np.dot(dout, outputVectors)          # (1, dim_embed)
    grad = np.dot(dout.T, predicted)                # (n_words, dim_embed)
    ### END YOUR CODE

    return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    h = sigmoid(np.dot(data, W1) + b1)
    y_hat = softmax(np.dot(h, W2) + b2)
    cost = -np.dot(labels.flatten(), np.log(y_hat).flatten())

    gradb2 = y_hat - labels
    gradW2 = np.dot(h.T, gradb2)
    gradb1 = np.dot(gradb2, W2.T) * sigmoid_grad(h)
    gradW1 = np.dot(data.T, gradb1)
    gradb2 = gradb2.sum(axis=0)
    gradb1 = gradb1.sum(axis=0)

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxRegression(features, labels, weights, regularization=0.0,
                      nopredictions=False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.
    # Inputs:
    # - features: feature vectors, each row is a feature vector
    # - labels: labels corresponding to the feature vectors
    # - weights: weights of the regressor
    # - regularization: L2 regularization constant
    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the regressor cost with respect to its
    #   weights
    # - pred: label predictions of the regressor (you might find
    #   np.argmax helpful)

    prob = softmax(features.dot(weights))
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of 1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights**2)

    ### YOUR CODE HERE: compute the gradients and predictions
    D = weights.shape[1]
    delta = prob - np.eye(D)[labels]
    grad = (np.dot(features.T, delta) / N) + regularization * weights
    pred = np.argmax(prob, axis=1) if N > 1 else np.argmax(prob)
    ### END YOUR CODE

    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### forward propagation
    z1 = np.dot(data, W1) + b1    # Shape = (20,5)
    h = sigmoid(z1)               # Shape = (20,5)
    z2 = np.dot(h, W2) + b2       # Shape = (20,10)
    yHat = softmax(z2)
    cost = -np.sum(np.multiply(labels, np.log(yHat)))

    ### YOUR CODE HERE: backward propagation
    gradSigma = sigmoid_grad(h)
    delta1 = yHat - labels
    delta2 = delta1.dot(W2.T)
    delta3 = np.multiply(delta2, gradSigma)

    gradW2 = np.dot(delta1.T, h).T
    gradb2 = np.sum(delta1, axis=0)
    gradW1 = np.dot(delta3.T, data).T
    gradb1 = np.sum(delta3, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """
    ### YOUR CODE HERE
    #raise NotImplementedError
    v_hat = np.dot(outputVectors, predicted)
    y_hat = softmax(v_hat)
    cost = -np.log(y_hat[target])

    y_hat[target] -= 1.0  # subtracting the correct class
    gradPred = np.dot(np.transpose(outputVectors), y_hat)
    #grad = np.dot(y_hat, np.transpose(predicted))
    grad = np.outer(y_hat, predicted)
    ### END YOUR CODE

    return cost, gradPred, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """
    ### YOUR CODE HERE
    # calculate the predictions
    vhat = predicted
    z = np.dot(outputVectors, vhat)
    preds = softmax(z)

    # calculate the cost
    cost = -np.log(preds[target])

    # Gradients
    z = preds.copy()
    z[target] -= 1.0

    grad = np.outer(z, vhat)               # WxD
    gradPred = np.dot(outputVectors.T, z)  # Dx1
    # raise NotImplementedError
    ### END YOUR CODE

    return cost, gradPred, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """
    ### YOUR CODE HERE
    z = np.sum(np.multiply(outputVectors, predicted), axis=1)
    # equivalently: z = np.dot(outputVectors, predicted)
    y_h = softmax(z)
    cost = -np.log(y_h[target])

    y_h_copy = y_h.copy()
    y_h_copy[target] -= 1
    gradPred = np.dot(outputVectors.T, y_h_copy)
    # outer product of the error signal with the predicted vector: (V,) x (D,) -> (V, D)
    grad = np.outer(y_h_copy, predicted)
    ### END YOUR CODE

    return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    ### cost http://tinyurl.com/jblb265
    hidden = sigmoid(np.dot(data, W1) + b1)
    prediction = softmax(np.dot(hidden, W2) + b2)
    cost = -np.sum(np.log(prediction) * labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    ### reference : http://neuralnetworksanddeeplearning.com/chap2.html
    delta = prediction - labels
    gradW2 = np.dot(hidden.T, delta)
    gradb2 = np.sum(delta, axis=0)
    delta = delta.dot(W2.T) * sigmoid_grad(hidden)
    gradW1 = data.T.dot(delta)
    gradb1 = np.sum(delta, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(data, W1) + b1
    h = sigmoid(z1)
    z2 = np.dot(h, W2) + b2
    preds = softmax(z2)
    cost = -np.sum(labels * np.log(preds))

    ### YOUR CODE HERE: backward propagation
    # Calculate dcost/dz2
    dcost_dz2 = preds - labels
    gradW2 = np.dot(h.T, dcost_dz2)
    gradb2 = np.sum(dcost_dz2, axis=0)
    dcost_dz1 = np.multiply(np.dot(dcost_dz2, W2.T), sigmoid_grad(h))
    gradW1 = np.dot(data.T, dcost_dz1)
    gradb1 = np.sum(dcost_dz1, axis=0)

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """
    ### YOUR CODE HERE
    # outputVectors shape WxD
    # predicted Dx1
    y_ = softmax(np.dot(outputVectors, predicted).flatten())
    cost = -np.log(y_[target])

    delta = y_.reshape(-1, 1)   # delta Wx1
    delta[target] -= 1
    gradPred = outputVectors.T.dot(delta)  # DxW
    grad = predicted.dot(delta.T).T
    ### END YOUR CODE

    return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(np.dot(data, W1) + b1)
    y = softmax(np.dot(h, W2) + b2)
    cost = -np.sum(labels * np.log(y))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    grady = (y - labels)
    gradW2 = np.dot(h.T, grady)
    gradb2 = np.sum(grady, axis=0)
    gradh = np.dot(grady, W2.T)
    gradz1 = gradh * h * (1 - h)
    gradW1 = np.dot(data.T, gradz1)
    gradb1 = np.sum(gradz1, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """
    ### YOUR CODE HERE
    # https://courses.cs.ut.ee/MTAT.03.277/2015_fall/uploads/Main/word2vec.pdf
    # http://mccormickml.com/assets/word2vec/Alex_Minnaar_Word2Vec_Tutorial_Part_I_The_Skip-Gram_Model.pdf

    # compute the probability of each vector in outputVectors
    scores = softmax(np.dot(outputVectors, predicted.reshape(-1, 1)).reshape(-1))
    # compute the log loss
    cost = -np.log(scores[target])

    # gradient
    grad_L = scores
    grad_L[target] -= 1.0
    # gradient with respect to the context (predicted) word vector
    gradPred = np.dot(scores.reshape(1, -1), outputVectors)
    # gradient with respect to all vocabulary embeddings
    grad = np.dot(scores.reshape(-1, 1), predicted.reshape(1, -1))

    return cost, gradPred, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """
    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.
    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.
    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #   vector
    # - grad: the gradient with respect to all the other word
    #   vectors
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    # probabilities of the outside words given the center word
    probabilities = softmax(predicted.dot(outputVectors.T))
    # cross-entropy error for the specified target outside word
    cost = -np.log(probabilities[target])

    delta = probabilities        # 1*5
    delta[target] -= 1
    N = delta.shape[0]           # 5
    D = predicted.shape[0]       # 3
    # gradient for the output word vectors, 5*3
    grad = delta.reshape((N, 1)) * predicted.reshape((1, D))
    # gradient for the center (predicted) word vector, 1*3
    gradPred = (delta.reshape((1, N)).dot(outputVectors)).flatten()
    ### END YOUR CODE

    return cost, gradPred, grad
def softmaxRegression(features, labels, weights, regularization=0.0,
                      nopredictions=False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.
    # Inputs:
    # - features: feature vectors, each row is a feature vector
    # - labels: labels corresponding to the feature vectors
    # - weights: weights of the regressor
    # - regularization: L2 regularization constant
    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the regressor cost with respect to its
    #   weights
    # - pred: label predictions of the regressor (you might find
    #   np.argmax helpful)

    prob = softmax(features.dot(weights))
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of 1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights**2)

    dz = np.zeros(prob.shape)
    dz += prob
    dz[range(N), labels] -= 1.
    dw = np.dot(features.T, dz) / N
    dw += regularization * weights

    if nopredictions:
        return cost, dw
    else:
        pred = np.argmax(prob, axis=1)  # class labels
        return cost, dw, pred
def forward_backward_prop(data, labels, params, dimensions):
    """ Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    hidden_out = sigmoid(np.matmul(data, W1) + b1)
    output = softmax(np.matmul(hidden_out, W2) + b2)
    cost = np.sum(-labels * np.log(output)) / data.shape[0]
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    grad_output = (output - labels) / data.shape[0]
    gradW2 = np.dot(hidden_out.transpose(), grad_output)
    gradb2 = np.sum(grad_output, axis=0)
    grad_hidden = np.dot(grad_output, W2.transpose())
    grad_hidden = grad_hidden * hidden_out * (1 - hidden_out)
    gradW1 = np.dot(data.transpose(), grad_hidden)
    gradb1 = np.sum(grad_hidden, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
                vector
    grad -- the gradient with respect to all the other word
            vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    ### YOUR CODE HERE
    vhat = predicted
    U = outputVectors
    z = np.dot(U, vhat)
    yhat = softmax(z)

    cost = -np.log(yhat[target])

    dz = yhat
    dz[target] -= 1
    gradPred = np.dot(U.T, dz)
    grad = dz.reshape(dz.shape[0], 1) * vhat.reshape(1, vhat.shape[0])
    ### END YOUR CODE

    return cost, gradPred, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.
    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.
    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #   vector
    # - grad: the gradient with respect to all the other word
    #   vectors
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    assert predicted.shape[-1] == outputVectors.shape[-1]
    scalar_products = np.sum(outputVectors * predicted, axis=1)
    # apply the softmax function
    yhat = softmax(scalar_products)
    # compute cost
    cost = -np.log(yhat[target])
    # gradPred
    gradPred = np.sum(outputVectors * yhat[:, np.newaxis], axis=0) - outputVectors[target]
    # grad
    grad = yhat[:, np.newaxis] * predicted[np.newaxis, :]
    grad[target] = grad[target] - predicted
    ### END YOUR CODE

    return cost, gradPred, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset,
                           indices=None):
    """ Softmax cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.
    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.
    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #   vector
    # - grad: the gradient with respect to all the other word
    #   vectors
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    W, D = outputVectors.shape
    y = np.zeros(W)
    y[target] = 1.0
    theta = np.dot(outputVectors, predicted)  # (W,D), (D,) -> (W,)
    y_hat = softmax(theta)                    # (W,)
    cost = -np.sum(y * np.log(y_hat))
    gradPred = np.dot(y_hat - y, outputVectors)  # dJ/dV_c, (D,)
    grad = np.outer(y_hat - y, predicted)        # dJ/dU, (W, D), U: outputVectors
    ### END YOUR CODE

    return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    N = data.shape[0]
    h = sigmoid(np.dot(data, W1) + np.tile(b1, (N, 1)))   # (N, H)
    y = softmax(np.dot(h, W2) + np.tile(b2, (N, 1)))      # (N, Dy)
    cost = -np.sum(labels * np.log(y))                    # float
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    d2 = (y - labels)                          # (N, Dy)
    d1 = np.dot(d2, W2.T) * sigmoid_grad(h)    # (N, H)
    gradW2 = np.dot(h.T, d2)                   # (H, Dy)
    gradW1 = np.dot(data.T, d1)                # (Dx, H)
    gradb2 = np.sum(d2, axis=0)                # (Dy,)
    gradb1 = np.sum(d1, axis=0)                # (H,)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxRegression(features, labels, weights, regularization=0.0,
                      nopredictions=False):
    """ Softmax Regression """

    # Implement softmax regression with weight regularization.
    # Inputs:
    # - features: feature vectors, each row is a feature vector
    # - labels: labels corresponding to the feature vectors
    # - weights: weights of the regressor
    # - regularization: L2 regularization constant
    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the regressor cost with respect to its
    #   weights
    # - pred: label predictions of the regressor (you might find
    #   np.argmax helpful)

    cost = 0.0
    prob = softmax(features.dot(weights))  # (N,D).dot(D,C)
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of 1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights ** 2)

    ### YOUR CODE HERE: compute the gradients and predictions
    pred = np.argmax(prob, axis=1)
    # gradient
    dscore = prob
    dscore[np.arange(N), labels] -= 1
    dscore /= N  # the cost is divided by N (average loss), so the gradient is too
    grad = features.T.dot(dscore)
    grad += regularization * weights
    ### END YOUR CODE

    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred
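As a usage sketch (not part of the assignment code), softmaxRegression can be exercised on random data to confirm the returned shapes; the dimensions, seed and regularization constant below are arbitrary illustration values.

# Toy call of softmaxRegression; shapes and regularization are arbitrary.
import numpy as np

np.random.seed(2)
N, D, C = 8, 4, 3                        # examples, feature dim, classes
features = np.random.randn(N, D)
labels = np.random.randint(0, C, N)
weights = 0.01 * np.random.randn(D, C)

cost, grad, pred = softmaxRegression(features, labels, weights,
                                     regularization=0.1)
print(cost)          # scalar: average cross-entropy plus L2 penalty
print(grad.shape)    # (4, 3) -- same shape as weights
print(pred.shape)    # (8,)   -- one predicted class per example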
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
                vector
    grad -- the gradient with respect to all the other word
            vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    ### YOUR CODE HERE
    probs = softmax(predicted.dot(outputVectors.T))
    cost = -np.log(probs[target])

    grad_pred = probs
    grad_pred[target] -= 1

    grad = grad_pred[:, np.newaxis] * predicted[np.newaxis, :]
    gradPred = grad_pred.dot(outputVectors)
    ### END YOUR CODE

    return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))   # (Dx, H)
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))         # (1, H)
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))   # (H, Dy)
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))       # (1, Dy)

    ### YOUR CODE HERE: forward propagation (using notations of Lecture 5)
    # x = z1 = a1
    z2 = np.dot(data, W1) + b1     # (N, H)
    a2 = sigmoid(z2)               # (N, H)
    z3 = np.dot(a2, W2) + b2       # (N, Dy)
    a3 = softmax(z3)               # (N, Dy)
    S = -np.sum(np.log(np.sum(a3 * labels, axis=1)))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta3 = a3 - labels                                # (N, Dy)
    gradW2 = np.dot(a2.T, delta3)                       # (H, Dy)
    delta2 = sigmoid_grad(a2) * np.dot(delta3, W2.T)    # (N, H)
    gradW1 = np.dot(data.T, delta2)                     # (Dx, H)
    gradb2 = np.sum(delta3, axis=0)
    gradb1 = np.sum(delta2, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    return S, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
                vector
    grad -- the gradient with respect to all the other word
            vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    ### YOUR CODE HERE
    n, d = outputVectors.shape
    predicted = np.reshape(predicted, (1, d))
    y = np.zeros((n, 1))
    y[target] = 1
    y_hat = softmax(np.dot(outputVectors, predicted.T).reshape((n, ))).reshape((n, 1))
    cost = -np.sum(y * np.log(y_hat))
    # print 'this_cost: ', y_hat, cost
    gradPred = np.reshape(np.dot((y_hat - y).T, outputVectors), (d, ))
    grad = np.dot((y_hat - y), predicted)
    ### END YOUR CODE

    return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(data.dot(W1) + b1)
    y_pred = softmax(h.dot(W2) + b2)
    # print y_pred
    cost = -np.sum(labels * np.log(y_pred))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    grad_ce = y_pred - labels
    gradW2 = h.T.dot(grad_ce)
    gradb2 = np.sum(grad_ce, axis=0)
    xx = grad_ce.dot(W2.T) * sigmoid_grad(h)
    gradW1 = data.T.dot(xx)
    gradb1 = np.sum(xx, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.
    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.
    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #   vector
    # - grad: the gradient with respect to all the other word
    #   vectors
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    # print outputVectors.shape
    prob_of_each = softmax(np.dot(outputVectors, predicted))
    # print prob_of_each
    cost = -np.log(prob_of_each[target])
    # print prob_of_each.shape
    prob_of_each[target] -= 1
    gradPred = np.dot(outputVectors.T,
                      prob_of_each.reshape(prob_of_each.shape[0], 1)).flatten()
    grad = np.dot(prob_of_each.reshape(prob_of_each.shape[0], 1),
                  predicted.reshape(1, predicted.shape[0]))
    ### END YOUR CODE

    return cost, gradPred, grad
def add_prediction_op(self):
    """Adds the core transformation for this model which transforms a batch of
    input data into a batch of predictions. In this case, the transformation is
    a linear layer plus a softmax transformation:

    y = softmax(xW + b)

    Hint: Make sure to create tf.Variable as needed.
    Hint: For this simple use-case, it's sufficient to initialize both weights W
          and biases b with zeros.

    Returns:
        pred: A tensor of shape (batch_size, n_classes)
    """
    W = tf.Variable(tf.zeros([self.config.n_features, self.config.n_classes]))
    # One bias per class; broadcasting adds it to every row of the batch.
    b = tf.Variable(tf.zeros([self.config.n_classes]))
    pred = softmax(tf.matmul(self.input_placeholder, W) + b)
    return pred
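A per-class bias of shape (n_classes,) is sufficient here because broadcasting adds it to every row of the (batch_size, n_classes) score matrix; the tiny numpy sketch below (arbitrary shapes) illustrates the same rule TensorFlow applies.

# Broadcasting demo: the (3,) bias is added to each of the 4 rows.
import numpy as np

scores = np.zeros((4, 3))          # (batch_size, n_classes), arbitrary
b = np.array([0.1, 0.2, 0.3])      # (n_classes,)
print(scores + b)                  # every row becomes [0.1, 0.2, 0.3]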
def forward_test(data, labels, params, dimensions):
    """ Forward pass only; returns the number of misclassified examples. """
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
    # print(params)
    # print(params[ofs:ofs + Dx * H])
    # print((Dx, H))
    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(np.dot(data, W1) + b1)
    yHat = softmax(np.dot(h, W2) + b2)
    # Count how many predicted classes disagree with the labels.
    cost = np.count_nonzero(np.argmax(yHat, axis=1) - np.argmax(labels, axis=1))
    return cost
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
                vector
    grad -- the gradient with respect to all the other word
            vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    prob = softmax(np.matmul(outputVectors, predicted))
    cost = -np.log(prob[target])
    # the gradient for V_c
    gradPred = np.sum(outputVectors * prob.reshape(-1, 1), axis=0) - outputVectors[target]
    # gradients for U (not only row o; every output vector receives a gradient)
    grad = np.tile(predicted, (outputVectors.shape[0], 1)) * prob.reshape(-1, 1)
    grad[target] -= predicted

    return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    N, D = data.shape
    hidden = sigmoid(data.dot(W1) + b1)
    prediction = softmax(hidden.dot(W2) + b2)
    cost = (-1) * np.sum(np.log(prediction) * labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta = (prediction - labels)
    gradW2 = hidden.T.dot(delta)
    gradb2 = np.sum(delta, axis=0, keepdims=True)
    hidden_delta = delta.dot(W2.T) * sigmoid_grad(hidden)
    gradW1 = data.T.dot(hidden_delta)
    gradb1 = np.sum(hidden_delta, axis=0, keepdims=True)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
                vector
    grad -- the gradient with respect to all the other word
            vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    predicted = np.expand_dims(predicted, axis=1)
    softy = softmax(np.dot(outputVectors, predicted), 0)

    ### YOUR CODE HERE
    onehotmatrix = np.eye(len(outputVectors), len(outputVectors))
    cost = -np.log(softy[target])
    # gradPred = np.sum(np.dot(outputVectors, predicted), axis=0) - outputVectors[target]
    gradPred = np.dot((softy - np.expand_dims(onehotmatrix[target].T, axis=1)).T,
                      outputVectors)
    grad = np.dot((softy - np.expand_dims(onehotmatrix[target].T, axis=1)),
                  predicted.T)
    gradPred = np.squeeze(gradPred)
    grad = np.squeeze(grad)
    ### END YOUR CODE

    return cost, gradPred, grad
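For reference, every softmaxCostAndGradient variant above computes the same closed-form quantities; only the reshaping differs. With U the matrix of output vectors (rows u_w), v_c the predicted vector, y the one-hot target at index o, and \hat{y} = \mathrm{softmax}(U v_c):

J(v_c, U) = -\log \hat{y}_o
          = -\log \frac{\exp(u_o^\top v_c)}{\sum_{w} \exp(u_w^\top v_c)},
\qquad
\frac{\partial J}{\partial v_c} = U^\top (\hat{y} - y),
\qquad
\frac{\partial J}{\partial U} = (\hat{y} - y)\, v_c^\top .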