def test_softmax_linearity_rowwise(dim_1, dim_2):
	shift = np.random.uniform(low=-100,high=100,size=(dim_1,1))
	#print(shift)
	a1    = np.random.normal(size=(dim_1,dim_2))
	a2    = a1 + shift
	assert rel_error(np.max(a2 - a1), np.max(shift)) < 1e-8
	assert rel_error(softmax(a1),softmax(a2)) < 1e-8
def test_softmax_permutation_axis1(dim_1):
	a1          = np.random.normal(size=(1,dim_1))
	s1          = softmax(a1)

	permutation = np.random.permutation(dim_1)
	inverse_permutation = np.argsort(permutation)

	s1_perm     = softmax(a1.ravel()[permutation])
	assert rel_error(s1_perm.ravel()[inverse_permutation], s1) <= 1e-8
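The two tests above exercise row-wise shift invariance and permutation equivariance of a softmax helper, but softmax and rel_error themselves are not part of this listing. A minimal sketch consistent with what the tests check (row-wise, numerically stable softmax and a maximum relative-error metric) could look like this; the exact graded helpers may differ.

import numpy as np

def softmax(x):
    # Row-wise, numerically stable softmax: subtracting the per-row max does not
    # change the result (the invariance tested above) but keeps exp() bounded.
    orig_shape = x.shape
    x = np.atleast_2d(x)
    x = x - np.max(x, axis=1, keepdims=True)
    out = np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)
    return out.reshape(orig_shape)

def rel_error(x, y):
    # Maximum relative error between two arrays.
    return np.max(np.abs(x - y) / np.maximum(1e-8, np.abs(x) + np.abs(y)))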
Example 3
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """                                               
    
    # Inputs:                                                         
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in 
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word               
    # - outputVectors: "output" vectors (as rows) for all tokens     
    # - dataset: needed for negative sampling, unused here.         
    
    # Outputs:                                                        
    # - cost: cross entropy cost for the softmax word prediction    
    # - gradPred: the gradient with respect to the predicted word   
    #        vector                                                
    # - grad: the gradient with respect to all the other word        
    #        vectors                                                                                            
    
    y = np.zeros((outputVectors.shape[0],))
    y[target] = 1.0
    
    y_hat = softmax(np.dot(outputVectors, predicted))
    cost = -np.dot(y, np.log(y_hat))
    
    gradPred = -outputVectors[target,:] + np.dot(outputVectors.T, y_hat)
    grad = np.outer(y_hat - y, predicted)
    
    return cost, gradPred, grad
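A centered finite-difference check is a quick way to sanity-check gradPred from an implementation like the one above. The sketch below is illustrative only: check_gradPred and the toy data are not part of the assignment, and it assumes softmax and softmaxCostAndGradient are defined as in this listing.

import numpy as np

def check_gradPred(f, predicted, eps=1e-6):
    # f maps a predicted vector to (cost, gradPred, grad); compare gradPred
    # against centered finite differences of the cost.
    _, gradPred, _ = f(predicted)
    numgrad = np.zeros_like(predicted)
    for i in range(predicted.size):
        step = np.zeros_like(predicted)
        step[i] = eps
        numgrad[i] = (f(predicted + step)[0] - f(predicted - step)[0]) / (2 * eps)
    return np.max(np.abs(numgrad - gradPred))

rng = np.random.RandomState(0)
outputVectors = rng.randn(5, 3)   # toy "output" vectors: V=5 words, D=3 dims
predicted = rng.randn(3)
err = check_gradPred(
    lambda v: softmaxCostAndGradient(v, 2, outputVectors, None), predicted)
print("max |numerical - analytic| on gradPred:", err)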
Example 4
  def add_model(self, input_data):
    """Adds a linear-layer plus a softmax transformation

    The core transformation for this model which transforms a batch of input
    data into a batch of predictions. In this case, the mathematical
    transformation effected is

    y = softmax(xW + b)

    Hint: Make sure to create tf.Variables as needed. Also, make sure to use
          tf.name_scope to ensure that your name spaces are clean.
    Hint: For this simple use-case, it's sufficient to initialize both weights W
          and biases b with zeros.

    Args:
      input_data: A tensor of shape (batch_size, n_features).
    Returns:
      out: A tensor of shape (batch_size, n_classes)
    """
    ### YOUR CODE HERE
    # W = tf.Variable(tf.zeros((self.config.n_features, self.config.n_classes)), name="weights")
    # b = tf.Variable(tf.zeros((self.config.n_classes, )), name="biases")
    
    with tf.variable_scope('softmax'):
        W = tf.get_variable("weights", (self.config.n_features, self.config.n_classes),
                            initializer=tf.constant_initializer(0.0))
        b = tf.get_variable("bias", (self.config.n_classes,),
                            initializer=tf.constant_initializer(0.0))
    
    out = softmax(tf.matmul(input_data, W) + b)
    ### END YOUR CODE
    return out
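For context, the sketch below shows how a layer like the one above might be wired up in TF 1.x-style graph code. The Config fields and the standalone placeholder are stand-ins for the assignment's model class, and tf.nn.softmax stands in for the assignment's own softmax op.

import tensorflow as tf  # TF 1.x graph-mode API, matching the snippets in this listing

class Config(object):
    n_features, n_classes = 100, 5

config = Config()
input_placeholder = tf.placeholder(tf.float32, shape=(None, config.n_features))
with tf.variable_scope('softmax'):
    W = tf.get_variable("weights", (config.n_features, config.n_classes),
                        initializer=tf.constant_initializer(0.0))
    b = tf.get_variable("bias", (config.n_classes,),
                        initializer=tf.constant_initializer(0.0))
out = tf.nn.softmax(tf.matmul(input_placeholder, W) + b)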
Example 5
def softmaxRegression(features, labels, weights, regularization = 0.0, nopredictions = False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.        
    
    # Inputs:                                                         
    # - features: feature vectors, each row is a feature vector     
    # - labels: labels corresponding to the feature vectors         
    # - weights: weights of the regressor                           
    # - regularization: L2 regularization constant                  
    
    # Output:                                                         
    # - cost: cost of the regressor                                 
    # - grad: gradient of the regressor cost with respect to its    
    #        weights                                               
    # - pred: label predictions of the regressor (you might find    
    #        np.argmax helpful)  
    
    prob = softmax(features.dot(weights))
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of    1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N 
    cost += 0.5 * regularization * np.sum(weights ** 2)
    
    ### YOUR CODE HERE: compute the gradients and predictions
    # dJ/dz for softmax + cross entropy is prob - one_hot(labels), averaged over N
    dscores = prob.copy()
    dscores[range(N), labels] -= 1.0
    grad = features.T.dot(dscores) / N + regularization * weights
    pred = np.argmax(prob, axis=1)
    ### END YOUR CODE
    
    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred
  def add_model(self, input_data):
    """Adds a linear-layer plus a softmax transformation

    The core transformation for this model which transforms a batch of input
    data into a batch of predictions. In this case, the mathematical
    transformation effected is

    y = softmax(xW + b)

    Hint: Make sure to create tf.Variables as needed. Also, make sure to use
          tf.name_scope to ensure that your name spaces are clean.
    Hint: For this simple use-case, it's sufficient to initialize both weights W
          and biases b with zeros.

    Args:
      input_data: A tensor of shape (batch_size, n_features).
    Returns:
      out: A tensor of shape (batch_size, n_classes)
    """
    ### YOUR CODE HERE
    self.W = tf.Variable(tf.zeros([self.config.n_features, self.config.n_classes], dtype=tf.float32), name="weight")
    self.b = tf.Variable(tf.zeros([self.config.n_classes], dtype=tf.float32), name="bias")
    out = softmax(tf.matmul(input_data, self.W) + self.b)

    ### END YOUR CODE
    return out
Example 7
  def add_model(self, input_data):
    """Adds a linear-layer plus a softmax transformation

    The core transformation for this model which transforms a batch of input
    data into a batch of predictions. In this case, the mathematical
    transformation effected is

    y = softmax(xW + b)

    Hint: Make sure to create tf.Variables as needed. Also, make sure to use
          tf.name_scope to ensure that your name spaces are clean.
    Hint: For this simple use-case, it's sufficient to initialize both weights W
          and biases b with zeros.

    Args:
      input_data: A tensor of shape (batch_size, n_features).
    Returns:
      out: A tensor of shape (batch_size, n_classes)
    """
    ### YOUR CODE HERE
    with tf.variable_scope("model"):
        W = tf.get_variable("W", shape=[self.config.n_features, self.config.n_classes], initializer=tf.random_normal_initializer(0.5, 0.1))
        # W = tf.Variable(tf.random_normal(shape=[self.config.n_features, self.config.n_classes], dtype=tf.float32, name="weights"))
        b = tf.get_variable("b", shape=[self.config.n_classes], initializer=tf.constant_initializer(0.0))
        affine_transformation = tf.matmul(input_data, W) + b
        #tf.constant_initializer(value)
        #tf.random_uniform_initializer(a,b)
        # b = tf.Variable(tf.zeros(shape=[1,self.config.n_classes], dtype=tf.float32), name="bias")
        # affine_transformation = tf.add(tf.matmul(W, self.input_placeholder), b, name="affine")
    out = softmax(affine_transformation)
    ### END YOUR CODE
    return out
  def add_model(self, input_data):
    """Adds a linear-layer plus a softmax transformation

    The core transformation for this model which transforms a batch of input
    data into a batch of predictions. In this case, the mathematical
    transformation effected is

    y = softmax(xW + b)

    Hint: Make sure to create tf.Variables as needed. Also, make sure to use
          tf.name_scope to ensure that your name spaces are clean.
    Hint: For this simple use-case, it's sufficient to initialize both weights W
          and biases b with zeros.

    Args:
      input_data: A tensor of shape (batch_size, n_features).
    Returns:
      out: A tensor of shape (batch_size, n_classes)
    """
    ### YOUR CODE HERE
    n_features, n_classes = self.config.n_features, self.config.n_classes
    with tf.name_scope('softmax_linear'):
      weights = tf.Variable(
          tf.zeros([n_features, n_classes]),
          name='weights')
      biases = tf.Variable(tf.zeros([n_classes]),
                           name='biases')
      logits = tf.matmul(input_data, weights) + biases
      out = softmax(logits)
    ### END YOUR CODE
    return out
  def add_model(self, input_data):
    """Adds a linear-layer plus a softmax transformation

    The core transformation for this model which transforms a batch of input
    data into a batch of predictions. In this case, the mathematical
    transformation effected is

    y = softmax(xW + b)

    Hint: Make sure to create tf.Variables as needed. Also, make sure to use
          tf.name_scope to ensure that your name spaces are clean.
    Hint: For this simple use-case, it's sufficient to initialize both weights W
          and biases b with zeros.

    Args:
      input_data: A tensor of shape (batch_size, n_features).
    Returns:
      out: A tensor of shape (batch_size, n_classes)
    """

    # Create a variable.
    self.w = tf.Variable(tf.zeros([self.config.n_features, self.config.n_classes]), name = "w")
    self.b = tf.Variable(tf.zeros([self.config.n_classes]), name = "b")
    out = softmax(tf.matmul(input_data, self.w) + self.b)

    #w_hist = tf.histogram_summary("w", self.w)

    return out
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.

    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.

    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #        vector
    # - grad: the gradient with respect to all the other word
    #        vectors

    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!
    prods = np.dot(outputVectors,predicted.T) # 1xV
    probs = softmax(prods) # 1xV
    cost = -np.log(probs[target]) # 1x1

    dscore = probs
    dscore[target] -= 1.0
    gradPred = np.dot(dscore,outputVectors)
    grad = np.outer(dscore,predicted)
    return cost, gradPred, grad
Example 11
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    hidden = sigmoid(data.dot(W1) + b1)
    prediction = softmax(hidden.dot(W2) + b2)
    cost = -np.sum(np.log(prediction) * labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta = prediction - labels
    gradW2 = hidden.T.dot(delta)
    gradb2 = np.sum(delta, axis = 0)
    hidden_delta = delta.dot(W2.T) * sigmoid_grad(hidden)
    gradW1 = data.T.dot(hidden_delta)
    gradb1 = np.sum(hidden_delta, axis = 0)
    ### END YOUR CODE
    
    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad
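The flattened params vector above is laid out as W1, b1, W2, b2 in that order. A gradient-check sketch along the lines of the assignment's sanity check might look like the following; gradcheck_naive is assumed to be available from the assignment, and the dimensions here are arbitrary toy values.

import numpy as np

dimensions = [10, 5, 10]                      # (Dx, H, Dy)
N = 20
data = np.random.randn(N, dimensions[0])
labels = np.zeros((N, dimensions[2]))
labels[np.arange(N), np.random.randint(0, dimensions[2], size=N)] = 1   # one-hot rows

n_params = (dimensions[0] + 1) * dimensions[1] + (dimensions[1] + 1) * dimensions[2]
params = np.random.randn(n_params)            # W1, b1, W2, b2, flattened in that order

gradcheck_naive(lambda p: forward_backward_prop(data, labels, p, dimensions), params)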
Example 12
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    # data: N x Dx, W1: Dx x H, b: 1 x H 
    a = data.dot(W1) + b1
    h = sigmoid(a)
    # h: N x H, W2: H x Dy, b2: 1 x Dy
    t = h.dot(W2) + b2
    y_hat = softmax(t)
    # y_hat: N x Dy, labels: N x Dy (as int)
    probs = labels * y_hat
    cost = np.sum(-np.log(probs.sum(axis=1)))
    ### END YOUR CODE
    
    ### YOUR CODE HERE: backward propagation
    # obtain the softmax gradient
    dJdt = (y_hat - labels) # N x Dy

    # b2 grad is sum along each index of the Dy vectors
    gradb2 = np.sum(dJdt, 0) 

    # h: N x H, dJdt: N x Dy
    gradW2 = h.T.dot(dJdt) # H x Dy

    # dJdt: N x Dy, W2: H x Dy
    dJdh = dJdt.dot(W2.T)
    # h: N x H
    dhda = sigmoid_grad(h)

    # data: N x Dx, dhda: N x H, DJdh: N x H
    gradW1 = data.T.dot(dhda * dJdh)
    
    # dhda: N x H, DJdh: N x H
    gradb1 = np.sum(dhda * dJdh, 0)
    ### END YOUR CODE
    
    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Example 13
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """
    
    # Implement the cost and gradients for one predicted word vector  
    # and one target word vector as a building block for word2vec     
    # models, assuming the softmax prediction function and cross      
    # entropy loss.                                                   
    
    # Inputs:                                                         
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in 
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word               
    # - outputVectors: "output" vectors (as rows) for all tokens     
    # - dataset: needed for negative sampling, unused here.         
    
    # Outputs:                                                        
    # - cost: cross entropy cost for the softmax word prediction    
    # - gradPred: the gradient with respect to the predicted word   
    #        vector                                                
    # - grad: the gradient with respect to all the other word        
    #        vectors                                               
    
    # We will not provide starter code for this function, but feel    
    # free to reference the code you previously wrote for this        
    # assignment!                                                  
    
    ### YOUR CODE HERE
    '''
    Keep track of dims:
    
    D - dim of word vector
    V - number of words
    
    predicted     :  (D, )
    target        :  integer
    outputVectors :  (V, D)
    
    cost          :  float
    gradPred      :  (D, )
    grad          :  (V, D)
    '''
    predicted = predicted.reshape(-1, 1)
    
    scores = outputVectors.dot(predicted)       # (V, 1)
    probs = softmax(scores.T)                   # (1, V)
    targetProb = probs[0, target]
    cost = -np.log(targetProb)
    
    scores_exp = np.exp(scores)                 # (V, 1)
    scores_exp_sum = np.sum(scores_exp)         # float
    
    gradPred = - outputVectors[target, :] + np.sum(scores_exp * outputVectors, axis=0) / scores_exp_sum    # (D, )
    
    grad = scores_exp.dot(predicted.T) / scores_exp_sum    # (V, D)
    grad[target, :] -= predicted.reshape(-1)
    ### END YOUR CODE
    
    return cost, gradPred, grad
Example 14
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    # data : N * Dx
    # W1   : Dx * H
    # b1   : 1 * H
    # W2   : H * Dy
    # b2   : 1 * Dy
    N = data.shape[0]

    z1 = data.dot(W1) + b1
    a1 = sigmoid(z1)  # N * H
    z2 = a1.dot(W2) + b2
    a2 = softmax(z2)  # N * Dy

    cost = np.sum(-np.log(a2[labels == 1])) / N

    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta_score = a2 - labels  # N * Dy
    delta_score /= N

    gradW2 = np.dot(a1.T, delta_score)  # (H * N) @ (N * Dy) = H * Dy
    gradb2 = np.sum(delta_score, axis=0)

    grad_h = np.dot(delta_score, W2.T)  # (N * Dy) @ (Dy * H) = N * H
    grad_h = sigmoid_grad(a1) * grad_h

    gradW1 = np.dot(data.T, grad_h)
    gradb1 = np.sum(grad_h, axis=0)

    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Example 15
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    ### YOUR CODE HERE
    #raise NotImplementedError
    v_c = predicted
    U = outputVectors
    N = U.shape[0] 
    #print v_c.shape, U.shape
    theta = np.zeros(N)
    for i in range(N):
        theta[i] = np.dot(U[i], v_c)

    y_hat = softmax(theta)
    #print y_hat.shape
    cost = -np.log(y_hat[target])

    gradPred = -U[target] 
    for i in range(N):
        gradPred += U[i]*y_hat[i]

    grad = np.zeros((N, len(v_c)))

    for i in range(N):
        if i == target:
            grad[i] = (y_hat[i] - 1)*v_c
        else:
            grad[i] = y_hat[i]*v_c

    #print grad.shape, gradPred.shape
    ### END YOUR CODE

    return cost, gradPred, grad
Example 16
def softmaxRegression(features, labels, weights, regularization = 0.0, nopredictions = False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.        
    
    # Inputs:                                                         
    # - features: feature vectors, each row is a feature vector     
    # - labels: labels corresponding to the feature vectors         
    # - weights: weights of the regressor                           
    # - regularization: L2 regularization constant                  
    
    # Output:                                                         
    # - cost: cost of the regressor                                 
    # - grad: gradient of the regressor cost with respect to its    
    #        weights                                               
    # - pred: label predictions of the regressor (you might find    
    #        np.argmax helpful)  
    
    prob = softmax(features.dot(weights))
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of    1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N 
    cost += 0.5 * regularization * np.sum(weights ** 2)
    
    ### YOUR CODE HERE: compute the gradients and predictions
    # NOTE - N is the batch size
    # features is an N x M matrix, M being # features
    # weights is an M X K matrix, K being # classes
    # prob is an N x K matrix (batchSize x classes)
    # labels is a length-N vector of integer class indices

    # Get delta, an N x K matrix with CE error signal
    # z = XW, where X = features and W = weights
    # dJ/dz
    delta = np.array(prob)
    delta[range(N), labels] -= 1.

    # dz/dW = 1/N * X * delta
    # dJ/dW = dJ/dz * dz/dW
    grad = features.T.dot(delta) / N

    grad += regularization * weights

    if N > 1:
        pred = np.argmax(prob, axis=1)
    else:
        pred = np.argmax(prob)
    ### END YOUR CODE
    
    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred
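As with the other pieces, a softmaxRegression implementation like the one above can be checked with gradcheck_naive on the weights while the features and labels stay fixed. The toy shapes and dummy_* names below are illustrative, and gradcheck_naive is assumed to be available from the assignment.

import numpy as np

rng = np.random.RandomState(0)
dummy_features = rng.randn(10, 4)            # N=10 examples, 4 features
dummy_labels = rng.randint(0, 3, size=10)    # 3 classes, integer labels
dummy_weights = 0.1 * rng.randn(4, 3)

gradcheck_naive(
    lambda w: softmaxRegression(dummy_features, dummy_labels, w, 1.0, nopredictions=True),
    dummy_weights)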
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    fc_out = np.dot(data, W1) + b1  # shape (M, H)
    fc_sigmoid_out = sigmoid(fc_out)  # shape (M, H)
    scores = np.dot(fc_sigmoid_out, W2) + b2  # shape (M, Dy)
    y_hat = softmax(scores)  # shape (M, Dy)
    # M = data.shape[0]
    cost = -np.sum(labels * np.log(y_hat))  # / M
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dscores = y_hat - labels  # / M  # shape (M, Dy)

    gradW2 = np.dot(fc_sigmoid_out.T, dscores)  # shape (H, Dy)
    gradb2 = np.sum(dscores, axis=0)  # shape (Dy,)
    dfc_sigmoid_out = np.dot(dscores, W2.T)  # shape (M, H)
    dfc_out = dfc_sigmoid_out * sigmoid_grad(fc_sigmoid_out)  # shape (M, H)

    gradW1 = np.dot(data.T, dfc_out)  # shape (Dx, H)
    gradb1 = np.sum(dfc_out, axis=0)  # shape (H,)

    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
	""" Softmax cost function for word2vec models """
	
	# Implement the cost and gradients for one predicted word vector  
	# and one target word vector as a building block for word2vec     
	# models, assuming the softmax prediction function and cross      
	# entropy loss.                                                   
	
	# Inputs:                                                         
	# - predicted: numpy ndarray, predicted word vector (\hat{v} in 
	#   the written component or \hat{r} in an earlier version)
	# - target: integer, the index of the target word               
	# - outputVectors: "output" vectors (as rows) for all tokens     
	# - dataset: needed for negative sampling, unused here.         
	
	# Outputs:                                                        
	# - cost: cross entropy cost for the softmax word prediction    
	# - gradPred: the gradient with respect to the predicted word   
	#        vector                                                
	# - grad: the gradient with respect to all the other word        
	#        vectors                                               
	
	# We will not provide starter code for this function, but feel    
	# free to reference the code you previously wrote for this        
	# assignment!                                                  
	
	### YOUR CODE HERE

	N,D = outputVectors.shape

	score = softmax(np.sum(outputVectors * predicted,axis = 1))
	cost = -np.log(score[target])
	
	gradPred = np.sum(outputVectors * score.reshape((N,1)),axis = 0) - outputVectors[target]

	new_score = score.copy()
	new_score[target] -= 1

	grad = np.dot(new_score.reshape((N,1)),predicted.reshape((1,D)))

	# # (2C,D) * (1,D) -> (2C,D), element-wise!, sum -> (2C,)
	# y_hat = softmax(np.sum(outputVectors * predicted, axis=1, keepdims=True))
	# y = np.zeros([len(y_hat) ,1])
	# y[target] = 1
	# cost = -np.log(y_hat[target])

	# delta = y_hat - y # (2C,)
	# gradPred = np.sum(outputVectors * delta, axis=0)
	# gradPred = delta.T.dot(outputVectors).reshape((-1,)) # (,2C) x (2C,D) -> (1,D)
	# # gradPred = np.sum(outputVectors * y_hat, axis=0)
	# grad = delta.dot(np.reshape(predicted, (-1,1) ) )# (2C,) * (1,D) -> (2C,D)
	### END YOUR CODE
	
	return cost, gradPred, grad
Example 19
def softmaxRegression(features, labels, weights, regularization = 0.0, nopredictions = False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.

    # Inputs:
    # - features: feature vectors, each row is a feature vector (N * D)
    # - labels: labels corresponding to the feature vectors (N,)
    # - weights: weights of the regressor (D * C)
    # - regularization: L2 regularization constant

    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the regressor cost with respect to its
    #        weights
    # - pred: label predictions of the regressor (you might find
    #        np.argmax helpful)


    # calculate the scores
    # scores shape (N, C)
    dot1 = features.dot(weights)
    prob = softmax(dot1)
    # print "prob shape %s" % (prob.shape, )
    # print "weights shape %s" % (weights.shape, )
    # print "features shape %s" % (features.shape, )

    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of    1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights ** 2)

    ### YOUR CODE HERE: compute the gradients and predictions
    ddot = prob.copy()
    ddot[range(N), labels] -= 1
    ddot /= N

    # dot1 = features.dot(weights)
    # weights shape D* C
    # feature shape N, D
    # dot shape N * C
    dweights = features.T.dot(ddot)
    grad = dweights
    grad += (regularization * weights)

    ### END YOUR CODE
    pred = np.argmax(prob, axis=1)
    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred
Example 20
def your_sanity_checks(): 
    """
    Use this space add any additional sanity checks by running:
        python q2_gradcheck.py 
    This function will not be called by the autograder, nor will
    your additional tests be graded.
    """
    print "Running your sanity checks..."

    print "checking softmax_loss_grad"
    gradcheck_naive(softmax_loss_grad, np.array(123.456))      # scalar test
    gradcheck_naive(softmax_loss_grad, np.random.randn(3,))    # 1-D test
    gradcheck_naive(softmax_loss_grad, np.random.randn(4,5))   # 2-D test

    print "checking sigmoid_loss_grad"
    gradcheck_naive(sigmoid_loss_grad, np.array(123.456))      # scalar test
    gradcheck_naive(sigmoid_loss_grad, np.random.randn(3,))    # 1-D test
    gradcheck_naive(sigmoid_loss_grad, np.random.randn(4,5))   # 2-D test

    print "checking cross_category_loss_grad"
    gradcheck_naive(lambda x: cross_category_loss_grad(x, np.array(134.1)), np.array(123.456))      # scalar test

    l1 = softmax(np.random.randn(3,))
    l2 = softmax(np.random.randn(4, 5))
    gradcheck_naive(lambda x: cross_category_loss_grad(x, l1), softmax(np.random.randn(3,)))    # 1-D test
    gradcheck_naive(lambda x: cross_category_loss_grad(x, l2), softmax(np.random.randn(4, 5)))    # 2-D test


    print "checking score_to_loss_grad"

    l1 = softmax(np.random.randn(3,))
    l2 = softmax(np.random.randn(4, 5))
    gradcheck_naive(lambda x: score_to_loss_grad(x, l1), np.random.randn(3,))    # 1-D test
    gradcheck_naive(lambda x: score_to_loss_grad(x, l2), np.random.randn(4, 5))    # 2-D test
Example 21
def softmaxRegression(features, labels, weights, regularization = 0.0, nopredictions = False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.        
    
    # Inputs:                                                         
    # - features: feature vectors, each row is a feature vector     
    # - labels: labels corresponding to the feature vectors         
    # - weights: weights of the regressor                           
    # - regularization: L2 regularization constant                  
    
    # Output:                                                         
    # - cost: cost of the regressor                                 
    # - grad: gradient of the regressor cost with respect to its    
    #        weights                                               
    # - pred: label predictions of the regressor (you might find    
    #        np.argmax helpful)
    
    '''
    Keep track of dims:
    
    N - number of sentences
    D - size of sentence feature
    C - number of clases
    
    features :  (N, D)
    weights  :  (D, C)
    labels   :  (N, )
    grad     :  (D, C)
    pred     :  (N, )
    '''
    
    prob = softmax(features.dot(weights))   # (N, C)
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of    1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N 
    cost += 0.5 * regularization * np.sum(weights ** 2)
    
    ### YOUR CODE HERE: compute the gradients and predictions
    pred = np.argmax(prob, axis=1)
    dscores = prob
    dscores[range(N), labels] -= 1
    dscores /= N
    grad = features.T.dot(dscores) + regularization * weights
    ### END YOUR CODE
    
    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred
Example 22
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    """
    """ word number v """



    v = outputVectors.shape[0]
    d = predicted.shape[0]

    v_c = predicted

    """  calculated y_hat  = softmax (U.T @ v_c)
    the result is still a numpy array """
    product = np.dot(outputVectors, v_c)
    y_hat = softmax(product)

    """ cost = - log (soft_max[target]) """

    cost = - np.log(y_hat[target])

    """ gradPred = U (y_hat - y)  grad = v_c @ (y_hat - y).T
        and take the transpose for row vectors """

    y_gap = y_hat
    y_gap[target] -= 1.0

    gradPred = np.dot(outputVectors.T, y_gap)
    grad = np.outer(y_gap, v_c)

    return cost, gradPred, grad
Example 23
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    
    ### YOUR CODE HERE: forward propagation
    p1 = np.dot(data, W1) + b1
    h = sigmoid(p1) #(M,H)
    p2 = np.dot(h, W2) + b2
    y_pred = softmax(p2) #(M, Dy)
    cost = np.mean(np.sum(-1 * np.multiply(labels, np.log(y_pred)), axis=1))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    gradp2 = (y_pred - labels) / np.shape(data)[0] #(M,Dy)
    gradW2 = np.dot(h.T, gradp2) #(H, Dy) 
    gradb2 = np.sum(gradp2, axis=0).reshape((1,-1)) #(1, Dy)
    gradh = np.dot(gradp2, W2.T) #(M,H)
    gradp1 = np.multiply(gradh, h * (1 - h)) #(M,H) element wise multiplication
    gradW1 = np.dot(data.T, gradp1) # (Dx,H)
    gradb1 =  np.sum(gradp1, axis=0).reshape((1,-1)) #(1,H)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Example 24
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### forward propagation
    N = data.shape[0]

    l1 = data.dot(W1) + b1
    h = sigmoid(l1)
    l2 = h.dot(W2) + b2
    y_hat = softmax(l2)

    cost = -np.sum(labels * np.log(y_hat)) / N # cross entropy
    
    ### backward propagation
    dl2 = y_hat - labels
    dW2 = np.dot(h.T, dl2)
    db2 = np.sum(dl2, axis=0)

    dh = np.dot(dl2, W2.T)

    dl1 = dh * sigmoid_grad(h)
    dW1 = np.dot(data.T, dl1)
    db1 = np.sum(dl1, axis=0)

    gradW2 = dW2/N
    gradb2 = db2/N
    gradW1 = dW1/N
    gradb1 = db1/N
    
    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad
Example 25
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(np.dot(data,W1) + b1)
    yhat = softmax(np.dot(h,W2) + b2)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    cost = np.sum(-np.log(yhat[labels==1])) / data.shape[0]

    d3 = (yhat - labels) / data.shape[0]
    gradW2 = np.dot(h.T, d3)
    gradb2 = np.sum(d3,0,keepdims=True)

    dh = np.dot(d3,W2.T)
    grad_h = sigmoid_grad(h) * dh

    gradW1 = np.dot(data.T,grad_h)
    gradb1 = np.sum(grad_h,0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxRegression(features, labels, weights, regularization = 0.0, nopredictions = False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.

    # Inputs:
    # - features: feature vectors, each row is a feature vector
    # - labels: labels corresponding to the feature vectors
    # - weights: weights of the regressor
    # - regularization: L2 regularization constant

    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the regressor cost with respect to its
    #        weights
    # - pred: label predictions of the regressor (you might find
    #        np.argmax helpful)

    prob = softmax(features.dot(weights))
    if len(features.shape) > 1:
        N = features.shape[0]
        C = weights.shape[1]
    else:
        N = 1
        C = weights.shape[0]

    #print "C", C

    # A vectorized implementation of    1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2

    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights ** 2) / N

    #print "cost: ", cost
    #print "weights: ", weights

    y = np.zeros((N, C))
    y[range(N), labels] = 1
    grad = features.T.dot(prob - y) / N + weights * regularization / N

    #print "y: ", y
    #print "prob: ", prob.shape
    #print "features: ", features.shape
    #print "labels: ", labels
    #print "W:", weights.shape

    if nopredictions:
        return cost, grad
    else:
        pred = np.argmax(prob, axis=1)
        return cost, grad, pred
Example 27
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    N = data.shape[0]
    Z1 = data.dot(W1) + b1     # (N, H)
    A1 = sigmoid(Z1)           # (N, H)
    scores = A1.dot(W2) + b2   # (N, Dy)
    probs = softmax(scores)    # (N, Dy)
    cost = -np.sum(np.log(probs[labels==1])) / N
    ### END YOUR CODE
    
    ### YOUR CODE HERE: backward propagation
    dscores = (probs - labels) / N
    dW2 = A1.T.dot(dscores)
    db2 = np.sum(dscores, axis=0)
    dA1 = dscores.dot(W2.T)
    dZ1 = sigmoid_grad(A1) * dA1
    dW1 = data.T.dot(dZ1)
    db1 = np.sum(dZ1, axis=0)
    
    gradW1 = dW1
    gradW2 = dW2
    gradb1 = db1
    gradb2 = db2
    ### END YOUR CODE
    
    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad
Example 28
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    N, D = data.shape
    
    h = sigmoid(data.dot(W1) + b1)
    scores = softmax(h.dot(W2) + b2)
    cost = np.sum(- np.log(scores[labels == 1])) / N
    ### END YOUR CODE
    
    ### YOUR CODE HERE: backward propagation
    dscores = scores - labels  # good
    
    dscores /= N
    
    gradb2 = np.sum(dscores, axis=0)
    gradW2 = np.dot(h.T, dscores)
    
    
    grad_h = np.dot(dscores, W2.T)
    grad_h = sigmoid_grad(h) * grad_h
    
    gradb1 = np.sum(grad_h, axis=0)
    gradW1 = np.dot(data.T, grad_h)
    ### END YOUR CODE
    
    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad
Example 29
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h_raw = np.dot(data, W1) + b1 # N x H
    h = sigmoid(h_raw) # N x H

    pred_raw = np.dot(h, W2) + b2 # N x Dy
    pred = softmax(pred_raw) # N x Dy

    cost = - np.sum(np.log(pred[labels == 1.0])) # scalar
    ### END YOUR CODE
    
    ### YOUR CODE HERE: backward propagation
    grad_pred_raw = pred - labels # N x Dy

    gradW2 = np.dot(h.T, grad_pred_raw) # H x Dy
    gradb2 = np.sum(grad_pred_raw, axis=0) # 1 x Dy
    grad_h = np.dot(grad_pred_raw, W2.T) # N x H

    grad_h_raw = grad_h * h * (1 - h) # N x H

    gradW1 = np.dot(data.T, grad_h_raw) # Dx x H
    gradb1 = np.sum(grad_h_raw, axis=0) # 1 x H
    grad_data = np.dot(grad_h_raw, W1.T) # N x Dx
    ### END YOUR CODE
    
    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad
Example 30
def softmaxRegression(features, labels, weights, regularization = 0.0, nopredictions = False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.        
    
    # Inputs:                                                         
    # - features: feature vectors, each row is a feature vector     
    # - labels: labels corresponding to the feature vectors         
    # - weights: weights of the regressor                           
    # - regularization: L2 regularization constant                  
    
    # Output:                                                         
    # - cost: cost of the regressor                                 
    # - grad: gradient of the regressor cost with respect to its    
    #        weights                                               
    # - pred: label predictions of the regressor (you might find    
    #        np.argmax helpful)  
    
    prob = softmax(features.dot(weights))
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of    1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N 
    cost += 0.5 * regularization * np.sum(weights ** 2)
    
    ### YOUR CODE HERE: compute the gradients and predictions
    if N==1:
        x = features.dot(weights)[np.newaxis]
    else:
        x = features.dot(weights)

    pred = np.argmax(prob, axis=1)
    y = labels
    probs = np.exp(x - np.max(x, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N

    grad = features.T.dot(dx) + regularization*weights

    ### END YOUR CODE
    
    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred
Example 31
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.

    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.

    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #        vector
    # - grad: the gradient with respect to all the other word
    #        vectors

    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    N = outputVectors.shape[0]  # n_words: vocab size
    y = np.zeros(N)
    y[target] = 1  # (n_words)

    score = np.dot(predicted, outputVectors.T)  # (1, n_words)
    out = softmax(score)

    cost = np.sum(-y * np.log(out))

    dout = out - y  # (1, n_words)
    gradPred = np.dot(dout, outputVectors)  # (1, dim_embed)
    grad = np.dot(dout.T, predicted)  # (n_words, dim_embed)

    ### END YOUR CODE

    return cost, gradPred, grad
Example 32
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    h = sigmoid(np.dot(data, W1) + b1)
    y_hat = softmax(np.dot(h, W2) + b2)
    cost = -np.dot(labels.flatten(), np.log(y_hat).flatten())

    gradb2 = y_hat - labels
    gradW2 = np.dot(h.T, gradb2)
    gradb1 = np.dot(gradb2, W2.T) * sigmoid_grad(h)
    gradW1 = np.dot(data.T, gradb1)
    gradb2 = gradb2.sum(axis=0)
    gradb1 = gradb1.sum(axis=0)

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Example 33
def softmaxRegression(features,
                      labels,
                      weights,
                      regularization=0.0,
                      nopredictions=False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.

    # Inputs:
    # - features: feature vectors, each row is a feature vector
    # - labels: labels corresponding to the feature vectors
    # - weights: weights of the regressor
    # - regularization: L2 regularization constant

    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the regressor cost with respect to its
    #        weights
    # - pred: label predictions of the regressor (you might find
    #        np.argmax helpful)

    prob = softmax(features.dot(weights))
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of    1/N * sum(cross_entropy(x_i, y_i)) + 1/2*|w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights**2)

    ### YOUR CODE HERE: compute the gradients and predictions
    C = weights.shape[1]  # number of classes

    delta = prob - np.eye(C)[labels]
    grad = (np.dot(features.T, delta) / N) + regularization * weights

    pred = np.argmax(prob, axis=1) if N > 1 else np.argmax(prob)
    ### END YOUR CODE

    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### forward propagation
    z1 = np.dot(data, W1) + b1  # shape (batch, H)
    h = sigmoid(z1)  # shape (batch, H)
    z2 = np.dot(h, W2) + b2  # shape (batch, Dy)
    yHat = softmax(z2)
    cost = -np.sum(np.multiply(labels, np.log(yHat)))

    ### YOUR CODE HERE: backward propagation
    gradSigma = sigmoid_grad(h)
    delta1 = yHat - labels
    delta2 = delta1.dot(W2.T)
    delta3 = np.multiply(delta2, gradSigma)
    gradW2 = np.dot(delta1.T, h).T
    gradb2 = np.sum(delta1, axis=0)
    gradW1 = np.dot(delta3.T, data).T
    gradb1 = np.sum(delta3, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Example 35
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    ### YOUR CODE HERE
    #raise NotImplementedError
    v_hat = np.dot(outputVectors,predicted)
    y_hat = softmax(v_hat)
    
    cost = -np.log(y_hat[target])
    
    y_hat[target] -= 1.0 #subtracting the correct class
    
    gradPred = np.dot(np.transpose(outputVectors),y_hat)
    
    #grad = np.dot(y_hat,np.transpose(predicted))
    grad = np.outer(y_hat,predicted)
    ### END YOUR CODE

    return cost, gradPred, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    ### YOUR CODE HERE
    # calculate the predictions
    vhat = predicted
    z = np.dot(outputVectors, vhat)
    preds = softmax(z)
    # calculate the cost
    cost = -np.log(preds[target])
    # Gradients
    z = preds.copy()
    z[target] -= 1.0
    grad = np.outer(z, vhat)   # WxD
    gradPred = np.dot(outputVectors.T, z)  # Dx1
    # raise NotImplementedError
    ### END YOUR CODE

    return cost, gradPred, grad
Example 37
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """
    ### YOUR CODE HERE
    z = np.dot(outputVectors, predicted)          # scores, shape (V,)
    y_h = softmax(z)                              # predicted distribution y_hat
    cost = -np.log(y_h[target])

    y_h_copy = y_h.copy()
    y_h_copy[target] -= 1.0                       # y_hat - y
    gradPred = np.dot(outputVectors.T, y_h_copy)  # shape (D,)
    grad = np.outer(y_h_copy, predicted)          # shape (V, D)
    ### END YOUR CODE

    return cost, gradPred, grad
Esempio n. 38
0
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    ### cost http://tinyurl.com/jblb265
    hidden = sigmoid(np.dot(data, W1) + b1)
    prediction = softmax(np.dot(hidden, W2) + b2)
    cost = -np.sum(np.log(prediction) * labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    ### reference : http://neuralnetworksanddeeplearning.com/chap2.html
    delta = prediction - labels
    gradW2 = np.dot(hidden.T, delta)
    gradb2 = np.sum(delta, axis=0)
    delta = delta.dot(W2.T) * sigmoid_grad(hidden)
    gradW1 = data.T.dot(delta)
    gradb1 = np.sum(delta, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
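
Because the gradients are stacked into a single vector at the end, this function is easy to verify numerically: perturb one entry of params at a time and compare against the returned grad. The sketch below mirrors the same forward pass with its own local helpers (`_sigmoid`, `_softmax`, `_cost` are illustrative names, not part of the assignment) so it can run standalone.

import numpy as np

def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def _softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def _cost(params, data, labels, Dx, H, Dy):
    # same unpacking order as forward_backward_prop
    ofs = 0
    W1 = params[ofs:ofs + Dx * H].reshape(Dx, H); ofs += Dx * H
    b1 = params[ofs:ofs + H].reshape(1, H); ofs += H
    W2 = params[ofs:ofs + H * Dy].reshape(H, Dy); ofs += H * Dy
    b2 = params[ofs:ofs + Dy].reshape(1, Dy)
    h = _sigmoid(data.dot(W1) + b1)
    y = _softmax(h.dot(W2) + b2)
    return -np.sum(labels * np.log(y))

np.random.seed(1)
N, Dx, H, Dy = 4, 3, 5, 2
data = np.random.randn(N, Dx)
labels = np.eye(Dy)[np.random.randint(Dy, size=N)]
params = np.random.randn(Dx * H + H + H * Dy + Dy) * 0.1

eps = 1e-6
num_grad = np.zeros_like(params)
for i in range(params.size):
    p_plus, p_minus = params.copy(), params.copy()
    p_plus[i] += eps
    p_minus[i] -= eps
    num_grad[i] = (_cost(p_plus, data, labels, Dx, H, Dy) -
                   _cost(p_minus, data, labels, Dx, H, Dy)) / (2 * eps)
# for a correct implementation, num_grad should match the grad returned by
# forward_backward_prop on the same params/data/labels to within ~1e-6
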
Esempio n. 39
0
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(data, W1) + b1
    h  = sigmoid(z1)
    z2 = np.dot(h, W2) + b2 
    preds = softmax(z2)
    cost = -np.sum(labels * np.log(preds))
    ### YOUR CODE HERE: backward propagation
    # Calculate dcost/dz2
    dcost_dz2 = preds - labels

    gradW2 = np.dot(h.T, dcost_dz2)
    gradb2 = np.sum(dcost_dz2, axis=0)

    dcost_dz1 = np.multiply(np.dot(dcost_dz2, W2.T), sigmoid_grad(h))

    gradW1 = np.dot(data.T, dcost_dz1)
    gradb1 = np.sum(dcost_dz1, axis=0)
    
    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad
Esempio n. 40
0
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    ### YOUR CODE HERE
    #outputVectors shape W,D
    #predicted Dx1
    y_ = softmax(np.dot(outputVectors, predicted).flatten())
    cost = -np.log(y_[target])
    delta = y_.reshape(-1, 1)

    # delta Wx1
    delta[target] -= 1
    gradPred = outputVectors.T.dot(delta)
    #DxW
    grad = predicted.dot(delta.T).T

    ### END YOUR CODE
    return cost, gradPred, grad
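
For the column-vector bookkeeping in this example it can help to trace the shapes once on toy data. The snippet below is a small illustrative run (W=5, D=3 are arbitrary), assuming predicted arrives as a (D, 1) column as the inline comments suggest; with a 1-D (D,) vector, the np.outer form used elsewhere in this file is the safer spelling.

import numpy as np

def _softmax(x):
    # local stable softmax for a 1-D array (illustrative helper)
    e = np.exp(x - np.max(x))
    return e / e.sum()

W, D, target = 5, 3, 1
outputVectors = np.random.randn(W, D)                       # (W, D)
predicted = np.random.randn(D, 1)                           # (D, 1) column vector

y_ = _softmax(np.dot(outputVectors, predicted).flatten())   # (W,)
delta = y_.reshape(-1, 1)                                   # (W, 1)
delta[target] -= 1
gradPred = outputVectors.T.dot(delta)                       # (D, 1)
grad = predicted.dot(delta.T).T                             # (W, D)
print(gradPred.shape, grad.shape)                           # (3, 1) (5, 3)
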
Esempio n. 41
0
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation

    h = sigmoid(np.dot(data, W1) + b1)
    y = softmax(np.dot(h, W2) + b2)
    cost = -np.sum(labels * np.log(y))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    grady = (y - labels)
    gradW2 = np.dot(h.T, grady)
    gradb2 = np.sum(grady, axis=0)
    gradh = np.dot(grady, W2.T)
    gradz1 = gradh * h * (1 - h)
    gradW1 = np.dot(data.T, gradz1)
    gradb1 = np.sum(gradz1, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Esempio n. 42
0
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    ### YOUR CODE HERE
    # https://courses.cs.ut.ee/MTAT.03.277/2015_fall/uploads/Main/word2vec.pdf
    # http://mccormickml.com/assets/word2vec/Alex_Minnaar_Word2Vec_Tutorial_Part_I_The_Skip-Gram_Model.pdf
    # probability of each vector in outputVectors given the predicted vector
    scores = softmax(np.dot(outputVectors, predicted.reshape(-1, 1)).reshape(-1))
    # cross-entropy (log) loss
    cost = -np.log(scores[target])
    # gradient of the loss with respect to the scores: y_hat - y
    scores[target] -= 1.0
    # gradient with respect to the predicted (context) word vector
    gradPred = np.dot(scores.reshape(1, -1), outputVectors)
    # gradient with respect to every embedding in the vocabulary
    grad = np.dot(scores.reshape(-1, 1), predicted.reshape(1, -1))
    ### END YOUR CODE
    return cost, gradPred, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.

    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.

    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #        vector
    # - grad: the gradient with respect to all the other word
    #        vectors

    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    # probability of each outside word given the center word
    probabilities = softmax(predicted.dot(outputVectors.T))
    cost = -np.log(probabilities[target])  # cross-entropy loss for the target outside word
    delta = probabilities  # shape (W,)
    delta[target] -= 1
    N = delta.shape[0]  # W, vocabulary size
    D = predicted.shape[0]  # D, embedding dimension
    grad = delta.reshape((N, 1)) * predicted.reshape((1, D))  # gradient for the output vectors, (W, D)
    gradPred = (delta.reshape((1, N)).dot(outputVectors)).flatten()  # gradient for the predicted (center) vector, (D,)

    ### END YOUR CODE

    return cost, gradPred, grad
Esempio n. 44
0
def softmaxRegression(features,
                      labels,
                      weights,
                      regularization=0.0,
                      nopredictions=False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.

    # Inputs:
    # - features: feature vectors, each row is a feature vector
    # - labels: labels corresponding to the feature vectors
    # - weights: weights of the regressor
    # - regularization: L2 regularization constant

    # Output:
    # - cost: cost of the regressor
    # - grad: gradient of the regressor cost with respect to its
    #        weights
    # - pred: label predictions of the regressor (you might find
    #        np.argmax helpful)

    prob = softmax(features.dot(weights))
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of    1/N * sum(cross_entropy(x_i, y_i)) + regularization/2 * |w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights**2)

    dz = np.zeros(prob.shape)
    dz += prob
    dz[range(N), labels] -= 1.

    dw = np.dot(features.T, dz) / N
    dw += regularization * weights

    if nopredictions:
        return cost, dw
    else:
        pred = np.argmax(prob, axis=1)  # class labels
        return cost, dw, pred
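
One cheap sanity check for this regressor: with weights all zero the predicted distribution is uniform, so the unregularized average cost must equal log(C). The sketch below uses a local `_softmax` helper and arbitrary sizes; it only mirrors the cost line above.

import numpy as np

def _softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

N, D, C = 10, 4, 3
features = np.random.randn(N, D)
labels = np.random.randint(C, size=N)
weights = np.zeros((D, C))

prob = _softmax(features.dot(weights))
cost = np.sum(-np.log(prob[range(N), labels])) / N
# with zero weights every class is equally likely, so the average
# cross-entropy is log(C) and the regularization penalty is zero
assert abs(cost - np.log(C)) < 1e-12
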
Esempio n. 45
0
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    hidden_out = sigmoid(np.matmul(data, W1) + b1)
    output = softmax(np.matmul(hidden_out, W2) + b2)
    cost = np.sum(-labels * np.log(output)) / data.shape[0]
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    grad_output = (output - labels) / data.shape[0]
    gradW2 = np.dot(hidden_out.transpose(), grad_output)
    gradb2 = np.sum(grad_output, axis=0)

    grad_hidden = np.dot(grad_output, W2.transpose())
    grad_hidden = grad_hidden * hidden_out * (1 - hidden_out)
    gradW1 = np.dot(data.transpose(), grad_hidden)
    gradb1 = np.sum(grad_hidden, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
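
Since labels is one-hot in this setup, the masked form np.sum(-labels * np.log(output)) used here is the same as indexing out the log-probability of the true class, which several of the other examples do directly. A small illustration with made-up shapes:

import numpy as np

np.random.seed(0)
N, C = 4, 3
logits = np.random.randn(N, C)
probs = np.exp(logits - logits.max(axis=1, keepdims=True))
probs /= probs.sum(axis=1, keepdims=True)
y_idx = np.random.randint(C, size=N)
labels = np.eye(C)[y_idx]

cost_masked = np.sum(-labels * np.log(probs)) / N
cost_indexed = -np.mean(np.log(probs[np.arange(N), y_idx]))
assert np.isclose(cost_masked, cost_indexed)
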
Esempio n. 46
0
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    ### YOUR CODE HERE
    vhat = predicted
    U = outputVectors
    z = np.dot(U, vhat)
    yhat = softmax(z)

    cost = -np.log(yhat[target])
    dz = yhat
    dz[target] -= 1

    gradPred = np.dot(U.T, dz)
    grad = dz.reshape(dz.shape[0], 1) * vhat.reshape(1, vhat.shape[0])
    ### END YOUR CODE

    return cost, gradPred, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.

    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.

    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #        vector
    # - grad: the gradient with respect to all the other word
    #        vectors

    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    assert predicted.shape[-1] == outputVectors.shape[-1]
    scalar_products = np.sum(outputVectors * predicted, axis=1)
    #implement softmax function
    yhat = softmax(scalar_products)
    #compute cost
    cost = -np.log(yhat[target])
    #gradPred
    gradPred = np.sum(outputVectors * yhat[:, np.newaxis],
                      axis=0) - outputVectors[target]
    #grad
    grad = yhat[:, np.newaxis] * predicted[np.newaxis, :]
    grad[target] = grad[target] - predicted
    ### END YOUR CODE

    return cost, gradPred, grad
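
The np.newaxis broadcasting used for grad here is just an explicit outer product; the two spellings agree, as this tiny check shows (arbitrary sizes, illustrative only).

import numpy as np

W, D = 5, 3
dz = np.random.randn(W)        # stands in for y_hat - y
v = np.random.randn(D)         # stands in for the predicted vector

grad_broadcast = dz[:, np.newaxis] * v[np.newaxis, :]
grad_outer = np.outer(dz, v)
assert np.allclose(grad_broadcast, grad_outer)
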
Esempio n. 48
0
def softmaxCostAndGradient(predicted,
                           target,
                           outputVectors,
                           dataset,
                           indices=None):
    """ Softmax cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.

    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.

    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #        vector
    # - grad: the gradient with respect to all the other word
    #        vectors

    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    W, D = outputVectors.shape
    y = np.zeros(W)
    y[target] = 1.0
    theta = np.dot(outputVectors, predicted)  # (W,D), (D,) -> (W,)
    y_hat = softmax(theta)  # (W,)
    cost = -np.sum(y * np.log(y_hat))
    gradPred = np.dot(y_hat - y, outputVectors)  # dJ/dV_c, (D,)
    grad = np.outer(y_hat - y, predicted)  # dJ/dU, (W, D), U: outputVectors
    ### END YOUR CODE

    return cost, gradPred, grad
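
Writing the gradient with an explicit one-hot y, as above, is equivalent to the in-place y_hat[target] -= 1 trick used in several of the other examples. A short check under a local softmax helper (names and sizes are illustrative):

import numpy as np

def _softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()

W, D, target = 5, 3, 2
U = np.random.randn(W, D)
v = np.random.randn(D)

y = np.zeros(W); y[target] = 1.0
y_hat = _softmax(U.dot(v))

dz_onehot = y_hat - y
dz_inplace = y_hat.copy(); dz_inplace[target] -= 1.0
assert np.allclose(dz_onehot, dz_inplace)
assert np.allclose(np.dot(dz_onehot, U), U.T.dot(dz_inplace))   # gradPred agrees
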
Esempio n. 49
0
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    N = data.shape[0]
    h = sigmoid(np.dot(data, W1) + np.tile(b1, (N, 1)))  # (N, H)
    y = softmax(np.dot(h, W2) + np.tile(b2, (N, 1)))     # (N, Dy)
    cost = -np.sum(labels * np.log(y))                   # float
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    d2 = (y - labels)                          # (N, Dy)
    d1 = np.dot(d2, W2.T) * sigmoid_grad(h)    # (N, H)

    gradW2 = np.dot(h.T, d2)                   # (H, Dy)
    gradW1 = np.dot(data.T, d1)                # (Dx, H)
    gradb2 = np.sum(d2, axis=0)                # (Dy)
    gradb1 = np.sum(d1, axis=0)                # (Dx)
    ### END YOUR CODE
    
    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad
Esempio n. 50
0
def softmaxRegression(features, labels, weights, regularization = 0.0, nopredictions = False):
    """ Softmax Regression """
    # Implement softmax regression with weight regularization.        
    
    # Inputs:                                                         
    # - features: feature vectors, each row is a feature vector     
    # - labels: labels corresponding to the feature vectors         
    # - weights: weights of the regressor                           
    # - regularization: L2 regularization constant                  
    
    # Output:                                                         
    # - cost: cost of the regressor                                 
    # - grad: gradient of the regressor cost with respect to its    
    #        weights                                               
    # - pred: label predictions of the regressor (you might find    
    #        np.argmax helpful)  
    cost = 0.0
    prob = softmax(features.dot(weights)) # (N,D).dot(D,C)
    if len(features.shape) > 1:
        N = features.shape[0]
    else:
        N = 1
    # A vectorized implementation of    1/N * sum(cross_entropy(x_i, y_i)) + regularization/2 * |w|^2
    cost = np.sum(-np.log(prob[range(N), labels])) / N
    cost += 0.5 * regularization * np.sum(weights ** 2)
    
    ### YOUR CODE HERE: compute the gradients and predictions
    pred = np.argmax(prob, axis=1)

    # gradient
    dscore = prob
    dscore[np.arange(N), labels] -= 1
    dscore /= N      # the cost was divided by N, so this is the gradient of the average loss

    grad = features.T.dot(dscore)
    grad += regularization * weights
    ### END YOUR CODE
    
    if nopredictions:
        return cost, grad
    else:
        return cost, grad, pred
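
A quick way to see that this gradient points the right way is a few steps of plain gradient descent on toy data: for a small enough step size the cost should drop. The sketch below is a hedged, standalone rerun of the same cost/gradient formulas with a local `_softmax` and made-up data; it is not part of the assignment code.

import numpy as np

def _softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def cost_and_grad(weights, features, labels, reg):
    N = features.shape[0]
    prob = _softmax(features.dot(weights))
    cost = np.sum(-np.log(prob[range(N), labels])) / N + 0.5 * reg * np.sum(weights ** 2)
    dscore = prob.copy()
    dscore[range(N), labels] -= 1
    grad = features.T.dot(dscore) / N + reg * weights
    return cost, grad

np.random.seed(0)
N, D, C = 50, 4, 3
features = np.random.randn(N, D)
labels = np.random.randint(C, size=N)
weights = np.zeros((D, C))

costs = []
for _ in range(50):
    cost, grad = cost_and_grad(weights, features, labels, reg=0.01)
    costs.append(cost)
    weights -= 0.1 * grad          # fixed small step size
assert costs[-1] < costs[0]        # the cost decreases from its log(C) starting point
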
Esempio n. 51
0
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    ### YOUR CODE HERE
    probs = softmax(predicted.dot(outputVectors.T))
    cost = -np.log(probs[target])

    grad_pred = probs
    grad_pred[target] -= 1

    grad = grad_pred[:, np.newaxis] * predicted[np.newaxis, :]
    gradPred = grad_pred.dot(outputVectors)

    ### END YOUR CODE

    return cost, gradPred, grad
Esempio n. 52
0
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))  # (Dx, H)
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))  #(1, H)
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))  #(H, Dy)
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))  #(1, Dy)

    ### YOUR CODE HERE: forward propagation (using notations of Lecture 5)
    # x = z1 = a1
    z2 = np.dot(data, W1) + b1  # (1, H)
    a2 = sigmoid(z2)  # (1, H)
    z3 = np.dot(a2, W2) + b2  #(1, Dy)
    a3 = softmax(z3)  #(1, Dy)
    S = -np.sum(np.log(np.sum(a3 * labels, axis=1)))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta3 = a3 - labels  # (1, Dy)
    gradW2 = np.dot(a2.T, delta3)  # (H, Dy)
    delta2 = sigmoid_grad(a2) * np.dot(delta3, W2.T)  # (1, H)
    gradW1 = np.dot(data.T, delta2)  # (Dx, H)
    gradb2 = np.sum(delta3, axis=0)
    gradb1 = np.sum(delta2, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return S, grad
Esempio n. 53
0
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    ### YOUR CODE HERE
    n, d = outputVectors.shape
    predicted = np.reshape(predicted, (1, d))
    y = np.zeros((n, 1))
    y[target] = 1
    y_hat = softmax(np.dot(outputVectors, predicted.T).reshape((n, ))).reshape(
        (n, 1))
    cost = -np.sum(y * np.log(y_hat))
    # print 'this_cost: ', y_hat, cost

    gradPred = np.reshape(np.dot((y_hat - y).T, outputVectors), (d, ))
    grad = np.dot((y_hat - y), predicted)
    ### END YOUR CODE

    return cost, gradPred, grad
Esempio n. 54
0
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(data.dot(W1) + b1)
    y_pred = softmax(h.dot(W2) + b2)
    # print y_pred
    cost = -np.sum(labels * np.log(y_pred))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    grad_ce = y_pred - labels
    gradW2 = h.T.dot(grad_ce)
    gradb2 = np.sum(grad_ce, axis=0)
    xx = grad_ce.dot(W2.T) * sigmoid_grad(h)
    gradW1 = data.T.dot(xx)
    gradb1 = np.sum(xx, axis=0)

    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, assuming the softmax prediction function and cross
    # entropy loss.

    # Inputs:
    # - predicted: numpy ndarray, predicted word vector (\hat{v} in
    #   the written component or \hat{r} in an earlier version)
    # - target: integer, the index of the target word
    # - outputVectors: "output" vectors (as rows) for all tokens
    # - dataset: needed for negative sampling, unused here.

    # Outputs:
    # - cost: cross entropy cost for the softmax word prediction
    # - gradPred: the gradient with respect to the predicted word
    #        vector
    # - grad: the gradient with respect to all the other word
    #        vectors

    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    #print outputVectors.shape
    prob_of_each = softmax(np.dot(outputVectors, predicted))
    #print prob_of_each
    cost = -np.log(prob_of_each[target])
    #print prob_of_each.shape

    prob_of_each[target] -= 1
    gradPred = np.dot(outputVectors.T,
                      prob_of_each.reshape(prob_of_each.shape[0],
                                           1)).flatten()
    grad = np.dot(prob_of_each.reshape(prob_of_each.shape[0], 1),
                  predicted.reshape(1, predicted.shape[0]))
    ### END YOUR CODE

    return cost, gradPred, grad
Esempio n. 56
0
    def add_prediction_op(self):
        """Adds the core transformation for this model which transforms a batch of input
        data into a batch of predictions. In this case, the transformation is a linear layer plus a
        softmax transformation:

        y = softmax(xW + b)

        Hint: Make sure to create tf.Variable as needed.
        Hint: For this simple use-case, it's sufficient to initialize both weights W
                    and biases b with zeros.

        Args:
            input_data: A tensor of shape (batch_size, n_features).
        Returns:
            pred: A tensor of shape (batch_size, n_classes)
        """
        W = tf.Variable(tf.zeros([self.config.n_features, self.config.n_classes]))
        b = tf.Variable(tf.zeros([self.config.n_classes]))
        pred = softmax(tf.matmul(self.input_placeholder, W) + b)
        return pred
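
Per the hint, W and b only need zero initialization here, and a bias of shape (n_classes,) broadcasts across the batch dimension. The NumPy analogue below just illustrates the shapes involved (the sizes are arbitrary, not the model's config):

import numpy as np

batch_size, n_features, n_classes = 4, 6, 3
x = np.random.randn(batch_size, n_features)
W = np.zeros((n_features, n_classes))
b = np.zeros(n_classes)              # broadcasts over the batch dimension

logits = x.dot(W) + b                # (batch_size, n_classes)
e = np.exp(logits - logits.max(axis=1, keepdims=True))
pred = e / e.sum(axis=1, keepdims=True)
assert pred.shape == (batch_size, n_classes)
assert np.allclose(pred, 1.0 / n_classes)   # zero weights give uniform predictions
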
Esempio n. 57
0
def forward_test(data, labels, params, dimensions):
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
    # print(params)
    # print(params[ofs:ofs + Dx * H])
    # print((Dx, H))
    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(np.dot(data, W1) + b1)
    yHat = softmax(np.dot(h, W2) + b2)
    cost = np.count_nonzero(
        np.argmax(yHat, axis=1) - np.argmax(labels, axis=1))
    return cost
Esempio n. 58
0
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ 
    Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """
    prob = softmax(np.matmul(outputVectors, predicted))
    cost = -np.log(prob[target])

    #the gradient for V_c

    gradPred = np.sum(outputVectors * prob.reshape(-1, 1),
                      axis=0) - outputVectors[target]

    # gradients for U (not only the target word o; the other output vectors receive updates too)
    grad = np.tile(predicted,
                   (outputVectors.shape[0], 1)) * prob.reshape(-1, 1)
    grad[target] -= predicted

    return cost, gradPred, grad
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    N, D = data.shape
    hidden = sigmoid(data.dot(W1) + b1)
    prediction = softmax(hidden.dot(W2) + b2)
    cost = (-1) * np.sum(np.log(prediction) * labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta = (prediction - labels)
    gradW2 = hidden.T.dot(delta)
    gradb2 = np.sum(delta, axis=0, keepdims=True)
    hidden_delta = delta.dot(W2.T) * sigmoid_grad(hidden)
    gradW1 = data.T.dot(hidden_delta)
    gradb1 = np.sum(hidden_delta, axis=0, keepdims=True)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
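
This variant passes keepdims=True when summing the bias gradients, so gradb1 and gradb2 keep the same (1, H) and (1, Dy) shapes as b1 and b2. Either convention flattens to the same values, as the small check below illustrates (arbitrary shapes):

import numpy as np

delta = np.random.randn(4, 3)                        # e.g. an (N, Dy) error signal
gradb_keep = np.sum(delta, axis=0, keepdims=True)    # shape (1, 3), matches b's shape
gradb_flat = np.sum(delta, axis=0)                   # shape (3,)
assert gradb_keep.shape == (1, 3) and gradb_flat.shape == (3,)
assert np.array_equal(gradb_keep.flatten(), gradb_flat.flatten())
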
Esempio n. 60
0
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """
     
    ### YOUR CODE HERE
    # work with column vectors throughout
    predicted = np.expand_dims(predicted, axis=1)                                 # (D, 1)
    softy = softmax(np.dot(outputVectors, predicted).reshape(-1)).reshape(-1, 1)  # (W, 1)
    # one-hot column vector for the target word
    y = np.expand_dims(np.eye(len(outputVectors))[target], axis=1)                # (W, 1)
    cost = -np.log(softy[target, 0])
    gradPred = np.squeeze(np.dot((softy - y).T, outputVectors))                   # (D,)
    grad = np.squeeze(np.dot(softy - y, predicted.T))                             # (W, D)
    ### END YOUR CODE

    return cost, gradPred, grad