Example 1
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, 
    K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector  
    # and one target word vector as a building block for word2vec     
    # models, using the negative sampling technique. K is the sample  
    # size. You might want to use dataset.sampleTokenIdx() to sample  
    # a random word index. 
    # 
    # Note: See test_word2vec below for dataset's initialization.
    #                                       
    # Input/Output Specifications: same as softmaxCostAndGradient     
    # We will not provide starter code for this function, but feel    
    # free to reference the code you previously wrote for this        
    # assignment!
    
    ### YOUR CODE HERE
    grad = np.zeros(outputVectors.shape)
    gradPred = np.zeros(predicted.shape)

#    indices = [target]
#    for k in xrange(K):
#        newidx = dataset.sampleTokenIdx()
#        while newidx == target:
#            newidx = dataset.sampleTokenIdx()
#        indices += [newidx]
#
#    labels = np.array([1] + [-1 for k in xrange(K)])
#    vecs = outputVectors[indices,:]
#
#    t = sigmoid(vecs.dot(predicted) * labels)
#    cost = -np.sum(np.log(t))
#
#    delta = labels * (t - 1)
#    gradPred = delta.reshape((1,K+1)).dot(vecs).flatten()
#    gradtemp = delta.reshape((K+1,1)).dot(predicted.reshape(
#        (1,predicted.shape[0])))
#    for k in xrange(K+1):
#        grad[indices[k]] += gradtemp[k,:]

    t = sigmoid(predicted.dot(outputVectors[target,:]))
    cost = -np.log(t)
    delta = t - 1

    gradPred += delta * outputVectors[target, :]
    grad[target, :] += delta * predicted

    for k in xrange(K):
        idx = dataset.sampleTokenIdx()
    
        t = sigmoid(-predicted.dot(outputVectors[idx,:]))
        cost += -np.log(t)
        delta = 1 - t
    
        gradPred += delta * outputVectors[idx, :]
        grad[idx, :] += delta * predicted
    ### END YOUR CODE
    
    return cost, gradPred, grad
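
For reference, the quantity all of these negSamplingCostAndGradient implementations compute is the standard negative-sampling objective, with v_c the predicted (center) vector, u_o the target output vector and u_1..u_K the sampled output vectors:

J(v_c, o, U) = -\log \sigma(u_o^\top v_c) - \sum_{k=1}^{K} \log \sigma(-u_k^\top v_c)

\frac{\partial J}{\partial v_c} = (\sigma(u_o^\top v_c) - 1)\, u_o - \sum_{k=1}^{K} (\sigma(-u_k^\top v_c) - 1)\, u_k

\frac{\partial J}{\partial u_o} = (\sigma(u_o^\top v_c) - 1)\, v_c, \qquad \frac{\partial J}{\partial u_k} = (1 - \sigma(-u_k^\top v_c))\, v_c

Gradients for repeated negative samples are accumulated, which is why the implementations below use += when updating grad.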
Example 2
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    klist = []
    for k in range(K):
        randomId = dataset.sampleTokenIdx()
        while randomId == target:
            randomId = dataset.sampleTokenIdx()
        klist.append(randomId)

    u0 = outputVectors[target]
    uks = -outputVectors[klist]
    vc = predicted
    U = np.vstack((u0, uks))

    dot = U.dot(vc)
    sigmoid_value = sigmoid(dot)
    cost = -np.sum(np.log(sigmoid_value))

    # print "cost is %f" % (cost, )

    gradPred = np.zeros(predicted.shape)
    grad = np.zeros(outputVectors.shape)

    temp = sigmoid(u0.dot(vc)) - 1
    intermediate = (sigmoid_value-1).reshape(-1, 1) * np.vstack((u0, -uks))
    gradPred += intermediate[0] - np.sum(intermediate[1:,], axis=0)
    grad[target] += temp * vc

    counter_dictionary = Counter(klist)
    unique_ks = list(counter_dictionary.keys())
    frequency_count = np.array(list(counter_dictionary.values()))
    grad[unique_ks] += (sigmoid(-outputVectors[unique_ks].dot(vc)) -1).reshape(-1,1) * -vc
    grad[unique_ks] *= frequency_count.reshape(-1, 1)

    return cost, gradPred, grad
Example 3
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, 
    K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector  
    # and one target word vector as a building block for word2vec     
    # models, using the negative sampling technique. K is the sample  
    # size.
    #                                       
    # Input/Output Specifications: same as softmaxCostAndGradient     

    u_o = outputVectors[target,:]
    sigmoid_o = sigmoid(np.dot(u_o, predicted))
    cost = - np.log(sigmoid_o)
    gradPred = -u_o*(1-sigmoid_o)
    grad = np.zeros_like(outputVectors)
    grad[target,:] = - predicted*(1-sigmoid_o)
    
    for _ in range(K):
        k = dataset.sampleTokenIdx()
        while k == target:
            k = dataset.sampleTokenIdx()
        sigmoid_k = sigmoid(-np.dot(outputVectors[k,:],predicted))
        cost += - np.log(sigmoid_k)
        gradPred += outputVectors[k,:] * (1-sigmoid_k)
        grad[k,:] += predicted * (1-sigmoid_k)
    
    return cost, gradPred, grad
Example 4
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, 
    K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector  
    # and one target word vector as a building block for word2vec     
    # models, using the negative sampling technique. K is the sample  
    # size. You might want to use dataset.sampleTokenIdx() to sample  
    # a random word index. 
    # 
    # Note: See test_word2vec below for dataset's initialization.
    #                                       
    # Input/Output Specifications: same as softmaxCostAndGradient     
    # We will not provide starter code for this function, but feel    
    # free to reference the code you previously wrote for this        
    # assignment!
    
    ### YOUR CODE HERE
    negativeSamples = [dataset.sampleTokenIdx() for i in range(K)]
    sigmoidTargetPred = sigmoid(outputVectors[target,:].transpose().dot(predicted))
    cost = -np.log(sigmoidTargetPred)   
    gradPred = (sigmoidTargetPred - 1.0)*outputVectors[target,:]
    grad = np.zeros(outputVectors.shape)
    grad[target,:] = predicted * (sigmoidTargetPred - 1.0)

    for sample in negativeSamples:
        sigmoidSamplePredicted = sigmoid(-outputVectors[sample,:].transpose().dot(predicted))
        cost -= np.log(sigmoidSamplePredicted)
        gradPred += (1.0 - sigmoidSamplePredicted)*outputVectors[sample,:]
        grad[sample,:] += (1.0 - sigmoidSamplePredicted)*predicted.transpose()
    ### END YOUR CODE
    
    return cost, gradPred, grad
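
Several implementations in this collection call dataset.sampleTokenIdx() and note that the dataset object is initialized in a test_word2vec routine that is not reproduced here. As a rough, hypothetical sketch of what such an object could look like (the class name and token count below are assumptions for illustration, not taken from the original tests):

import random

class DummyDataset(object):
    """Hypothetical stand-in for the dataset passed to negSamplingCostAndGradient."""

    def __init__(self, numTokens=5):
        self.numTokens = numTokens  # assumed vocabulary size

    def sampleTokenIdx(self):
        # return a uniformly random word index in [0, numTokens)
        return random.randint(0, self.numTokens - 1)

# dataset = DummyDataset()
# cost, gradPred, grad = negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10)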
Example 5
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
    K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    grad = np.zeros(outputVectors.shape)

    s = sigmoid(np.dot(outputVectors[target,:], predicted))
    cost = -np.log(s)
    gradPred = - sigmoid_grad(s)/s*outputVectors[target,:]
    grad[target,:] = - sigmoid_grad(s)/s*predicted

    for k in range(K):
        i = dataset.sampleTokenIdx()
        s = sigmoid( - np.dot(outputVectors[i,:], predicted))
        cost -= np.log(s)
        gradPred += sigmoid_grad(s)/s*outputVectors[i,:]
        grad[i,:] += sigmoid_grad(s)/s*predicted

    ### END YOUR CODE

    return cost, gradPred, grad
Example 6
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, 
    K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector  
    # and one target word vector as a building block for word2vec     
    # models, using the negative sampling technique. K is the sample  
    # size. You might want to use dataset.sampleTokenIdx() to sample  
    # a random word index. 
    # 
    # Note: See test_word2vec below for dataset's initialization.
    #                                       
    # Input/Output Specifications: same as softmaxCostAndGradient     
    # We will not provide starter code for this function, but feel    
    # free to reference the code you previously wrote for this        
    # assignment!
    
    ### YOUR CODE HERE
    '''
    Keep track of dims:
    
    D - dim of word vector
    V - number of words
    
    predicted     :  (D, )
    target        :  integer
    outputVectors :  (V, D)
    
    cost          :  float
    gradPred      :  (D, )
    grad          :  (V, D)
    '''
    predicted = predicted.reshape(-1, 1)                          # (D ,1)
    
    sampledIndices = [dataset.sampleTokenIdx() for i in xrange(K)]
    sampledVectors = outputVectors[sampledIndices, :]             # (K, D)

    outputVec = outputVectors[target, :]                          # (D, )

    prob_out = sigmoid(outputVec.dot(predicted))                  # float
    probs_negative = sigmoid(-sampledVectors.dot(predicted))      # (K, 1)
    
    cost = - np.log(prob_out) - np.sum(np.log(probs_negative))
    
    gradPred = (prob_out - 1) * outputVec - np.sum((probs_negative-1) * sampledVectors, axis=0)    # (D, )

    grad = np.zeros_like(outputVectors)                           # (V, D)
    grad[target, :] = (prob_out - 1) * predicted.reshape(-1)

    # Note that sampledIndices may contain repeated indices, so we loop over all K samples.
    # Ideally the target should not appear in sampledIndices, but the gradient check still
    # passes because we use += to accumulate into each output vector, so no grads are missed.
    for i in xrange(K):
        grad[sampledIndices[i], :] += (1 - probs_negative[i]) * predicted.reshape(-1)

    ### END YOUR CODE
    
    return cost, gradPred, grad
Example 7
def test_sigmoid_permutation_axis1(dim_1):
    a1          = np.random.normal(size=(1,dim_1))
    s1          = sigmoid(a1)

    permutation = np.random.permutation(dim_1)
    inverse_permutation = np.argsort(permutation)

    s1_perm     = sigmoid(a1.ravel()[permutation])
    assert rel_error(s1_perm.ravel()[inverse_permutation], s1) <= 1e-8
Example 8
def test_sigmoid_gradient(dim_1, dim_2):
    a1    = np.random.normal(loc=0., scale=20., size=(dim_1,dim_2))
    shift = np.random.uniform(low=1e-9, high=1e-5, size=(dim_1,dim_2))
    ap = a1 + shift
    am = a1 - shift

    dsigmoid = (sigmoid(ap) - sigmoid(am)) / (2*shift)
    assert np.abs(np.max(dsigmoid - sigmoid_grad(sigmoid(a1)))) <= 1e-7
    assert np.abs(np.min(dsigmoid - sigmoid_grad(sigmoid(a1)))) <= 1e-7
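
The sigmoid and sigmoid_grad helpers exercised by these tests are not included in this collection. A minimal sketch consistent with the expected values in test_sigmoid and test_sigmoidgrad further below, assuming sigmoid_grad takes the sigmoid output s rather than the raw input:

import numpy as np

def sigmoid(x):
    # elementwise logistic function
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_grad(s):
    # derivative of the sigmoid, written in terms of its output s = sigmoid(x)
    return s * (1.0 - s)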
Example 9
def test_sigmoid_shape(dim):
    testing_shape = []
    for y in range(0,dim):
        testing_shape.append(np.random.randint(3,8))
    shape = tuple(testing_shape)
    #z = np.random.randn(*testing_shape)
    x = np.random.standard_normal(shape)
    y = np.copy(x)
    assert x.shape == sigmoid(y).shape
    assert x.shape == sigmoid_grad(sigmoid(y)).shape
Example 10
def test_sigmoid_permutation_axis0(dim_1, execution_number):
    """ sigmoid needs to be applied element-wise;"""
    a1          = np.random.normal(size=(dim_1,1))
    s1          = sigmoid(a1)

    permutation = np.random.permutation(dim_1)
    inverse_permutation = np.argsort(permutation)

    s1_perm     = sigmoid(a1[permutation])
    assert rel_error(s1_perm[inverse_permutation], s1) <= 1e-8
Example 11
def sigmoid_forward(x):
    """
    Computes the forward pass for a sigmoid activation.

    Inputs:
    - x: Input data, numpy array of arbitrary shape;

    Returns a tuple (out, cache)
    - out: output of the same shape as x
    - cache: identical to out; required for backpropagation
    """
    out = sigmoid(x)
    return out, out
Example 12
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, 
    K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector  
    # and one target word vector as a building block for word2vec     
    # models, using the negative sampling technique. K is the sample  
    # size. You might want to use dataset.sampleTokenIdx() to sample  
    # a random word index. 
    # 
    # Note: See test_word2vec below for dataset's initialization.
    #                                       
    # Input/Output Specifications: same as softmaxCostAndGradient     
    # We will not provide starter code for this function, but feel    
    # free to reference the code you previously wrote for this        
    # assignment!
    
    ### YOUR CODE HERE
    v_c = predicted
    u_o = outputVectors[target]

    suv = sigmoid(u_o.dot(v_c))
    pos = np.log(suv) # positive sample

    # sample w/ dataset.sampleTokenIdx() method iteratively
    n = []  # indexes of negative samples
    while len(n) < K:
        x = dataset.sampleTokenIdx()
        if x != target:
            n.append(x)
    neg_samples = outputVectors[n]


    skv = sigmoid((neg_samples.dot(v_c)))
    neg = np.sum(np.log(1-skv))

    cost = -pos - neg

    # neg_samples: K x d, skv: 1 x K
    gradPred = -(1 - suv) * u_o + (neg_samples.T * (skv)).sum(axis=1)
    
    grad = np.zeros(outputVectors.shape)
    grad[target] += -(1-suv) * v_c
    negGrad = np.outer(skv, v_c)
    # sum grads together when they have been sampled with replacement
    for i,x in enumerate(n):
        grad[x] += negGrad[i]
    ### END YOUR CODE

    return cost, gradPred, grad
Example 13
def test_sigmoidgrad():
    """ Original sigmoid gradient test defined in q2_sigmoid.py; """
    x = np.array([[1, 2], [-1, -2]])
    f = sigmoid(x)
    g = sigmoid_grad(f)
    assert rel_error(g, np.array([[0.19661193, 0.10499359],
        [0.19661193, 0.10499359]])) <= 1e-7
Example 14
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    T = data.shape[0]

    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(data, W1) + b1	# MxH + 1xH = MxH
    h = sigmoid(z1)		
    z2 = np.dot(h, W2) + b2	# MxDy + 1xDy = MxDy
    y_ = softmax(z2)		# MxDy
    cost = -1*np.sum(np.log(y_)*labels)/T
    #raise NotImplementedError
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dz2 = (y_ - labels)/T	# MxDy
    db2 = np.sum(dz2, axis=0)	# 1xDy
    dh	= np.dot(dz2, W2.T)	# MxH
    dW2	= np.dot(h.T, dz2)	# HxDy
    dz1 = h*(1-h)*dh		# MxH 
    db1 = np.sum(dz1, axis=0)	# 1xH
    dW1	= np.dot(data.T, dz1)	# Dx x H

    gradb2 = db2
    gradW2 = dW2
    gradb1 = db1
    gradW1 = dW1
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    #print "Cost: %f \t grad[0] %f, grad[1] %f" % (cost, grad[0], grad[1])

    return cost, grad
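
Written out, the forward pass and gradients that every forward_backward_prop variant in this collection implements are (with the sum-based cross-entropy cost; some variants additionally divide the cost and all gradients by the number of examples M, which leaves the gradient check unaffected):

h = \sigma(X W_1 + b_1), \qquad \hat{Y} = \mathrm{softmax}(h W_2 + b_2), \qquad J = -\sum_{m,j} Y_{mj} \log \hat{Y}_{mj}

\delta_2 = \hat{Y} - Y, \qquad \nabla_{W_2} J = h^\top \delta_2, \qquad \nabla_{b_2} J = \sum_m (\delta_2)_m

\delta_1 = (\delta_2 W_2^\top) \odot \sigma'(h), \qquad \nabla_{W_1} J = X^\top \delta_1, \qquad \nabla_{b_1} J = \sum_m (\delta_1)_m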
Example 15
def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    the backward propagation for the gradients for all parameters.

    Notice the gradients computed here are different from the gradients in
    the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation
    h = sigmoid(np.dot(X, W1) + b1)
    yhat = softmax(np.dot(h, W2) + b2)
    cross_entropy = -np.log(yhat)[labels == 1]
    cost = np.sum(cross_entropy)  # / len(labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dscores = yhat - labels                           # combined softmax + cross-entropy gradient
    gradW2 = np.dot(h.T, dscores)                     # H x Dy
    gradb2 = np.sum(dscores, axis=0, keepdims=True)   # 1 x Dy
    dh = np.dot(dscores, W2.T)                        # M x H
    dz1 = dh * sigmoid_grad(h)                        # M x H
    gradW1 = np.dot(X.T, dz1)                         # Dx x H
    gradb1 = np.sum(dz1, axis=0, keepdims=True)       # 1 x H
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Example 16
def test_sigmoid():
    """ Original sigmoid test defined in q2_sigmoid.py; """
    x = np.array([[1, 2], [-1, -2]])
    f = sigmoid(x)
    assert rel_error(
        f, np.array([[0.73105858, 0.88079708], [0.26894142, 0.11920292]
                     ])) <= 1e-7
Example 17
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    u_o, u_k = outputVectors[target], outputVectors[indices[1:]]  # true outside word / K negative samples
    z = sigmoid(np.matmul(u_o, predicted))
    z_neg = sigmoid(-np.matmul(u_k, predicted))
    cost = -np.log(z) - np.sum(np.log(z_neg))
    gradPred = (z - 1.0) * u_o - np.dot(z_neg - 1.0, u_k)
    grad = np.zeros(outputVectors.shape)
    grad[target] += (z - 1.0) * predicted
    for i, idx in enumerate(indices[1:]):     # accumulate: sampled indices may repeat
        grad[idx] += (1.0 - z_neg[i]) * predicted
    ### END YOUR CODE

    return cost, gradPred, grad
Example 18
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    layer1 = data.dot(W1) + b1
    activation1 = sigmoid(layer1)

    layer2 = activation1.dot(W2) + b2
    predictions = softmax(layer2)

    softmaxs = np.sum(predictions * labels, axis=1)
    cost = -np.log(softmaxs)

    num_train = data.shape[0]
    cost = np.sum(cost) / num_train
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dC = 1.0
    dLayer2 = dC / num_train * predictions - dC / num_train * labels

    gradb2 = np.sum(dLayer2, axis=0)
    gradW2 = activation1.T.dot(dLayer2)

    dActivation1 = dLayer2.dot(W2.T)
    dLayer1 = dActivation1 * sigmoid_grad(activation1)

    gradb1 = np.sum(dLayer1, axis=0)
    gradW1 = data.T.dot(dLayer1)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
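
For orientation, the flat params vector that each forward_backward_prop variant unpacks is simply the concatenation of W1, b1, W2 and b2. A small usage sketch with made-up sizes (the dimensions and batch size below are assumptions, not taken from the original tests):

import numpy as np

dimensions = (10, 5, 10)                                 # assumed (Dx, H, Dy)
Dx, H, Dy = dimensions
params = np.random.randn(Dx * H + H + H * Dy + Dy)       # flat vector holding W1, b1, W2, b2

M = 20                                                   # assumed number of examples
data = np.random.randn(M, Dx)
labels = np.zeros((M, Dy))
labels[np.arange(M), np.random.randint(0, Dy, M)] = 1.0  # one-hot rows

# cost, grad = forward_backward_prop(data, labels, params, dimensions)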
Example 19
def negSamplingCostAndGradient(predicted,
                               target,
                               outputVectors,
                               dataset,
                               K=10):
    grad = np.zeros(outputVectors.shape)
    gradPred = np.zeros(predicted.shape)
    indices = [target]
    for k in range(K):
        newidx = dataset.sampleTokenIdx()
        while newidx == target:
            newidx = dataset.sampleTokenIdx()
        indices += [newidx]

    labels = np.array([1] + [-1 for k in range(K)])
    vecs = outputVectors[indices, :]
    t = sigmoid(vecs.dot(predicted) * labels)
    cost = -np.sum(np.log(t))
    delta = labels * (t - 1)
    gradPred = delta.reshape((1, K + 1)).dot(vecs).flatten()
    gradtemp = delta.reshape(
        (K + 1, 1)).dot(predicted.reshape((1, predicted.shape[0])))

    for k in range(K + 1):
        grad[indices[k]] += gradtemp[k, :]

    return cost, gradPred, grad
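
The vectorized version above (and Example 33 later) folds the positive and negative terms into a single expression by attaching a sign s_0 = +1 to the target row and s_k = -1 to every sampled row:

J = -\sum_{i=0}^{K} \log \sigma(s_i\, u_i^\top v_c), \qquad \frac{\partial J}{\partial v_c} = \sum_{i=0}^{K} s_i\,(\sigma(s_i u_i^\top v_c) - 1)\, u_i, \qquad \frac{\partial J}{\partial u_i} = s_i\,(\sigma(s_i u_i^\top v_c) - 1)\, v_c

which is exactly what the delta = labels * (t - 1) line encodes.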
Example 20
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation

    N = data.shape[0]
    a1 = data
    z2 = np.matmul(a1, W1) + b1
    a2 = sigmoid(z2)
    z3 = np.matmul(a2, W2) + b2
    a3 = softmax(z3)
    ycap = a3
    cost = -np.sum(labels * np.log(ycap))

    #raise NotImplementedError
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation

    a3grad = a3 - labels  # this is the grad for softmax
    gradW2 = np.dot(a2.T, a3grad)
    gradb2 = np.sum(a3grad, axis=0, keepdims=True)
    t = np.dot(W2, a3grad.T) * sigmoid_grad(a2).T
    gradW1 = np.dot(t, a1).T
    gradb1 = np.sum(np.dot(a3grad, W2.T) * sigmoid_grad(a2),
                    axis=0,
                    keepdims=True)

    #raise NotImplementedError
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Example 21
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))


    ### YOUR CODE HERE

    prod = sigmoid(np.dot(outputVectors[target], predicted))


    cost = -np.log(prod) - \
           sum([np.log(sigmoid(-np.dot(outputVectors[i], predicted))) for i in indices[1:]])


    gradPred = (prod - 1) * outputVectors[target] - \
               sum([(sigmoid(-np.dot(outputVectors[i], predicted)) - 1) * outputVectors[i] for i in indices[1:]])
    grad = np.zeros(outputVectors.shape)
    grad[target] = (prod - 1) * predicted

    ### IMPORTANT: GRADIENTS FOR SAMPLED WORDS SHOULD BE ACCUMULATED BECAUSE THEY CAN APPEAR SEVERAL TIMES => -=
    for i in indices[1:]:
        grad[i] -= (sigmoid(-np.dot(outputVectors[i], predicted)) - 1) * predicted



    assert gradPred.shape == predicted.shape
    assert grad.shape == outputVectors.shape

    ### END YOUR CODE


    return cost, gradPred, grad
Example 22
def negSamplingCostAndGradient(predicted,
                               target,
                               outputVectors,
                               dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    gradPred = np.zeros_like(predicted)
    grad = np.zeros_like(outputVectors)
    ### YOUR CODE HERE
    V = outputVectors.shape[0]  #vocabulary size
    y = np.zeros(V)
    y[target] = 1
    similarity = np.dot(predicted, outputVectors[target].T)  #Ut . Vc
    probability = sigmoid(similarity)
    #    cost = -np.log(probability)
    cost = -np.sum(y * np.log(probability))
    #    context_word_vec = np.zeros((len(indices), outputVectors.shape[1]))
    gradPred = (probability - 1) * outputVectors[target]
    grad[target] = np.dot(probability - 1, predicted)

    i = 0
    for i in indices[1:]:
        neg_similarity = np.dot(predicted, outputVectors[i].T)
        neg_probability = sigmoid(-neg_similarity)
        #        cost += -np.log(neg_probability)
        cost += -np.sum(y * np.log(neg_probability))
        gradPred += (1 - neg_probability) * outputVectors[i]
        grad[i] += np.dot((1 - neg_probability), predicted)
    ### END YOUR CODE

    return cost, gradPred, grad
Example 23
def negSamplingCostAndGradient(predicted,
                               target,
                               outputVectors,
                               dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    """

    ### YOUR CODE HERE
    V, D = outputVectors.shape

    # get the k random indices
    k_indicies = []
    for i in range(K):
        rand_index = dataset.sampleTokenIdx()  # could this randomly draw the correct (target) word? yes, but the probability is small
        k_indicies.append(rand_index)

    # loss function
    neg_sample_vector = outputVectors[k_indicies, :]  # KxD
    assert neg_sample_vector.shape == (K, D)
    sigm_neg = sigmoid(-1.0 *
                       np.dot(neg_sample_vector, predicted.reshape(
                           (D, 1))))  # KxD Dx1 = Kx1
    cost_neg = np.sum(np.log(sigm_neg), axis=0)

    sigm_cor = sigmoid(np.dot(outputVectors[target], predicted.reshape(
        (D, 1))))
    cost = -1.0 * np.log(sigm_cor) - cost_neg

    # gradient on output vectors
    grad = np.zeros(outputVectors.shape)  # V, D
    grad[target] = predicted * (sigm_cor - 1.0)  # 1xD
    for k in k_indicies:
        grad[k, :] += -1.0 * predicted.reshape(
            (D, )) * (sigmoid(np.dot(-1.0 * predicted, outputVectors[k])) -
                      1.0)

    # gradient on input vector
    # the leading "1 - sigm_neg" term looks slightly different from the formula's "sigmoid - 1", but the result is the same: 1 - sigm_neg.reshape((1, K))
    gradPred_neg = np.dot(1 - sigm_neg.reshape((1, K)),
                          neg_sample_vector).reshape((1, D))  # 1xK KxD = 1xD
    gradPred_cor = (sigm_cor - 1) * outputVectors[target].reshape((1, D))
    gradPred = gradPred_neg + gradPred_cor

    ### END YOUR CODE
    return cost, gradPred, grad
Example 24
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE

    #initialize variables
    cost = 0
    gradPred = np.zeros(predicted.shape)
    grad = np.zeros(outputVectors.shape)

    output_word = outputVectors[target]
    target_sigmoid = sigmoid(np.dot(output_word, predicted))

    cost = -np.log(target_sigmoid)
    gradPred = (target_sigmoid - 1.0) * output_word
    grad[target] = (target_sigmoid - 1.0) * predicted

    for index in indices[1:]:  # skip indices[0], which is the target itself
        word = outputVectors[index]
        k_sigmoid = sigmoid(np.dot(-word, predicted))

        cost -= np.log(k_sigmoid)
        gradPred += ((1.0 - k_sigmoid) * word)
        grad[index] += -((k_sigmoid - 1.0) * predicted)

    assert predicted.shape == gradPred.shape
    assert outputVectors.shape == grad.shape

    ### END YOUR CODE

    return cost, gradPred, grad
Example 25
def negSamplingCostAndGradient(predicted,
                               target,
                               outputVectors,
                               dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]

    # indices[0] = target, indices[1..K] = negative samples (not the target)
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    u_o = outputVectors[indices[0]]
    v_c = predicted
    # [dim]
    z = sigmoid(np.dot(u_o.T, v_c))
    cost = -np.log(z)
    gradPred = np.zeros(np.shape(predicted))
    gradPred += (z - 1.0) * u_o
    grad = np.zeros(np.shape(outputVectors))
    # elementwise product: [1,2,3]*[1,2,3] = [1,4,9]
    # partial derivative with respect to u_o
    grad[target] += (z - 1.0) * v_c

    # for negative samples
    for k in range(K):
        u_k = outputVectors[indices[k + 1]]
        z = sigmoid(np.dot(u_k.T, v_c))
        cost -= np.log(1.0 - z)
        gradPred += z * u_k
        grad[indices[k + 1]] += z * v_c
    ### END YOUR CODE

    return cost, gradPred, grad
Example 26
def negSamplingCostAndGradient(predicted,
                               target,
                               outputVectors,
                               dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    grad = np.zeros(outputVectors.shape)
    z = sigmoid(outputVectors[target].dot(predicted))
    cost = -np.log(z)
    grad[target] = predicted * (z - 1.0)
    gradPred = outputVectors[target] * (z - 1.0)
    for i in xrange(1, K + 1):
        negtive_index = indices[i]
        sig = sigmoid((outputVectors[negtive_index]).dot(predicted))
        cost += -np.log(1 - sig)
        grad[negtive_index] += predicted * sig
        gradPred += outputVectors[negtive_index] * sig
    ### END YOUR CODE

    return cost, gradPred, grad
Example 27
def negSamplingCostAndGradient(predicted,
                               target,
                               outputVectors,
                               dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    grad = np.zeros_like(outputVectors)
    gradPred = np.zeros_like(predicted)
    activate = sigmoid(np.dot(predicted.reshape(-1), outputVectors[target].T))
    cost = 0
    cost -= np.log(activate)
    grad[target:target + 1] = (activate - 1) * predicted

    gradPred += (activate - 1) * outputVectors[target]

    neg_samples = []
    while len(neg_samples) < K:  # keep sampling until K distinct negatives are drawn
        idx = dataset.sampleTokenIdx()
        if (idx == target) or (idx in neg_samples):
            continue
        neg_samples.append(idx)

        neg_activate = sigmoid(
            -np.dot(predicted.reshape(-1), outputVectors[idx].T))
        cost -= np.log(neg_activate)
        grad[idx:idx + 1] = -(neg_activate - 1) * predicted
        gradPred -= (neg_activate - 1) * outputVectors[idx]

    ### END YOUR CODE

    return cost, gradPred, grad
Example 28
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data [input]            -- M x Dx matrix, where each row is a training example.
    labels [expected O/p]   -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    #       M,Dx * (Dx,H)   + H    = M * H
    pre_h = data.dot(W1) + b1
    h = sigmoid(pre_h)
    #       M,H * H,Dy   + Dy   = M * Dy
    pre_Y = np.dot(h, W2) + b2
    Y = softmax(pre_Y)

    # cross entropy cost
    cost = -np.sum(labels * np.log(Y))  # M * Dy

    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation

    #first grad of cost function
    delta_cost = Y - labels

    gradW2 = h.T.dot(delta_cost)
    gradb2 = np.sum(delta_cost, axis=0)

    delta_h = delta_cost.dot(W2.T) * sigmoid_grad(h)

    gradW1 = data.T.dot(delta_h)
    gradb1 = np.sum(delta_h, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
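
These forward_backward_prop implementations are typically validated with a numerical gradient check. A minimal, self-contained central-difference checker (a generic sketch, not the assignment's own gradcheck helper) for any f(x) -> (cost, grad):

import numpy as np

def numeric_grad_check(f, x, eps=1e-6, tol=1e-5):
    """Compare f's analytic gradient against central differences, entry by entry."""
    _, grad = f(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        old = x[ix]
        x[ix] = old + eps
        cost_plus, _ = f(x)
        x[ix] = old - eps
        cost_minus, _ = f(x)
        x[ix] = old                                   # restore the original entry
        numeric = (cost_plus - cost_minus) / (2 * eps)
        rel = abs(numeric - grad[ix]) / max(1.0, abs(numeric), abs(grad[ix]))
        assert rel <= tol, "gradient mismatch at index %s" % (ix,)
        it.iternext()

# e.g. numeric_grad_check(lambda p: forward_backward_prop(data, labels, p, dimensions), params)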
Example 29
def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network
    Compute the forward propagation and for the cross entropy cost,
    the backward propagation for the gradients for all parameters.
    Notice the gradients computed here are different from the gradients in
    the assignment sheet: they are w.r.t. weights, not inputs.
    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation
    h = sigmoid(X.dot(W1) + b1)
    y_h = softmax(h.dot(W2) + b2)
    cost = np.sum(-np.log(y_h[labels == 1])) / X.shape[0]
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta1 = (y_h - labels) / X.shape[0]
    delta2 = delta1.dot(W2.transpose())
    delta3 = sigmoid_grad(h) * delta2

    #calculate gradient
    gradW1 = X.transpose().dot(delta3)
    gradb1 = np.sum(delta3, 0)
    
    gradW2 = h.transpose().dot(delta1)
    gradb2 = np.sum(delta1, 0, keepdims = True)
    
    #raise NotImplementedError
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad
Example 30
def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    the backward propagation for the gradients for all parameters.

    Notice the gradients computed here are different from the gradients in
    the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(X,W1)+b1 # (M x H)

    h = sigmoid(z1) # (M x H)
    z2 = np.dot(h,W2)+b2 # (M x Dy)
    y_dash = softmax(z2) # (M x Dy)

    cost = - np.sum(labels* np.log(y_dash))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    cost_theta = y_dash - labels
    gradW2 = np.dot(h.T, cost_theta)   
    gradb2 = np.reshape(np.sum(cost_theta, axis = 0), b2.shape)

    cost_h = np.dot(cost_theta, W2.T) # (M x H)
    cost_z1 = sigmoid_grad(h) * cost_h # (M x H)

    gradW1 = np.dot(X.T, cost_z1) # (Dx x M)*(M x H) = (Dx x H)
    gradb1 = np.reshape(np.sum(cost_z1, axis = 0), b1.shape)
    
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Example 31
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))
    ### YOUR CODE HERE: forward propagation
    h = sigmoid(
        np.dot(data, W1) + b1
    )  # adding the bias b1 of dimensions (1, H) broadcasts each element of b1 onto its corresponding column of dot(data, W1)
    y_pred = softmax(np.dot(h, W2) + b2)
    ### END YOUR CODE
    ### YOUR CODE HERE: backward propagation
    cost = np.sum(-np.log(y_pred[labels == 1])) / data.shape[
        0]  # cross entropy cost - average sum of all elements in y_pred where the correspondent in labels (y) is 1 but not 0 (label: one-hot vector)

    delta_3 = (y_pred - labels) / data.shape[
        0]  # QUESTION: why divide by data.shape[0]? Because the cost above is averaged over the M examples, so the gradient carries the same 1/M factor
    delta_2 = sigmoid_grad(h) * np.dot(
        delta_3, W2.T
    )  # compute delta using the dot product between the error (delta_3) and weights of second layer and the Hadamard product with the derivative of the activations

    gradW2 = np.dot(h.T, delta_3)
    gradb2 = np.sum(
        delta_3, 0,
        keepdims=True)  # QUESTION: why sum delta_3? Because b2 is shared by every example, so its gradient accumulates each example's contribution

    gradW1 = np.dot(data.T, delta_2)
    gradb1 = np.sum(
        delta_2, 0,
        keepdims=True)  # QUESTION: why sum delta_2? Same reasoning as gradb2: the bias b1 is shared across examples

    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Example 32
def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    the backward propagation for the gradients for all parameters.

    Notice the gradients computed here are different from the gradients in
    the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation
    hidden = sigmoid(np.dot(X, W1) + b1)  # R: M*Dx * Dx*H + 1*H = M*H
    output = softmax(np.dot(hidden, W2) + b2)  # R: M*H * H*Dy + 1*Dy = M*Dy
    cost = -np.sum(labels * np.log(output))

    ### YOUR CODE HERE: backward propagation
    gradb2 = -labels + output  # R: M*Dy - M*Dy = M*Dy
    gradW2 = hidden[:, :, np.newaxis] * gradb2[:, np.newaxis, :]  # R: M*H*1 * M*1*Dy = M*H*Dy

    gradb1 = np.sum(gradb2[:, np.newaxis, :] * W2[np.newaxis, :, :],
                    axis=2) * sigmoid_grad(hidden)  # R: sum(M*1*Dy * 1*H*Dy, axis=2) * M*H = M*H
    gradW1 = X[:, :, np.newaxis] * gradb1[:, np.newaxis, :]  # R: M*Dx*1 * M*1*H = M*Dx*H

    gradb2 = np.sum(gradb2, axis=0)  # sum by column
    gradb1 = np.sum(gradb1, axis=0)
    gradW1 = np.sum(gradW1, axis=0)
    gradW2 = np.sum(gradW2, axis=0)

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Example 33
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    indices = [target]
    for k in xrange(K):
        sampleTokenIdx = dataset.sampleTokenIdx()
        while sampleTokenIdx == target:
            sampleTokenIdx = dataset.sampleTokenIdx()
        indices += [sampleTokenIdx]

    signs = np.array([1] + [-1 for k in xrange(K)])
    vecs = outputVectors[indices, :]
    t = sigmoid(vecs.dot(predicted) * signs)
    delta = (t - 1) * signs

    cost = np.sum(-np.log(t))
    gradPred = delta.reshape(1, K + 1).dot(vecs).flatten()

    grad = np.zeros(outputVectors.shape)
    gradtemp = delta.reshape((K+1,1)).dot(predicted.reshape(
        (1,predicted.shape[0])))
    for k in xrange(K+1):
        grad[indices[k]] += gradtemp[k,:]

    # naive implementation but not efficient cause it makes |V| computation.
    # uv = outputVectors.dot(predicted)
    # negSamplesCost = 0
    # negSampleGradPred = np.zeros(predicted.shape[0])
    # grad = np.zeros(outputVectors.shape)
    #
    # for i in xrange(K):
    #     sampleTokenIdx = dataset.sampleTokenIdx()
    #     while sampleTokenIdx == target:
    #         sampleTokenIdx = dataset.sampleTokenIdx()
    #     negSamplesCost += np.log(sigmoid(-uv[sampleTokenIdx]))
    #     negSampleGradPred += (sigmoid(-uv[sampleTokenIdx]) - 1) * outputVectors[sampleTokenIdx, :]
    #     grad[sampleTokenIdx] += -(sigmoid(-uv[sampleTokenIdx]) - 1) * predicted
    #
    # cost = -np.log(sigmoid(uv[target])) - negSamplesCost
    # gradPred = (sigmoid(uv[target]) - 1) * outputVectors[target, :] - negSampleGradPred
    # grad[target] = (sigmoid(uv[target]) - 1) * predicted
    ### END YOUR CODE

    return cost, gradPred, grad
Example 34
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    # data: N x Dx, W1: Dx x H, b: 1 x H 
    a = data.dot(W1) + b1
    h = sigmoid(a)
    # h: N x H, W2: H x Dy, b2: 1 x Dy
    t = h.dot(W2) + b2
    y_hat = softmax(t)
    # y_hat: N x Dy, labels: N x Dy (as int)
    probs = labels * y_hat
    cost = np.sum(-np.log(probs.sum(axis=1)))
    ### END YOUR CODE
    
    ### YOUR CODE HERE: backward propagation
    # obtain the softmax gradient
    dJdt = (y_hat - labels) # N x Dy

    # b2 grad is sum along each index of the Dy vectors
    gradb2 = np.sum(dJdt, 0) 

    # h: N x H, dJdt: N x Dy
    gradW2 = h.T.dot(dJdt) # H x Dy

    # dJdt: N x Dy, W2: H x Dy
    dJdh = dJdt.dot(W2.T)
    # h: N x H
    dhda = sigmoid_grad(h)

    # data: N x Dx, dhda: N x H, DJdh: N x H
    gradW1 = data.T.dot(dhda * dJdh)
    
    # dhda: N x H, DJdh: N x H
    gradb1 = np.sum(dhda * dJdh, 0)
    ### END YOUR CODE
    
    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Example 35
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    z = np.dot(data, W1) + b1
    h = sigmoid(z)

    scores = np.dot(h, W2) + b2

    probs = softmax(scores)

    cost = - np.sum(labels * np.log(probs))
    ### END YOUR CODE
    ### YOUR CODE HERE: backward propagation
    dscores = probs - labels

    gradW2 = np.dot(h.T, dscores)
    gradb2 = np.sum(dscores, axis=0, keepdims=True)
    dh = np.dot(dscores, W2.T)

    dz = sigmoid_grad(h) * dh
    gradW1 = np.dot(data.T, dz)
    gradb1 = np.sum(dz, axis=0, keepdims=True)

    assert(np.all(gradW2.shape == W2.shape))
    assert(np.all(gradb2.shape == b2.shape))
    assert(np.all(gradW1.shape == W1.shape))
    assert(np.all(gradb1.shape == b1.shape))
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Example 36
def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    the backward propagation for the gradients for all parameters.

    Notice the gradients computed here are different from the gradients in
    the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation

    z1 = np.matmul(X, W1) + b1
    a1 = sigmoid(z1)
    a2 = softmax(np.matmul(a1, W2) + b2)
    cost = (-np.log(a2) * labels).sum()

    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dz2 = a2 - labels
    dw2 = np.matmul(a1.T, dz2)
    assert dw2.shape == (H, Dy)
    db2 = dz2.sum(0)
    assert db2.shape == (Dy, )

    dz1 = np.matmul(dz2, W2.T) * sigmoid_grad(a1)
    dw1 = np.matmul(X.T, dz1)
    assert dw1.shape == (Dx, H)
    db1 = dz1.sum(0)
    assert db1.shape == (H, )
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate(
        (dw1.flatten(), db1.flatten(), dw2.flatten(), db2.flatten()))

    return cost, grad
Example 37
def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    the backward propagation for the gradients for all parameters.

    Notice the gradients computed here are different from the gradients in
    the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))       # (10, 5)
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))            # (1, 5)
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))      # (5, 10)
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))          # (1, 10)

    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(X, W1) + b1                                 # (20, 5)
    a1 = sigmoid(z1)                                        # (20, 5)
    scores = np.dot(a1, W2) + b2                            # (20, 10)

    y_pred = softmax(scores)
    cost = -np.sum(labels * np.log(y_pred))

    # ### END YOUR CODE

    # ### YOUR CODE HERE: backward propagation
    dscores = y_pred - labels                               # (20, 10)
    gradW2 = np.dot(a1.T, dscores)                          # (5, 10)
    # gradW2 bp step 1: back to d(softmax(theta))/d(theta), which equals y_pred - y, i.e. dscores defined above
    # gradW2 bp step 2: back to d(W2*X2)/d(W2), this is equal to X2
    # so combine 2 steps together, gradW2 = X2.T * (y-bar - y)

    gradb2 = np.sum(dscores, axis=0)                        #  (1, 10)    
    da1 = np.dot(dscores, W2.T)
    dz1 = sigmoid_grad(a1)*da1
    gradW1 = np.dot(X.T, dz1)
    gradb1 = np.sum(dz1, axis=0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad                                       # cost is a scalar, grad is a flat NumPy array
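The shape comments above assume dimensions = [10, 5, 10] and a batch of 20 examples. A hypothetical setup reproducing those shapes (the sizes are taken from the comments, not prescribed by the assignment) looks like this:

import numpy as np

dimensions = [10, 5, 10]
Dx, H, Dy = dimensions
n_params = (Dx + 1) * H + (H + 1) * Dy      # 10*5 + 5 + 5*10 + 10 = 115
params = np.random.randn(n_params)          # flat vector packing W1, b1, W2, b2

X = np.random.randn(20, Dx)                 # 20 training examples
labels = np.zeros((20, Dy))
labels[np.arange(20), np.random.randint(0, Dy, 20)] = 1.0   # one-hot rows

# With one of the forward_backward_prop implementations above in scope:
# cost, grad = forward_backward_prop(X, labels, params, dimensions)
# assert grad.shape == params.shape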
Esempio n. 38
0
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### Forward propagation
    N = data.shape[0]
    # Hidden layer: affine transform followed by a sigmoid
    a1 = sigmoid(data.dot(W1) + b1)
    # Output layer: affine transform followed by a softmax
    a2 = softmax(a1.dot(W2) + b2)

    cost = -np.sum(np.log(a2[labels == 1])) / N

    ### Backward propagation

    # Calculate analytic gradient for the cross entropy loss function
    grad_a2 = ( a2 - labels ) / N

    # Backpropagate through the second latent layer
    gradW2 = np.dot( a1.T, grad_a2 )
    gradb2 = np.sum( grad_a2, axis=0, keepdims=True )

    # Backpropagate through the first latent layer
    grad_a1 = np.dot( grad_a2, W2.T ) * sigmoid_grad(a1)

    gradW1 = np.dot( data.T, grad_a1 )
    gradb1 = np.sum( grad_a1, axis=0, keepdims=True )

#    if verbose: # Verbose mode for logging information
#        print ("W1 shape: {}".format( str(W1.shape) ))
#        print ("W1 gradient shape: {}".format( str(gradW1.shape) ))
#        print ("b1 shape: {}".format( str(b1.shape) ))
#        print ("b1 gradient shape: {}".format( str(gradb1.shape) ))

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad
Esempio n. 39
0
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(data.dot(W1) + b1)
    y_hat = softmax(h.dot(W2) + b2)

    # cost = - np.sum(np.log(y_hat).dot(labels.transpose()))

    cost = -np.sum(labels * np.log(y_hat)) / data.shape[0]
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    # gradb2 = y_hat - labels
    # gradW2 = np.matmul((y_hat - labels), sigmoid(np.matmul(data, W1) + b1))
    # gradb1 = np.matmul(np.matmul((y_hat - labels), W2.transpose()), np.matmul(sigmoid(np.matmul(data, W1), + b1), (1- sigmoid(np.matmul(data, W1), + b1))))
    #
    # gradb1 = (y_hat - labels) * W2 * sigmoid(data * W1 + b1) * (1 - sigmoid(data * W1 + b1))
    # gradW1 = (y_hat - labels) * W2 * sigmoid(data * W1 + b1) * (1 - sigmoid(data * W1 + b1)) * data

    gradZ2 = (y_hat - labels) / data.shape[0]
    gradb2 = np.sum(gradZ2, axis=0, keepdims=True)
    gradW2 = (h.T).dot(gradZ2)
    gradH = gradZ2.dot(W2.T)
    gradZ1 = gradH * sigmoid_grad(h)  # element-wise multiplication
    gradb1 = np.sum(gradZ1, axis=0, keepdims=True)
    gradW1 = (data.T).dot(gradZ1)

    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Esempio n. 40
0
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    # First draft, kept for reference:
    # z1 = data.dot(W1) + b1
    # hidden = sigmoid(z1)
    # z2 = hidden.dot(W2) + b2
    # prediction = softmax(z2)
    hidden = sigmoid(data.dot(W1) + b1)
    prediction = softmax(hidden.dot(W2) + b2)
    cost = -np.sum(np.log(prediction) * labels)
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta = prediction - labels
    gradW2 = hidden.T.dot(delta)
    gradb2 = np.sum(delta, axis = 0)
    hidden_delta = delta.dot(W2.T) * sigmoid_grad(hidden)
    gradW1 = data.T.dot(hidden_delta)
    gradb1 = np.sum(hidden_delta, axis = 0)
    ### END YOUR CODE
    
    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad
Esempio n. 41
0
def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    the backward propagation for the gradients for all parameters.

    Notice the gradients computed here are different from the gradients in
    the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation
    f1 = np.dot(X, W1) + b1
    h = sigmoid(f1)
    f2 = np.dot(h, W2) + b2
    y_hat = softmax(f2)
    cost = -np.sum(labels * np.log(y_hat))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    sigma1 = y_hat - labels
    sigma2 = W2.transpose()
    sigma3 = sigmoid_grad(h)
    gradb2 = np.sum(sigma1, axis=0)
    gradW2 = np.dot(h.transpose(), sigma1)
    sigma4 = sigma3 * np.dot(sigma1, sigma2)
    gradb1 = np.sum(sigma4, axis=0)
    gradW1 = np.dot(X.transpose(), sigma4)
    # print(gradW1.shape)
    # print(gradb1.shape)
    # print(gradW2.shape)
    # print(gradb2.shape)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))
    # print(grad.shape)
    return cost, grad
Esempio n. 42
0
def forward_backward_prop(X, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    the backward propagation for the gradients for all parameters.

    Notice the gradients computed here are different from the gradients in
    the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Note: compute cost based on `sum` not `mean`.
    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(X, W1) + b1
    g1 = sigmoid(z1)  # M , H

    z2 = np.dot(g1, W2) + b2
    final_scores = softmax(z2)  # M , Dy

    cost = -np.sum(labels * np.log(final_scores))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation

    gradSoft = final_scores - labels  # M , Dy
    gradW2 = g1.T.dot(gradSoft)
    gradb2 = np.sum(gradSoft, axis=0)  # 1 , Dy
    gradz1 = gradSoft.dot(W2.T)  # M , H

    gradSig = gradz1 * sigmoid_grad(g1)  # M , H
    gradW1 = X.T.dot(gradSig)  # Dx , H
    gradb1 = np.sum(gradSig, axis=0)  # 1 , H

    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
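For reference, every forward_backward_prop variant in this collection computes the same quantities; in the sum-based form (the mean-based variants simply divide the cost and all gradients by the batch size M) they are:

\begin{aligned}
z_1 &= X W_1 + b_1, \qquad h = \sigma(z_1), \qquad \hat{Y} = \operatorname{softmax}(h W_2 + b_2),\\
J &= -\sum_{i,j} Y_{ij} \log \hat{Y}_{ij},\\
\delta_2 &= \hat{Y} - Y, \qquad
\frac{\partial J}{\partial W_2} = h^{\top} \delta_2, \qquad
\frac{\partial J}{\partial b_2} = \sum_i (\delta_2)_{i,:},\\
\delta_1 &= (\delta_2 W_2^{\top}) \odot h \odot (1 - h), \qquad
\frac{\partial J}{\partial W_1} = X^{\top} \delta_1, \qquad
\frac{\partial J}{\partial b_1} = \sum_i (\delta_1)_{i,:}.
\end{aligned}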
Esempio n. 43
0
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    # data: N x Dx, W1: Dx x H, b: 1 x H
    a = data.dot(W1) + b1
    h = sigmoid(a)
    # h: N x H, W2: H x Dy, b2: 1 x Dy
    t = h.dot(W2) + b2
    y_hat = softmax(t)
    # y_hat: N x Dy, labels: N x Dy (as int)
    probs = labels * y_hat
    cost = np.sum(-np.log(probs.sum(axis=1)))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    # obtain the softmax gradient
    dJdt = (y_hat - labels)  # N x Dy

    # b2 grad is sum along each index of the Dy vectors
    gradb2 = np.sum(dJdt, 0)

    # h: N x H, dJdt: N x Dy
    gradW2 = h.T.dot(dJdt)  # H x Dy

    # dJdt: N x Dy, W2: H x Dy
    dJdh = dJdt.dot(W2.T)
    # h: N x H
    dhda = sigmoid_grad(h)

    # data: N x Dx, dhda: N x H, DJdh: N x H
    gradW1 = data.T.dot(dhda * dJdh)

    # dhda: N x H, DJdh: N x H
    gradb1 = np.sum(dhda * dJdh, 0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Esempio n. 44
0
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    # refactorization of the first draft below.
    V,D = outputVectors.shape
    output = outputVectors[indices] # the 0th is the center
    uv = np.dot(output, predicted)
    uv[0] = -uv[0]
    sig = sigmoid(-uv)
    cost = -np.sum(np.log(sig))
    gradTheta = 1 - sig
    gradTheta[0] = - gradTheta[0]
    gradPred = np.dot(output.T, gradTheta)  # 1 x D array

    samples = np.reshape(gradTheta, (-1, 1)) * predicted
    grad = np.zeros([V, D])
    for i in range(len(indices)):
        grad[indices[i]] += samples[i]

    ########################## First draft ###########################
    ### NOTE: this first draft is very slow; the bottleneck is the 'for' loop below.
    # uov = np.dot(predicted, outputVectors[target])
    # sigmoid_uov = sigmoid(uov)
    # ukv = np.dot(outputVectors[indices[1:]], predicted)  # exclude the target
    # sigmoid_ukv = sigmoid(-ukv)  # 1 x K
    # cost = -np.log(sigmoid_uov) - np.sum(np.log(sigmoid_ukv))
    # gradTheta1 = -(1 - sigmoid_uov)    # a scalar
    # gradTheta2 = (1 - sigmoid_ukv) # K x 1 array
    # gradPred = gradTheta1 * outputVectors[target] + np.sum(outputVectors[indices[1:]] * np.reshape(gradTheta2, (-1,1)), axis = 0)  # 1 x D array
    # gradOutput = np.zeros([V,D])
    # 
    # # only K non-zero rows correspond to the K negative samples; the sample count is the same, but it may contain duplicated words
    # # Can be parallelized further.
    # samples = np.reshape(gradTheta2, (-1, 1)) * predicted
    # for i in range(V):
    #     gradOutput[i] = np.sum(samples[np.where(np.array(indices[1:]) == i)], axis=0)
    # # the positive sample
    # gradOutput[indices[0]] = np.reshape(gradTheta1, (-1, 1)) * predicted
    # grad = gradOutput
    ### END YOUR CODE

    return cost, gradPred, grad
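Several snippets here rely on getNegativeSamples(target, dataset, K) from the starter code. A plausible sketch of that helper, consistent with how it is used (K sampled indices, none equal to the target), is shown below; the assignment's own version may differ in details.

def getNegativeSamples(target, dataset, K):
    # Sample K token indices that are all different from `target`.
    indices = [None] * K
    for k in range(K):
        newidx = dataset.sampleTokenIdx()
        while newidx == target:
            newidx = dataset.sampleTokenIdx()
        indices[k] = newidx
    return indices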
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    #raise NotImplementedError

    z1 = np.matmul(data, W1) + b1
    h = sigmoid(z1)
    z2 = np.matmul(h, W2) + b2
    y_hat = softmax(z2)

    cost = np.sum(-np.log(y_hat[labels == 1])) / (data.shape[0])

    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    #raise NotImplementedError

    d1 = (y_hat - labels) / (data.shape[0])
    gradW2 = np.matmul(np.transpose(h), d1)
    gradb2 = np.sum(d1, axis=0, keepdims=True)

    d2 = np.matmul(d1, np.transpose(W2))
    d3 = d2 * sigmoid_grad(h)

    gradW1 = np.matmul(np.transpose(data), d3)
    gradb1 = np.sum(d3, axis=0, keepdims=True)

    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Esempio n. 46
0
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, 
    K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector  
    # and one target word vector as a building block for word2vec     
    # models, using the negative sampling technique. K is the sample  
    # size. You might want to use dataset.sampleTokenIdx() to sample  
    # a random word index. 
    # 
    # Note: See test_word2vec below for dataset's initialization.
    #                                       
    # Input/Output Specifications: same as softmaxCostAndGradient     
    # We will not provide starter code for this function, but feel    
    # free to reference the code you previously wrote for this        
    # assignment!
    
    ### YOUR CODE HERE
    K_set = []
    while (len(K_set) < K):
        candidateIndex = dataset.sampleTokenIdx()
        if (candidateIndex != target):
            K_set += [candidateIndex]

    cost = 0
    grad = np.zeros(outputVectors.shape)
    gradPred = np.zeros(predicted.shape)
    for k in K_set:
        score = sigmoid(- outputVectors[k].dot(predicted))
        cost += - np.log(score)
        grad[k,:] += - (score - 1) * predicted 
        gradPred += - (score - 1) * outputVectors[k]

    score = sigmoid(outputVectors[target].dot(predicted))
    cost += - np.log(score)
    grad[target,:] = (score - 1) * predicted
    gradPred += (score - 1) * outputVectors[target]

    # ugly fix ...
    gradPred = gradPred[np.newaxis, :]
        

    #raise NotImplementedError
    ### END YOUR CODE
    
    return cost, gradPred, grad
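In all of these snippets the dataset argument is used only through sampleTokenIdx(). A minimal stand-in for quick experimentation is sketched below; DummyDataset and its uniform sampling are our assumptions, whereas a real training setup would sample negatives from a (smoothed) unigram distribution.

import numpy as np

class DummyDataset(object):
    def __init__(self, vocab_size, seed=0):
        self.vocab_size = vocab_size
        self.rng = np.random.RandomState(seed)

    def sampleTokenIdx(self):
        # Uniform over the vocabulary; only this method is needed above.
        return self.rng.randint(0, self.vocab_size)

# Usage sketch with hypothetical sizes (V=7 tokens, D=3 dimensions), assuming
# one of the negSamplingCostAndGradient implementations above is in scope:
# dataset = DummyDataset(7)
# outputVectors = np.random.randn(7, 3)
# predicted = np.random.randn(3)
# cost, gradPred, grad = negSamplingCostAndGradient(predicted, 1, outputVectors, dataset)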
def test_sigmoid(dim_1, dim_2):
    a1       = np.random.normal(loc=0., scale=20., size=(dim_1,dim_2))
    a1_copy  = a1.copy()

    s_a1     = sigmoid(a1)
    s_sol_a1 = sigmoid_sol(a1_copy)

    assert rel_error(sigmoid_grad(s_a1), sigmoid_grad_sol(s_sol_a1)) <= 1e-10
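test_sigmoid above, like every snippet in this collection, assumes sigmoid, sigmoid_grad, softmax and rel_error are already in scope. The definitions below are a self-contained sketch of plausible implementations, not the assignment's exact starter code.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_grad(s):
    # Takes the sigmoid *output* s = sigmoid(x), as the snippets above do.
    return s * (1.0 - s)

def softmax(x):
    # Row-wise softmax, numerically stabilised by subtracting the row maximum.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    e = np.exp(shifted)
    return e / np.sum(e, axis=-1, keepdims=True)

def rel_error(a, b):
    # Maximum relative error between two arrays (assumed form).
    return np.max(np.abs(a - b) / np.maximum(1e-8, np.abs(a) + np.abs(b)))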
Esempio n. 48
0
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    # data : N * Dx
    # W1   : Dx * H
    # b1   : 1 * H
    # W2   : H * Dy
    # b2   : 1 * Dy
    N = data.shape[0]

    z1 = data.dot(W1) + b1
    a1 = sigmoid(z1)  # N * H
    z2 = a1.dot(W2) + b2
    a2 = softmax(z2)  # N * Dy

    cost = np.sum(-np.log(a2[labels == 1])) / N

    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    delta_score = a2 - labels  # N * Dy
    delta_score /= N

    gradW2 = np.dot(a1.T, delta_score)  # (H * N) . (N * Dy) = H * Dy
    gradb2 = np.sum(delta_score, axis=0)

    grad_h = np.dot(delta_score, W2.T)  # (N * Dy) . (Dy * H) = N * H
    grad_h = sigmoid_grad(a1) * grad_h

    gradW1 = np.dot(data.T, grad_h)
    gradb1 = np.sum(grad_h, axis=0)

    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Esempio n. 49
0
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    #raise NotImplementedError
    U = outputVectors
    u_o = U[target]
    v_c = predicted 
    N = U.shape[0]
    
    sig_o_c = sigmoid(np.dot(u_o, v_c))
    cost = -np.log(sig_o_c)
    gradPred = (sig_o_c - 1)*u_o
    grad = np.zeros((N, len(v_c)))
    grad[target] += (sig_o_c - 1)*v_c

    for k in indices[1:]:  # indices[0] is the target, handled above
        u_k = U[k]
        sig_k_c = sigmoid(-np.dot(u_k, v_c))
        cost += -np.log(sig_k_c)
        gradPred += -(sig_k_c - 1)*u_k
        grad[k] += -(sig_k_c - 1)*v_c

    ### END YOUR CODE

    return cost, gradPred, grad
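Written out, the cost and gradients these negSamplingCostAndGradient implementations compute are (v_c is the predicted/center vector, u_o the target's output vector, u_1, ..., u_K the sampled negatives):

\begin{aligned}
J &= -\log \sigma(u_o^{\top} v_c) - \sum_{k=1}^{K} \log \sigma(-u_k^{\top} v_c),\\
\frac{\partial J}{\partial v_c} &= \bigl(\sigma(u_o^{\top} v_c) - 1\bigr)\, u_o
      + \sum_{k=1}^{K} \bigl(1 - \sigma(-u_k^{\top} v_c)\bigr)\, u_k,\\
\frac{\partial J}{\partial u_o} &= \bigl(\sigma(u_o^{\top} v_c) - 1\bigr)\, v_c, \qquad
\frac{\partial J}{\partial u_k} = \bigl(1 - \sigma(-u_k^{\top} v_c)\bigr)\, v_c .
\end{aligned}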
Esempio n. 50
0
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
    K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector
    # and one target word vector as a building block for word2vec
    # models, using the negative sampling technique. K is the sample
    # size. You might want to use dataset.sampleTokenIdx() to sample
    # a random word index.
    #
    # Note: See test_word2vec below for dataset's initialization.
    #
    # Input/Output Specifications: same as softmaxCostAndGradient
    # We will not provide starter code for this function, but feel
    # free to reference the code you previously wrote for this
    # assignment!

    ### YOUR CODE HERE
    W,D = outputVectors.shape

    UK = np.zeros((K+1, D))
    indices = [target]
    for i in xrange(K):
        k = dataset.sampleTokenIdx()
        while k == target:
            k = dataset.sampleTokenIdx()
        indices.append(k)
    for i,ix in enumerate(indices):
        UK[i] = outputVectors[ix]

    u_o = outputVectors[target] # (D,)
    cost = - np.log(sigmoid(np.dot(u_o, predicted))) - np.sum(np.log(sigmoid(-np.dot(UK[1:], predicted))))
    gradPred = (sigmoid(np.dot(u_o,predicted))-1) * u_o + np.dot(UK[1:].T,sigmoid(np.dot(UK[1:], predicted))) # dJ/dV_c, (D,)

    y = np.zeros(K+1); y[0] = 1.0  # one-hot label: position 0 is the target
    grad = np.zeros(outputVectors.shape)
    gradK = np.outer(sigmoid(np.dot(UK, predicted)) - y, predicted)
    for i,ix in enumerate(indices):
        grad[ix] += gradK[i]
    ### END YOUR CODE

    return cost, gradPred, grad
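The scatter-add loop above (grad[ix] += gradK[i]) accumulates rows under possibly repeated indices. As a sketch, np.add.at does the same accumulation in a single call and, unlike plain fancy-index assignment, handles duplicate indices correctly; the sizes below are arbitrary.

import numpy as np

V, D, K = 6, 4, 10
indices = [2] + list(np.random.randint(0, V, size=K))   # may contain repeats
gradK = np.random.randn(K + 1, D)

# Loop form, as in the snippet above:
grad_loop = np.zeros((V, D))
for i, ix in enumerate(indices):
    grad_loop[ix] += gradK[i]

# Vectorised form; duplicates in `indices` accumulate instead of overwriting:
grad_vec = np.zeros((V, D))
np.add.at(grad_vec, indices, gradK)

assert np.allclose(grad_loop, grad_vec)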
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    fc_out = np.dot(data, W1) + b1  # shape (M, H)
    fc_sigmoid_out = sigmoid(fc_out)  # shape (M, H)
    scores = np.dot(fc_sigmoid_out, W2) + b2  # shape (M, Dy)
    y_hat = softmax(scores)  # shape (M, Dy)
    # M = data.shape[0]
    cost = -np.sum(labels * np.log(y_hat))  # / M
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    dscores = y_hat - labels  # / M  # shape (M, Dy)

    gradW2 = np.dot(fc_sigmoid_out.T, dscores)  # shape (H, Dy)
    gradb2 = np.sum(dscores, axis=0)  # shape (Dy,)
    dfc_sigmoid_out = np.dot(dscores, W2.T)  # shape (M, H)
    dfc_out = dfc_sigmoid_out * sigmoid_grad(fc_sigmoid_out)  # shape (M, H)

    gradW1 = np.dot(data.T, dfc_out)  # shape (Dx, H)
    gradb1 = np.sum(dfc_out, axis=0)  # shape (H,)

    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Esempio n. 52
0
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    """

    indices = [target]

    indices.extend(getNegativeSamples(target, dataset, K))

    d = predicted.shape[0]
    v = outputVectors.shape[0]

    u_0 = outputVectors[target].reshape(1,d) # row
    v_c = predicted.reshape(d,1) # column

    predict = sigmoid(np.dot(u_0, v_c)) # reuse

    cost = - np.log(predict)
    grad = np.zeros((v,d))+0.
    grad[target] = ((predict- 1.0) *(v_c.T)).reshape((d,))
    gradPred = (predict - 1.0) *u_0

    for k in indices[1:]:
        u_k  =  outputVectors[k]
        u_k  = u_k.reshape((1,d)) # row

        predict = sigmoid(np.dot(u_k, v_c))
        cost -= np.log(1-predict)

        gradPred += predict * u_k
        grad[k] +=( predict * v_c.T).reshape((d,))

    gradPred = gradPred.reshape((d,))

    return cost, gradPred, grad
Esempio n. 53
0
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, 
    K=10):
    """ Negative sampling cost function for word2vec models """

    # Implement the cost and gradients for one predicted word vector  
    # and one target word vector as a building block for word2vec     
    # models, using the negative sampling technique. K is the sample  
    # size. You might want to use dataset.sampleTokenIdx() to sample  
    # a random word index. 
    # 
    # Note: See test_word2vec below for dataset's initialization.
    #                                       
    # Input/Output Specifications: same as softmaxCostAndGradient     
    # We will not provide starter code for this function, but feel    
    # free to reference the code you previously wrote for this        
    # assignment!
    
    cost = -np.log(sigmoid(np.dot(predicted,outputVectors[target])))
    num_rand = 0
    grad = np.zeros(outputVectors.shape)
    gradPred = -outputVectors[target]*(1-sigmoid(np.dot(predicted,outputVectors[target])))
    grad[target] = -predicted*(1-sigmoid(np.dot(predicted,outputVectors[target])))
    while num_rand < K:
        rand = dataset.sampleTokenIdx()
        if rand == target:
            continue
        num_rand += 1
        cost -= np.log(sigmoid(-np.dot(predicted,outputVectors[rand])))
        grad[rand] += predicted*(1-sigmoid(-np.dot(predicted,outputVectors[rand])))
        gradPred += outputVectors[rand]*(1-sigmoid(-np.dot(predicted,outputVectors[rand])))
    
    return cost, gradPred, grad
Esempio n. 54
0
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### forward propagation
    N = data.shape[0]

    l1 = data.dot(W1) + b1
    h = sigmoid(l1)
    l2 = h.dot(W2) + b2
    y_hat = softmax(l2)

    cost = -np.sum(labels * np.log(y_hat)) / N # cross entropy
    
    ### backward propagation
    dl2 = y_hat - labels
    dW2 = np.dot(h.T, dl2)
    db2 = np.sum(dl2, axis=0)

    dh = np.dot(dl2, W2.T)

    dl1 = dh * sigmoid_grad(h)
    dW1 = np.dot(data.T, dl1)
    db1 = np.sum(dl1, axis=0)

    gradW2 = dW2/N
    gradb2 = db2/N
    gradW1 = dW1/N
    gradb1 = db1/N
    
    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad
Esempio n. 55
0
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    
    ### YOUR CODE HERE: forward propagation
    p1 = np.dot(data, W1) + b1
    h = sigmoid(p1) #(M,H)
    p2 = np.dot(h, W2) + b2
    y_pred = softmax(p2) #(M, Dy)
    cost = np.mean(np.sum(-1 * np.multiply(labels, np.log(y_pred)), axis=1))
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    gradp2 = (y_pred - labels) / np.shape(data)[0] #(M,Dy)
    gradW2 = np.dot(h.T, gradp2) #(H, Dy) 
    gradb2 = np.sum(gradp2, axis=0).reshape((1,-1)) #(1, Dy)
    gradh = np.dot(gradp2, W2.T) #(M,H)
    gradp1 = np.multiply(gradh, h * (1 - h)) #(M,H) element wise multiplication
    gradW1 = np.dot(data.T, gradp1) # (Dx,H)
    gradb1 =  np.sum(gradp1, axis=0).reshape((1,-1)) #(1,H)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad
Esempio n. 56
0
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    h = sigmoid(np.dot(data, W1) + b1)
    yhat = softmax(np.dot(h, W2) + b2)
    cost = np.sum(-np.log(yhat[labels == 1])) / data.shape[0]
    ### END YOUR CODE

    ### YOUR CODE HERE: backward propagation
    d3 = (yhat - labels) / data.shape[0]
    gradW2 = np.dot(h.T, d3)
    gradb2 = np.sum(d3,0,keepdims=True)

    dh = np.dot(d3,W2.T)
    grad_h = sigmoid_grad(h) * dh

    gradW1 = np.dot(data.T,grad_h)
    gradb1 = np.sum(grad_h,0)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    cost = 0.
    gradPred = np.zeros(predicted.shape)  # shape (N,)
    grad = np.zeros(outputVectors.shape)  # shape (W, N)
    vc = predicted  # shape (N,)
    uo = outputVectors[target]  # shape (N,)
    yo = sigmoid(np.dot(uo, vc))  # scalar
    cost += -np.log(yo)

    gradPred += (yo - 1) * uo  # shape (N,)
    grad[target] += (yo - 1) * vc
    for k in indices[1:]:
        uk = outputVectors[k]
        y_neg_k = sigmoid(-np.dot(uk, vc))
        cost += -np.log(y_neg_k)
        gradPred += -(y_neg_k - 1) * uk  # shape (N,)
        grad[k] += -(y_neg_k - 1) * vc
    ### END YOUR CODE
    return cost, gradPred, grad
Esempio n. 58
0
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    grad = np.zeros(outputVectors.shape)
    gradPred = np.zeros(predicted.shape)
    cost = 0
    z = sigmoid(np.dot(outputVectors[target], predicted))

    cost -= np.log(z)
    grad[target] += predicted * (z - 1.0)
    gradPred += outputVectors[target] * (z - 1.0)

    for k in xrange(K):
        samp = indices[k + 1]
        z = sigmoid(np.dot(outputVectors[samp], predicted))
        cost -= np.log(1.0 - z)
        grad[samp] += predicted * z
        gradPred += outputVectors[samp] * z
    ### END YOUR CODE

    return cost, gradPred, grad
Esempio n. 59
0
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    gradPred = np.zeros_like(predicted)
    grad = np.zeros_like(outputVectors)
    cost = 0.0 

    product = np.dot(outputVectors[target], predicted) #intermediate value
    cost = -np.log(sigmoid(product))
    gradPred = (sigmoid(product) - 1) * outputVectors[target]
    grad[target] = (sigmoid(product) - 1) * predicted
    
    for index in indices[1:]:  # indices[0] is the target, handled above
        neg_sig = sigmoid(-np.dot(outputVectors[index], predicted))
        cost += -np.log(neg_sig)
        gradPred += -(neg_sig - 1) * outputVectors[index]
        grad[index] += -(neg_sig - 1) * predicted
    ### END YOUR CODE

    return cost, gradPred, grad
Esempio n. 60
0
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network 
    
    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    ### YOUR CODE HERE: forward propagation
    N = data.shape[0]
    Z1 = data.dot(W1) + b1     # (N, H)
    A1 = sigmoid(Z1)           # (N, H)
    scores = A1.dot(W2) + b2   # (N, Dy)
    probs = softmax(scores)    # (N, Dy)
    cost = -np.sum(np.log(probs[labels==1])) / N
    ### END YOUR CODE
    
    ### YOUR CODE HERE: backward propagation
    dscores = (probs - labels) / N
    dW2 = A1.T.dot(dscores)
    db2 = np.sum(dscores, axis=0)
    dA1 = dscores.dot(W2.T)
    dZ1 = sigmoid_grad(A1) * dA1
    dW1 = data.T.dot(dZ1)
    db1 = np.sum(dZ1, axis=0)
    
    gradW1 = dW1
    gradW2 = dW2
    gradb1 = db1
    gradb2 = db2
    ### END YOUR CODE
    
    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad
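Finally, the (cost, grad) pair returned by any of these functions plugs straight into a gradient-descent loop. The driver below is a minimal sketch (the name sgd, the learning rate and the iteration count are ours; the assignment's own SGD adds annealing and checkpointing).

import numpy as np

def sgd(f, params0, lr=0.3, iterations=100):
    # f maps a flat parameter vector to (cost, grad).
    params = params0.copy()
    cost = None
    for _ in range(iterations):
        cost, grad = f(params)
        params -= lr * grad
    return params, cost

# Usage sketch, assuming forward_backward_prop and a setup like the one shown
# earlier (data, labels, params, dimensions) are in scope:
# params, final_cost = sgd(lambda p: forward_backward_prop(data, labels, p, dimensions), params)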