Code example #1
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    # YOUR CODE HERE
    softmax_result = softmax(outputVectors.dot(predicted))
    cost = -np.log(softmax_result[target])
    gradPred = -outputVectors[target, :] + softmax_result.dot(outputVectors)  # -u_o + sum_i(SoftMax_i * u_i)
    grad = softmax_result[np.newaxis, :].T.dot(predicted[np.newaxis, :])  # for every word w it is (0 + SoftMax_w)*v_c
    grad[target, :] -= predicted  # for u_o it is (-1 + SoftMax_o)*v_c, so subtract v_c from the target row
    # END YOUR CODE
    return cost, gradPred, grad
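
For reference, all of the snippets on this page implement the same quantities. In the docstring notation, with v_c = predicted, o = target, U = outputVectors (output vectors as rows), \hat{y} = softmax(U v_c) and y the one-hot indicator of the target index, the cost and gradients work out to:

    J = -\log \hat{y}_o = -\log \frac{\exp(u_o^\top v_c)}{\sum_w \exp(u_w^\top v_c)}
    \partial J / \partial v_c = U^\top (\hat{y} - y)        (returned as gradPred, shape (d,))
    \partial J / \partial U   = (\hat{y} - y) \, v_c^\top   (returned as grad, shape (V, d))

The examples below differ only in how they arrange these matrix products.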
Code example #2
File: q2e_word2vec.py Project: oriyor/NLP
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word vector
    grad -- the gradient with respect to all the other word vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    ### YOUR CODE HERE

    output_products_vector = np.dot(
        outputVectors, predicted)  # vector of uoT * vc for o = 1,2, ... ,W
    output_probabilities_vector = softmax(
        output_products_vector)  # vector of p(o|c) for o = 1,2, ... ,W
    u_o = outputVectors[target]

    cost = -np.log(output_probabilities_vector[target])  # -log(p(oi|c))

    gradPred = -u_o + np.sum(
        output_probabilities_vector[:, np.newaxis] * outputVectors,
        axis=0)  # ndarray of shape (d,), the embedding dimension

    target_indicator = np.zeros(
        outputVectors.shape[0])  # length equals the vocabulary size
    target_indicator[
        target] = -1  # -1 at the target index o, 0 everywhere else
    grad = predicted * (target_indicator +
                        output_probabilities_vector)[:, np.newaxis]

    ### END YOUR CODE

    return cost, gradPred, grad
Code example #3
File: q2e_word2vec.py Project: nlp-yotam-guy/hw1
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """
    # outputVectors >> U
    # predicted     >> v_c
    # target        >> o

    v_c = predicted
    U = outputVectors

    dot_prod = np.dot(v_c,U.T)
    softmax_out = softmax(dot_prod)

    cost = -np.log(softmax_out[target])

    softmax_out[target] -= 1

    gradPred = np.dot(softmax_out, U)

    softmax_out = softmax_out.reshape(softmax_out.shape[0],1)

    grad = np.dot(softmax_out,v_c.reshape(1,len(v_c)))

    return cost, gradPred, grad
Code example #4
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    ### YOUR CODE HERE

    #  predictions:
    vhat = predicted
    z = np.dot(outputVectors, vhat)
    preds = softmax(z)

    #  Calculate the cost:
    cost = -np.log(preds[target])

    #  Gradients
    z = preds.copy()
    z[target] -= 1.0

    grad = np.outer(z, vhat)
    gradPred = np.dot(outputVectors.T, z)

    ### END YOUR CODE

    return cost, gradPred, grad
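
A quick way to sanity-check any of these implementations is a central finite-difference comparison against gradPred. A minimal sketch, assuming it runs in the same module as the function above (so that the assignment's softmax helper is in scope); the toy sizes, seed and tolerance are illustrative:

import numpy as np

np.random.seed(0)
V, d = 5, 3                               # toy vocabulary size and embedding dimension
outputVectors = np.random.randn(V, d)
predicted = np.random.randn(d)
target = 2

cost, gradPred, grad = softmaxCostAndGradient(predicted, target, outputVectors, None)
print(cost, gradPred.shape, grad.shape)   # scalar cost, gradPred (d,), grad (V, d)

# central finite differences on the cost, perturbing one coordinate of predicted at a time
eps = 1e-6
numeric = np.zeros(d)
for i in range(d):
    plus, minus = predicted.copy(), predicted.copy()
    plus[i] += eps
    minus[i] -= eps
    numeric[i] = (softmaxCostAndGradient(plus, target, outputVectors, None)[0] -
                  softmaxCostAndGradient(minus, target, outputVectors, None)[0]) / (2 * eps)
print(np.allclose(numeric, gradPred, atol=1e-5))   # should print True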
Code example #5
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted (Vc) -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target (Uo) -- integer, the index of the target word
    outputVectors (Uw) -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """
    ### YOUR CODE HERE
    w = np.dot(outputVectors, predicted)  # (V, d) x (d,) -> (V,)
    # prob[i] = p(i|c) = exp(u_i . v_c) / sum_x exp(u_x . v_c)
    prob = softmax(w)  # y_hat, shape (V,)

    #  Cost:
    cost = -np.log(prob[target])  # CE(y, y_hat) = -sum_i y_i * log(y_hat_i)

    prob[target] -= 1.0  # p(o|c) - 1

    # v_c * p(x|c) for every output vector x, and v_c * (p(o|c) - 1) for the target row
    grad = np.zeros(outputVectors.shape)
    grad += predicted
    grad = (grad.T * prob).T  # shape (V, d)
    # -u_o + sum_x u_x * y_hat[x]
    gradPred = np.dot(outputVectors.T, prob)  # (d, V) x (V,) -> (d,)
    ### END YOUR CODE

    return cost, gradPred, grad
Code example #6
File: q2e_word2vec.py Project: dan22333/NLP
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """
    ### YOUR CODE HERE
    # V * d * d * 1
    # predicted = V_c
    w = np.dot(outputVectors, predicted)
    # prob[x] = y_hat[x] = exp(U_x . V_c) / sigma_w exp(U_w . V_c) for each output vector x
    prob = softmax(w)  # shape (V,)

    #  Cost:
    cost = -np.log(prob[target])  # CE(y, y_c) = -Sigma y_i * log(y_pred_i)

    prob[target] -= 1.0  # U_targ * (y_pred[targ] - 1)

    grad = np.outer(prob, predicted)  # outer product of (V,) and (d,) -> (V, d)
    # -U_targ + sigma Ux * y_pred[x]
    gradPred = np.dot(outputVectors.T, prob)  # d * V * V * 1
    ### END YOUR CODE

    return cost, gradPred, grad
Code example #7
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    U = outputVectors
    uo = U[target]
    vc = predicted
    U_dot_Vc = U.dot(vc)
    softmax_U_dot_vc = softmax(U_dot_Vc)

    cost = -np.log(softmax_U_dot_vc[target])
    gradPred = -uo + np.sum(softmax_U_dot_vc[:, np.newaxis] * U, axis=0)
    grad = softmax_U_dot_vc[:, np.newaxis] * np.tile(
        vc, (outputVectors.shape[0], 1))
    grad[target] -= vc

    return cost, gradPred, grad
Code example #8
File: q2e_word2vec.py Project: Verose/AML_NLP-2018
def softmaxCostAndGradient(predicted, target, output_vectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    output_vectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    grad_pred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    # each score x_hat[i] fed to the softmax is u_i^T v, so x_hat = v U^T
    x_hat = np.matmul(predicted, output_vectors.T)
    y_hat = softmax(x_hat)
    # target is the index of the 1 in the one-hot vector y
    cost = -(np.log(y_hat[target]))

    # using the derivative of the cost from 2a
    grad_pred = np.matmul(y_hat, output_vectors) - output_vectors[target]
    # using the derivative of the cost from 2b
    grad = np.outer(predicted, y_hat).transpose()
    grad[target] -= predicted

    return cost, grad_pred, grad
Code example #9
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    y_hat = softmax(np.dot(outputVectors,predicted))
    y = np.zeros(outputVectors.shape[0])
    y[target] = 1

    cost = -np.log(y_hat[target])  # CE is the negative log of the predicted probability of the target word
    gradPred = np.dot(outputVectors.transpose(), (y_hat - y))  # U^T (y_hat - y)

    temp = np.expand_dims(y_hat - y, 1)
    grad = np.multiply(temp, predicted)  # (y_hat_w - y_w) * v_c for every row w

    return cost, gradPred, grad
Code example #10
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (\hat{v} in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors

    We will not provide starter code for this function, but feel
    free to reference the code you previously wrote for this
    assignment!
    """

    W = outputVectors.shape[0]  # extracting the dictionary size as W
    D = outputVectors.shape[1]  # extracting the embedding size as D

    #turn predicted to row vector - if not already
    if predicted.shape[0] != 1:
        predicted = np.expand_dims(predicted, axis=1)
        predicted = np.transpose(predicted)

    #calc inner product for predicted with all vectors of outputVectors
    inner_prod = np.matmul(predicted,
                           np.transpose(outputVectors))  # dim [1 X W]

    #calc softmax for each word in the dictionary
    s = softmax(inner_prod)

    # calculating the cost
    cost = -np.log(
        s[0, target]
    )  # y is a one-hot vector, so the only nonzero term in the sum is the one for y[target]

    #calculating gradPred according to our calculations at 2a
    gradPred_numerator = np.sum((np.tile(np.exp(np.transpose(inner_prod)),
                                         (1, D)) * outputVectors),
                                axis=0)
    gradPred_denumerator = np.sum(np.exp(inner_prod))
    gradPred = -np.transpose(outputVectors[
        target, :]) + gradPred_numerator / gradPred_denumerator  # shape (D,)

    # calculating grad according to our calculations at 2b
    grad_numerator = np.tile(predicted, (W, 1)) * np.tile(
        np.exp(np.transpose(inner_prod)), (1, D))
    grad_denumerator = gradPred_denumerator
    grad = np.zeros([W, D], dtype=np.float32)
    grad[target, :] = -predicted
    grad += grad_numerator / grad_denumerator

    return cost, gradPred, grad
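
Code examples #7 and #10 build intermediate matrices with np.tile; plain NumPy broadcasting gives the same (W, D) result without the explicit copies, which is what most of the other examples do. A minimal equivalent with illustrative names (probs being the softmax vector of shape (W,) and vc the predicted vector of shape (D,)):

grad = probs[:, np.newaxis] * vc  # (W, 1) broadcast against (D,) -> (W, D); same as probs[:, np.newaxis] * np.tile(vc, (W, 1))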