Example #1
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Compute the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (v-hat in
                 the written component); expected to be 1-D with one
                 entry per embedding dimension
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
                vector
    grad -- the gradient with respect to all the output word
            vectors (same shape as outputVectors)

    Relies on a `softmax` helper being in scope; it is assumed to
    return normalized probabilities for a 1-D vector of scores.
    """
    # One probability per token: softmax over the scores u_w . v_c.
    probs = softmax(outputVectors.dot(predicted))

    # Only the target entry of the one-hot label contributes, so index
    # first and take the log of a single number.
    cost = -np.log(probs[target])

    # -u_o + sum_w(p_w * u_w)
    gradPred = probs.dot(outputVectors) - outputVectors[target]

    # Every row w gets p_w * v_c ...
    grad = np.outer(probs, predicted)
    # ... and the target row gets (p_o - 1) * v_c, i.e. subtract v_c once.
    grad[target] -= predicted

    return cost, gradPred, grad
Example #2
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Compute the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (v-hat in
                 the written component); expected to be 1-D
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word vector
    grad -- the gradient with respect to all the other word vectors

    Relies on a `softmax` helper being in scope; it is assumed to
    return normalized probabilities for a 1-D vector of scores.
    """
    # Vector of u_w^T v_c for every token w.
    output_products_vector = np.dot(outputVectors, predicted)
    # Vector of p(w | c) for every token w.
    output_probabilities_vector = softmax(output_products_vector)
    u_o = outputVectors[target]

    # -log(p(o | c))
    cost = -np.log(output_probabilities_vector[target])

    # -u_o + sum_w p(w | c) * u_w
    gradPred = -u_o + np.sum(
        output_probabilities_vector[:, np.newaxis] * outputVectors,
        axis=0)

    # Zero everywhere except -1 at the target index, so that adding the
    # probabilities yields (p_w - y_w) for every token in one vector.
    target_indicator = np.zeros(outputVectors.shape[0])
    target_indicator[target] = -1
    # Broadcast the per-token coefficient against v_c: row w is
    # (p_w - y_w) * v_c.
    grad = predicted * (target_indicator +
                        output_probabilities_vector)[:, np.newaxis]

    return cost, gradPred, grad
Example #3
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Compute the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (v-hat in
                 the written component); expected to be 1-D
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
                vector
    grad -- the gradient with respect to all the output word
            vectors (same shape as outputVectors)

    Notation used below:
        outputVectors >> U
        predicted     >> v_c
        target        >> o

    Relies on a `softmax` helper being in scope; it is assumed to
    return normalized probabilities for a 1-D vector of scores.
    """
    v_c = predicted
    U = outputVectors

    # Scores u_w . v_c for every token, turned into probabilities.
    softmax_out = softmax(np.dot(v_c, U.T))

    # Read the cost before the probabilities are turned into (p - y).
    cost = -np.log(softmax_out[target])

    # Subtracting the one-hot label in place gives the error vector p - y.
    softmax_out[target] -= 1

    # U^T (p - y)
    gradPred = np.dot(softmax_out, U)

    # (p - y) v_c^T: one row per token, one column per embedding dimension.
    grad = np.outer(softmax_out, v_c)

    return cost, gradPred, grad
Example #4
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Compute the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (v-hat in
                 the written component); expected to be 1-D
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
                vector
    grad -- the gradient with respect to all the output word
            vectors (same shape as outputVectors)

    Relies on a `softmax` helper being in scope; it is assumed to
    return normalized probabilities for a 1-D vector of scores.
    """
    #  Predictions:
    vhat = predicted
    scores = np.dot(outputVectors, vhat)
    preds = softmax(scores)

    #  Calculate the cost:
    cost = -np.log(preds[target])

    #  Gradients: work on a copy so `preds` keeps the probabilities.
    delta = preds.copy()
    delta[target] -= 1.0  # p - y, with y the one-hot label for `target`

    grad = np.outer(delta, vhat)
    gradPred = np.dot(outputVectors.T, delta)

    return cost, gradPred, grad
Example #5
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Compute the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted (Vc) -- numpy ndarray, predicted word vector (v-hat in
                 the written component); expected to be 1-D
    target (Uo) -- integer, the index of the target word
    outputVectors (Uw) -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
                vector
    grad -- the gradient with respect to all the output word
            vectors (same shape as outputVectors)

    Relies on a `softmax` helper being in scope; it is assumed to
    return normalized probabilities for a 1-D vector of scores.
    """
    w = np.dot(outputVectors, predicted)  # one score Ux . Vc per token
    # prob[i] = p(o_i | c) = exp(Ui . Vc) / sum_x exp(Ux . Vc)
    prob = softmax(w)

    #  Cost: CE(y, y_pred) = -sum_i y_i * log(y_pred_i); y is one-hot, so
    #  only the target term survives.
    cost = -np.log(prob[target])

    prob[target] -= 1.0  # p(o | c) - 1 at the target; p(x | c) elsewhere

    # Row x of grad is Vc scaled by that token's coefficient.
    grad = prob[:, np.newaxis] * predicted
    # -Uo + sum_x Ux * y_pred[x]
    gradPred = np.dot(outputVectors.T, prob)

    return cost, gradPred, grad
Example #6
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Compute the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (v-hat in
                 the written component); expected to be 1-D
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
                vector
    grad -- the gradient with respect to all the output word
            vectors (same shape as outputVectors)

    Relies on a `softmax` helper being in scope; it is assumed to
    return normalized probabilities for a 1-D vector of scores.
    """
    # predicted = V_c; one score Ux . Vc per token.
    w = np.dot(outputVectors, predicted)
    # prob = y_pred = exp(Ux . Vc) / sum_x exp(Ux . Vc) for each token x
    prob = softmax(w)

    #  Cost: CE(y, y_pred) = -sum_i y_i * log(y_pred_i)
    cost = -np.log(prob[target])

    prob[target] -= 1.0  # y_pred - y (y is one-hot at `target`)

    # Outer product (V,) x (d,) -> (V, d). A plain np.dot of two 1-D
    # arrays is an inner product: it raises when V != d and collapses to
    # a scalar when V == d, so it cannot produce the per-row gradient.
    grad = np.outer(prob, predicted)
    # -U_targ + sum_x Ux * y_pred[x]
    gradPred = np.dot(outputVectors.T, prob)

    return cost, gradPred, grad
Example #7
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Compute the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (v-hat in
                 the written component); expected to be 1-D
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
                vector
    grad -- the gradient with respect to all the output word
            vectors (same shape as outputVectors)

    Relies on a `softmax` helper being in scope; it is assumed to
    return normalized probabilities for a 1-D vector of scores.
    """
    U = outputVectors
    uo = U[target]
    vc = predicted
    softmax_U_dot_vc = softmax(U.dot(vc))
    # Column view of the probabilities so they broadcast across rows.
    weights = softmax_U_dot_vc[:, np.newaxis]

    cost = -np.log(softmax_U_dot_vc[target])
    # -u_o + sum_w p_w * u_w
    gradPred = -uo + np.sum(weights * U, axis=0)
    # Broadcasting scales vc by each token's probability, giving one row
    # per token without materialising a tiled copy of vc first.
    grad = weights * vc
    # The target row additionally carries the -vc term from the label.
    grad[target] -= vc

    return cost, gradPred, grad
Example #8
def softmaxCostAndGradient(predicted, target, output_vectors, dataset):
    """ Softmax cost function for word2vec models

    Compute the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (v-hat in
                 the written component); expected to be 1-D
    target -- integer, the index of the target word
    output_vectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    grad_pred -- the gradient with respect to the predicted word
                 vector
    grad -- the gradient with respect to all the output word
            vectors (same shape as output_vectors)

    Relies on a `softmax` helper being in scope; it is assumed to
    return normalized probabilities for a 1-D vector of scores.
    """
    # The score fed to the softmax for token w is u_w^T v, so the whole
    # score vector is v U^T.
    x_hat = np.matmul(predicted, output_vectors.T)
    y_hat = softmax(x_hat)
    # The label is one-hot at `target`, so only that entry contributes.
    cost = -np.log(y_hat[target])

    # Using the derivative of the cost from 2a: U^T y_hat - u_o
    grad_pred = np.matmul(y_hat, output_vectors) - output_vectors[target]
    # Using the derivative of the cost from 2b: row w is y_hat_w * v ...
    grad = np.outer(y_hat, predicted)
    # ... minus v on the target row.
    grad[target] -= predicted

    return cost, grad_pred, grad
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Compute the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (v-hat in
                 the written component); expected to be 1-D
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
            (non-negative: it is -log of a probability)
    gradPred -- the gradient with respect to the predicted word
                vector
    grad -- the gradient with respect to all the output word
            vectors (same shape as outputVectors)

    Relies on a `softmax` helper being in scope; it is assumed to
    return normalized probabilities for a 1-D vector of scores.
    """
    y_hat = softmax(np.dot(outputVectors, predicted))
    # One-hot label for the target word.
    y = np.zeros(outputVectors.shape[0])
    y[target] = 1
    error = y_hat - y

    # Cross entropy is the NEGATIVE log of the probability assigned to the
    # target; returning +log(p) would make a minimiser drive p towards 0.
    cost = -np.log(y_hat[target])
    # U^T (y_hat - y); the sign follows from differentiating -log(p_o).
    gradPred = np.dot(outputVectors.transpose(), error)

    # Row w of grad is (y_hat_w - y_w) * v_c.
    grad = np.multiply(np.expand_dims(error, 1), predicted)

    return cost, gradPred, grad
Example #10
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Compute the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (v-hat in
                 the written component); may be 1-D of length D or a
                 single row of shape (1, D)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens,
                     shape (W, D)
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
                vector, 1-D of length D
    grad -- the gradient with respect to all the output word
            vectors, shape (W, D)

    Relies on a `softmax` helper being in scope; it is assumed to
    return row-wise normalized probabilities for a (1, W) input.
    """
    # Turn predicted into a (1, D) row vector whatever layout it came in;
    # reshape also covers the D == 1 case that a shape[0] check misreads.
    predicted_row = np.reshape(predicted, (1, -1))

    # Inner product of predicted with every output vector.
    inner_prod = np.matmul(predicted_row,
                           np.transpose(outputVectors))  # dim [1 x W]

    # Softmax probability for each word in the dictionary.
    s = softmax(inner_prod)  # dim [1 x W]

    # y is a one-hot vector, so the only surviving term is y[target].
    cost = -np.log(s[0, target])

    # Reuse the softmax output for both gradients instead of exponentiating
    # the raw scores again: a second, unshifted exp can overflow to inf and
    # turn the numerator/denominator ratio into NaN.
    probs = s[0]  # dim [W]

    # gradPred (from 2a): -u_o + sum_w p_w * u_w
    gradPred = -outputVectors[target, :] + np.dot(probs, outputVectors)

    # grad (from 2b): row w is p_w * v_c, with -v_c added on the target row.
    # Kept in the default float64 so no precision is lost on accumulation.
    grad = np.outer(probs, predicted_row[0])
    grad[target, :] -= predicted_row[0]

    return cost, gradPred, grad