def test_word2vec():
    """ Test the two word2vec implementations, before running on Stanford Sentiment Treebank """
    dataset = type('dummy', (), {})()

    def dummySampleTokenIdx():
        return random.randint(0, 4)

    def getRandomContext(C):
        tokens = ["a", "b", "c", "d", "e"]
        return tokens[random.randint(0, 4)], \
            [tokens[random.randint(0, 4)] for i in range(2 * C)]

    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

    random.seed(31415)
    np.random.seed(9265)
    dummy_vectors = normalizeRows(np.random.randn(10, 3))
    dummy_tokens = dict([("a", 0), ("b", 1), ("c", 2), ("d", 3), ("e", 4)])

    print("==== Gradient check for skip-gram with naiveSoftmaxLossAndGradient ====")
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
        skipgram, dummy_tokens, vec, dataset, 5, naiveSoftmaxLossAndGradient),
        dummy_vectors, "naiveSoftmaxLossAndGradient Gradient")
    grad_tests_softmax(skipgram, dummy_tokens, dummy_vectors, dataset)

    print("==== Gradient check for skip-gram with negSamplingLossAndGradient ====")
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
        skipgram, dummy_tokens, vec, dataset, 5, negSamplingLossAndGradient),
        dummy_vectors, "negSamplingLossAndGradient Gradient")
    grad_tests_negsamp(skipgram, dummy_tokens, dummy_vectors, dataset, negSamplingLossAndGradient)
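# The skipgram and word2vec_sgd_wrapper functions exercised above are defined
# elsewhere in the assignment. For context, a minimal sketch of skipgram under
# the calling convention used by these tests (one loss-and-gradient call per
# outside word, with gradients accumulated into full-vocabulary matrices).
# This is an assumption about the intended structure, not the reference solution.
def skipgram(currentCenterWord, windowSize, outsideWords, word2Ind,
             centerWordVectors, outsideVectors, dataset,
             word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
    """ Skip-gram sketch: sum the loss over all outside words of one window. """
    loss = 0.0
    gradCenterVecs = np.zeros(centerWordVectors.shape)
    gradOutsideVectors = np.zeros(outsideVectors.shape)

    centerIdx = word2Ind[currentCenterWord]
    centerWordVec = centerWordVectors[centerIdx]

    for outsideWord in outsideWords:
        outsideIdx = word2Ind[outsideWord]
        l, gradCenter, gradOutside = word2vecLossAndGradient(
            centerWordVec, outsideIdx, outsideVectors, dataset)
        loss += l
        # Only the center word's row of dJ/dV is non-zero, matching the
        # expected results printed by the test variants below.
        gradCenterVecs[centerIdx] += gradCenter
        gradOutsideVectors += gradOutside

    return loss, gradCenterVecs, gradOutsideVectors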
def test_word2vec():
    """ Test the two word2vec implementations, before running on Stanford Sentiment Treebank """
    dataset, dummy_vectors, dummy_tokens = dummy()

    print("==== Gradient check for skip-gram with naiveSoftmaxLossAndGradient ====")
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
        skipgram, dummy_tokens, vec, dataset, 5, naiveSoftmaxLossAndGradient),
        dummy_vectors, "naiveSoftmaxLossAndGradient Gradient")

    print("\n\n\t\t\tSkip-Gram with naiveSoftmaxLossAndGradient\t\t\t")
    print("\nYour Result:")
    loss, dj_dv, dj_du = skipgram(inputs['test_word2vec']['currentCenterWord'],
                                  inputs['test_word2vec']['windowSize'],
                                  inputs['test_word2vec']['outsideWords'],
                                  dummy_tokens, dummy_vectors[:5, :],
                                  dummy_vectors[5:, :], dataset,
                                  naiveSoftmaxLossAndGradient)
    print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\nGradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
        loss, dj_dv, dj_du))

    print("Expected Result: Value should approximate these:")
    print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\nGradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
        outputs['test_word2vec']['loss'],
        outputs['test_word2vec']['dj_dv'],
        outputs['test_word2vec']['dj_du']))
def test_skipgram():
    """ Test skip-gram with naiveSoftmaxLossAndGradient and negSamplingLossAndGradient """
    dataset, dummy_vectors, dummy_tokens = getDummyObjects()

    print("==== Gradient check for skip-gram with naiveSoftmaxLossAndGradient ====")
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
        skipgram, dummy_tokens, vec, dataset, 5, naiveSoftmaxLossAndGradient),
        dummy_vectors, "naiveSoftmaxLossAndGradient Gradient")
    grad_tests_softmax(skipgram, dummy_tokens, dummy_vectors, dataset)

    print("==== Gradient check for skip-gram with negSamplingLossAndGradient ====")
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
        skipgram, dummy_tokens, vec, dataset, 5, negSamplingLossAndGradient),
        dummy_vectors, "negSamplingLossAndGradient Gradient")
    grad_tests_negsamp(skipgram, dummy_tokens, dummy_vectors, dataset, negSamplingLossAndGradient)
def test_negSamplingLossAndGradient():
    """ Test negSamplingLossAndGradient """
    dataset, dummy_vectors, dummy_tokens = getDummyObjects()

    print("==== Gradient check for negSamplingLossAndGradient ====")

    def temp(vec):
        loss, gradCenterVec, gradOutsideVecs = negSamplingLossAndGradient(vec, 1, dummy_vectors, dataset)
        return loss, gradCenterVec
    gradcheck_naive(temp, np.random.randn(3), "negSamplingLossAndGradient gradCenterVec")

    centerVec = np.random.randn(3)

    def temp(vec):
        loss, gradCenterVec, gradOutsideVecs = negSamplingLossAndGradient(centerVec, 1, vec, dataset)
        return loss, gradOutsideVecs
    gradcheck_naive(temp, dummy_vectors, "negSamplingLossAndGradient gradOutsideVecs")
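# The dummy()/getDummyObjects() helper used by the two tests above is not shown
# in these snippets. A minimal sketch of what it presumably returns, assuming it
# mirrors the inline setup of the other test variants (a fake dataset object,
# row-normalized random vectors, and a five-word vocabulary):
def getDummyObjects():
    """ Construct the dummy dataset, vectors, and tokens used by the tests. """
    def dummySampleTokenIdx():
        return random.randint(0, 4)

    def getRandomContext(C):
        tokens = ["a", "b", "c", "d", "e"]
        return tokens[random.randint(0, 4)], \
            [tokens[random.randint(0, 4)] for i in range(2 * C)]

    dataset = type('dummy', (), {})()
    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

    # Fixed seeds keep the gradient checks reproducible (placement of the
    # seeding inside this helper is an assumption).
    random.seed(31415)
    np.random.seed(9265)
    dummy_vectors = normalizeRows(np.random.randn(10, 3))
    dummy_tokens = dict([("a", 0), ("b", 1), ("c", 2), ("d", 3), ("e", 4)])
    return dataset, dummy_vectors, dummy_tokens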
def test_word2vec():
    """ Test the two word2vec implementations, before running on Stanford Sentiment Treebank """
    dataset = type('dummy', (), {})()

    def dummySampleTokenIdx():
        return random.randint(0, 4)

    def getRandomContext(C):
        tokens = ["a", "b", "c", "d", "e"]
        return tokens[random.randint(0, 4)], \
            [tokens[random.randint(0, 4)] for i in range(2 * C)]

    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

    random.seed(31415)
    np.random.seed(9265)
    dummy_vectors = normalizeRows(np.random.randn(10, 3))
    dummy_tokens = dict([("a", 0), ("b", 1), ("c", 2), ("d", 3), ("e", 4)])

    print("==== Gradient check for skip-gram with naiveSoftmaxLossAndGradient ====")
    gradcheck_naive(
        lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset,
                                         5, naiveSoftmaxLossAndGradient),
        dummy_vectors, "naiveSoftmaxLossAndGradient Gradient")

    print("==== Gradient check for skip-gram with negSamplingLossAndGradient ====")
    gradcheck_naive(
        lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset,
                                         5, negSamplingLossAndGradient),
        dummy_vectors, "negSamplingLossAndGradient Gradient")

    print("\n=== Results ===")
    print("Skip-Gram with naiveSoftmaxLossAndGradient")
    print("Your Result:")
    print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\nGradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
        *skipgram("c", 3, ["a", "b", "e", "d", "b", "c"], dummy_tokens,
                  dummy_vectors[:5, :], dummy_vectors[5:, :], dataset)))

    print("Expected Result: Value should approximate these:")
    print("""Loss: 11.16610900153398
Gradient wrt Center Vectors (dJ/dV):
 [[ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [-1.26947339 -1.36873189  2.45158957]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]]
Gradient wrt Outside Vectors (dJ/dU):
 [[-0.41045956  0.18834851  1.43272264]
 [ 0.38202831 -0.17530219 -1.33348241]
 [ 0.07009355 -0.03216399 -0.24466386]
 [ 0.09472154 -0.04346509 -0.33062865]
 [-0.13638384  0.06258276  0.47605228]]
""")

    print("Skip-Gram with negSamplingLossAndGradient")
    print("Your Result:")
    print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\nGradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
        *skipgram("c", 1, ["a", "b"], dummy_tokens, dummy_vectors[:5, :],
                  dummy_vectors[5:, :], dataset, negSamplingLossAndGradient)))

    print("Expected Result: Value should approximate these:")
    print("""Loss: 16.15119285363322
Gradient wrt Center Vectors (dJ/dV):
 [[ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [-4.54650789 -1.85942252  0.76397441]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]]
Gradient wrt Outside Vectors (dJ/dU):
 [[-0.69148188  0.31730185  2.41364029]
 [-0.22716495  0.10423969  0.79292674]
 [-0.45528438  0.20891737  1.58918512]
 [-0.31602611  0.14501561  1.10309954]
 [-0.80620296  0.36994417  2.81407799]]
""")
def test_word2vec():
    """ Test the two word2vec implementations, before running on Stanford Sentiment Treebank """
    dataset = type('dummy', (), {})()

    def dummySampleTokenIdx():
        return random.randint(0, 4)

    def getRandomContext(C):
        tokens = ["a", "b", "c", "d", "e"]
        return tokens[random.randint(0, 4)], \
            [tokens[random.randint(0, 4)] for i in range(2 * C)]

    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

    random.seed(31415)
    np.random.seed(9265)
    dummy_vectors = normalizeRows(np.random.randn(10, 3))
    dummy_tokens = dict([("a", 0), ("b", 1), ("c", 2), ("d", 3), ("e", 4)])

    print("==== Gradient check for skip-gram with naiveSoftmaxLossAndGradient ====")
    gradcheck_naive(
        lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset,
                                         5, naiveSoftmaxLossAndGradient),
        dummy_vectors, "naiveSoftmaxLossAndGradient Gradient")

    print("==== Gradient check for skip-gram with negSamplingLossAndGradient ====")
    gradcheck_naive(
        lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset,
                                         5, negSamplingLossAndGradient),
        dummy_vectors, "negSamplingLossAndGradient Gradient")

    print("\n=== Results ===")
    print("Skip-Gram with naiveSoftmaxLossAndGradient")
    print("Your Result:")
    print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\nGradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
        *skipgram("c", 3, ["a", "b", "e", "d", "b", "c"], dummy_tokens,
                  dummy_vectors[:5, :], dummy_vectors[5:, :], dataset)))

    print("Expected Result: Value should approximate these:")
    print("""Loss: 11.16610900153398
Gradient wrt Center Vectors (dJ/dV):
 [[ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [-1.26947339 -1.36873189  2.45158957]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]]
Gradient wrt Outside Vectors (dJ/dU):
 [[-0.41045956  0.18834851  1.43272264]
 [ 0.38202831 -0.17530219 -1.33348241]
 [ 0.07009355 -0.03216399 -0.24466386]
 [ 0.09472154 -0.04346509 -0.33062865]
 [-0.13638384  0.06258276  0.47605228]]
""")

    print("Skip-Gram with negSamplingLossAndGradient")
    print("Your Result:")
    print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\nGradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
        *skipgram("c", 1, ["a", "b"], dummy_tokens, dummy_vectors[:5, :],
                  dummy_vectors[5:, :], dataset, negSamplingLossAndGradient)))

    print("Expected Result: Value should approximate these:")
    print("""Loss: 14.3018669327
Gradient wrt Center Vectors (dJ/dV):
 [[ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [-3.86035429 -2.8660339  -0.9739887 ]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]]
Gradient wrt Outside Vectors (dJ/dU):
 [[-0.30559455  0.14022886  1.06668785]
 [-0.12708467  0.05831563  0.44359323]
 [-0.45528438  0.20891737  1.58918512]
 [-0.73739425  0.33836976  2.57389893]
 [-0.64496237  0.29595533  2.25126239]]
""")
def test_word2vec():
    """ Test the two word2vec implementations, before running on Stanford Sentiment Treebank """
    # Dummy dataset class declaration/initialization.
    dataset = type('dummy', (), {})()

    def dummySampleTokenIdx():
        return random.randint(0, 4)

    def getRandomContext(C):
        tokens = ["a", "b", "c", "d", "e"]
        return tokens[random.randint(0, 4)], \
            [tokens[random.randint(0, 4)] for i in range(2 * C)]

    # Attaching attributes and methods to the dummy dataset instance is allowed.
    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

    # Random number generation is not truly "random": it is deterministic, and the
    # sequence is dictated by the seed passed to random.seed. Calling random.seed()
    # with no argument seeds from the current time, so each run would produce a
    # different sequence; fixed seeds keep this test reproducible.
    random.seed(31415)
    np.random.seed(9265)

    # Normalize each word vector to unit length (each row divided by its norm).
    dummy_vectors = normalizeRows(np.random.randn(10, 3))
    # A dummy vocabulary; the actual token strings do not matter.
    dummy_tokens = dict([("a", 0), ("b", 1), ("c", 2), ("d", 3), ("e", 4)])

    print("==== Gradient check for skip-gram with naiveSoftmaxLossAndGradient ====")
    gradcheck_naive(
        lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset,
                                         5, naiveSoftmaxLossAndGradient),
        dummy_vectors, "naiveSoftmaxLossAndGradient Gradient")

    print("==== Gradient check for skip-gram with negSamplingLossAndGradient ====")
    gradcheck_naive(
        lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset,
                                         5, negSamplingLossAndGradient),
        dummy_vectors, "negSamplingLossAndGradient Gradient")

    print("\n=== Results ===")
    print("Skip-Gram with naiveSoftmaxLossAndGradient")
    print("Your Result:")
    print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\nGradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
        *skipgram("c", 3, ["a", "b", "e", "d", "b", "c"], dummy_tokens,
                  dummy_vectors[:5, :], dummy_vectors[5:, :], dataset)))

    print("Expected Result: Value should approximate these:")
    print("""Loss: 11.16610900153398
Gradient wrt Center Vectors (dJ/dV):
 [[ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [-1.26947339 -1.36873189  2.45158957]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]]
Gradient wrt Outside Vectors (dJ/dU):
 [[-0.41045956  0.18834851  1.43272264]
 [ 0.38202831 -0.17530219 -1.33348241]
 [ 0.07009355 -0.03216399 -0.24466386]
 [ 0.09472154 -0.04346509 -0.33062865]
 [-0.13638384  0.06258276  0.47605228]]
""")

    print("Skip-Gram with negSamplingLossAndGradient")
    print("Your Result:")
    print("Loss: {}\nGradient wrt Center Vectors (dJ/dV):\n {}\nGradient wrt Outside Vectors (dJ/dU):\n {}\n".format(
        *skipgram("c", 1, ["a", "b"], dummy_tokens, dummy_vectors[:5, :],
                  dummy_vectors[5:, :], dataset, negSamplingLossAndGradient)))

    print("Expected Result: Value should approximate these:")
    print("""Loss: 16.15119285363322
Gradient wrt Center Vectors (dJ/dV):
 [[ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [-4.54650789 -1.85942252  0.76397441]
 [ 0.          0.          0.        ]
 [ 0.          0.          0.        ]]
Gradient wrt Outside Vectors (dJ/dU):
 [[-0.69148188  0.31730185  2.41364029]
 [-0.22716495  0.10423969  0.79292674]
 [-0.45528438  0.20891737  1.58918512]
 [-0.31602611  0.14501561  1.10309954]
 [-0.80620296  0.36994417  2.81407799]]
""")
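# normalizeRows is called throughout but not defined in these snippets. A minimal
# sketch, assuming plain L2 row normalization as its help text (quoted in the
# module docstring below) describes:
def normalizeRows(x):
    """ Normalize each row of matrix x to have unit (L2) length. """
    return x / np.linalg.norm(x, axis=1, keepdims=True)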
#!/usr/bin/env python
"""
Finished Oct 17, 2019 by @ivan1rufus

Implementation of part of the word2vec model: the skip-gram model.

Key points:
1. The gradient with respect to the center vector is a single row vector.
2. In the negative-sampling loss, the same negative sample may be drawn more than
   once, so its gradient contributions must be accumulated.
3. Getting the matrix multiplications (shapes and transposes) right is the crux.

Bonus:
1. help(gradcheck_naive)
    gradcheck_naive(f, x, gradientText)
    Gradient check for a function f.
    Arguments:
        f -- a function that takes a single argument and outputs the loss and its gradients
        x -- the point (numpy array) to check the gradient at
        gradientText -- a string detailing some context about the gradient computation
2. help(normalizeRows)
    normalizeRows(x)
    Row normalization function.
    Implement a function that normalizes each row of a matrix to have unit length.
"""

import numpy as np
import random
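# A minimal, illustrative sketch of negSamplingLossAndGradient, written to show
# key points 1 and 2 above: the center-vector gradient is a single row vector,
# and gradients are accumulated with += because the same negative index can be
# drawn more than once. This is an assumption about the intended implementation,
# not the reference solution; the K parameter, the sigmoid helper, and the
# resampling loop that skips the true outside word are additions made here.
def sigmoid(x):
    """ Elementwise logistic sigmoid. """
    return 1.0 / (1.0 + np.exp(-x))


def negSamplingLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors,
                               dataset, K=10):
    """ Negative-sampling loss and gradients for one (center, outside) pair. """
    # Draw K negative sample indices, skipping the true outside word.
    negSampleWordIndices = []
    while len(negSampleWordIndices) < K:
        idx = dataset.sampleTokenIdx()
        if idx != outsideWordIdx:
            negSampleWordIndices.append(idx)

    gradCenterVec = np.zeros(centerWordVec.shape)     # a single row vector (point 1)
    gradOutsideVecs = np.zeros(outsideVectors.shape)

    # Positive term: -log sigmoid(u_o . v_c)
    u_o = outsideVectors[outsideWordIdx]
    z = sigmoid(np.dot(u_o, centerWordVec))
    loss = -np.log(z)
    gradCenterVec += (z - 1.0) * u_o
    gradOutsideVecs[outsideWordIdx] += (z - 1.0) * centerWordVec

    # Negative terms: -log sigmoid(-u_k . v_c); accumulate with += so repeated
    # negative samples contribute multiple times (point 2).
    for k in negSampleWordIndices:
        u_k = outsideVectors[k]
        z_k = sigmoid(-np.dot(u_k, centerWordVec))
        loss -= np.log(z_k)
        gradCenterVec += (1.0 - z_k) * u_k
        gradOutsideVecs[k] += (1.0 - z_k) * centerWordVec

    return loss, gradCenterVec, gradOutsideVecs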