    dummy_vectors[0])
gradcheck_naive(
    lambda vec: g_func_wrapper2(softmax_cost_and_gradient, dummy_vectors[0],
                                0, vec, dataset, parameters=parameters),
    dummy_vectors[1:])

print "==== Gradient check for neg_sampling_cost_and_gradient ===="
print "test 1"
gradcheck_naive(
    lambda vec: g_func_wrapper1(neg_sampling_cost_and_gradient, vec, 0,
                                dummy_vectors[1:], dataset,
                                parameters=parameters),
    dummy_vectors[0], verbose=False)
print "test 2"
gradcheck_naive(
    lambda vec: g_func_wrapper2(neg_sampling_cost_and_gradient,
                                dummy_vectors[0], 0, vec, dataset,
                                parameters=parameters),
    dummy_vectors[1:])

print "==== Gradient check for skip-gram ===="
print "test 1"
gradcheck_naive(
    lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset,
                                     parameters=parameters, verbose=False),
    dummy_vectors, verbose=False)
print "test 2"
gradcheck_naive(
    lambda vec: word2vec_sgd_wrapper(skipgram, dummy_tokens, vec, dataset,
                                     parameters=parameters,
                                     cost_grad_func=neg_sampling_cost_and_gradient),
    dummy_vectors)

print "\n==== Gradient check for CBOW ===="
print "test 1"
gradcheck_naive(
    lambda vec: word2vec_sgd_wrapper(cbow, dummy_tokens, vec, dataset,
                                     parameters=parameters),
    dummy_vectors)
print "test 2"
gradcheck_naive(
    lambda vec: word2vec_sgd_wrapper(cbow, dummy_tokens, vec, dataset,
                                     parameters=parameters,
                                     cost_grad_func=neg_sampling_cost_and_gradient),
    dummy_vectors)

print "\n=== For autograder ==="
print skipgram("c", 3, ["a", "b", "e", "d", "b", "c"], dummy_tokens,
               dummy_vectors[:5,:], dummy_vectors[5:,:],
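# For context, gradcheck_naive compares the analytic gradient returned by the
# cost function against a central-difference numerical estimate. The sketch
# below only illustrates that idea and is not the assignment's actual
# implementation: the name numerical_gradcheck and its arguments are
# placeholders, and it assumes f may draw random samples (hence the
# random-state resets around every evaluation).
import random
import numpy as np

def numerical_gradcheck(f, x, h=1e-4, tol=1e-5):
    """Central-difference gradient check (illustrative sketch).

    f -- function mapping the array x to (cost, analytic_gradient)
    x -- numpy array; entries are perturbed in place and restored
    """
    rndstate = random.getstate()          # replay the same random draws on every call
    random.setstate(rndstate)
    _, grad = f(x)                        # analytic gradient at x

    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        old = x[ix]

        x[ix] = old + h
        random.setstate(rndstate)
        fxph, _ = f(x)                    # cost at x + h

        x[ix] = old - h
        random.setstate(rndstate)
        fxmh, _ = f(x)                    # cost at x - h

        x[ix] = old                       # restore the perturbed entry
        numgrad = (fxph - fxmh) / (2.0 * h)

        reldiff = abs(numgrad - grad[ix]) / max(1.0, abs(numgrad), abs(grad[ix]))
        if reldiff > tol:
            print("Gradient check failed at %s: analytic %f vs numerical %f"
                  % (str(ix), grad[ix], numgrad))
            return
        it.iternext()

    print("Gradient check passed!")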
random.seed(31415)
np.random.seed(9265)

word_vectors = np.concatenate(
    ((np.random.rand(num_words, dim_vectors) - .5) / dim_vectors,
     np.zeros((num_words, dim_vectors))),
    axis=0)

params['sgd']['step'] = 0.2
params['sgd']['iterations'] = 40000
params['sgd']['tolerance'] = 1e-48
params['sgd']['anneal_every'] = 20000
params['sgd']['anneal_factor'] = 0.5

word_vectors0 = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, params,
                                     neg_sampling_cost_and_gradient),
    word_vectors, params, postprocessing=normalize_rows, use_saved=True,
    print_every=100, save_params_every=5000)
# sanity check: cost at convergence should be around or below 10

# sum the input and output word vectors
word_vectors = (word_vectors0[:num_words, :] + word_vectors0[num_words:, :])

print "\n=== For autograder ==="
check_words = ["the", "a", "an", "movie", "ordinary", "but", "and"]
check_idx = [tokens[word] for word in check_words]
check_vecs = word_vectors[check_idx, :]
print check_vecs

# Visualize the word vectors you trained
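# For reference, the step / anneal_every / anneal_factor settings above drive a
# plain SGD loop with scheduled learning-rate decay. The sketch below is an
# assumption about what such a driver might look like, not the actual sgd()
# used here: sgd_sketch and its signature are hypothetical, and it omits the
# use_saved / save_params_every checkpointing and the tolerance-based stop.
def sgd_sketch(cost_and_grad, x0, step=0.2, iterations=40000,
               anneal_every=20000, anneal_factor=0.5,
               postprocessing=None, print_every=100):
    """Bare-bones SGD loop with learning-rate annealing (illustrative sketch)."""
    x = x0.copy()
    for it in range(1, iterations + 1):
        cost, grad = cost_and_grad(x)   # stochastic cost/gradient on sampled contexts
        x -= step * grad                # plain gradient descent step
        if postprocessing is not None:
            x = postprocessing(x)       # e.g. re-normalize rows after each update
        if it % print_every == 0:
            print("iter %d: cost %f" % (it, cost))
        if it % anneal_every == 0:
            step *= anneal_factor       # scale the learning rate on schedule
    return x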
# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
     np.zeros((nWords, dimVectors))),
    axis=0)
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
                                     negSamplingLossAndGradient),
    wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)
# Note that normalization is not called here. This is not a bug;
# normalizing during training loses the notion of length.

print("sanity check: cost at convergence should be around or below 10")
print("training took %d seconds" % (time.time() - startTime))

# concatenate the input and output word vectors
wordVectors = np.concatenate(
    (wordVectors[:nWords, :], wordVectors[nWords:, :]),
    axis=0)

visualizeWords = [
    "great", "cool", "brilliant", "wonderful", "well", "amazing",
    "worth", "sweet", "enjoyable", "boring", "bad", "dumb",