def test_fitting():
    """
    Check that training a Glove model reduces the squared reconstruction
    error against the log co-occurrence matrix.

    An untrained model (``epochs=0``) must exceed a high error threshold,
    while a model trained for 500 epochs must fall below a low one.
    """

    def squared_error(model, target):
        # Sum of squared differences between the model's reproduction of
        # the input and the target matrix.
        return ((_reproduce_input_matrix(model) - target) ** 2).sum()

    sentence_count = 5000
    rng_seed = 10
    untrained_min_error = 30000.0
    trained_max_error = 1500.0

    corpus = Corpus()
    corpus.fit(generate_training_corpus(sentence_count,
                                        vocabulary_size=50,
                                        seed=rng_seed))

    # Baseline: a model that has been through zero epochs should
    # reconstruct the input poorly.
    untrained = Glove(no_components=100, learning_rate=0.05)
    untrained.fit(corpus.matrix, epochs=0, no_threads=2)

    # Dense target: natural log applied to the stored co-occurrence values.
    log_counts = corpus.matrix.copy()
    log_counts.data = np.log(log_counts.data)
    log_counts = np.asarray(log_counts.todense())

    assert squared_error(untrained, log_counts) > untrained_min_error

    # After real training the reconstruction error should be small.
    trained = Glove(no_components=100, learning_rate=0.05)
    trained.fit(corpus.matrix, epochs=500, no_threads=2)

    assert squared_error(trained, log_counts) < trained_max_error
def test_large_corpus_construction():
    """
    Check that ``Corpus.fit`` with default options produces the same
    co-occurrence matrix as the reference ``build_coocurrence_matrix``
    on an identically seeded generated corpus.
    """
    num_sentences = 5000
    seed = 10

    corpus = Corpus()
    corpus.fit(generate_training_corpus(num_sentences, seed=seed))

    # Round-trip through CSR before comparing coordinates -- presumably to
    # put the entries in the same order as the reference builder's output.
    matrix = corpus.matrix.tocsr().tocoo()
    check_matrix = build_coocurrence_matrix(
        generate_training_corpus(num_sentences, seed=seed))

    # np.array_equal requires identical shapes as well as identical
    # elements. A plain ``(a == b).all()`` would broadcast, so a reference
    # array of a different (broadcast-compatible) length could pass.
    assert np.array_equal(matrix.row, check_matrix.row)
    assert np.array_equal(matrix.col, check_matrix.col)

    # With row/col shapes verified equal, ``data`` has matching length too,
    # so allclose cannot silently broadcast here.
    assert np.allclose(matrix.data, check_matrix.data)
    assert (matrix.data > 0).all()
def test_large_corpus_construction_wo_weighting_symmetric():
    """
    Check that ``Corpus.fit`` with ``distance_weighting=False`` and
    ``symmetric=True`` produces the same co-occurrence matrix as the
    reference ``build_coocurrence_matrix_wo_weighting_symmetric`` on an
    identically seeded generated corpus.
    """
    num_sentences = 5000
    seed = 10

    corpus = Corpus()
    corpus.fit(generate_training_corpus(num_sentences, seed=seed),
               distance_weighting=False,
               symmetric=True)

    # Round-trip through CSR before comparing coordinates -- presumably to
    # put the entries in the same order as the reference builder's output.
    matrix = corpus.matrix.tocsr().tocoo()
    check_matrix = build_coocurrence_matrix_wo_weighting_symmetric(
        generate_training_corpus(num_sentences, seed=seed))

    # np.array_equal requires identical shapes as well as identical
    # elements. A plain ``(a == b).all()`` would broadcast, so a reference
    # array of a different (broadcast-compatible) length could pass.
    assert np.array_equal(matrix.row, check_matrix.row)
    assert np.array_equal(matrix.col, check_matrix.col)

    # With row/col shapes verified equal, ``data`` has matching length too,
    # so allclose cannot silently broadcast here.
    assert np.allclose(matrix.data, check_matrix.data)
    assert (matrix.data > 0).all()