Example #1
 def test_analogy(self):
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.6, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.7, 0.1, 1.1, 1.1, 1.1, 0.1]])
     word_embedding = WordVector(embed_matrix, dictionary)
     d = word_embedding.analogy('the',
                                'fox',
                                'quick',
                                num=2,
                                metric='euclidean')
     self.assertEqual(2, len(d), 'wrong number of analogies returned')
     self.assertEqual('jumped', d[0], 'wrong most likely analogy returned')
     self.assertEqual('over', d[1],
                      'wrong 2nd most likely analogy returned')
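The expected result follows the usual word-analogy arithmetic: "the is to fox as quick is to ?", solved as vec('fox') - vec('the') + vec('quick') and ranked by distance to the other rows. Below is a minimal sketch of that computation; the helper name analogy_sketch and the use of scipy's cdist are assumptions for illustration, not the actual WordVector.analogy implementation.

import numpy as np
from scipy.spatial.distance import cdist

def analogy_sketch(embed_matrix, dictionary, word1, word2, word3, num=2,
                   metric='euclidean'):
    # target vector for "word1 : word2 :: word3 : ?"
    target = (embed_matrix[dictionary[word2]]
              - embed_matrix[dictionary[word1]]
              + embed_matrix[dictionary[word3]])
    dists = cdist(target[np.newaxis, :], embed_matrix, metric=metric)[0]
    reverse = {v: k for k, v in dictionary.items()}
    exclude = {dictionary[w] for w in (word1, word2, word3)}
    ranked = [reverse[i] for i in np.argsort(dists) if i not in exclude]
    return ranked[:num]   # e.g. ['jumped', 'over'] for the matrix above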
Example #2
 def test_gets(self):
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 1.01], [2.0, 2.0], [2.0, 2.1],
                              [1.0, 0.0], [0, 1.01], [-1.0, 0.0]])
     word_embedding = WordVector(embed_matrix, dictionary)
     d = word_embedding.get_dict()
     dr = word_embedding.get_reverse_dict()
     em = word_embedding.get_embed()
     d.pop('the')  # mutate, check that copies were returned
     dr.pop(1)
     em[0, 0] = 10
     d = word_embedding.get_dict()
     dr = word_embedding.get_reverse_dict()
     em = word_embedding.get_embed()
     self.assertEqual(6, len(d), 'wrong dictionary length')
     self.assertEqual(6, len(dr), 'wrong dictionary length')
     self.assertEqual(1.0, em[0, 0], 'wrong value in embed matrix')
     self.assertEqual(3, d['fox'], 'wrong value from dictionary')
     self.assertEqual('jumped', dr[4],
                      'wrong value from reverse dictionary')
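The mutation checks above imply that get_dict(), get_reverse_dict() and get_embed() hand back defensive copies rather than references to internal state. A minimal sketch of that behaviour follows; the class name WordVectorSketch and the attribute names _embed, _dictionary and _reverse are hypothetical, not taken from wordvector.py.

import numpy as np

class WordVectorSketch:
    """Hypothetical stand-in illustrating copy-on-get semantics."""

    def __init__(self, embed_matrix, dictionary):
        self._embed = np.array(embed_matrix)                     # private copy of the matrix
        self._dictionary = dict(dictionary)                      # word -> row index
        self._reverse = {v: k for k, v in dictionary.items()}    # row index -> word

    def get_dict(self):
        return dict(self._dictionary)    # copy, so callers can't mutate internals

    def get_reverse_dict(self):
        return dict(self._reverse)

    def get_embed(self):
        return np.copy(self._embed)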
Example #3
 def test_closest_row_indices(self):
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 1.01], [2.0, 2.0], [2.0, 2.1],
                              [1.0, 0.0], [0, 1.01], [-1.0, 0.0]])
     word_embedding = WordVector(embed_matrix, dictionary)
     dist_list = word_embedding.closest_row_indices(np.array([[2.0, 2.0]]),
                                                    3, 'euclidean')
     self.assertTrue(
         np.sum(np.abs(np.array([1, 2, 0]) - dist_list)) < 0.1,
         'incorrect closest indices')
     dist_list = word_embedding.closest_row_indices(np.array([[2.0, 2.0]]),
                                                    3, 'cosine')
     self.assertTrue(
         np.sum(np.abs(np.array([1, 0, 2]) - dist_list)) < 0.1,
         'incorrect closest indices')
     dist_list = word_embedding.closest_row_indices(np.array([[1.0, 1.0]]),
                                                    6, 'euclidean')
     self.assertTrue(
         np.sum(np.abs(np.array([0, 3, 4, 1, 2, 5]) - dist_list)) < 0.1,
         'incorrect closest indices')
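Both metrics used here ('euclidean' and 'cosine') are supported directly by scipy's cdist, so a plausible implementation simply ranks every embedding row by distance to the query row. A hedged sketch of such a method is shown below; the name closest_row_indices_sketch and the reliance on cdist are assumptions, not the actual code in wordvector.py.

import numpy as np
from scipy.spatial.distance import cdist

def closest_row_indices_sketch(embed_matrix, query_row, n, metric):
    # query_row has shape (1, embed_dim); dists has one entry per vocabulary row
    dists = cdist(query_row, embed_matrix, metric=metric)[0]
    # indices of the n rows with the smallest distance, closest first
    return np.argsort(dists)[:n]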
Example #4
 def test_get_vector_by_num(self):
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.1, 0.1, 0.9, 0.9, 0.9, 0.1],
                              [1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1]])
     word_embedding = WordVector(embed_matrix, dictionary)
     self.assertTrue(
         np.sum(
             np.abs(
                 np.array([1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1]) -
                 word_embedding.get_vector_by_num(3))) < 0.1,
         'incorrect vector returned')
     self.assertTrue(
         np.sum(
             np.abs(
                 np.array([1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1]) -
                 word_embedding.get_vector_by_num(5))) < 0.1,
         'incorrect vector returned')
     self.assertTrue(
         np.sum(
             np.abs(
                 np.array([1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]) -
                 word_embedding.get_vector_by_num(0))) < 0.1,
         'incorrect vector returned')
Example #5
def load():
    # relies on module-level imports: numpy as np, sklearn.utils, docload,
    # WindowModel and WordVector
    files = ['../data/adventures_of_sherlock_holmes.txt',
             '../data/hound_of_the_baskervilles.txt',
             '../data/sign_of_the_four.txt']
    word_array, dictionary, num_lines, num_words = docload.build_word_array(
        files, vocab_size=50000, gutenberg=True)

    print('Document loaded and processed: {} lines, {} words.'
          .format(num_lines, num_words))

    print('Building training set ...')
    x, y = WindowModel.build_training_set(word_array)

    # shuffle, then hold out 10% as validation data
    x_shuf, y_shuf = sklearn.utils.shuffle(x, y, random_state=0)
    split = round(x_shuf.shape[0] * 0.9)
    x_val, y_val = (x_shuf[split:, :], y_shuf[split:, :])
    # take the training split from the shuffled arrays as well, so the
    # training and validation sets do not overlap
    x_train, y_train = (x_shuf[:split, :], y_shuf[:split, :])

    print('Training set built.')
    graph_params = {'batch_size': 32,
                    'vocab_size': np.max(x) + 1,
                    'embed_size': 64,
                    'hid_size': 64,
                    'neg_samples': 64,
                    'learn_rate': 0.01,
                    'momentum': 0.9,
                    'embed_noise': 0.1,
                    'hid_noise': 0.3,
                    'optimizer': 'Momentum'}
    model = WindowModel(graph_params)
    print('Model built. Vocab size = {}. Document length = {} words.'
          .format(np.max(x) + 1, len(word_array)))

    print('Training ...')
    results = model.train(x_train, y_train, x_val, y_val,
                          epochs=120, verbose=False)

    word_vector_embed = WordVector(results['embed_weights'], dictionary)
    word_vector_nce = WordVector(results['nce_weights'], dictionary)
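load() ends by wrapping the trained embedding and NCE weight matrices in WordVector objects. As an illustrative follow-up (not part of the original snippet), those objects can be queried with the methods exercised by the other examples; the helper name inspect_embedding and the probe word 'holmes' are hypothetical, and the probe only works if the word survives the 50,000-word vocabulary cut.

def inspect_embedding(word_vector, probe='holmes'):
    print(word_vector.most_common(10))                       # highest-frequency words
    print(word_vector.n_closest(probe, 5, metric='cosine'))  # nearest neighbours of the probe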
Example #6
 def test_num_words(self):
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.1, 0.1, 0.9, 0.9, 0.9, 0.1],
                              [1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1]])
     word_embedding = WordVector(embed_matrix, dictionary)
     self.assertEqual(6, word_embedding.num_words(),
                      'incorrect number of words')
Example #7
 def test_n_closest(self):
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 1.01], [2.0, 2.0], [2.0, 2.1],
                              [1.0, 0.0], [0, 1.01], [-1.0, 0.0]])
     word_embedding = WordVector(embed_matrix, dictionary)
     nc_list = word_embedding.n_closest('quick', 3, metric='euclidean')
     self.assertEqual(['quick', 'brown', 'the'], nc_list,
                      'wrong n-closest words returned')
     nc_list = word_embedding.n_closest('quick', 2, metric='cosine')
     self.assertEqual(['the', 'fox'], nc_list,
                      'wrong n-closest words returned')
Example #8
 def test_most_common(self):
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.6, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.7, 0.1, 1.1, 1.1, 1.1, 0.1]])
     word_embedding = WordVector(embed_matrix, dictionary)
     mc_list = word_embedding.most_common(3)
     self.assertEqual(['the', 'quick', 'brown'], mc_list,
                      'wrong most common words returned')
     mc_list = word_embedding.most_common(1)
     self.assertEqual(['the'], mc_list, 'wrong most common words returned')
Example #9
 def test_project_2D_2(self):
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.1, 0.1, 0.9, 0.9, 0.9, 0.1],
                              [1.0, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1]])
     word_embedding = WordVector(embed_matrix, dictionary)
     proj, words = word_embedding.project_2d(0, 6)
     self.assertEqual((6, 2), proj.shape,
                      'incorrect projection array size returned')
     self.assertEqual('the', words[0], 'incorrect word at index 0')
     self.assertEqual('fox', words[3], 'incorrect word at index 3')
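project_2d(0, 6) evidently reduces rows 0 through 5 of the 7-dimensional embedding to two dimensions and returns the matching words. A minimal sketch of such a projection using a PCA-style SVD follows; the name project_2d_sketch and the choice of SVD are assumptions, and the real method may use a different dimensionality reduction.

import numpy as np

def project_2d_sketch(embed_matrix, reverse_dict, first, last):
    rows = embed_matrix[first:last]
    centered = rows - rows.mean(axis=0)             # centre before extracting components
    _, _, vt = np.linalg.svd(centered, full_matrices=False)
    proj = centered @ vt[:2].T                      # project onto the top-2 components
    words = [reverse_dict[i] for i in range(first, last)]
    return proj, words                              # proj.shape == (last - first, 2)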
Example #10
 def test_words_in_range(self):
     from wordvector import WordVector
     dictionary = {
         'the': 0,
         'quick': 1,
         'brown': 2,
         'fox': 3,
         'jumped': 4,
         'over': 5
     }
     embed_matrix = np.array([[1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [-1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                              [1.0, 0.1, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.6, 0.1, 1.1, 1.1, 1.1, 0.1],
                              [1.0, 0.7, 0.1, 1.1, 1.1, 1.1, 0.1]])
     word_embedding = WordVector(embed_matrix, dictionary)
     range_list = word_embedding.words_in_range(3, 6)
     self.assertEqual(['fox', 'jumped', 'over'], range_list,
                      'wrong words in range returned')
     range_list = word_embedding.words_in_range(0, 2)
     self.assertEqual(['the', 'quick'], range_list,
                      'wrong words in range returned')