def test_cy_equal_np_ft_random(self):
    ft = FastText(size=20, min_count=1)
    ft.build_vocab(SENTENCES)

    m1 = Average(ft)
    m1.prep.prepare_vectors(
        sv=m1.sv, total_sentences=len(self.sentences), update=False
    )
    m1._pre_train_calls()

    from fse.models.average_inner import MAX_NGRAMS_IN_BATCH

    m1.batch_ngrams = MAX_NGRAMS_IN_BATCH
    mem1 = m1._get_thread_working_mem()
    o1 = train_average_np(m1, self.sentences[:2], m1.sv.vectors, mem1)

    m2 = Average(ft)
    m2.prep.prepare_vectors(
        sv=m2.sv, total_sentences=len(self.sentences), update=False
    )
    m2._pre_train_calls()
    mem2 = m2._get_thread_working_mem()

    from fse.models.average_inner import train_average_cy

    o2 = train_average_cy(m2, self.sentences[:2], m2.sv.vectors, mem2)

    self.assertEqual(o1, o2)
    self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6))
def test_cy_equal_np_w2v_random(self):
    w2v = Word2Vec(min_count=1, size=DIM)  # Random initialization
    w2v.build_vocab(SENTENCES)

    m1 = Average(w2v)
    m1.prep.prepare_vectors(
        sv=m1.sv, total_sentences=len(self.sentences), update=False
    )
    m1._pre_train_calls()
    mem1 = m1._get_thread_working_mem()
    o1 = train_average_np(m1, self.sentences, m1.sv.vectors, mem1)

    m2 = Average(w2v)
    m2.prep.prepare_vectors(
        sv=m2.sv, total_sentences=len(self.sentences), update=False
    )
    m2._pre_train_calls()
    mem2 = m2._get_thread_working_mem()

    from fse.models.average_inner import train_average_cy

    o2 = train_average_cy(m2, self.sentences, m2.sv.vectors, mem2)

    self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6))
def test_similar_by_sentence(self):
    sentences = IndexedLineDocument(CORPUS)
    m = Average(W2V)
    m.train(sentences)
    o = m.sv.similar_by_sentence(
        sentence=["the", "product", "is", "good"], model=m
    )
    self.assertEqual(4, o[0][0])
def test_similar_by_sentence_wrong_model(self):
    sentences = IndexedLineDocument(CORPUS)
    m = Average(W2V)
    m.train(sentences)
    with self.assertRaises(RuntimeError):
        m.sv.similar_by_sentence(
            sentence=["the", "product", "is", "good"], model=W2V
        )
def test_similar_by_word(self):
    sentences = IndexedLineDocument(CORPUS)
    m = Average(W2V)
    m.train(sentences)

    o = m.sv.similar_by_word(word="the", wv=m.wv)
    self.assertEqual(96, o[0][0])

    o = m.sv.similar_by_word(word="the", wv=m.wv, indexable=sentences)
    self.assertEqual(96, o[0][1])
def test_most_similar_wrong_indexable(self):
    def indexable(self):
        pass

    sentences = IndexedLineDocument(CORPUS)
    m = Average(W2V)
    m.train(sentences)
    with self.assertRaises(RuntimeError):
        m.sv.most_similar(positive=0, indexable=indexable)
def setUp(self):
    self.sentences = [
        ["They", "admit"],
        ["So", "Apple", "bought", "buds"],
        ["go", "12345"],
        ["pull", "12345678910111213"],
    ]
    self.sentences = [(s, i) for i, s in enumerate(self.sentences)]
    self.model = Average(W2V)
    self.model.prep.prepare_vectors(
        sv=self.model.sv, total_sentences=len(self.sentences), update=False
    )
    self.model._pre_train_calls()
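# NOTE: these tests reference module-level fixtures (CORPUS, DIM, W2V, SENTENCES)
# and imports defined elsewhere in the test module. The sketch below shows what
# they are assumed to look like, for readability only; the corpus path and the
# DIM value are assumptions, not the canonical test configuration.
#
#     from pathlib import Path
#     import numpy as np
#     from gensim.models import Word2Vec, FastText
#     from fse.models.average import Average, train_average_np
#     from fse.inputs import IndexedList, IndexedLineDocument
#
#     CORPUS = Path("fse/test/test_data/test_sentences.txt")  # assumed path
#     DIM = 5                                                 # assumed dimension
#     SENTENCES = [l.split() for l in open(CORPUS, "r")]
#     W2V = Word2Vec(min_count=1, size=DIM)                   # gensim 3.x API
#     W2V.build_vocab(SENTENCES)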
def test_most_similar_vecs(self):
    sentences = IndexedLineDocument(CORPUS)
    m = Average(W2V)
    m.train(sentences)
    m.sv.init_sims()
    v = m.sv[[0, 1]]
    o = m.sv.most_similar(positive=v)
    self.assertEqual(1, o[0][0])
    self.assertEqual(0, o[1][0])
def test_most_similar_vec(self):
    sentences = IndexedLineDocument(CORPUS)
    m = Average(W2V)
    m.train(sentences)
    m.sv.init_sims()
    v = m.sv.get_vector(0, use_norm=True)
    o = m.sv.most_similar(positive=v)
    # o[0] is sentence 0 itself, since the query is its own (normed) vector
    self.assertEqual(45, o[1][0])
    self.assertEqual(35, o[2][0])
def test_train_single_from_disk(self):
    p = Path("fse/test/test_data/test_vecs")
    p_res = Path("fse/test/test_data/test_vecs.vectors")
    p_target = Path("fse/test/test_data/test_vecs_wv.vectors")

    se1 = Average(W2V)
    se2 = Average(
        W2V,
        sv_mapfile_path=str(p.absolute()),
        wv_mapfile_path=str(p.absolute()),
    )
    se1.train([(s, i) for i, s in enumerate(SENTENCES)])
    se2.train([(s, i) for i, s in enumerate(SENTENCES)])

    self.assertTrue(p_target.exists())
    self.assertTrue((se1.wv.vectors == se2.wv.vectors).all())
    self.assertFalse(se2.wv.vectors.flags.writeable)
    self.assertTrue((se1.sv.vectors == se2.sv.vectors).all())

    p_res.unlink()
    p_target.unlink()
def test_cy_equal_np_w2v(self):
    m1 = Average(W2V)
    m1.prep.prepare_vectors(
        sv=m1.sv, total_sentences=len(self.sentences), update=False
    )
    m1._pre_train_calls()
    mem1 = m1._get_thread_working_mem()
    o1 = train_average_np(m1, self.sentences, m1.sv.vectors, mem1)

    m2 = Average(W2V)
    m2.prep.prepare_vectors(
        sv=m2.sv, total_sentences=len(self.sentences), update=False
    )
    m2._pre_train_calls()
    mem2 = m2._get_thread_working_mem()

    from fse.models.average_inner import train_average_cy

    o2 = train_average_cy(m2, self.sentences, m2.sv.vectors, mem2)

    self.assertEqual(o1, o2)
    self.assertTrue((m1.sv.vectors == m2.sv.vectors).all())
def test_most_similar(self):
    sent_ind = IndexedList(SENTENCES)
    sentences = IndexedLineDocument(CORPUS)
    m = Average(W2V)
    m.train(sentences)

    o = m.sv.most_similar(positive=0)
    self.assertEqual(45, o[0][0])
    self.assertEqual(35, o[1][0])

    o = m.sv.most_similar(positive=0, indexable=sentences)
    self.assertEqual("Looks good and fits snug", o[0][0])

    o = m.sv.most_similar(positive=0, indexable=sent_ind)
    self.assertEqual("Looks good and fits snug".split(), o[0][0][0])
def test_most_similar_restrict_size_tuple(self):
    sentences = IndexedLineDocument(CORPUS)
    m = Average(W2V)
    m.train(sentences)

    o = m.sv.most_similar(positive=20, topn=20, restrict_size=(5, 25))
    self.assertEqual(19, len(o))
    self.assertEqual(22, o[0][0])

    o = m.sv.most_similar(positive=1, topn=20, restrict_size=(5, 25))
    self.assertEqual(20, len(o))
    self.assertEqual(9, o[0][0])

    o = m.sv.most_similar(
        positive=1, topn=20, restrict_size=(5, 25), indexable=sentences
    )
    self.assertEqual(20, len(o))
    self.assertEqual(9, o[0][1])
def test_average_train_np_ft(self):
    ft = FastText(min_count=1, size=DIM)
    ft.build_vocab(SENTENCES)
    m = Average(ft)
    m.prep.prepare_vectors(
        sv=m.sv, total_sentences=len(self.sentences), update=False
    )
    m._pre_train_calls()
    # In-vocab vectors are set to 1 and ngram (OOV) vectors to 2, so the
    # expected sentence averages below can be read off directly.
    m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors, dtype=np.float32)
    m.wv.vectors_ngrams = np.full_like(m.wv.vectors_ngrams, 2, dtype=np.float32)
    mem = m._get_thread_working_mem()
    output = train_average_np(m, self.sentences, m.sv.vectors, mem)
    self.assertEqual((4, 10), output)
    self.assertTrue((1.0 == m.sv[0]).all())  # all words in vocab -> 1.0
    self.assertTrue((1.5 == m.sv[2]).all())  # one in-vocab (1) + one OOV (2) -> 1.5
    self.assertTrue((2 == m.sv[3]).all())    # all words resolve to ngrams -> 2.0
def test_similar_by_vector(self):
    sentences = IndexedLineDocument(CORPUS)
    m = Average(W2V)
    m.train(sentences)
    o = m.sv.similar_by_vector(m.wv["the"])
    self.assertEqual(96, o[0][0])
def test_most_similar_restrict_size(self):
    sentences = IndexedLineDocument(CORPUS)
    m = Average(W2V)
    m.train(sentences)
    o = m.sv.most_similar(positive=20, topn=20, restrict_size=5)
    self.assertEqual(5, len(o))
def test_check_parameter_sanity(self):
    se = Average(W2V)
    se.word_weights = np.full(20, 2.0, dtype=np.float32)
    with self.assertRaises(ValueError):
        se._check_parameter_sanity()