def test_cy_equal_np_ft_random(self):
        """Compare ``train_average_np`` and ``train_average_cy`` on a FastText model.

        The FastText model only gets its vocabulary built from ``SENTENCES``;
        no training call is made on it here. Each routine runs on the first two
        entries of ``self.sentences`` with its own ``Average`` wrapper. The
        returned values must be equal and the resulting sentence vectors must
        agree within an absolute tolerance of 1e-6.
        """
        fasttext = FastText(size=20, min_count=1)
        fasttext.build_vocab(SENTENCES)

        def make_prepared_average():
            # Identical preparation steps for both sides of the comparison.
            model = Average(fasttext)
            model.prep.prepare_vectors(
                sv=model.sv, total_sentences=len(self.sentences), update=False
            )
            model._pre_train_calls()
            return model

        np_model = make_prepared_average()

        from fse.models.average_inner import MAX_NGRAMS_IN_BATCH
        # Only this model gets ``batch_ngrams`` assigned explicitly.
        np_model.batch_ngrams = MAX_NGRAMS_IN_BATCH
        np_mem = np_model._get_thread_working_mem()
        np_result = train_average_np(
            np_model, self.sentences[:2], np_model.sv.vectors, np_mem
        )

        cy_model = make_prepared_average()
        cy_mem = cy_model._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy
        cy_result = train_average_cy(
            cy_model, self.sentences[:2], cy_model.sv.vectors, cy_mem
        )

        self.assertEqual(np_result, cy_result)
        vectors_match = np.allclose(
            np_model.sv.vectors, cy_model.sv.vectors, atol=1e-6
        )
        self.assertTrue(vectors_match)
Example #2
0
    def test_cy_equal_np_w2v_random(self):
        """Compare ``train_average_np`` and ``train_average_cy`` on a vocab-only Word2Vec model.

        The Word2Vec model only has its vocabulary built from ``SENTENCES``.
        Each routine runs on ``self.sentences`` with its own ``Average``
        wrapper. Both the returned values and the resulting sentence vectors
        (within an absolute tolerance of 1e-6) must agree.
        """
        w2v = Word2Vec(min_count=1, size=DIM)
        # Random initialization
        w2v.build_vocab(SENTENCES)

        m1 = Average(w2v)
        m1.prep.prepare_vectors(
            sv=m1.sv, total_sentences=len(self.sentences), update=False
        )
        m1._pre_train_calls()
        mem1 = m1._get_thread_working_mem()
        o1 = train_average_np(m1, self.sentences, m1.sv.vectors, mem1)

        m2 = Average(w2v)
        m2.prep.prepare_vectors(
            sv=m2.sv, total_sentences=len(self.sentences), update=False
        )
        m2._pre_train_calls()
        mem2 = m2._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy

        o2 = train_average_cy(m2, self.sentences, m2.sv.vectors, mem2)

        # The return values used to be computed but never checked; compare
        # them so a mismatch between the two routines cannot go unnoticed.
        self.assertEqual(o1, o2)
        self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6))
 def test_similar_by_sentence(self):
     """Query ``similar_by_sentence`` with a tokenized sentence and check the top hit.

     An ``Average(W2V)`` model is trained on ``CORPUS``; the first element of
     the best result must equal 4.
     """
     corpus = IndexedLineDocument(CORPUS)
     model = Average(W2V)
     model.train(corpus)

     query = ["the", "product", "is", "good"]
     hits = model.sv.similar_by_sentence(sentence=query, model=model)
     self.assertEqual(4, hits[0][0])
Example #4
0
 def test_similar_by_sentence_wrong_model(self):
     """Passing ``W2V`` itself as ``model`` must raise ``RuntimeError``."""
     corpus = IndexedLineDocument(CORPUS)
     model = Average(W2V)
     model.train(corpus)

     query = ["the", "product", "is", "good"]
     with self.assertRaises(RuntimeError):
         model.sv.similar_by_sentence(sentence=query, model=W2V)
 def test_similar_by_word(self):
     """Check ``similar_by_word`` for the word "the", with and without an indexable.

     Without ``indexable`` the value 96 is expected at position 0 of the top
     entry; with ``indexable`` it is expected at position 1.
     """
     corpus = IndexedLineDocument(CORPUS)
     model = Average(W2V)
     model.train(corpus)

     plain_hits = model.sv.similar_by_word(word="the", wv=model.wv)
     self.assertEqual(96, plain_hits[0][0])

     indexed_hits = model.sv.similar_by_word(
         word="the", wv=model.wv, indexable=corpus
     )
     self.assertEqual(96, indexed_hits[0][1])
 def test_most_similar_wrong_indexable(self):
     """A plain function passed as ``indexable`` must make ``most_similar`` raise ``RuntimeError``."""
     corpus = IndexedLineDocument(CORPUS)
     model = Average(W2V)
     model.train(corpus)

     def indexable(self):
         # Deliberately just a function object, used as the invalid argument.
         return None

     with self.assertRaises(RuntimeError):
         model.sv.most_similar(positive=0, indexable=indexable)
 def setUp(self):
     """Create the shared fixtures: indexed sentences and a prepared ``Average(W2V)`` model.

     ``self.sentences`` ends up as a list of ``(tokens, index)`` tuples built
     from four tokenized sentences.
     """
     tokenized = [
         ["They", "admit"],
         ["So", "Apple", "bought", "buds"],
         ["go", "12345"],
         ["pull", "12345678910111213"],
     ]
     self.sentences = [(tokens, idx) for idx, tokens in enumerate(tokenized)]

     model = Average(W2V)
     model.prep.prepare_vectors(
         sv=model.sv, total_sentences=len(self.sentences), update=False
     )
     model._pre_train_calls()
     self.model = model
 def test_most_similar_vecs(self):
     """Query ``most_similar`` with the vectors of sentences 0 and 1 at once.

     The first elements of the two best results must be 1 and 0, in that order.
     """
     corpus = IndexedLineDocument(CORPUS)
     model = Average(W2V)
     model.train(corpus)
     model.sv.init_sims()

     query_vectors = model.sv[[0, 1]]
     hits = model.sv.most_similar(positive=query_vectors)
     self.assertEqual(1, hits[0][0])
     self.assertEqual(0, hits[1][0])
 def test_most_similar_vec(self):
     """Query ``most_similar`` with the normalized vector of sentence 0.

     The results at positions 1 and 2 must start with 45 and 35 respectively.
     """
     corpus = IndexedLineDocument(CORPUS)
     model = Average(W2V)
     model.train(corpus)
     model.sv.init_sims()

     query_vector = model.sv.get_vector(0, use_norm=True)
     hits = model.sv.most_similar(positive=query_vector)
     # Per the original note the results include sentence 0 itself, so the
     # checks start at position 1.
     self.assertEqual(45, hits[1][0])
     self.assertEqual(35, hits[2][0])
    def test_train_single_from_disk(self):
        """Train with and without map-file paths and compare the resulting vectors.

        ``se2`` receives ``sv_mapfile_path`` and ``wv_mapfile_path``. After
        training, both vector files named below must exist on disk, the word
        vectors of ``se2`` must not be writeable, and the word and sentence
        vectors must equal those of the plain ``se1`` model element-wise.

        The generated files are removed in a ``finally`` block so that a
        failing assertion does not leave them behind in the test data folder.
        """
        p = Path("fse/test/test_data/test_vecs")
        p_res = Path("fse/test/test_data/test_vecs.vectors")
        p_target = Path("fse/test/test_data/test_vecs_wv.vectors")

        try:
            se1 = Average(W2V)
            se2 = Average(W2V,
                          sv_mapfile_path=str(p.absolute()),
                          wv_mapfile_path=str(p.absolute()))
            se1.train([(s, i) for i, s in enumerate(SENTENCES)])
            se2.train([(s, i) for i, s in enumerate(SENTENCES)])

            self.assertTrue(p_target.exists())
            # Previously only implied by the unconditional unlink() at the
            # end; asserted explicitly now that cleanup tolerates a missing file.
            self.assertTrue(p_res.exists())
            self.assertTrue((se1.wv.vectors == se2.wv.vectors).all())
            self.assertFalse(se2.wv.vectors.flags.writeable)

            self.assertTrue((se1.sv.vectors == se2.sv.vectors).all())
        finally:
            # Guard with exists() so cleanup never masks the real failure
            # with a FileNotFoundError.
            for leftover in (p_res, p_target):
                if leftover.exists():
                    leftover.unlink()
    def test_cy_equal_np_w2v(self):
        """Compare ``train_average_np`` and ``train_average_cy`` on ``W2V``.

        Each routine runs on ``self.sentences`` with its own ``Average``
        wrapper. The returned values must be equal and the sentence vectors
        must match exactly, element by element.
        """

        def make_prepared_average():
            # Identical preparation steps for both sides of the comparison.
            model = Average(W2V)
            model.prep.prepare_vectors(
                sv=model.sv, total_sentences=len(self.sentences), update=False
            )
            model._pre_train_calls()
            return model

        np_model = make_prepared_average()
        np_mem = np_model._get_thread_working_mem()
        np_result = train_average_np(
            np_model, self.sentences, np_model.sv.vectors, np_mem
        )

        cy_model = make_prepared_average()
        cy_mem = cy_model._get_thread_working_mem()
        cy_result = train_average_cy(
            cy_model, self.sentences, cy_model.sv.vectors, cy_mem
        )

        self.assertEqual(np_result, cy_result)
        identical = (np_model.sv.vectors == cy_model.sv.vectors).all()
        self.assertTrue(identical)
    def test_most_similar(self):
        """Check ``most_similar`` for sentence 0 with no indexable and two indexable kinds.

        Without ``indexable`` the two best results must start with 45 and 35.
        With the ``IndexedLineDocument`` the top result starts with the raw
        sentence string; with the ``IndexedList`` the expected tokens sit one
        level deeper, at ``[0][0][0]``.
        """
        indexed_list = IndexedList(SENTENCES)
        corpus = IndexedLineDocument(CORPUS)
        model = Average(W2V)
        model.train(corpus)
        expected_text = "Looks good and fits snug"

        by_id = model.sv.most_similar(positive=0)
        self.assertEqual(45, by_id[0][0])
        self.assertEqual(35, by_id[1][0])

        by_line = model.sv.most_similar(positive=0, indexable=corpus)
        self.assertEqual(expected_text, by_line[0][0])

        by_tokens = model.sv.most_similar(positive=0, indexable=indexed_list)
        self.assertEqual(expected_text.split(), by_tokens[0][0][0])
    def test_most_similar_restrict_size_tuple(self):
        """Check ``most_similar`` when ``restrict_size`` is the tuple ``(5, 25)``.

        Querying sentence 20 must yield 19 results with 22 first; querying
        sentence 1 must yield 20 results with 9 first. With an ``indexable``
        the 9 is expected at position 1 of the top entry.
        """
        corpus = IndexedLineDocument(CORPUS)
        model = Average(W2V)
        model.train(corpus)
        window = (5, 25)

        hits = model.sv.most_similar(positive=20, topn=20, restrict_size=window)
        self.assertEqual(19, len(hits))
        self.assertEqual(22, hits[0][0])

        hits = model.sv.most_similar(positive=1, topn=20, restrict_size=window)
        self.assertEqual(20, len(hits))
        self.assertEqual(9, hits[0][0])

        hits = model.sv.most_similar(
            positive=1, topn=20, restrict_size=window, indexable=corpus
        )
        self.assertEqual(20, len(hits))
        self.assertEqual(9, hits[0][1])
Example #14
0
 def test_average_train_np_ft(self):
     """Run ``train_average_np`` on a FastText model whose vectors are constants.

     Word vectors are overwritten with ones and n-gram vectors with twos.
     The call must return ``(4, 10)``, and the sentence vectors at indices
     0, 2 and 3 must be filled with 1.0, 1.5 and 2 respectively.
     """
     fasttext = FastText(min_count=1, size=DIM)
     fasttext.build_vocab(SENTENCES)

     model = Average(fasttext)
     model.prep.prepare_vectors(
         sv=model.sv, total_sentences=len(self.sentences), update=False
     )
     model._pre_train_calls()

     # The same all-ones array object backs both word-vector attributes.
     ones = np.ones_like(model.wv.vectors, dtype=np.float32)
     model.wv.vectors = ones
     model.wv.vectors_vocab = ones
     model.wv.vectors_ngrams = np.full_like(
         model.wv.vectors_ngrams, 2, dtype=np.float32
     )

     work_mem = model._get_thread_working_mem()
     result = train_average_np(model, self.sentences, model.sv.vectors, work_mem)

     self.assertEqual((4, 10), result)
     self.assertTrue((1.0 == model.sv[0]).all())
     self.assertTrue((1.5 == model.sv[2]).all())
     self.assertTrue((2 == model.sv[3]).all())
 def test_similar_by_vector(self):
     """Query ``similar_by_vector`` with the word vector of "the"; the top result must start with 96."""
     corpus = IndexedLineDocument(CORPUS)
     model = Average(W2V)
     model.train(corpus)

     word_vector = model.wv["the"]
     hits = model.sv.similar_by_vector(word_vector)
     self.assertEqual(96, hits[0][0])
 def test_most_similar_restrict_size(self):
     """With ``restrict_size=5`` and ``topn=20``, ``most_similar`` must return exactly 5 results."""
     corpus = IndexedLineDocument(CORPUS)
     model = Average(W2V)
     model.train(corpus)

     hits = model.sv.most_similar(positive=20, topn=20, restrict_size=5)
     self.assertEqual(5, len(hits))
 def test_check_parameter_sanity(self):
     """``_check_parameter_sanity`` must raise ``ValueError`` once ``word_weights`` is 20 float32 twos."""
     model = Average(W2V)
     model.word_weights = np.full(20, 2.0, dtype=np.float32)
     with self.assertRaises(ValueError):
         model._check_parameter_sanity()