def test_cy_equal_np_ft_random(self):
        ft = FastText(size=20, min_count=1)
        ft.build_vocab(SENTENCES)

        m1 = Average(ft)
        m1.prep.prepare_vectors(sv=m1.sv,
                                total_sentences=len(self.sentences),
                                update=False)
        m1._pre_train_calls()

        from fse.models.average_inner import MAX_NGRAMS_IN_BATCH
        m1.batch_ngrams = MAX_NGRAMS_IN_BATCH
        mem1 = m1._get_thread_working_mem()
        o1 = train_average_np(m1, self.sentences[:2], m1.sv.vectors, mem1)

        m2 = Average(ft)
        m2.prep.prepare_vectors(sv=m2.sv,
                                total_sentences=len(self.sentences),
                                update=False)
        m2._pre_train_calls()
        mem2 = m2._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy
        o2 = train_average_cy(m2, self.sentences[:2], m2.sv.vectors, mem2)

        self.assertEqual(o1, o2)
        self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6))
Exemple #2
0
    def test_cy_equal_np_w2v_random(self):
        w2v = Word2Vec(min_count=1, size=DIM)
        # Random initialization
        w2v.build_vocab(SENTENCES)

        m1 = Average(w2v)
        m1.prep.prepare_vectors(
            sv=m1.sv, total_sentences=len(self.sentences), update=False
        )
        m1._pre_train_calls()
        mem1 = m1._get_thread_working_mem()
        o1 = train_average_np(m1, self.sentences, m1.sv.vectors, mem1)

        m2 = Average(w2v)
        m2.prep.prepare_vectors(
            sv=m2.sv, total_sentences=len(self.sentences), update=False
        )
        m2._pre_train_calls()
        mem2 = m2._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy

        o2 = train_average_cy(m2, self.sentences, m2.sv.vectors, mem2)

        self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6))
    def test_cy_equal_np_w2v(self):
        m1 = Average(W2V)
        m1.prep.prepare_vectors(sv=m1.sv,
                                total_sentences=len(self.sentences),
                                update=False)
        m1._pre_train_calls()
        mem1 = m1._get_thread_working_mem()
        o1 = train_average_np(m1, self.sentences, m1.sv.vectors, mem1)

        m2 = Average(W2V)
        m2.prep.prepare_vectors(sv=m2.sv,
                                total_sentences=len(self.sentences),
                                update=False)
        m2._pre_train_calls()
        mem2 = m2._get_thread_working_mem()
        o2 = train_average_cy(m2, self.sentences, m2.sv.vectors, mem2)

        self.assertEqual(o1, o2)
        self.assertTrue((m1.sv.vectors == m2.sv.vectors).all())
Exemple #4
0
 def test_average_train_np_ft(self):
     ft = FastText(min_count=1, size=DIM)
     ft.build_vocab(SENTENCES)
     m = Average(ft)
     m.prep.prepare_vectors(
         sv=m.sv, total_sentences=len(self.sentences), update=False
     )
     m._pre_train_calls()
     m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors, dtype=np.float32)
     m.wv.vectors_ngrams = np.full_like(m.wv.vectors_ngrams, 2, dtype=np.float32)
     mem = m._get_thread_working_mem()
     output = train_average_np(m, self.sentences, m.sv.vectors, mem)
     self.assertEqual((4, 10), output)
     self.assertTrue((1.0 == m.sv[0]).all())
     self.assertTrue((1.5 == m.sv[2]).all())
     self.assertTrue((2 == m.sv[3]).all())
class TestAverageFunctions(unittest.TestCase):
    def setUp(self):
        self.sentences = [["They", "admit"], ["So", "Apple", "bought", "buds"],
                          ["go", "12345"], ["pull", "12345678910111213"]]
        self.sentences = [(s, i) for i, s in enumerate(self.sentences)]
        self.model = Average(W2V)
        self.model.prep.prepare_vectors(sv=self.model.sv,
                                        total_sentences=len(self.sentences),
                                        update=False)
        self.model._pre_train_calls()

    def test_cython(self):
        from fse.models.average_inner import FAST_VERSION, MAX_WORDS_IN_BATCH, MAX_NGRAMS_IN_BATCH
        self.assertTrue(FAST_VERSION)
        self.assertEqual(10000, MAX_WORDS_IN_BATCH)
        self.assertEqual(40, MAX_NGRAMS_IN_BATCH)

    def test_average_train_np_w2v(self):
        self.model.sv.vectors = np.zeros_like(self.model.sv.vectors,
                                              dtype=np.float32)
        mem = self.model._get_thread_working_mem()
        output = train_average_np(self.model, self.sentences,
                                  self.model.sv.vectors, mem)
        self.assertEqual((4, 7), output)
        self.assertTrue((183 == self.model.sv[0]).all())
        self.assertTrue((164.5 == self.model.sv[1]).all())
        self.assertTrue(
            (self.model.wv.vocab["go"].index == self.model.sv[2]).all())

    def test_average_train_cy_w2v(self):
        self.model.sv.vectors = np.zeros_like(self.model.sv.vectors,
                                              dtype=np.float32)
        mem = self.model._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy
        output = train_average_cy(self.model, self.sentences,
                                  self.model.sv.vectors, mem)
        self.assertEqual((4, 7), output)
        self.assertTrue((183 == self.model.sv[0]).all())
        self.assertTrue((164.5 == self.model.sv[1]).all())
        self.assertTrue(
            (self.model.wv.vocab["go"].index == self.model.sv[2]).all())

    def test_average_train_np_ft(self):
        ft = FastText(min_count=1, size=DIM)
        ft.build_vocab(SENTENCES)
        m = Average(ft)
        m.prep.prepare_vectors(sv=m.sv,
                               total_sentences=len(self.sentences),
                               update=False)
        m._pre_train_calls()
        m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors,
                                                         dtype=np.float32)
        m.wv.vectors_ngrams = np.full_like(m.wv.vectors_ngrams,
                                           2,
                                           dtype=np.float32)
        mem = m._get_thread_working_mem()
        output = train_average_np(m, self.sentences, m.sv.vectors, mem)
        self.assertEqual((4, 10), output)
        self.assertTrue((1. == m.sv[0]).all())
        self.assertTrue((1.5 == m.sv[2]).all())
        self.assertTrue((2 == m.sv[3]).all())
        # "go" -> [1,1...]
        # oov: "12345" -> (14 hashes * 2) / 14 =  2
        # (2 + 1) / 2 = 1.5

    def test_average_train_cy_ft(self):
        ft = FastText(min_count=1, size=DIM)
        ft.build_vocab(SENTENCES)
        m = Average(ft)
        m.prep.prepare_vectors(sv=m.sv,
                               total_sentences=len(self.sentences),
                               update=False)
        m._pre_train_calls()
        m.wv.vectors = m.wv.vectors_vocab = np.ones_like(m.wv.vectors,
                                                         dtype=np.float32)
        m.wv.vectors_ngrams = np.full_like(m.wv.vectors_ngrams,
                                           2,
                                           dtype=np.float32)
        mem = m._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy
        output = train_average_cy(m, self.sentences, m.sv.vectors, mem)
        self.assertEqual((4, 10), output)
        self.assertTrue((1. + EPS == m.sv[0]).all())
        self.assertTrue(np.allclose(1.5, m.sv[2]))
        self.assertTrue(np.allclose(2, m.sv[3]))

    def test_cy_equal_np_w2v(self):
        m1 = Average(W2V)
        m1.prep.prepare_vectors(sv=m1.sv,
                                total_sentences=len(self.sentences),
                                update=False)
        m1._pre_train_calls()
        mem1 = m1._get_thread_working_mem()
        o1 = train_average_np(m1, self.sentences, m1.sv.vectors, mem1)

        m2 = Average(W2V)
        m2.prep.prepare_vectors(sv=m2.sv,
                                total_sentences=len(self.sentences),
                                update=False)
        m2._pre_train_calls()
        mem2 = m2._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy
        o2 = train_average_cy(m2, self.sentences, m2.sv.vectors, mem2)

        self.assertEqual(o1, o2)
        self.assertTrue((m1.sv.vectors == m2.sv.vectors).all())

    def test_cy_equal_np_w2v_random(self):
        w2v = Word2Vec(min_count=1, size=DIM)
        # Random initialization
        w2v.build_vocab(SENTENCES)

        m1 = Average(w2v)
        m1.prep.prepare_vectors(sv=m1.sv,
                                total_sentences=len(self.sentences),
                                update=False)
        m1._pre_train_calls()
        mem1 = m1._get_thread_working_mem()
        o1 = train_average_np(m1, self.sentences, m1.sv.vectors, mem1)

        m2 = Average(w2v)
        m2.prep.prepare_vectors(sv=m2.sv,
                                total_sentences=len(self.sentences),
                                update=False)
        m2._pre_train_calls()
        mem2 = m2._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy
        o2 = train_average_cy(m2, self.sentences, m2.sv.vectors, mem2)

        self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6))

    def test_cy_equal_np_ft_random(self):
        ft = FastText(size=20, min_count=1)
        ft.build_vocab(SENTENCES)

        m1 = Average(ft)
        m1.prep.prepare_vectors(sv=m1.sv,
                                total_sentences=len(self.sentences),
                                update=False)
        m1._pre_train_calls()

        from fse.models.average_inner import MAX_NGRAMS_IN_BATCH
        m1.batch_ngrams = MAX_NGRAMS_IN_BATCH
        mem1 = m1._get_thread_working_mem()
        o1 = train_average_np(m1, self.sentences[:2], m1.sv.vectors, mem1)

        m2 = Average(ft)
        m2.prep.prepare_vectors(sv=m2.sv,
                                total_sentences=len(self.sentences),
                                update=False)
        m2._pre_train_calls()
        mem2 = m2._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy
        o2 = train_average_cy(m2, self.sentences[:2], m2.sv.vectors, mem2)

        self.assertEqual(o1, o2)
        self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6))

    def test_do_train_job(self):
        self.model.prep.prepare_vectors(sv=self.model.sv,
                                        total_sentences=len(SENTENCES),
                                        update=True)
        mem = self.model._get_thread_working_mem()
        self.assertEqual(
            (100, 1450),
            self.model._do_train_job([(s, i) for i, s in enumerate(SENTENCES)],
                                     target=self.model.sv.vectors,
                                     memory=mem))
        self.assertEqual((104, DIM), self.model.sv.vectors.shape)

    def test_train(self):
        self.assertEqual(
            (100, 1450),
            self.model.train([(s, i) for i, s in enumerate(SENTENCES)]))

    def test_train_single_from_disk(self):
        p = Path("fse/test/test_data/test_vecs")
        p_res = Path("fse/test/test_data/test_vecs.vectors")
        p_target = Path("fse/test/test_data/test_vecs_wv.vectors")

        se1 = Average(W2V)
        se2 = Average(W2V,
                      sv_mapfile_path=str(p.absolute()),
                      wv_mapfile_path=str(p.absolute()))
        se1.train([(s, i) for i, s in enumerate(SENTENCES)])
        se2.train([(s, i) for i, s in enumerate(SENTENCES)])

        self.assertTrue(p_target.exists())
        self.assertTrue((se1.wv.vectors == se2.wv.vectors).all())
        self.assertFalse(se2.wv.vectors.flags.writeable)

        self.assertTrue((se1.sv.vectors == se2.sv.vectors).all())
        p_res.unlink()
        p_target.unlink()

    def test_train_multi_from_disk(self):
        p = Path("fse/test/test_data/test_vecs")
        p_res = Path("fse/test/test_data/test_vecs.vectors")
        p_target = Path("fse/test/test_data/test_vecs_wv.vectors")

        se1 = Average(W2V, workers=2)
        se2 = Average(W2V,
                      workers=2,
                      sv_mapfile_path=str(p.absolute()),
                      wv_mapfile_path=str(p.absolute()))
        se1.train([(s, i) for i, s in enumerate(SENTENCES)])
        se2.train([(s, i) for i, s in enumerate(SENTENCES)])

        self.assertTrue(p_target.exists())
        self.assertTrue((se1.wv.vectors == se2.wv.vectors).all())
        self.assertFalse(se2.wv.vectors.flags.writeable)

        self.assertTrue((se1.sv.vectors == se2.sv.vectors).all())
        p_res.unlink()
        p_target.unlink()

    def test_check_parameter_sanity(self):
        se = Average(W2V)
        se.word_weights = np.full(20, 2., dtype=np.float32)
        with self.assertRaises(ValueError):
            se._check_parameter_sanity()