コード例 #1
0
    def testPersistence(self):
        """Test storing/loading the entire model.

        Trains a model, saves it, loads it back and checks that the loaded
        translation matrix is numerically close to the trained one.
        """
        # Resolve the path once so save() and load() are guaranteed to use the
        # same file, even if temp_save_file() does not return a stable name.
        save_path = temp_save_file()

        model = translation_matrix.TranslationMatrix(self.source_word_vec, self.target_word_vec, self.word_pairs)
        model.train(self.word_pairs)
        model.save(save_path)

        loaded_model = translation_matrix.TranslationMatrix.load(save_path)
        self.assertTrue(np.allclose(model.translation_matrix, loaded_model.translation_matrix))
コード例 #2
0
    def testPersistence(self):
        """Check that a trained model survives a save/load round trip."""
        pickle_path = get_tmpfile('transmat-en-it.pkl')

        trained = translation_matrix.TranslationMatrix(
            self.source_word_vec, self.target_word_vec, self.word_pairs
        )
        trained.train(self.word_pairs)
        trained.save(pickle_path)

        restored = translation_matrix.TranslationMatrix.load(pickle_path)
        # The restored translation matrix must numerically match the trained one.
        matrices_match = np.allclose(trained.translation_matrix, restored.translation_matrix)
        self.assertTrue(matrices_match)
コード例 #3
0
    def test_translate_gc(self):
        """Test the globally corrected neighbour retrieval method.

        Each expected target word must appear among the translations returned
        for its source word.
        """
        model = translation_matrix.TranslationMatrix(self.source_word_vec, self.target_word_vec, self.word_pairs)
        model.train(self.word_pairs)

        # Only the source side is passed to translate(); targets are checked below.
        test_source_word, _ = zip(*self.test_word_pairs)
        translated_words = model.translate(
            test_source_word,
            topn=5,
            gc=1,
            sample_num=3,
            source_lang_vec=self.source_word_vec,
            target_lang_vec=self.target_word_vec,
        )

        for source_word, target_word in self.test_word_pairs:
            # assertIn reports the missing word and the candidates on failure.
            self.assertIn(target_word, translated_words[source_word])
コード例 #4
0
    def test_translate_nn(self):
        """Test the nearest neighbor retrieval method.

        Each expected target word must appear among the translations returned
        for its source word.
        """
        model = translation_matrix.TranslationMatrix(self.source_word_vec,
                                                     self.target_word_vec,
                                                     self.word_pairs)
        model.train(self.word_pairs)

        # Only the source side is passed to translate(); targets are checked below.
        test_source_word, _ = zip(*self.test_word_pairs)
        translated_words = model.translate(
            test_source_word,
            topn=5,
            source_lang_vec=self.source_word_vec,
            target_lang_vec=self.target_word_vec,
        )

        for source_word, target_word in self.test_word_pairs:
            # assertIn reports the missing word and the candidates on failure.
            self.assertIn(target_word, translated_words[source_word])
コード例 #5
0
 def test_translation_matrix(self):
     """Train on the word pairs and verify the learned matrix is 300 x 300."""
     trained = translation_matrix.TranslationMatrix(
         self.source_word_vec, self.target_word_vec, self.word_pairs
     )
     trained.train(self.word_pairs)

     expected_shape = (300, 300)
     self.assertEqual(trained.translation_matrix.shape, expected_shape)
コード例 #6
0
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models import translation_matrix
from gensim.models import BackMappingTranslationMatrix
from pprint import pprint

from scipy import spatial

w2v_bin_path_old = '/home/dpappas/COVID/COVID/pubmed2018_w2v_30D.bin'
w2v_bin_path_new = '/home/dpappas/COVID/covid_19_w2v_embeds_30.model'

# The older embeddings are read from a word2vec binary file, the newer ones
# from a saved gensim Word2Vec model.
wv_old = KeyedVectors.load_word2vec_format(w2v_bin_path_old, binary=True)
wv_new = Word2Vec.load(w2v_bin_path_new)

# Tokens present in both vocabularies act as identity "translation" pairs
# (each token maps to itself). Sorting keeps the pair order reproducible, since
# the iteration order of a set of strings can differ between interpreter runs.
common_tokens = set(wv_old.vocab).intersection(wv_new.wv.vocab)
common_tokens = [(tok, tok) for tok in sorted(common_tokens)]

# Source space is the new embeddings, target space is the old ones.
transmat = translation_matrix.TranslationMatrix(wv_new.wv, wv_old, common_tokens)
transmat.train(common_tokens)

# transmat.apply_transmat(transmat.source_space)

# Pass both vector sets explicitly so translate() does not have to fall back to
# the model's stored vectors (gensim emits a warning when it does).
pprint(transmat.translate('covid-19', topn=25, source_lang_vec=wv_new.wv, target_lang_vec=wv_old))

# Cosine similarity (1 - cosine distance) between two tokens of the old space.
result = 1 - spatial.distance.cosine(wv_old['fredriksberg'], wv_old['non-neurologist'])

# def project_words_vectors(transmat, source_words):
#     source_space = translation_matrix.Space.build(transmat.source_lang_vec, source_words)
#     source_space.normalize()
#     mapped_source_space = transmat.apply_transmat(source_space)
#     return mapped_source_space
#