def test_w2v(self):
    """Check ``sm.w2v`` on two POS-tagged sentences and its symmetry.

    The word2vec model is loaded from the package data directory when
    possible. If loading fails for any reason, a small in-memory stand-in
    exposing ``vocab`` and ``similarity`` is used instead, and a different
    expected score applies.
    """
    testdir = os.path.dirname(os.path.realpath(__file__))
    modelfile = os.path.join(testdir, '..', 'potara', 'data',
                             'enwiki9stempos.model')

    try:
        model = gensim.models.word2vec.Word2Vec.load(modelfile)
    except Exception:
        # Model missing or unloadable: mock a similarity model.
        # ``Exception`` (not a bare ``except``) so Ctrl-C and SystemExit
        # still propagate.
        class FakeModel(object):
            """Stand-in model that knows one word pair."""

            def __init__(self):
                # Instance attributes, so no mutable state is shared at
                # class level.
                self.vocab = ['right/JJ', 'wrong/JJ']
                self.sim = {'right/JJ_wrong/JJ': 0.5}

            def similarity(self, w1, w2):
                # Each pair is stored once; accept either argument order.
                key = w1 + '_' + w2
                if key in self.sim:
                    return self.sim[key]
                return self.sim[w2 + '_' + w1]

        model = FakeModel()
        esim = 0.9166
    else:
        # Expected score with the real model.
        esim = 0.88749

    s1 = "This/T beautiful/JJ sentence/NN is/V not/N right/JJ ./PUNCT"
    s2 = "This/T beautiful/JJ sentence/NN is/V wrong/JJ ./PUNCT"

    psim = sm.w2v(s1, s2, model)
    self.assertAlmostEqual(esim, psim, places=3)

    # order doesn't matter
    psim2 = sm.w2v(s2, s1, model)
    self.assertEqual(psim, psim2)
def test_w2v_singleword(self):
    """Check ``sm.w2v`` on one-word inputs: different word, then same word.

    Requires the word2vec model from the package data directory. When it
    cannot be loaded the test is reported as skipped rather than silently
    passing.
    """
    testdir = os.path.dirname(os.path.realpath(__file__))
    modelfile = os.path.join(testdir, '..', 'potara', 'data',
                             'enwiki9stempos.model')

    try:
        model = gensim.models.word2vec.Word2Vec.load(modelfile)
    except Exception:
        # ``Exception`` (not a bare ``except``) so Ctrl-C and SystemExit
        # still propagate; skipTest makes the missing model visible.
        self.skipTest('word2vec model could not be loaded: %s' % modelfile)

    s1 = "right/JJ"
    s2 = "wrong/JJ"
    # a single different word means 0 sim
    esim = 0
    psim = sm.w2v(s1, s2, model)
    self.assertEqual(esim, psim)

    # Identical single words are expected to score exactly 1.
    s3 = "right/JJ"
    esim2 = 1
    psim2 = sm.w2v(s1, s3, model)
    self.assertEqual(esim2, psim2)
def test_w2v_notinvocab(self):
    """Check ``sm.w2v`` when a sentence contains the token ``beauful/JJ``.

    Judging by the test name, the misspelled token is presumably absent
    from the model vocabulary. Requires the word2vec model from the package
    data directory; the test is skipped when it cannot be loaded.

    NOTE(review): another method with this exact name is defined right
    after this one. If both sit in the same class body, the later
    definition replaces this one and this body never runs. The two also
    disagree: this one expects 0.8 to 1 decimal place, the later one
    expects 0.7208 to 3 places, and 0.7208 is not within 1 rounded decimal
    place of 0.8. Decide which expectation is current and drop or rename
    the other.
    """
    testdir = os.path.dirname(os.path.realpath(__file__))
    modelfile = os.path.join(testdir, '..', 'potara', 'data',
                             'enwiki9stempos.model')

    try:
        model = gensim.models.word2vec.Word2Vec.load(modelfile)
    except Exception:
        # ``Exception`` (not a bare ``except``) so Ctrl-C and SystemExit
        # still propagate; skipTest makes the missing model visible.
        self.skipTest('word2vec model could not be loaded: %s' % modelfile)

    s1 = "This/T beauful/JJ sentence/NN is/V not/N right/JJ ./PUNCT"
    s2 = "This/T beautiful/JJ sentence/NN is/V wrong/JJ ./PUNCT"

    esim = 0.8
    psim = sm.w2v(s1, s2, model)
    self.assertAlmostEqual(esim, psim, places=1)
def test_w2v_notinvocab(self):
    """Check ``sm.w2v`` when a sentence contains the token ``beauful/JJ``.

    Judging by the test name, the misspelled token is presumably absent
    from the model vocabulary. Requires the word2vec model from the package
    data directory; the test is skipped when it cannot be loaded.

    NOTE(review): an earlier method with this exact name precedes this one.
    If both sit in the same class body, this later definition is the one
    that takes effect.
    """
    testdir = os.path.dirname(os.path.realpath(__file__))
    modelfile = os.path.join(testdir, '..', 'potara', 'data',
                             'enwiki9stempos.model')

    try:
        model = gensim.models.word2vec.Word2Vec.load(modelfile)
    except Exception:
        # ``Exception`` (not a bare ``except``) so Ctrl-C and SystemExit
        # still propagate; skipTest makes the missing model visible.
        self.skipTest('word2vec model could not be loaded: %s' % modelfile)

    s1 = "This/T beauful/JJ sentence/NN is/V not/N right/JJ ./PUNCT"
    s2 = "This/T beautiful/JJ sentence/NN is/V wrong/JJ ./PUNCT"

    esim = 0.7208
    psim = sm.w2v(s1, s2, model)
    self.assertAlmostEqual(esim, psim, places=3)
def test_w2v_untag(self):
    """Check ``sm.w2v`` on sentences whose tokens carry no POS tags.

    Requires the word2vec model from the package data directory; the test
    is skipped when it cannot be loaded.
    """
    testdir = os.path.dirname(os.path.realpath(__file__))
    modelfile = os.path.join(testdir, '..', 'potara', 'data',
                             'enwiki9stempos.model')

    try:
        model = gensim.models.word2vec.Word2Vec.load(modelfile)
    except Exception:
        # ``Exception`` (not a bare ``except``) so Ctrl-C and SystemExit
        # still propagate; skipTest makes the missing model visible.
        self.skipTest('word2vec model could not be loaded: %s' % modelfile)

    s1 = "This sentence is not right ."
    s2 = "This sentence is wrong ."

    # without tags we consider the intersection over min length
    esim = 4.0/5
    psim = sm.w2v(s1, s2, model)
    self.assertEqual(esim, psim)