コード例 #1
0
ファイル: w2v_subs.py プロジェクト: guiem/metaphor
class W2VSubs(Metaphor):
    """Metaphor strategy that swaps words for neighbours in an embedding space.

    Words tagged NOUN, ADJ or ADV are looked up through ``Embeddings`` and
    each one is replaced in the text by a randomly picked close neighbour.
    """

    def __init__(self, emb_info=None):
        """Create the strategy and its ``Embeddings`` handle.

        :param emb_info: mapping of embedding name to its settings
            (``path`` and ``dim``). When omitted or empty, the 50-dimensional
            GloVe 6B file under ``BASE_DIR`` is used.
        """
        # ``None`` (not ``{}``) is the default so that no dict object is
        # shared between calls; an empty dict still selects the fallback.
        if not emb_info:
            dim = 50
            emb_path = os.path.join(BASE_DIR, 'data/glove.6B/glove.6B.{}d.txt'.format(dim))
            emb_info = {'glove.6B.{}d'.format(dim): {'path': emb_path, 'dim': dim}}
        self.e = Embeddings('Embeddings', emb=emb_info)

    def _create(self, words_list, num_neighbors=5, fast_desired=False):
        """Return the closest neighbours for every word in ``words_list``.

        :param words_list: words to look up.
        :param num_neighbors: number of neighbours requested per word.
        :param fast_desired: forwarded unchanged to ``Embeddings.closest_n``.
        :return: whatever ``Embeddings.closest_n`` returns; it is consumed as
            a mapping ``word -> sequence of neighbour entries``.
        """
        words_vec = self.e.get_vectors(words_list)
        return self.e.closest_n(words_vec, num_neighbors, fast_desired=fast_desired)

    def _reconstruct_core(self, text, closest_n):
        """Replace each key of ``closest_n`` in ``text`` by a random neighbour.

        :param text: text in which the substitutions are made.
        :param closest_n: mapping ``word -> sequence of entries`` where the
            first element of an entry is the replacement word.
        :return: the text after all substitutions.
        """
        for w, closest in closest_n.items():
            if len(closest) == 0:
                # Nothing to substitute with; randint(0, 0) would raise.
                continue
            substitute = closest[np.random.randint(0, len(closest))]
            # Escape the word so regex metacharacters in it match literally.
            pattern = r"\b{}\b".format(re.escape(w))
            # A callable replacement inserts the neighbour verbatim, so a
            # backslash in it is not parsed as a replacement-template escape.
            text = re.sub(pattern, lambda _match, repl=substitute[0]: repl, text)
        return text

    def metaphorize(self, text=None, **kwargs):
        """Build a metaphor out of ``text`` by substituting selected words.

        :param text: input text.
        :param kwargs: optional ``num_neighbors`` (default 5),
            ``fast_desired`` (default False) and ``correct`` (default False).
            ``correct`` is handed to ``_reconstruct``, which is not defined
            in this class (presumably inherited from ``Metaphor``).
        :return: the value returned by ``_reconstruct``.
        """
        num_neighbors = kwargs.pop('num_neighbors', 5)
        fast_desired = kwargs.pop('fast_desired', False)
        correct = kwargs.pop('correct', False)
        words_tagged = self._deconstruct(text, PoS={'NOUN', 'ADJ', 'ADV'})
        words_list = [w for w, _tag in words_tagged]
        closest_n = self._create(words_list, num_neighbors=num_neighbors, fast_desired=fast_desired)
        return self._reconstruct(correct, text, closest_n)
コード例 #2
0
 def test_embeddings_singleton(self):
     """Two ``Embeddings('Embeddings')`` constructions must compare equal."""
     glove_path = os.path.join(BASE_DIR, 'data/glove.6B/glove.6B.50d.txt')
     glove_settings = {'glove.6B.50d': {'path': glove_path, 'dim': 50}}
     configured = Embeddings('Embeddings', emb=glove_settings)
     # The second construction passes no embedding settings at all.
     bare = Embeddings('Embeddings')
     self.assertEqual(configured, bare)
コード例 #3
0
 def test_closest_n(self):
     """Check the first and third neighbour entries returned for 'sun'."""
     glove_path = os.path.join(BASE_DIR, 'data/glove.6B/glove.6B.50d.txt')
     settings = {'glove.6B.50d': {'path': glove_path, 'dim': 50}}
     emb = Embeddings('Embeddings', settings)
     sun_vectors = emb.get_vectors(['sun'])
     neighbours = emb.closest_n(sun_vectors, 5)['sun']
     # (position in the neighbour list, expected word, expected second element)
     expectations = ((0, 'sky', 0.6626), (2, 'bright', 0.6353))
     for position, word, value in expectations:
         self.assertEqual(neighbours[position][0], word)
         self.assertAlmostEqual(neighbours[position][1], value, 3)
コード例 #4
0
    def test_word_combination(self):
        """Exploratory run: combine word lists and query their neighbours.

        Question explored: do the neighbours change according to a word
        context? Recorded result: not enough. No assertion is made.
        """
        dim = 50
        glove_path = os.path.join(
            BASE_DIR, 'data/glove.6B/glove.6B.{}d.txt'.format(dim))
        settings = {'glove.6B.50d': {'path': glove_path, 'dim': dim}}
        emb = Embeddings('Embeddings', settings)
        negative_words = ["man", "bad", "dirty"]
        positive_words = ["man", "good", "clean"]

        for x_value in range(2, 11):
            combined_neg = emb.combine_words(negative_words, x=x_value)
            combined_pos = emb.combine_words(positive_words, x=x_value)
            # Results are not inspected; the calls only have to complete.
            emb.closest_n(combined_neg, 5)
            emb.closest_n(combined_pos, 5)
コード例 #5
0
 def test_closest_n_modes(self):
     """Time ``closest_n`` with and without ``fast_desired=True``.

     Passes when the ``fast_desired=True`` call takes less wall-clock time
     than the default call on the same vectors.
     """
     emb_name = 'glove.6B.50d'
     glove_path = os.path.join(BASE_DIR, 'data/glove.6B/glove.6B.50d.txt')
     settings = {
         emb_name: {'path': glove_path, 'dim': 50, 'similarities_dim': 2000},
     }
     emb = Embeddings('Embeddings', settings)
     # Request the similarities explicitly when the handle has none yet.
     if not emb.similarities.get(emb_name):
         emb.add_embeddings(emb={emb_name: {'similarities_dim': 2000}})
     vectors = emb.get_vectors(['sun', 'beautiful', 'ugly', 'mother'])

     started = time.time()
     emb.closest_n(vectors, 5)
     default_done = time.time()
     emb.closest_n(vectors, 5, fast_desired=True)
     fast_done = time.time()

     self.assertLess(fast_done - default_done, default_done - started)
コード例 #6
0
 def test_neighbours_strategy(self):
     """Compare ``closest_n`` with ``fast_desired=True`` against the default.

     The ``fast_desired=True`` call must take less wall-clock time, return
     the same keys, and share at least 90% of the neighbour words per key.
     """
     d = 50
     emb_name = 'glove.6B.{}d'.format(d)
     file_path = os.path.join(BASE_DIR,
                              'data/glove.6B/glove.6B.{}d.txt'.format(d))
     e = Embeddings(
         'Embeddings', {
             emb_name: {
                 'path': file_path,
                 'dim': d,
                 'sim_index': True
             }
         })
     # Request the index explicitly when the handle does not have one yet.
     if not e.sim_index.get(emb_name):
         e.add_embeddings(emb={emb_name: {'sim_index': True}})
     # The returned value is not used; the call is kept in case it has a
     # loading side effect that influences the timings below (TODO confirm).
     e.get_E()
     words = [
         "war", "child", "mom", "ball", "astral", "eleven", "me", "tennis",
         "playful", "red"
     ]
     words_vec = e.get_vectors(words)
     ping = time.time()
     res1 = e.closest_n(words_vec, 10, fast_desired=True)
     pong = time.time()
     res2 = e.closest_n(words_vec, 10, fast_desired=False)
     peng = time.time()
     self.assertLess(pong - ping, peng - pong)
     # assertEquals is a deprecated alias that was removed in Python 3.12.
     self.assertEqual(res1.keys(), res2.keys())
     for k, values in res1.items():
         # Entries are (word, value) pairs; only the words are compared.
         words1, _ = zip(*values)
         words2, _ = zip(*res2[k])
         in_count = sum(1 for w in words1 if w in words2)
         self.assertGreaterEqual(in_count / len(words1),
                                 0.9)  # we request 90% similarity
コード例 #7
0
ファイル: w2v_subs.py プロジェクト: guiem/metaphor
 def __init__(self, emb_info=None):
     """Create the ``Embeddings`` handle used by this object.

     :param emb_info: mapping of embedding name to its settings (``path``
         and ``dim``). When omitted or empty, the 50-dimensional GloVe 6B
         file under ``BASE_DIR`` is used.
     """
     # ``None`` (not ``{}``) is the default so that no dict object is shared
     # between calls; an empty dict still selects the fallback below.
     if not emb_info:
         dim = 50
         emb_path = os.path.join(BASE_DIR, 'data/glove.6B/glove.6B.{}d.txt'.format(dim))
         emb_info = {'glove.6B.{}d'.format(dim): {'path': emb_path, 'dim': dim}}
     self.e = Embeddings('Embeddings', emb=emb_info)
コード例 #8
0
 def test_embeddings_addition(self):
     """Adding an embedding must populate ``embeddings`` and expose its values."""
     glove_path = os.path.join(BASE_DIR, 'data/glove.6B/glove.6B.50d.txt')
     emb = Embeddings('Embeddings')
     emb.add_embeddings({'glove.6B.50d': {'path': glove_path, 'dim': 50}})
     self.assertNotEqual({}, emb.embeddings)
     # Spot-check one stored value: entry 41 of the 'house' row.
     house_row = emb.get_E().loc['house']
     self.assertAlmostEqual(-0.388916, house_row[41], 3)