class W2VSubs(Metaphor):
    """Metaphor generator that swaps words for embedding-space neighbours.

    Selected words of the input text are looked up in an ``Embeddings``
    store and each one is replaced by a randomly chosen entry from its list
    of closest neighbours.
    """

    def __init__(self, emb_info=None):
        """Create the generator and attach its ``Embeddings`` store.

        Args:
            emb_info: Mapping of embedding name to a settings dict (the
                default built here uses the keys ``'path'`` and ``'dim'``).
                When omitted or empty, the 50-dimensional GloVe file under
                ``BASE_DIR/data/glove.6B`` is used.
        """
        # ``None`` sentinel instead of a shared mutable ``{}`` default; an
        # explicitly passed empty dict still selects the default embeddings.
        if not emb_info:
            dim = 50
            emb_path = os.path.join(
                BASE_DIR, 'data/glove.6B/glove.6B.{}d.txt'.format(dim))
            emb_info = {
                'glove.6B.{}d'.format(dim): {'path': emb_path, 'dim': dim},
            }
        self.e = Embeddings('Embeddings', emb=emb_info)

    def _create(self, words_list, num_neighbors=5, fast_desired=False):
        """Return the closest neighbours for every word in *words_list*.

        Args:
            words_list: Words to look up in the embeddings store.
            num_neighbors: Number of neighbours requested per word.
            fast_desired: Forwarded unchanged to ``Embeddings.closest_n``
                (presumably selects a faster lookup strategy -- confirm
                against ``Embeddings``).

        Returns:
            Whatever ``Embeddings.closest_n`` returns; the rest of this
            class treats it as a mapping of word to candidate entries.
        """
        words_vec = self.e.get_vectors(words_list)
        return self.e.closest_n(
            words_vec, num_neighbors, fast_desired=fast_desired)

    def _reconstruct_core(self, text, closest_n):
        """Replace each key of *closest_n* found in *text* by a neighbour.

        Args:
            text: The string to rewrite.
            closest_n: Mapping of word to a sequence of candidate entries;
                the first element of an entry is the replacement word.

        Returns:
            *text* with every whole-word occurrence of a key replaced by one
            randomly picked candidate for that key.
        """
        # Draw one substitute per word up front, in mapping order.
        replacements = {}
        for word, candidates in closest_n.items():
            if len(candidates) == 0:
                # np.random.randint(0, 0) would raise ValueError; a word
                # without neighbours is simply left untouched.
                continue
            pick = candidates[np.random.randint(0, len(candidates))]
            replacements[word] = pick[0]

        if not replacements:
            return text

        # One combined pattern and a single pass over the text, so that a
        # freshly inserted substitute can never be replaced again by a later
        # entry. Words are escaped (they are data, not regex syntax) and the
        # longest alternatives come first so they win over shorter ones.
        alternatives = sorted(replacements, key=len, reverse=True)
        pattern = re.compile(
            r"\b(?:{})\b".format("|".join(re.escape(w) for w in alternatives)))
        # A callable replacement inserts the substitute literally, without
        # backslash / group-reference processing.
        return pattern.sub(lambda match: replacements[match.group(0)], text)

    def metaphorize(self, text=None, **kwargs):
        """Build a metaphor for *text* by substituting neighbouring words.

        Args:
            text: The source text; passed to ``self._deconstruct`` and
                ``self._reconstruct`` (neither is defined in this class --
                presumably inherited from ``Metaphor``).
            **kwargs: Recognised options are ``num_neighbors`` (default 5),
                ``fast_desired`` (default ``False``) and ``correct``
                (default ``False``, forwarded to ``self._reconstruct``).
                Other keys are ignored.

        Returns:
            The value returned by ``self._reconstruct``.
        """
        num_neighbors = kwargs.pop('num_neighbors', 5)
        fast_desired = kwargs.pop('fast_desired', False)
        correct = kwargs.pop('correct', False)

        # Only nouns, adjectives and adverbs are candidates for substitution.
        words_tagged = self._deconstruct(text, PoS={'NOUN', 'ADJ', 'ADV'})
        words_list = [word for word, _ in words_tagged]
        closest_n = self._create(
            words_list, num_neighbors=num_neighbors, fast_desired=fast_desired)
        return self._reconstruct(correct, text, closest_n)
def test_embeddings_singleton(self):
    """A configured and a bare ``Embeddings('Embeddings')`` compare equal."""
    glove_path = os.path.join(BASE_DIR, 'data/glove.6B/glove.6B.50d.txt')
    config = {'glove.6B.50d': {'path': glove_path, 'dim': 50}}

    # First construction supplies the embedding settings, the second none.
    configured = Embeddings('Embeddings', emb=config)
    bare = Embeddings('Embeddings')

    self.assertEqual(configured, bare)
def test_closest_n(self):
    """Check specific neighbours of 'sun' and their scores (3 places)."""
    glove_path = os.path.join(BASE_DIR, 'data/glove.6B/glove.6B.50d.txt')
    store = Embeddings(
        'Embeddings', {'glove.6B.50d': {'path': glove_path, 'dim': 50}})

    sun_vectors = store.get_vectors(['sun'])
    neighbours = store.closest_n(sun_vectors, 5)

    # (rank, expected word, expected score) for the entries under test.
    expected = (
        (0, 'sky', 0.6626),
        (2, 'bright', 0.6353),
    )
    for rank, word, score in expected:
        self.assertEqual(neighbours['sun'][rank][0], word)
        self.assertAlmostEqual(neighbours['sun'][rank][1], score, 3)
def test_word_combination(self):
    """Exploratory run: do neighbours change with the word context?

    Recorded result from the original author: not enough. The test makes
    no assertions; it only exercises ``combine_words`` and ``closest_n``.
    """
    d = 50
    glove_path = os.path.join(
        BASE_DIR, 'data/glove.6B/glove.6B.{}d.txt'.format(d))
    store = Embeddings(
        'Embeddings', {'glove.6B.50d': {'path': glove_path, 'dim': d}})

    negative_context = ["man", "bad", "dirty"]
    positive_context = ["man", "good", "clean"]

    x = 2
    while x < 11:
        combined_neg = store.combine_words(negative_context, x=x)
        combined_pos = store.combine_words(positive_context, x=x)
        # Results are computed but not inspected.
        neighbours_neg = store.closest_n(combined_neg, 5)
        neighbours_pos = store.closest_n(combined_pos, 5)
        x += 1
def test_closest_n_modes(self):
    """The ``fast_desired=True`` lookup must run faster than the default."""
    emb_name = 'glove.6B.50d'
    file_path = os.path.join(BASE_DIR, 'data/glove.6B/glove.6B.50d.txt')
    e = Embeddings(
        'Embeddings',
        {
            emb_name: {
                'path': file_path,
                'dim': 50,
                'similarities_dim': 2000,
            },
        })
    # Request the similarities explicitly when the store does not report
    # any for this embedding after construction.
    if not e.similarities.get(emb_name):
        e.add_embeddings(emb={emb_name: {'similarities_dim': 2000}})

    words = e.get_vectors(['sun', 'beautiful', 'ugly', 'mother'])

    # perf_counter() is monotonic and high-resolution; the wall clock
    # (time.time()) can be too coarse to compare two short intervals.
    start = time.perf_counter()
    e.closest_n(words, 5)
    after_default = time.perf_counter()
    e.closest_n(words, 5, fast_desired=True)
    after_fast = time.perf_counter()

    self.assertLess(after_fast - after_default, after_default - start)
def test_neighbours_strategy(self):
    """Fast lookup must beat the default one and agree on >= 90% of words."""
    d = 50
    # Build the embedding name once so every lookup uses the same key.
    emb_name = 'glove.6B.{}d'.format(d)
    file_path = os.path.join(BASE_DIR, 'data/glove.6B/{}.txt'.format(emb_name))
    e = Embeddings(
        'Embeddings',
        {
            emb_name: {
                'path': file_path,
                'dim': d,
                'sim_index': True,
            },
        })
    # Request the similarity index explicitly when the store does not
    # report one for this embedding after construction.
    if not e.sim_index.get(emb_name):
        e.add_embeddings(emb={emb_name: {'sim_index': True}})

    # Result unused; the call is kept in case get_E() populates state
    # lazily -- TODO confirm and drop if it is a pure accessor.
    e.get_E()

    words = [
        "war", "child", "mom", "ball", "astral",
        "eleven", "me", "tennis", "playful", "red",
    ]
    words_vec = e.get_vectors(words)

    # perf_counter() is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    res_fast = e.closest_n(words_vec, 10, fast_desired=True)
    after_fast = time.perf_counter()
    res_default = e.closest_n(words_vec, 10, fast_desired=False)
    after_default = time.perf_counter()

    self.assertLess(after_fast - start, after_default - after_fast)
    # assertEquals is a deprecated alias that was removed in Python 3.12.
    self.assertEqual(res_fast.keys(), res_default.keys())

    for key, fast_entries in res_fast.items():
        # Each entry unpacks into two fields; only the words are compared.
        fast_words, _ = zip(*fast_entries)
        default_words, _ = zip(*res_default[key])
        in_count = sum(w in default_words for w in fast_words)
        # we request 90% similarity
        self.assertGreaterEqual(in_count / len(fast_words), 0.9)
def __init__(self, emb_info=None):
    """Attach an ``Embeddings`` store to the instance as ``self.e``.

    Args:
        emb_info: Mapping of embedding name to a settings dict (the default
            built here uses the keys ``'path'`` and ``'dim'``). When omitted
            or empty, the 50-dimensional GloVe file under
            ``BASE_DIR/data/glove.6B`` is used.
    """
    # ``None`` sentinel instead of a shared mutable ``{}`` default; an
    # explicitly passed empty dict still selects the default embeddings.
    if not emb_info:
        dim = 50
        emb_path = os.path.join(
            BASE_DIR, 'data/glove.6B/glove.6B.{}d.txt'.format(dim))
        emb_info = {
            'glove.6B.{}d'.format(dim): {'path': emb_path, 'dim': dim},
        }
    self.e = Embeddings('Embeddings', emb=emb_info)
def test_embeddings_addition(self):
    """Embeddings added after construction are stored and readable."""
    glove_path = os.path.join(BASE_DIR, 'data/glove.6B/glove.6B.50d.txt')

    store = Embeddings('Embeddings')
    store.add_embeddings({'glove.6B.50d': {'path': glove_path, 'dim': 50}})

    # The store must no longer be empty once the embedding is added.
    self.assertNotEqual({}, store.embeddings)

    # Spot-check one value of the 'house' row (label 41), to 3 places.
    house_row = store.get_E().loc['house']
    self.assertAlmostEqual(-0.388916, house_row[41], 3)