def glove_via_magnitude(topn=500, min_similarity=None,
                        filename='glove.6B.100d.magnitude', lang='en_US'):
    """Build a training set of similar words for ambiguous units.

    For every ambiguous unit, query the GloVe vectors (via pymagnitude) for
    the nearest neighbours of each of its surface forms, and dump the
    resulting {unit, text} records to ``train/similars.json``.

    :param topn: number of nearest neighbours to fetch per surface form
    :param min_similarity: optional similarity floor passed to most_similar
    :param filename: magnitude file name under TOPDIR
    :param lang: language directory to write the training set into
    """
    from pymagnitude import Magnitude
    v = Magnitude(os.path.join(TOPDIR, filename))
    training_set = []
    units = set()
    for unit_list in classifier.ambiguous_units():
        for unit in unit_list[1]:
            units.add(unit)
    for unit in units:
        print('Processing {}...'.format(unit.name))
        name = unit.name
        # FIX: was ``set(unit.name)``, which iterates the *characters* of the
        # name (e.g. "cat" -> {'c', 'a', 't'}) and queried neighbours per
        # character. The intent is to seed the surface set with the full name.
        surfaces = {unit.name}
        if isinstance(unit, classes.Unit):
            surfaces.update(unit.surfaces)
            surfaces.update(unit.symbols)
        for surface in surfaces:
            neighbours = v.most_similar(
                v.query(surface), topn=topn, min_similarity=min_similarity)
            training_set.append({
                'unit': name,
                'text': ' '.join(neighbour[0] for neighbour in neighbours)
            })
    print('Done')
    with language.topdir(lang).joinpath('train/similars.json').open(
            'w', encoding='utf-8') as file:
        json.dump(training_set, file, sort_keys=True, indent=4)
def extract_wordvec_generalization(word, path_to_word_vectors, neighbor_number):
    """Extract the ``neighbor_number``-th nearest neighbor of ``word``.

    :param word: the word to generalize
    :param path_to_word_vectors: path to a serialized Magnitude vector file
    :param neighbor_number: 1-based rank of the neighbor to return
    :return: the key (string) of the requested nearest neighbor
    """
    vectors = Magnitude(path_to_word_vectors)
    try:
        # most_similar returns (key, similarity) pairs; take the key of the
        # neighbor at the requested (1-based) rank.
        generalized_attribute = vectors.most_similar(
            word, topn=neighbor_number)[neighbor_number - 1][0]
    finally:
        # FIX: the Magnitude handle was previously never closed, leaking the
        # underlying file/DB resources on every call.
        vectors.close()
    return generalized_attribute
def get_nearest_words():
    """Return words closely related to the supplied keywords.

    Reads a comma-separated ``keywords`` query parameter, looks up the ten
    nearest neighbours of each keyword in a local GloVe Magnitude file, and
    returns the union of neighbours (minus the keywords themselves) as JSON.
    These are displayed on the right panel of the concept screen.

    Testing:
        http://localhost:3001/api/get_nearest_words?keywords=lunch,slice,pie,pasta
    """
    keywords = request.args.get('keywords', '')
    from pymagnitude import Magnitude
    vectors = Magnitude('./pretrained_features/glove.6B.50d.magnitude')
    terms = keywords.split(',')
    # Union of neighbours; duplicates collapse naturally since closely
    # related concepts tend to share neighbours.
    related = set()
    for term in terms:
        # Most similar by key; each result is a (word, similarity) pair and
        # only the word itself is kept.
        for result in vectors.most_similar(term, topn=10):
            related.add(result[0])
    # Never echo the query terms back as "related" words.
    related = related - set(list(keywords.split(',')))
    return json.dumps(list(related))
class MagnitudeTest(unittest.TestCase):
    """Test suite for Magnitude vector files.

    The three MAGNITUDE_*_PATH class attributes are expected to be filled in
    (e.g. by the test runner) with paths to a plain, a subword-enabled, and an
    approximate-index Magnitude file respectively.
    """

    MAGNITUDE_PATH = ""
    MAGNITUDE_SUBWORD_PATH = ""
    MAGNITUDE_APPROX_PATH = ""

    def setUp(self):
        # One Magnitude handle per configuration under test.
        self.vectors = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                 case_insensitive=True,
                                 eager=True)
        self.vectors_cs = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                    case_insensitive=False,
                                    eager=False)
        self.vectors_sw = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                    case_insensitive=True,
                                    eager=False)
        self.vectors_approx = Magnitude(MagnitudeTest.MAGNITUDE_APPROX_PATH,
                                        case_insensitive=True,
                                        eager=False)
        self.tmp_vectors = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                     case_insensitive=True,
                                     eager=False)
        self.concat_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                  case_insensitive=True,
                                  eager=False)
        self.concat_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                  case_insensitive=True,
                                  eager=False)
        self.concat = Magnitude(self.concat_1, self.concat_2)
        self.vectors_feat = FeaturizerMagnitude(100, case_insensitive=True)
        # Reference vectors used by the query/concat tests below.
        self.v = {
            'padding': self.tmp_vectors._padding_vector(),
            'I': self.tmp_vectors.query("I"),
            'saw': self.tmp_vectors.query("saw"),
            'a': self.tmp_vectors.query("a"),
            'cat': self.tmp_vectors.query("cat"),
            'He': self.tmp_vectors.query("He"),
            'went': self.tmp_vectors.query("went"),
            'to': self.tmp_vectors.query("to"),
            'the': self.tmp_vectors.query("the"),
            'mall': self.tmp_vectors.query("mall"),
            'blah123': self.tmp_vectors.query("blah123")
        }

    def tearDown(self):
        self.vectors.close()
        self.vectors_cs.close()
        self.vectors_sw.close()
        # FIX: vectors_approx was opened in setUp but never closed, leaking
        # a file handle per test.
        self.vectors_approx.close()
        self.tmp_vectors.close()
        self.concat_1.close()
        self.concat_2.close()
        del self.concat
        self.vectors_feat.close()
        gc.collect()

    def test_length(self):
        self.assertEqual(len(self.vectors), 3000000)

    def test_dim(self):
        self.assertEqual(self.vectors.dim, 300)

    def test_index(self):
        self.assertTrue(isinstance(self.vectors[0][0], unicode))
        self.assertTrue(isinstance(self.vectors[0][1], np.ndarray))
        self.assertTrue(isinstance(self.vectors.index(0)[0], unicode))
        self.assertTrue(isinstance(self.vectors.index(0)[1], np.ndarray))
        self.assertTrue(
            isinstance(self.vectors.index(0, return_vector=False), unicode))

    def test_slice(self):
        sliced = self.vectors[0:5]
        self.assertEqual(len(sliced), 5)
        self.assertEqual(sliced[0][0], self.vectors[0][0])
        self.assertTrue(isclose(sliced[0][1], self.vectors[0][1]).all())

    def test_case_insensitive(self):
        some_keys_are_not_lower = False
        for i, (k, _) in enumerate(self.vectors):
            if i > 1000:
                break
            some_keys_are_not_lower = (
                some_keys_are_not_lower or k.lower() != k)
        self.assertTrue(some_keys_are_not_lower)
        self.assertTrue("QuEEn" in self.vectors)
        self.assertTrue("QUEEN" in self.vectors)
        self.assertTrue("queen" in self.vectors)
        self.assertTrue(
            isclose(self.vectors.query("Queen"),
                    self.vectors.query("QuEEn")).all())
        self.assertEqual(
            self.vectors.most_similar("I", return_similarities=False)[0],
            'myself')
        self.assertEqual(
            self.vectors.most_similar("i", return_similarities=False)[0],
            'ive')
        self.assertTrue(self.vectors.similarity("a", "A") > .9)

    def test_case_sensitive(self):
        some_keys_are_not_lower = False
        for i, (k, _) in enumerate(self.vectors_cs):
            if i > 1000:
                break
            some_keys_are_not_lower = (
                some_keys_are_not_lower or k.lower() != k)
        self.assertTrue(some_keys_are_not_lower)
        self.assertTrue("QuEEn" not in self.vectors_cs)
        self.assertTrue("QUEEN" in self.vectors_cs)
        self.assertTrue("queen" in self.vectors_cs)
        self.assertTrue(not isclose(self.vectors_cs.query("Queen"),
                                    self.vectors_cs.query("QuEEn")).all())
        self.assertEqual(
            self.vectors_cs.most_similar("I", return_similarities=False)[0],
            'myself')
        self.assertEqual(
            self.vectors_cs.most_similar("i", return_similarities=False)[0],
            'ive')
        self.assertTrue(self.vectors_cs.similarity("a", "A") > .9)

    def test_iter_case_insensitive(self):
        # Two passes to check the iterator can be restarted.
        for _ in range(2):
            for i, (k, v) in enumerate(self.vectors):
                if i > 1000:
                    break
                k2, v2 = self.vectors[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_iter_case_sensitive(self):
        for _ in range(2):
            for i, (k, v) in enumerate(self.vectors_cs):
                if i > 1000:
                    break
                k2, v2 = self.vectors_cs[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_index_case_insensitive(self):
        for _ in range(2):
            viter = iter(self.vectors)
            for i in range(len(self.vectors)):
                if i > 1000:
                    break
                k, v = next(viter)
                k2, v2 = self.vectors[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_index_case_sensitive(self):
        for _ in range(2):
            viter = iter(self.vectors_cs)
            for i in range(len(self.vectors_cs)):
                if i > 1000:
                    break
                k, v = next(viter)
                k2, v2 = self.vectors_cs[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_bounds(self):
        length = len(self.vectors)
        self.assertTrue(isinstance(self.vectors[length - 1][0], unicode))
        self.assertTrue(isinstance(self.vectors[length - 1][1], np.ndarray))

    @unittest.expectedFailure
    def test_out_of_bounds(self):
        length = len(self.vectors)
        self.assertTrue(isinstance(self.vectors[length][0], unicode))
        self.assertTrue(isinstance(self.vectors[length][1], np.ndarray))

    def test_contains(self):
        self.assertTrue("cat" in self.vectors)

    def test_contains_false(self):
        self.assertTrue("blah123" not in self.vectors)

    def test_special_characters(self):
        # Out-of-vocabulary keys with punctuation must still produce vectors
        # of the right shape.
        self.assertTrue("Wilkes-Barre/Scranton" in self.vectors)
        self.assertTrue("out-of-vocabulary" not in self.vectors)
        self.assertTrue('quotation"s' not in self.vectors)
        self.assertTrue("quotation's" not in self.vectors)
        self.assertTrue("colon;s" not in self.vectors)
        self.assertTrue("sh**" not in self.vectors)
        self.assertTrue("'s" not in self.vectors_cs)
        self.assertTrue('"s' not in self.vectors)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("Wilkes-Barre/Scranton").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("out-of-vocabulary").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query('quotation"s').shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("quotation's").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("colon;s").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("sh**").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors_cs.query("'s").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query('"s').shape)

    def test_oov_dim(self):
        self.assertEqual(
            self.vectors.query("*<<<<").shape,
            self.vectors.query("cat").shape)

    def test_oov_subword_dim(self):
        self.assertEqual(
            self.vectors_sw.query("*<<<<").shape,
            self.vectors_sw.query("cat").shape)

    def test_oov_dim_placeholders(self):
        self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                              placeholders=5,
                                              case_insensitive=True,
                                              eager=False)
        self.assertEqual(
            self.vectors_placeholders.query("*<<<<").shape,
            self.vectors_placeholders.query("cat").shape)
        self.assertTrue(
            isclose(
                self.vectors.query("*<<<<")[0],
                self.vectors_placeholders.query("*<<<<")[0]))
        self.vectors_placeholders.close()

    def test_oov_subword_dim_placeholders(self):
        self.vectors_placeholders = Magnitude(
            MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
            placeholders=5,
            case_insensitive=True,
            eager=False)
        self.assertEqual(
            self.vectors_placeholders.query("*<<<<").shape,
            self.vectors_placeholders.query("cat").shape)
        self.assertTrue(
            isclose(
                self.vectors.query("*<<<<")[0],
                self.vectors_placeholders.query("*<<<<")[0]))
        self.vectors_placeholders.close()

    def test_oov_unit_norm(self):
        self.assertTrue(
            isclose(np.linalg.norm(self.vectors.query("*<<<<<")), 1.0))

    def test_oov_subword_unit_norm(self):
        self.assertTrue(
            isclose(np.linalg.norm(self.vectors_sw.query("*<<<<<")), 1.0))

    def test_ngram_oov_closeness(self):
        self.assertTrue(self.vectors.similarity("uberx", "uberxl") > .7)
        self.assertTrue(self.vectors.similarity("uberx", "veryrandom") < .7)
        self.assertTrue(
            self.vectors.similarity("veryrandom", "veryrandom") > .7)

    def test_ngram_oov_subword_closeness(self):
        self.assertTrue(self.vectors_sw.similarity("uberx", "uberxl") > .7)
        self.assertTrue(self.vectors_sw.similarity("uberx", "uber") > .7)
        self.assertTrue(self.vectors_sw.similarity("uberxl", "uber") > .7)
        self.assertTrue(
            self.vectors_sw.similarity("discriminatoryy",
                                       "discriminatory") > .7)
        self.assertTrue(
            self.vectors_sw.similarity("discriminatoryy",
                                       "discriminnatory") > .8)
        self.assertTrue(self.vectors_sw.similarity("uberx", "veryrandom") < .7)
        self.assertTrue(
            self.vectors_sw.similarity("veryrandom", "veryrandom") > .7)
        self.assertTrue(self.vectors_sw.similarity("hiiiiiiiii", "hi") > .7)
        self.assertTrue(self.vectors_sw.similarity("heeeeeeeey", "hey") > .7)
        self.assertTrue(self.vectors_sw.similarity("heyyyyyyyyyy", "hey") > .7)
        self.assertTrue(self.vectors_sw.similarity("faaaaaate", "fate") > .65)

    def test_oov_values(self):
        # OOV vectors must be deterministic across independent handles.
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<")[0], -0.0372075283555))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<")[0], -0.0201727917272))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<<")[0], -0.0475993225776))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<<<")[0], 0.0129938352266))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<")[0], -0.0372075283555))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<<")[0], -0.0201727917272))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<<<")[0], -0.0475993225776))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<<<<")[0], 0.0129938352266))
        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_oov_subword_values(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.assertTrue(
            isclose(
                self.vectors_oov_1.query("discriminatoryy")[0],
                -0.0573252095591))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("uberx")[0], 0.0952671681336))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("misssipi")[0], 0.0577835297955))
        self.assertTrue(
            isclose(
                self.vectors_oov_2.query("discriminatoryy")[0],
                -0.0573252095591))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("uberx")[0], 0.0952671681336))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("misssipi")[0], 0.0577835297955))
        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_oov_stability(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        for i in range(5):
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<"),
                        self.vectors_oov_2.query("*<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<"),
                        self.vectors_oov_2.query("*<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<"),
                        self.vectors_oov_2.query("*<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<"),
                        self.vectors_oov_2.query("*<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<"),
                        self.vectors_oov_2.query("*<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<<")).all())
        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_ngram_oov_stability(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=True,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=True,
                                       eager=False)
        for i in range(5):
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<"),
                        self.vectors_oov_2.query("*<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<"),
                        self.vectors_oov_2.query("*<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<"),
                        self.vectors_oov_2.query("*<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<"),
                        self.vectors_oov_2.query("*<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<"),
                        self.vectors_oov_2.query("*<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<<")).all())
        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_ngram_oov_subword_stability(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       eager=False)
        for i in range(5):
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<"),
                        self.vectors_oov_2.query("*<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<"),
                        self.vectors_oov_2.query("*<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<"),
                        self.vectors_oov_2.query("*<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<"),
                        self.vectors_oov_2.query("*<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<"),
                        self.vectors_oov_2.query("*<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<<")).all())
        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_placeholders(self):
        self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                              case_insensitive=True,
                                              placeholders=5,
                                              eager=False)
        self.assertEqual(self.vectors_placeholders.query("cat").shape, (305, ))
        self.assertEqual(
            self.vectors_placeholders.query("cat")[0],
            self.vectors.query("cat")[0])
        self.vectors_placeholders.close()

    def test_numpy(self):
        self.assertTrue(isinstance(self.vectors.query("cat"), np.ndarray))

    def test_list(self):
        self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                      case_insensitive=True,
                                      use_numpy=False,
                                      eager=False)
        self.assertTrue(isinstance(self.vectors_list.query("cat"), list))
        self.vectors_list.close()

    def test_repeated_single(self):
        q = "cat"
        result = self.vectors.query(q)
        result_2 = self.vectors.query(q)
        self.assertTrue(isclose(result, result_2).all())

    def test_repeated_multiple(self):
        q = ["I", "saw", "a", "cat"]
        result = self.vectors.query(q)
        result_2 = self.vectors.query(q)
        self.assertTrue(isclose(result, result_2).all())
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q)
        result_2 = self.vectors.query(q)
        self.assertTrue(isclose(result, result_2).all())

    def test_multiple(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q)
        self.assertEqual(result.shape, (2, 5, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[0][3], self.v['cat']).all())
        self.assertTrue(isclose(result[0][4], self.v['padding']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        self.assertTrue(isclose(result[1][3], self.v['the']).all())
        self.assertTrue(isclose(result[1][4], self.v['mall']).all())
        return result

    def test_pad_to_length_right_truncate_none(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=6)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[0][3], self.v['cat']).all())
        self.assertTrue(isclose(result[0][4], self.v['padding']).all())
        self.assertTrue(isclose(result[0][5], self.v['padding']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        self.assertTrue(isclose(result[1][3], self.v['the']).all())
        self.assertTrue(isclose(result[1][4], self.v['mall']).all())
        self.assertTrue(isclose(result[1][5], self.v['padding']).all())
        return result

    def test_pad_to_length_truncate_none(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=6)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[0][3], self.v['cat']).all())
        self.assertTrue(isclose(result[0][4], self.v['padding']).all())
        self.assertTrue(isclose(result[0][5], self.v['padding']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        self.assertTrue(isclose(result[1][3], self.v['the']).all())
        self.assertTrue(isclose(result[1][4], self.v['mall']).all())
        self.assertTrue(isclose(result[1][5], self.v['padding']).all())
        return result

    def test_pad_to_length_left_truncate_none(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=6, pad_left=True)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['padding']).all())
        self.assertTrue(isclose(result[0][1], self.v['padding']).all())
        self.assertTrue(isclose(result[0][2], self.v['I']).all())
        self.assertTrue(isclose(result[0][3], self.v['saw']).all())
        self.assertTrue(isclose(result[0][4], self.v['a']).all())
        self.assertTrue(isclose(result[0][5], self.v['cat']).all())
        self.assertTrue(isclose(result[1][0], self.v['padding']).all())
        self.assertTrue(isclose(result[1][1], self.v['He']).all())
        self.assertTrue(isclose(result[1][2], self.v['went']).all())
        self.assertTrue(isclose(result[1][3], self.v['to']).all())
        self.assertTrue(isclose(result[1][4], self.v['the']).all())
        self.assertTrue(isclose(result[1][5], self.v['mall']).all())
        return result

    def test_pad_to_length_truncate_right(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=3)
        self.assertEqual(result.shape, (2, 3, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        return result

    def test_pad_to_length_truncate_left(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=3, truncate_left=True)
        self.assertEqual(result.shape, (2, 3, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['saw']).all())
        self.assertTrue(isclose(result[0][1], self.v['a']).all())
        self.assertTrue(isclose(result[0][2], self.v['cat']).all())
        self.assertTrue(isclose(result[1][0], self.v['to']).all())
        self.assertTrue(isclose(result[1][1], self.v['the']).all())
        self.assertTrue(isclose(result[1][2], self.v['mall']).all())
        return result

    def test_list_multiple(self):
        self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                      case_insensitive=True,
                                      use_numpy=False,
                                      eager=False)
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        self.assertTrue(isinstance(self.vectors_list.query(q[0]), list))
        self.assertTrue(
            isclose(self.vectors.query(q[0]),
                    asarray(self.vectors_list.query(q[0]))).all())
        self.assertTrue(isinstance(self.vectors_list.query(q), list))
        self.assertTrue(
            isclose(self.vectors.query(q),
                    asarray(self.vectors_list.query(q))).all())
        self.vectors_list.close()

    def test_concat(self):
        q = "cat"
        result = self.concat.query(q)
        self.assertEqual(result.shape, (self.vectors.dim * 2, ))
        self.assertTrue(isclose(result[0:300], self.v['cat']).all())
        self.assertTrue(isclose(result[300:600], self.v['cat']).all())

    def test_concat_multiple(self):
        q = ["I", "saw"]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][300:600], self.v['I']).all())
        self.assertTrue(isclose(result[1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[1][300:600], self.v['saw']).all())

    def test_concat_multiple_2(self):
        q = [["I", "saw"], ["He", "went"]]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][0][300:600], self.v['I']).all())
        self.assertTrue(isclose(result[0][1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[0][1][300:600], self.v['saw']).all())
        self.assertTrue(isclose(result[1][0][0:300], self.v['He']).all())
        self.assertTrue(isclose(result[1][0][300:600], self.v['He']).all())
        self.assertTrue(isclose(result[1][1][0:300], self.v['went']).all())
        self.assertTrue(isclose(result[1][1][300:600], self.v['went']).all())

    def test_concat_specific(self):
        # A tuple queries each concatenated model with a different key.
        q = ("cat", "mall")
        result = self.concat.query(q)
        self.assertEqual(result.shape, (self.vectors.dim * 2, ))
        self.assertTrue(isclose(result[0:300], self.v['cat']).all())
        self.assertTrue(isclose(result[300:600], self.v['mall']).all())

    def test_concat_multiple_specific(self):
        q = [("I", "He"), ("saw", "went")]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][300:600], self.v['He']).all())
        self.assertTrue(isclose(result[1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[1][300:600], self.v['went']).all())

    def test_concat_multiple_2_specific(self):
        q = [[("I", "He"), ("saw", "went")], [("He", "I"), ("went", "saw")]]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][0][300:600], self.v['He']).all())
        self.assertTrue(isclose(result[0][1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[0][1][300:600], self.v['went']).all())
        self.assertTrue(isclose(result[1][0][0:300], self.v['He']).all())
        self.assertTrue(isclose(result[1][0][300:600], self.v['I']).all())
        self.assertTrue(isclose(result[1][1][0:300], self.v['went']).all())
        self.assertTrue(isclose(result[1][1][300:600], self.v['saw']).all())

    def test_distance(self):
        self.assertTrue(
            isclose(self.vectors.distance("cat", "dog"), 0.69145405))

    def test_distance_multiple(self):
        self.assertTrue(
            isclose(self.vectors.distance("cat", ["cats", "dog"]),
                    [0.61654216, 0.69145405]).all())

    def test_similarity(self):
        self.assertTrue(
            isclose(self.vectors.similarity("cat", "dog"),
                    0.7609457089782209))

    def test_similarity_multiple(self):
        self.assertTrue(
            isclose(self.vectors.similarity("cat", ["cats", "dog"]),
                    [0.8099378824686305, 0.7609457089782209]).all())

    def test_most_similar_to_given(self):
        # Result should not depend on candidate order.
        self.assertEqual(
            self.vectors.most_similar_to_given(
                "cat", ["dog", "television", "laptop"]), "dog")
        self.assertEqual(
            self.vectors.most_similar_to_given(
                "cat", ["television", "dog", "laptop"]), "dog")
        self.assertEqual(
            self.vectors.most_similar_to_given(
                "cat", ["television", "laptop", "dog"]), "dog")

    def test_doesnt_match(self):
        self.assertEqual(
            self.vectors.doesnt_match(
                ["breakfast", "cereal", "lunch", "dinner"]), "cereal")
        self.assertEqual(
            self.vectors.doesnt_match(
                ["breakfast", "lunch", "cereal", "dinner"]), "cereal")
        self.assertEqual(
            self.vectors.doesnt_match(
                ["breakfast", "lunch", "dinner", "cereal"]), "cereal")

    def test_most_similar_case_insensitive(self):
        keys = [s[0] for s in self.vectors.most_similar("queen", topn=5)]
        similarities = [
            s[1] for s in self.vectors.most_similar("queen", topn=5)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar(self):
        keys = [s[0] for s in self.vectors_cs.most_similar("queen")]
        similarities = [s[1] for s in self.vectors_cs.most_similar("queen")]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587, 0.6163408160209656,
                        0.6060680150985718, 0.5923796892166138,
                        0.5908075571060181, 0.5637184381484985
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens',
            u'princess',
            u'king',
            u'monarch',
            u'very_pampered_McElhatton',
            u'Queen',
            u'NYC_anglophiles_aflutter',
            u'Queen_Consort',
            u'princesses',
            u'royal',
        ])

    def test_most_similar_no_similarities(self):
        keys = self.vectors_cs.most_similar("queen",
                                            return_similarities=False)
        self.assertEqual(keys, [
            u'queens',
            u'princess',
            u'king',
            u'monarch',
            u'very_pampered_McElhatton',
            u'Queen',
            u'NYC_anglophiles_aflutter',
            u'Queen_Consort',
            u'princesses',
            u'royal',
        ])

    def test_most_similar_top_5(self):
        keys = [s[0] for s in self.vectors_cs.most_similar("queen", topn=5)]
        similarities = [
            s[1] for s in self.vectors_cs.most_similar("queen", topn=5)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar_min_similarity(self):
        keys = [
            s[0]
            for s in self.vectors_cs.most_similar("queen", min_similarity=.63)
        ]
        similarities = [
            s[1]
            for s in self.vectors_cs.most_similar("queen", min_similarity=.63)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar_analogy(self):
        # king - man + woman ~= queen
        keys = [
            s[0] for s in self.vectors_cs.most_similar(positive=["king",
                                                                 "woman"],
                                                       negative=["man"])
        ]
        similarities = [
            s[1] for s in self.vectors_cs.most_similar(positive=["king",
                                                                 "woman"],
                                                       negative=["man"])
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7118192315101624, 0.6189674139022827,
                        0.5902431011199951, 0.549946129322052,
                        0.5377321243286133, 0.5236844420433044,
                        0.5235944986343384, 0.518113374710083,
                        0.5098593831062317, 0.5087411403656006
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queen', u'monarch', u'princess', u'crown_prince', u'prince',
            u'kings', u'Queen_Consort', u'queens', u'sultan', u'monarchy'
        ])

    def test_most_similar_cosmul_analogy(self):
        keys = [
            s[0] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"], negative=["man"])
        ]
        similarities = [
            s[1] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"], negative=["man"])
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.9314123392105103, 0.858533501625061,
                        0.8476565480232239, 0.8150269985198975,
                        0.809981644153595, 0.8089977502822876,
                        0.8027306795120239, 0.801961362361908,
                        0.8009798526763916, 0.7958389520645142
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queen', u'monarch', u'princess', u'Queen_Consort', u'queens',
            u'crown_prince', u'royal_palace', u'monarchy', u'prince',
            u'empress'
        ])

    def test_most_similar_cosmul_min_similarity_analogy(self):
        keys = [
            s[0] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"],
                negative=["man"],
                min_similarity=.81)
        ]
        similarities = [
            s[1] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"],
                negative=["man"],
                min_similarity=.81)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.9314123392105103, 0.858533501625061,
                        0.8476565480232239, 0.8150269985198975
                    ]),
                    atol=.02).all())
        self.assertEqual(keys,
                         [u'queen', u'monarch', u'princess',
                          u'Queen_Consort'])

    def test_closer_than(self):
        self.assertEqual(self.vectors.closer_than("cat", "dog"), ["cats"])

    def test_most_similar_approx(self):
        keys = [
            s[0]
            for s in self.vectors_approx.most_similar_approx("queen", topn=15)
        ]
        similarities = [
            s[1]
            for s in self.vectors_approx.most_similar_approx("queen", topn=15)
        ]
        self.assertEqual(len(keys), 15)
        self.assertTrue(similarities[0] > .7 and similarities[-1] > .5)

    @unittest.expectedFailure
    def test_most_similar_approx_failure(self):
        # Approximate queries require an approx-enabled Magnitude file.
        self.vectors.most_similar_approx("queen", topn=15)

    def test_most_similar_approx_low_effort(self):
        keys = [
            s[0] for s in self.vectors_approx.most_similar_approx(
                "queen", topn=15, effort=.1)
        ]
        self.assertEqual(len(keys), 15)
        self.assertEqual(keys[0], "princess")

    def test_most_similar_analogy_approx(self):
        keys = [
            s[0] for s in self.vectors_approx.most_similar_approx(
                positive=["king", "woman"], negative=["man"], topn=15)
        ]
        self.assertEqual(keys[0], "queen")

    def test_feat_length(self):
        self.vectors_feat_2 = FeaturizerMagnitude(1000, case_insensitive=True)
        self.assertEqual(self.vectors_feat.dim, 4)
        self.assertEqual(self.vectors_feat_2.dim, 5)
        self.vectors_feat_2.close()

    def test_feat_stability(self):
        self.vectors_feat_2 = FeaturizerMagnitude(100, case_insensitive=True)
        self.assertTrue(
            isclose(self.vectors_feat.query("VBG"),
                    self.vectors_feat_2.query("VBG")).all())
        self.assertTrue(
            isclose(self.vectors_feat.query("PRP"),
                    self.vectors_feat_2.query("PRP")).all())
        self.vectors_feat_2.close()

    def test_feat_values(self):
        self.assertTrue(
            isclose(self.vectors_feat.query("VBG")[0], 0.490634876828))
        self.assertTrue(
            isclose(self.vectors_feat.query("PRP")[0], 0.463890807802))
        self.assertTrue(isclose(
            self.vectors_feat.query(5)[0], -0.750681075834))
        self.assertTrue(
            isclose(self.vectors_feat.query(5)[-1], 1.46936807866e-38))
class EmbeddingEngine:
    """
    An interface to query pre-trained word vectors.

    Wraps two Magnitude embedding files (hidden-layer W and output-layer O
    vectors), a pre-trained gensim Phraser, and a formula dictionary, and
    exposes similarity / analogy style queries over materials-science text.
    """

    # Tokens that look like chemical formulas but are actually abbreviations
    # (e.g. "PV" = photovoltaic); they are excluded from self.formulas.
    # NOTE(review): contains duplicates ("OPV", "BCsIS") — harmless for the
    # membership tests below, but could be a set.
    ABBR_LIST = [
        "C41H11O11", "PV", "OPV", "PV12", "CsOS", "CsKPSV", "CsPS", "CsHIOS",
        "OPV", "CsPSV", "CsOPV", "CsIOS", "BCsIS", "CsPrS", "CEsH", "KP307",
        "AsOV", "CEsS", "COsV", "CNoO", "BEsF", "I2P3", "KP115", "BCsIS",
        "C9705IS", "ISC0501", "B349S", "CISe", "CISSe", "CsIPS", "CEsP",
        "BCsF", "CsFOS", "BCY10", "C12P", "EsHP", "CsHP", "C2K8", "CsOP",
        "EsHS", "CsHS", "C3P", "C50I", "CEs", "CSm", "BF", "EsN", "BN50S",
        "AsCP", "CPo", "LiPb17", "CsS", "EsIS", "AsCU", "CCsHS", "CsHPU",
        "AsOS", "AsCI", "EsF", "FV448", "CNS", "CP5", "AsFP", "EsOP", "NS",
        "NS2", "EsI", "BH", "PPmV", "PSe", "AsN", "OPV5", "NSiW", "CsHHS"
    ]

    def __init__(self,
                 embeddings_source=EMBEDDINGS,
                 out_embeddings_source=OUT_EMBEDDINGS,
                 formulas_source=FORMULAS,
                 phraser_source=PHRASER):
        """
        :param embeddings_source: can be instance of a Magnitude object or
            url or path to a serialized Magnitude object
        :param out_embeddings_source: can be instance of a Magnitude object
            or url or path to a serialized Magnitude object
        :param formulas_source: can be url or path to a JSON-serialized dict
            of formulae, if not supplied a default file is loaded
        :param phraser_source: path to a serialized gensim Phraser
        """
        # hidden layer embeddings (W); eager=False defers loading
        self.embeddings = Magnitude(embeddings_source, eager=False)
        # output layer embeddings (O)
        self.out_embeddings = Magnitude(out_embeddings_source)
        # load pre-trained formulas from embeddings
        with open(formulas_source, 'r') as f:
            self.formulas_with_abbreviations = load(f)
        self.dp = DataPreparation(local=False)
        self.es = ElasticConnection()
        # normalized formula -> {surface form: count}, abbreviations removed
        self.formulas = {
            k: v
            for k, v in self.formulas_with_abbreviations.items()
            if k not in self.ABBR_LIST
        }
        # total corpus occurrences per normalized formula
        self.formula_counts = {
            root_formula: sum(formulas.values())
            for root_formula, formulas in self.formulas.items()
        }
        # normalized formula -> its most frequent written form; bare element
        # symbols map to themselves
        self.most_common_forms = {
            formula_group_name:
            (formula_group_name if formula_group_name in self.dp.ELEMENTS else
             max(formulae.items(), key=operator.itemgetter(1))[0])
            for formula_group_name, formulae in
            self.formulas_with_abbreviations.items()
        }
        self.phraser = Phraser.load(phraser_source)

    def make_phrases(self, sentence, reps=2):
        """
        generates phrases from a sentence of words

        :param sentence: a list of tokens
        :param reps: how many times to combine the words (each pass can
            merge a pair of adjacent tokens into one phrase)
        :return: the phrased token list
        """
        while reps > 0:
            sentence = self.phraser[sentence]
            reps -= 1
        return sentence

    def prepare_wordphrase(self, wp, im=False):
        """
        Process a string into words and phrases according to existing
        embeddings

        :param wp: the string to process
        :param im: if True, will ignore missing words, otherwise will
            generate random vectors
        :return: a list of processed words and phrases
        """
        # tokenize, then merge known collocations into single "_" phrases
        processed_wp = self.make_phrases(
            self.dp.process_sentence(self.dp.text2sent(wp))[0])
        if im:
            # drop tokens without a pre-trained vector
            processed_wp = [
                pwp for pwp in processed_wp if pwp in self.embeddings
            ]
        return processed_wp

    def get_embedding(self, wordphrases, ignore_missing=False,
                      normalized=True):
        """
        Gets the embedding for the given word

        :param wordphrases: a string or a list of strings to request
            embedding for
        :param ignore_missing: if true, will ignore missing words, otherwise
            will query them using pymagnitude defult out of dictionary
            handling
        :param normalized: if False, returns non-normalized embeddings
            (True by default)
        :return: an embedding matrix with each row corresponding to a single
            processed word or phrase taken from wordphrases, as well as the
            lists of processed wordphrases
        """
        def get_single_embedding(wp, im=ignore_missing, norm=normalized):
            """
            Returns a single embedding vector for the given string

            :param wp: a string to get a single embedding for
            :param im: boolen to ignore missing words or return some random
                vectors if False
            :param norm: if False, returns the non-normalized embedding
                (True by default)
            :return: a single embedding vector for the string (could be a
                composite embedding)
            """
            processed_wordphrase = self.prepare_wordphrase(wp, im)
            if len(processed_wordphrase) > 0:
                # mean of the token vectors, then re-normalize the mean
                emb = np.mean(self.embeddings.query(processed_wordphrase,
                                                    normalized=norm),
                              axis=0)
                if norm:
                    emb = emb / np.linalg.norm(emb)
                emb = emb.tolist()
            else:
                # nothing survived processing: return a zero vector
                emb = [0] * self.embeddings.dim
            return emb, processed_wordphrase

        if not isinstance(wordphrases, list):
            wordphrases = [wordphrases]
        processed_wps = []
        embeddings = []
        try:
            for wordphrase in wordphrases:
                embedding, processed_wp = get_single_embedding(
                    wordphrase, im=ignore_missing)
                processed_wps.append(processed_wp)
                embeddings.append(embedding)
        except Exception as ex:
            # best-effort: a failure mid-list leaves partial results and
            # surfaces as a warning rather than an exception
            warnings.warn(ex)
        return embeddings, processed_wps

    def close_words(self,
                    positive,
                    negative=None,
                    top_k=8,
                    exclude_self=True,
                    ignore_missing=True):
        """
        Returns a list of close words

        :param positive: can be either a string or a list of strings
        :param negative: same as word, but will be treated with a minus sign
        :param top_k: number of close words to return
        :param exclude_self: boolean, if the supplied word should be
            excluded or not
        :param ignore_missing: ignore words that are missing from the
            vocabulary
        :return: (words, scores, processed_positive, processed_negative)
        """
        if negative is None:
            negative = []
        else:
            if not isinstance(negative, list):
                negative = [negative]
        processed_negative = []
        for n in negative:
            processed_negative += self.prepare_wordphrase(n,
                                                          im=ignore_missing)
        if not isinstance(positive, list):
            positive = [positive]
        processed_positive = []
        for p in positive:
            processed_positive += self.prepare_wordphrase(p,
                                                          im=ignore_missing)
        most_similar = self.embeddings.most_similar(
            processed_positive, negative=processed_negative, topn=top_k)
        if not exclude_self:
            # prepend the query itself with a perfect score, keep top_k total
            most_similar = [(processed_positive, 1.0)
                            ] + most_similar[:top_k - 1]
        words, scores = map(list, zip(*most_similar))
        return words, [float(s) for s in scores
                       ], processed_positive, processed_negative

    def find_similar_materials(self,
                               sentence,
                               n_sentence=None,
                               min_count=3,
                               use_output_emb=True,
                               ignore_missing=True):
        """
        Finds materials that match the best with the context of the sentence

        :param sentence: a list of words
        :param n_sentence: a list of words for a negative context
        :param min_count: the minimum number of occurrences for the formula
            to be included
        :param use_output_emb: if True, use output layer embedding (O)
            instead of inner layer embedding (W)
        :return: (sorted [(formula, score)], processed positive sentence,
            processed negative sentence)
        """
        positive_embeddings, processed_sentence = \
            self.get_embedding(sentence, ignore_missing=ignore_missing)
        n_sentence = n_sentence or []
        negative_embeddings, processed_n_sentence = \
            self.get_embedding(n_sentence, ignore_missing=ignore_missing)
        emb = self.out_embeddings if use_output_emb else self.embeddings
        # context vector = sum(positive) - sum(negative), unit-normalized
        sum_embedding = np.sum(np.asarray(positive_embeddings), axis=0) - \
            np.sum(np.asarray(negative_embeddings), axis=0)
        sum_embedding = sum_embedding / np.linalg.norm(sum_embedding)
        # formulas common enough to be above cut-off and that exist in
        # embedding
        formulas = [
            f for f, count in self.formula_counts.items()
            if (count > min_count) and (f in self.embeddings)
        ]
        # cosine similarity against every candidate formula at once
        similarity_scores = np.dot(emb.query(formulas, normalized=True),
                                   sum_embedding)
        similarities = {
            f: float(similarity_scores[i])
            for i, f in enumerate(formulas)
        }
        return sorted(similarities.items(),
                      key=lambda x: x[1],
                      reverse=True), \
            processed_sentence, processed_n_sentence

    def most_common_form(self, formulas):
        """
        Return the most common form of the formula given a list with tuples
        [("normalized formula": score), ...]

        :param formulas: the dictionary
        :return: a list of common forms with counts,
            [("common form", score, counts in text), ...]
        """
        common_form_score_count = []
        for formula in formulas:
            if formula[0] in self.dp.ELEMENTS:
                # bare elements keep their own symbol
                most_common_form = formula[0]
            else:
                most_common_form = max(self.formulas[formula[0]].items(),
                                       key=operator.itemgetter(1))[0]
            # NOTE(review): assumes formula[0] is a key of self.formulas —
            # an element filtered out via ABBR_LIST would raise KeyError here
            common_form_score_count.append(
                (most_common_form, formula[1],
                 sum(self.formulas[formula[0]].values())))
        return common_form_score_count

    def filter_by_elements(self,
                           formulas,
                           plus_elems=None,
                           minus_elems=None,
                           max=50):
        """
        Filter formulas according to the following rule: It has to have one
        of the plus_elements (if None all work), but it cannot have any of
        the minus_elems. If there is an overlap, the element is ignored

        :param formulas: a list of (formula, score) tuples
        :param plus_elems: the formula has to have at least one of these
        :param minus_elems: but cannot have any of these
        :param max: maximum number to return (shadows builtin max; kept for
            interface compatibility)
        :return: the filtered list of (formula, score) tuples
        """
        plus_elems = plus_elems or []
        minus_elems = minus_elems or []
        # elements named on both sides cancel each other out
        plus_elems, minus_elems = set(plus_elems) - set(minus_elems), set(
            minus_elems) - set(plus_elems)

        def has_plus(comp, pe):
            # True when no positive filter is set, or any element matches
            if pe is None or len(pe) == 0:
                return True
            for elem in comp:
                if elem in pe:
                    return True
            return False

        def has_minus(comp, me):
            # True when any element is on the exclusion list
            if me is None or len(me) == 0:
                return False
            for elem in comp:
                if elem in me:
                    return True
            return False

        matched = 0
        matched_formula = []
        for form in formulas:
            composition = self.dp.parser.parse_formula(form[0])
            if has_plus(composition, plus_elems) and not has_minus(
                    composition, minus_elems):
                matched_formula.append(form)
                matched += 1
                if matched >= max:
                    return matched_formula
        return matched_formula

    def mentioned_with(self, material, words):
        """
        Returns True if the supplied material was mentioned with any of the
        words in any of the abstracts. This is a very strict text search and
        is aimed at high recall. This method is used for discovery so having
        higher recall might hinder some discoveries but will avoid too many
        false positives. E.g. for material=CuTe and
        words=["thermoelectric"], "CuTe2 is thermoelectric" will return True
        since "CuTe" will be matched with "CuTe2" in text search. The word
        search is exact, so if the keyword was "thermo" it would not match
        "thermoelectric".

        :param material: A material formula (does not have to be normalized)
        :param words: List of processed words and phrases (words separated
            by "_") to search the text for co-occurrences
        :return: True if the material is mentioned with any of the words,
            False otherwise
        """
        norm_material = self.dp.get_norm_formula(
            material) if self.dp.is_simple_formula(material) else material
        # different ways the material is written
        variations = self.formulas[
            norm_material] if norm_material in self.formulas else [
                norm_material
            ]
        # build a boolean full-text query: (w1 OR w2) AND (v1 OR v2)
        variations = "(" + " OR ".join(variations) + ")"
        targets = "(" + " OR ".join(words) + ")"
        query = "{} AND {}".format(targets, variations)
        if self.es.count_matches(query) > 0:
            return True
        else:
            return False
from pymagnitude import Magnitude vectors = Magnitude('GoogleNews-vectors-negative300.magnitude') cat_vector = vectors.query('cat') print(cat_vector) print(vectors.similarity("cat", "dog")) print(vectors.most_similar("cat", topn=100)) def similarity(word1, word2): return vectors.similarity(word1, word2)
from pymagnitude import Magnitude #On production server, nginx will serve /static. Be careful with trailing / for folder specification -- don't use. app = Flask(__name__, static_folder="../static", static_url_path="/static") if app.debug: app.logger.info("Debug detected, enabling CORS for * origins") from flask_cors import CORS cors = CORS(app, resources={r"/api/*": {"origins": "*"}}) abspath = os.path.abspath(__file__) dname = os.path.dirname(abspath) # don't chdir or flask will falter # os.chdir(dname) print("Loading word vectors") vectorUtil = Magnitude(dname + "/../data/glove.6B.300d.magnitude") vectorUtil.most_similar("memory", topn=10) #to trigger initialization print("Word vectors loaded") @app.route("/") def index(): return send_from_directory("../public/", "index.html") @app.route("/api/words") def getSimilarWords(): query = request.args.get("query") print("Received query: " + query) try: paramNov = float(request.args.get("nov"))