# Example 1
def glove_via_magnitude(topn=500,
                        min_similarity=None,
                        filename='glove.6B.100d.magnitude',
                        lang='en_US'):
    """Build a similar-words training set from GloVe vectors.

    For every ambiguous unit reported by the classifier, query the
    Magnitude model for the nearest neighbours of each of the unit's
    surface forms and write the result to ``train/similars.json``.

    :param topn: number of neighbours to fetch per surface form.
    :param min_similarity: optional similarity cutoff passed to Magnitude.
    :param filename: Magnitude model file located under TOPDIR.
    :param lang: language code used to locate the output directory.
    """
    from pymagnitude import Magnitude

    v = Magnitude(os.path.join(TOPDIR, filename))
    training_set = []
    units = set()
    for unit_list in classifier.ambiguous_units():
        for unit in unit_list[1]:
            units.add(unit)
    for unit in units:
        print('Processing {}...'.format(unit.name))

        name = unit.name
        # BUG FIX: set(unit.name) iterated the *characters* of the name;
        # the surface set must be seeded with the whole name string.
        surfaces = {name}
        if isinstance(unit, classes.Unit):
            surfaces.update(unit.surfaces)
            surfaces.update(unit.symbols)
        for surface in surfaces:
            neighbours = v.most_similar(
                v.query(surface), topn=topn, min_similarity=min_similarity)
            training_set.append({
                'unit': name,
                'text': ' '.join(neighbour[0] for neighbour in neighbours)
            })
    print('Done')

    # `outfile` avoids shadowing the builtin `file` (Python 2 heritage).
    with language.topdir(lang).joinpath('train/similars.json').open(
            'w', encoding='utf-8') as outfile:
        json.dump(training_set, outfile, sort_keys=True, indent=4)
# Example 2
def extract_wordvec_generalization(word, path_to_word_vectors,
                                   neighbor_number):
    """Return the neighbor_number-th nearest neighbor of *word* in vector space."""
    embedding = Magnitude(path_to_word_vectors)
    neighbors = embedding.most_similar(word, topn=neighbor_number)
    # most_similar yields (word, similarity) pairs; keep only the word.
    return neighbors[neighbor_number - 1][0]
# Example 3
def get_nearest_words():
    """
    provides words closely related to the keywords

    Parameters:
      keywords -- an array of words closely related to the concept
    Returns:
      closest_words -- these are displayed on the right panel of the concept screen
    Testing:
      http://localhost:3001/api/get_nearest_words?keywords=lunch,slice,pie,pasta
    """
    keywords = request.args.get('keywords', '')

    from pymagnitude import Magnitude
    # Streaming alternatives (remote models) kept for reference:
    #vectors = Magnitude('http://magnitude.plasticity.ai/word2vec/heavy/GoogleNews-vectors-negative300.magnitude', stream=True) # full url for streaming from 10GB model
    #vectors = Magnitude('http://magnitude.plasticity.ai/glove/light/glove.6B.50d.magnitude', stream=True)
    vectors = Magnitude('./pretrained_features/glove.6B.50d.magnitude')

    # Split once and reuse (the original split the string twice).
    keyword_list = keywords.split(',')

    # there is likely overlap if the concepts words are closely related
    closest_words = set()
    for keyword in keyword_list:
        results = vectors.most_similar(keyword, topn=10)  # Most similar by key
        #vectors.most_similar(vectors.query(k), topn = 100) # Most similar by vector
        for result in results:
            # just add the word, not the word's probability
            closest_words.add(result[0])
    # Drop the query words themselves (redundant list() wrapper removed).
    closest_words -= set(keyword_list)
    return json.dumps(list(closest_words))
# Example 4
class MagnitudeTest(unittest.TestCase):
    MAGNITUDE_PATH = ""
    MAGNITUDE_SUBWORD_PATH = ""
    MAGNITUDE_APPROX_PATH = ""

    def setUp(self):
        """Open every Magnitude fixture the tests share."""
        main_path = MagnitudeTest.MAGNITUDE_PATH
        subword_path = MagnitudeTest.MAGNITUDE_SUBWORD_PATH
        approx_path = MagnitudeTest.MAGNITUDE_APPROX_PATH
        self.vectors = Magnitude(main_path, case_insensitive=True, eager=True)
        self.vectors_cs = Magnitude(main_path,
                                    case_insensitive=False,
                                    eager=False)
        self.vectors_sw = Magnitude(subword_path,
                                    case_insensitive=True,
                                    eager=False)
        self.vectors_approx = Magnitude(approx_path,
                                        case_insensitive=True,
                                        eager=False)
        self.tmp_vectors = Magnitude(main_path,
                                     case_insensitive=True,
                                     eager=False)
        self.concat_1 = Magnitude(main_path,
                                  case_insensitive=True,
                                  eager=False)
        self.concat_2 = Magnitude(main_path,
                                  case_insensitive=True,
                                  eager=False)
        self.concat = Magnitude(self.concat_1, self.concat_2)
        self.vectors_feat = FeaturizerMagnitude(100, case_insensitive=True)
        # Reference vectors used by the query/concat assertions below.
        words = ("I", "saw", "a", "cat", "He", "went", "to", "the", "mall",
                 "blah123")
        self.v = {word: self.tmp_vectors.query(word) for word in words}
        self.v['padding'] = self.tmp_vectors._padding_vector()

    def tearDown(self):
        """Close every model opened in setUp and force a GC pass."""
        for model in (self.vectors, self.vectors_cs, self.vectors_sw,
                      self.tmp_vectors, self.concat_1, self.concat_2):
            model.close()
        del self.concat
        self.vectors_feat.close()
        gc.collect()

    def test_length(self):
        """The full model should expose exactly three million keys."""
        expected_size = 3000000
        self.assertEqual(len(self.vectors), expected_size)

    def test_dim(self):
        """Vectors in the full model are 300-dimensional."""
        expected_dim = 300
        self.assertEqual(self.vectors.dim, expected_dim)

    def test_index(self):
        """__getitem__ and index() both yield a (unicode, ndarray) pair."""
        for key, vector in (self.vectors[0], self.vectors.index(0)):
            self.assertTrue(isinstance(key, unicode))
            self.assertTrue(isinstance(vector, np.ndarray))
        # return_vector=False yields just the key.
        self.assertTrue(
            isinstance(self.vectors.index(0, return_vector=False), unicode))

    def test_slice(self):
        """A slice returns the same entries as element-wise indexing."""
        window = self.vectors[0:5]
        self.assertEqual(len(window), 5)
        first_key, first_vector = window[0]
        self.assertEqual(first_key, self.vectors[0][0])
        self.assertTrue(isclose(first_vector, self.vectors[0][1]).all())

    def test_case_insensitive(self):
        """Case-insensitive model matches keys regardless of casing."""
        # The raw key set must actually contain mixed-case entries.
        mixed_case_seen = False
        for position, (key, _) in enumerate(self.vectors):
            if position > 1000:
                break
            if key != key.lower():
                mixed_case_seen = True
        self.assertTrue(mixed_case_seen)
        for spelling in ("QuEEn", "QUEEN", "queen"):
            self.assertTrue(spelling in self.vectors)
        self.assertTrue(
            isclose(self.vectors.query("Queen"),
                    self.vectors.query("QuEEn")).all())
        self.assertEqual(
            self.vectors.most_similar("I", return_similarities=False)[0],
            'myself')
        self.assertEqual(
            self.vectors.most_similar("i", return_similarities=False)[0],
            'ive')
        self.assertTrue(self.vectors.similarity("a", "A") > .9)

    def test_case_sensitive(self):
        """Case-sensitive model only matches exact-cased keys."""
        mixed_case_seen = False
        for position, (key, _) in enumerate(self.vectors_cs):
            if position > 1000:
                break
            if key != key.lower():
                mixed_case_seen = True
        self.assertTrue(mixed_case_seen)
        # Only spellings actually present in the vocabulary are found.
        self.assertTrue("QuEEn" not in self.vectors_cs)
        for spelling in ("QUEEN", "queen"):
            self.assertTrue(spelling in self.vectors_cs)
        self.assertTrue(not isclose(self.vectors_cs.query("Queen"),
                                    self.vectors_cs.query("QuEEn")).all())
        self.assertEqual(
            self.vectors_cs.most_similar("I", return_similarities=False)[0],
            'myself')
        self.assertEqual(
            self.vectors_cs.most_similar("i", return_similarities=False)[0],
            'ive')
        self.assertTrue(self.vectors_cs.similarity("a", "A") > .9)

    def test_iter_case_insensitive(self):
        """Iteration agrees with integer indexing over two passes."""
        for _ in range(2):
            # Compare the first 1001 iterated entries against indexing.
            for position, (key, vector) in zip(range(1001), self.vectors):
                expected_key, expected_vector = self.vectors[position]
                self.assertEqual(key, expected_key)
                self.assertTrue(isclose(vector[0], expected_vector[0]))

    def test_iter_case_sensitive(self):
        """Case-sensitive iteration agrees with integer indexing."""
        for _ in range(2):
            for position, (key, vector) in zip(range(1001), self.vectors_cs):
                expected_key, expected_vector = self.vectors_cs[position]
                self.assertEqual(key, expected_key)
                self.assertTrue(isclose(vector[0], expected_vector[0]))

    def test_index_case_insensitive(self):
        """Manually advancing an iterator matches indexed access."""
        for _ in range(2):
            entries = iter(self.vectors)
            upper = min(len(self.vectors), 1001)
            for position in range(upper):
                key, vector = next(entries)
                expected_key, expected_vector = self.vectors[position]
                self.assertEqual(key, expected_key)
                self.assertTrue(isclose(vector[0], expected_vector[0]))

    def test_index_case_sensitive(self):
        """Case-sensitive iterator advance matches indexed access."""
        for _ in range(2):
            entries = iter(self.vectors_cs)
            upper = min(len(self.vectors_cs), 1001)
            for position in range(upper):
                key, vector = next(entries)
                expected_key, expected_vector = self.vectors_cs[position]
                self.assertEqual(key, expected_key)
                self.assertTrue(isclose(vector[0], expected_vector[0]))

    def test_bounds(self):
        """The last valid index is retrievable and well-typed."""
        last = len(self.vectors) - 1
        key, vector = self.vectors[last]
        self.assertTrue(isinstance(key, unicode))
        self.assertTrue(isinstance(vector, np.ndarray))

    @unittest.expectedFailure
    def test_out_of_bounds(self):
        """Indexing one past the end is expected to fail."""
        entry = self.vectors[len(self.vectors)]
        self.assertTrue(isinstance(entry[0], unicode))
        self.assertTrue(isinstance(entry[1], np.ndarray))

    def test_contains(self):
        """An in-vocabulary word reports membership."""
        self.assertIn("cat", self.vectors)

    def test_contains_false(self):
        """An out-of-vocabulary token is not a member."""
        self.assertNotIn("blah123", self.vectors)

    def test_special_characters(self):
        """Special-character tokens are handled and OOV shapes match."""
        oov_tokens = ("out-of-vocabulary", 'quotation"s', "quotation's",
                      "colon;s", "sh**", '"s')
        self.assertIn("Wilkes-Barre/Scranton", self.vectors)
        for token in oov_tokens:
            self.assertNotIn(token, self.vectors)
        self.assertNotIn("'s", self.vectors_cs)
        # Every query, in- or out-of-vocabulary, keeps the reference shape.
        reference_shape = self.vectors.query("cat").shape
        for token in ("Wilkes-Barre/Scranton",) + oov_tokens:
            self.assertEqual(reference_shape,
                             self.vectors.query(token).shape)
        self.assertEqual(reference_shape, self.vectors_cs.query("'s").shape)

    def test_oov_dim(self):
        """OOV queries produce vectors shaped like in-vocabulary ones."""
        oov_shape = self.vectors.query("*<<<<").shape
        self.assertEqual(oov_shape, self.vectors.query("cat").shape)

    def test_oov_subword_dim(self):
        """Subword-model OOV queries keep the in-vocabulary shape."""
        oov_shape = self.vectors_sw.query("*<<<<").shape
        self.assertEqual(oov_shape, self.vectors_sw.query("cat").shape)

    def test_oov_dim_placeholders(self):
        """Placeholders keep OOV shape consistency and OOV values."""
        self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                              placeholders=5,
                                              case_insensitive=True,
                                              eager=False)
        padded = self.vectors_placeholders
        self.assertEqual(
            padded.query("*<<<<").shape, padded.query("cat").shape)
        # First OOV component must match the placeholder-free model.
        self.assertTrue(
            isclose(self.vectors.query("*<<<<")[0],
                    padded.query("*<<<<")[0]))
        padded.close()

    def test_oov_subword_dim_placeholders(self):
        """Subword model with placeholders keeps OOV shape and values."""
        self.vectors_placeholders = Magnitude(
            MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
            placeholders=5,
            case_insensitive=True,
            eager=False)
        padded = self.vectors_placeholders
        self.assertEqual(
            padded.query("*<<<<").shape, padded.query("cat").shape)
        self.assertTrue(
            isclose(self.vectors.query("*<<<<")[0],
                    padded.query("*<<<<")[0]))
        padded.close()

    def test_oov_unit_norm(self):
        """OOV vectors are unit-normalized."""
        norm = np.linalg.norm(self.vectors.query("*<<<<<"))
        self.assertTrue(isclose(norm, 1.0))

    def test_oov_subword_unit_norm(self):
        """Subword-model OOV vectors are unit-normalized."""
        norm = np.linalg.norm(self.vectors_sw.query("*<<<<<"))
        self.assertTrue(isclose(norm, 1.0))

    def test_ngram_oov_closeness(self):
        """Ngram OOV vectors land near lexically similar OOV tokens."""
        sim = self.vectors.similarity
        self.assertTrue(sim("uberx", "uberxl") > .7)
        self.assertTrue(sim("uberx", "veryrandom") < .7)
        self.assertTrue(sim("veryrandom", "veryrandom") > .7)

    def test_ngram_oov_subword_closeness(self):
        """Subword OOV vectors are close to lexical near-matches."""
        # (token_a, token_b, similarity lower bound)
        similar_pairs = (
            ("uberx", "uberxl", .7),
            ("uberx", "uber", .7),
            ("uberxl", "uber", .7),
            ("discriminatoryy", "discriminatory", .7),
            ("discriminatoryy", "discriminnatory", .8),
            ("veryrandom", "veryrandom", .7),
            ("hiiiiiiiii", "hi", .7),
            ("heeeeeeeey", "hey", .7),
            ("heyyyyyyyyyy", "hey", .7),
            ("faaaaaate", "fate", .65),
        )
        for token_a, token_b, lower_bound in similar_pairs:
            self.assertTrue(
                self.vectors_sw.similarity(token_a, token_b) > lower_bound)
        # Unrelated tokens stay apart.
        self.assertTrue(self.vectors_sw.similarity("uberx", "veryrandom") < .7)

    def test_oov_values(self):
        """OOV first components are deterministic across model instances."""
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        expected_first_components = (
            ("*<", -0.0759614511397),
            ("*<<", 0.00742723997271),
            ("*<<<<", -0.0372075283555),
            ("*<<<<<", -0.0201727917272),
            ("*<<<<<<", -0.0475993225776),
            ("*<<<<<<<", 0.0129938352266),
        )
        for model in (self.vectors_oov_1, self.vectors_oov_2):
            for token, expected in expected_first_components:
                self.assertTrue(isclose(model.query(token)[0], expected))

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_oov_subword_values(self):
        """Subword OOV first components are deterministic across instances."""
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        expected_first_components = (
            ("discriminatoryy", -0.0573252095591),
            ("*<", -0.0759614511397),
            ("*<<", 0.00742723997271),
            ("uberx", 0.0952671681336),
            ("misssipi", 0.0577835297955),
        )
        for model in (self.vectors_oov_1, self.vectors_oov_2):
            for token, expected in expected_first_components:
                self.assertTrue(isclose(model.query(token)[0], expected))

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_oov_stability(self):
        """Two identical models produce identical OOV vectors repeatedly."""
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)

        # "*<", "*<<", ... "*<<<<<<<"
        probes = ["*" + "<" * length for length in range(1, 8)]
        for _ in range(5):
            for probe in probes:
                self.assertTrue(
                    isclose(self.vectors_oov_1.query(probe),
                            self.vectors_oov_2.query(probe)).all())

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_ngram_oov_stability(self):
        """Ngram OOV generation is stable across identical models."""
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=True,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=True,
                                       eager=False)

        # "*<", "*<<", ... "*<<<<<<<"
        probes = ["*" + "<" * length for length in range(1, 8)]
        for _ in range(5):
            for probe in probes:
                self.assertTrue(
                    isclose(self.vectors_oov_1.query(probe),
                            self.vectors_oov_2.query(probe)).all())

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_ngram_oov_subword_stability(self):
        """Subword OOV generation is stable across identical models."""
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       eager=False)

        # "*<", "*<<", ... "*<<<<<<<"
        probes = ["*" + "<" * length for length in range(1, 8)]
        for _ in range(5):
            for probe in probes:
                self.assertTrue(
                    isclose(self.vectors_oov_1.query(probe),
                            self.vectors_oov_2.query(probe)).all())

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_placeholders(self):
        """placeholders=5 pads vectors to dim+5 without changing values."""
        self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                              case_insensitive=True,
                                              placeholders=5,
                                              eager=False)
        padded = self.vectors_placeholders
        self.assertEqual(padded.query("cat").shape, (305, ))
        self.assertEqual(padded.query("cat")[0],
                         self.vectors.query("cat")[0])
        padded.close()

    def test_numpy(self):
        """Default query output is a numpy array."""
        self.assertIsInstance(self.vectors.query("cat"), np.ndarray)

    def test_list(self):
        """use_numpy=False makes query return plain Python lists."""
        self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                      case_insensitive=True,
                                      use_numpy=False,
                                      eager=False)
        self.assertIsInstance(self.vectors_list.query("cat"), list)
        self.vectors_list.close()

    def test_repeated_single(self):
        """Repeating the same single-word query is deterministic."""
        first = self.vectors.query("cat")
        second = self.vectors.query("cat")
        self.assertTrue(isclose(first, second).all())

    def test_repeated_multiple(self):
        """Repeated batch queries (flat and nested) are deterministic."""
        queries = (
            ["I", "saw", "a", "cat"],
            [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]],
        )
        for query in queries:
            first = self.vectors.query(query)
            second = self.vectors.query(query)
            self.assertTrue(isclose(first, second).all())

    def test_multiple(self):
        """Nested queries pad the shorter sentence to the longest length."""
        sentences = [["I", "saw", "a", "cat"],
                     ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(sentences)
        self.assertEqual(result.shape, (2, 5, self.vectors.dim))
        # The shorter sentence is right-padded with the padding vector.
        expected = [["I", "saw", "a", "cat", "padding"],
                    ["He", "went", "to", "the", "mall"]]
        for row, words in enumerate(expected):
            for col, word in enumerate(words):
                self.assertTrue(isclose(result[row][col], self.v[word]).all())
        return result

    def test_pad_to_length_right_truncate_none(self):
        """pad_to_length=6 right-pads both sentences with padding vectors."""
        sentences = [["I", "saw", "a", "cat"],
                     ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(sentences, pad_to_length=6)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        expected = [["I", "saw", "a", "cat", "padding", "padding"],
                    ["He", "went", "to", "the", "mall", "padding"]]
        for row, words in enumerate(expected):
            for col, word in enumerate(words):
                self.assertTrue(isclose(result[row][col], self.v[word]).all())
        return result

    def test_pad_to_length_truncate_none(self):
        """Default padding side is right (mirrors the right-pad test)."""
        sentences = [["I", "saw", "a", "cat"],
                     ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(sentences, pad_to_length=6)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        expected = [["I", "saw", "a", "cat", "padding", "padding"],
                    ["He", "went", "to", "the", "mall", "padding"]]
        for row, words in enumerate(expected):
            for col, word in enumerate(words):
                self.assertTrue(isclose(result[row][col], self.v[word]).all())
        return result

    def test_pad_to_length_left_truncate_none(self):
        """pad_left=True prepends padding vectors instead of appending."""
        sentences = [["I", "saw", "a", "cat"],
                     ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(sentences, pad_to_length=6, pad_left=True)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        expected = [["padding", "padding", "I", "saw", "a", "cat"],
                    ["padding", "He", "went", "to", "the", "mall"]]
        for row, words in enumerate(expected):
            for col, word in enumerate(words):
                self.assertTrue(isclose(result[row][col], self.v[word]).all())
        return result

    def test_pad_to_length_truncate_right(self):
        """pad_to_length shorter than the sentences truncates on the right."""
        sentences = [["I", "saw", "a", "cat"],
                     ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(sentences, pad_to_length=3)
        self.assertEqual(result.shape, (2, 3, self.vectors.dim))
        expected = [["I", "saw", "a"], ["He", "went", "to"]]
        for row, words in enumerate(expected):
            for col, word in enumerate(words):
                self.assertTrue(isclose(result[row][col], self.v[word]).all())
        return result

    def test_pad_to_length_truncate_left(self):
        """truncate_left=True drops words from the start of each sentence."""
        sentences = [["I", "saw", "a", "cat"],
                     ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(sentences, pad_to_length=3,
                                    truncate_left=True)
        self.assertEqual(result.shape, (2, 3, self.vectors.dim))
        expected = [["saw", "a", "cat"], ["to", "the", "mall"]]
        for row, words in enumerate(expected):
            for col, word in enumerate(words):
                self.assertTrue(isclose(result[row][col], self.v[word]).all())
        return result

    def test_list_multiple(self):
        """List-mode batch queries mirror numpy-mode values."""
        self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                      case_insensitive=True,
                                      use_numpy=False,
                                      eager=False)
        batch = [["I", "saw", "a", "cat"],
                 ["He", "went", "to", "the", "mall"]]
        # Check both a single sentence and the nested batch.
        for query in (batch[0], batch):
            listed = self.vectors_list.query(query)
            self.assertIsInstance(listed, list)
            self.assertTrue(
                isclose(self.vectors.query(query), asarray(listed)).all())
        self.vectors_list.close()

    def test_concat(self):
        """A concatenated model stacks the two source vectors end to end."""
        result = self.concat.query("cat")
        self.assertEqual(result.shape, (self.vectors.dim * 2, ))
        for half in (result[0:300], result[300:600]):
            self.assertTrue(isclose(half, self.v['cat']).all())

    def test_concat_multiple(self):
        """Batch concat queries stack per-word vectors in both halves."""
        words = ["I", "saw"]
        result = self.concat.query(words)
        self.assertEqual(result.shape, (2, self.vectors.dim * 2, ))
        for row, word in enumerate(words):
            self.assertTrue(isclose(result[row][0:300], self.v[word]).all())
            self.assertTrue(isclose(result[row][300:600], self.v[word]).all())

    def test_concat_multiple_2(self):
        """Nested batch concat queries stack per-word vectors."""
        sentences = [["I", "saw"], ["He", "went"]]
        result = self.concat.query(sentences)
        self.assertEqual(result.shape, (2, 2, self.vectors.dim * 2, ))
        for row, sentence in enumerate(sentences):
            for col, word in enumerate(sentence):
                self.assertTrue(
                    isclose(result[row][col][0:300], self.v[word]).all())
                self.assertTrue(
                    isclose(result[row][col][300:600], self.v[word]).all())

    def test_concat_specific(self):
        """A tuple query selects a different key for each half of the concat."""
        result = self.concat.query(("cat", "mall"))
        self.assertEqual(result.shape, (self.vectors.dim * 2, ))
        self.assertTrue(isclose(result[0:300], self.v['cat']).all())
        self.assertTrue(isclose(result[300:600], self.v['mall']).all())

    def test_concat_multiple_specific(self):
        """A list of tuples selects per-half keys for each stacked row."""
        pairs = [("I", "He"), ("saw", "went")]
        result = self.concat.query(pairs)
        self.assertEqual(result.shape, (2, self.vectors.dim * 2))
        for row, (first, second) in zip(result, pairs):
            self.assertTrue(isclose(row[0:300], self.v[first]).all())
            self.assertTrue(isclose(row[300:600], self.v[second]).all())

    def test_concat_multiple_2_specific(self):
        """Nested lists of tuples produce a 3-D stack with per-half keys."""
        sentences = [[("I", "He"), ("saw", "went")],
                     [("He", "I"), ("went", "saw")]]
        result = self.concat.query(sentences)
        self.assertEqual(result.shape, (2, 2, self.vectors.dim * 2))
        for i, sentence in enumerate(sentences):
            for j, (first, second) in enumerate(sentence):
                self.assertTrue(
                    isclose(result[i][j][0:300], self.v[first]).all())
                self.assertTrue(
                    isclose(result[i][j][300:600], self.v[second]).all())

    def test_distance(self):
        """Distance between two known words matches the recorded value."""
        dist = self.vectors.distance("cat", "dog")
        self.assertTrue(isclose(dist, 0.69145405))

    def test_distance_multiple(self):
        """Distance against a list of words returns one value per word."""
        dists = self.vectors.distance("cat", ["cats", "dog"])
        self.assertTrue(isclose(dists, [0.61654216, 0.69145405]).all())

    def test_similarity(self):
        """Similarity between two known words matches the recorded value."""
        sim = self.vectors.similarity("cat", "dog")
        self.assertTrue(isclose(sim, 0.7609457089782209))

    def test_similarity_multiple(self):
        """Similarity against a list of words returns one score per word."""
        sims = self.vectors.similarity("cat", ["cats", "dog"])
        self.assertTrue(
            isclose(sims, [0.8099378824686305, 0.7609457089782209]).all())

    def test_most_similar_to_given(self):
        """'dog' is picked regardless of its position in the candidate list."""
        candidate_orders = [
            ["dog", "television", "laptop"],
            ["television", "dog", "laptop"],
            ["television", "laptop", "dog"],
        ]
        for candidates in candidate_orders:
            self.assertEqual(
                self.vectors.most_similar_to_given("cat", candidates), "dog")

    def test_doesnt_match(self):
        """'cereal' is the odd one out regardless of its position."""
        word_orders = [
            ["breakfast", "cereal", "lunch", "dinner"],
            ["breakfast", "lunch", "cereal", "dinner"],
            ["breakfast", "lunch", "dinner", "cereal"],
        ]
        for words in word_orders:
            self.assertEqual(self.vectors.doesnt_match(words), "cereal")

    def test_most_similar_case_insensitive(self):
        """Top-5 neighbours of 'queen' on the case-insensitive index."""
        # Query once and unpack; the original issued the same query twice.
        results = self.vectors.most_similar("queen", topn=5)
        keys = [key for key, _ in results]
        similarities = [similarity for _, similarity in results]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar(self):
        """Default top-10 neighbours of 'queen' on the case-sensitive index."""
        # Query once and unpack; the original issued the same query twice.
        results = self.vectors_cs.most_similar("queen")
        keys = [key for key, _ in results]
        similarities = [similarity for _, similarity in results]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587, 0.6163408160209656,
                        0.6060680150985718, 0.5923796892166138,
                        0.5908075571060181, 0.5637184381484985
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens',
            u'princess',
            u'king',
            u'monarch',
            u'very_pampered_McElhatton',
            u'Queen',
            u'NYC_anglophiles_aflutter',
            u'Queen_Consort',
            u'princesses',
            u'royal',
        ])

    def test_most_similar_no_similarities(self):
        """With return_similarities=False only the neighbour keys come back."""
        expected = [
            u'queens',
            u'princess',
            u'king',
            u'monarch',
            u'very_pampered_McElhatton',
            u'Queen',
            u'NYC_anglophiles_aflutter',
            u'Queen_Consort',
            u'princesses',
            u'royal',
        ]
        keys = self.vectors_cs.most_similar("queen", return_similarities=False)
        self.assertEqual(keys, expected)

    def test_most_similar_top_5(self):
        """topn=5 limits the result to the five nearest neighbours."""
        # Query once and unpack; the original issued the same query twice.
        results = self.vectors_cs.most_similar("queen", topn=5)
        keys = [key for key, _ in results]
        similarities = [similarity for _, similarity in results]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar_min_similarity(self):
        """min_similarity=.63 cuts the result off below that score."""
        # Query once and unpack; the original issued the same query twice.
        results = self.vectors_cs.most_similar("queen", min_similarity=.63)
        keys = [key for key, _ in results]
        similarities = [similarity for _, similarity in results]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar_analogy(self):
        """king - man + woman ~= queen (3CosAdd analogy)."""
        # Query once and unpack; the original issued the same query twice.
        results = self.vectors_cs.most_similar(positive=["king", "woman"],
                                               negative=["man"])
        keys = [key for key, _ in results]
        similarities = [similarity for _, similarity in results]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7118192315101624, 0.6189674139022827,
                        0.5902431011199951, 0.549946129322052,
                        0.5377321243286133, 0.5236844420433044,
                        0.5235944986343384, 0.518113374710083,
                        0.5098593831062317, 0.5087411403656006
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queen', u'monarch', u'princess', u'crown_prince', u'prince',
            u'kings', u'Queen_Consort', u'queens', u'sultan', u'monarchy'
        ])

    def test_most_similar_cosmul_analogy(self):
        """king - man + woman ~= queen using the 3CosMul analogy method."""
        # Query once and unpack; the original issued the same query twice.
        results = self.vectors_cs.most_similar_cosmul(
            positive=["king", "woman"], negative=["man"])
        keys = [key for key, _ in results]
        similarities = [similarity for _, similarity in results]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.9314123392105103, 0.858533501625061,
                        0.8476565480232239, 0.8150269985198975,
                        0.809981644153595, 0.8089977502822876,
                        0.8027306795120239, 0.801961362361908,
                        0.8009798526763916, 0.7958389520645142
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queen', u'monarch', u'princess', u'Queen_Consort', u'queens',
            u'crown_prince', u'royal_palace', u'monarchy', u'prince',
            u'empress'
        ])

    def test_most_similar_cosmul_min_similarity_analogy(self):
        """3CosMul analogy truncated by min_similarity=.81."""
        # Query once and unpack; the original issued the same query twice.
        results = self.vectors_cs.most_similar_cosmul(
            positive=["king", "woman"],
            negative=["man"],
            min_similarity=.81)
        keys = [key for key, _ in results]
        similarities = [similarity for _, similarity in results]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.9314123392105103, 0.858533501625061,
                        0.8476565480232239, 0.8150269985198975
                    ]),
                    atol=.02).all())
        self.assertEqual(keys,
                         [u'queen', u'monarch', u'princess', u'Queen_Consort'])

    def test_closer_than(self):
        """Only 'cats' is closer to 'cat' than 'dog' is."""
        closer = self.vectors.closer_than("cat", "dog")
        self.assertEqual(closer, ["cats"])

    def test_most_similar_approx(self):
        """Approximate nearest-neighbour search returns topn scored results."""
        # Query once and unpack; the original ran the expensive approximate
        # search twice.
        results = self.vectors_approx.most_similar_approx("queen", topn=15)
        keys = [key for key, _ in results]
        similarities = [similarity for _, similarity in results]
        self.assertEqual(len(keys), 15)
        self.assertTrue(similarities[0] > .7 and similarities[-1] > .5)

    @unittest.expectedFailure
    def test_most_similar_approx_failure(self):
        # self.vectors was not built with an approximate index, so the
        # approximate query is expected to fail.
        self.vectors.most_similar_approx("queen", topn=15)

    def test_most_similar_approx_low_effort(self):
        """Reduced search effort still returns the requested result count."""
        results = self.vectors_approx.most_similar_approx(
            "queen", topn=15, effort=.1)
        keys = [key for key, _ in results]
        self.assertEqual(len(keys), 15)
        self.assertEqual(keys[0], "princess")

    def test_most_similar_analogy_approx(self):
        """Approximate analogy search still ranks 'queen' first."""
        results = self.vectors_approx.most_similar_approx(
            positive=["king", "woman"], negative=["man"], topn=15)
        keys = [key for key, _ in results]
        self.assertEqual(keys[0], "queen")

    def test_feat_length(self):
        """Featurizer dimensionality depends on its configured size."""
        self.vectors_feat_2 = FeaturizerMagnitude(1000, case_insensitive=True)
        # try/finally so the featurizer is closed even when an assertion
        # fails (the original leaked it on failure).
        try:
            self.assertEqual(self.vectors_feat.dim, 4)
            self.assertEqual(self.vectors_feat_2.dim, 5)
        finally:
            self.vectors_feat_2.close()

    def test_feat_stability(self):
        """Two featurizers with the same seed value produce identical vectors."""
        self.vectors_feat_2 = FeaturizerMagnitude(100, case_insensitive=True)
        # try/finally so the featurizer is closed even when an assertion
        # fails (the original leaked it on failure).
        try:
            self.assertTrue(
                isclose(self.vectors_feat.query("VBG"),
                        self.vectors_feat_2.query("VBG")).all())
            self.assertTrue(
                isclose(self.vectors_feat.query("PRP"),
                        self.vectors_feat_2.query("PRP")).all())
        finally:
            self.vectors_feat_2.close()

    def test_feat_values(self):
        """Featurizer output components are deterministic for known keys."""
        expected = [
            ("VBG", 0, 0.490634876828),
            ("PRP", 0, 0.463890807802),
            (5, 0, -0.750681075834),
            (5, -1, 1.46936807866e-38),
        ]
        for key, index, value in expected:
            self.assertTrue(
                isclose(self.vectors_feat.query(key)[index], value))
Beispiel #5
0
class EmbeddingEngine:
    """
    An interface to query pre-trained word vectors.
    """

    # Tokens that look like chemical formulas but are excluded from the
    # formula dictionary (presumably abbreviations/acronyms mis-parsed as
    # formulas — TODO confirm provenance of this list).
    ABBR_LIST = [
        "C41H11O11", "PV", "OPV", "PV12", "CsOS", "CsKPSV", "CsPS", "CsHIOS",
        "OPV", "CsPSV", "CsOPV", "CsIOS", "BCsIS", "CsPrS", "CEsH", "KP307",
        "AsOV", "CEsS", "COsV", "CNoO", "BEsF", "I2P3", "KP115", "BCsIS",
        "C9705IS", "ISC0501", "B349S", "CISe", "CISSe", "CsIPS", "CEsP",
        "BCsF", "CsFOS", "BCY10", "C12P", "EsHP", "CsHP", "C2K8", "CsOP",
        "EsHS", "CsHS", "C3P", "C50I", "CEs", "CSm", "BF", "EsN", "BN50S",
        "AsCP", "CPo", "LiPb17", "CsS", "EsIS", "AsCU", "CCsHS", "CsHPU",
        "AsOS", "AsCI", "EsF", "FV448", "CNS", "CP5", "AsFP", "EsOP", "NS",
        "NS2", "EsI", "BH", "PPmV", "PSe", "AsN", "OPV5", "NSiW", "CsHHS"
    ]

    def __init__(self,
                 embeddings_source=EMBEDDINGS,
                 out_embeddings_source=OUT_EMBEDDINGS,
                 formulas_source=FORMULAS,
                 phraser_source=PHRASER):
        """

        :param embeddings_source: can be instance of a Magnitude object
        or url or path to a serialized Magnitude object
        :param out_embeddings_source: can be instance of a Magnitude object
        or url or path to a serialized Magnitude object
        :param formulas_source: can be url or path to a JSON-serialized dict
        of formulae, if not supplied a default file is loaded
        :param phraser_source: path to a serialized phraser model, loaded
        via Phraser.load
        """

        # hidden layer embeddings (W)
        self.embeddings = Magnitude(embeddings_source, eager=False)

        # output layer embeddings (O)
        self.out_embeddings = Magnitude(out_embeddings_source)

        # load pre-trained formulas from embeddings
        with open(formulas_source, 'r') as f:
            self.formulas_with_abbreviations = load(f)

        self.dp = DataPreparation(local=False)

        self.es = ElasticConnection()

        # formula dict with known false-positive tokens removed
        self.formulas = {
            k: v
            for k, v in self.formulas_with_abbreviations.items()
            if k not in self.ABBR_LIST
        }

        # total corpus occurrences per normalized (root) formula
        self.formula_counts = {
            root_formula: sum(formulas.values())
            for root_formula, formulas in self.formulas.items()
        }

        # map each formula group to its most frequent surface form; bare
        # elements map to themselves
        self.most_common_forms = {
            formula_group_name:
            (formula_group_name if formula_group_name in self.dp.ELEMENTS else
             max(formulae.items(), key=operator.itemgetter(1))[0])
            for formula_group_name, formulae in
            self.formulas_with_abbreviations.items()
        }

        self.phraser = Phraser.load(phraser_source)

    def make_phrases(self, sentence, reps=2):
        """
        generates phrases from a sentence of words
        :param sentence: a list of tokens
        :param reps: how many times to combine the words
        :return: the token list after `reps` passes of the phraser
        """
        # each pass can merge adjacent tokens into longer phrases
        while reps > 0:
            sentence = self.phraser[sentence]
            reps -= 1
        return sentence

    def prepare_wordphrase(self, wp, im=False):
        """
        Process a string into words and phrases according to existing embeddings
        :param wp: the string to process
        :param im: if True, will ignore missing words, otherwise will generate random vectors
        :return: a list of processed words and phrases
        """
        processed_wp = self.make_phrases(
            self.dp.process_sentence(self.dp.text2sent(wp))[0])
        if im:
            # drop tokens that are not in the embedding vocabulary
            processed_wp = [
                pwp for pwp in processed_wp if pwp in self.embeddings
            ]
        return processed_wp

    def get_embedding(self,
                      wordphrases,
                      ignore_missing=False,
                      normalized=True):
        """
        Gets the embedding for the given word
        :param wordphrases: a string or a list of strings to request embedding for
        :param ignore_missing: if true, will ignore missing words, otherwise will query them
        using pymagnitude defult out of dictionary handling
        :param normalized: if False, returns non-normalized embeddings (True by default)
        :return: an embedding matrix with each row corresponding to a single processed word or phrase
        taken from wordphrases, as well as the lists of processed wordphrases
        """
        def get_single_embedding(wp, im=ignore_missing, norm=normalized):
            """
            Returns a single embedding vector for the given string
            :param wp: a string to get a single embedding for
            :param im: boolen to ignore missing words or return some random vectors if False
            :param norm: if False, returns the non-normalized embedding (True by default)
            :return: a single embedding vector for the string (could be a composite embedding)
            """
            processed_wordphrase = self.prepare_wordphrase(wp, im)

            if len(processed_wordphrase) > 0:
                # mean of the token vectors, re-normalized when requested
                emb = np.mean(self.embeddings.query(processed_wordphrase,
                                                    normalized=norm),
                              axis=0)
                if norm:
                    emb = emb / np.linalg.norm(emb)
                emb = emb.tolist()
            else:
                # nothing survived processing: fall back to a zero vector
                emb = [0] * self.embeddings.dim
            return emb, processed_wordphrase

        if not isinstance(wordphrases, list):
            wordphrases = [wordphrases]

        processed_wps = []
        embeddings = []

        try:
            for wordphrase in wordphrases:
                embedding, processed_wp = get_single_embedding(
                    wordphrase, im=ignore_missing)
                processed_wps.append(processed_wp)
                embeddings.append(embedding)
        except Exception as ex:
            # NOTE(review): this warns with the exception object and aborts
            # the remaining wordphrases, returning a partial result —
            # confirm this best-effort behavior is intended.
            warnings.warn(ex)

        return embeddings, processed_wps

    def close_words(self,
                    positive,
                    negative=None,
                    top_k=8,
                    exclude_self=True,
                    ignore_missing=True):
        """
        Returns a list of close words
        :param positive: can be either a string or a list of strings
        :param negative: same as word, but will be treated with a minus sign
        :param top_k: number of close words to return
        :param exclude_self: boolean, if the supplied word should be excluded or not
        :param ignore_missing: ignore words that are missing from the vocabulary
        :return: (words, scores, processed_positive, processed_negative)
        """

        if negative is None:
            negative = []
        else:
            if not isinstance(negative, list):
                negative = [negative]
        processed_negative = []
        for n in negative:
            processed_negative += self.prepare_wordphrase(n, im=ignore_missing)

        if not isinstance(positive, list):
            positive = [positive]
        processed_positive = []
        for p in positive:
            processed_positive += self.prepare_wordphrase(p, im=ignore_missing)

        most_similar = self.embeddings.most_similar(
            processed_positive, negative=processed_negative, topn=top_k)

        if not exclude_self:
            # NOTE(review): this prepends the whole processed_positive list
            # (not a string) as the first "word", with score 1.0 — confirm
            # callers expect a list in that slot.
            most_similar = [(processed_positive, 1.0)
                            ] + most_similar[:top_k - 1]
        words, scores = map(list, zip(*most_similar))
        return words, [float(s)
                       for s in scores], processed_positive, processed_negative

    def find_similar_materials(self,
                               sentence,
                               n_sentence=None,
                               min_count=3,
                               use_output_emb=True,
                               ignore_missing=True):
        """
        Finds materials that match the best with the context of the sentence
        :param sentence: a list of words
        :param n_sentence: a list of words for a negative context
        :param min_count: the minimum number of occurrences for the formula
        to be included
        :param use_output_emb: if True, use output layer embedding (O) instead of
        inner layer embedding (W)
        :param ignore_missing: ignore words missing from the vocabulary
        :return: sorted (formula, score) pairs plus the processed positive
        and negative sentences
        """
        positive_embeddings, processed_sentence = \
            self.get_embedding(sentence, ignore_missing=ignore_missing)

        n_sentence = n_sentence or []
        negative_embeddings, processed_n_sentence = \
            self.get_embedding(n_sentence, ignore_missing=ignore_missing)

        emb = self.out_embeddings if use_output_emb else self.embeddings

        # context vector = sum(positive) - sum(negative), unit-normalized
        sum_embedding = np.sum(np.asarray(positive_embeddings), axis=0) - \
                        np.sum(np.asarray(negative_embeddings), axis=0)
        sum_embedding = sum_embedding / np.linalg.norm(sum_embedding)

        # formulas common enough to be above cut-off and that exist in embedding
        formulas = [
            f for f, count in self.formula_counts.items()
            if (count > min_count) and (f in self.embeddings)
        ]

        # cosine similarity via dot product of normalized vectors
        similarity_scores = np.dot(emb.query(formulas, normalized=True),
                                   sum_embedding)
        similarities = {
            f: float(similarity_scores[i])
            for i, f in enumerate(formulas)
        }

        return sorted(similarities.items(), key=lambda x: x[1],
                      reverse=True), processed_sentence, processed_n_sentence

    def most_common_form(self, formulas):
        """
        Return the most common form of the formula given a list with tuples
        [("normalized formula": score), ...]
        :param formulas: the dictionary
        :return: a list of common forms with counts, [("common form", score, counts in text), ...]
        """
        common_form_score_count = []
        for formula in formulas:
            if formula[0] in self.dp.ELEMENTS:
                # bare elements are their own most common form
                most_common_form = formula[0]
            else:
                most_common_form = max(self.formulas[formula[0]].items(),
                                       key=operator.itemgetter(1))[0]
            common_form_score_count.append(
                (most_common_form, formula[1],
                 sum(self.formulas[formula[0]].values())))
        return common_form_score_count

    def filter_by_elements(self,
                           formulas,
                           plus_elems=None,
                           minus_elems=None,
                           max=50):
        """
        Filter formulas according to the following rule: It has to have one of the plus_elements (if None all work),
        but it cannot have any of the minus_elems. If there is an overlap, the element is ignored
        :param formulas: a list of (formula, score) tuples
        :param plus_elems: the formula has to have at least one of these
        :param minus_elems: but cannot have any of these
        :param max: maximum number to return (NOTE: parameter shadows the
        builtin `max`; renaming would break keyword callers)
        :return: the filtered list of (formula, score) tuples
        """
        plus_elems = plus_elems or []
        minus_elems = minus_elems or []
        # elements present in both lists cancel out and are ignored
        plus_elems, minus_elems = set(plus_elems) - set(minus_elems), set(
            minus_elems) - set(plus_elems)

        def has_plus(comp, pe):
            # True when pe is empty/None or comp contains any element of pe
            if pe is None or len(pe) == 0:
                return True
            for elem in comp:
                if elem in pe:
                    return True
            return False

        def has_minus(comp, me):
            # True when comp contains any element of me
            if me is None or len(me) == 0:
                return False
            for elem in comp:
                if elem in me:
                    return True
            return False

        matched = 0
        matched_formula = []
        for form in formulas:
            composition = self.dp.parser.parse_formula(form[0])
            if has_plus(composition, plus_elems) and not has_minus(
                    composition, minus_elems):
                matched_formula.append(form)
                matched += 1
            if matched >= max:
                return matched_formula
        return matched_formula

    def mentioned_with(self, material, words):
        """
        Returns True if the supplied material was mentioned with any of the words in any of the abstracts. This is a
        very strict text search and is aimed at high recall. This method is used for discovery so having higher recall
        might hinder some discoveries but will avoid too many false positives. E.g. for material=CuTe and
        words=["thermoelectric"], "CuTe2 is thermoelectric" will return True since "CuTe" will be matched with "CuTe2"
        in text search. The word search is exact, so if the keyword was "thermo" it would not match "thermoelectric".
        :param material: A material formula (does not have to be normalized)
        :param words: List of processed words and phrases (words separated by "_") to search the text for co-occurrences
        :return: True if the material is mentioned with any of the words, False otherwise
        """
        norm_material = self.dp.get_norm_formula(
            material) if self.dp.is_simple_formula(material) else material

        # different ways the material is written
        variations = self.formulas[
            norm_material] if norm_material in self.formulas else [
                norm_material
            ]
        # boolean query: (any variation) AND (any target word)
        variations = "(" + " OR ".join(variations) + ")"
        targets = "(" + " OR ".join(words) + ")"
        query = "{} AND {}".format(targets, variations)
        if self.es.count_matches(query) > 0:
            return True
        else:
            return False
Beispiel #6
0
from pymagnitude import Magnitude

# Load pre-trained Google News vectors (the .magnitude file must exist
# in the working directory).
vectors = Magnitude('GoogleNews-vectors-negative300.magnitude')

# Look up and print the embedding vector for a single word.
cat_vector = vectors.query('cat')
print(cat_vector)

# Print a similarity score and the 100 nearest neighbours of "cat".
print(vectors.similarity("cat", "dog"))
print(vectors.most_similar("cat", topn=100))

def similarity(word1, word2):
    """Return the similarity score between the two given words."""
    return vectors.similarity(word1, word2)
Beispiel #7
0
from pymagnitude import Magnitude

#On production server, nginx will serve /static. Be careful with trailing / for folder specification -- don't use.
app = Flask(__name__, static_folder="../static", static_url_path="/static")
if app.debug:
    # Only relax CORS in debug mode; production relies on same-origin/nginx.
    app.logger.info("Debug detected, enabling CORS for * origins")
    from flask_cors import CORS
    cors = CORS(app, resources={r"/api/*": {"origins": "*"}})

# Resolve paths relative to this file so the app works from any cwd.
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
# don't chdir or flask will falter
# os.chdir(dname)
print("Loading word vectors")
vectorUtil = Magnitude(dname + "/../data/glove.6B.300d.magnitude")
# Warm-up query so lazy initialization happens before the first request.
vectorUtil.most_similar("memory", topn=10)  #to trigger initialization
print("Word vectors loaded")


@app.route("/")
def index():
    """Serve the single-page app's entry point."""
    public_dir = "../public/"
    return send_from_directory(public_dir, "index.html")


@app.route("/api/words")
def getSimilarWords():
    query = request.args.get("query")
    print("Received query: " + query)

    try:
        paramNov = float(request.args.get("nov"))