Ejemplo n.º 1
0
def predict(chain, embedding=False, interpolation=False):
    """Rank candidate verbs by how well they fit an event chain.

    Every verb in the module-level ``verbs`` collection is scored against
    each event of ``chain`` and the per-event scores are summed. Verbs that
    already head an event of the chain are excluded from the ranking.

    Args:
        chain: Iterable of events. ``event[0]`` is used as the event's verb
            and the whole event is handed to ``pmi`` (presumably a
            ``(verb, dependency, argument)`` tuple -- confirm with callers).
        embedding: When true, score with embedding similarity only. Takes
            precedence over ``interpolation``.
        interpolation: When true (and ``embedding`` is false), score with
            ``ALPHA * similarity + (1 - ALPHA) * pmi``.

    Returns:
        A list of ``(verb, score)`` pairs sorted by descending score.
    """
    # The chain is walked once per candidate verb, so materialise it in case
    # the caller passed a one-shot iterator.
    chain = list(chain)

    vectors = None
    if embedding or interpolation:
        vectors = Magnitude('GoogleNews-vectors-negative300.magnitude')

    # Collect the verbs of the chain (event[0]), not the events themselves:
    # candidates are bare verbs, so comparing them against whole events
    # would never filter anything out.
    chain_verbs = {event[0] for event in chain}

    scores = dict()
    for verb in verbs:
        if verb in chain_verbs:
            continue
        score = 0
        for event in chain:
            if embedding:
                score += vectors.similarity(event[0], verb)
            elif interpolation:
                score += (ALPHA * vectors.similarity(event[0], verb) +
                          (1 - ALPHA) * pmi(event, (verb, None, None)))
            else:
                score += pmi(event, (verb, None, None))
        scores[verb] = score

    return sorted(scores.items(), key=lambda item: item[1], reverse=True)
Ejemplo n.º 2
0
class MagnitudeTest(unittest.TestCase):
    """Test case exercising the ``Magnitude`` vector-store API.

    The path attributes below default to empty strings.
    NOTE(review): presumably the test runner assigns real ``.magnitude``
    file paths to them before the tests execute -- confirm.
    """
    # Path handed to ``Magnitude`` for the main fixtures.
    MAGNITUDE_PATH = ""
    # Path handed to ``Magnitude`` for the ``vectors_sw`` fixtures.
    MAGNITUDE_SUBWORD_PATH = ""
    # Path handed to ``Magnitude`` for the ``vectors_approx`` fixture.
    MAGNITUDE_APPROX_PATH = ""

    def setUp(self):
        # Open a fresh set of vector stores for every test and pre-compute
        # the reference vectors (self.v) that the query tests compare with.
        paths = MagnitudeTest

        def open_lazy(path):
            # Case-insensitive store opened with eager=False.
            return Magnitude(path, case_insensitive=True, eager=False)

        self.vectors = Magnitude(paths.MAGNITUDE_PATH,
                                 eager=True,
                                 case_insensitive=True)
        self.vectors_cs = Magnitude(paths.MAGNITUDE_PATH,
                                    eager=False,
                                    case_insensitive=False)
        self.vectors_sw = open_lazy(paths.MAGNITUDE_SUBWORD_PATH)
        self.vectors_approx = open_lazy(paths.MAGNITUDE_APPROX_PATH)
        self.tmp_vectors = open_lazy(paths.MAGNITUDE_PATH)
        self.concat_1 = open_lazy(paths.MAGNITUDE_PATH)
        self.concat_2 = open_lazy(paths.MAGNITUDE_PATH)
        self.concat = Magnitude(self.concat_1, self.concat_2)
        self.vectors_feat = FeaturizerMagnitude(100, case_insensitive=True)

        # Reference vectors keyed by token, plus the padding vector.
        reference = {'padding': self.tmp_vectors._padding_vector()}
        tokens = ('I', 'saw', 'a', 'cat', 'He', 'went', 'to', 'the', 'mall',
                  'blah123')
        for token in tokens:
            reference[token] = self.tmp_vectors.query(token)
        self.v = reference

    def tearDown(self):
        self.vectors.close()
        self.vectors_cs.close()
        self.vectors_sw.close()
        self.tmp_vectors.close()
        self.concat_1.close()
        self.concat_2.close()
        del self.concat
        self.vectors_feat.close()
        gc.collect()

    def test_length(self):
        self.assertEqual(len(self.vectors), 3000000)

    def test_dim(self):
        self.assertEqual(self.vectors.dim, 300)

    def test_index(self):
        # Subscripting and index() both yield a (key, vector) pair;
        # index(..., return_vector=False) yields only the key.
        self.assertIsInstance(self.vectors[0][0], unicode)
        self.assertIsInstance(self.vectors[0][1], np.ndarray)
        self.assertIsInstance(self.vectors.index(0)[0], unicode)
        self.assertIsInstance(self.vectors.index(0)[1], np.ndarray)
        key_only = self.vectors.index(0, return_vector=False)
        self.assertIsInstance(key_only, unicode)

    def test_slice(self):
        # A five-element slice has five entries and starts with the same
        # (key, vector) pair as plain indexing at 0.
        window = self.vectors[0:5]
        self.assertEqual(len(window), 5)
        self.assertEqual(window[0][0], self.vectors[0][0])
        same_vector = isclose(window[0][1], self.vectors[0][1])
        self.assertTrue(same_vector.all())

    def test_case_insensitive(self):
        # Keys keep their original casing in the store ...
        saw_non_lower_key = False
        for position, (key, _) in enumerate(self.vectors):
            if position > 1000:
                break
            if not saw_non_lower_key and key.lower() != key:
                saw_non_lower_key = True
        self.assertTrue(saw_non_lower_key)

        # ... but lookups ignore case.
        for spelling in ("QuEEn", "QUEEN", "queen"):
            self.assertIn(spelling, self.vectors)
        canonical = self.vectors.query("Queen")
        scrambled = self.vectors.query("QuEEn")
        self.assertTrue(isclose(canonical, scrambled).all())

        nearest_upper = self.vectors.most_similar(
            "I", return_similarities=False)[0]
        self.assertEqual(nearest_upper, 'myself')
        nearest_lower = self.vectors.most_similar(
            "i", return_similarities=False)[0]
        self.assertEqual(nearest_lower, 'ive')
        self.assertGreater(self.vectors.similarity("a", "A"), .9)

    def test_case_sensitive(self):
        # Keys keep their original casing in the case-sensitive store ...
        saw_non_lower_key = False
        for position, (key, _) in enumerate(self.vectors_cs):
            if position > 1000:
                break
            if not saw_non_lower_key and key.lower() != key:
                saw_non_lower_key = True
        self.assertTrue(saw_non_lower_key)

        # ... and lookups respect case.
        self.assertNotIn("QuEEn", self.vectors_cs)
        self.assertIn("QUEEN", self.vectors_cs)
        self.assertIn("queen", self.vectors_cs)
        canonical = self.vectors_cs.query("Queen")
        scrambled = self.vectors_cs.query("QuEEn")
        self.assertFalse(isclose(canonical, scrambled).all())

        nearest_upper = self.vectors_cs.most_similar(
            "I", return_similarities=False)[0]
        self.assertEqual(nearest_upper, 'myself')
        nearest_lower = self.vectors_cs.most_similar(
            "i", return_similarities=False)[0]
        self.assertEqual(nearest_lower, 'ive')
        self.assertGreater(self.vectors_cs.similarity("a", "A"), .9)

    def test_iter_case_insensitive(self):
        # Iterating (twice over) must agree with positional indexing for
        # the first 1001 entries.
        for _pass in (0, 1):
            for position, entry in enumerate(self.vectors):
                if position > 1000:
                    break
                iter_key, iter_vec = entry
                indexed_key, indexed_vec = self.vectors[position]
                self.assertEqual(iter_key, indexed_key)
                self.assertTrue(isclose(iter_vec[0], indexed_vec[0]))

    def test_iter_case_sensitive(self):
        # Iterating the case-sensitive store (twice over) must agree with
        # positional indexing for the first 1001 entries.
        for _pass in (0, 1):
            for position, entry in enumerate(self.vectors_cs):
                if position > 1000:
                    break
                iter_key, iter_vec = entry
                indexed_key, indexed_vec = self.vectors_cs[position]
                self.assertEqual(iter_key, indexed_key)
                self.assertTrue(isclose(iter_vec[0], indexed_vec[0]))

    def test_index_case_insensitive(self):
        # Walk an explicit iterator alongside positional indexing and check
        # that both agree for at most the first 1001 entries, two passes.
        for _pass in (0, 1):
            cursor = iter(self.vectors)
            checked = min(len(self.vectors), 1001)
            for position in range(checked):
                iter_key, iter_vec = next(cursor)
                indexed_key, indexed_vec = self.vectors[position]
                self.assertEqual(iter_key, indexed_key)
                self.assertTrue(isclose(iter_vec[0], indexed_vec[0]))

    def test_index_case_sensitive(self):
        # Walk an explicit iterator over the case-sensitive store alongside
        # positional indexing; both must agree for at most the first 1001
        # entries, two passes.
        for _pass in (0, 1):
            cursor = iter(self.vectors_cs)
            checked = min(len(self.vectors_cs), 1001)
            for position in range(checked):
                iter_key, iter_vec = next(cursor)
                indexed_key, indexed_vec = self.vectors_cs[position]
                self.assertEqual(iter_key, indexed_key)
                self.assertTrue(isclose(iter_vec[0], indexed_vec[0]))

    def test_bounds(self):
        # The last valid position still yields a (key, vector) pair.
        last = len(self.vectors) - 1
        self.assertIsInstance(self.vectors[last][0], unicode)
        self.assertIsInstance(self.vectors[last][1], np.ndarray)

    @unittest.expectedFailure
    def test_out_of_bounds(self):
        # Marked as an expected failure: reading one position past the end
        # is not supposed to yield a (key, vector) pair.
        past_end = len(self.vectors)
        self.assertIsInstance(self.vectors[past_end][0], unicode)
        self.assertIsInstance(self.vectors[past_end][1], np.ndarray)

    def test_contains(self):
        self.assertTrue("cat" in self.vectors)

    def test_contains_false(self):
        self.assertTrue("blah123" not in self.vectors)

    def test_special_characters(self):
        # Keys with punctuation: one is present, the rest are absent, and
        # querying any of them returns a vector shaped like a normal one.
        present_key = "Wilkes-Barre/Scranton"
        absent = (
            (self.vectors, "out-of-vocabulary"),
            (self.vectors, 'quotation"s'),
            (self.vectors, "quotation's"),
            (self.vectors, "colon;s"),
            (self.vectors, "sh**"),
            (self.vectors_cs, "'s"),
            (self.vectors, '"s'),
        )

        self.assertIn(present_key, self.vectors)
        for store, key in absent:
            self.assertNotIn(key, store)

        for store, key in ((self.vectors, present_key), ) + absent:
            self.assertEqual(
                self.vectors.query("cat").shape,
                store.query(key).shape)

    def test_oov_dim(self):
        # The vector for "*<<<<" has the same shape as the one for "cat".
        oov_shape = self.vectors.query("*<<<<").shape
        known_shape = self.vectors.query("cat").shape
        self.assertEqual(oov_shape, known_shape)

    def test_oov_subword_dim(self):
        # Same shape check as test_oov_dim, against the vectors_sw store.
        oov_shape = self.vectors_sw.query("*<<<<").shape
        known_shape = self.vectors_sw.query("cat").shape
        self.assertEqual(oov_shape, known_shape)

    def test_oov_dim_placeholders(self):
        # With placeholders=5, the vector for "*<<<<" must have the same
        # shape as the one for "cat", and its first component must match
        # the one returned by the placeholder-free store.
        self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                              placeholders=5,
                                              case_insensitive=True,
                                              eager=False)
        # try/finally so the store is closed even when an assertion fails.
        try:
            self.assertEqual(
                self.vectors_placeholders.query("*<<<<").shape,
                self.vectors_placeholders.query("cat").shape)
            self.assertTrue(
                isclose(
                    self.vectors.query("*<<<<")[0],
                    self.vectors_placeholders.query("*<<<<")[0]))
        finally:
            self.vectors_placeholders.close()

    def test_oov_subword_dim_placeholders(self):
        # Same checks as test_oov_dim_placeholders, with the placeholder
        # store opened from MAGNITUDE_SUBWORD_PATH.
        self.vectors_placeholders = Magnitude(
            MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
            placeholders=5,
            case_insensitive=True,
            eager=False)
        # try/finally so the store is closed even when an assertion fails.
        try:
            self.assertEqual(
                self.vectors_placeholders.query("*<<<<").shape,
                self.vectors_placeholders.query("cat").shape)
            self.assertTrue(
                isclose(
                    self.vectors.query("*<<<<")[0],
                    self.vectors_placeholders.query("*<<<<")[0]))
        finally:
            self.vectors_placeholders.close()

    def test_oov_unit_norm(self):
        # The vector returned for "*<<<<<" has a norm close to 1.
        oov_vector = self.vectors.query("*<<<<<")
        length = np.linalg.norm(oov_vector)
        self.assertTrue(isclose(length, 1.0))

    def test_oov_subword_unit_norm(self):
        # The vectors_sw store's vector for "*<<<<<" has a norm close to 1.
        oov_vector = self.vectors_sw.query("*<<<<<")
        length = np.linalg.norm(oov_vector)
        self.assertTrue(isclose(length, 1.0))

    def test_ngram_oov_closeness(self):
        # Similar-looking unknown keys score above .7, an unrelated pair
        # scores below it, and a key is similar to itself.
        similarity = self.vectors.similarity
        self.assertGreater(similarity("uberx", "uberxl"), .7)
        self.assertLess(similarity("uberx", "veryrandom"), .7)
        self.assertGreater(similarity("veryrandom", "veryrandom"), .7)

    def test_ngram_oov_subword_closeness(self):
        # Table of (left, right, comparison, bound) similarity expectations
        # for the vectors_sw store, checked in order.
        similarity = self.vectors_sw.similarity
        above, below = self.assertGreater, self.assertLess
        expectations = (
            ("uberx", "uberxl", above, .7),
            ("uberx", "uber", above, .7),
            ("uberxl", "uber", above, .7),
            ("discriminatoryy", "discriminatory", above, .7),
            ("discriminatoryy", "discriminnatory", above, .8),
            ("uberx", "veryrandom", below, .7),
            ("veryrandom", "veryrandom", above, .7),
            ("hiiiiiiiii", "hi", above, .7),
            ("heeeeeeeey", "hey", above, .7),
            ("heyyyyyyyyyy", "hey", above, .7),
            ("faaaaaate", "fate", above, .65),
        )
        for left, right, compare, bound in expectations:
            compare(similarity(left, right), bound)

    def test_oov_values(self):
        # Two independently opened stores (ngram_oov=False) must both
        # return the pinned first component for each unknown key.
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        pinned_first_component = (
            ("*<", -0.0759614511397),
            ("*<<", 0.00742723997271),
            ("*<<<<", -0.0372075283555),
            ("*<<<<<", -0.0201727917272),
            ("*<<<<<<", -0.0475993225776),
            ("*<<<<<<<", 0.0129938352266),
        )

        # try/finally so both stores are closed even when an assertion
        # fails.
        try:
            for store in (self.vectors_oov_1, self.vectors_oov_2):
                for key, expected in pinned_first_component:
                    self.assertTrue(isclose(store.query(key)[0], expected))
        finally:
            self.vectors_oov_1.close()
            self.vectors_oov_2.close()

    def test_oov_subword_values(self):
        # Two independently opened stores from MAGNITUDE_SUBWORD_PATH
        # (ngram_oov=False) must both return the pinned first component
        # for each unknown key.
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        pinned_first_component = (
            ("discriminatoryy", -0.0573252095591),
            ("*<", -0.0759614511397),
            ("*<<", 0.00742723997271),
            ("uberx", 0.0952671681336),
            ("misssipi", 0.0577835297955),
        )

        # try/finally so both stores are closed even when an assertion
        # fails.
        try:
            for store in (self.vectors_oov_1, self.vectors_oov_2):
                for key, expected in pinned_first_component:
                    self.assertTrue(isclose(store.query(key)[0], expected))
        finally:
            self.vectors_oov_1.close()
            self.vectors_oov_2.close()

    def test_oov_stability(self):
        # Two independently opened stores (ngram_oov=False) must return
        # matching vectors for the same unknown keys, on repeated queries.
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        unknown_keys = ("*<", "*<<", "*<<<", "*<<<<", "*<<<<<", "*<<<<<<",
                        "*<<<<<<<")

        # try/finally so both stores are closed even when an assertion
        # fails.
        try:
            for _ in range(5):
                for key in unknown_keys:
                    self.assertTrue(
                        isclose(self.vectors_oov_1.query(key),
                                self.vectors_oov_2.query(key)).all())
        finally:
            self.vectors_oov_1.close()
            self.vectors_oov_2.close()

    def test_ngram_oov_stability(self):
        # Two independently opened stores (ngram_oov=True) must return
        # matching vectors for the same unknown keys, on repeated queries.
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=True,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=True,
                                       eager=False)
        unknown_keys = ("*<", "*<<", "*<<<", "*<<<<", "*<<<<<", "*<<<<<<",
                        "*<<<<<<<")

        # try/finally so both stores are closed even when an assertion
        # fails.
        try:
            for _ in range(5):
                for key in unknown_keys:
                    self.assertTrue(
                        isclose(self.vectors_oov_1.query(key),
                                self.vectors_oov_2.query(key)).all())
        finally:
            self.vectors_oov_1.close()
            self.vectors_oov_2.close()

    def test_ngram_oov_subword_stability(self):
        # Two independently opened stores from MAGNITUDE_SUBWORD_PATH must
        # return matching vectors for the same unknown keys, on repeated
        # queries.
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       eager=False)
        unknown_keys = ("*<", "*<<", "*<<<", "*<<<<", "*<<<<<", "*<<<<<<",
                        "*<<<<<<<")

        # try/finally so both stores are closed even when an assertion
        # fails.
        try:
            for _ in range(5):
                for key in unknown_keys:
                    self.assertTrue(
                        isclose(self.vectors_oov_1.query(key),
                                self.vectors_oov_2.query(key)).all())
        finally:
            self.vectors_oov_1.close()
            self.vectors_oov_2.close()

    def test_placeholders(self):
        # placeholders=5 extends the returned vector to 305 components
        # while leaving the first component equal to the regular store's.
        self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                              case_insensitive=True,
                                              placeholders=5,
                                              eager=False)
        # try/finally so the store is closed even when an assertion fails.
        try:
            self.assertEqual(
                self.vectors_placeholders.query("cat").shape, (305, ))
            self.assertEqual(
                self.vectors_placeholders.query("cat")[0],
                self.vectors.query("cat")[0])
        finally:
            self.vectors_placeholders.close()

    def test_numpy(self):
        # By default a single-key query returns a numpy array.
        vector = self.vectors.query("cat")
        self.assertIsInstance(vector, np.ndarray)

    def test_list(self):
        # With use_numpy=False a single-key query returns a plain list.
        self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                      case_insensitive=True,
                                      use_numpy=False,
                                      eager=False)
        # try/finally so the store is closed even when the assertion fails.
        try:
            self.assertTrue(isinstance(self.vectors_list.query("cat"), list))
        finally:
            self.vectors_list.close()

    def test_repeated_single(self):
        # Querying the same key twice yields matching vectors.
        key = "cat"
        first = self.vectors.query(key)
        second = self.vectors.query(key)
        self.assertTrue(isclose(first, second).all())

    def test_repeated_multiple(self):
        # Repeating a list query, then a list-of-lists query, yields
        # matching results each time.
        queries = (
            ["I", "saw", "a", "cat"],
            [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]],
        )
        for q in queries:
            first = self.vectors.query(q)
            second = self.vectors.query(q)
            self.assertTrue(isclose(first, second).all())

    def test_multiple(self):
        # A batch of two sentences comes back as a (2, 5, dim) array; the
        # shorter sentence is filled with the padding vector at the end.
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        expected = (
            ('I', 'saw', 'a', 'cat', 'padding'),
            ('He', 'went', 'to', 'the', 'mall'),
        )
        result = self.vectors.query(q)
        self.assertEqual(result.shape, (2, 5, self.vectors.dim))
        for row, names in enumerate(expected):
            for col, name in enumerate(names):
                self.assertTrue(
                    isclose(result[row][col], self.v[name]).all())
        return result

    def test_pad_to_length_right_truncate_none(self):
        # pad_to_length=6 gives a (2, 6, dim) array with padding vectors
        # after the tokens of each sentence.
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        expected = (
            ('I', 'saw', 'a', 'cat', 'padding', 'padding'),
            ('He', 'went', 'to', 'the', 'mall', 'padding'),
        )
        result = self.vectors.query(q, pad_to_length=6)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        for row, names in enumerate(expected):
            for col, name in enumerate(names):
                self.assertTrue(
                    isclose(result[row][col], self.v[name]).all())
        return result

    def test_pad_to_length_truncate_none(self):
        # pad_to_length=6 with no other options: (2, 6, dim) result with
        # padding vectors after the tokens of each sentence.
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        expected = (
            ('I', 'saw', 'a', 'cat', 'padding', 'padding'),
            ('He', 'went', 'to', 'the', 'mall', 'padding'),
        )
        result = self.vectors.query(q, pad_to_length=6)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        for row, names in enumerate(expected):
            for col, name in enumerate(names):
                self.assertTrue(
                    isclose(result[row][col], self.v[name]).all())
        return result

    def test_pad_to_length_left_truncate_none(self):
        # pad_left=True puts the padding vectors before the tokens.
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        expected = (
            ('padding', 'padding', 'I', 'saw', 'a', 'cat'),
            ('padding', 'He', 'went', 'to', 'the', 'mall'),
        )
        result = self.vectors.query(q, pad_to_length=6, pad_left=True)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        for row, names in enumerate(expected):
            for col, name in enumerate(names):
                self.assertTrue(
                    isclose(result[row][col], self.v[name]).all())
        return result

    def test_pad_to_length_truncate_right(self):
        # pad_to_length=3 keeps only the first three tokens of each
        # sentence.
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        expected = (
            ('I', 'saw', 'a'),
            ('He', 'went', 'to'),
        )
        result = self.vectors.query(q, pad_to_length=3)
        self.assertEqual(result.shape, (2, 3, self.vectors.dim))
        for row, names in enumerate(expected):
            for col, name in enumerate(names):
                self.assertTrue(
                    isclose(result[row][col], self.v[name]).all())
        return result

    def test_pad_to_length_truncate_left(self):
        # truncate_left=True keeps only the last three tokens of each
        # sentence.
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        expected = (
            ('saw', 'a', 'cat'),
            ('to', 'the', 'mall'),
        )
        result = self.vectors.query(q, pad_to_length=3, truncate_left=True)
        self.assertEqual(result.shape, (2, 3, self.vectors.dim))
        for row, names in enumerate(expected):
            for col, name in enumerate(names):
                self.assertTrue(
                    isclose(result[row][col], self.v[name]).all())
        return result

    def test_list_multiple(self):
        # With use_numpy=False, list and list-of-lists queries return plain
        # lists whose values match the numpy results of the regular store.
        self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                      case_insensitive=True,
                                      use_numpy=False,
                                      eager=False)
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        # try/finally so the store is closed even when an assertion fails.
        try:
            self.assertTrue(isinstance(self.vectors_list.query(q[0]), list))
            self.assertTrue(
                isclose(self.vectors.query(q[0]),
                        asarray(self.vectors_list.query(q[0]))).all())
            self.assertTrue(isinstance(self.vectors_list.query(q), list))
            self.assertTrue(
                isclose(self.vectors.query(q),
                        asarray(self.vectors_list.query(q))).all())
        finally:
            self.vectors_list.close()

    def test_concat(self):
        # The concatenated store returns the "cat" vector twice, back to
        # back, in one vector of twice the dimension.
        result = self.concat.query("cat")
        self.assertEqual(result.shape, (self.vectors.dim * 2, ))
        for start, stop in ((0, 300), (300, 600)):
            self.assertTrue(
                isclose(result[start:stop], self.v['cat']).all())

    def test_concat_multiple(self):
        # For a list query, each row holds the token's vector in both
        # halves.
        q = ["I", "saw"]
        expected = ('I', 'saw')
        result = self.concat.query(q)
        self.assertEqual(result.shape, (2, self.vectors.dim * 2))
        for row, name in enumerate(expected):
            self.assertTrue(isclose(result[row][0:300], self.v[name]).all())
            self.assertTrue(
                isclose(result[row][300:600], self.v[name]).all())

    def test_concat_multiple_2(self):
        # For a list-of-lists query, each cell holds the token's vector in
        # both halves.
        q = [["I", "saw"], ["He", "went"]]
        expected = (('I', 'saw'), ('He', 'went'))
        result = self.concat.query(q)
        self.assertEqual(result.shape, (2, 2, self.vectors.dim * 2))
        for row, names in enumerate(expected):
            for col, name in enumerate(names):
                cell = result[row][col]
                self.assertTrue(isclose(cell[0:300], self.v[name]).all())
                self.assertTrue(isclose(cell[300:600], self.v[name]).all())

    def test_concat_specific(self):
        # A tuple query supplies one key per half: "cat" fills the first
        # half and "mall" the second.
        q = ("cat", "mall")
        result = self.concat.query(q)
        self.assertEqual(result.shape, (self.vectors.dim * 2, ))
        first_half, second_half = result[0:300], result[300:600]
        self.assertTrue(isclose(first_half, self.v['cat']).all())
        self.assertTrue(isclose(second_half, self.v['mall']).all())

    def test_concat_multiple_specific(self):
        # A list of tuples: each row takes its first half from the tuple's
        # first key and its second half from the second key.
        q = [("I", "He"), ("saw", "went")]
        expected = (('I', 'He'), ('saw', 'went'))
        result = self.concat.query(q)
        self.assertEqual(result.shape, (2, self.vectors.dim * 2))
        for row, (left, right) in enumerate(expected):
            self.assertTrue(isclose(result[row][0:300], self.v[left]).all())
            self.assertTrue(
                isclose(result[row][300:600], self.v[right]).all())

    def test_concat_multiple_2_specific(self):
        # A list of lists of tuples: each cell takes its first half from
        # the tuple's first key and its second half from the second key.
        q = [[("I", "He"), ("saw", "went")], [("He", "I"), ("went", "saw")]]
        expected = (
            (('I', 'He'), ('saw', 'went')),
            (('He', 'I'), ('went', 'saw')),
        )
        result = self.concat.query(q)
        self.assertEqual(result.shape, (2, 2, self.vectors.dim * 2))
        for row, pairs in enumerate(expected):
            for col, (left, right) in enumerate(pairs):
                cell = result[row][col]
                self.assertTrue(isclose(cell[0:300], self.v[left]).all())
                self.assertTrue(isclose(cell[300:600], self.v[right]).all())

    def test_distance(self):
        """The distance between "cat" and "dog" matches the reference value."""
        observed = self.vectors.distance("cat", "dog")
        self.assertTrue(isclose(observed, 0.69145405))

    def test_distance_multiple(self):
        """Distances from "cat" to a list of keys match the reference values."""
        reference = [0.61654216, 0.69145405]
        observed = self.vectors.distance("cat", ["cats", "dog"])
        self.assertTrue(isclose(observed, reference).all())

    def test_similarity(self):
        """The similarity of "cat" and "dog" matches the reference value."""
        observed = self.vectors.similarity("cat", "dog")
        self.assertTrue(isclose(observed, 0.7609457089782209))

    def test_similarity_multiple(self):
        """Similarities of "cat" to a list of keys match the reference values."""
        reference = [0.8099378824686305, 0.7609457089782209]
        observed = self.vectors.similarity("cat", ["cats", "dog"])
        self.assertTrue(isclose(observed, reference).all())

    def test_most_similar_to_given(self):
        """"dog" is chosen for "cat" wherever it sits in the candidate list."""
        candidate_orders = (
            ["dog", "television", "laptop"],
            ["television", "dog", "laptop"],
            ["television", "laptop", "dog"],
        )
        for candidates in candidate_orders:
            chosen = self.vectors.most_similar_to_given("cat", candidates)
            self.assertEqual(chosen, "dog")

    def test_doesnt_match(self):
        """"cereal" is the odd one out wherever it sits among the meal words."""
        word_orders = (
            ["breakfast", "cereal", "lunch", "dinner"],
            ["breakfast", "lunch", "cereal", "dinner"],
            ["breakfast", "lunch", "dinner", "cereal"],
        )
        for words in word_orders:
            outlier = self.vectors.doesnt_match(words)
            self.assertEqual(outlier, "cereal")

    def test_most_similar_case_insensitive(self):
        """Check the top five neighbours of "queen" on ``self.vectors``.

        Keys must match exactly; similarities are compared against the
        reference values with an absolute tolerance of 0.02.
        """
        # Run the search once and split the (key, similarity) pairs instead
        # of issuing the identical query twice.
        results = list(self.vectors.most_similar("queen", topn=5))
        keys = [s[0] for s in results]
        similarities = [s[1] for s in results]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar(self):
        """Check the default neighbour list of "queen" on ``self.vectors_cs``.

        Ten keys are expected, in order; similarities are compared against
        the reference values with an absolute tolerance of 0.02.
        """
        # Run the search once and split the (key, similarity) pairs instead
        # of issuing the identical query twice.
        results = list(self.vectors_cs.most_similar("queen"))
        keys = [s[0] for s in results]
        similarities = [s[1] for s in results]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587, 0.6163408160209656,
                        0.6060680150985718, 0.5923796892166138,
                        0.5908075571060181, 0.5637184381484985
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens',
            u'princess',
            u'king',
            u'monarch',
            u'very_pampered_McElhatton',
            u'Queen',
            u'NYC_anglophiles_aflutter',
            u'Queen_Consort',
            u'princesses',
            u'royal',
        ])

    def test_most_similar_no_similarities(self):
        """With ``return_similarities=False`` the result equals the key list."""
        expected_keys = [
            u'queens',
            u'princess',
            u'king',
            u'monarch',
            u'very_pampered_McElhatton',
            u'Queen',
            u'NYC_anglophiles_aflutter',
            u'Queen_Consort',
            u'princesses',
            u'royal',
        ]
        found = self.vectors_cs.most_similar("queen",
                                             return_similarities=False)
        self.assertEqual(found, expected_keys)

    def test_most_similar_top_5(self):
        """Check ``topn=5`` neighbours of "queen" on ``self.vectors_cs``.

        Keys must match exactly; similarities are compared against the
        reference values with an absolute tolerance of 0.02.
        """
        # Run the search once and split the (key, similarity) pairs instead
        # of issuing the identical query twice.
        results = list(self.vectors_cs.most_similar("queen", topn=5))
        keys = [s[0] for s in results]
        similarities = [s[1] for s in results]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar_min_similarity(self):
        """Check neighbours of "queen" filtered by ``min_similarity=.63``.

        Five keys are expected, in order; similarities are compared against
        the reference values with an absolute tolerance of 0.02.
        """
        # Run the search once and split the (key, similarity) pairs instead
        # of issuing the identical query twice.
        results = list(
            self.vectors_cs.most_similar("queen", min_similarity=.63))
        keys = [s[0] for s in results]
        similarities = [s[1] for s in results]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar_analogy(self):
        """Check the analogy query king + woman - man on ``self.vectors_cs``.

        Ten keys are expected, in order; similarities are compared against
        the reference values with an absolute tolerance of 0.02.
        """
        # Run the search once and split the (key, similarity) pairs instead
        # of issuing the identical query twice.
        results = list(
            self.vectors_cs.most_similar(positive=["king", "woman"],
                                         negative=["man"]))
        keys = [s[0] for s in results]
        similarities = [s[1] for s in results]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7118192315101624, 0.6189674139022827,
                        0.5902431011199951, 0.549946129322052,
                        0.5377321243286133, 0.5236844420433044,
                        0.5235944986343384, 0.518113374710083,
                        0.5098593831062317, 0.5087411403656006
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queen', u'monarch', u'princess', u'crown_prince', u'prince',
            u'kings', u'Queen_Consort', u'queens', u'sultan', u'monarchy'
        ])

    def test_most_similar_cosmul_analogy(self):
        """Check ``most_similar_cosmul`` for king + woman - man.

        Ten keys are expected, in order; similarities are compared against
        the reference values with an absolute tolerance of 0.02.
        """
        # Run the search once and split the (key, similarity) pairs instead
        # of issuing the identical query twice.
        results = list(
            self.vectors_cs.most_similar_cosmul(positive=["king", "woman"],
                                                negative=["man"]))
        keys = [s[0] for s in results]
        similarities = [s[1] for s in results]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.9314123392105103, 0.858533501625061,
                        0.8476565480232239, 0.8150269985198975,
                        0.809981644153595, 0.8089977502822876,
                        0.8027306795120239, 0.801961362361908,
                        0.8009798526763916, 0.7958389520645142
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queen', u'monarch', u'princess', u'Queen_Consort', u'queens',
            u'crown_prince', u'royal_palace', u'monarchy', u'prince',
            u'empress'
        ])

    def test_most_similar_cosmul_min_similarity_analogy(self):
        """Check ``most_similar_cosmul`` with ``min_similarity=.81``.

        The king + woman - man query must yield four keys, in order;
        similarities are compared against the reference values with an
        absolute tolerance of 0.02.
        """
        # Run the search once and split the (key, similarity) pairs instead
        # of issuing the identical query twice.
        results = list(
            self.vectors_cs.most_similar_cosmul(positive=["king", "woman"],
                                                negative=["man"],
                                                min_similarity=.81))
        keys = [s[0] for s in results]
        similarities = [s[1] for s in results]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.9314123392105103, 0.858533501625061,
                        0.8476565480232239, 0.8150269985198975
                    ]),
                    atol=.02).all())
        self.assertEqual(keys,
                         [u'queen', u'monarch', u'princess', u'Queen_Consort'])

    def test_closer_than(self):
        """``closer_than("cat", "dog")`` returns exactly ``["cats"]``."""
        nearer = self.vectors.closer_than("cat", "dog")
        self.assertEqual(nearer, ["cats"])

    def test_most_similar_approx(self):
        """Check an approximate top-15 search for "queen".

        Exactly 15 results are expected; the first similarity must exceed
        0.7 and the last must exceed 0.5.
        """
        # Run the approximate search once and split the (key, similarity)
        # pairs instead of issuing the identical query twice.
        results = list(
            self.vectors_approx.most_similar_approx("queen", topn=15))
        keys = [s[0] for s in results]
        similarities = [s[1] for s in results]
        self.assertEqual(len(keys), 15)
        # Separate assertions so a failure reports which bound was missed.
        self.assertGreater(similarities[0], .7)
        self.assertGreater(similarities[-1], .5)

    @unittest.expectedFailure
    def test_most_similar_approx_failure(self):
        """An approximate search on ``self.vectors`` is expected to fail.

        NOTE(review): presumably that model carries no approximate index;
        confirm against how ``self.vectors`` is built in ``setUp``.
        """
        model = self.vectors
        model.most_similar_approx("queen", topn=15)

    def test_most_similar_approx_low_effort(self):
        """With ``effort=.1`` there are still 15 keys, led by "princess"."""
        hits = self.vectors_approx.most_similar_approx("queen",
                                                       topn=15,
                                                       effort=.1)
        keys = [hit[0] for hit in hits]
        self.assertEqual(len(keys), 15)
        self.assertEqual(keys[0], "princess")

    def test_most_similar_analogy_approx(self):
        """Approximate king + woman - man search ranks "queen" first."""
        hits = self.vectors_approx.most_similar_approx(
            positive=["king", "woman"], negative=["man"], topn=15)
        keys = [hit[0] for hit in hits]
        self.assertEqual(keys[0], "queen")

    def test_feat_length(self):
        """Compare the dimensions of two featurizers.

        ``self.vectors_feat`` must report ``dim == 4`` and a featurizer
        created here with a first argument of 1000 must report ``dim == 5``.
        """
        self.vectors_feat_2 = FeaturizerMagnitude(1000, case_insensitive=True)
        try:
            self.assertEqual(self.vectors_feat.dim, 4)
            self.assertEqual(self.vectors_feat_2.dim, 5)
        finally:
            # Close the temporary featurizer even when an assertion fails.
            self.vectors_feat_2.close()

    def test_feat_stability(self):
        """A second featurizer yields the same vectors as ``self.vectors_feat``.

        The vectors for "VBG" and "PRP" are compared between the fixture
        and a featurizer created here with a first argument of 100.
        """
        self.vectors_feat_2 = FeaturizerMagnitude(100, case_insensitive=True)
        try:
            self.assertTrue(
                isclose(self.vectors_feat.query("VBG"),
                        self.vectors_feat_2.query("VBG")).all())
            self.assertTrue(
                isclose(self.vectors_feat.query("PRP"),
                        self.vectors_feat_2.query("PRP")).all())
        finally:
            # Close the temporary featurizer even when an assertion fails.
            self.vectors_feat_2.close()

    def test_feat_values(self):
        """Spot-check single components of vectors from ``self.vectors_feat``."""
        # (query key, component index, reference value)
        expectations = (
            ("VBG", 0, 0.490634876828),
            ("PRP", 0, 0.463890807802),
            (5, 0, -0.750681075834),
            (5, -1, 1.46936807866e-38),
        )
        for key, position, reference in expectations:
            component = self.vectors_feat.query(key)[position]
            self.assertTrue(isclose(component, reference))
Ejemplo n.º 3
0
from pymagnitude import Magnitude, MagnitudeUtils

# Download
# Default download location: `~/.magnitude/`
# vectors = Magnitude(MagnitudeUtils.download_model("chive-1.1-mc90-aunit", remote_path="https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/"))

# Remote loading
# The example below uses about 300 MB of vectors; it took just under a
# minute in the environment where it was tried.
vectors = Magnitude(
    "https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15.magnitude"
)

# Remote streaming
# Fetches vectors quickly without downloading the file locally.
# vectors = Magnitude("https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.magnitude", stream=True)

# Print the similarity score of each word pair, one per line.
for left, right in (
    ("痛み", "病気"),
    ("痛み", "痛い"),
    ("妊娠中", "妊娠"),
    ("ぶつけた", "ぶつける"),
):
    print(vectors.similarity(left, right))
Ejemplo n.º 4
0
from pymagnitude import Magnitude
# Open the GoogleNews .magnitude model file; `vectors` is also used by the
# `similarity` helper defined further down.
vectors = Magnitude('GoogleNews-vectors-negative300.magnitude')

# Look up and print the vector for "cat".
cat_vector = vectors.query('cat')
print(cat_vector)

# Print the similarity of "cat" and "dog", then the result of a
# most_similar query for "cat" with topn=100.
print(vectors.similarity("cat", "dog"))
print(vectors.most_similar("cat", topn=100))

def similarity(word1, word2):
    """Return the module-level ``vectors`` model's similarity of two words."""
    score = vectors.similarity(word1, word2)
    return score