Example #1
def setUp(self):
    self.vectors = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                             case_insensitive=True, eager=True)
    self.vectors_cs = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                case_insensitive=False, eager=False)
    self.vectors_sw = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                case_insensitive=True, eager=False)
    self.vectors_approx = Magnitude(MagnitudeTest.MAGNITUDE_APPROX_PATH,
                                    case_insensitive=True, eager=False)
    self.tmp_vectors = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                 case_insensitive=True, eager=False)
    self.concat_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                              case_insensitive=True, eager=False)
    self.concat_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                              case_insensitive=True, eager=False)
    self.concat = Magnitude(self.concat_1, self.concat_2)
    self.vectors_feat = FeaturizerMagnitude(100, case_insensitive=True)
    self.v = {
        'padding': self.tmp_vectors._padding_vector(),
        'I': self.tmp_vectors.query("I"),
        'saw': self.tmp_vectors.query("saw"),
        'a': self.tmp_vectors.query("a"),
        'cat': self.tmp_vectors.query("cat"),
        'He': self.tmp_vectors.query("He"),
        'went': self.tmp_vectors.query("went"),
        'to': self.tmp_vectors.query("to"),
        'the': self.tmp_vectors.query("the"),
        'mall': self.tmp_vectors.query("mall"),
        'blah123': self.tmp_vectors.query("blah123")
    }
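
The setUp fixture above wires every test against path constants that are empty in this listing. As a rough, self-contained sketch (the `word2vec.magnitude` path is hypothetical and not part of the original code), the same Magnitude API exercised by these fixtures can be used directly like this:

from pymagnitude import Magnitude

# Hypothetical path to a pre-built .magnitude file (not part of the listing)
vectors = Magnitude("word2vec.magnitude", case_insensitive=True)

print("keys:", len(vectors), "dim:", vectors.dim)
print("cat" in vectors)                        # membership test
vec = vectors.query("cat")                     # single key -> numpy vector
sim = vectors.similarity("cat", "dog")         # cosine similarity
batch = vectors.query([["I", "saw", "a", "cat"],
                       ["He", "went", "to", "the", "mall"]])  # padded batch
vectors.close()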
Example #2
def test_feat_stability(self):
    self.vectors_feat_2 = FeaturizerMagnitude(100, case_insensitive=True)
    self.assertTrue(isclose(self.vectors_feat.query("VBG"),
                            self.vectors_feat_2.query("VBG")).all())
    self.assertTrue(isclose(self.vectors_feat.query("PRP"),
                            self.vectors_feat_2.query("PRP")).all())
    self.vectors_feat_2.close()
Example #3
def test_feat_length(self):
    self.vectors_feat_2 = FeaturizerMagnitude(1000, case_insensitive=True)
    self.assertEqual(self.vectors_feat.dim, 4)
    self.assertEqual(self.vectors_feat_2.dim, 5)
    self.vectors_feat_2.close()
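
Examples #2 and #3 rely on FeaturizerMagnitude being deterministic: two instances built with the same argument hash the same key to the same small vector, and the dimensionality grows with the expected number of distinct feature values (4 dimensions for 100, 5 for 1000 in these tests). A minimal sketch of that behavior, assuming only the pymagnitude package itself:

from numpy import isclose
from pymagnitude import FeaturizerMagnitude

pos_a = FeaturizerMagnitude(100, case_insensitive=True)   # ~100 distinct values expected
pos_b = FeaturizerMagnitude(100, case_insensitive=True)
print(pos_a.dim)                                          # small fixed dim (4 for 100)
print(isclose(pos_a.query("VBG"),
              pos_b.query("VBG")).all())                  # True: hashing is deterministic
pos_a.close()
pos_b.close()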
Example #4
class MagnitudeTest(unittest.TestCase):
    MAGNITUDE_PATH = ""
    MAGNITUDE_SUBWORD_PATH = ""
    MAGNITUDE_APPROX_PATH = ""

    def setUp(self):
        self.vectors = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                 case_insensitive=True,
                                 eager=True)
        self.vectors_cs = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                    case_insensitive=False,
                                    eager=False)
        self.vectors_sw = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                    case_insensitive=True,
                                    eager=False)
        self.vectors_approx = Magnitude(MagnitudeTest.MAGNITUDE_APPROX_PATH,
                                        case_insensitive=True,
                                        eager=False)
        self.tmp_vectors = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                     case_insensitive=True,
                                     eager=False)
        self.concat_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                  case_insensitive=True,
                                  eager=False)
        self.concat_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                  case_insensitive=True,
                                  eager=False)
        self.concat = Magnitude(self.concat_1, self.concat_2)
        self.vectors_feat = FeaturizerMagnitude(100, case_insensitive=True)
        self.v = {
            'padding': self.tmp_vectors._padding_vector(),
            'I': self.tmp_vectors.query("I"),
            'saw': self.tmp_vectors.query("saw"),
            'a': self.tmp_vectors.query("a"),
            'cat': self.tmp_vectors.query("cat"),
            'He': self.tmp_vectors.query("He"),
            'went': self.tmp_vectors.query("went"),
            'to': self.tmp_vectors.query("to"),
            'the': self.tmp_vectors.query("the"),
            'mall': self.tmp_vectors.query("mall"),
            'blah123': self.tmp_vectors.query("blah123")
        }

    def tearDown(self):
        self.vectors.close()
        self.vectors_cs.close()
        self.vectors_sw.close()
        self.tmp_vectors.close()
        self.concat_1.close()
        self.concat_2.close()
        del self.concat
        self.vectors_feat.close()
        gc.collect()

    def test_length(self):
        self.assertEqual(len(self.vectors), 3000000)

    def test_dim(self):
        self.assertEqual(self.vectors.dim, 300)

    def test_index(self):
        self.assertTrue(isinstance(self.vectors[0][0], unicode))
        self.assertTrue(isinstance(self.vectors[0][1], np.ndarray))
        self.assertTrue(isinstance(self.vectors.index(0)[0], unicode))
        self.assertTrue(isinstance(self.vectors.index(0)[1], np.ndarray))
        self.assertTrue(
            isinstance(self.vectors.index(0, return_vector=False), unicode))

    def test_slice(self):
        sliced = self.vectors[0:5]
        self.assertEqual(len(sliced), 5)
        self.assertEqual(sliced[0][0], self.vectors[0][0])
        self.assertTrue(isclose(sliced[0][1], self.vectors[0][1]).all())

    def test_case_insensitive(self):
        some_keys_are_not_lower = False
        for i, (k, _) in enumerate(self.vectors):
            if i > 1000:
                break
            some_keys_are_not_lower = (some_keys_are_not_lower
                                       or k.lower() != k)
        self.assertTrue(some_keys_are_not_lower)
        self.assertTrue("QuEEn" in self.vectors)
        self.assertTrue("QUEEN" in self.vectors)
        self.assertTrue("queen" in self.vectors)
        self.assertTrue(
            isclose(self.vectors.query("Queen"),
                    self.vectors.query("QuEEn")).all())
        self.assertEqual(
            self.vectors.most_similar("I", return_similarities=False)[0],
            'myself')
        self.assertEqual(
            self.vectors.most_similar("i", return_similarities=False)[0],
            'ive')
        self.assertTrue(self.vectors.similarity("a", "A") > .9)

    def test_case_sensitive(self):
        some_keys_are_not_lower = False
        for i, (k, _) in enumerate(self.vectors_cs):
            if i > 1000:
                break
            some_keys_are_not_lower = (some_keys_are_not_lower
                                       or k.lower() != k)
        self.assertTrue(some_keys_are_not_lower)
        self.assertTrue("QuEEn" not in self.vectors_cs)
        self.assertTrue("QUEEN" in self.vectors_cs)
        self.assertTrue("queen" in self.vectors_cs)
        self.assertTrue(not isclose(self.vectors_cs.query("Queen"),
                                    self.vectors_cs.query("QuEEn")).all())
        self.assertEqual(
            self.vectors_cs.most_similar("I", return_similarities=False)[0],
            'myself')
        self.assertEqual(
            self.vectors_cs.most_similar("i", return_similarities=False)[0],
            'ive')
        self.assertTrue(self.vectors_cs.similarity("a", "A") > .9)

    def test_iter_case_insensitive(self):
        for _ in range(2):
            for i, (k, v) in enumerate(self.vectors):
                if i > 1000:
                    break
                k2, v2 = self.vectors[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_iter_case_sensitive(self):
        for _ in range(2):
            for i, (k, v) in enumerate(self.vectors_cs):
                if i > 1000:
                    break
                k2, v2 = self.vectors_cs[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_index_case_insensitive(self):
        for _ in range(2):
            viter = iter(self.vectors)
            for i in range(len(self.vectors)):
                if i > 1000:
                    break
                k, v = next(viter)
                k2, v2 = self.vectors[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_index_case_sensitive(self):
        for _ in range(2):
            viter = iter(self.vectors_cs)
            for i in range(len(self.vectors_cs)):
                if i > 1000:
                    break
                k, v = next(viter)
                k2, v2 = self.vectors_cs[i]
                self.assertEqual(k, k2)
                self.assertTrue(isclose(v[0], v2[0]))

    def test_bounds(self):
        length = len(self.vectors)
        self.assertTrue(isinstance(self.vectors[length - 1][0], unicode))
        self.assertTrue(isinstance(self.vectors[length - 1][1], np.ndarray))

    @unittest.expectedFailure
    def test_out_of_bounds(self):
        length = len(self.vectors)
        self.assertTrue(isinstance(self.vectors[length][0], unicode))
        self.assertTrue(isinstance(self.vectors[length][1], np.ndarray))

    def test_contains(self):
        self.assertTrue("cat" in self.vectors)

    def test_contains_false(self):
        self.assertTrue("blah123" not in self.vectors)

    def test_special_characters(self):
        self.assertTrue("Wilkes-Barre/Scranton" in self.vectors)
        self.assertTrue("out-of-vocabulary" not in self.vectors)
        self.assertTrue('quotation"s' not in self.vectors)
        self.assertTrue("quotation's" not in self.vectors)
        self.assertTrue("colon;s" not in self.vectors)
        self.assertTrue("sh**" not in self.vectors)
        self.assertTrue("'s" not in self.vectors_cs)
        self.assertTrue('"s' not in self.vectors)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("Wilkes-Barre/Scranton").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("out-of-vocabulary").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query('quotation"s').shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("quotation's").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("colon;s").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query("sh**").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors_cs.query("'s").shape)
        self.assertEqual(
            self.vectors.query("cat").shape,
            self.vectors.query('"s').shape)

    def test_oov_dim(self):
        self.assertEqual(
            self.vectors.query("*<<<<").shape,
            self.vectors.query("cat").shape)

    def test_oov_subword_dim(self):
        self.assertEqual(
            self.vectors_sw.query("*<<<<").shape,
            self.vectors_sw.query("cat").shape)

    def test_oov_dim_placeholders(self):
        self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                              placeholders=5,
                                              case_insensitive=True,
                                              eager=False)
        self.assertEqual(
            self.vectors_placeholders.query("*<<<<").shape,
            self.vectors_placeholders.query("cat").shape)
        self.assertTrue(
            isclose(
                self.vectors.query("*<<<<")[0],
                self.vectors_placeholders.query("*<<<<")[0]))
        self.vectors_placeholders.close()

    def test_oov_subword_dim_placeholders(self):
        self.vectors_placeholders = Magnitude(
            MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
            placeholders=5,
            case_insensitive=True,
            eager=False)
        self.assertEqual(
            self.vectors_placeholders.query("*<<<<").shape,
            self.vectors_placeholders.query("cat").shape)
        self.assertTrue(
            isclose(
                self.vectors.query("*<<<<")[0],
                self.vectors_placeholders.query("*<<<<")[0]))
        self.vectors_placeholders.close()

    def test_oov_unit_norm(self):
        self.assertTrue(
            isclose(np.linalg.norm(self.vectors.query("*<<<<<")), 1.0))

    def test_oov_subword_unit_norm(self):
        self.assertTrue(
            isclose(np.linalg.norm(self.vectors_sw.query("*<<<<<")), 1.0))

    def test_ngram_oov_closeness(self):
        self.assertTrue(self.vectors.similarity("uberx", "uberxl") > .7)
        self.assertTrue(self.vectors.similarity("uberx", "veryrandom") < .7)
        self.assertTrue(
            self.vectors.similarity("veryrandom", "veryrandom") > .7)

    def test_ngram_oov_subword_closeness(self):
        self.assertTrue(self.vectors_sw.similarity("uberx", "uberxl") > .7)
        self.assertTrue(self.vectors_sw.similarity("uberx", "uber") > .7)
        self.assertTrue(self.vectors_sw.similarity("uberxl", "uber") > .7)
        self.assertTrue(
            self.vectors_sw.similarity("discriminatoryy", "discriminatory") >
            .7)
        self.assertTrue(
            self.vectors_sw.similarity("discriminatoryy", "discriminnatory") >
            .8)
        self.assertTrue(self.vectors_sw.similarity("uberx", "veryrandom") < .7)
        self.assertTrue(
            self.vectors_sw.similarity("veryrandom", "veryrandom") > .7)
        self.assertTrue(self.vectors_sw.similarity("hiiiiiiiii", "hi") > .7)
        self.assertTrue(self.vectors_sw.similarity("heeeeeeeey", "hey") > .7)
        self.assertTrue(self.vectors_sw.similarity("heyyyyyyyyyy", "hey") > .7)
        self.assertTrue(self.vectors_sw.similarity("faaaaaate", "fate") > .65)

    def test_oov_values(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)

        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<")[0], -0.0372075283555))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<")[0], -0.0201727917272))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<<")[0], -0.0475993225776))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<<<")[0], 0.0129938352266))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<")[0], -0.0372075283555))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<<")[0], -0.0201727917272))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<<<")[0], -0.0475993225776))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<<<<<<")[0], 0.0129938352266))

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_oov_subword_values(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)

        self.assertTrue(
            isclose(
                self.vectors_oov_1.query("discriminatoryy")[0],
                -0.0573252095591))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("uberx")[0], 0.0952671681336))
        self.assertTrue(
            isclose(self.vectors_oov_1.query("misssipi")[0], 0.0577835297955))
        self.assertTrue(
            isclose(
                self.vectors_oov_2.query("discriminatoryy")[0],
                -0.0573252095591))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<")[0], -0.0759614511397))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("*<<")[0], 0.00742723997271))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("uberx")[0], 0.0952671681336))
        self.assertTrue(
            isclose(self.vectors_oov_2.query("misssipi")[0], 0.0577835297955))

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_oov_stability(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=False,
                                       eager=False)

        for i in range(5):
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<"),
                        self.vectors_oov_2.query("*<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<"),
                        self.vectors_oov_2.query("*<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<"),
                        self.vectors_oov_2.query("*<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<"),
                        self.vectors_oov_2.query("*<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<"),
                        self.vectors_oov_2.query("*<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<<")).all())

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_ngram_oov_stability(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=True,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                       case_insensitive=True,
                                       ngram_oov=True,
                                       eager=False)

        for i in range(5):
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<"),
                        self.vectors_oov_2.query("*<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<"),
                        self.vectors_oov_2.query("*<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<"),
                        self.vectors_oov_2.query("*<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<"),
                        self.vectors_oov_2.query("*<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<"),
                        self.vectors_oov_2.query("*<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<<")).all())

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_ngram_oov_subword_stability(self):
        self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       eager=False)
        self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                       case_insensitive=True,
                                       eager=False)

        for i in range(5):
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<"),
                        self.vectors_oov_2.query("*<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<"),
                        self.vectors_oov_2.query("*<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<"),
                        self.vectors_oov_2.query("*<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<"),
                        self.vectors_oov_2.query("*<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<"),
                        self.vectors_oov_2.query("*<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<")).all())
            self.assertTrue(
                isclose(self.vectors_oov_1.query("*<<<<<<<"),
                        self.vectors_oov_2.query("*<<<<<<<")).all())

        self.vectors_oov_1.close()
        self.vectors_oov_2.close()

    def test_placeholders(self):
        self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                              case_insensitive=True,
                                              placeholders=5,
                                              eager=False)
        self.assertEqual(self.vectors_placeholders.query("cat").shape, (305, ))
        self.assertEqual(
            self.vectors_placeholders.query("cat")[0],
            self.vectors.query("cat")[0])
        self.vectors_placeholders.close()

    def test_numpy(self):
        self.assertTrue(isinstance(self.vectors.query("cat"), np.ndarray))

    def test_list(self):
        self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                      case_insensitive=True,
                                      use_numpy=False,
                                      eager=False)
        self.assertTrue(isinstance(self.vectors_list.query("cat"), list))
        self.vectors_list.close()

    def test_repeated_single(self):
        q = "cat"
        result = self.vectors.query(q)
        result_2 = self.vectors.query(q)
        self.assertTrue(isclose(result, result_2).all())

    def test_repeated_multiple(self):
        q = ["I", "saw", "a", "cat"]
        result = self.vectors.query(q)
        result_2 = self.vectors.query(q)
        self.assertTrue(isclose(result, result_2).all())
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q)
        result_2 = self.vectors.query(q)
        self.assertTrue(isclose(result, result_2).all())

    def test_multiple(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q)
        self.assertEqual(result.shape, (2, 5, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[0][3], self.v['cat']).all())
        self.assertTrue(isclose(result[0][4], self.v['padding']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        self.assertTrue(isclose(result[1][3], self.v['the']).all())
        self.assertTrue(isclose(result[1][4], self.v['mall']).all())
        return result

    def test_pad_to_length_right_truncate_none(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=6)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[0][3], self.v['cat']).all())
        self.assertTrue(isclose(result[0][4], self.v['padding']).all())
        self.assertTrue(isclose(result[0][5], self.v['padding']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        self.assertTrue(isclose(result[1][3], self.v['the']).all())
        self.assertTrue(isclose(result[1][4], self.v['mall']).all())
        self.assertTrue(isclose(result[1][5], self.v['padding']).all())
        return result

    def test_pad_to_length_truncate_none(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=6)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[0][3], self.v['cat']).all())
        self.assertTrue(isclose(result[0][4], self.v['padding']).all())
        self.assertTrue(isclose(result[0][5], self.v['padding']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        self.assertTrue(isclose(result[1][3], self.v['the']).all())
        self.assertTrue(isclose(result[1][4], self.v['mall']).all())
        self.assertTrue(isclose(result[1][5], self.v['padding']).all())
        return result

    def test_pad_to_length_left_truncate_none(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=6, pad_left=True)
        self.assertEqual(result.shape, (2, 6, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['padding']).all())
        self.assertTrue(isclose(result[0][1], self.v['padding']).all())
        self.assertTrue(isclose(result[0][2], self.v['I']).all())
        self.assertTrue(isclose(result[0][3], self.v['saw']).all())
        self.assertTrue(isclose(result[0][4], self.v['a']).all())
        self.assertTrue(isclose(result[0][5], self.v['cat']).all())
        self.assertTrue(isclose(result[1][0], self.v['padding']).all())
        self.assertTrue(isclose(result[1][1], self.v['He']).all())
        self.assertTrue(isclose(result[1][2], self.v['went']).all())
        self.assertTrue(isclose(result[1][3], self.v['to']).all())
        self.assertTrue(isclose(result[1][4], self.v['the']).all())
        self.assertTrue(isclose(result[1][5], self.v['mall']).all())
        return result

    def test_pad_to_length_truncate_right(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=3)
        self.assertEqual(result.shape, (2, 3, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['I']).all())
        self.assertTrue(isclose(result[0][1], self.v['saw']).all())
        self.assertTrue(isclose(result[0][2], self.v['a']).all())
        self.assertTrue(isclose(result[1][0], self.v['He']).all())
        self.assertTrue(isclose(result[1][1], self.v['went']).all())
        self.assertTrue(isclose(result[1][2], self.v['to']).all())
        return result

    def test_pad_to_length_truncate_left(self):
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        result = self.vectors.query(q, pad_to_length=3, truncate_left=True)
        self.assertEqual(result.shape, (2, 3, self.vectors.dim))
        self.assertTrue(isclose(result[0][0], self.v['saw']).all())
        self.assertTrue(isclose(result[0][1], self.v['a']).all())
        self.assertTrue(isclose(result[0][2], self.v['cat']).all())
        self.assertTrue(isclose(result[1][0], self.v['to']).all())
        self.assertTrue(isclose(result[1][1], self.v['the']).all())
        self.assertTrue(isclose(result[1][2], self.v['mall']).all())
        return result

    def test_list_multiple(self):
        self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                      case_insensitive=True,
                                      use_numpy=False,
                                      eager=False)
        q = [["I", "saw", "a", "cat"], ["He", "went", "to", "the", "mall"]]
        self.assertTrue(isinstance(self.vectors_list.query(q[0]), list))
        self.assertTrue(
            isclose(self.vectors.query(q[0]),
                    asarray(self.vectors_list.query(q[0]))).all())
        self.assertTrue(isinstance(self.vectors_list.query(q), list))
        self.assertTrue(
            isclose(self.vectors.query(q),
                    asarray(self.vectors_list.query(q))).all())
        self.vectors_list.close()

    def test_concat(self):
        q = "cat"
        result = self.concat.query(q)
        self.assertEqual(result.shape, (self.vectors.dim * 2, ))
        self.assertTrue(isclose(result[0:300], self.v['cat']).all())
        self.assertTrue(isclose(result[300:600], self.v['cat']).all())

    def test_concat_multiple(self):
        q = ["I", "saw"]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][300:600], self.v['I']).all())
        self.assertTrue(isclose(result[1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[1][300:600], self.v['saw']).all())

    def test_concat_multiple_2(self):
        q = [["I", "saw"], ["He", "went"]]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][0][300:600], self.v['I']).all())
        self.assertTrue(isclose(result[0][1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[0][1][300:600], self.v['saw']).all())
        self.assertTrue(isclose(result[1][0][0:300], self.v['He']).all())
        self.assertTrue(isclose(result[1][0][300:600], self.v['He']).all())
        self.assertTrue(isclose(result[1][1][0:300], self.v['went']).all())
        self.assertTrue(isclose(result[1][1][300:600], self.v['went']).all())

    def test_concat_specific(self):
        q = ("cat", "mall")
        result = self.concat.query(q)
        self.assertEqual(result.shape, (self.vectors.dim * 2, ))
        self.assertTrue(isclose(result[0:300], self.v['cat']).all())
        self.assertTrue(isclose(result[300:600], self.v['mall']).all())

    def test_concat_multiple_specific(self):
        q = [("I", "He"), ("saw", "went")]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][300:600], self.v['He']).all())
        self.assertTrue(isclose(result[1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[1][300:600], self.v['went']).all())

    def test_concat_multiple_2_specific(self):
        q = [[("I", "He"), ("saw", "went")], [("He", "I"), ("went", "saw")]]
        result = self.concat.query(q)
        self.assertEqual(result.shape, (
            2,
            2,
            self.vectors.dim * 2,
        ))
        self.assertTrue(isclose(result[0][0][0:300], self.v['I']).all())
        self.assertTrue(isclose(result[0][0][300:600], self.v['He']).all())
        self.assertTrue(isclose(result[0][1][0:300], self.v['saw']).all())
        self.assertTrue(isclose(result[0][1][300:600], self.v['went']).all())
        self.assertTrue(isclose(result[1][0][0:300], self.v['He']).all())
        self.assertTrue(isclose(result[1][0][300:600], self.v['I']).all())
        self.assertTrue(isclose(result[1][1][0:300], self.v['went']).all())
        self.assertTrue(isclose(result[1][1][300:600], self.v['saw']).all())

    def test_distance(self):
        self.assertTrue(
            isclose(self.vectors.distance("cat", "dog"), 0.69145405))

    def test_distance_multiple(self):
        self.assertTrue(
            isclose(self.vectors.distance("cat", ["cats", "dog"]),
                    [0.61654216, 0.69145405]).all())

    def test_similarity(self):
        self.assertTrue(
            isclose(self.vectors.similarity("cat", "dog"), 0.7609457089782209))

    def test_similarity_multiple(self):
        self.assertTrue(
            isclose(self.vectors.similarity("cat", ["cats", "dog"]),
                    [0.8099378824686305, 0.7609457089782209]).all())

    def test_most_similar_to_given(self):
        self.assertEqual(
            self.vectors.most_similar_to_given(
                "cat", ["dog", "television", "laptop"]), "dog")
        self.assertEqual(
            self.vectors.most_similar_to_given(
                "cat", ["television", "dog", "laptop"]), "dog")
        self.assertEqual(
            self.vectors.most_similar_to_given(
                "cat", ["television", "laptop", "dog"]), "dog")

    def test_doesnt_match(self):
        self.assertEqual(
            self.vectors.doesnt_match(
                ["breakfast", "cereal", "lunch", "dinner"]), "cereal")
        self.assertEqual(
            self.vectors.doesnt_match(
                ["breakfast", "lunch", "cereal", "dinner"]), "cereal")
        self.assertEqual(
            self.vectors.doesnt_match(
                ["breakfast", "lunch", "dinner", "cereal"]), "cereal")

    def test_most_similar_case_insensitive(self):
        keys = [s[0] for s in self.vectors.most_similar("queen", topn=5)]
        similarities = [
            s[1] for s in self.vectors.most_similar("queen", topn=5)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar(self):
        keys = [s[0] for s in self.vectors_cs.most_similar("queen")]
        similarities = [s[1] for s in self.vectors_cs.most_similar("queen")]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587, 0.6163408160209656,
                        0.6060680150985718, 0.5923796892166138,
                        0.5908075571060181, 0.5637184381484985
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens',
            u'princess',
            u'king',
            u'monarch',
            u'very_pampered_McElhatton',
            u'Queen',
            u'NYC_anglophiles_aflutter',
            u'Queen_Consort',
            u'princesses',
            u'royal',
        ])

    def test_most_similar_no_similarities(self):
        keys = self.vectors_cs.most_similar("queen", return_similarities=False)
        self.assertEqual(keys, [
            u'queens',
            u'princess',
            u'king',
            u'monarch',
            u'very_pampered_McElhatton',
            u'Queen',
            u'NYC_anglophiles_aflutter',
            u'Queen_Consort',
            u'princesses',
            u'royal',
        ])

    def test_most_similar_top_5(self):
        keys = [s[0] for s in self.vectors_cs.most_similar("queen", topn=5)]
        similarities = [
            s[1] for s in self.vectors_cs.most_similar("queen", topn=5)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar_min_similarity(self):
        keys = [
            s[0]
            for s in self.vectors_cs.most_similar("queen", min_similarity=.63)
        ]
        similarities = [
            s[1]
            for s in self.vectors_cs.most_similar("queen", min_similarity=.63)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7399442791938782, 0.7070531845092773,
                        0.6510956287384033, 0.6383601427078247,
                        0.6357027292251587
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queens', u'princess', u'king', u'monarch',
            u'very_pampered_McElhatton'
        ])

    def test_most_similar_analogy(self):
        keys = [
            s[0]
            for s in self.vectors_cs.most_similar(positive=["king", "woman"],
                                                  negative=["man"])
        ]
        similarities = [
            s[1]
            for s in self.vectors_cs.most_similar(positive=["king", "woman"],
                                                  negative=["man"])
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.7118192315101624, 0.6189674139022827,
                        0.5902431011199951, 0.549946129322052,
                        0.5377321243286133, 0.5236844420433044,
                        0.5235944986343384, 0.518113374710083,
                        0.5098593831062317, 0.5087411403656006
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queen', u'monarch', u'princess', u'crown_prince', u'prince',
            u'kings', u'Queen_Consort', u'queens', u'sultan', u'monarchy'
        ])

    def test_most_similar_cosmul_analogy(self):
        keys = [
            s[0] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"], negative=["man"])
        ]
        similarities = [
            s[1] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"], negative=["man"])
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.9314123392105103, 0.858533501625061,
                        0.8476565480232239, 0.8150269985198975,
                        0.809981644153595, 0.8089977502822876,
                        0.8027306795120239, 0.801961362361908,
                        0.8009798526763916, 0.7958389520645142
                    ]),
                    atol=.02).all())
        self.assertEqual(keys, [
            u'queen', u'monarch', u'princess', u'Queen_Consort', u'queens',
            u'crown_prince', u'royal_palace', u'monarchy', u'prince',
            u'empress'
        ])

    def test_most_similar_cosmul_min_similarity_analogy(self):
        keys = [
            s[0] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"],
                negative=["man"],
                min_similarity=.81)
        ]
        similarities = [
            s[1] for s in self.vectors_cs.most_similar_cosmul(
                positive=["king", "woman"],
                negative=["man"],
                min_similarity=.81)
        ]
        self.assertTrue(
            isclose(asarray(similarities),
                    asarray([
                        0.9314123392105103, 0.858533501625061,
                        0.8476565480232239, 0.8150269985198975
                    ]),
                    atol=.02).all())
        self.assertEqual(keys,
                         [u'queen', u'monarch', u'princess', u'Queen_Consort'])

    def test_closer_than(self):
        self.assertEqual(self.vectors.closer_than("cat", "dog"), ["cats"])

    def test_most_similar_approx(self):
        keys = [
            s[0]
            for s in self.vectors_approx.most_similar_approx("queen", topn=15)
        ]
        similarities = [
            s[1]
            for s in self.vectors_approx.most_similar_approx("queen", topn=15)
        ]
        self.assertEqual(len(keys), 15)
        self.assertTrue(similarities[0] > .7 and similarities[-1] > .5)

    @unittest.expectedFailure
    def test_most_similar_approx_failure(self):
        self.vectors.most_similar_approx("queen", topn=15)

    def test_most_similar_approx_low_effort(self):
        keys = [
            s[0] for s in self.vectors_approx.most_similar_approx(
                "queen", topn=15, effort=.1)
        ]
        self.assertEqual(len(keys), 15)
        self.assertEqual(keys[0], "princess")

    def test_most_similar_analogy_approx(self):
        keys = [
            s[0] for s in self.vectors_approx.most_similar_approx(
                positive=["king", "woman"], negative=["man"], topn=15)
        ]
        self.assertEqual(keys[0], "queen")

    def test_feat_length(self):
        self.vectors_feat_2 = FeaturizerMagnitude(1000, case_insensitive=True)
        self.assertEqual(self.vectors_feat.dim, 4)
        self.assertEqual(self.vectors_feat_2.dim, 5)
        self.vectors_feat_2.close()

    def test_feat_stability(self):
        self.vectors_feat_2 = FeaturizerMagnitude(100, case_insensitive=True)
        self.assertTrue(
            isclose(self.vectors_feat.query("VBG"),
                    self.vectors_feat_2.query("VBG")).all())
        self.assertTrue(
            isclose(self.vectors_feat.query("PRP"),
                    self.vectors_feat_2.query("PRP")).all())
        self.vectors_feat_2.close()

    def test_feat_values(self):
        self.assertTrue(
            isclose(self.vectors_feat.query("VBG")[0], 0.490634876828))
        self.assertTrue(
            isclose(self.vectors_feat.query("PRP")[0], 0.463890807802))
        self.assertTrue(isclose(
            self.vectors_feat.query(5)[0], -0.750681075834))
        self.assertTrue(
            isclose(self.vectors_feat.query(5)[-1], 1.46936807866e-38))
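
The test case above leaves MAGNITUDE_PATH, MAGNITUDE_SUBWORD_PATH, and MAGNITUDE_APPROX_PATH empty, so the suite only runs once they point at real pre-built files. A hedged sketch of how one might run it (the file names below are assumptions consistent with the 3,000,000-key, 300-dimension vectors the tests expect, not part of the listing):

import unittest

# Assumed local copies of pre-built models (file names are placeholders)
MagnitudeTest.MAGNITUDE_PATH = "GoogleNews-vectors-negative300.magnitude"
MagnitudeTest.MAGNITUDE_SUBWORD_PATH = "wiki-news-300d-1M-subword.magnitude"
MagnitudeTest.MAGNITUDE_APPROX_PATH = "GoogleNews-vectors-negative300.approx.magnitude"

suite = unittest.TestLoader().loadTestsFromTestCase(MagnitudeTest)
unittest.TextTestRunner(verbosity=2).run(suite)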
Example #5
def convert(input_file_path,
            output_file_path=None,
            precision=DEFAULT_PRECISION,
            subword=False,
            subword_start=DEFAULT_NGRAM_BEG,
            subword_end=DEFAULT_NGRAM_END,
            approx=False,
            approx_trees=None,
            vocab_path=None):

    files_to_remove = []
    subword = int(subword)
    approx = int(approx)

    # If no output_file_path specified, create it in a tempdir
    if output_file_path is None:
        output_file_path = os.path.join(
            tempfile.mkdtemp(),
            fast_md5_file(input_file_path) + '.magnitude')
        if os.path.isfile(output_file_path):
            try:
                conn = sqlite3.connect(output_file_path)
                db = conn.cursor()
                db.execute(
                    "SELECT value FROM magnitude_format WHERE key='size'") \
                    .fetchall()[0][0]
                conn.close()
                # File already exists and is functioning
                return output_file_path
            except BaseException:
                pass

    # Check args
    meta_1_path = None
    meta_2_path = None
    input_is_text = input_file_path.endswith('.txt') or \
        input_file_path.endswith('.vec')
    input_is_binary = input_file_path.endswith('.bin')
    input_is_hdf5 = input_file_path.endswith('.hdf5')
    input_is_hdf5_weights = input_file_path.endswith('_weights.hdf5')
    if not input_is_text and not input_is_binary and not input_is_hdf5:
        exit("The input file path must be `.txt`, `.bin`, `.vec`, or `.hdf5`")
    if not output_file_path.endswith('.magnitude'):
        exit("The output file path file path must be `.magnitude`")
    if vocab_path and not vocab_path.endswith(".magnitude"):
        exit("The vocab file path file path must be `.magnitude`")

    # Detect ELMo and ELMo options file
    input_is_elmo = False
    elmo_options_path = None
    if input_is_hdf5:
        elmo_options_path = input_file_path[0:-13] + \
            '_options.json' if input_is_hdf5_weights else input_file_path[0:-5] + '.json'  # noqa
        if not os.path.isfile(elmo_options_path):
            exit("Expected `" + elmo_options_path +
                 "` to exist. ELMo models require a JSON options file.")
        input_is_elmo = True
        meta_1_path = input_file_path
        meta_2_path = elmo_options_path

    # Detect GloVe format and convert to word2vec if detected
    detected_glove = False
    if input_is_text:
        with io.open(input_file_path,
                     mode="r",
                     encoding="utf-8",
                     errors="ignore") as ifp:
            line1 = None
            line2 = None
            while line1 is None or line2 is None:
                line = ifp.readline().strip()
                if len(line) > 0:
                    if line1 is None:
                        line1 = line
                    elif line2 is None:
                        line2 = line
            line1 = line1.replace('\t', ' ')
            line2 = line2.replace('\t', ' ')
            line1 = line1.split()
            line2 = line2.split()
            if len(line1) == len(line2):  # No header line present
                detected_glove = True
    if detected_glove:
        eprint("Detected GloVe format! Converting to word2vec format first..."
               "(this may take some time)")
        temp_file_path = os.path.join(
            tempfile.mkdtemp(),
            os.path.basename(input_file_path) + '.txt')
        try:
            import gensim
        except ImportError:
            raise ImportError("You need gensim >= 3.3.0 installed with pip \
                (`pip install gensim`) to convert GloVe files.")
        gensim.scripts.glove2word2vec.glove2word2vec(input_file_path,
                                                     temp_file_path)
        input_file_path = temp_file_path
        files_to_remove.append(temp_file_path)

    # Open and load vector file
    eprint("Loading vectors... (this may take some time)")
    number_of_keys = None
    dimensions = None
    if input_is_binary:
        try:
            from gensim.models import KeyedVectors
        except ImportError:
            raise ImportError("You need gensim >= 3.3.0 installed with pip \
                (`pip install gensim`) to convert binary files.")
        keyed_vectors = KeyedVectors.load_word2vec_format(
            input_file_path, binary=input_is_binary)
        number_of_keys = len(keyed_vectors.vectors)
        dimensions = len(keyed_vectors.vectors[0])
    elif input_is_text:
        # Read it manually instead of with gensim so we can stream large models
        class KeyedVectors:
            pass

        def keyed_vectors_generator():
            number_of_keys, dimensions = (None, None)
            f = io.open(input_file_path,
                        mode="r",
                        encoding="utf-8",
                        errors="ignore")
            first_line = True
            for line in f:
                line_split = line.strip().replace('\t', ' ').split()
                if len(line_split) == 0:
                    continue
                if first_line:
                    first_line = False
                    number_of_keys = int(line_split[0])
                    dimensions = int(line_split[1])
                    yield (number_of_keys, dimensions)
                else:
                    empty_key = len(line_split) == dimensions
                    vec_floats = line_split if empty_key else line_split[1:]
                    key = "" if empty_key else line_split[0]
                    if len(vec_floats) > dimensions:
                        key = " ".join([key] + vec_floats[0:len(vec_floats) -
                                                          dimensions])
                        vec_floats = vec_floats[len(vec_floats) - dimensions:]
                    vector = np.asarray([float(elem) for elem in vec_floats])
                    yield (key, vector)

        keyed_vectors = KeyedVectors()
        kv_gen = keyed_vectors_generator()
        number_of_keys, dimensions = next(kv_gen)
        kv_gen_1, kv_gen_2 = tee(kv_gen)
        keyed_vectors.vectors = imap(lambda kv: kv[1], kv_gen_1)
        keyed_vectors.index2word = imap(lambda kv: kv[0], kv_gen_2)
    elif input_is_elmo:
        vocab_magnitude = None
        if vocab_path:
            vocab_magnitude = Magnitude(vocab_path,
                                        eager=False,
                                        lazy_loading=1)
        else:
            vocab_magnitude = FeaturizerMagnitude(100)

        class KeyedVectors:
            pass

        # NOTE: `elmo = None` is a placeholder here; before embed_batch() is
        # called below, it has to be replaced with a real ELMo embedder built
        # from input_file_path and elmo_options_path (e.g. allennlp's
        # ElmoEmbedder), which this excerpt does not construct.
        elmo = None
        keyed_vectors = KeyedVectors()
        number_of_keys = len(vocab_magnitude)
        dimensions = np.concatenate(elmo.embed_batch([["test"]])[0],
                                    axis=1).flatten().shape[0]
        kv_gen_1, kv_gen_2 = tee(vocab_magnitude)
        keyed_vectors.vectors = chain.from_iterable(
            imap(
                lambda b: imap(lambda e: np.concatenate(e, axis=1).flatten(),
                               elmo.embed_batch(list(imap(lambda k: [k], b)))),
                ibatch(imap(lambda kv: kv[0], kv_gen_1), 1000)))
        keyed_vectors.index2word = imap(lambda kv: kv[0], kv_gen_2)
    else:

        class KeyedVectors:
            pass

        keyed_vectors = KeyedVectors()
        number_of_keys = 0
        dimensions = 0
        keyed_vectors.vectors = []
        keyed_vectors.index2word = []

    eprint("Found %d key(s)" % number_of_keys)
    eprint("Each vector has %d dimension(s)" % dimensions)

    # Delete files if they exist
    try_deleting(output_file_path)
    try_deleting(output_file_path + "-shm")
    try_deleting(output_file_path + "-wal")

    # Temporarily re-direct the output to a tmp file
    output_file_path_tmp = output_file_path + '.tmp'
    output_file_path_orig = output_file_path
    output_file_path = output_file_path_tmp

    # Delete files if they exist
    try_deleting(output_file_path)
    try_deleting(output_file_path + "-shm")
    try_deleting(output_file_path + "-wal")

    # Connect to magnitude datastore
    conn = sqlite3.connect(output_file_path)
    db = conn.cursor()

    # Make the database fast
    conn.isolation_level = None
    db.execute("PRAGMA synchronous = OFF;")
    db.execute("PRAGMA default_synchronous = OFF;")
    db.execute("PRAGMA journal_mode = WAL;")
    db.execute("PRAGMA count_changes = OFF;")

    # Create table structure
    eprint("Creating magnitude format...")
    db.execute("DROP TABLE IF EXISTS `magnitude`;")
    db.execute("""
        CREATE TABLE `magnitude` (
            key TEXT COLLATE NOCASE,
            """ + ",\n".join([("dim_%d INTEGER" % i)
                              for i in range(dimensions)]) +
               ",\nmagnitude REAL" + """
        );
    """)
    db.execute("""
        CREATE TABLE `magnitude_format` (
            key TEXT COLLATE NOCASE,
            value INTEGER
        );
    """)
    if subword:
        db.execute("""
            CREATE VIRTUAL TABLE `magnitude_subword`
            USING fts3(
                char_ngrams,
                num_ngrams
            );
        """)
    if approx:
        db.execute("""
            CREATE TABLE `magnitude_approx` (
                trees INTEGER,
                index_file BLOB
            );
        """)

    metas = [('meta_1', meta_1_path), ('meta_2', meta_2_path)]
    for meta_name, meta_path in metas:
        if meta_path:
            db.execute("""
                CREATE TABLE `magnitude_""" + meta_name + """` (
                    meta_file BLOB
                );
            """)

    # Create annoy index
    approx_index = None
    if approx:
        approx_index = AnnoyIndex(dimensions)

    # Write vectors
    eprint("Writing vectors... (this may take some time)")
    insert_query = """
        INSERT INTO `magnitude`(
            key,
            """ + \
        ",\n".join([("dim_%d" % i) for i in range(dimensions)]) + \
        ",\nmagnitude" \
        + """)
        VALUES (
            """ + \
        (",\n".join(["?"] * (dimensions + 2))) \
        + """
        );
    """
    insert_subword_query = """
        INSERT INTO `magnitude_subword`(
            char_ngrams,
            num_ngrams
        )
        VALUES (
            ?, ?
        );
    """
    counters = [Counter() for i in range(dimensions)]
    key_vectors_iterable = izip(keyed_vectors.index2word,
                                keyed_vectors.vectors)
    progress = -1
    db.execute("BEGIN;")
    for i, (key, vector) in enumerate(key_vectors_iterable):
        current_progress = int((float(i) / float(number_of_keys)) * 100)
        if current_progress > progress:
            progress = current_progress
            eprint("%d%% completed" % progress)
        if i % 100000 == 0:
            db.execute("COMMIT;")
            db.execute("BEGIN;")
        magnitude = np.linalg.norm(vector)
        vector = vector / magnitude
        epsilon = np.random.choice(
            [-1.0 / (10**precision), 1.0 / (10**precision)], dimensions)
        vector = epsilon if np.isnan(vector).any() else vector
        for d, v in enumerate(vector):
            counters[d][int(v * 100)] += 1
        db.execute(insert_query, (key, ) +
                   tuple(int(round(v * (10**precision)))
                         for v in vector) + (float(magnitude), ))  # noqa
        if subword:
            ngrams = set(
                (n.lower()
                 for n in char_ngrams(BOW + key +
                                      EOW, subword_start, subword_end)))
            num_ngrams = len(ngrams) * 4
            ngrams = set(
                (n for n in ngrams
                 if not any([c in SQLITE_TOKEN_SPLITTERS for c in n])))
            db.execute(insert_subword_query, (" ".join(ngrams), num_ngrams))
        if approx:
            approx_index.add_item(i, vector)
    eprint("Committing written vectors... (this may take some time)")
    db.execute("COMMIT;")

    # Figure out which dimensions have the most entropy
    entropies = [(d, entropy(counter)) for d, counter in enumerate(counters)]
    entropies.sort(key=lambda e: e[1], reverse=True)
    for e in entropies:
        eprint("Entropy of dimension %d is %f" % (e[0], e[1]))
    highest_entropy_dimensions = [e[0] for e in entropies]

    # Writing metadata
    insert_format_query = """
        INSERT INTO `magnitude_format`(
            key,
            value
        )
        VALUES (
            ?, ?
        );
    """

    db.execute(insert_format_query, ('version', CONVERTER_VERSION))
    db.execute(insert_format_query, ('elmo', input_is_elmo))
    db.execute(insert_format_query, ('size', number_of_keys))
    db.execute(insert_format_query, ('dim', dimensions))
    db.execute(insert_format_query, ('precision', precision))
    if subword:
        db.execute(insert_format_query, ('subword', subword))
        db.execute(insert_format_query, ('subword_start', subword_start))
        db.execute(insert_format_query, ('subword_end', subword_end))
    if approx:
        if approx_trees is None:
            approx_trees = max(50, int((number_of_keys / 3000000.0) * 50.0))
        db.execute(insert_format_query, ('approx', approx))
        db.execute(insert_format_query, ('approx_trees', approx_trees))
    for d in highest_entropy_dimensions:
        db.execute(insert_format_query, ('entropy', d))

    # Create indices
    eprint("Creating search index... (this may take some time)")
    db.execute("CREATE INDEX `magnitude_key_idx` ON `magnitude` (key);")
    for i in highest_entropy_dimensions[0:1]:
        eprint("Creating spatial search index for dimension %d "
               "(it has high entropy)... (this may take some time)" % i)
        db.execute("""
            CREATE INDEX `magnitude_dim_%d_idx` ON `magnitude` (dim_%d);
        """ % (i, i))

    # Write approximate index to the database
    if approx:
        eprint("Creating approximate nearest neighbors index... \
(this may take some time)")
        approx_index.build(approx_trees)
        approx_index_file_path = os.path.join(
            tempfile.mkdtemp(),
            fast_md5_file(input_file_path) + '.ann')
        eprint("Dumping approximate nearest neighbors index... \
(this may take some time)")
        approx_index.save(approx_index_file_path)
        eprint("Compressing approximate nearest neighbors index... \
(this may take some time)")
        chunk_size = 104857600
        full_size = os.path.getsize(approx_index_file_path)
        insert_approx_query = """
            INSERT INTO magnitude_approx(trees, index_file) VALUES (?, ?);
        """
        with open(approx_index_file_path, 'rb') as ifh, \
                lz4.frame.LZ4FrameCompressor() as compressor:
            for i, chunk in enumerate(iter(partial(ifh.read, chunk_size),
                                           b'')):
                if i == 0:
                    chunk = compressor.begin() + compressor.compress(chunk)
                else:
                    chunk = compressor.compress(chunk)
                eprint(str((ifh.tell() / float(full_size)) * 100.0) + "%")
                if len(chunk) > 0:
                    db.execute(insert_approx_query,
                               (approx_trees, sqlite3.Binary(chunk)))
            chunk = compressor.flush()
            if len(chunk) > 0:
                db.execute(insert_approx_query,
                           (approx_trees, sqlite3.Binary(chunk)))
        files_to_remove.append(approx_index_file_path)

    for meta_name, meta_path in metas:
        if not meta_path:
            continue
        eprint("Compressing meta file... \
(this may take some time)")
        chunk_size = 104857600
        full_size = os.path.getsize(meta_path)
        insert_meta_query = """
            INSERT INTO magnitude_""" + meta_name + """(meta_file)
            VALUES (?);
        """
        with open(meta_path, 'rb') as ifh, \
                lz4.frame.LZ4FrameCompressor() as compressor:
            for i, chunk in enumerate(iter(partial(ifh.read, chunk_size),
                                           b'')):
                if i == 0:
                    chunk = compressor.begin() + compressor.compress(chunk)
                else:
                    chunk = compressor.compress(chunk)
                eprint(str((ifh.tell() / float(full_size)) * 100.0) + "%")
                if len(chunk) > 0:
                    db.execute(insert_meta_query, (sqlite3.Binary(chunk), ))
            chunk = compressor.flush()
            if len(chunk) > 0:
                db.execute(insert_meta_query, (sqlite3.Binary(chunk), ))

    # Clean up
    if len(files_to_remove) > 0:
        eprint("Cleaning up temporary files...")
        for file_to_remove in files_to_remove:
            try_deleting(file_to_remove)

    # Calculate max duplicate keys
    eprint("Finding duplicate keys... (this may take some time)")
    duplicate_keys_query = db.execute("""
        SELECT MAX(key_count)
        FROM (
            SELECT COUNT(key)
            AS key_count
            FROM magnitude
            GROUP BY key
        );
    """).fetchall()
    max_duplicate_keys = (duplicate_keys_query[0][0]
                          if duplicate_keys_query[0][0] is not None else 1
                          )  # noqa
    eprint("Found %d as the maximum number of duplicate key(s)" %
           max_duplicate_keys)
    db.execute(insert_format_query, ('max_duplicate_keys', max_duplicate_keys))

    # VACUUM
    eprint("Vacuuming to save space... (this may take some time)")
    db.execute("VACUUM;")

    # Restore safe database settings
    db.execute("PRAGMA synchronous = FULL;")
    db.execute("PRAGMA default_synchronous = FULL;")
    db.execute("PRAGMA journal_mode = DELETE;")
    db.execute("PRAGMA count_changes = ON;")

    # Clean up connection
    conn.commit()
    conn.close()
    files_to_remove.append(output_file_path + "-shm")
    files_to_remove.append(output_file_path + "-wal")

    # Clean up
    if len(files_to_remove) > 0:
        eprint("Cleaning up temporary files...")
        for file_to_remove in files_to_remove:
            try_deleting(file_to_remove)

    # Rename the temporary output file to the real output
    os.rename(output_file_path, output_file_path_orig)
    output_file_path = output_file_path_orig

    # Print success
    eprint("Successfully converted '%s' to '%s'!" %
           (input_file_path, output_file_path))

    return output_file_path
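
For reference, a sketch of invoking convert() directly to build a `.magnitude` file (the input and output paths are placeholders; subword and approx mirror the optional n-gram table and Annoy index built above):

# Hypothetical paths; any `.txt`, `.vec`, `.bin`, or `.hdf5` input accepted above works
out_path = convert("GoogleNews-vectors-negative300.bin",
                   output_file_path="GoogleNews-vectors-negative300.magnitude",
                   subword=True,   # also build the character n-gram table for OOV lookups
                   approx=True)    # also build the Annoy index used by most_similar_approx
print("Wrote %s" % out_path)

In the packaged library this routine is normally reached through a command-line entry point (roughly `python -m pymagnitude.converter -i <input> -o <output>`); that wrapper is assumed here and is not part of this listing.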