Ejemplo n.º 1
0
    def testVocab(self):
        """Test word2vec vocabulary building.

        Builds the vocabulary explicitly with ``min_count=1`` and with the
        default parameters, checking vocabulary size and the binary code
        assigned to 'the'. Also checks that constructing a model from an empty
        corpus, or from one that is filtered out entirely, raises
        ``RuntimeError``.
        """
        corpus = LeeCorpus()
        total_words = sum(len(sentence) for sentence in corpus)

        # try vocab building explicitly, using all words
        model = word2vec.Word2Vec(min_count=1, hs=1, negative=0)
        model.build_vocab(corpus)
        self.assertEqual(len(model.vocab), 6981)
        # with min_count=1, we're not throwing away anything, so make sure the
        # word counts add up to be the entire corpus
        self.assertEqual(sum(v.count for v in model.vocab.values()),
                         total_words)
        # make sure the binary codes are correct; the comparison result has to
        # be asserted, otherwise a mismatch would go unnoticed
        self.assertTrue(
            numpy.allclose(model.vocab['the'].code, [1, 1, 0, 0]))

        # test building vocab with default params
        model = word2vec.Word2Vec(hs=1, negative=0)
        model.build_vocab(corpus)
        self.assertEqual(len(model.vocab), 1750)
        self.assertTrue(
            numpy.allclose(model.vocab['the'].code, [1, 1, 1, 0]))

        # no input => "RuntimeError: you must first build vocabulary before training the model"
        self.assertRaises(RuntimeError, word2vec.Word2Vec, [])

        # input not empty, but rather completely filtered out
        self.assertRaises(RuntimeError,
                          word2vec.Word2Vec,
                          corpus,
                          min_count=total_words + 1)
Ejemplo n.º 2
0
    def testTrainingCbow(self):
        """Test CBOW word2vec training.

        Trains in two steps (``build_vocab`` then ``train``), checks the shape
        of the weight matrices and that querying by word and by vector agree,
        then verifies that building and training in a single constructor call
        yields an equal model.
        """
        # build vocabulary, don't train yet
        model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=1, negative=0)
        model.build_vocab(sentences)
        expected_shape = (len(model.vocab), 2)
        self.assertEqual(model.syn0.shape, expected_shape)
        self.assertEqual(model.syn1.shape, expected_shape)

        model.train(sentences)
        # NOTE: the identity of the top neighbour is not asserted here; only
        # the consistency between the two query styles below is checked.
        sims = model.most_similar('graph', topn=10)

        # test querying for "most similar" by vector; ask for one extra result
        # because the query vector's own word is returned and filtered out
        graph_vector = model.syn0norm[model.vocab['graph'].index]
        sims2 = model.most_similar(positive=[graph_vector], topn=11)
        sims2 = [(w, sim) for w, sim in sims2
                 if w != 'graph']  # ignore 'graph' itself
        self.assertEqual(sims, sims2)

        # build vocab and train in one step; must be the same as above
        model2 = word2vec.Word2Vec(sentences,
                                   size=2,
                                   min_count=1,
                                   sg=0,
                                   hs=1,
                                   negative=0)
        self.models_equal(model, model2)
Ejemplo n.º 3
0
 def testRuleWithMinCount(self):
     """Test that returning RULE_DEFAULT from trim_rule triggers min_count.

     A word occurring only once is appended to the corpus; with
     ``min_count=2`` it must be dropped, "human" must be absent as well,
     and "interface" must be kept.
     """
     model = word2vec.Word2Vec(sentences + [["occurs_only_once"]],
                               min_count=2,
                               trim_rule=_rule)
     # assertIn/assertNotIn report the offending container on failure
     self.assertNotIn("human", model.vocab)
     self.assertNotIn("occurs_only_once", model.vocab)
     self.assertIn("interface", model.vocab)
Ejemplo n.º 4
0
 def test_sg_neg(self):
     """Run ``model_sanity`` on a skip-gram model using negative sampling."""
     params = dict(sg=1, window=4, hs=0, negative=15,
                   min_count=5, iter=10, workers=2)
     skipgram_model = word2vec.Word2Vec(**params)
     self.model_sanity(skipgram_model)
Ejemplo n.º 5
0
 def test_sg_hs(self):
     """Run ``model_sanity`` on a skip-gram model using hierarchical softmax."""
     params = dict(sg=1, window=4, hs=1, negative=0,
                   min_count=5, iter=10, workers=2)
     skipgram_model = word2vec.Word2Vec(**params)
     self.model_sanity(skipgram_model)
Ejemplo n.º 6
0
    def testLargeMmap(self):
        """Test saving with ``sep_limit=0`` and loading back, plain and with ``mmap='r'``."""
        original = word2vec.Word2Vec(sentences, min_count=1)

        # store the internal arrays into separate files, then load normally
        original.save(testfile(), sep_limit=0)
        reloaded = word2vec.Word2Vec.load(testfile())
        self.models_equal(original, reloaded)

        # loading the same files memory-mapped must give an equal model, too
        mmapped = word2vec.Word2Vec.load(testfile(), mmap='r')
        self.models_equal(original, mmapped)
Ejemplo n.º 7
0
    def testParallel(self):
        """Test word2vec parallel training.

        Trains on a repeated Lee corpus with 2 and with 4 workers and checks
        that a similarity query on each trained model returns results. The
        test is a no-op when ``word2vec.FAST_VERSION`` is negative.
        """
        if word2vec.FAST_VERSION < 0:
            # don't test the plain NumPy version for parallelism (too slow)
            return

        corpus = utils.RepeatCorpus(LeeCorpus(), 10000)

        for workers in [2, 4]:
            model = word2vec.Word2Vec(corpus, workers=workers)
            sims = model.most_similar('israeli')
            # the concrete neighbours are not pinned down here; at least make
            # sure the query on the trained model produced something
            self.assertTrue(
                len(sims) > 0,
                "no similar words returned with workers=%d" % workers)
Ejemplo n.º 8
0
 def testPersistenceWord2VecFormatWithVocab(self):
     """Test that word counts survive a binary word2vec-format round trip with a vocab file."""
     vocab_path = os.path.join(tempfile.gettempdir(),
                               'par2vec_word2vec.vocab')

     trained = word2vec.Word2Vec(sentences, min_count=1)
     trained.init_sims()
     trained.save_word2vec_format(testfile(), vocab_path, binary=True)

     restored = word2vec.Word2Vec.load_word2vec_format(
         testfile(), vocab_path, binary=True)

     # the count for 'human' must be the same before and after the round trip
     expected_count = trained.vocab['human'].count
     self.assertEqual(expected_count, restored.vocab['human'].count)
Ejemplo n.º 9
0
    def testScoring(self):
        """Test that ``score`` returns one score per input sentence."""
        model = word2vec.Word2Vec(
            sentences, size=2, min_count=1, hs=1, negative=0)

        # only the number of returned scores is checked, not their values
        n_sentences = len(sentences)
        scores = model.score(sentences, n_sentences)
        self.assertEqual(len(scores), n_sentences)
Ejemplo n.º 10
0
    def testSimilarities(self):
        """Test similarity and n_similarity methods.

        Checks that two word sets with the same members in a different order
        are maximally similar, and that ``n_similarity`` on single-word sets
        agrees with ``similarity`` on those words.
        """
        # The model is trained using CBOW
        model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
        model.build_vocab(sentences)
        model.train(sentences)

        # a bare truthiness check would accept any non-zero similarity; the
        # same words in a different order should score (approximately) 1.0
        self.assertAlmostEqual(
            model.n_similarity(['graph', 'trees'], ['trees', 'graph']),
            1.0,
            places=5)
        self.assertEqual(
            model.n_similarity(['graph'], ['trees']),
            model.similarity('graph', 'trees'))
Ejemplo n.º 11
0
 def test_cbow_neg_multi_n02(self):
     """Test constructing a CBOW model w/ negative sampling and a list-valued size."""
     # NOTE(review): unlike the sibling test_cbow_* tests, the model is only
     # constructed here and never passed to self.model_sanity -- confirm
     # whether that is intentional.
     word2vec.Word2Vec(sg=0,
                       size=[75, 100],
                       cbow_mean=1,
                       alpha=0.05,
                       window=8,
                       hs=0,
                       negative=15,
                       min_count=5,
                       iter=20,
                       workers=1,
                       batch_words=1000)
Ejemplo n.º 12
0
 def test_cbow_hs(self):
     """Run ``model_sanity`` on a CBOW model using hierarchical softmax."""
     params = dict(sg=0, cbow_mean=1, alpha=0.05, window=8,
                   hs=1, negative=0, min_count=5, iter=10,
                   workers=2, batch_words=1000)
     cbow_model = word2vec.Word2Vec(**params)
     self.model_sanity(cbow_model)
Ejemplo n.º 13
0
 def test_cbow_neg(self):
     """Run ``model_sanity`` on a CBOW model using negative sampling."""
     params = dict(sg=0, cbow_mean=1, alpha=0.05, window=5,
                   hs=0, negative=15, min_count=5, iter=10,
                   workers=2, sample=0)
     cbow_model = word2vec.Word2Vec(**params)
     self.model_sanity(cbow_model)
Ejemplo n.º 14
0
 def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self):
     """Test a word2vec-format round trip with a vocab file, chained with
     a second round trip through the `save` and `load` methods."""
     vocab_path = os.path.join(tempfile.gettempdir(),
                               'par2vec_word2vec.vocab')

     trained = word2vec.Word2Vec(sentences, min_count=1)
     trained.init_sims()

     # first hop: binary word2vec format plus the separate vocab file
     trained.save_word2vec_format(testfile(), vocab_path, binary=True)
     from_w2v_format = word2vec.Word2Vec.load_word2vec_format(
         testfile(), vocab_path, binary=True)

     # second hop: standard save/load of the model just loaded
     from_w2v_format.save(testfile())
     restored = word2vec.Word2Vec.load(testfile())

     expected_count = trained.vocab['human'].count
     self.assertEqual(expected_count, restored.vocab['human'].count)
Ejemplo n.º 15
0
 def testPersistenceWord2VecFormat(self):
     """Test a binary word2vec-format round trip, with and without replacing vectors by their normalized form."""
     trained = word2vec.Word2Vec(sentences, min_count=1)
     trained.init_sims()
     trained.save_word2vec_format(testfile(), binary=True)

     # with replace=False the loaded vector must match the trained one
     reloaded = word2vec.Word2Vec.load_word2vec_format(testfile(),
                                                       binary=True)
     reloaded.init_sims(replace=False)
     self.assertTrue(numpy.allclose(trained['human'], reloaded['human']))

     # with replace=True the loaded vector must differ from the trained raw
     # vector and match the trained model's syn0norm row instead
     normalized = word2vec.Word2Vec.load_word2vec_format(testfile(),
                                                         binary=True)
     normalized.init_sims(replace=True)
     human_raw = trained['human']
     human_normed = trained.syn0norm[trained.vocab['human'].index]
     self.assertFalse(numpy.allclose(human_raw, normalized['human']))
     self.assertTrue(numpy.allclose(human_normed, normalized['human']))
Ejemplo n.º 16
0
    def testPersistenceWord2VecFormatNonBinary(self):
        """Test a non-binary word2vec-format round trip, with and without replacing vectors by their normalized form."""
        trained = word2vec.Word2Vec(sentences, min_count=1)
        trained.init_sims()
        trained.save_word2vec_format(testfile(), binary=False)

        # without replacing, the loaded vector must match within tolerance
        reloaded = word2vec.Word2Vec.load_word2vec_format(testfile(),
                                                          binary=False)
        reloaded.init_sims(False)
        human_raw = trained['human']
        self.assertTrue(
            numpy.allclose(human_raw, reloaded['human'], atol=1e-6))

        # with replacing, the loaded vector must differ from the raw trained
        # vector and match the trained model's syn0norm row instead
        normalized = word2vec.Word2Vec.load_word2vec_format(testfile(),
                                                            binary=False)
        normalized.init_sims(True)
        self.assertFalse(
            numpy.allclose(human_raw, normalized['human'], atol=1e-6))

        human_normed = trained.syn0norm[trained.vocab['human'].index]
        self.assertTrue(
            numpy.allclose(human_normed, normalized['human'], atol=1e-4))
Ejemplo n.º 17
0
    def testLocking(self):
        """Test that training leaves a locked vector unchanged while an unlocked one changes."""
        corpus = LeeCorpus()
        for sg in (0, 1):  # 0 = cbow, 1 = sg
            model = word2vec.Word2Vec(
                size=4, hs=1, negative=5, min_count=1, sg=sg, window=5)
            # build vocabulary, don't train yet
            model.build_vocab(corpus)

            # snapshot two vectors before training
            frozen_before = numpy.copy(model.syn0[0])
            free_before = numpy.copy(model.syn0[1])
            # lock the vector in slot 0 against change
            model.syn0_lockf[0] = 0.0

            model.train(corpus)

            # unlocked vector should vary
            self.assertFalse(numpy.array_equal(free_before, model.syn0[1]))
            # locked vector should not vary
            self.assertTrue(numpy.array_equal(frozen_before, model.syn0[0]))
Ejemplo n.º 18
0
 def testLambdaRule(self):
     """Test that lambda trim_rule works.

     The rule discards the word "human" and defers to the default
     handling for every other word.
     """
     rule = lambda word, count, min_count: (
         utils.RULE_DISCARD if word == "human" else utils.RULE_DEFAULT)
     model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=rule)
     # assertNotIn reports the container contents on failure
     self.assertNotIn("human", model.vocab)
Ejemplo n.º 19
0
 def testRule(self):
     """Test applying vocab trim_rule to build_vocab instead of constructor."""
     model = word2vec.Word2Vec(min_count=1)
     model.build_vocab(sentences, trim_rule=_rule)
     # assertNotIn reports the container contents on failure
     self.assertNotIn("human", model.vocab)
Ejemplo n.º 20
0
 def testPersistenceWithConstructorRule(self):
     """Test a save/load round trip of a model built with a trim_rule given to the constructor."""
     trained = word2vec.Word2Vec(sentences, min_count=1, trim_rule=_rule)
     trained.save(testfile())
     restored = word2vec.Word2Vec.load(testfile())
     self.models_equal(trained, restored)
Ejemplo n.º 21
0
 def testPersistence(self):
     """Test that a model saved with `save` and read back with `load` equals the original."""
     trained = word2vec.Word2Vec(sentences, min_count=1)
     trained.save(testfile())
     restored = word2vec.Word2Vec.load(testfile())
     self.models_equal(trained, restored)
Ejemplo n.º 22
0
 def testRNG(self):
     """Test that two models trained with the same seed and a single worker are equal."""
     params = dict(min_count=2, seed=42, workers=1)
     first = word2vec.Word2Vec(sentences, **params)
     second = word2vec.Word2Vec(sentences, **params)
     self.models_equal(first, second)