Ejemplos de Word2Vec en Python, ejemplos de par2vec.models.word2vec.Word2Vec en Python

Ejemplo n.º 1

0

Mostrar archivo

    def testVocab(self):
        """Test word2vec vocabulary building.

        Builds the vocabulary over ``LeeCorpus()`` twice (once with
        ``min_count=1``, once with the default parameters) and checks the
        vocabulary size and the code stored for the word 'the'. Finally
        checks that constructing a model from empty or fully filtered input
        raises ``RuntimeError``.
        """
        corpus = LeeCorpus()
        total_words = sum(len(sentence) for sentence in corpus)

        # try vocab building explicitly, using all words
        model = word2vec.Word2Vec(min_count=1, hs=1, negative=0)
        model.build_vocab(corpus)
        # assertEqual reports both values on failure, unlike assertTrue(a == b)
        self.assertEqual(len(model.vocab), 6981)
        # with min_count=1, we're not throwing away anything, so make sure the word counts add up to be the entire corpus
        self.assertEqual(sum(v.count for v in model.vocab.values()),
                         total_words)
        # make sure the binary codes are correct; the allclose result has to
        # be asserted, a bare call would silently ignore a mismatch
        self.assertTrue(
            numpy.allclose(model.vocab['the'].code, [1, 1, 0, 0]))

        # test building vocab with default params
        model = word2vec.Word2Vec(hs=1, negative=0)
        model.build_vocab(corpus)
        self.assertEqual(len(model.vocab), 1750)
        self.assertTrue(
            numpy.allclose(model.vocab['the'].code, [1, 1, 1, 0]))

        # no input => "RuntimeError: you must first build vocabulary before training the model"
        self.assertRaises(RuntimeError, word2vec.Word2Vec, [])

        # input not empty, but rather completely filtered out
        self.assertRaises(RuntimeError,
                          word2vec.Word2Vec,
                          corpus,
                          min_count=total_words + 1)

Ejemplo n.º 2

0

Mostrar archivo

    def testTrainingCbow(self):
        """Test CBOW word2vec training.

        Trains once via explicit ``build_vocab`` + ``train`` and once via the
        constructor, and checks that both routes give equal models.
        """
        params = dict(size=2, min_count=1, sg=0, hs=1, negative=0)

        # first step: only build the vocabulary, training follows separately
        model = word2vec.Word2Vec(**params)
        model.build_vocab(sentences)
        expected_shape = (len(model.vocab), 2)
        self.assertTrue(model.syn0.shape == expected_shape)
        self.assertTrue(model.syn1.shape == expected_shape)

        model.train(sentences)
        sims = model.most_similar('graph', topn=10)
        # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar

        # the same query expressed as a vector: ask for one extra hit and
        # drop 'graph' itself before comparing with the by-word result
        graph_index = model.vocab['graph'].index
        graph_vector = model.syn0norm[graph_index]
        by_vector = model.most_similar(positive=[graph_vector], topn=11)
        sims2 = [(word, score) for word, score in by_vector
                 if word != 'graph']
        self.assertEqual(sims, sims2)

        # single-step construction (vocab + training) must match the above
        model2 = word2vec.Word2Vec(sentences, **params)
        self.models_equal(model, model2)

Ejemplo n.º 3

0

Mostrar archivo

 def testRuleWithMinCount(self):
     """Test that RULE_DEFAULT returned by trim_rule falls back to min_count."""
     # add a word that appears exactly once, below min_count=2
     corpus = sentences + [["occurs_only_once"]]
     model = word2vec.Word2Vec(corpus, min_count=2, trim_rule=_rule)
     vocab = model.vocab
     for absent in ("human", "occurs_only_once"):
         self.assertTrue(absent not in vocab)
     self.assertTrue("interface" in vocab)

Ejemplo n.º 4

0

Mostrar archivo

 def test_sg_neg(self):
     """Run the shared sanity check on a skip-gram, negative-sampling model."""
     params = dict(
         sg=1,
         window=4,
         hs=0,
         negative=15,
         min_count=5,
         iter=10,
         workers=2,
     )
     self.model_sanity(word2vec.Word2Vec(**params))

Ejemplo n.º 5

0

Mostrar archivo

 def test_sg_hs(self):
     """Run the shared sanity check on a skip-gram, hierarchical-softmax model."""
     params = dict(
         sg=1,
         window=4,
         hs=1,
         negative=0,
         min_count=5,
         iter=10,
         workers=2,
     )
     self.model_sanity(word2vec.Word2Vec(**params))

Ejemplo n.º 6

0

Mostrar archivo

    def testLargeMmap(self):
        """Test saving the model and loading it back, with and without mmap."""
        model = word2vec.Word2Vec(sentences, min_count=1)

        # sep_limit=0: exercise storing the internal arrays in separate files
        model.save(testfile(), sep_limit=0)
        reloaded = word2vec.Word2Vec.load(testfile())
        self.models_equal(model, reloaded)

        # loading the same files memory-mapped must give an equal model too
        mmapped = word2vec.Word2Vec.load(testfile(), mmap='r')
        self.models_equal(model, mmapped)

Ejemplo n.º 7

0

Mostrar archivo

    def testParallel(self):
        """Test word2vec parallel training.

        Smoke test only: trains with 2 and 4 workers and runs one
        ``most_similar`` query per model. Nothing is asserted about the
        returned similarities; the test passes if no call raises.
        """
        if word2vec.FAST_VERSION < 0:  # don't test the plain NumPy version for parallelism (too slow)
            return

        corpus = utils.RepeatCorpus(LeeCorpus(), 10000)

        for workers in [2, 4]:
            model = word2vec.Word2Vec(corpus, workers=workers)
            # result intentionally not bound: it was never inspected
            model.most_similar('israeli')

Ejemplo n.º 8

0

Mostrar archivo

 def testPersistenceWord2VecFormatWithVocab(self):
     """Test a word2vec-format round trip that also writes a vocabulary file."""
     model = word2vec.Word2Vec(sentences, min_count=1)
     model.init_sims()

     vocab_path = os.path.join(tempfile.gettempdir(),
                               'par2vec_word2vec.vocab')
     model.save_word2vec_format(testfile(), vocab_path, binary=True)
     reloaded = word2vec.Word2Vec.load_word2vec_format(testfile(),
                                                       vocab_path,
                                                       binary=True)

     # the word count must survive the round trip through the vocab file
     original_count = model.vocab['human'].count
     self.assertEqual(original_count, reloaded.vocab['human'].count)

Ejemplo n.º 9

0

Mostrar archivo

    def testScoring(self):
        """Test that scoring returns one score per input sentence."""
        params = dict(size=2, min_count=1, hs=1, negative=0)
        model = word2vec.Word2Vec(sentences, **params)

        # only the number of scores is checked, not their values
        sentence_count = len(sentences)
        scores = model.score(sentences, sentence_count)
        self.assertEqual(len(scores), len(sentences))

Ejemplo n.º 10

0

Mostrar archivo

    def testSimilarities(self):
        """Test similarity and n_similarity methods.

        Checks that two word sets with the same members score (almost)
        exactly 1.0 against each other, and that ``n_similarity`` on
        single-word sets agrees with ``similarity`` on the same two words.
        """
        # The model is trained using CBOW
        model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
        model.build_vocab(sentences)
        model.train(sentences)

        # Same words on both sides: asserting mere truthiness would accept
        # any non-zero value, so compare against 1.0 explicitly.
        # NOTE(review): assumes n_similarity is a cosine similarity, as in
        # gensim - confirm for this fork.
        self.assertAlmostEqual(
            model.n_similarity(['graph', 'trees'], ['trees', 'graph']),
            1.0,
            places=5)
        # floating point results: compare with a tolerance, not with ==
        self.assertAlmostEqual(model.n_similarity(['graph'], ['trees']),
                               model.similarity('graph', 'trees'),
                               places=5)

Ejemplo n.º 11

0

Mostrar archivo

 def test_cbow_neg_multi_n02(self):
     """Test constructing a CBOW w/ negative sampling model with a list-valued size.

     The configuration uses ``hs=0`` and ``negative=15``, i.e. negative
     sampling rather than hierarchical softmax. Only construction is
     exercised; the test passes if the constructor does not raise.
     """
     # NOTE(review): unlike the other configuration tests, the model is not
     # passed to self.model_sanity - confirm whether that is intentional.
     word2vec.Word2Vec(sg=0,
                       size=[75, 100],
                       cbow_mean=1,
                       alpha=0.05,
                       window=8,
                       hs=0,
                       negative=15,
                       min_count=5,
                       iter=20,
                       workers=1,
                       batch_words=1000)

Ejemplo n.º 12

0

Mostrar archivo

 def test_cbow_hs(self):
     """Run the shared sanity check on a CBOW, hierarchical-softmax model."""
     params = dict(
         sg=0,
         cbow_mean=1,
         alpha=0.05,
         window=8,
         hs=1,
         negative=0,
         min_count=5,
         iter=10,
         workers=2,
         batch_words=1000,
     )
     self.model_sanity(word2vec.Word2Vec(**params))

Ejemplo n.º 13

0

Mostrar archivo

 def test_cbow_neg(self):
     """Run the shared sanity check on a CBOW, negative-sampling model."""
     params = dict(
         sg=0,
         cbow_mean=1,
         alpha=0.05,
         window=5,
         hs=0,
         negative=15,
         min_count=5,
         iter=10,
         workers=2,
         sample=0,
     )
     self.model_sanity(word2vec.Word2Vec(**params))

Ejemplo n.º 14

0

Mostrar archivo

 def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self):
     """Test a word2vec-format round trip (with vocabulary file) chained with
     a second round trip through the `save` and `load` methods."""
     model = word2vec.Word2Vec(sentences, min_count=1)
     model.init_sims()

     # first hop: word2vec binary format plus a separate vocabulary file
     vocab_path = os.path.join(tempfile.gettempdir(),
                               'par2vec_word2vec.vocab')
     model.save_word2vec_format(testfile(), vocab_path, binary=True)
     from_w2v_format = word2vec.Word2Vec.load_word2vec_format(testfile(),
                                                              vocab_path,
                                                              binary=True)

     # second hop: standard save/load of the model just loaded
     from_w2v_format.save(testfile())
     reloaded = word2vec.Word2Vec.load(testfile())

     original_count = model.vocab['human'].count
     self.assertEqual(original_count, reloaded.vocab['human'].count)

Ejemplo n.º 15

0

Mostrar archivo

 def testPersistenceWord2VecFormat(self):
     """Test a round trip of the model through the binary word2vec format."""
     model = word2vec.Word2Vec(sentences, min_count=1)
     model.init_sims()
     model.save_word2vec_format(testfile(), binary=True)

     # loaded with init_sims(replace=False): vectors match the original ones
     binary_model = word2vec.Word2Vec.load_word2vec_format(testfile(),
                                                           binary=True)
     binary_model.init_sims(replace=False)
     vectors_match = numpy.allclose(model['human'], binary_model['human'])
     self.assertTrue(vectors_match)

     # loaded with init_sims(replace=True): vectors differ from the original
     # ones but match the original model's syn0norm rows
     norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(),
                                                              binary=True)
     norm_only_model.init_sims(replace=True)
     raw_match = numpy.allclose(model['human'], norm_only_model['human'])
     self.assertFalse(raw_match)

     human_index = model.vocab['human'].index
     norm_match = numpy.allclose(model.syn0norm[human_index],
                                 norm_only_model['human'])
     self.assertTrue(norm_match)

Ejemplo n.º 16

0

Mostrar archivo

    def testPersistenceWord2VecFormatNonBinary(self):
        """Test a round trip of the model through the text word2vec format."""
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
        model.save_word2vec_format(testfile(), binary=False)

        # loaded with init_sims(False): vectors match within atol=1e-6
        text_model = word2vec.Word2Vec.load_word2vec_format(testfile(),
                                                            binary=False)
        text_model.init_sims(False)
        vectors_match = numpy.allclose(model['human'],
                                       text_model['human'],
                                       atol=1e-6)
        self.assertTrue(vectors_match)

        # loaded with init_sims(True): vectors differ from the original ones
        norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(),
                                                                 binary=False)
        norm_only_model.init_sims(True)
        raw_match = numpy.allclose(model['human'],
                                   norm_only_model['human'],
                                   atol=1e-6)
        self.assertFalse(raw_match)

        # ...but match the original model's syn0norm row (looser atol=1e-4)
        human_index = model.vocab['human'].index
        norm_match = numpy.allclose(model.syn0norm[human_index],
                                    norm_only_model['human'],
                                    atol=1e-4)
        self.assertTrue(norm_match)

Ejemplo n.º 17

0

Mostrar archivo

    def testLocking(self):
        """Test word2vec training doesn't change locked vectors."""
        corpus = LeeCorpus()
        for sg in (0, 1):  # run once with sg=0 (cbow) and once with sg=1
            # build the vocabulary only; training happens further down
            params = dict(size=4,
                          hs=1,
                          negative=5,
                          min_count=1,
                          sg=sg,
                          window=5)
            model = word2vec.Word2Vec(**params)
            model.build_vocab(corpus)

            # snapshot rows 0 and 1 before training, then lock row 0 only
            locked_before = numpy.copy(model.syn0[0])
            unlocked_before = numpy.copy(model.syn0[1])
            model.syn0_lockf[0] = 0.0

            model.train(corpus)

            # unlocked vector should vary
            self.assertFalse(numpy.array_equal(unlocked_before, model.syn0[1]))
            # locked vector should not vary
            self.assertTrue(numpy.array_equal(locked_before, model.syn0[0]))

Ejemplo n.º 18

0

Mostrar archivo

 def testLambdaRule(self):
     """Test that a lambda can be passed as trim_rule."""
     # kept as a lambda on purpose: that is what this test exercises
     rule = lambda word, count, min_count: (
         utils.RULE_DEFAULT if word != "human" else utils.RULE_DISCARD)
     model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=rule)
     self.assertFalse("human" in model.vocab)

Ejemplo n.º 19

0

Mostrar archivo

 def testRule(self):
     """Test passing the vocab trim_rule to build_vocab rather than the constructor."""
     model = word2vec.Word2Vec(min_count=1)
     model.build_vocab(sentences, trim_rule=_rule)
     self.assertFalse("human" in model.vocab)

Ejemplo n.º 20

0

Mostrar archivo

 def testPersistenceWithConstructorRule(self):
     """Test save/load of a model built with a trim_rule given to the constructor."""
     model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=_rule)
     model.save(testfile())
     reloaded = word2vec.Word2Vec.load(testfile())
     self.models_equal(model, reloaded)

Ejemplo n.º 21

0

Mostrar archivo

 def testPersistence(self):
     """Test that a saved model loads back equal to the original."""
     model = word2vec.Word2Vec(sentences, min_count=1)
     model.save(testfile())
     reloaded = word2vec.Word2Vec.load(testfile())
     self.models_equal(model, reloaded)

Ejemplo n.º 22

0

Mostrar archivo

 def testRNG(self):
     """Test that two models built with the same seed come out equal."""
     # same parameters for both models, including seed=42 and workers=1
     params = dict(min_count=2, seed=42, workers=1)
     model = word2vec.Word2Vec(sentences, **params)
     model2 = word2vec.Word2Vec(sentences, **params)
     self.models_equal(model, model2)