Example no. 1
    def update_vectors(self, sv: SentenceVectors, total_sentences: int):
        """Given existing sentence vectors, append new ones"""
        logger.info(
            f"appending sentence vectors for {total_sentences} sentences")
        sentences_before = len(sv.vectors)
        sentences_after = len(sv.vectors) + total_sentences

        if sv.mapfile_path:
            # Reopening the memmap in "r+" mode with a larger shape grows
            # the backing file to hold the appended rows.
            sv.vectors = np_memmap(
                str(sv.mapfile_path) + ".vectors",
                dtype=REAL,
                mode="r+",
                shape=(sentences_after, sv.vector_size),
            )
            for i in range(sentences_before, sentences_after):
                sv.vectors[i] = full(shape=sv.vector_size,
                                     fill_value=EPS,
                                     dtype=REAL)
        else:
            newvectors = empty((total_sentences, sv.vector_size), dtype=REAL)
            for i in range(total_sentences):
                newvectors[i] = full(shape=sv.vector_size,
                                     fill_value=EPS,
                                     dtype=REAL)
            sv.vectors = vstack([sv.vectors, newvectors])
        sv.vectors_norm = None

    def reset_vectors(self, sv: SentenceVectors, total_sentences: int):
        """Initialize all sentence vectors to zero and overwrite existing files"""
        logger.info(
            f"initializing sentence vectors for {total_sentences} sentences")
        if sv.mapfile_path:
            sv.vectors = np_memmap(str(sv.mapfile_path) + '.vectors',
                                   dtype=REAL,
                                   mode='w+',
                                   shape=(total_sentences, sv.vector_size))
        else:
            sv.vectors = empty((total_sentences, sv.vector_size), dtype=REAL)

        # empty() returns uninitialized memory, so zero every row explicitly.
        for i in range(total_sentences):
            sv.vectors[i] = zeros(sv.vector_size, dtype=REAL)
        sv.vectors_norm = None
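
Side note: the append branch of update_vectors above relies on a NumPy detail that is easy to miss: opening an existing memmap file in "r+" mode with a larger shape extends the backing file on disk. A minimal standalone sketch of that behavior (the file name is a placeholder, and this assumes current NumPy semantics):

import numpy as np

path = "grow_demo.vectors"  # placeholder scratch file

# Create a 5x2 memmap and fill it.
vecs = np.memmap(path, dtype=np.float32, mode="w+", shape=(5, 2))
vecs[:] = 1.0
vecs.flush()
del vecs  # close the map

# Reopen with three more rows: NumPy extends the file to the requested
# size, and the new rows read back as zeros until overwritten.
vecs = np.memmap(path, dtype=np.float32, mode="r+", shape=(8, 2))
vecs[5:] = np.finfo(np.float32).eps  # mirrors the EPS fill above
vecs.flush()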
class TestSentenceVectorsFunctions(unittest.TestCase):
    def setUp(self):
        self.sv = SentenceVectors(2)
        self.sv.vectors = np.arange(10).reshape(5, 2)

    def test_getitem(self):
        self.assertTrue(([0, 1] == self.sv[0]).all())
        self.assertTrue(([[0, 1], [4, 5]] == self.sv[[0, 2]]).all())

    def test_isin(self):
        self.assertTrue(0 in self.sv)
        self.assertFalse(5 in self.sv)

    def test_init_sims_wo_replace(self):
        self.sv.init_sims()
        self.assertIsNotNone(self.sv.vectors_norm)
        self.assertFalse((self.sv.vectors == self.sv.vectors_norm).all())

        v1 = self.sv.vectors[0]
        v1 = v1 / np.sqrt(np.sum(v1**2))

        v2 = self.sv.vectors[1]
        v2 = v2 / np.sqrt(np.sum(v2**2))

        self.assertTrue(np.allclose(v1, self.sv.vectors_norm[0]))
        self.assertTrue(np.allclose(v2, self.sv.vectors_norm[1]))
        self.assertTrue(np.allclose(v2, self.sv.get_vector(1, True)))

    def test_get_vector(self):
        self.assertTrue(([0, 1] == self.sv.get_vector(0)).all())
        self.assertTrue(([2, 3] == self.sv.get_vector(1)).all())

    def test_init_sims_w_replace(self):
        self.sv.init_sims(True)
        self.assertTrue(
            np.allclose(self.sv.vectors[0], self.sv.vectors_norm[0]))

    def test_init_sims_w_mapfile(self):
        p = Path("fse/test/test_data/test_vectors")
        self.sv.mapfile_path = str(p.absolute())
        self.sv.init_sims()
        p = Path("fse/test/test_data/test_vectors.vectors_norm")
        self.assertTrue(p.exists())
        p.unlink()

    def test_save_load(self):
        p = Path("fse/test/test_data/test_vectors.vectors")
        self.sv.save(str(p.absolute()))
        self.assertTrue(p.exists())
        sv2 = SentenceVectors.load(str(p.absolute()))
        self.assertTrue((self.sv.vectors == sv2.vectors).all())
        p.unlink()

    def test_save_load_with_memmap(self):
        p = Path("fse/test/test_data/test_vectors")
        p_target = Path("fse/test/test_data/test_vectors.vectors")
        p_not_exists = Path("fse/test/test_data/test_vectors.vectors.npy")

        sv = SentenceVectors(2, mapfile_path=str(p))

        shape = (1000, 1000)
        sv.vectors = np.ones(shape, dtype=np.float32)

        memvecs = np.memmap(p_target, dtype=np.float32, mode='w+', shape=shape)
        memvecs[:] = sv.vectors[:]
        del memvecs

        self.assertTrue(p_target.exists())
        sv.save(str(p.absolute()))
        self.assertTrue(p.exists())
        self.assertFalse(p_not_exists.exists())

        sv = SentenceVectors.load(str(p.absolute()))
        self.assertEqual(shape, sv.vectors.shape)

        for t in [p, p_target]:
            t.unlink()

    def test_len(self):
        self.assertEqual(5, len(self.sv))

    def test_similarity(self):
        v1 = self.sv.vectors[0]
        v1 = v1 / np.sqrt(np.sum(v1**2))

        v2 = self.sv.vectors[1]
        v2 = v2 / np.sqrt(np.sum(v2**2))

        self.assertTrue(np.allclose(v1.dot(v2), self.sv.similarity(0, 1)))
        self.assertTrue(np.allclose(1 - v1.dot(v2), self.sv.distance(0, 1)))

    def test_most_similar(self):
        sent_ind = IndexedList(SENTENCES)
        sentences = IndexedLineDocument(CORPUS)
        m = Average(W2V)
        m.train(sentences)
        o = m.sv.most_similar(positive=0)
        self.assertEqual(45, o[0][0])
        self.assertEqual(35, o[1][0])
        o = m.sv.most_similar(positive=0, indexable=sentences)
        self.assertEqual("Looks good and fits snug", o[0][0])

        o = m.sv.most_similar(positive=0, indexable=sent_ind)
        self.assertEqual("Looks good and fits snug".split(), o[0][0][0])

    def test_most_similar_vec(self):
        sentences = IndexedLineDocument(CORPUS)
        m = Average(W2V)
        m.train(sentences)
        m.sv.init_sims()
        v = m.sv.get_vector(0, use_norm=True)
        o = m.sv.most_similar(positive=v)
        # The query sentence itself is the top hit, so check from index 1.
        self.assertEqual(45, o[1][0])
        self.assertEqual(35, o[2][0])

    def test_most_similar_vecs(self):
        sentences = IndexedLineDocument(CORPUS)
        m = Average(W2V)
        m.train(sentences)
        m.sv.init_sims()
        v = m.sv[[0, 1]]
        o = m.sv.most_similar(positive=v)
        self.assertEqual(1, o[0][0])
        self.assertEqual(0, o[1][0])

    def test_most_similar_wrong_indexable(self):
        def indexable(self):
            pass

        sentences = IndexedLineDocument(CORPUS)
        m = Average(W2V)
        m.train(sentences)
        with self.assertRaises(RuntimeError):
            m.sv.most_similar(positive=0, indexable=indexable)

    def test_most_similar_topn(self):
        sentences = IndexedLineDocument(CORPUS)
        m = Average(W2V)
        m.train(sentences)
        o = m.sv.most_similar(positive=0, topn=20)
        self.assertEqual(20, len(o))

    def test_most_similar_restrict_size(self):
        sentences = IndexedLineDocument(CORPUS)
        m = Average(W2V)
        m.train(sentences)
        o = m.sv.most_similar(positive=20, topn=20, restrict_size=5)
        self.assertEqual(5, len(o))

    def test_most_similar_restrict_size_tuple(self):
        sentences = IndexedLineDocument(CORPUS)
        m = Average(W2V)
        m.train(sentences)
        o = m.sv.most_similar(positive=20, topn=20, restrict_size=(5, 25))
        self.assertEqual(19, len(o))
        self.assertEqual(22, o[0][0])

        o = m.sv.most_similar(positive=1, topn=20, restrict_size=(5, 25))
        self.assertEqual(20, len(o))
        self.assertEqual(9, o[0][0])

        o = m.sv.most_similar(positive=1,
                              topn=20,
                              restrict_size=(5, 25),
                              indexable=sentences)
        self.assertEqual(20, len(o))
        self.assertEqual(9, o[0][1])

    def test_similar_by_word(self):
        sentences = IndexedLineDocument(CORPUS)
        m = Average(W2V)
        m.train(sentences)
        o = m.sv.similar_by_word(word="the", wv=m.wv)
        self.assertEqual(96, o[0][0])
        o = m.sv.similar_by_word(word="the", wv=m.wv, indexable=sentences)
        self.assertEqual(96, o[0][1])

    def test_similar_by_vector(self):
        sentences = IndexedLineDocument(CORPUS)
        m = Average(W2V)
        m.train(sentences)
        o = m.sv.similar_by_vector(m.wv["the"])
        self.assertEqual(96, o[0][0])

    def test_similar_by_sentence(self):
        sentences = IndexedLineDocument(CORPUS)
        m = Average(W2V)
        m.train(sentences)
        o = m.sv.similar_by_sentence(sentence=["the", "product", "is", "good"],
                                     model=m)
        self.assertEqual(4, o[0][0])

    def test_l2_norm(self):
        out = np.random.normal(size=(200, 50)).astype(np.float32)
        result = _l2_norm(out, False)
        lens = np.sqrt(np.sum((result**2), axis=-1))
        self.assertTrue(np.allclose(1, lens, atol=1e-6))

        out = np.random.normal(size=(200, 50)).astype(np.float32)
        out = _l2_norm(out, True)
        lens = np.sqrt(np.sum((out**2), axis=-1))
        self.assertTrue(np.allclose(1, lens, atol=1e-6))
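
For reference, a minimal sketch of an L2-normalization helper consistent with what test_l2_norm asserts; the actual fse _l2_norm may differ in details. With replace=True the input buffer is normalized in place, otherwise a normalized copy is returned:

import numpy as np

def _l2_norm_sketch(m: np.ndarray, replace: bool = False) -> np.ndarray:
    """Scale every row of m to unit Euclidean length."""
    dist = np.sqrt((m ** 2).sum(axis=-1))[..., np.newaxis]
    if replace:
        m /= dist  # normalize the existing buffer in place
        return m
    return (m / dist).astype(np.float32)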
Example no. 7
    def __init__(
        self,
        model: BaseKeyedVectors,
        sv_mapfile_path: str = None,
        wv_mapfile_path: str = None,
        workers: int = 1,
        lang_freq: str = None,
        fast_version: int = 0,
        batch_words: int = 10000,
        batch_ngrams: int = 40,
        **kwargs,
    ):
        """ Base class for all Sentence2Vec Models. Provides core functionality, such as
        save, load, sanity checking, frequency induction, data checking, scanning, etc.

        Parameters
        ----------
        model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
            This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings,
            the wv.vocab and wv.vectors elements are required.
        sv_mapfile_path : str, optional
            Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
        wv_mapfile_path : str, optional
            Optional path to store the word-vectors in for very large datasets. Used for memmap.
            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
        workers : int, optional
            Number of working threads, used for multithreading. For most tasks (few words in a sentence)
            a value of 1 should be more than enough.
        lang_freq : str, optional
            Some pre-trained embeddings, e.g. "GoogleNews-vectors-negative300.bin", do not contain information about
            the frequency of a word. As the frequency is required for estimating the word weights, we induce
            frequencies into wv.vocab.count based on :class:`~wordfreq`.
            If no frequency information is available, choose the language to estimate the frequencies from.
            See https://github.com/LuminosoInsight/wordfreq
        fast_version : {-1, 1}, optional
            Whether the fast Cython implementation of the internal training methods is available. 1 means it is.
        batch_words : int, optional
            Number of words to be processed by a single job.
        batch_ngrams : int, optional
            Maximum number of ngrams for OOV words.
        **kwargs : object
            Keyword arguments needed to allow child classes to accept more arguments.

        """
        """
        TODO:

        [ ] global:
            [ ] windows support
            [ ] documentation
            [ ] more benchmarks
            [ ] remove wv_mapfile_path?
            [ ] modifiable sv_mapfile_path?

        [ ] models:
            [ ] check feasibility first
            [ ] max-pooling -> easy
            [ ] hierarchical pooling -> easy
            [ ] discrete cosine transform -> somewhat easy, questionable
            [ ] valve -> unclear, not cited enough
            [ ] power-means embedding -> very large dimensionality
                [ ] z-score transformation is quite nice

        [ ] sentencevectors:
            [X] similar_by_sentence model type check
            [ ] approximate NN search for large files
                [ ] compare ANN libraries
                [ ] ease-of-use
                [ ] dependencies
                [ ] compatibility
                [ ] memory-usage
        """

        set_madvise_for_mmap()

        self.workers = int(workers)
        self.batch_words = batch_words
        self.batch_ngrams = batch_ngrams
        self.wv = None

        self.is_ft = False

        self.wv_mapfile_path = (Path(wv_mapfile_path)
                                if wv_mapfile_path is not None else None)
        self.wv_mapfile_shapes = {}

        if fast_version < 0:
            warnings.warn(
                "C extension not loaded, training/inferring will be slow. "
                "Install a C compiler and reinstall fse.")

        self._check_and_include_model(model)

        if self.wv_mapfile_path is not None:
            self._map_all_vectors_to_disk(self.wv_mapfile_path)

        if lang_freq is not None:
            self._check_language_settings(lang_freq)
            self._induce_frequencies()

        self.sv = SentenceVectors(vector_size=self.wv.vector_size,
                                  mapfile_path=sv_mapfile_path)
        self.prep = BaseSentence2VecPreparer()

        self.word_weights = ones(len(self.wv.vocab), REAL)
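
To close, a hypothetical end-to-end sketch of how the constructor parameters above fit together, based on the docstring and the tests in this section. The paths are placeholders, and it is assumed that a subclass such as Average forwards these keyword arguments to the base class:

from gensim.models import KeyedVectors

from fse import IndexedLineDocument
from fse.models import Average

w2v = KeyedVectors.load("word_vectors.kv")     # placeholder model file
sentences = IndexedLineDocument("corpus.txt")  # placeholder corpus

model = Average(
    w2v,
    sv_mapfile_path="sv_mmap",  # memmap sentence vectors to disk
    wv_mapfile_path="wv_mmap",  # memmap word vectors to disk
    lang_freq="en",             # induce missing word frequencies via wordfreq
    workers=1,
)
model.train(sentences)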