Code Example #1
    def test_save_load_with_memmap(self):
        p = Path("fse/test/test_data/test_vectors")
        p_target = Path("fse/test/test_data/test_vectors.vectors")
        p_not_exists = Path("fse/test/test_data/test_vectors.vectors.npy")

        sv = SentenceVectors(2, mapfile_path=str(p))

        shape = (1000, 1000)
        sv.vectors = np.ones(shape, dtype=np.float32)
        
        memvecs = np.memmap(
            p_target, dtype=np.float32,
            mode='w+', shape=shape)
        memvecs[:] = sv.vectors[:]
        del memvecs

        self.assertTrue(p_target.exists())
        sv.save(str(p.absolute()))
        self.assertTrue(p.exists())
        self.assertFalse(p_not_exists.exists())

        sv = SentenceVectors.load(str(p.absolute()))
        self.assertEqual(shape, sv.vectors.shape)

        for t in [p, p_target]:
            t.unlink()
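This test exercises the plain np.memmap round-trip that backs SentenceVectors when a mapfile_path is set: the vectors are flushed to a ".vectors" file on disk and mapped back on load. Below is a minimal, standalone sketch of that round-trip; the file name and shape are illustrative, not taken from the library.

    import numpy as np
    from pathlib import Path

    path = Path("example.vectors")  # illustrative file name
    shape = (1000, 1000)

    # Write: create a disk-backed array, fill it, and flush it by deleting the handle.
    memvecs = np.memmap(path, dtype=np.float32, mode="w+", shape=shape)
    memvecs[:] = np.ones(shape, dtype=np.float32)
    del memvecs

    # Read: map the same file back without loading it fully into RAM.
    loaded = np.memmap(path, dtype=np.float32, mode="r", shape=shape)
    assert loaded.shape == shape
    assert float(loaded[0, 0]) == 1.0

    path.unlink()  # remove the temporary file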
Code Example #2
    def setUp(self):
        self.sv = SentenceVectors(2)
        self.sv.vectors = np.arange(10).reshape(5, 2)
Code Example #3
    def __init__(
        self,
        model: BaseKeyedVectors,
        sv_mapfile_path: str = None,
        wv_mapfile_path: str = None,
        workers: int = 1,
        lang_freq: str = None,
        fast_version: int = 0,
        batch_words: int = 10000,
        batch_ngrams: int = 40,
        **kwargs,
    ):
        """ Base class for all Sentence2Vec Models. Provides core functionality, such as
        save, load, sanity checking, frequency induction, data checking, scanning, etc.

        Parameters
        ----------
        model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
            This object essentially contains the mapping between words and embeddings. To compute the sentence
            embeddings, the wv.vocab and wv.vector elements are required.
        sv_mapfile_path : str, optional
            Optional path to store the sentence vectors in for very large datasets; used for memmap.
        wv_mapfile_path : str, optional
            Optional path to store the word vectors in for very large datasets; used for memmap.
            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much RAM.
        workers : int, optional
            Number of worker threads used for multithreading. For most tasks (few words in a sentence),
            a value of 1 should be more than enough.
        lang_freq : str, optional
            Some pre-trained embeddings, e.g. "GoogleNews-vectors-negative300.bin", do not contain information about
            the frequency of a word. As the frequency is required for estimating the word weights, we induce
            frequencies into wv.vocab.count based on :class:`~wordfreq`.
            If no frequency information is available, you can choose the language used to estimate the frequencies.
            See https://github.com/LuminosoInsight/wordfreq
        fast_version : {-1, 1}, optional
            Whether the fast Cython implementation of the internal training methods is available. 1 means it is;
            -1 means it is not.
        batch_words : int, optional
            Number of words to be processed by a single job.
        batch_ngrams : int, optional
            Maximum number of ngrams for OOV words.
        **kwargs : object
            Keyword arguments needed to allow child classes to accept more arguments.

        """
        """
        TODO:

        [ ] global:
            [ ] windows support
            [ ] documentation
            [ ] more benchmarks
            [ ] remove wv_mapfile_path?
            [ ] modifiable sv_mapfile_path?

        [ ] models:
            [ ] check feasibility first
            [ ] max-pooling -> easy
            [ ] hierarchical pooling -> easy
            [ ] discrete cosine transform -> somewhat easy, questionable
            [ ] valve -> unclear, not cited enough
            [ ] power-means embedding -> very large dimensionality
                [ ] z-score transformation is quite nice
            
        [ ] sentencevectors:
            [X] similar_by_sentence model type check
            [ ] approximate NN search for large files
                [ ] compare ANN libraries
                [ ] ease-of-use
                [ ] dependencies
                [ ] compatibility
                [ ] memory-usage
        """

        set_madvise_for_mmap()

        self.workers = int(workers)
        self.batch_words = batch_words
        self.batch_ngrams = batch_ngrams
        self.wv = None

        self.is_ft = False

        self.wv_mapfile_path = (Path(wv_mapfile_path)
                                if wv_mapfile_path is not None else None)
        self.wv_mapfile_shapes = {}

        if fast_version < 0:
            warnings.warn(
                "C extension not loaded, training/inferring will be slow. "
                "Install a C compiler and reinstall fse.")

        self._check_and_include_model(model)

        if self.wv_mapfile_path is not None:
            self._map_all_vectors_to_disk(self.wv_mapfile_path)

        if lang_freq is not None:
            self._check_language_settings(lang_freq)
            self._induce_frequencies()

        self.sv = SentenceVectors(vector_size=self.wv.vector_size,
                                  mapfile_path=sv_mapfile_path)
        self.prep = BaseSentence2VecPreparer()

        self.word_weights = ones(len(self.wv.vocab), REAL)
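For context, here is a hedged usage sketch of how these constructor parameters typically reach this base class through a concrete subclass. It assumes fse's Average model and IndexedList input wrapper together with gensim's KeyedVectors loader; the file paths are illustrative, not prescribed by the library.

    from gensim.models.keyedvectors import KeyedVectors
    from fse import IndexedList
    from fse.models import Average

    # Load pre-trained word vectors (path is illustrative).
    wv = KeyedVectors.load_word2vec_format(
        "GoogleNews-vectors-negative300.bin", binary=True)

    # GoogleNews vectors ship without word frequencies, so lang_freq="en" induces
    # them via wordfreq; the mapfile paths keep vectors on disk instead of in RAM.
    model = Average(
        wv,
        sv_mapfile_path="sentence_vectors",  # illustrative memmap prefix
        wv_mapfile_path="word_vectors",      # illustrative memmap prefix
        workers=1,
        lang_freq="en",
    )

    sentences = IndexedList([["hello", "world"], ["fast", "sentence", "embeddings"]])
    model.train(sentences)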