Example #1
    @classmethod
    def load(cls, *args, **kwargs):
        """ Load a previously saved :class:`~fse.models.base_s2v.BaseSentence2VecModel`.

        Parameters
        ----------
        fname : str
            Path to the saved file.

        Returns
        -------
        :class:`~fse.models.base_s2v.BaseSentence2VecModel`
            Loaded model.

        """
        # This is kind of an ugly hack because I cannot directly modify the save routine of the
        # corresponding KeyedVectors files, as a memmap file makes the npy files irrelevant
        model = super(BaseSentence2VecModel, cls).load(*args, **kwargs)

        if model.wv_mapfile_path is not None:
            model._load_all_vectors_from_disk(model.wv_mapfile_path)
        model.wv_mapfile_shapes = None

        set_madvise_for_mmap()

        return model
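For context, a minimal usage sketch of the load routine above; the file path is a placeholder and the sketch assumes a model that was previously persisted with .save():

    from fse.models.base_s2v import BaseSentence2VecModel

    # Hypothetical path to a previously saved model
    model = BaseSentence2VecModel.load("/tmp/my_s2v_model")

    # Sentence vectors are available again; they come back memmapped if a mapfile path was used
    print(model.sv.vectors.shape)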
Example #2
    @classmethod
    def load(cls, fname_or_handle, **kwargs):
        # TODO: Unittests
        sv = super(SentenceVectors, cls).load(fname_or_handle, **kwargs)
        path = sv.mapfile_path
        if path is not None:
            sv._load_all_vectors_from_disk(mapfile_path=path)
        set_madvise_for_mmap()
        return sv
Example #3
    def __init__(self, vector_size: int, mapfile_path: str = None):

        set_madvise_for_mmap()

        self.vector_size = vector_size  # Size of vectors
        self.vectors = zeros((0, vector_size), REAL)  # Vectors for sentences
        self.vectors_norm = None

        # File for numpy memmap
        self.mapfile_path = Path(
            mapfile_path) if mapfile_path is not None else None
        self.mapfile_shape = None
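The constructor above either keeps the sentence vectors in RAM or prepares them for a numpy memmap. A minimal sketch of both modes; the mapfile path is hypothetical and the import location assumes the class lives in fse.models.sentencevectors:

    from fse.models.sentencevectors import SentenceVectors

    # Default: vectors start as an empty (0, vector_size) in-memory array
    sv = SentenceVectors(vector_size=300)

    # Disk-backed: pass a path so the vectors can later be memmapped from disk
    sv_on_disk = SentenceVectors(vector_size=300, mapfile_path="/tmp/sv_vectors")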
Example #4
    def test_madvise(self):
        import numpy as np
        from pathlib import Path
        from sys import platform
        from fse.models.utils import set_madvise_for_mmap

        if platform in ["linux", "linux2", "darwin", "aix"]:
            p = Path("fse/test/test_data/test_vectors")
            madvise = set_madvise_for_mmap(True)
            shape = (500, 10)
            mat = np.random.normal(size=shape)
            memvecs = np.memmap(p, dtype=np.float32, mode="w+", shape=shape)
            memvecs[:] = mat[:]
            del memvecs

            mat = np.memmap(p, dtype=np.float32, mode="r", shape=shape)

            # Advice value 1 is MADV_RANDOM on Linux/macOS; madvise returns 0 on success
            self.assertEqual(
                madvise(mat.ctypes.data, mat.size * mat.dtype.itemsize, 1), 0)
            p.unlink()
Example #5
    def __init__(
        self,
        model: BaseKeyedVectors,
        sv_mapfile_path: str = None,
        wv_mapfile_path: str = None,
        workers: int = 1,
        lang_freq: str = None,
        fast_version: int = 0,
        batch_words: int = 10000,
        batch_ngrams: int = 40,
        **kwargs,
    ):
        """ Base class for all Sentence2Vec Models. Provides core functionality, such as
        save, load, sanity checking, frequency induction, data checking, scanning, etc.

        Parameters
        ----------
        model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
            This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
            the wv.vocab and wv.vector elements are required.
        sv_mapfile_path : str, optional
            Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
        wv_mapfile_path : str, optional
            Optional path to store the word-vectors in for very large datasets. Used for memmap.
            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
        workers : int, optional
            Number of working threads, used for multithreading. For most tasks (few words in a sentence)
            a value of 1 should be more than enough.
        lang_freq : str, optional
            Some pre-trained embeddings, e.g. "GoogleNews-vectors-negative300.bin", do not contain information about
            the frequency of a word. As the frequency is required for estimating the word weights, we induce
            frequencies into wv.vocab.count based on :class:`~wordfreq`.
            If no frequency information is available, you can choose the language used to estimate the frequencies.
            See https://github.com/LuminosoInsight/wordfreq
        fast_version : {-1, 1}, optional
            Whether or not the fast cython implementation of the internal training methods is available. 1 means it is.
        batch_words : int, optional
            Number of words to be processed by a single job.
        batch_ngrams : int, optional
            Maximum number of ngrams for OOV words.
        **kwargs : object
            Keyword arguments needed to allow child classes to accept more arguments.

        """
        """
        TODO:

        [ ] global:
            [ ] windows support
            [ ] documentation
            [ ] more benchmarks
            [ ] remove wv_mapfile_path?
            [ ] modifiable sv_mapfile_path?

        [ ] models:
            [ ] check feasibility first
            [ ] max-pooling -> easy
            [ ] hierarchical pooling -> easy
            [ ] discrete cosine transform -> somewhat easy, questionable
            [ ] valve -> unclear, not cited enough
            [ ] power-means embedding -> very large dimensionality
                [ ] z-score transformation is quite nice
            
        [ ] sentencevectors:
            [X] similar_by_sentence model type check
            [ ] approximate NN search for large files
                [ ] compare ANN libraries
                [ ] ease-of-use
                [ ] dependencies
                [ ] compatibility
                [ ] memory-usage
        """

        set_madvise_for_mmap()

        self.workers = int(workers)
        self.batch_words = batch_words
        self.batch_ngrams = batch_ngrams
        self.wv = None

        self.is_ft = False

        self.wv_mapfile_path = (Path(wv_mapfile_path)
                                if wv_mapfile_path is not None else None)
        self.wv_mapfile_shapes = {}

        if fast_version < 0:
            warnings.warn(
                "C extension not loaded, training/inferring will be slow. "
                "Install a C compiler and reinstall fse.")

        self._check_and_include_model(model)

        if self.wv_mapfile_path is not None:
            self._map_all_vectors_to_disk(self.wv_mapfile_path)

        if lang_freq is not None:
            self._check_language_settings(lang_freq)
            self._induce_frequencies()

        self.sv = SentenceVectors(vector_size=self.wv.vector_size,
                                  mapfile_path=sv_mapfile_path)
        self.prep = BaseSentence2VecPreparer()

        self.word_weights = ones(len(self.wv.vocab), REAL)
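Putting the parameters above together, a minimal end-to-end sketch; it assumes the Average subclass and the IndexedList helper exported by fse, plus a small GloVe model fetched via the gensim downloader (any BaseKeyedVectors works):

    import gensim.downloader as api
    from fse import IndexedList
    from fse.models import Average

    wv = api.load("glove-wiki-gigaword-100")        # downloads pre-trained word vectors
    sentences = [["hello", "world"], ["fse", "is", "fast"]]

    # lang_freq="en" induces word frequencies via wordfreq, as described above
    model = Average(wv, workers=1, lang_freq="en")
    model.train(IndexedList(sentences))

    print(model.sv[0])                              # embedding of the first sentence

Passing sv_mapfile_path and wv_mapfile_path to the constructor switches both vector stores to numpy memmaps, which is what allows disk-to-disk training on corpora that do not fit into RAM.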