def load(cls, *args, **kwargs):
    """Load a previously saved :class:`~fse.models.base_s2v.BaseSentence2VecModel`.

    Parameters
    ----------
    fname : str
        Path to the saved file.

    Returns
    -------
    :class:`~fse.models.base_s2v.BaseSentence2VecModel`
        Loaded model.

    """
    # HACK: the save routine of the corresponding KeyedVectors files cannot be
    # modified directly, and a memmap file makes the saved npy files irrelevant,
    # so the word vectors are re-mapped from disk after the generic load.
    model = super(BaseSentence2VecModel, cls).load(*args, **kwargs)
    path = model.wv_mapfile_path
    if path is not None:
        model._load_all_vectors_from_disk(path)
    model.wv_mapfile_shapes = None
    set_madvise_for_mmap()
    return model
def load(cls, fname_or_handle, **kwargs):
    """Load a previously saved :class:`SentenceVectors` instance.

    Re-maps the sentence vectors from disk when a memmap path was stored
    with the model, then advises the kernel about the access pattern.
    """
    # TODO: Unittests
    vectors = super(SentenceVectors, cls).load(fname_or_handle, **kwargs)
    if vectors.mapfile_path is not None:
        vectors._load_all_vectors_from_disk(mapfile_path=vectors.mapfile_path)
    set_madvise_for_mmap()
    return vectors
def __init__(self, vector_size: int, mapfile_path: str = None):
    """Container for the computed sentence vectors.

    Parameters
    ----------
    vector_size : int
        Dimensionality of the sentence vectors.
    mapfile_path : str, optional
        Path used to back the vectors with a numpy memmap for very
        large datasets. If None, vectors are kept in RAM.
    """
    set_madvise_for_mmap()

    # Dimensionality of each sentence vector.
    self.vector_size = vector_size
    # Starts empty; rows are allocated once the number of sentences is known.
    self.vectors = zeros((0, vector_size), REAL)
    self.vectors_norm = None
    # Optional numpy memmap backing file.
    self.mapfile_path = None if mapfile_path is None else Path(mapfile_path)
    self.mapfile_shape = None
def test_madvise(self):
    """Round-trip a memmap through disk and check madvise returns 0 (success)."""
    from pathlib import Path
    from sys import platform
    from fse.models.utils import set_madvise_for_mmap

    # madvise is only available on unix-like platforms.
    if platform not in ["linux", "linux2", "darwin", "aix"]:
        return

    target = Path("fse/test/test_data/test_vectors")
    madvise = set_madvise_for_mmap(True)

    rows, cols = 500, 10
    data = np.random.normal(size=(rows, cols))

    # Write the random matrix to disk via a writable memmap, then drop it.
    writable = np.memmap(target, dtype=np.float32, mode="w+", shape=(rows, cols))
    writable[:] = data[:]
    del writable

    # Re-open read-only and advise the kernel; 0 signals success.
    readonly = np.memmap(target, dtype=np.float32, mode="r", shape=(rows, cols))
    nbytes = readonly.size * readonly.dtype.itemsize
    self.assertEqual(madvise(readonly.ctypes.data, nbytes, 1), 0)

    target.unlink()
def __init__(
    self,
    model: BaseKeyedVectors,
    sv_mapfile_path: str = None,
    wv_mapfile_path: str = None,
    workers: int = 1,
    lang_freq: str = None,
    fast_version: int = 0,
    batch_words: int = 10000,
    batch_ngrams: int = 40,
    **kwargs,
):
    """Base class for all Sentence2Vec models.

    Provides core functionality, such as save, load, sanity checking,
    frequency induction, data checking, scanning, etc.

    Parameters
    ----------
    model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
        This object essentially contains the mapping between words and embeddings.
        To compute the sentence embeddings the wv.vocab and wv.vector elements
        are required.
    sv_mapfile_path : str, optional
        Optional path to store the sentence-vectors in for very large datasets.
        Used for memmap.
    wv_mapfile_path : str, optional
        Optional path to store the word-vectors in for very large datasets.
        Used for memmap. Use sv_mapfile_path and wv_mapfile_path to train
        disk-to-disk without needing much ram.
    workers : int, optional
        Number of working threads, used for multithreading. For most tasks
        (few words in a sentence) a value of 1 should be more than enough.
    lang_freq : str, optional
        Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin",
        do not contain information about the frequency of a word. As the
        frequency is required for estimating the word weights, we induce
        frequencies into the wv.vocab.count based on :class:`~wordfreq`.
        If no frequency information is available, you can choose the language
        to estimate the frequency.
        See https://github.com/LuminosoInsight/wordfreq
    fast_version : {-1, 1}, optional
        Whether or not the fast cython implementation of the internal training
        methods is available. 1 means it is.
    batch_words : int, optional
        Number of words to be processed by a single job.
    batch_ngrams : int, optional
        Number of maximum ngrams for oov words.
    **kwargs : object
        Key word arguments needed to allow children classes to accept more
        arguments.

    """
    # TODO:
    #   global:
    #     [ ] windows support
    #     [ ] documentation
    #     [ ] more benchmarks
    #     [ ] remove wv_mapfile_path?
    #     [ ] modifiable sv_mapfile_path?
    #   models (check feasibility first):
    #     [ ] max-pooling -> easy
    #     [ ] hierarchical pooling -> easy
    #     [ ] discrete cosine transform -> somewhat easy, questionable
    #     [ ] valve -> unclear, not cited enough
    #     [ ] power-means embedding -> very large dimensionality
    #     [ ] z-score transformation is quite nice
    #   sentencevectors:
    #     [X] similar_by_sentence model type check
    #     [ ] approximate NN search for large files
    #         [ ] compare ANN libraries (ease-of-use, dependencies,
    #             compatibility, memory-usage)
    set_madvise_for_mmap()

    self.workers = int(workers)
    self.batch_words = batch_words
    self.batch_ngrams = batch_ngrams
    self.wv = None  # Filled in by _check_and_include_model below.
    self.is_ft = False

    # Optional on-disk backing file for the word vectors (memmap).
    self.wv_mapfile_path = (
        None if wv_mapfile_path is None else Path(wv_mapfile_path)
    )
    self.wv_mapfile_shapes = {}

    if fast_version < 0:
        warnings.warn(
            "C extension not loaded, training/inferring will be slow. "
            "Install a C compiler and reinstall fse."
        )

    # Validate the supplied word-embedding model and attach its vectors.
    self._check_and_include_model(model)

    # Must come after the model check: maps the attached vectors to disk.
    if self.wv_mapfile_path is not None:
        self._map_all_vectors_to_disk(self.wv_mapfile_path)

    # Induce word frequencies only when the caller asked for it.
    if lang_freq is not None:
        self._check_language_settings(lang_freq)
        self._induce_frequencies()

    self.sv = SentenceVectors(
        vector_size=self.wv.vector_size, mapfile_path=sv_mapfile_path
    )
    self.prep = BaseSentence2VecPreparer()

    # Uniform weights by default; subclasses may overwrite these.
    self.word_weights = ones(len(self.wv.vocab), REAL)