Example #1
def read_vector_models(path_src_vw_model_bin, path_tgt_vw_model_bin):
    if not all([os.path.isfile(fname) for fname in [path_src_vw_model_bin, path_tgt_vw_model_bin]]):
        print('Some of the vector model files given do not exist, perhaps check defaults!')
        sys.exit()

    print('+ preparing src vector model')
    if "ft" in path_src_vw_model_bin:
        vw_src_model = FastTextKeyedVectors.load(path_src_vw_model_bin)
        vw_src_model.add(UNK_token, np.random.normal(0, 0.01, vw_src_model.vector_size))
    else:
        vw_src_model = KeyedVectors.load_word2vec_format(path_src_vw_model_bin, binary=True)
    print('++ src vector model read')
    vw_src_model = extendPretrainedModel(vw_src_model)
    print('++ src vector model extended')

    print('+ preparing tgt vector model')
    if "ft" in path_tgt_vw_model_bin:
        vw_tgt_model = FastTextKeyedVectors.load(path_tgt_vw_model_bin)
        vw_tgt_model.add(UNK_token, np.random.normal(0, 0.01, vw_tgt_model.vector_size))
    else:
        vw_tgt_model = KeyedVectors.load_word2vec_format(path_tgt_vw_model_bin, binary=True)
    print('++ tgt vector model read')
    vw_tgt_model = extendPretrainedModel(vw_tgt_model)
    print('++ tgt vector model extended')

    return vw_src_model, vw_tgt_model
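The core pattern in Example #1 -- load a saved FastTextKeyedVectors model and register a random vector for an unknown-word token -- can be reduced to a few lines. The sketch below assumes the gensim 3.x API; the model path and the "<UNK>" token are placeholders, not values from the original project.

import numpy as np
from gensim.models.keyedvectors import FastTextKeyedVectors

UNK_TOKEN = "<UNK>"  # hypothetical unknown-word sentinel
model = FastTextKeyedVectors.load("path/to/ft_model.model")  # placeholder path
if UNK_TOKEN not in model.vocab:
    # small random vector, same recipe as in Example #1
    model.add(UNK_TOKEN, np.random.normal(0, 0.01, model.vector_size))
print(model.get_vector(UNK_TOKEN)[:5])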
Example #2
 def _load_tree(self, tree: dict) -> None:
     self.__dict__.update(tree)
     self.tokens = split_strings(self.tokens)
     self.frequencies = {
         w: self.frequencies["vals"][i]
         for i, w in enumerate(split_strings(self.frequencies["keys"]))}
     self.checker = SymSpell(max_dictionary_edit_distance=self.max_distance)
     self.checker.__dict__.update(tree["checker"])
     deletes = {}
     words = split_strings(self.checker._deletes["strings"])
     lengths = self.checker._deletes["lengths"]
     data = self.checker._deletes["data"]
     offset = 0
     for i, delindex in enumerate(self.checker._deletes["indexes"]):
         length = lengths[i]
         deletes[delindex] = [words[j] for j in data[offset:offset + length]]
         offset += length
     self.checker._deletes = deletes
     self.checker._words = {w: self.checker._words[i] for i, w in enumerate(words)}
     vectors = self.wv["vectors"]
     wv = FastTextKeyedVectors(vectors.shape[1], self.wv["min_n"], self.wv["max_n"],
                               self.wv["bucket"], True)
     wv.vectors = numpy.array(vectors)
     vocab = split_strings(self.wv["vocab"]["strings"])
     wv.vocab = {
         s: Vocab(index=i, count=self.wv["vocab"]["counts"][i])
         for i, s in enumerate(vocab)}
     wv.bucket = self.wv["bucket"]
     wv.index2word = wv.index2entity = vocab
     wv.num_ngram_vectors = self.wv["num_ngram_vectors"]
     wv.vectors_ngrams = numpy.array(self.wv["vectors_ngrams"])
     wv.hash2index = {k: v for v, k in enumerate(self.wv["hash2index"])}
     self.wv = wv
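Example #2 rebuilds a FastTextKeyedVectors instance field by field from a serialized tree. A stripped-down sketch of the same manual construction, using the five-argument constructor seen above (gensim 3.8-era signature assumed) and made-up toy data:

import numpy as np
from gensim.models.keyedvectors import FastTextKeyedVectors, Vocab

words = ["tree", "graph"]
vectors = np.random.rand(len(words), 16).astype(np.float32)

# constructor arguments: vector_size, min_n, max_n, bucket, compatible_hash
wv = FastTextKeyedVectors(vectors.shape[1], 3, 6, 2000000, True)
wv.vectors = vectors
wv.vocab = {w: Vocab(index=i, count=1) for i, w in enumerate(words)}
wv.index2word = list(words)
print(wv["tree"].shape)  # in-vocabulary lookup only needs vectors + vocab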
Example #3
def embedding_text(target_dataset):
    print("Loading embedding model...")
    model_name = 'FASTTEXT_' + target_dataset + '.model'
    embedding_model = FastTextKeyedVectors.load(
        os.path.join(CONFIG.EMBEDDING_PATH, model_name))
    print("Loading embedding model completed")
    dataset_path = os.path.join(CONFIG.DATASET_PATH, target_dataset)
    for loc_id in tqdm(os.listdir(dataset_path)):
        path_dir = os.path.join(dataset_path, loc_id)
        for post in tqdm(os.listdir(path_dir), leave=False):
            pickle_path = os.path.join(path_dir, post, "text.p")
            with open(os.path.join(path_dir, post, "text.txt"),
                      'r',
                      encoding='utf-8',
                      newline='\n') as f:
                text_data = f.read()
                word_list = text_data.split()
                vector_list = []
                if len(word_list) > CONFIG.MAX_SENTENCE_LEN:
                    # truncate sentence if sentence length is longer than `max_sentence_len`
                    word_list = word_list[:CONFIG.MAX_SENTENCE_LEN]
                    word_list[-1] = '<EOS>'
                else:
                    word_list = word_list + ['<PAD>'] * (
                        CONFIG.MAX_SENTENCE_LEN - len(word_list))
                for word in word_list:
                    vector = embedding_model.get_vector(word)
                    vector_list.append(vector)
                vector_array = np.array(vector_list, dtype=np.float32)
            with open(pickle_path, 'wb') as f:
                cPickle.dump(vector_array, f, protocol=-1)
            del text_data, word_list, vector_array
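Example #3 relies on a property of FastText vectors: get_vector also returns an embedding for out-of-vocabulary tokens such as '<PAD>' and '<EOS>', assembled from character n-gram vectors. A minimal sketch, assuming a saved model and a hypothetical file name:

from gensim.models.keyedvectors import FastTextKeyedVectors

model = FastTextKeyedVectors.load("FASTTEXT_instagram.model")  # hypothetical file name
vec = model.get_vector("<PAD>")  # works via n-gram hashing even if '<PAD>' was never seen
print(vec.shape, "<PAD>" in model.vocab)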
Example #4
 def train_model(self, corpus):
     if self.model is None:
         logging.info(f"Start loading model {self.pretrained_model_path}")
         if self.pretrained_model_path.endswith(".bin"):
             self.model = load_facebook_vectors(self.pretrained_model_path)
         else:
             self.model = FastTextKeyedVectors.load(self.pretrained_model_path)
         self.model.init_sims(True)
         logging.info(f"Finished loading model {self.pretrained_model_path}")
     return self.model
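The branch above distinguishes two on-disk formats: Facebook's native .bin output and a model saved with gensim's own save(). A minimal sketch of the two loaders (gensim 3.8 API assumed; both paths are placeholders):

from gensim.models.fasttext import load_facebook_vectors
from gensim.models.keyedvectors import FastTextKeyedVectors

wv_native = load_facebook_vectors("model.bin")        # Facebook's native binary format
wv_gensim = FastTextKeyedVectors.load("model.model")  # gensim's own .save() format
print(wv_native.vector_size, wv_gensim.vector_size)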
Example #5
    def test_ft_kv_backward_compat_w_360(self):
        kv = EuclideanKeyedVectors.load(datapath("ft_kv_3.6.0.model.gz"))
        ft_kv = FastTextKeyedVectors.load(datapath("ft_kv_3.6.0.model.gz"))

        expected = ['trees', 'survey', 'system', 'graph', 'interface']
        actual = [word for (word, similarity) in kv.most_similar("human", topn=5)]

        self.assertEqual(actual, expected)

        actual = [word for (word, similarity) in ft_kv.most_similar("human", topn=5)]

        self.assertEqual(actual, expected)
def test_fasttext(target_model):
	model_name = 'FASTTEXT_' + target_model + '.model'
	model = FastTextKeyedVectors.load(os.path.join(CONFIG.EMBEDDING_PATH, model_name))
	pad_vector = np.full(300, np.finfo(np.float32).eps)
	# pad_vector = np.random.randn(300)
	# pad_vector = np.ones(300)
	# pad_vector = np.full(300, 100)
	# print(pad_vector)
	print(model.similar_by_word("<EOS>"))
	print(model.similar_by_vector(vector=pad_vector, topn=5))
	model.add("<PAD>", pad_vector)
	model.init_sims(replace=True)
	print(model.similar_by_vector(vector=pad_vector, topn=5))
	print(model.get_vector("<EOS>"))
	print(model.get_vector("<PAD>"))
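Note on the init_sims(replace=True) call above: it L2-normalizes the stored vectors in place, so later lookups return unit-length vectors. A self-contained illustration with plain KeyedVectors and toy data (no model file needed):

import numpy as np
from gensim.models import KeyedVectors

kv = KeyedVectors(vector_size=4)
kv.add(["a", "b"], [np.array([3.0, 0, 0, 0], dtype=np.float32),
                    np.array([0, 4.0, 0, 0], dtype=np.float32)])
kv.init_sims(replace=True)
print(kv.get_vector("a"))  # -> [1. 0. 0. 0.], the original vector rescaled to unit length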
def data_reshape(df,
                 lookup_table_relations,
                 lookup_table_ent_types,
                 lookup_table_deptags,
                 lookup_table_postags,
                 len_max_seq=30):
    """Take a dataframe of features and reformat to fit into a vectors of (1D features, sequential features, target) as the input of the model"""
    wordvectors = FastTextKeyedVectors.load(
        ".data/pretrained_word_vectors.bin")

    #Create array X and y
    features_flat = []
    labels = []
    seq_flat = []

    for row in df.itertuples():
        labels.append(int(row.relation != ''))
        #'flat' features
        ent1t = lookup_table_ent_types[row.ent1type]
        ent2t = lookup_table_ent_types[row.ent2type]
        vec = np.concatenate((ent1t, ent2t))
        features_flat.append(vec)

        len_seq = 0
        for i in range(0, len(row.shortest_dependency_path_p.split("/"))):
            #'sequence' features
            current_word = row.shortest_dependency_path_w.split("/")[i]
            current_dependency = row.shortest_dependency_path_p.split("/")[i]
            current_pos_tag = row.shortest_dependency_path_t.split("/")[i]
            seq_sdp_w = wordvectors[current_word] if type(
                current_word) == str else np.zeros(100)
            seq_sdp_p = lookup_table_deptags[
                current_dependency.split(':')
                [0]] if "conj" in current_dependency else lookup_table_deptags[
                    current_dependency]
            seq_sdp_t = lookup_table_postags[current_pos_tag]
            vec_seq = np.concatenate((seq_sdp_w, seq_sdp_p, seq_sdp_t))
            seq_flat.append(vec_seq)
            len_seq += 1
        while len_seq < len_max_seq:
            seq_flat.append(np.zeros(len(vec_seq)))
            len_seq += 1
    # add labels and reshape into 3-dimensional tensors
    labels = np.array(labels)
    features_words = np.array(features_flat)
    seq = np.reshape(seq_flat, (df.shape[0], len_max_seq, len(vec_seq)))
    return (features_words, seq, labels)
def visualize_language():
    model_name = get_model_name()
    path = "data/vector_models/" + model_name

    if "ft" in model_name:
        wv = FastTextKeyedVectors.load(path)
    else:
        wv = KeyedVectors.load_word2vec_format(path, binary=True)

    print("Vocab size:", len(wv.vocab))

    words = [""]
    if "en" in model_name:
        words = en_words

    if "nl" in model_name:
        words = nl_words

    visualize_words(wv, words)
Example #10
    def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
                 max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                 negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6,
                 sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=()):
        """

        Parameters
        ----------
        sentences : iterable of list of str, optional
            Can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
            If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it
            in some other way.
        min_count : int, optional
            The model ignores all words with total frequency lower than this.
        size : int, optional
            Dimensionality of the word vectors.
        window : int, optional
            The maximum distance between the current and predicted word within a sentence.
        workers : int, optional
            Use these many worker threads to train the model (=faster training with multicore machines).
        alpha : float, optional
            The initial learning rate.
        min_alpha : float, optional
            Learning rate will linearly drop to `min_alpha` as training progresses.
        sg : {1, 0}, optional
            Training algorithm: skip-gram if `sg=1`, otherwise CBOW.
        hs : {1,0}, optional
            If 1, hierarchical softmax will be used for model training.
            If set to 0, and `negative` is non-zero, negative sampling will be used.
        seed : int, optional
            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
        max_vocab_size : int, optional
            Limits the RAM during vocabulary building; if there are more unique
            words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
            Set to `None` for no limit.
        sample : float, optional
            The threshold for configuring which higher-frequency words are randomly downsampled,
            useful range is (0, 1e-5).
        negative : int, optional
            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
            should be drawn (usually between 5-20).
            If set to 0, no negative sampling is used.
        ns_exponent : float, optional
            The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion
            to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more
            than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper.
            More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that
            other values may perform better for recommendation applications.
        cbow_mean : {1,0}, optional
            If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
        hashfxn : function, optional
            Hash function to use to randomly initialize weights, for increased training reproducibility.
        iter : int, optional
            Number of iterations (epochs) over the corpus.
        trim_rule : function, optional
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            The rule, if given, is only used to prune vocabulary during
            :meth:`~gensim.models.fasttext.FastText.build_vocab` and is not stored as part of the model.

            The input parameters are of the following types:
                * `word` (str) - the word we are examining
                * `count` (int) - the word's frequency count in the corpus
                * `min_count` (int) - the minimum count threshold.

        sorted_vocab : {1,0}, optional
            If 1, sort the vocabulary by descending frequency before assigning word indices.
        batch_words : int, optional
            Target size (in words) for batches of examples passed to worker threads (and
            thus cython routines). (Larger batches will be passed if individual
            texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
        min_n : int, optional
            Minimum length of char n-grams to be used for training word representations.
        max_n : int, optional
            Max length of char n-grams to be used for training word representations. Set `max_n` to be
            less than `min_n` to avoid char n-grams being used.
        word_ngrams : {1,0}, optional
            If 1, enriches word vectors with subword (n-gram) information.
            If 0, this is equivalent to :class:`~gensim.models.word2vec.Word2Vec`.
        bucket : int, optional
            Character ngrams are hashed into a fixed number of buckets, in order to limit the
            memory usage of the model. This option specifies the number of buckets used by the model.
        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional
            List of callbacks that need to be executed/run at specific stages during training.

        Examples
        --------
        Initialize and train a `FastText` model:

        >>> from gensim.models import FastText
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = FastText(sentences, min_count=1)
        >>> say_vector = model['say']  # get vector for a word
        >>> of_vector = model['of']  # get vector for an out-of-vocab word

        """
        self.load = call_on_class_only
        self.load_fasttext_format = call_on_class_only
        self.callbacks = callbacks
        self.word_ngrams = int(word_ngrams)
        if self.word_ngrams <= 1 and max_n == 0:
            bucket = 0

        self.wv = FastTextKeyedVectors(size, min_n, max_n)
        self.vocabulary = FastTextVocab(
            max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
            sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent)
        self.trainables = FastTextTrainables(
            vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn)
        self.wv.bucket = self.bucket

        super(FastText, self).__init__(
            sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks,
            batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed,
            hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, fast_version=FAST_VERSION)
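For comparison with the constructor above, the public API can be exercised end to end: train a tiny FastText model, persist only its keyed vectors, and reload them with FastTextKeyedVectors.load. A hedged sketch with made-up toy sentences and a placeholder file name (gensim 3.x argument names as documented above):

from gensim.models import FastText
from gensim.models.keyedvectors import FastTextKeyedVectors

sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
model = FastText(sentences, size=10, min_count=1, iter=5, bucket=10000)

model.wv.save("tiny_ft.model")                  # placeholder file name
wv = FastTextKeyedVectors.load("tiny_ft.model")
print(wv.most_similar("cat", topn=2))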
Example #11
class FastText(BaseWordEmbeddingsModel):
    """Train, use and evaluate word representations learned using the method
    described in `Enriching Word Vectors with Subword Information <https://arxiv.org/abs/1607.04606>`_, aka FastText.

    The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save` and
    :meth:`~gensim.models.fasttext.FastText.load` methods, or loaded from a format compatible with the original
    Fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format`.

    Some important internal attributes are the following:

    Attributes
    ----------
    wv : :class:`~gensim.models.keyedvectors.FastTextKeyedVectors`
        This object essentially contains the mapping between words and embeddings. These are similar to the embeddings
        computed in the :class:`~gensim.models.word2vec.Word2Vec`, however here we also include vectors for n-grams.
        This allows the model to compute embeddings even for **unseen** words (that do not exist in the vocabulary),
        as the aggregate of the n-grams included in the word. After training the model, this attribute can be used
        directly to query those embeddings in various ways. Check the module level docstring for some examples.
    vocabulary : :class:`~gensim.models.fasttext.FastTextVocab`
        This object represents the vocabulary of the model.
        Besides keeping track of all unique words, this object provides extra functionality, such as
        constructing a huffman tree (frequent words are closer to the root), or discarding extremely rare words.
    trainables : :class:`~gensim.models.fasttext.FastTextTrainables`
        This object represents the inner shallow neural network used to train the embeddings. This is very
        similar to the network of the :class:`~gensim.models.word2vec.Word2Vec` model, but it also trains weights
        for the N-Grams (sequences of more than 1 word). The semantics of the network are almost the same as
        the one used for the :class:`~gensim.models.word2vec.Word2Vec` model.
        You can think of it as a NN with a single projection and hidden layer which we train on the corpus.
        The weights are then used as our embeddings. An important difference however between the two models, is the
        scoring function used to compute the loss. In the case of FastText, this is modified in a way to also account
        for the internal structure of words, besides their co-occurrence counts.

    """
    def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
                 max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                 negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6,
                 sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=()):
        """

        Parameters
        ----------
        sentences : iterable of list of str, optional
            Can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
            If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it
            in some other way.
        min_count : int, optional
            The model ignores all words with total frequency lower than this.
        size : int, optional
            Dimensionality of the word vectors.
        window : int, optional
            The maximum distance between the current and predicted word within a sentence.
        workers : int, optional
            Use these many worker threads to train the model (=faster training with multicore machines).
        alpha : float, optional
            The initial learning rate.
        min_alpha : float, optional
            Learning rate will linearly drop to `min_alpha` as training progresses.
        sg : {1, 0}, optional
            Training algorithm: skip-gram if `sg=1`, otherwise CBOW.
        hs : {1,0}, optional
            If 1, hierarchical softmax will be used for model training.
            If set to 0, and `negative` is non-zero, negative sampling will be used.
        seed : int, optional
            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
        max_vocab_size : int, optional
            Limits the RAM during vocabulary building; if there are more unique
            words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
            Set to `None` for no limit.
        sample : float, optional
            The threshold for configuring which higher-frequency words are randomly downsampled,
            useful range is (0, 1e-5).
        negative : int, optional
            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
            should be drawn (usually between 5-20).
            If set to 0, no negative sampling is used.
        ns_exponent : float, optional
            The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion
            to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more
            than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper.
            More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that
            other values may perform better for recommendation applications.
        cbow_mean : {1,0}, optional
            If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
        hashfxn : function, optional
            Hash function to use to randomly initialize weights, for increased training reproducibility.
        iter : int, optional
            Number of iterations (epochs) over the corpus.
        trim_rule : function, optional
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            The rule, if given, is only used to prune vocabulary during
            :meth:`~gensim.models.fasttext.FastText.build_vocab` and is not stored as part of the model.

            The input parameters are of the following types:
                * `word` (str) - the word we are examining
                * `count` (int) - the word's frequency count in the corpus
                * `min_count` (int) - the minimum count threshold.

        sorted_vocab : {1,0}, optional
            If 1, sort the vocabulary by descending frequency before assigning word indices.
        batch_words : int, optional
            Target size (in words) for batches of examples passed to worker threads (and
            thus cython routines). (Larger batches will be passed if individual
            texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
        min_n : int, optional
            Minimum length of char n-grams to be used for training word representations.
        max_n : int, optional
            Max length of char n-grams to be used for training word representations. Set `max_n` to be
            less than `min_n` to avoid char n-grams being used.
        word_ngrams : {1,0}, optional
            If 1, enriches word vectors with subword (n-gram) information.
            If 0, this is equivalent to :class:`~gensim.models.word2vec.Word2Vec`.
        bucket : int, optional
            Character ngrams are hashed into a fixed number of buckets, in order to limit the
            memory usage of the model. This option specifies the number of buckets used by the model.
        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional
            List of callbacks that need to be executed/run at specific stages during training.

        Examples
        --------
        Initialize and train a `FastText` model:

        >>> from gensim.models import FastText
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = FastText(sentences, min_count=1)
        >>> say_vector = model['say']  # get vector for a word
        >>> of_vector = model['of']  # get vector for an out-of-vocab word

        """
        self.load = call_on_class_only
        self.load_fasttext_format = call_on_class_only
        self.callbacks = callbacks
        self.word_ngrams = int(word_ngrams)
        if self.word_ngrams <= 1 and max_n == 0:
            bucket = 0

        self.wv = FastTextKeyedVectors(size, min_n, max_n)
        self.vocabulary = FastTextVocab(
            max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
            sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent)
        self.trainables = FastTextTrainables(
            vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn)
        self.wv.bucket = self.bucket

        super(FastText, self).__init__(
            sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks,
            batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed,
            hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, fast_version=FAST_VERSION)

    @property
    @deprecated("Attribute will be removed in 4.0.0, use wv.min_n instead")
    def min_n(self):
        return self.wv.min_n

    @property
    @deprecated("Attribute will be removed in 4.0.0, use wv.max_n instead")
    def max_n(self):
        return self.wv.max_n

    @property
    @deprecated("Attribute will be removed in 4.0.0, use trainables.bucket instead")
    def bucket(self):
        return self.trainables.bucket

    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead")
    def syn0_vocab_lockf(self):
        return self.trainables.vectors_vocab_lockf

    @syn0_vocab_lockf.setter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead")
    def syn0_vocab_lockf(self, value):
        self.trainables.vectors_vocab_lockf = value

    @syn0_vocab_lockf.deleter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead")
    def syn0_vocab_lockf(self):
        del self.trainables.vectors_vocab_lockf

    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead")
    def syn0_ngrams_lockf(self):
        return self.trainables.vectors_ngrams_lockf

    @syn0_ngrams_lockf.setter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead")
    def syn0_ngrams_lockf(self, value):
        self.trainables.vectors_ngrams_lockf = value

    @syn0_ngrams_lockf.deleter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead")
    def syn0_ngrams_lockf(self):
        del self.trainables.vectors_ngrams_lockf

    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.wv.num_ngram_vectors instead")
    def num_ngram_vectors(self):
        return self.wv.num_ngram_vectors

    def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs):
        """Build vocabulary from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of unicode strings.

        Parameters
        ----------
        sentences : iterable of list of str
            Can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
        update : bool
            If true, the new words in `sentences` will be added to model's vocab.
        progress_per : int
            Indicates how many words to process before showing/updating the progress.
        keep_raw_vocab : bool
            If not true, delete the raw vocabulary after the scaling is done and free up RAM.
        trim_rule : function, optional
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            The rule, if given, is only used to prune vocabulary during
            :meth:`~gensim.models.fasttext.FastText.build_vocab` and is not stored as part of the model.

            The input parameters are of the following types:
                * `word` (str) - the word we are examining
                * `count` (int) - the word's frequency count in the corpus
                * `min_count` (int) - the minimum count threshold.

        **kwargs
            Additional key word parameters passed to
            :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.build_vocab`.

        Examples
        --------
        Train a model and update vocab for online training

        >>> from gensim.models import FastText
        >>> sentences_1 = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>> sentences_2 = [["dude", "say", "wazzup!"]]
        >>>
        >>> model = FastText(min_count=1)
        >>> model.build_vocab(sentences_1)
        >>> model.train(sentences_1, total_examples=model.corpus_count, epochs=model.iter)
        >>>
        >>> model.build_vocab(sentences_2, update=True)
        >>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.iter)

        """
        if update:
            if not len(self.wv.vocab):
                raise RuntimeError(
                    "You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
                    "First build the vocabulary of your model with a corpus "
                    "before doing an online update.")
            self.vocabulary.old_vocab_len = len(self.wv.vocab)
            self.trainables.old_hash2index_len = len(self.wv.hash2index)

        return super(FastText, self).build_vocab(
            sentences, update=update, progress_per=progress_per,
            keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs)

    def _set_train_params(self, **kwargs):
        pass

    def _clear_post_train(self):
        """Clear the model's internal structures after training has finished to free up RAM."""
        self.wv.vectors_norm = None
        self.wv.vectors_vocab_norm = None
        self.wv.vectors_ngrams_norm = None
        self.wv.buckets_word = None

    def estimate_memory(self, vocab_size=None, report=None):
        vocab_size = vocab_size or len(self.wv.vocab)
        vec_size = self.vector_size * np.dtype(np.float32).itemsize
        l1_size = self.layer1_size * np.dtype(np.float32).itemsize
        report = report or {}
        report['vocab'] = len(self.wv.vocab) * (700 if self.hs else 500)
        report['syn0_vocab'] = len(self.wv.vocab) * vec_size
        num_buckets = self.bucket
        if self.hs:
            report['syn1'] = len(self.wv.vocab) * l1_size
        if self.negative:
            report['syn1neg'] = len(self.wv.vocab) * l1_size
        if self.word_ngrams > 0 and self.wv.vocab:
            buckets = set()
            num_ngrams = 0
            for word in self.wv.vocab:
                ngrams = _compute_ngrams(word, self.min_n, self.max_n)
                num_ngrams += len(ngrams)
                buckets.update(_ft_hash(ng) % self.bucket for ng in ngrams)
            num_buckets = len(buckets)
            report['syn0_ngrams'] = len(buckets) * vec_size
            # A tuple (48 bytes) with num_ngrams_word ints (8 bytes) for each word
            # Only used during training, not stored with the model
            report['buckets_word'] = 48 * len(self.wv.vocab) + 8 * num_ngrams
        elif self.word_ngrams > 0:
            logger.warn(
                'subword information is enabled, but no vocabulary could be found, estimated required memory might be '
                'inaccurate!'
            )
        report['total'] = sum(report.values())
        logger.info(
            "estimated required memory for %i words, %i buckets and %i dimensions: %i bytes",
            len(self.wv.vocab), num_buckets, self.vector_size, report['total']
        )
        return report

    def _do_train_job(self, sentences, alpha, inits):
        """Train a single batch of sentences. Return 2-tuple `(effective word count after
        ignoring unknown words and sentence length trimming, total word count)`.

        Parameters
        ----------
        sentences : iterable of list of str
            Can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
        alpha : float
            The current learning rate.
        inits : tuple of (:class:`numpy.ndarray`, :class:`numpy.ndarray`)
            Each worker's private work memory.

        Returns
        -------
        (int, int)
            Tuple of (effective word count after ignoring unknown words and sentence length trimming, total word count)

        """
        work, neu1 = inits
        tally = 0
        if self.sg:
            tally += train_batch_sg(self, sentences, alpha, work, neu1)
        else:
            tally += train_batch_cbow(self, sentences, alpha, work, neu1)

        return tally, self._raw_word_count(sentences)

    def train(self, sentences, total_examples=None, total_words=None,
              epochs=None, start_alpha=None, end_alpha=None,
              word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs):
        """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        For FastText, each sentence must be a list of unicode strings.

        To support linear learning-rate decay from (initial) `alpha` to `min_alpha`, and accurate
        progress-percentage logging, either `total_examples` (count of sentences) or `total_words` (count of
        raw words in sentences) **MUST** be provided. If `sentences` is the same corpus
        that was provided to :meth:`~gensim.models.fasttext.FastText.build_vocab` earlier,
        you can simply use `total_examples=self.corpus_count`.

        To avoid common mistakes around the model's ability to do multiple training passes itself, an
        explicit `epochs` argument **MUST** be provided. In the common and recommended case
        where :meth:`~gensim.models.fasttext.FastText.train` is only called once, you can set `epochs=self.iter`.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
        total_examples : int
            Count of sentences.
        total_words : int
            Count of raw words in sentences.
        epochs : int
            Number of iterations (epochs) over the corpus.
        start_alpha : float, optional
            Initial learning rate. If supplied, replaces the starting `alpha` from the constructor,
            for this one call to :meth:`~gensim.models.fasttext.FastText.train`.
            Use only if making multiple calls to :meth:`~gensim.models.fasttext.FastText.train`, when you want to manage
            the alpha learning-rate yourself (not recommended).
        end_alpha : float, optional
            Final learning rate. Drops linearly from `start_alpha`.
            If supplied, this replaces the final `min_alpha` from the constructor, for this one call to
            :meth:`~gensim.models.fasttext.FastText.train`.
            Use only if making multiple calls to :meth:`~gensim.models.fasttext.FastText.train`, when you want to manage
            the alpha learning-rate yourself (not recommended).
        word_count : int
            Count of words already trained. Set this to 0 for the usual
            case of training on all words in sentences.
        queue_factor : int
            Multiplier for size of queue (number of workers * queue_factor).
        report_delay : float
            Seconds to wait before reporting progress.
        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
            List of callbacks that need to be executed/run at specific stages during training.

        Examples
        --------
        >>> from gensim.models import FastText
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = FastText(min_count=1)
        >>> model.build_vocab(sentences)
        >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

        """
        super(FastText, self).train(
            sentences, total_examples=total_examples, total_words=total_words,
            epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
            queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks)
        self.trainables.get_vocab_word_vecs(self.wv)

    def init_sims(self, replace=False):
        """
        Precompute L2-normalized vectors.

        Parameters
        ----------
        replace : bool
            If True, forget the original vectors and only keep the normalized ones to save RAM.

        """
        # init_sims() resides in KeyedVectors because it deals with input layer mainly, but because the
        # hidden layer is not an attribute of KeyedVectors, it has to be deleted in this class.
        # The normalizing of input layer happens inside of KeyedVectors.
        if replace and hasattr(self.trainables, 'syn1'):
            del self.trainables.syn1
        self.wv.init_sims(replace)

    def clear_sims(self):
        """Remove all L2-normalized word vectors from the model, to free up memory.

        You can recompute them later again using the :meth:`~gensim.models.fasttext.FastText.init_sims` method.

        """
        self._clear_post_train()

    @deprecated("Method will be removed in 4.0.0, use self.wv.__getitem__() instead")
    def __getitem__(self, words):
        """Deprecated. Use self.wv.__getitem__() instead.

        Refer to the documentation for :meth:`gensim.models.keyedvectors.KeyedVectors.__getitem__`

        """
        return self.wv.__getitem__(words)

    @deprecated("Method will be removed in 4.0.0, use self.wv.__contains__() instead")
    def __contains__(self, word):
        """Deprecated. Use self.wv.__contains__() instead.

        Refer to the documentation for :meth:`gensim.models.keyedvectors.KeyedVectors.__contains__`

        """
        return self.wv.__contains__(word)

    @classmethod
    def load_fasttext_format(cls, model_file, encoding='utf8'):
        """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files.

        Notes
        ------
        Due to limitations in the FastText API, you cannot continue training with a model loaded this way.

        Parameters
        ----------
        model_file : str
            Path to the FastText output files.
            FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin`
            Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
            as Gensim requires only the `.bin` file to load the entire fastText model.
        encoding : str, optional
            Specifies the file encoding.

        Returns
        -------
        :class: `~gensim.models.fasttext.FastText`
            The loaded model.

        """
        model = cls()
        if not model_file.endswith('.bin'):
            model_file += '.bin'
        model.file_name = model_file
        model.load_binary_data(encoding=encoding)
        return model

    def load_binary_data(self, encoding='utf8'):
        """Load data from a binary file created by Facebook's native FastText.

        Parameters
        ----------
        encoding : str, optional
            Specifies the encoding.

        """

        # TODO use smart_open again when https://github.com/RaRe-Technologies/smart_open/issues/207 will be fixed
        with open(self.file_name, 'rb') as f:
            self._load_model_params(f)
            self._load_dict(f, encoding=encoding)
            self._load_vectors(f)

    def _load_model_params(self, file_handle):
        """Load model parameters from Facebook's native fasttext file.

        Parameters
        ----------
        file_handle : file-like object
            Handle to an open file.

        """
        magic, version = self.struct_unpack(file_handle, '@2i')
        if magic == FASTTEXT_FILEFORMAT_MAGIC:  # newer format
            self.new_format = True
            dim, ws, epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = \
                self.struct_unpack(file_handle, '@12i1d')
        else:  # older format
            self.new_format = False
            dim = magic
            ws = version
            epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@10i1d')
        # Parameters stored by [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc)
        self.wv.vector_size = dim
        self.vector_size = dim
        self.window = ws
        self.epochs = epoch
        self.vocabulary.min_count = min_count
        self.negative = neg
        self.hs = loss == 1
        self.sg = model == 2
        self.trainables.bucket = bucket
        self.wv.bucket = bucket
        self.wv.min_n = minn
        self.wv.max_n = maxn
        self.vocabulary.sample = t

    def _load_dict(self, file_handle, encoding='utf8'):
        """Load a previously saved dictionary from disk, stored in Facebook's native fasttext format.

        Parameters
        ----------
        file_handle : file-like object
            The opened file handle to the persisted dictionary.
        encoding : str
            Specifies the encoding.

        """
        vocab_size, nwords, nlabels = self.struct_unpack(file_handle, '@3i')
        # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
        if nlabels > 0:
            raise NotImplementedError("Supervised fastText models are not supported")
        logger.info("loading %s words for fastText model from %s", vocab_size, self.file_name)

        self.struct_unpack(file_handle, '@1q')  # number of tokens
        if self.new_format:
            pruneidx_size, = self.struct_unpack(file_handle, '@q')
        for i in range(vocab_size):
            word_bytes = b''
            char_byte = file_handle.read(1)
            # Read vocab word
            while char_byte != b'\x00':
                word_bytes += char_byte
                char_byte = file_handle.read(1)
            word = word_bytes.decode(encoding)
            count, _ = self.struct_unpack(file_handle, '@qb')

            self.wv.vocab[word] = Vocab(index=i, count=count)
            self.wv.index2word.append(word)

        assert len(self.wv.vocab) == nwords, (
            'mismatch between final vocab size ({} words), '
            'and expected number of words ({} words)'.format(len(self.wv.vocab), nwords))
        if len(self.wv.vocab) != vocab_size:
            # expecting to log this warning only for pretrained french vector, wiki.fr
            logger.warning(
                "mismatch between final vocab size (%s words), and expected vocab size (%s words)",
                len(self.wv.vocab), vocab_size
            )

        if self.new_format:
            for j in range(pruneidx_size):
                self.struct_unpack(file_handle, '@2i')

    def _load_vectors(self, file_handle):
        """Load word vectors stored in Facebook's native fasttext format from disk.

        Parameters
        ----------
        file_handle : file-like object
            Open file handle to persisted vectors.

        """
        if self.new_format:
            self.struct_unpack(file_handle, '@?')  # bool quant_input in fasttext.cc
        num_vectors, dim = self.struct_unpack(file_handle, '@2q')
        # Vectors stored by [Matrix::save](https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc)
        assert self.wv.vector_size == dim, (
            'mismatch between vector size in model params ({}) and model vectors ({})'
            .format(self.wv.vector_size, dim)
        )
        float_size = struct.calcsize('@f')
        if float_size == 4:
            dtype = np.dtype(np.float32)
        elif float_size == 8:
            dtype = np.dtype(np.float64)

        self.num_original_vectors = num_vectors
        self.wv.vectors_ngrams = np.fromfile(file_handle, dtype=dtype, count=num_vectors * dim)
        self.wv.vectors_ngrams = self.wv.vectors_ngrams.reshape((num_vectors, dim))
        assert self.wv.vectors_ngrams.shape == (
            self.trainables.bucket + len(self.wv.vocab), self.wv.vector_size), \
            'mismatch between actual weight matrix shape {} and expected shape {}'\
            .format(
                self.wv.vectors_ngrams.shape, (self.trainables.bucket + len(self.wv.vocab), self.wv.vector_size)
            )

        self.trainables.init_ngrams_post_load(self.file_name, self.wv)
        self._clear_post_train()

    def struct_unpack(self, file_handle, fmt):
        """Read a single object from an open file.

        Parameters
        ----------
        file_handle : file_like object
            Handle to an open file
        fmt : str
            Byte format in which the structure is saved.

        Returns
        -------
        Tuple of (str)
            Unpacked structure.

        """
        num_bytes = struct.calcsize(fmt)
        return struct.unpack(fmt, file_handle.read(num_bytes))

    def save(self, *args, **kwargs):
        """Save the Fasttext model. This saved model can be loaded again using
        :meth:`~gensim.models.fasttext.FastText.load`, which supports incremental training
        and getting vectors for out-of-vocabulary words.

        Parameters
        ----------
        fname : str
            Store the model to this file.

        See Also
        --------
        :meth:`~gensim.models.fasttext.FastText.load`
            Load :class:`~gensim.models.fasttext.FastText` model.

        """
        kwargs['ignore'] = kwargs.get(
            'ignore', ['vectors_norm', 'vectors_vocab_norm', 'vectors_ngrams_norm', 'buckets_word'])
        super(FastText, self).save(*args, **kwargs)

    @classmethod
    def load(cls, *args, **kwargs):
        """Load a previously saved `FastText` model.

        Parameters
        ----------
        fname : str
            Path to the saved file.

        Returns
        -------
        :class:`~gensim.models.fasttext.FastText`
            Loaded model.

        See Also
        --------
        :meth:`~gensim.models.fasttext.FastText.save`
            Save :class:`~gensim.models.fasttext.FastText` model.

        """
        try:
            model = super(FastText, cls).load(*args, **kwargs)
            if not hasattr(model.trainables, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'):
                model.trainables.vectors_vocab_lockf = ones(len(model.trainables.vectors), dtype=REAL)
            if not hasattr(model.trainables, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'):
                model.trainables.vectors_ngrams_lockf = ones(len(model.trainables.vectors), dtype=REAL)
            return model
        except AttributeError:
            logger.info('Model saved using code from earlier Gensim Version. Re-loading old model in a compatible way.')
            from gensim.models.deprecated.fasttext import load_old_fasttext
            return load_old_fasttext(*args, **kwargs)

    @deprecated("Method will be removed in 4.0.0, use self.wv.accuracy() instead")
    def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_insensitive=True):
        most_similar = most_similar or FastTextKeyedVectors.most_similar
        return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive)
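load_fasttext_format above accepts the .bin file produced by Facebook's fastText. A short usage sketch ('cc.en.300.bin' is a placeholder for any such file); note that, per the docstring, training cannot be continued on a model loaded this way:

from gensim.models import FastText

model = FastText.load_fasttext_format("cc.en.300.bin")  # placeholder .bin path
print(model.wv.vector_size)
print(model.wv.most_similar("human", topn=3))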
Example #12
def get_subplot_for_data(lang="en"):
    lang_full, lang_short = language_map(lang)
    fig = plt.figure()

    plot_labels = {
        "w2v": "Word2Vec",
        "ft": "FastText",
        "cbow": "CBOW",
        "sg": "Skip-Gram"
    }

    for i, type in enumerate(["w2v", "ft"]):
        for j, hp in enumerate(["cbow", "sg"]):
            print(type, hp)

            # First word2vec
            model_name = type + "_" + lang + "_d100_" + hp + "_st.bin"
            path = "data/vector_models/" + model_name

            if type == "ft":
                wv = FastTextKeyedVectors.load(path)
            else:
                wv = KeyedVectors.load_word2vec_format(path, binary=True)

            words = all_words[lang]

            total_words = []
            for topic in words:
                total_words.extend(topic)

            pca = PCA(n_components=2)

            X = wv[wv.vocab]
            mean = np.mean(X, axis=0)
            var = np.var(X, axis=0)

            X -= mean
            X /= var
            pca.fit(X)

            # Start subplot
            subplot_num = i * 2 + (j + 1)
            axis = fig.add_subplot(2, 2, subplot_num)

            for topic in words:
                X = wv[topic]
                X -= mean
                X /= var
                result = pca.transform(X)

                axis.scatter(result[:, 0], result[:, 1], s=5.0)
                for k, word in enumerate(topic):
                    axis.annotate(word,
                                  xy=(result[k, 0], result[k, 1]),
                                  size=7)

                plt.setp(axis.get_xticklabels(), visible=False)
                plt.setp(axis.get_yticklabels(), visible=False)

            axis.set_title(lang_full.capitalize() + " - " + plot_labels[type] +
                           " - " + plot_labels[hp],
                           fontdict={"fontsize": 12})
    # plt.savefig("Figures/embedding_" + lang_short + ".png")

    plt.show()
Example #13
def get_latent(args):
    device = torch.device(args.gpu)
    print("Loading embedding model...")
    image_embedding_model = models.__dict__[args.arch](pretrained=True)
    image_embedding_dim = image_embedding_model.fc.in_features
    args.image_embedding_dim = image_embedding_dim
    model_name = 'FASTTEXT_' + args.target_dataset + '.model'
    text_embedding_model = FastTextKeyedVectors.load(
        os.path.join(CONFIG.EMBEDDING_PATH, model_name))
    text_embedding_dim = text_embedding_model.vector_size
    args.text_embedding_dim = text_embedding_dim
    print("Building index...")
    indexer = AnnoyIndexer(text_embedding_model, 10)
    print("Loading embedding model completed")
    print("Loading dataset...")
    full_dataset = load_full_data(args,
                                  CONFIG,
                                  text_embedding_model,
                                  total=True)
    print("Loading dataset completed")
    full_loader = DataLoader(full_dataset,
                             batch_size=args.batch_size,
                             shuffle=False)

    # t1 = max_sentence_len + 2 * (args.filter_shape - 1)
    t1 = CONFIG.MAX_SENTENCE_LEN
    t2 = int(math.floor(
        (t1 - args.filter_shape) / 2) + 1)  # "2" means stride size
    t3 = int(math.floor((t2 - args.filter_shape) / 2) + 1)
    args.t3 = t3

    text_encoder = text_model.ConvolutionEncoder(text_embedding_dim, t3,
                                                 args.filter_size,
                                                 args.filter_shape,
                                                 args.latent_size)
    text_decoder = text_model.DeconvolutionDecoder(text_embedding_dim, t3,
                                                   args.filter_size,
                                                   args.filter_shape,
                                                   args.latent_size)
    imgseq_encoder = imgseq_model.RNNEncoder(image_embedding_dim,
                                             args.num_layer,
                                             args.latent_size,
                                             bidirectional=True)
    imgseq_decoder = imgseq_model.RNNDecoder(image_embedding_dim,
                                             args.num_layer,
                                             args.latent_size,
                                             bidirectional=True)
    checkpoint = torch.load(os.path.join(CONFIG.CHECKPOINT_PATH,
                                         args.checkpoint),
                            map_location=lambda storage, loc: storage)
    multimodal_encoder = multimodal_model.MultimodalEncoder(
        text_encoder, imgseq_encoder, args.latent_size)
    multimodal_encoder.load_state_dict(checkpoint['multimodal_encoder'])
    multimodal_encoder.to(device)
    multimodal_encoder.eval()

    f_csv = open(os.path.join(CONFIG.CSV_PATH, 'latent_features.csv'),
                 'w',
                 encoding='utf-8')
    wr = csv.writer(f_csv)
    for steps, (text_batch, imgseq_batch,
                short_code) in enumerate(full_loader):
        torch.cuda.empty_cache()
        with torch.no_grad():
            text_feature = text_batch.to(device)
            imgseq_feature = imgseq_batch.to(device)
            # run the encoder inside no_grad so no gradient state is tracked during inference
            h = multimodal_encoder(text_feature, imgseq_feature)
        row = [short_code] + h.detach().cpu().numpy().tolist()
        wr.writerow(row)
        del text_feature, imgseq_feature
    f_csv.close()
    print("Finish!!!")
Beispiel #14
0
    def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1,
                 docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, min_n=3, max_n=6,
                 callbacks=(), **kwargs):
        """Initialize the model from an iterable of `documents`. Each document is a
        TaggedDocument object that will be used for training.

        Parameters
        ----------
        documents : iterable of iterables
            The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora,
            consider an iterable that streams the documents directly from disk/network.
            If you don't supply `documents`, the model is left uninitialized -- use if
            you plan to initialize it in some other way.

        dm : int {1,0}
            Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used.
            Otherwise, `distributed bag of words` (PV-DBOW) is employed.

        size : int
            Dimensionality of the feature vectors.
        window : int
            The maximum distance between the current and predicted word within a sentence.
        alpha : float
            The initial learning rate.
        min_alpha : float
            Learning rate will linearly drop to `min_alpha` as training progresses.
        seed : int
            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
        min_count : int
            Ignores all words with total frequency lower than this.
        max_vocab_size : int
            Limits the RAM during vocabulary building; if there are more unique
            words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
            Set to `None` for no limit.
        sample : float
            The threshold for configuring which higher-frequency words are randomly downsampled,
            useful range is (0, 1e-5).
        workers : int
            Use this many worker threads to train the model (=faster training with multicore machines).
        iter : int
            Number of iterations (epochs) over the corpus.
        hs : int {1,0}
            If 1, hierarchical softmax will be used for model training.
            If set to 0, and `negative` is non-zero, negative sampling will be used.
        negative : int
            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
            should be drawn (usually between 5-20).
            If set to 0, no negative sampling is used.
        dm_mean : int {1,0}
            If 0, use the sum of the context word vectors. If 1, use the mean.
            Only applies when `dm` is used in non-concatenative mode.
        dm_concat : int {1,0}
            If 1, use concatenation of context vectors rather than sum/average;
            Note concatenation results in a much-larger model, as the input
            is no longer the size of one (sampled or arithmetically combined) word vector, but the
            size of the tag(s) and all words in the context strung together.
        dm_tag_count : int
            Expected constant number of document tags per document, when using
            dm_concat mode; default is 1.
        dbow_words : int {1,0}
            If set to 1, trains word vectors (in skip-gram fashion) simultaneously with DBOW
            doc-vector training; if 0, only trains doc-vectors (faster).
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
            of the model.
        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
            List of callbacks that need to be executed/run at specific stages during training.

        """

        if 'sentences' in kwargs:
            raise DeprecationWarning(
                "Parameter 'sentences' was renamed to 'documents', and will be removed in 4.0.0, "
                "use 'documents' instead."
            )

        if 'iter' in kwargs:
            warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.")
            kwargs['epochs'] = kwargs['iter']

        if 'size' in kwargs:
            warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.")
            kwargs['vector_size'] = kwargs['size']

        super(Doc2Vec, self).__init__(
            sg=(1 + dm) % 2,
            null_word=dm_concat,
            callbacks=callbacks,
            fast_version=FAST_VERSION,
            **kwargs)

        self.load = call_on_class_only

        if dm_mean is not None:
            self.cbow_mean = dm_mean

        self.dbow_words = int(dbow_words)
        self.dm_concat = int(dm_concat)
        self.dm_tag_count = int(dm_tag_count)

        kwargs['null_word'] = dm_concat
        vocabulary_keys = ['max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word']
        vocabulary_kwargs = dict((k, kwargs[k]) for k in vocabulary_keys if k in kwargs)
        self.vocabulary = Doc2VecVocab(**vocabulary_kwargs)

        trainables_keys = ['seed', 'hashfxn', 'window']
        trainables_kwargs = dict((k, kwargs[k]) for k in trainables_keys if k in kwargs)
        self.trainables = Doc2VecTrainables(
            dm=dm, dm_concat=dm_concat, dm_tag_count=dm_tag_count,
            vector_size=self.vector_size, **trainables_kwargs)

        from gensim.models.keyedvectors import FastTextKeyedVectors
        self.wv = FastTextKeyedVectors(self.vector_size, min_n, max_n)
        self.docvecs = docvecs or Doc2VecKeyedVectors(self.vector_size, docvecs_mapfile)

        self.comment = comment
        if documents is not None:
            if isinstance(documents, GeneratorType):
                raise TypeError("You can't pass a generator as the documents argument. Try an iterator.")
            self.build_vocab(documents, trim_rule=trim_rule)
            self.train(
                documents, total_examples=self.corpus_count, epochs=self.epochs,
                start_alpha=self.alpha, end_alpha=self.min_alpha, callbacks=callbacks)
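
The constructor above is typically exercised through the public Doc2Vec API: build a vocabulary from TaggedDocument objects, train, then infer vectors for unseen documents. A minimal sketch, assuming the gensim 3.x API documented above (toy corpus and illustrative parameters):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# each document is a list of tokens plus one or more tags
corpus = [TaggedDocument(words=["cat", "say", "meow"], tags=[0]),
          TaggedDocument(words=["dog", "say", "woof"], tags=[1])]

model = Doc2Vec(corpus, vector_size=20, min_count=1, epochs=10, dm=1)
vector = model.infer_vector(["cat", "say", "woof"])  # vector for an unseen document
print(model.docvecs.most_similar([vector], topn=1))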
Beispiel #15
0
models, results = {}, {}
word2vec = KeyedVectors.load("C:/Users/Kamil/Downloads/word2vec_300_3_polish.bin")


models[f"CBOW-W2V"] = Average(word2vec, lang_freq="pl")
models[f"SIF-W2V"] = SIF(word2vec, components=10)
models[f"uSIF-W2V"] = uSIF(word2vec, length=11)

from gensim.scripts.glove2word2vec import glove2word2vec  
glove = KeyedVectors.load_word2vec_format("C:/Users/Kamil/Downloads/glove_300_3_polish2.txt")
models[f"CBOW-Glove"] = Average(glove,  lang_freq="pl")
print(f"After memmap {sys.getsizeof(glove.vectors)}")
models[f"SIF-Glove"] = SIF(glove, components=15)
models[f"uSIF-Glove"] = uSIF(glove,length=11)

ft = FastTextKeyedVectors.load("D:/fasttext_300_3_polish.bin")
models[f"CBOW-FT"] = Average(ft, lang_freq="pl")
models[f"SIF-FT"] = SIF(ft, components=10)
models[f"uSIF-FT"] = uSIF(ft, length=11)


s = models["uSIF-W2V"]
s.sv[0]

cs, md, ed = [], [], []
for i, j in zip(range(task_length), range(task_length, 2*task_length)):
    temp1 = s.sv[i].reshape(1, -1)
    temp2 = s.sv[j].reshape(1, -1)
    cs.append((1 - (paired_cosine_distances(temp1, temp2)))[0])
    md.append(-paired_manhattan_distances(temp1, temp2)[0])
    ed.append(-paired_euclidean_distances(temp1, temp2)[0])
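
The loop above assumes the fse models were trained beforehand (so that s.sv holds one vector per indexed sentence, with sentence i paired with sentence i + task_length) and that the paired distance functions come from scikit-learn. A hedged reconstruction of that missing setup; the sentence data is hypothetical and the fse calls (IndexedList, train) are assumptions about its API:

from fse import IndexedList
from sklearn.metrics.pairwise import (paired_cosine_distances,
                                      paired_manhattan_distances,
                                      paired_euclidean_distances)

pairs = [(["kot", "miauczy"], ["pies", "szczeka"])]  # tokenised sentence pairs (hypothetical)
task_length = len(pairs)
# first all left-hand sentences, then all right-hand ones, so index i pairs with i + task_length
sentences = [a for a, _ in pairs] + [b for _, b in pairs]

for name, model in models.items():
    model.train(IndexedList(sentences))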
Beispiel #16
0
class Doc2Vec(BaseWordEmbeddingsModel):
    """Class for training, using and evaluating neural networks described in http://arxiv.org/pdf/1405.4053v2.pdf"""

    def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1,
                 docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, min_n=3, max_n=6,
                 callbacks=(), **kwargs):
        """Initialize the model from an iterable of `documents`. Each document is a
        TaggedDocument object that will be used for training.

        Parameters
        ----------
        documents : iterable of iterables
            The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora,
            consider an iterable that streams the documents directly from disk/network.
            If you don't supply `documents`, the model is left uninitialized -- use if
            you plan to initialize it in some other way.

        dm : int {1,0}
            Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used.
            Otherwise, `distributed bag of words` (PV-DBOW) is employed.

        size : int
            Dimensionality of the feature vectors.
        window : int
            The maximum distance between the current and predicted word within a sentence.
        alpha : float
            The initial learning rate.
        min_alpha : float
            Learning rate will linearly drop to `min_alpha` as training progresses.
        seed : int
            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
        min_count : int
            Ignores all words with total frequency lower than this.
        max_vocab_size : int
            Limits the RAM during vocabulary building; if there are more unique
            words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
            Set to `None` for no limit.
        sample : float
            The threshold for configuring which higher-frequency words are randomly downsampled,
            useful range is (0, 1e-5).
        workers : int
            Use this many worker threads to train the model (=faster training with multicore machines).
        iter : int
            Number of iterations (epochs) over the corpus.
        hs : int {1,0}
            If 1, hierarchical softmax will be used for model training.
            If set to 0, and `negative` is non-zero, negative sampling will be used.
        negative : int
            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
            should be drawn (usually between 5-20).
            If set to 0, no negative sampling is used.
        dm_mean : int {1,0}
            If 0, use the sum of the context word vectors. If 1, use the mean.
            Only applies when `dm` is used in non-concatenative mode.
        dm_concat : int {1,0}
            If 1, use concatenation of context vectors rather than sum/average;
            Note concatenation results in a much-larger model, as the input
            is no longer the size of one (sampled or arithmetically combined) word vector, but the
            size of the tag(s) and all words in the context strung together.
        dm_tag_count : int
            Expected constant number of document tags per document, when using
            dm_concat mode; default is 1.
        dbow_words : int {1,0}
            If set to 1, trains word vectors (in skip-gram fashion) simultaneously with DBOW
            doc-vector training; if 0, only trains doc-vectors (faster).
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
            of the model.
        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
            List of callbacks that need to be executed/run at specific stages during training.

        """

        if 'sentences' in kwargs:
            raise DeprecationWarning(
                "Parameter 'sentences' was renamed to 'documents', and will be removed in 4.0.0, "
                "use 'documents' instead."
            )

        if 'iter' in kwargs:
            warnings.warn("The parameter `iter` is deprecated, will be removed in 4.0.0, use `epochs` instead.")
            kwargs['epochs'] = kwargs['iter']

        if 'size' in kwargs:
            warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.")
            kwargs['vector_size'] = kwargs['size']

        super(Doc2Vec, self).__init__(
            sg=(1 + dm) % 2,
            null_word=dm_concat,
            callbacks=callbacks,
            fast_version=FAST_VERSION,
            **kwargs)

        self.load = call_on_class_only

        if dm_mean is not None:
            self.cbow_mean = dm_mean

        self.dbow_words = int(dbow_words)
        self.dm_concat = int(dm_concat)
        self.dm_tag_count = int(dm_tag_count)

        kwargs['null_word'] = dm_concat
        vocabulary_keys = ['max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word']
        vocabulary_kwargs = dict((k, kwargs[k]) for k in vocabulary_keys if k in kwargs)
        self.vocabulary = Doc2VecVocab(**vocabulary_kwargs)

        trainables_keys = ['seed', 'hashfxn', 'window']
        trainables_kwargs = dict((k, kwargs[k]) for k in trainables_keys if k in kwargs)
        self.trainables = Doc2VecTrainables(
            dm=dm, dm_concat=dm_concat, dm_tag_count=dm_tag_count,
            vector_size=self.vector_size, **trainables_kwargs)

        from gensim.models.keyedvectors import FastTextKeyedVectors
        self.wv = FastTextKeyedVectors(self.vector_size, min_n, max_n)
        self.docvecs = docvecs or Doc2VecKeyedVectors(self.vector_size, docvecs_mapfile)

        self.comment = comment
        if documents is not None:
            if isinstance(documents, GeneratorType):
                raise TypeError("You can't pass a generator as the documents argument. Try an iterator.")
            self.build_vocab(documents, trim_rule=trim_rule)
            self.train(
                documents, total_examples=self.corpus_count, epochs=self.epochs,
                start_alpha=self.alpha, end_alpha=self.min_alpha, callbacks=callbacks)

    @property
    def dm(self):
        """int {1,0} : `dm=1` indicates 'distributed memory' (PV-DM) else
        `distributed bag of words` (PV-DBOW) is used."""
        return not self.sg  # opposite of SG

    @property
    def dbow(self):
        """int {1,0} : `dbow=1` indicates `distributed bag of words` (PV-DBOW) else
        'distributed memory' (PV-DM) is used."""
        return self.sg  # same as SG

    def _set_train_params(self, **kwargs):
        pass

    def _clear_post_train(self):
        self.clear_sims()

    def clear_sims(self):
        self.wv.vectors_norm = None
        self.wv.vectors_docs_norm = None

    def reset_from(self, other_model):
        """Reuse shareable structures from other_model."""
        self.wv.vocab = other_model.wv.vocab
        self.wv.index2word = other_model.wv.index2word
        self.vocabulary.cum_table = other_model.vocabulary.cum_table
        self.corpus_count = other_model.corpus_count
        self.docvecs.count = other_model.docvecs.count
        self.docvecs.doctags = other_model.docvecs.doctags
        self.docvecs.offset2doctag = other_model.docvecs.offset2doctag
        self.trainables.reset_weights(self.hs, self.negative, self.wv, self.docvecs)

    def _do_train_job(self, job, alpha, inits):
        work, neu1 = inits
        tally = 0
        for doc in job:
            doctag_indexes = self.vocabulary.indexed_doctags(doc.tags, self.docvecs)
            doctag_vectors = self.docvecs.vectors_docs
            doctag_locks = self.trainables.vectors_docs_lockf
            if self.sg:
                tally += train_document_dbow(
                    self, doc.words, doctag_indexes, alpha, work, train_words=self.dbow_words,
                    doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            elif self.dm_concat:
                tally += train_document_dm_concat(
                    self, doc.words, doctag_indexes, alpha, work, neu1,
                    doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            else:
                tally += train_document_dm(
                    self, doc.words, doctag_indexes, alpha, work, neu1,
                    doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
        return tally, self._raw_word_count(job)

    def train(self, documents, total_examples=None, total_words=None,
              epochs=None, start_alpha=None, end_alpha=None,
              word_count=0, queue_factor=2, report_delay=1.0, callbacks=()):
        """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        The `documents` iterable can be simply a list of TaggedDocument elements.

        To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate
        progress-percentage logging, either total_examples (count of sentences) or total_words (count of
        raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to
        :meth:`~gensim.models.word2vec.Word2Vec.build_vocab()`, the count of examples in that corpus
        will be available in the model's :attr:`corpus_count` property).

        To avoid common mistakes around the model's ability to do multiple training passes itself, an
        explicit `epochs` argument **MUST** be provided. In the common and recommended case,
        where :meth:`~gensim.models.word2vec.Word2Vec.train()` is only called once,
        the model's cached `iter` value should be supplied as `epochs` value.

        Parameters
        ----------
        documents : iterable of iterables
            The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora,
            consider an iterable that streams the documents directly from disk/network.
            See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument`
            in :mod:`~gensim.models.doc2vec` module for such examples.
        total_examples : int
            Count of sentences.
        total_words : int
            Count of raw words in documents.
        epochs : int
            Number of iterations (epochs) over the corpus.
        start_alpha : float
            Initial learning rate.
        end_alpha : float
            Final learning rate. Drops linearly from `start_alpha`.
        word_count : int
            Count of words already trained. Set this to 0 for the usual
            case of training on all words in sentences.
        queue_factor : int
            Multiplier for size of queue (number of workers * queue_factor).
        report_delay : float
            Seconds to wait before reporting progress.
        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
            List of callbacks that need to be executed/run at specific stages during training.
        """
        super(Doc2Vec, self).train(
            documents, total_examples=total_examples, total_words=total_words,
            epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
            queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks)

    def _raw_word_count(self, job):
        """Return the number of words in a given job."""
        return sum(len(sentence.words) for sentence in job)

    def estimated_lookup_memory(self):
        """Estimated memory for tag lookup; 0 if using pure int tags."""
        return 60 * len(self.docvecs.offset2doctag) + 140 * len(self.docvecs.doctags)

    def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5):
        """
        Infer a vector for given post-bulk training document.

        Parameters
        ----------
        doc_words : :obj: `list` of :obj: `str`
            Document should be a list of (word) tokens.
        alpha : float
            The initial learning rate.
        min_alpha : float
            Learning rate will linearly drop to `min_alpha` as training progresses.
        steps : int
            Number of times to train the new document.

        Returns
        -------
        :obj: `numpy.ndarray`
            Returns the inferred vector for the new document.

        """
        doctag_vectors, doctag_locks = self.trainables.get_doctag_trainables(doc_words, self.docvecs.vector_size)
        doctag_indexes = [0]
        work = zeros(self.trainables.layer1_size, dtype=REAL)
        if not self.sg:
            neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL)

        for i in range(steps):
            if self.sg:
                train_document_dbow(
                    self, doc_words, doctag_indexes, alpha, work,
                    learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            elif self.dm_concat:
                train_document_dm_concat(
                    self, doc_words, doctag_indexes, alpha, work, neu1,
                    learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            else:
                train_document_dm(
                    self, doc_words, doctag_indexes, alpha, work, neu1,
                    learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks
                )
            alpha = ((alpha - min_alpha) / (steps - i)) + min_alpha

        return doctag_vectors[0]

    def __getitem__(self, tag):
        if isinstance(tag, string_types + integer_types + (integer,)):
            if tag not in self.wv.vocab:
                return self.docvecs[tag]
            return self.wv[tag]
        return vstack([self[i] for i in tag])

    def __str__(self):
        """Abbreviated name reflecting major configuration paramaters."""
        segments = []
        if self.comment:
            segments.append('"%s"' % self.comment)
        if self.sg:
            if self.dbow_words:
                segments.append('dbow+w')  # also training words
            else:
                segments.append('dbow')  # PV-DBOW (skip-gram-style)

        else:  # PV-DM...
            if self.dm_concat:
                segments.append('dm/c')  # ...with concatenative context layer
            else:
                if self.cbow_mean:
                    segments.append('dm/m')
                else:
                    segments.append('dm/s')
        segments.append('d%d' % self.docvecs.vector_size)  # dimensions
        if self.negative:
            segments.append('n%d' % self.negative)  # negative samples
        if self.hs:
            segments.append('hs')
        if not self.sg or (self.sg and self.dbow_words):
            segments.append('w%d' % self.window)  # window size, when relevant
        if self.vocabulary.min_count > 1:
            segments.append('mc%d' % self.vocabulary.min_count)
        if self.vocabulary.sample > 0:
            segments.append('s%g' % self.vocabulary.sample)
        if self.workers > 1:
            segments.append('t%d' % self.workers)
        return '%s(%s)' % (self.__class__.__name__, ','.join(segments))

    def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True):
        """Discard parameters that are used in training and score. Use if you're sure you're done training a model.

        Parameters
        ----------
        keep_doctags_vectors : bool
            Set `keep_doctags_vectors` to False if you don't want to save doctag vectors;
            in that case you can't use the docvecs' most_similar, similarity etc. methods.
        keep_inference : bool
            Set `keep_inference` to False if you don't want to store parameters that are used for the infer_vector method.

        """
        if not keep_inference:
            if hasattr(self.trainables, 'syn1'):
                del self.trainables.syn1
            if hasattr(self.trainables, 'syn1neg'):
                del self.trainables.syn1neg
            if hasattr(self.trainables, 'vectors_lockf'):
                del self.trainables.vectors_lockf
        self.model_trimmed_post_training = True
        if self.docvecs and hasattr(self.docvecs, 'vectors_docs') and not keep_doctags_vectors:
            del self.docvecs.vectors_docs
        if self.docvecs and hasattr(self.trainables, 'vectors_docs_lockf'):
            del self.trainables.vectors_docs_lockf

    def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False):
        """Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

        Parameters
        ----------
        fname : str
            The file path used to save the vectors in.
        doctag_vec : bool
            Indicates whether to store document vectors.
        word_vec : bool
            Indicates whether to store word vectors.
        prefix : str
            Uniquely identifies doctags from word vocab, and avoids collision
            in case of repeated string in doctag and word vocab.
        fvocab : str
            Optional file path used to save the vocabulary
        binary : bool
            If True, the data will be saved in binary word2vec format, else it will be saved in plain text.

        """
        total_vec = len(self.wv.vocab) + len(self.docvecs)
        write_first_line = False
        # save word vectors
        if word_vec:
            if not doctag_vec:
                total_vec = len(self.wv.vocab)
            self.wv.save_word2vec_format(fname, fvocab, binary, total_vec)
        # save document vectors
        if doctag_vec:
            if not word_vec:
                total_vec = len(self.docvecs)
                write_first_line = True
            self.docvecs.save_word2vec_format(
                fname, prefix=prefix, fvocab=fvocab, total_vec=total_vec,
                binary=binary, write_first_line=write_first_line)

    def init_sims(self, replace=False):
        """
        Precompute L2-normalized vectors.

        If `replace` is set, forget the original vectors and only keep the normalized
        ones = saves lots of memory!

        Note that you **cannot continue training or inference** after doing a replace.
        The model becomes effectively read-only = you can call `most_similar`, `similarity`
        etc., but not `train` or `infer_vector`.

        """
        return self.docvecs.init_sims(replace=replace)

    @classmethod
    def load(cls, *args, **kwargs):
        try:
            return super(Doc2Vec, cls).load(*args, **kwargs)
        except AttributeError:
            logger.info('Model saved using code from earlier Gensim Version. Re-loading old model in a compatible way.')
            from gensim.models.deprecated.doc2vec import load_old_doc2vec
            return load_old_doc2vec(*args, **kwargs)

    def estimate_memory(self, vocab_size=None, report=None):
        """Estimate required memory for a model using current settings."""
        report = report or {}
        report['doctag_lookup'] = self.estimated_lookup_memory()
        report['doctag_syn0'] = self.docvecs.count * self.vector_size * dtype(REAL).itemsize
        return super(Doc2Vec, self).estimate_memory(vocab_size, report=report)

    def build_vocab(self, documents, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs):
        """Build vocabulary from a sequence of sentences (can be a once-only generator stream).
        Each sentence is an iterable of iterables (can simply be a list of unicode strings too).

        Parameters
        ----------
        documents : iterable of iterables
            The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora,
            consider an iterable that streams the documents directly from disk/network.
            See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument`
            in :mod:`~gensim.models.doc2vec` module for such examples.
        keep_raw_vocab : bool
            If not true, delete the raw vocabulary after the scaling is done and free up RAM.
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
            of the model.
        progress_per : int
            Indicates how many words to process before showing/updating the progress.
        update : bool
            If true, the new words in `documents` will be added to the model's vocab.
        """
        total_words, corpus_count = self.vocabulary.scan_vocab(
            documents, self.docvecs, progress_per=progress_per, trim_rule=trim_rule)
        self.corpus_count = corpus_count
        report_values = self.vocabulary.prepare_vocab(
            self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
            **kwargs)

        report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
        self.trainables.prepare_weights(
            self.hs, self.negative, self.wv, self.docvecs, update=update)

    def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False):
        """
        Build vocabulary from a dictionary of word frequencies.
        The passed dictionary maps each word (a unicode string) to its count.

        Parameters
        ----------
        word_freq : dict
            A dictionary mapping words to their counts.
        keep_raw_vocab : bool
            If not true, delete the raw vocabulary after the scaling is done and free up RAM.
        corpus_count : int
            Even if no corpus is provided, this argument can set corpus_count explicitly.
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
            of the model.
        update : bool
            If true, the new provided words in `word_freq` dict will be added to model's vocab.

        Examples
        --------
        >>> from gensim.models.word2vec import Word2Vec
        >>> model = Word2Vec()
        >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20})
        """
        logger.info("Processing provided word frequencies")
        # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq)
        # to be directly the raw vocab
        raw_vocab = word_freq
        logger.info(
            "collected %i different raw word, with total frequency of %i",
            len(raw_vocab), sum(itervalues(raw_vocab))
        )

        # Since no sentences are provided, this is to control the corpus_count
        self.corpus_count = corpus_count or 0
        self.vocabulary.raw_vocab = raw_vocab

        # trim by min_count & precalculate downsampling
        report_values = self.vocabulary.prepare_vocab(
            self.hs, self.negative, self.wv, keep_raw_vocab=keep_raw_vocab,
            trim_rule=trim_rule, update=update)
        report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
        self.trainables.prepare_weights(
            self.hs, self.negative, self.wv, self.docvecs, update=update)
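
Because build_vocab_from_freq bypasses corpus scanning, corpus_count (or total_words at train time) has to be supplied explicitly for the learning-rate schedule and progress logging to work. A minimal sketch, assuming the gensim 3.x Word2Vec API used in the docstring example above:

from gensim.models import Word2Vec

sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
model = Word2Vec(size=10, min_count=1)

# vocabulary built from precomputed counts instead of scanning the corpus
model.build_vocab_from_freq({"cat": 1, "dog": 1, "say": 2, "meow": 1, "woof": 1},
                            corpus_count=len(sentences))
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)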


class FastText(BaseWordEmbeddingsModel):
    """Class for training, using and evaluating word representations learned using method
    described in [1]_ aka Fasttext.

    The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save()` and
    :meth:`~gensim.models.fasttext.FastText.load()` methods, or loaded in a format compatible with the original
    fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format()`.

    """
    def __init__(self,
                 sentences=None,
                 sg=0,
                 hs=0,
                 size=100,
                 alpha=0.025,
                 window=5,
                 min_count=5,
                 max_vocab_size=None,
                 word_ngrams=1,
                 sample=1e-3,
                 seed=1,
                 workers=3,
                 min_alpha=0.0001,
                 negative=5,
                 cbow_mean=1,
                 hashfxn=hash,
                 iter=5,
                 null_word=0,
                 min_n=3,
                 max_n=6,
                 sorted_vocab=1,
                 bucket=2000000,
                 trim_rule=None,
                 batch_words=MAX_WORDS_IN_BATCH,
                 callbacks=()):
        """Initialize the model from an iterable of `sentences`. Each sentence is a
        list of words (unicode strings) that will be used for training.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
            If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it
            in some other way.
        sg : int {1, 0}
            Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed.
        size : int
            Dimensionality of the feature vectors.
        window : int
            The maximum distance between the current and predicted word within a sentence.
        alpha : float
            The initial learning rate.
        min_alpha : float
            Learning rate will linearly drop to `min_alpha` as training progresses.
        seed : int
            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
        min_count : int
            Ignores all words with total frequency lower than this.
        max_vocab_size : int
            Limits the RAM during vocabulary building; if there are more unique
            words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
            Set to `None` for no limit.
        sample : float
            The threshold for configuring which higher-frequency words are randomly downsampled,
            useful range is (0, 1e-5).
        workers : int
            Use this many worker threads to train the model (=faster training with multicore machines).
        hs : int {1,0}
            If 1, hierarchical softmax will be used for model training.
            If set to 0, and `negative` is non-zero, negative sampling will be used.
        negative : int
            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
            should be drawn (usually between 5-20).
            If set to 0, no negative sampling is used.
        cbow_mean : int {1,0}
            If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
        hashfxn : function
            Hash function to use to randomly initialize weights, for increased training reproducibility.
        iter : int
            Number of iterations (epochs) over the corpus.
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
            of the model.
        sorted_vocab : int {1,0}
            If 1, sort the vocabulary by descending frequency before assigning word indexes.
        batch_words : int
            Target size (in words) for batches of examples passed to worker threads (and
            thus cython routines).(Larger batches will be passed if individual
            texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
        min_n : int
            Min length of char ngrams to be used for training word representations.
        max_n : int
            Max length of char ngrams to be used for training word representations. Set `max_n` to be
            less than `min_n` to avoid char ngrams being used.
        word_ngrams : int {1,0}
            If 1, enriches word vectors with subword (char n-gram) information.
            If 0, this is equivalent to word2vec.
        bucket : int
            Character ngrams are hashed into a fixed number of buckets, in order to limit the
            memory usage of the model. This option specifies the number of buckets used by the model.
        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
            List of callbacks that need to be executed/run at specific stages during training.

        Examples
        --------
        Initialize and train a `FastText` model

        >>> from gensim.models import FastText
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = FastText(sentences, min_count=1)
        >>> say_vector = model['say']  # get vector for word
        >>> of_vector = model['of']  # get vector for out-of-vocab word


        """
        self.load = call_on_class_only
        self.load_fasttext_format = call_on_class_only
        self.callbacks = callbacks
        self.word_ngrams = int(word_ngrams)
        if self.word_ngrams <= 1 and max_n == 0:
            bucket = 0

        self.wv = FastTextKeyedVectors(size, min_n, max_n)
        self.vocabulary = FastTextVocab(max_vocab_size=max_vocab_size,
                                        min_count=min_count,
                                        sample=sample,
                                        sorted_vocab=bool(sorted_vocab),
                                        null_word=null_word)
        self.trainables = FastTextTrainables(vector_size=size,
                                             seed=seed,
                                             bucket=bucket,
                                             hashfxn=hashfxn)

        super(FastText, self).__init__(sentences=sentences,
                                       workers=workers,
                                       vector_size=size,
                                       epochs=iter,
                                       callbacks=callbacks,
                                       batch_words=batch_words,
                                       trim_rule=trim_rule,
                                       sg=sg,
                                       alpha=alpha,
                                       window=window,
                                       seed=seed,
                                       hs=hs,
                                       negative=negative,
                                       cbow_mean=cbow_mean,
                                       min_alpha=min_alpha,
                                       fast_version=FAST_VERSION)

    @property
    @deprecated("Attribute will be removed in 4.0.0, use wv.min_n instead")
    def min_n(self):
        return self.wv.min_n

    @property
    @deprecated("Attribute will be removed in 4.0.0, use wv.max_n instead")
    def max_n(self):
        return self.wv.max_n

    @property
    @deprecated(
        "Attribute will be removed in 4.0.0, use trainables.bucket instead")
    def bucket(self):
        return self.trainables.bucket

    @property
    @deprecated(
        "Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead"
    )
    def syn0_vocab_lockf(self):
        return self.trainables.vectors_vocab_lockf

    @syn0_vocab_lockf.setter
    @deprecated(
        "Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead"
    )
    def syn0_vocab_lockf(self, value):
        self.trainables.vectors_vocab_lockf = value

    @syn0_vocab_lockf.deleter
    @deprecated(
        "Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead"
    )
    def syn0_vocab_lockf(self):
        del self.trainables.vectors_vocab_lockf

    @property
    @deprecated(
        "Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead"
    )
    def syn0_ngrams_lockf(self):
        return self.trainables.vectors_ngrams_lockf

    @syn0_ngrams_lockf.setter
    @deprecated(
        "Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead"
    )
    def syn0_ngrams_lockf(self, value):
        self.trainables.vectors_ngrams_lockf = value

    @syn0_ngrams_lockf.deleter
    @deprecated(
        "Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead"
    )
    def syn0_ngrams_lockf(self):
        del self.trainables.vectors_ngrams_lockf

    @property
    @deprecated(
        "Attribute will be removed in 4.0.0, use self.wv.num_ngram_vectors instead"
    )
    def num_ngram_vectors(self):
        return self.wv.num_ngram_vectors

    def build_vocab(self,
                    sentences,
                    update=False,
                    progress_per=10000,
                    keep_raw_vocab=False,
                    trim_rule=None,
                    **kwargs):
        """Build vocabulary from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of unicode strings.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
        keep_raw_vocab : bool
            If not true, delete the raw vocabulary after the scaling is done and free up RAM.
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
            of the model.
        progress_per : int
            Indicates how many words to process before showing/updating the progress.
        update : bool
            If true, the new words in `sentences` will be added to model's vocab.

        Example
        -------
        Train a model and update vocab for online training

        >>> from gensim.models import FastText
        >>> sentences_1 = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>> sentences_2 = [["dude", "say", "wazzup!"]]
        >>>
        >>> model = FastText(min_count=1)
        >>> model.build_vocab(sentences_1)
        >>> model.train(sentences_1, total_examples=model.corpus_count, epochs=model.iter)
        >>> model.build_vocab(sentences_2, update=True)
        >>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.iter)

        """
        if update:
            if not len(self.wv.vocab):
                raise RuntimeError(
                    "You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
                    "First build the vocabulary of your model with a corpus "
                    "before doing an online update.")
            self.vocabulary.old_vocab_len = len(self.wv.vocab)
            self.trainables.old_hash2index_len = len(self.wv.hash2index)

        return super(FastText, self).build_vocab(sentences,
                                                 update=update,
                                                 progress_per=progress_per,
                                                 keep_raw_vocab=keep_raw_vocab,
                                                 trim_rule=trim_rule,
                                                 **kwargs)

    def _set_train_params(self, **kwargs):
        pass

    def _clear_post_train(self):
        self.wv.vectors_norm = None
        self.wv.vectors_vocab_norm = None
        self.wv.vectors_ngrams_norm = None

    def _do_train_job(self, sentences, alpha, inits):
        """Train a single batch of sentences. Return 2-tuple `(effective word count after
        ignoring unknown words and sentence length trimming, total word count)`.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
        alpha : float
            The current learning rate.
        inits : (:class:`numpy.ndarray`, :class:`numpy.ndarray`)
            Each worker's private work memory.

        Returns
        -------
        (int, int)
            Tuple of (effective word count after ignoring unknown words and sentence length trimming, total word count)

        """
        work, neu1 = inits
        tally = 0
        if self.sg:
            tally += train_batch_sg(self, sentences, alpha, work, neu1)
        else:
            tally += train_batch_cbow(self, sentences, alpha, work, neu1)

        return tally, self._raw_word_count(sentences)

    def train(self,
              sentences,
              total_examples=None,
              total_words=None,
              epochs=None,
              start_alpha=None,
              end_alpha=None,
              word_count=0,
              queue_factor=2,
              report_delay=1.0,
              callbacks=(),
              **kwargs):
        """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        For FastText, each sentence must be a list of unicode strings.

        To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate
        progress-percentage logging, either total_examples (count of sentences) or total_words (count of
        raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to
        :meth:`~gensim.models.fasttext.FastText.build_vocab()`, the count of examples in that corpus
        will be available in the model's :attr:`corpus_count` property).

        To avoid common mistakes around the model's ability to do multiple training passes itself, an
        explicit `epochs` argument **MUST** be provided. In the common and recommended case,
        where :meth:`~gensim.models.fasttext.FastText.train()` is only called once,
        the model's cached `iter` value should be supplied as `epochs` value.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
        total_examples : int
            Count of sentences.
        total_words : int
            Count of raw words in sentences.
        epochs : int
            Number of iterations (epochs) over the corpus.
        start_alpha : float
            Initial learning rate.
        end_alpha : float
            Final learning rate. Drops linearly from `start_alpha`.
        word_count : int
            Count of words already trained. Set this to 0 for the usual
            case of training on all words in sentences.
        queue_factor : int
            Multiplier for size of queue (number of workers * queue_factor).
        report_delay : float
            Seconds to wait before reporting progress.
        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
            List of callbacks that need to be executed/run at specific stages during training.

        Examples
        --------
        >>> from gensim.models import FastText
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = FastText(min_count=1)
        >>> model.build_vocab(sentences)
        >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

        """

        super(FastText, self).train(sentences,
                                    total_examples=total_examples,
                                    total_words=total_words,
                                    epochs=epochs,
                                    start_alpha=start_alpha,
                                    end_alpha=end_alpha,
                                    word_count=word_count,
                                    queue_factor=queue_factor,
                                    report_delay=report_delay,
                                    callbacks=callbacks)
        self.trainables.get_vocab_word_vecs(self.wv)

    def init_sims(self, replace=False):
        """
        init_sims() resides in KeyedVectors because it deals mainly with syn0; since syn1 is not an attribute
        of KeyedVectors, it has to be deleted in this class, while the normalization of syn0 happens inside KeyedVectors.
        """
        if replace and hasattr(self.trainables, 'syn1'):
            del self.trainables.syn1
        return self.wv.init_sims(replace)

    def clear_sims(self):
        """
        Removes all L2-normalized vectors for words from the model.
        You will have to recompute them using the init_sims() method.
        """

        self._clear_post_train()

    @deprecated(
        "Method will be removed in 4.0.0, use self.wv.__getitem__() instead")
    def __getitem__(self, words):
        """
        Deprecated. Use self.wv.__getitem__() instead.
        Refer to the documentation for `gensim.models.KeyedVectors.__getitem__`
        """
        return self.wv.__getitem__(words)

    @deprecated(
        "Method will be removed in 4.0.0, use self.wv.__contains__() instead")
    def __contains__(self, word):
        """
        Deprecated. Use self.wv.__contains__() instead.
        Refer to the documentation for `gensim.models.KeyedVectors.__contains__`
        """
        return self.wv.__contains__(word)

    @classmethod
    def load_fasttext_format(cls, model_file, encoding='utf8'):
        """
        Load the input-hidden weight matrix from the fast text output files.

        Note that due to limitations in the FastText API, you cannot continue training
        with a model loaded this way, though you can query for word similarity etc.

        Parameters
        ----------
        model_file : str
            Path to the FastText output files.
            FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin`.
            Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
            as gensim requires only the `.bin` file to load the entire fastText model.
        encoding : str
            Specifies the encoding.

        Returns
        -------
        :obj: `~gensim.models.fasttext.FastText`
            Returns the loaded model as an instance of :class: `~gensim.models.fasttext.FastText`.

        """
        model = cls()
        if not model_file.endswith('.bin'):
            model_file += '.bin'
        model.file_name = model_file
        model.load_binary_data(encoding=encoding)
        return model

    def load_binary_data(self, encoding='utf8'):
        """Loads data from the output binary file created by FastText training"""
        with utils.smart_open(self.file_name, 'rb') as f:
            self._load_model_params(f)
            self._load_dict(f, encoding=encoding)
            self._load_vectors(f)

    def _load_model_params(self, file_handle):
        magic, version = self.struct_unpack(file_handle, '@2i')
        if magic == FASTTEXT_FILEFORMAT_MAGIC:  # newer format
            self.new_format = True
            dim, ws, epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = \
                self.struct_unpack(file_handle, '@12i1d')
        else:  # older format
            self.new_format = False
            dim = magic
            ws = version
            epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(
                file_handle, '@10i1d')
        # Parameters stored by [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc)
        self.wv.vector_size = dim
        self.vector_size = dim
        self.window = ws
        self.epochs = epoch
        self.vocabulary.min_count = min_count
        self.negative = neg
        self.hs = loss == 1
        self.sg = model == 2
        self.trainables.bucket = bucket
        self.wv.min_n = minn
        self.wv.max_n = maxn
        self.vocabulary.sample = t

    def _load_dict(self, file_handle, encoding='utf8'):
        vocab_size, nwords, nlabels = self.struct_unpack(file_handle, '@3i')
        # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
        if nlabels > 0:
            raise NotImplementedError(
                "Supervised fastText models are not supported")
        logger.info("loading %s words for fastText model from %s", vocab_size,
                    self.file_name)

        self.struct_unpack(file_handle, '@1q')  # number of tokens
        if self.new_format:
            pruneidx_size, = self.struct_unpack(file_handle, '@q')
        for i in range(vocab_size):
            word_bytes = b''
            char_byte = file_handle.read(1)
            # Read vocab word
            while char_byte != b'\x00':
                word_bytes += char_byte
                char_byte = file_handle.read(1)
            word = word_bytes.decode(encoding)
            count, _ = self.struct_unpack(file_handle, '@qb')

            self.wv.vocab[word] = Vocab(index=i, count=count)
            self.wv.index2word.append(word)

        assert len(self.wv.vocab) == nwords, (
            'mismatch between final vocab size ({} words), '
            'and expected number of words ({} words)'.format(
                len(self.wv.vocab), nwords))
        if len(self.wv.vocab) != vocab_size:
            # expecting to log this warning only for pretrained french vector, wiki.fr
            logger.warning(
                "mismatch between final vocab size (%s words), and expected vocab size (%s words)",
                len(self.wv.vocab), vocab_size)

        if self.new_format:
            for j in range(pruneidx_size):
                self.struct_unpack(file_handle, '@2i')

    def _load_vectors(self, file_handle):
        if self.new_format:
            self.struct_unpack(file_handle,
                               '@?')  # bool quant_input in fasttext.cc
        num_vectors, dim = self.struct_unpack(file_handle, '@2q')
        # Vectors stored by [Matrix::save](https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc)
        assert self.wv.vector_size == dim, (
            'mismatch between vector size in model params ({}) and model vectors ({})'
            .format(self.wv.vector_size, dim))
        float_size = struct.calcsize('@f')
        if float_size == 4:
            dtype = np.dtype(np.float32)
        elif float_size == 8:
            dtype = np.dtype(np.float64)

        self.num_original_vectors = num_vectors
        self.wv.vectors_ngrams = np.fromfile(file_handle,
                                             dtype=dtype,
                                             count=num_vectors * dim)
        self.wv.vectors_ngrams = self.wv.vectors_ngrams.reshape(
            (num_vectors, dim))
        assert self.wv.vectors_ngrams.shape == (
            self.trainables.bucket + len(self.wv.vocab), self.wv.vector_size), \
            'mismatch between actual weight matrix shape {} and expected shape {}'\
            .format(
                self.wv.vectors_ngrams.shape, (self.trainables.bucket + len(self.wv.vocab), self.wv.vector_size)
            )

        self.trainables.init_ngrams_post_load(self.file_name, self.wv)
        self._clear_post_train()

    def struct_unpack(self, file_handle, fmt):
        num_bytes = struct.calcsize(fmt)
        return struct.unpack(fmt, file_handle.read(num_bytes))

    def save(self, *args, **kwargs):
        """Save the model. This saved model can be loaded again using :func:`~gensim.models.fasttext.FastText.load`,
        which supports online training and getting vectors for out-of-vocabulary words.

        Parameters
        ----------
        fname : str
            Path to the file.

        """
        kwargs['ignore'] = kwargs.get(
            'ignore',
            ['vectors_norm', 'vectors_vocab_norm', 'vectors_ngrams_norm'])
        super(FastText, self).save(*args, **kwargs)

    @classmethod
    def load(cls, *args, **kwargs):
        """Loads a previously saved `FastText` model. Also see `save()`.

        Parameters
        ----------
        fname : str
            Path to the saved file.

        Returns
        -------
        :obj: `~gensim.models.fasttext.FastText`
            Returns the loaded model as an instance of :class: `~gensim.models.fasttext.FastText`.
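
        Examples
        --------
        Save and reload a trained model (a short sketch; the file name is a placeholder):

        >>> from gensim.models import FastText
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>> model = FastText(sentences, min_count=1)
        >>> model.save('fasttext.model')
        >>> loaded = FastText.load('fasttext.model')
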
        """
        try:
            model = super(FastText, cls).load(*args, **kwargs)
            if not hasattr(model.trainables,
                           'vectors_vocab_lockf') and hasattr(
                               model.wv, 'vectors_vocab'):
                model.trainables.vectors_vocab_lockf = ones(len(
                    model.trainables.vectors),
                                                            dtype=REAL)
            if not hasattr(model.trainables,
                           'vectors_ngrams_lockf') and hasattr(
                               model.wv, 'vectors_ngrams'):
                model.trainables.vectors_ngrams_lockf = ones(len(
                    model.trainables.vectors),
                                                             dtype=REAL)
            return model
        except AttributeError:
            logger.info(
                'Model saved using code from earlier Gensim Version. Re-loading old model in a compatible way.'
            )
            from gensim.models.deprecated.fasttext import load_old_fasttext
            return load_old_fasttext(*args, **kwargs)

    @deprecated(
        "Method will be removed in 4.0.0, use self.wv.accuracy() instead")
    def accuracy(self,
                 questions,
                 restrict_vocab=30000,
                 most_similar=None,
                 case_insensitive=True):
        most_similar = most_similar or FastTextKeyedVectors.most_similar
        return self.wv.accuracy(questions, restrict_vocab, most_similar,
                                case_insensitive)
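
# A self-contained sketch of the binary header parsing that _load_model_params performs
# above: read the magic/version pair, then (for the newer file format) the 12 ints and one
# double written by fastText's Args::save. The path argument, the returned field names and
# the magic constant's value are assumptions for illustration only.
import struct

NEW_FORMAT_MAGIC = 793712314  # assumed value of FASTTEXT_FILEFORMAT_MAGIC for new-format files

def read_fasttext_header(path):
    with open(path, 'rb') as fin:
        magic, version = struct.unpack('@2i', fin.read(struct.calcsize('@2i')))
        if magic != NEW_FORMAT_MAGIC:
            raise ValueError("old-format file: header starts with dim/ws instead of a magic number")
        dim, ws, epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = \
            struct.unpack('@12i1d', fin.read(struct.calcsize('@12i1d')))
        return {'dim': dim, 'window': ws, 'epochs': epoch, 'min_count': min_count,
                'negative': neg, 'loss': loss, 'model': model, 'bucket': bucket,
                'min_n': minn, 'max_n': maxn, 'sample': t}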
Beispiel #18
0
class FastText(BaseWordEmbeddingsModel):
    """Class for training, using and evaluating word representations learned using method
    described in [1]_ aka Fasttext.

    The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save()` and
    :meth:`~gensim.models.fasttext.FastText.load()` methods, or loaded in a format compatible with the original
    fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format()`.

    """
    def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
                 max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                 negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1,
                 bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=()):
        """Initialize the model from an iterable of `sentences`. Each sentence is a
        list of words (unicode strings) that will be used for training.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
            If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it
            in some other way.
        sg : int {1, 0}
            Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed.
        size : int
            Dimensionality of the feature vectors.
        window : int
            The maximum distance between the current and predicted word within a sentence.
        alpha : float
            The initial learning rate.
        min_alpha : float
            Learning rate will linearly drop to `min_alpha` as training progresses.
        seed : int
            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
        min_count : int
            Ignores all words with total frequency lower than this.
        max_vocab_size : int
            Limits the RAM during vocabulary building; if there are more unique
            words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
            Set to `None` for no limit.
        sample : float
            The threshold for configuring which higher-frequency words are randomly downsampled,
            useful range is (0, 1e-5).
        workers : int
            Use this many worker threads to train the model (faster training with multicore machines).
        hs : int {1,0}
            If 1, hierarchical softmax will be used for model training.
            If set to 0, and `negative` is non-zero, negative sampling will be used.
        negative : int
            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
            should be drawn (usually between 5 and 20).
            If set to 0, no negative sampling is used.
        cbow_mean : int {1,0}
            If 0, use the sum of the context word vectors. If 1, use the mean; only applies when CBOW is used.
        hashfxn : function
            Hash function to use to randomly initialize weights, for increased training reproducibility.
        iter : int
            Number of iterations (epochs) over the corpus.
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
            of the model.
        sorted_vocab : int {1,0}
            If 1, sort the vocabulary by descending frequency before assigning word indexes.
        batch_words : int
            Target size (in words) for batches of examples passed to worker threads (and
            thus cython routines). (Larger batches will be passed if individual
            texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
        min_n : int
            Min length of char ngrams to be used for training word representations.
        max_n : int
            Max length of char ngrams to be used for training word representations. Set `max_n` to be
            less than `min_n` to avoid char ngrams being used.
        word_ngrams : int {1,0}
            If 1, enriches word vectors with subword (character n-gram) information.
            If 0, this is equivalent to Word2Vec.
        bucket : int
            Character ngrams are hashed into a fixed number of buckets, in order to limit the
            memory usage of the model. This option specifies the number of buckets used by the model.
        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
            List of callbacks that need to be executed/run at specific stages during training.

        Examples
        --------
        Initialize and train a `FastText` model

        >>> from gensim.models import FastText
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = FastText(sentences, min_count=1)
        >>> say_vector = model['say']  # get vector for word
        >>> of_vector = model['of']  # get vector for out-of-vocab word


        """
        self.load = call_on_class_only
        self.load_fasttext_format = call_on_class_only
        self.callbacks = callbacks
        self.word_ngrams = int(word_ngrams)
        if self.word_ngrams <= 1 and max_n == 0:
            bucket = 0

        self.wv = FastTextKeyedVectors(size, min_n, max_n)
        self.vocabulary = FastTextVocab(
            max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
            sorted_vocab=bool(sorted_vocab), null_word=null_word)
        self.trainables = FastTextTrainables(
            vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn)

        super(FastText, self).__init__(
            sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks,
            batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed,
            hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, fast_version=FAST_VERSION)

    @property
    @deprecated("Attribute will be removed in 4.0.0, use wv.min_n instead")
    def min_n(self):
        return self.wv.min_n

    @property
    @deprecated("Attribute will be removed in 4.0.0, use wv.max_n instead")
    def max_n(self):
        return self.wv.max_n

    @property
    @deprecated("Attribute will be removed in 4.0.0, use trainables.bucket instead")
    def bucket(self):
        return self.trainables.bucket

    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead")
    def syn0_vocab_lockf(self):
        return self.trainables.vectors_vocab_lockf

    @syn0_vocab_lockf.setter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead")
    def syn0_vocab_lockf(self, value):
        self.trainables.vectors_vocab_lockf = value

    @syn0_vocab_lockf.deleter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead")
    def syn0_vocab_lockf(self):
        del self.trainables.vectors_vocab_lockf

    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead")
    def syn0_ngrams_lockf(self):
        return self.trainables.vectors_ngrams_lockf

    @syn0_ngrams_lockf.setter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead")
    def syn0_ngrams_lockf(self, value):
        self.trainables.vectors_ngrams_lockf = value

    @syn0_ngrams_lockf.deleter
    @deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead")
    def syn0_ngrams_lockf(self):
        del self.trainables.vectors_ngrams_lockf

    @property
    @deprecated("Attribute will be removed in 4.0.0, use self.wv.num_ngram_vectors instead")
    def num_ngram_vectors(self):
        return self.wv.num_ngram_vectors

    def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs):
        """Build vocabulary from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of unicode strings.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
        keep_raw_vocab : bool
            If not true, delete the raw vocabulary after the scaling is done and free up RAM.
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
            of the model.
        progress_per : int
            Indicates how many words to process before showing/updating the progress.
        update : bool
            If true, the new words in `sentences` will be added to model's vocab.

        Example
        -------
        Train a model and update vocab for online training

        >>> from gensim.models import FastText
        >>> sentences_1 = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>> sentences_2 = [["dude", "say", "wazzup!"]]
        >>>
        >>> model = FastText(min_count=1)
        >>> model.build_vocab(sentences_1)
        >>> model.train(sentences_1, total_examples=model.corpus_count, epochs=model.iter)
        >>> model.build_vocab(sentences_2, update=True)
        >>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.iter)

        """
        if update:
            if not len(self.wv.vocab):
                raise RuntimeError(
                    "You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
                    "First build the vocabulary of your model with a corpus "
                    "before doing an online update.")
            self.vocabulary.old_vocab_len = len(self.wv.vocab)
            self.trainables.old_hash2index_len = len(self.wv.hash2index)

        return super(FastText, self).build_vocab(
            sentences, update=update, progress_per=progress_per,
            keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs)

    def _set_train_params(self, **kwargs):
        pass

    def _clear_post_train(self):
        self.wv.vectors_norm = None
        self.wv.vectors_vocab_norm = None
        self.wv.vectors_ngrams_norm = None

    def _do_train_job(self, sentences, alpha, inits):
        """Train a single batch of sentences. Return 2-tuple `(effective word count after
        ignoring unknown words and sentence length trimming, total word count)`.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
        alpha : float
            The current learning rate.
        inits : (:class:`numpy.ndarray`, :class:`numpy.ndarray`)
            Each worker's private work memory.

        Returns
        -------
        (int, int)
            Tuple of (effective word count after ignoring unknown words and sentence length trimming, total word count)

        """
        work, neu1 = inits
        tally = 0
        if self.sg:
            tally += train_batch_sg(self, sentences, alpha, work, neu1)
        else:
            tally += train_batch_cbow(self, sentences, alpha, work, neu1)

        return tally, self._raw_word_count(sentences)

    def train(self, sentences, total_examples=None, total_words=None,
              epochs=None, start_alpha=None, end_alpha=None,
              word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs):
        """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        For FastText, each sentence must be a list of unicode strings.

        To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate
        progress-percentage logging, either total_examples (count of sentences) or total_words (count of
        raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to
        :meth:`~gensim.models.fasttext.FastText.build_vocab()`, the count of examples in that corpus
        will be available in the model's :attr:`corpus_count` property).

        To avoid common mistakes around the model's ability to do multiple training passes itself, an
        explicit `epochs` argument **MUST** be provided. In the common and recommended case,
        where :meth:`~gensim.models.fasttext.FastText.train()` is only called once,
        the model's cached `iter` value should be supplied as `epochs` value.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
        total_examples : int
            Count of sentences.
        total_words : int
            Count of raw words in sentences.
        epochs : int
            Number of iterations (epochs) over the corpus.
        start_alpha : float
            Initial learning rate.
        end_alpha : float
            Final learning rate. Drops linearly from `start_alpha`.
        word_count : int
            Count of words already trained. Set this to 0 for the usual
            case of training on all words in sentences.
        queue_factor : int
            Multiplier for size of queue (number of workers * queue_factor).
        report_delay : float
            Seconds to wait before reporting progress.
        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
            List of callbacks that need to be executed/run at specific stages during training.

        Examples
        --------
        >>> from gensim.models import FastText
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = FastText(min_count=1)
        >>> model.build_vocab(sentences)
        >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

        """

        super(FastText, self).train(
            sentences, total_examples=total_examples, total_words=total_words,
            epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
            queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks)
        self.trainables.get_vocab_word_vecs(self.wv)

    def init_sims(self, replace=False):
        """
        Precompute L2-normalized word vectors. init_sims() resides in KeyedVectors because it mainly deals with
        syn0 (the word vectors); syn1 is not an attribute of KeyedVectors, so it has to be deleted in this class
        when `replace` is set, while the actual normalization of syn0 happens inside KeyedVectors.
        """
        if replace and hasattr(self.trainables, 'syn1'):
            del self.trainables.syn1
        return self.wv.init_sims(replace)

    def clear_sims(self):
        """
        Removes all L2-normalized word vectors from the model.
        You will have to recompute them using the init_sims() method.
        """

        self._clear_post_train()

    @deprecated("Method will be removed in 4.0.0, use self.wv.__getitem__() instead")
    def __getitem__(self, words):
        """
        Deprecated. Use self.wv.__getitem__() instead.
        Refer to the documentation for `gensim.models.KeyedVectors.__getitem__`
        """
        return self.wv.__getitem__(words)

    @deprecated("Method will be removed in 4.0.0, use self.wv.__contains__() instead")
    def __contains__(self, word):
        """
        Deprecated. Use self.wv.__contains__() instead.
        Refer to the documentation for `gensim.models.KeyedVectors.__contains__`
        """
        return self.wv.__contains__(word)

    @classmethod
    def load_fasttext_format(cls, model_file, encoding='utf8'):
        """
        Load the input-hidden weight matrix from the fastText output files.

        Note that due to limitations in the FastText API, you cannot continue training
        with a model loaded this way, though you can query for word similarity etc.

        Parameters
        ----------
        model_file : str
            Path to the FastText output files.
            FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin`.
            Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
            as gensim requires only the `.bin` file to load the entire fastText model.
        encoding : str
            Specifies the encoding.

        Returns
        -------
        :obj: `~gensim.models.fasttext.FastText`
            Returns the loaded model as an instance of :class: `~gensim.models.fasttext.FastText`.

        """
        model = cls()
        if not model_file.endswith('.bin'):
            model_file += '.bin'
        model.file_name = model_file
        model.load_binary_data(encoding=encoding)
        return model

    def load_binary_data(self, encoding='utf8'):
        """Loads data from the output binary file created by FastText training"""
        with utils.smart_open(self.file_name, 'rb') as f:
            self._load_model_params(f)
            self._load_dict(f, encoding=encoding)
            self._load_vectors(f)

    def _load_model_params(self, file_handle):
        magic, version = self.struct_unpack(file_handle, '@2i')
        if magic == FASTTEXT_FILEFORMAT_MAGIC:  # newer format
            self.new_format = True
            dim, ws, epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = \
                self.struct_unpack(file_handle, '@12i1d')
        else:  # older format
            self.new_format = False
            dim = magic
            ws = version
            epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@10i1d')
        # Parameters stored by [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc)
        self.wv.vector_size = dim
        self.vector_size = dim
        self.window = ws
        self.epochs = epoch
        self.vocabulary.min_count = min_count
        self.negative = neg
        self.hs = loss == 1
        self.sg = model == 2
        self.trainables.bucket = bucket
        self.wv.min_n = minn
        self.wv.max_n = maxn
        self.vocabulary.sample = t

    def _load_dict(self, file_handle, encoding='utf8'):
        vocab_size, nwords, nlabels = self.struct_unpack(file_handle, '@3i')
        # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
        if nlabels > 0:
            raise NotImplementedError("Supervised fastText models are not supported")
        logger.info("loading %s words for fastText model from %s", vocab_size, self.file_name)

        self.struct_unpack(file_handle, '@1q')  # number of tokens
        if self.new_format:
            pruneidx_size, = self.struct_unpack(file_handle, '@q')
        for i in range(vocab_size):
            word_bytes = b''
            char_byte = file_handle.read(1)
            # Read vocab word
            while char_byte != b'\x00':
                word_bytes += char_byte
                char_byte = file_handle.read(1)
            word = word_bytes.decode(encoding)
            count, _ = self.struct_unpack(file_handle, '@qb')

            self.wv.vocab[word] = Vocab(index=i, count=count)
            self.wv.index2word.append(word)

        assert len(self.wv.vocab) == nwords, (
            'mismatch between final vocab size ({} words), '
            'and expected number of words ({} words)'.format(len(self.wv.vocab), nwords))
        if len(self.wv.vocab) != vocab_size:
            # expecting to log this warning only for pretrained french vector, wiki.fr
            logger.warning(
                "mismatch between final vocab size (%s words), and expected vocab size (%s words)",
                len(self.wv.vocab), vocab_size
            )

        if self.new_format:
            for j in range(pruneidx_size):
                self.struct_unpack(file_handle, '@2i')

    def _load_vectors(self, file_handle):
        if self.new_format:
            self.struct_unpack(file_handle, '@?')  # bool quant_input in fasttext.cc
        num_vectors, dim = self.struct_unpack(file_handle, '@2q')
        # Vectors stored by [Matrix::save](https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc)
        assert self.wv.vector_size == dim, (
            'mismatch between vector size in model params ({}) and model vectors ({})'
            .format(self.wv.vector_size, dim)
        )
        float_size = struct.calcsize('@f')
        if float_size == 4:
            dtype = np.dtype(np.float32)
        elif float_size == 8:
            dtype = np.dtype(np.float64)

        self.num_original_vectors = num_vectors
        self.wv.vectors_ngrams = np.fromfile(file_handle, dtype=dtype, count=num_vectors * dim)
        self.wv.vectors_ngrams = self.wv.vectors_ngrams.reshape((num_vectors, dim))
        assert self.wv.vectors_ngrams.shape == (
            self.trainables.bucket + len(self.wv.vocab), self.wv.vector_size), \
            'mismatch between actual weight matrix shape {} and expected shape {}'\
            .format(
                self.wv.vectors_ngrams.shape, (self.trainables.bucket + len(self.wv.vocab), self.wv.vector_size)
            )

        self.trainables.init_ngrams_post_load(self.file_name, self.wv)
        self._clear_post_train()

    def struct_unpack(self, file_handle, fmt):
        num_bytes = struct.calcsize(fmt)
        return struct.unpack(fmt, file_handle.read(num_bytes))

    def save(self, *args, **kwargs):
        """Save the model. This saved model can be loaded again using :func:`~gensim.models.fasttext.FastText.load`,
        which supports online training and getting vectors for out-of-vocabulary words.

        Parameters
        ----------
        fname : str
            Path to the file.

        """
        kwargs['ignore'] = kwargs.get('ignore', ['vectors_norm', 'vectors_vocab_norm', 'vectors_ngrams_norm'])
        super(FastText, self).save(*args, **kwargs)

    @classmethod
    def load(cls, *args, **kwargs):
        """Loads a previously saved `FastText` model. Also see `save()`.

        Parameters
        ----------
        fname : str
            Path to the saved file.

        Returns
        -------
        :obj: `~gensim.models.fasttext.FastText`
            Returns the loaded model as an instance of :class: `~gensim.models.fasttext.FastText`.
        """
        try:
            model = super(FastText, cls).load(*args, **kwargs)
            if not hasattr(model.trainables, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'):
                model.trainables.vectors_vocab_lockf = ones(len(model.trainables.vectors), dtype=REAL)
            if not hasattr(model.trainables, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'):
                model.trainables.vectors_ngrams_lockf = ones(len(model.trainables.vectors), dtype=REAL)
            return model
        except AttributeError:
            logger.info('Model saved using code from earlier Gensim Version. Re-loading old model in a compatible way.')
            from gensim.models.deprecated.fasttext import load_old_fasttext
            return load_old_fasttext(*args, **kwargs)

    @deprecated("Method will be removed in 4.0.0, use self.wv.accuracy() instead")
    def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_insensitive=True):
        most_similar = most_similar or FastTextKeyedVectors.most_similar
        return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive)
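
# A short sketch tying the class above to the FastTextKeyedVectors usage seen in the other
# examples: train a tiny model, persist only its keyed vectors, and reload them with
# FastTextKeyedVectors.load(). The toy corpus, sizes and file name are illustrative only.
from gensim.models import FastText
from gensim.models.keyedvectors import FastTextKeyedVectors

sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
model = FastText(sentences, min_count=1, size=10, iter=2)
model.wv.save("fasttext.wv")                    # store just the vectors, not the full model
wv = FastTextKeyedVectors.load("fasttext.wv")   # reload without the training machinery
print(wv.most_similar("meow", topn=2))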
Beispiel #19
0
    # model_filename = "20190509_yle-wikipedia_word2vec_cbow_fi_lr=0.05,dim=100,ws=5,epoch=5,neg=5,mincount=5"
    # model_filename = "fin-word2vec-lemma"
    # model_filename = "wikipedia2008_fi_lemmatized_size=200,alpha=0.025,window=5,min_count=2,sg=1,negative=15,iter=5"
    model_filename = "Word2Vec_iltalehti-wikipedia_new_size=300,alpha=0.025,window=5,min_count=2,sg=1,negative=5,iter=15"
    

    # remember to switch model_file_type to match the chosen model file
    model_file = os.path.join(config.EMBEDDINGS_DIR, model_filename + model_file_type)

    print("Using the "+model_type+" model:", model_filename)
    print("Loading "+model_type+" model...")
    if model_type == 'Word2Vec':
        try:
            model = KeyedVectors.load_word2vec_format(model_file, binary=True)
        except UnicodeDecodeError:
            model = KeyedVectors.load_word2vec_format(model_file, binary=False)
    else:
        model = FastTextKeyedVectors.load(model_file)
    print(model_type+" model loaded.")

    # evaluate
    result_string = intrusion(model)
    result_string += analogy(model)
    result_string += nearest_neighbours(model)

    result_file = os.path.join("results", model_type + model_filename + "_results.txt")

    with open(result_file, 'w', encoding='utf-8') as f:
        f.write(result_string)
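
# The helpers intrusion(), analogy() and nearest_neighbours() used above are defined elsewhere
# in that repository; as a purely hypothetical illustration, a minimal nearest-neighbour report
# built on most_similar() could look like the sketch below. The probe words are arbitrary
# Finnish examples, not part of the original evaluation set.
def nearest_neighbours(model, probe_words=("kissa", "koira", "talo"), topn=5):
    lines = ["Nearest neighbours:"]
    for word in probe_words:
        if word in model:
            neighbours = ", ".join(w for w, _ in model.most_similar(word, topn=topn))
            lines.append("  {}: {}".format(word, neighbours))
        else:
            lines.append("  {}: not in vocabulary".format(word))
    return "\n".join(lines) + "\n"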

"""
File: app.py
Created Date: Monday, 2nd November 2020 9:18:53 am
Author: Tianyu Gu ([email protected])
"""

from pathlib import Path

from flask import Flask, abort, jsonify, request
from gensim.models.keyedvectors import FastTextKeyedVectors

_data_folder = Path(__file__).parent.parent.joinpath("data")
_fasttext_wv = FastTextKeyedVectors.load(
    str(_data_folder.joinpath("fasttext.wv")))

app = Flask("gensim_fasttext_service")


@app.route("/most-similar", methods=["POST"])
def similar_by_word():
    word = request.form.get("word")
    topn_arg = request.form.get("topn", "10")
    if not word or not topn_arg.isdigit():
        abort(400)

    topn: int = int(topn_arg)
    res = [
        candidate
        for candidate, _ in _fasttext_wv.most_similar(word, topn=topn)
    ]
    return jsonify(res)
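
# A small client-side sketch for exercising the service above. It assumes the app is running
# locally on Flask's default port 5000; the query word is illustrative.
import requests

resp = requests.post("http://localhost:5000/most-similar",
                     data={"word": "computer", "topn": "5"})
print(resp.json())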
Beispiel #22
0
# encoder layer weights
vectors = torch.zeros(len(corpus.dictionary), args.emsize)

if not args.evaluate:
    ###############################################################################
    # Load word embeddings and corpus
    ###############################################################################
    w2v_model = None
    if args.emmodel != 'no':
        print("using pretrained word embeddings", args.emmodel)
        try:
            w2v_model = KeyedVectors.load_word2vec_format(args.emmodel, binary=True)
            # w2v_model = Word2Vec.load(args.emmodel)
        except UnicodeDecodeError:
            w2v_model = FastTextKeyedVectors.load(args.emmodel)

        assert w2v_model.vector_size == args.emsize

    # initialise uniformly in [-initrange, initrange]
    initrange = 0.1
    nn.init.uniform_(vectors, -initrange, initrange)
    print("encoder layer shape", vectors.shape)
    # use pretrained vectors if available
    oov_count = 0
    if w2v_model:
        for i, word in enumerate(corpus.dictionary.idx2word):
            try:
                # print(w2v_model.wv[word].shape, vectors[i].shape)
                vectors[i] = torch.tensor(w2v_model.wv[word])
            except KeyError as err: