def train(self, sentences, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). For FastText, each sentence must be a list of unicode strings. (Subclasses may accept other examples.) To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate progress-percentage logging, either total_examples (count of sentences) or total_words (count of raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to :meth:`~gensim.models.fasttext.FastText.build_vocab()`, the count of examples in that corpus will be available in the model's :attr:`corpus_count` property). To avoid common mistakes around the model's ability to do multiple training passes itself, an explicit `epochs` argument **MUST** be provided. In the common and recommended case, where :meth:`~gensim.models.fasttext.FastText.train()` is only called once, the model's cached `iter` value should be supplied as `epochs` value. Parameters ---------- sentences : iterable of iterables The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. total_examples : int Count of sentences. total_words : int Count of raw words in sentences. epochs : int Number of iterations (epochs) over the corpus. start_alpha : float Initial learning rate. end_alpha : float Final learning rate. Drops linearly from `start_alpha`. word_count : int Count of words already trained. Set this to 0 for the usual case of training on all words in sentences. queue_factor : int Multiplier for size of queue (number of workers * queue_factor). report_delay : float Seconds to wait before reporting progress. Examples -------- >>> from gensim.models import FastText >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] >>> >>> model = FastText(min_count=1) >>> model.build_vocab(sentences) >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) """ self.neg_labels = [] if self.negative > 0: # precompute negative labels optimization for pure-python training self.neg_labels = zeros(self.negative + 1) self.neg_labels[0] = 1. Word2Vec.train(self, sentences, total_examples=self.corpus_count, epochs=self.iter, start_alpha=self.alpha, end_alpha=self.min_alpha) self.get_vocab_word_vecs()