def build_vocab(self, corpus):
    """Build the model vocabulary.

    If a frequency map was supplied up front (``self.vocabulary_counts``),
    build the vocabulary directly from it; otherwise fall back to the
    parent class's default vocabulary construction over *corpus*.

    Side effects: populates ``self.vocab`` and ``self.index2word``, builds
    the Huffman tree (hierarchical softmax) and/or the negative-sampling
    table depending on configuration, precalculates downsampling
    thresholds, and resets the model weights.
    """
    # `is not None` — identity check is the correct test for None,
    # and a provided-but-empty counts dict should still take this branch.
    if self.vocabulary_counts is not None:
        logger.debug("building vocabulary from provided frequency map")
        vocab = self.vocabulary_counts
    else:
        logger.debug("default vocabulary building")
        super(Skipgram, self).build_vocab(corpus)
        return

    # Assign a unique, dense index to each word that meets min_count.
    # .items() (not Py2-only .iteritems()) keeps this portable.
    self.vocab, self.index2word = {}, []
    for word, count in vocab.items():
        v = Vocab()
        v.count = count
        if v.count >= self.min_count:
            v.index = len(self.vocab)
            self.index2word.append(word)
            self.vocab[word] = v
    # Lazy %-style args: formatting is skipped entirely when DEBUG is off.
    logger.debug("total %i word types after removing those with count<%s",
                 len(self.vocab), self.min_count)

    if self.hs:
        # add info about each word's Huffman encoding
        self.create_binary_tree()
    if self.negative:
        # build the table for drawing random words (for negative sampling)
        self.make_table()
    # precalculate downsampling thresholds
    self.precalc_sampling()
    self.reset_weights()
def build_vocab(self, corpus):
    """Build the model vocabulary.

    If a frequency map was supplied up front (``self.vocabulary_counts``),
    build the vocabulary directly from it; otherwise fall back to the
    parent class's default vocabulary construction over *corpus*.

    Side effects: populates ``self.vocab``, ``self.index2word``,
    ``self.corpus_count`` and ``self.raw_vocab``, then runs gensim's
    ``scale_vocab`` / ``finalize_vocab`` steps.
    """
    # `is not None` — identity check is the correct test for None,
    # and a provided-but-empty counts dict should still take this branch.
    if self.vocabulary_counts is not None:
        # logger.debug (not a bare `print`): consistent with the rest of
        # the module's logging and portable across Python versions.
        logger.debug("building vocabulary from provided frequency map")
        vocab = self.vocabulary_counts
    else:
        logger.debug("default vocabulary building")
        super(Skipgram, self).build_vocab(corpus)
        return

    # Assign a unique, dense index to each word that meets min_count.
    # .items() (not Py2-only .iteritems()) keeps this portable.
    self.vocab, self.index2word = {}, []
    for word, count in vocab.items():
        v = Vocab()
        v.count = count
        if v.count >= self.min_count:
            v.index = len(self.vocab)
            self.index2word.append(word)
            self.vocab[word] = v
    # Keep the raw counts around — scale_vocab/finalize_vocab read these.
    self.corpus_count = len(vocab)
    self.raw_vocab = vocab

    # Lazy %-style args: formatting is skipped entirely when DEBUG is off.
    logger.debug("total %i word types after removing those with count<%s",
                 len(self.vocab), self.min_count)
    self.scale_vocab()
    self.finalize_vocab()