def build_vocab(self, corpus):
    """
    Build vocabulary from a sequence of sentences or from a frequency
    dictionary, if one was provided.
    """
    if self.vocabulary_counts is not None:
        logger.debug("building vocabulary from provided frequency map")
        vocab = self.vocabulary_counts
    else:
        logger.debug("default vocabulary building")
        super(Skipgram, self).build_vocab(corpus)
        return

    # assign a unique index to each word
    self.vocab, self.index2word = {}, []
    for word, count in vocab.iteritems():
        v = Vocab()
        v.count = count
        if v.count >= self.min_count:
            v.index = len(self.vocab)
            self.index2word.append(word)
            self.vocab[word] = v
    logger.debug("total %i word types after removing those with count<%s",
                 len(self.vocab), self.min_count)

    if self.hs:
        # add info about each word's Huffman encoding
        self.create_binary_tree()
    if self.negative:
        # build the table for drawing random words (for negative sampling)
        self.make_table()
    # precalculate downsampling thresholds
    self.precalc_sampling()
    self.reset_weights()
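# A minimal usage sketch (not from the original source): supplying a
# precomputed frequency map so that build_vocab() takes the first branch
# instead of scanning a corpus. It assumes the Skipgram constructor stores
# its `vocabulary_counts` argument on self, as the method above expects.
counts = {"node_1": 120, "node_7": 87, "rare_node": 2}
model = Skipgram(vocabulary_counts=counts, min_count=5)
model.build_vocab(corpus=None)  # corpus is ignored when a frequency map is set
# "rare_node" falls below min_count and is absent from model.vocab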
def build_vocab(self, corpus):
    """
    Build vocabulary from a sequence of sentences or from a frequency
    dictionary, if one was provided.
    """
    if self.vocabulary_counts is not None:
        print "building vocabulary from provided frequency map"
        vocab = self.vocabulary_counts
    else:
        print "default vocabulary building"
        super(Skipgram, self).build_vocab(corpus)
        return

    # assign a unique index to each word
    self.vocab, self.index2word = {}, []
    for word, count in vocab.iteritems():
        v = Vocab()
        v.count = count
        if v.count >= self.min_count:
            v.index = len(self.vocab)
            self.index2word.append(word)
            self.vocab[word] = v
    self.corpus_count = len(vocab)
    self.raw_vocab = vocab
    logger.debug("total %i word types after removing those with count<%s",
                 len(self.vocab), self.min_count)

    self.scale_vocab()
    self.finalize_vocab()
def add_new_labels(sentences, model):
    """
    Add new labels (for new docs) to the doc2vec model's `self.vocab`.

    from: <https://gist.github.com/zseder/4201551d7f8608f0b82b>
    """
    sentence_no = -1
    total_words = 0
    vocab = model.vocab
    #model_sentence_n = len([l for l in vocab if l.startswith("SENT")])
    model_sentence_n = max(int(l.split('_')[-1])
                           for l in vocab if l.startswith("SENT"))
    n_sentences = 0
    for sentence_no, sentence in enumerate(sentences):
        sentence_length = len(sentence.words)
        for label in sentence.labels:
            # re-number the incoming label past the model's existing SENT_* labels
            label_e = label.split("_")
            label_n = int(label_e[1]) + model_sentence_n
            label = "{0}_{1}".format(label_e[0], label_n)
            total_words += 1
            if label in vocab:
                vocab[label].count += sentence_length
            else:
                vocab[label] = Vocab(count=sentence_length)
                vocab[label].index = len(model.vocab) - 1
                vocab[label].code = [0]
                vocab[label].sample_probability = 1.
                model.index2word.append(label)
                n_sentences += 1
    return n_sentences
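# A minimal usage sketch (hypothetical setup, not from the source): growing a
# trained doc2vec model's vocabulary with labels for unseen documents before
# resuming training. It assumes old-style LabeledSentence inputs exposing
# `words` and `labels`, with labels of the form "SENT_<n>" as parsed above.
new_docs = [LabeledSentence(words=["new", "unseen", "doc"], labels=["SENT_0"])]
added = add_new_labels(new_docs, model)
# note: the model's weight matrices still need to be enlarged to match the
# grown vocabulary before training on the new sentences (not shown here)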
def build_vocab(self, sentences):
    logger.info("collecting all words and their counts")
    vocab = self._vocab_from_new(sentences)

    # assign a unique index to each word, starting with the meta words
    self.vocab, self.index2word = {}, []
    for meta_word in [self.label0_as_vocab, self.label1_as_vocab,
                      self.unknown_as_vocab]:
        v = Vocab(count=1)
        v.index = len(self.vocab)
        v.sample_probability = 1.0
        self.index2word.append(meta_word)
        self.vocab[meta_word] = v

    # remove words with count < min_count (gensim's default min_count is 5;
    # Seger sets it to 1, so in practice no words are removed) and build the
    # self.vocab word->Vocab dict, assigning a unique index to each word
    for subgram, v in iteritems(vocab):
        if v.count >= self.min_count:
            v.sample_probability = 1.0
            v.index = len(self.vocab)
            self.index2word.append(subgram)
            self.vocab[subgram] = v
    logger.info("total %i word types after removing those with count<%s",
                len(self.vocab), self.min_count)

    logger.info('reset weights')
    if self.hybrid_pred:
        # collect counts of single-character words (here v is a word string)
        freq_list = [self.vocab[v].count for v in self.vocab if len(v) == 1]
        freq_list.sort(reverse=True)
        # take the count at the top-4% rank as the hybrid prediction threshold
        self.hybrid_threshold = freq_list[len(freq_list) / 25]
        print '>frequency threshold for hybrid prediction is:', self.hybrid_threshold
    self.reset_weights()
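# Worked illustration (synthetic counts, not from the source) of the hybrid
# threshold above: on a reverse-sorted list, index len(freq_list) / 25 picks
# the count at roughly the top-4% rank; `/` is Python 2 integer division,
# matching the code above.
freq_list = sorted([200, 150, 120] + [30] * 22, reverse=True)  # 25 counts
threshold = freq_list[len(freq_list) / 25]  # 25 / 25 == 1, so threshold == 150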
def create_corpus_from_matlab(word_embedding, index2word):
    model = Word2VecExtended()
    model.syn0 = word_embedding.astype(theano.config.floatX).copy()
    model.index2word = index2word
    model.index2word[0] = UnknownWord

    vocab = {}
    for word in model.index2word:
        v = Vocab(count=1)
        v.index = len(vocab)
        vocab[word] = v

    model.vocab = vocab
    model.UnknownWordIndex = model.vocab[UnknownWord].index
    return model
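# A minimal usage sketch (hypothetical data): wrapping a MATLAB-exported
# embedding matrix in a Word2VecExtended model. It assumes the matrix rows
# are aligned with index2word and that row 0 is reserved for the unknown
# word, which create_corpus_from_matlab() overwrites with UnknownWord.
import numpy as np
embedding = np.random.randn(3, 50)     # 3 words, 50 dimensions
words = ["<slot0>", "hello", "world"]  # slot 0 is replaced by UnknownWord
model = create_corpus_from_matlab(embedding, words)
assert model.vocab[UnknownWord].index == 0 == model.UnknownWordIndex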
def add_word_to_vocab(self, word, count=1):
    v = Vocab(count=count)
    v.index = len(self.vocab)
    self.vocab[word] = v
    self.index2word.append(word)
    return v
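# A minimal usage sketch (hypothetical model object): registering an
# out-of-vocabulary token at the next free index. The embedding matrix
# (e.g. syn0) is not resized by add_word_to_vocab(), so that has to be
# handled separately before the new word can be trained or looked up.
v = model.add_word_to_vocab("<oov>", count=1)
assert model.index2word[v.index] == "<oov>"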