def build_vocab(self, corpus):
    """
    Build vocabulary from a sequence of sentences or from a frequency dictionary, if one was provided.
    """
    if self.vocabulary_counts != None:
        logger.debug("building vocabulary from provided frequency map")
        vocab = self.vocabulary_counts
    else:
        logger.debug("default vocabulary building")
        super(Skipgram, self).build_vocab(corpus)
        return

    # assign a unique index to each word
    self.vocab, self.index2word = {}, []
    for word, count in vocab.iteritems():
        v = Vocab()
        v.count = count
        if v.count >= self.min_count:
            v.index = len(self.vocab)
            self.index2word.append(word)
            self.vocab[word] = v

    logger.debug("total %i word types after removing those with count<%s" %
                 (len(self.vocab), self.min_count))

    if self.hs:
        # add info about each word's Huffman encoding
        self.create_binary_tree()
    if self.negative:
        # build the table for drawing random words (for negative sampling)
        self.make_table()
    # precalculate downsampling thresholds
    self.precalc_sampling()
    self.reset_weights()
def build_vocab(self, corpus):
    """
    Build vocabulary from a sequence of sentences or from a frequency dictionary, if one was provided.
    """
    if self.vocabulary_counts != None:
        print "building vocabulary from provided frequency map"
        vocab = self.vocabulary_counts
    else:
        print "default vocabulary building"
        super(Skipgram, self).build_vocab(corpus)
        return

    # assign a unique index to each word
    self.vocab, self.index2word = {}, []
    for word, count in vocab.iteritems():
        v = Vocab()
        v.count = count
        if v.count >= self.min_count:
            v.index = len(self.vocab)
            self.index2word.append(word)
            self.vocab[word] = v

    self.corpus_count = len(vocab)
    self.raw_vocab = vocab

    logger.debug("total %i word types after removing those with count<%s" %
                 (len(self.vocab), self.min_count))

    self.scale_vocab()
    self.finalize_vocab()
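# A minimal sketch (not part of the snippets above) of feeding a precomputed frequency
# map into the same index-assignment loop that build_vocab() uses. It assumes a gensim
# version where Vocab is still importable from gensim.models.word2vec, as in the other
# snippets on this page; `sentences` and `min_count` are illustrative values.
from collections import Counter
from gensim.models.word2vec import Vocab

sentences = [["node1", "node2", "node1"], ["node2", "node3"]]
vocabulary_counts = Counter(w for s in sentences for w in s)

vocab, index2word, min_count = {}, [], 1
for word, count in vocabulary_counts.items():
    v = Vocab(count=count)
    if v.count >= min_count:
        v.index = len(vocab)
        index2word.append(word)
        vocab[word] = v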
def add_new_labels(sentences, model):
    """
    Add new labels (for new docs) to the doc2vec model's `self.vocab`.
    from: <https://gist.github.com/zseder/4201551d7f8608f0b82b>
    """
    sentence_no = -1
    total_words = 0
    vocab = model.vocab
    #model_sentence_n = len([l for l in vocab if l.startswith("SENT")])
    model_sentence_n = max(int(l.split('_')[-1]) for l in vocab if l.startswith("SENT"))
    n_sentences = 0
    for sentence_no, sentence in enumerate(sentences):
        sentence_length = len(sentence.words)
        for label in sentence.labels:
            label_e = label.split("_")
            label_n = int(label_e[1]) + model_sentence_n
            label = "{0}_{1}".format(label_e[0], label_n)
            total_words += 1
            if label in vocab:
                vocab[label].count += sentence_length
            else:
                vocab[label] = Vocab(count=sentence_length)
                vocab[label].index = len(model.vocab) - 1
                vocab[label].code = [0]
                vocab[label].sample_probability = 1.
                model.index2word.append(label)
        n_sentences += 1
    return n_sentences
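# Hypothetical usage sketch for add_new_labels() above: `TaggedDoc` stands in for the
# old LabeledSentence type (an object with .words and .labels), and `model` would be an
# already-trained doc2vec-style model whose vocab contains "SENT_<n>" labels.
from collections import namedtuple

TaggedDoc = namedtuple("TaggedDoc", ["words", "labels"])
new_docs = [TaggedDoc(words=["new", "document", "text"], labels=["SENT_0"])]
# n_added = add_new_labels(new_docs, model)  # SENT_0 is shifted past the model's existing labels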
def _vocab_from(sentences):
    sentence_no, vocab = -1, {}
    total_words = 0
    for sentence_no, sentence in enumerate(sentences):
        if sentence_no % 10000 == 0:
            logger.info(
                "PROGRESS: at item #%i, processed %i words and %i word types" %
                (sentence_no, total_words, len(vocab)))
        sentence_length = len(sentence.words)
        for label in sentence.labels:
            total_words += 1
            if label in vocab:
                vocab[label].count += sentence_length
            else:
                vocab[label] = Vocab(count=sentence_length)
        for word in sentence.words:
            total_words += 1
            if word in vocab:
                vocab[word].count += 1
            else:
                vocab[word] = Vocab(count=1)
    logger.info(
        "collected %i word types from a corpus of %i words and %i items" %
        (len(vocab), total_words, sentence_no + 1))
    return vocab
def create_corpus_from_matlab(word_embedding, index2word):
    model = Word2VecExtended()
    model.syn0 = word_embedding.astype(theano.config.floatX).copy()
    model.index2word = index2word
    model.index2word[0] = UnknownWord
    vocab = {}
    for word in model.index2word:
        v = Vocab(count=1)
        v.index = len(vocab)
        vocab[word] = v
    model.vocab = vocab
    model.UnknownWordIndex = model.vocab[UnknownWord].index
    return model
def load_word_embeddings(self, word_embeddings, word_to_ix):
    """Loads the word embeddings.

    Parameters
    ----------
    word_embeddings : numpy.ndarray
        Matrix with word-embeddings.
    word_to_ix : dict of (str, int)
        Mapping word to index.

    """
    logger.info("Loading the vocabulary")
    self.vocab = {}
    self.index2word = []
    counts = {}
    for word in word_to_ix:
        counts[word] = counts.get(word, 0) + 1
    self.vocab_size = len(counts)
    self.vector_size = word_embeddings.shape[1]
    self.syn0 = np.zeros((self.vocab_size, self.vector_size))
    self.index2word = [None] * self.vocab_size
    logger.info("Corpus has %i words", len(self.vocab))
    for word_id, word in enumerate(counts):
        self.vocab[word] = Vocab(index=word_id, count=counts[word])
        self.syn0[word_id] = word_embeddings[word_to_ix[word]]
        self.index2word[word_id] = word
    assert((len(self.vocab), self.vector_size) == self.syn0.shape)
    logger.info("Loaded matrix of %d size and %d dimensions",
                self.vocab_size, self.vector_size)
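# A hedged usage sketch for load_word_embeddings() above; the matrix and mapping are
# made up for illustration, and `model` stands for whatever object defines the method.
import numpy as np

word_to_ix = {"apple": 0, "banana": 1, "cherry": 2}
word_embeddings = np.random.rand(len(word_to_ix), 50).astype(np.float32)
# model.load_word_embeddings(word_embeddings, word_to_ix)
# afterwards: model.syn0.shape == (3, 50) and model.vocab["apple"].index rows into model.syn0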
def __init__(self, pathtomapping, pathtovectors, pathtocounts="", initkeys=()):
    """
    SPPMI model equivalent to a gensim word2vec model.

    :param pathtomapping:
    :param pathtovectors:
    :param pathtocounts:
    :param initkeys:
    :return:
    """
    super(SPPMIModel, self).__init__()
    self.word2index = json.load(open(pathtomapping))
    self.index2word = {v: k for k, v in self.word2index.items()}
    self.word_vectors = self._load_sparse(pathtovectors)
    self.vocab = {}
    self.fast_table = {k: {} for k in initkeys}
    if pathtocounts:
        counts = json.load(open(pathtocounts))
        for w, idx in self.word2index.items():
            v = Vocab(count=counts[w], index=idx)
            self.vocab[w] = v
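# Illustrative sketch of the inputs the SPPMIModel constructor above reads; the file
# names are hypothetical, but the JSON shapes follow the code: `pathtomapping` maps
# word -> index and `pathtocounts` maps word -> frequency.
import json

with open("mapping.json", "w") as f:
    json.dump({"the": 0, "cat": 1, "sat": 2}, f)
with open("counts.json", "w") as f:
    json.dump({"the": 120, "cat": 7, "sat": 5}, f)
# model = SPPMIModel("mapping.json", "vectors.npz", pathtocounts="counts.json")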
def add_word(self, word, parent_word, emb, cur_index):
    fake_vocab_size = int(1e7)
    word_index = len(self.vocab)
    inner_node_index = word_index - 1
    parent_index = self.vocab[parent_word].index
    # add in the left subtree
    if word != parent_word:
        self.vocab[word] = Vocab(index=word_index,
                                 count=fake_vocab_size - word_index,
                                 sample_int=(2**32))
        if emb is not None:
            self.syn0[cur_index] = emb
        else:
            self.syn0[cur_index] = self.syn0[parent_index]
        # the node in the coarsened graph serves as an inner node now
        self.index2word.append(word)
        self.vocab[word].code = array(list(self.vocab[parent_word].code) + [0], dtype=uint8)
        self.vocab[word].point = array(
            list(self.vocab[parent_word].point) + [inner_node_index], dtype=uint32)
        self.inner_node_index_map[parent_word] = inner_node_index
    else:
        if emb is not None:
            self.syn0[parent_index] = emb
        self.vocab[word].code = array(list(self.vocab[word].code) + [1], dtype=uint8)
        self.vocab[word].point = array(
            list(self.vocab[word].point) + [self.inner_node_index_map[word]], dtype=uint32)
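# A small illustration of how add_word() above grows a word's Huffman path: the child's
# `code` is the parent's code plus one extra bit, and its `point` list gains the index
# of the newly created inner node. The concrete values here are illustrative.
from numpy import array, uint8, uint32

parent_code = array([1, 0], dtype=uint8)
parent_point = array([0, 3], dtype=uint32)
inner_node_index = 7

left_child_code = array(list(parent_code) + [0], dtype=uint8)                     # [1, 0, 0]
left_child_point = array(list(parent_point) + [inner_node_index], dtype=uint32)   # [0, 3, 7]
print(left_child_code, left_child_point)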
def finalize_vocab(self):
    """Build tables and model weights based on final vocabulary settings."""
    if not self.index2word:
        self.scale_vocab()
    if self.sorted_vocab:
        self.sort_vocab()
    if self.hs:
        class FakeSelf(LabeledWord2Vec):
            def __init__(self, vocab):
                self.vocab = vocab

        # add info about each word's Huffman encoding
        self.__class__.create_binary_tree(FakeSelf(self.lvocab))
    if self.negative:
        # build the table for drawing random words (for negative sampling)
        self.make_cum_table()
    if self.null_word:
        # create null pseudo-word for padding when using concatenative L1 (run-of-words)
        # this word is only ever input – never predicted – so count, huffman-point, etc doesn't matter
        word, v = '\0', Vocab(count=1, sample_int=0)
        v.index = len(self.vocab)
        self.index2word.append(word)
        self.vocab[word] = v
    # set initial input/projection and hidden weights
    self.reset_weights()
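# A toy illustration (not gensim code) of the FakeSelf trick used in finalize_vocab()
# above: a method defined on a base class can be invoked through the class against a
# lightweight stand-in instance that carries only the attributes the method touches.
class Base(object):
    def describe(self):
        return "vocab has %d entries" % len(self.vocab)

class FakeSelf(Base):
    def __init__(self, vocab):  # skip Base's normal (possibly expensive) initialisation
        self.vocab = vocab

print(Base.describe(FakeSelf({"a": 1, "b": 2})))  # -> vocab has 2 entries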
def extend_vocab(self, sentences, oov_word=False, report_frequency=10000):
    """
    Extend vocabulary from a sequence of sentences (can be a once-only generator stream).
    Each sentence must be a list of utf8 strings.
    """
    logger.info("collecting all words and their counts")
    prev_sentence_no = -1
    sentence_no, vocab = -1, {}
    total_words = 0
    assign_to_vocab = vocab.__setitem__  # slight performance gain
    # https://wiki.python.org/moin/PythonSpeed/PerformanceTips
    get_from_vocab = vocab.__getitem__
    for sentence_no, sentence in enumerate(sentences):
        if prev_sentence_no == sentence_no:
            break
        if sentence_no % report_frequency == 0:
            logger.info(
                "PROGRESS: at sentence #%i, processed %i words and %i word types" %
                (sentence_no, total_words, len(vocab)))
        for word in sentence:
            if word in vocab:
                get_from_vocab(word).count += 1
            else:
                assign_to_vocab(word, Vocab(count=1))
        total_words += len(sentence)
        prev_sentence_no = sentence_no
    logger.info(
        "collected %i word types from a corpus of %i words and %i sentences" %
        (len(vocab), total_words, sentence_no + 1))
    # assign a unique index to each word
    append = self.index2word.append
    assign_to_vocab = self.vocab.__setitem__
    for word, v in vocab.items():
        if word not in self.vocab:
            if v.count >= self.min_count:
                v.index = len(self.vocab)
                append(word)
                assign_to_vocab(word, v)
        else:
            self.vocab[word].count += v.count
    # add the special out of vocabulary word **UNKNOWN**:
    if oov_word:
        self.add_oov_word(count=len(vocab) - len(self.vocab))
    logger.info("total %i word types after removing those with count<%s" %
                (len(self.vocab), self.min_count))
    # add info about each word's Huffman encoding
    self.create_binary_tree()
    self.extend_weights()
def _vocab_from_new(self, sentences):
    """
    build word dict from subgrams, bigrams
    calculate total_words
    :arg sentences  list of already segmented sentences
    """
    sentence_no, vocab, vocab_pred = -1, {}, {}
    total_words = 0
    # for meta_subgram in [self.START, self.END]:
    #     vocab[meta_subgram] = Vocab(count=1)
    for sentence_no, sentence in enumerate(sentences):
        if sentence_no % 200 == 0:
            logger.info(
                "PROGRESS: at sentence #%i, processed %i words and %i word types" %
                (sentence_no, total_words, len(vocab)))
        if sentence:
            char_seq = [self.START, self.START] + map(full2halfwidth, u"".join(sentence)) + [self.END, self.END]
            # count \n as a word
            total_words = total_words + len(char_seq) - 3
            subgrams = [char for char in char_seq] + [
                self.su_prefix + varient + char
                for char in char_seq
                for varient in self.state_varient
            ]
            bigrams = [char_seq[index] + char_seq[index + 1] for index in range(len(char_seq) - 1)]
            subgrams.extend([
                self.sb_prefix + varient + bigram
                for bigram in bigrams
                for varient in self.state_varient
            ])
            subgrams.extend(bigrams)
            for sub in subgrams:
                if sub in vocab:
                    vocab[sub].count += 1
                else:
                    vocab[sub] = Vocab(count=1)
    logger.info(
        "collected %i word types from a corpus of %i words and %i sentences" %
        (len(vocab), total_words, sentence_no + 1))
    self.total_words = total_words
    return vocab
def add_word(word, weights):
    word_id = len(result.vocab)
    if word in result.vocab:
        logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
        return
    if counts is None:
        # most common scenario: no vocab file given. just make up some bogus counts, in descending order
        result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
    elif word in counts:
        # use count from the vocab file
        result.vocab[word] = Vocab(index=word_id, count=counts[word])
    else:
        # vocab file given, but word is missing -- set count to None (TODO: or raise?)
        logger.warning("vocabulary file is incomplete: '%s' is missing", word)
        result.vocab[word] = Vocab(index=word_id, count=None)
    result.syn0[word_id] = weights
    result.index2word.append(word)
def load_gensim_from_binary_file(filename):
    from gensim.models.word2vec import Vocab, Word2Vec
    words, M = load_vectors_from_binary_file(filename)
    N, k = M.shape
    assert N == len(words)
    model = Word2Vec(size=k)
    model.syn0 = M
    model.index2word = words
    model.vocab = dict(
        (w, Vocab(index=idx, count=N - idx)) for (idx, w) in enumerate(words))
    return model
def build_vocab(self, sentences):
    logger.info("collecting all words and their counts")
    vocab = self._vocab_from_new(sentences)
    # assign a unique index to each word
    self.vocab, self.index2word = {}, []
    for meta_word in [self.label0_as_vocab, self.label1_as_vocab, self.unknown_as_vocab]:
        v = Vocab(count=1)
        v.index = len(self.vocab)
        v.sample_probability = 1.0
        self.index2word.append(meta_word)
        self.vocab[meta_word] = v
    # remove words with count < min_count (default min_count = 5 in gensim; Seger changed it to 1,
    # so effectively no words are removed)
    # build the self.vocab word->Vocab dict and assign a unique index to each word
    for subgram, v in iteritems(vocab):
        if v.count >= self.min_count:
            v.sample_probability = 1.0
            v.index = len(self.vocab)
            self.index2word.append(subgram)
            self.vocab[subgram] = v
    logger.info("total %i word types after removing those with count<%s" %
                (len(self.vocab), self.min_count))
    logger.info('reset weights')
    if self.hybrid_pred:
        # get single-character word frequencies
        freq_list = [self.vocab[v].count for v in self.vocab if len(v) == 1]
        freq_list.sort(reverse=True)
        self.hybrid_threshold = freq_list[len(freq_list) / 25]
        print '>frequency threshold for hybrid prediction is:', self.hybrid_threshold
    self.reset_weights()
def update_vocab(corpus, old_model, model):
    """Like model.build_vocab(), but also inserts words/vectors from the old model."""
    count = model.min_count + 1
    model.scan_vocab(corpus)  # initial survey
    for word in old_model.vocab:  # insert old words
        if word not in model.vocab:
            model.raw_vocab[word] += count
            model.vocab[word] = Vocab(count=count, index=len(model.index2word))
            model.index2word.append(word)
    # trim by min_count & precalculate downsampling
    model.scale_vocab()
    model.finalize_vocab()  # build tables & arrays
    for word in old_model.vocab:
        if word in model.vocab:
            model.syn0[model.vocab[word].index] = old_model.syn0[old_model.vocab[word].index]
def finalize_vocab(self, update=False):
    """Build tables and model weights based on final word vocabulary settings."""
    if not self.wv.index2word:
        self.scale_vocab()
    if self.sorted_vocab and not update:
        self.sort_vocab()
    if self.null_word:
        # create null pseudo-word for padding when using concatenative L1 (run-of-words)
        # this word is only ever input – never predicted – so count, huffman-point, etc doesn't matter
        word, v = '\0', Vocab(count=1, sample_int=0)
        v.index = len(self.wv.vocab)
        self.wv.index2word.append(word)
        self.wv.vocab[word] = v
    # set initial input/projection and hidden weights
    if not update:
        self.reset_weights(outputs=False)
    else:
        self.update_weights(outputs=False)
def add_vector_to_model(self, category_id, vector, model):
    # The category should not already be in the space
    # (rebuild the space in that case)
    catid = '#' + unicode(category_id)
    if catid in model.vocab:
        self.remove_category_from_space(category_id)
    w_count = len(model.vocab)
    model.vocab[catid] = Vocab(index=w_count, count=w_count + 1)
    model.index2word.append(catid)
    if w_count == 0:
        model.syn0 = np.empty((1, 300), dtype=np.float32)
        model.syn0[0] = vector
    else:
        try:
            model.syn0 = np.vstack((model.syn0, vector))
        except ValueError as e:
            print(e)
            print("Vector length: {}".format(len(vector)))
            print("Space Length: {}".format(model.vector_size))
    return model
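# A standalone sketch of the matrix-growing pattern used by add_vector_to_model() above:
# each new entry appends one row to the embedding matrix via numpy.vstack. Sizes are
# illustrative only.
import numpy as np

syn0 = np.empty((0, 300), dtype=np.float32)
for _ in range(3):
    vector = np.random.rand(300).astype(np.float32)
    syn0 = np.vstack((syn0, vector))
print(syn0.shape)  # (3, 300)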
def __init__(self, token_database, document_database):
    # set the token and document databases
    self.token_database = token_database
    self.document_database = document_database

    # create the gensim model
    self.model = gensim.models.Word2Vec(size=token_database.vector_size,
                                        window=3, negative=25, sorted_vocab=0)

    # add each token from the token database to the gensim model
    for token in self.token_database:
        self.model.wv.vocab[token] = Vocab(
            count=self.token_database.get_freq(token),
            index=len(self.model.wv.index2word),
            sample_int=sys.maxint)
        self.model.wv.index2word.append(token)

    # prepare the model and copy over the existing token vectors
    self.model.finalize_vocab()
    self.model.wv.syn0 = self.token_database.get_vectors()
def _vocab_from(self, sentences):
    """
    Construct the vocabulary.
    """
    self.signals = set([])
    sentence_no, vocab = -1, {}
    total_words = 0
    for sentence_no, sentence_signal in enumerate(sentences):
        sentence, signal = self.extract_sentence_and_signal(sentence_signal)
        self.signals.add(signal)
        if sentence_no % 10000 == 0:
            logger.info(
                "PROGRESS: at sentence #%i, processed %i words and %i word types" %
                (sentence_no, total_words, len(vocab)))
        for word in sentence:
            total_words += 1
            if word in vocab:
                vocab[word].count += 1
            else:
                vocab[word] = Vocab(count=1)
    logger.info(
        "collected %i word types from a corpus of %i words and %i sentences" %
        (len(vocab), total_words, sentence_no + 1))
    return vocab
def load_word2vec_format(fname, fvocab=None, binary=False, norm_only=True, encoding='utf8'):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).

    If you trained the C model using non-utf8 encoding for words, specify that
    encoding in `encoding`.
    """
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s" % (fvocab))
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s" % (fname))
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
        result = Word2Vec(size=vector_size)
        result.syn0 = zeros((vocab_size, vector_size), dtype=REAL)
        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for line_no in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                try:
                    word = utils.to_unicode(b''.join(word), encoding=encoding)
                except UnicodeDecodeError, e:
                    logger.warning("Couldn't convert whole word to unicode: trying to convert first %d bytes only ..." % e.start)
                    word = utils.to_unicode(b''.join(word[:e.start]), encoding=encoding)
                    logger.warning("... first %d bytes converted to '%s'" % (e.start, word))
                if counts is None:
                    result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif word in counts:
                    result.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.vocab[word] = Vocab(index=line_no, count=None)
                result.index2word.append(word)
                result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
        else:
            for line_no, line in enumerate(fin):
                parts = utils.to_unicode(line[:-1], encoding=encoding).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                word, weights = parts[0], list(map(REAL, parts[1:]))
                if counts is None:
                    result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif word in counts:
                    result.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.vocab[word] = Vocab(index=line_no, count=None)
                result.index2word.append(word)
                result.syn0[line_no] = weights
    logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
    result.init_sims(norm_only)
    return result
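# A hedged, self-contained usage sketch for load_word2vec_format() above: write a tiny
# text-format file (header "vocab_size vector_size", then one word and its vector per
# line) and load it back with binary=False. The file name is illustrative, and the
# docstring's caveat applies: a model loaded this way cannot be trained further.
with open("toy_vectors.txt", "w") as fout:
    fout.write("2 3\n")
    fout.write("king 0.1 0.2 0.3\n")
    fout.write("queen 0.4 0.5 0.6\n")

model = load_word2vec_format("toy_vectors.txt", binary=False, norm_only=False)
print(model.syn0.shape)           # (2, 3)
print(model.vocab["king"].index)  # 0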
def add_word_to_vocab(self, word, count=1):
    v = Vocab(count=count)
    v.index = len(self.vocab)
    self.vocab[word] = v
    self.index2word.append(word)
    return v
def build_vocab(self, sentences, oov_word=False, report_frequency=10000):
    """
    Build vocabulary from a sequence of sentences (can be a once-only generator stream).
    Each sentence must be a list of utf8 strings.
    """
    print("build vocab")
    path = (re.sub("/", "_", sentences.fname) + ("(mc=%d)" % (self.min_count)) + ".vocab") if hasattr(sentences, "fname") else None

    if path != None and file_exists(path):
        logger.info("loading from saved vocab list at \"%s\"" % (path))
        file = gzip.open(path, 'r')
        saved_vocab = pickle.load(file)
        file.close()
        self.index2word = saved_vocab["index2word"]
        self.vocab = saved_vocab["vocab"]
        if oov_word:
            self.add_oov_word(count=10000)
        self.create_binary_tree()
        self.reset_weights()
    else:
        logger.info("collecting all words and their counts")
        prev_sentence_no = -1
        sentence_no, vocab = -1, {}
        total_words = 0
        assign_to_vocab = vocab.__setitem__  # slight performance gain
        # https://wiki.python.org/moin/PythonSpeed/PerformanceTips
        get_from_vocab = vocab.__getitem__
        for sentence_no, sentence in enumerate(sentences):
            if prev_sentence_no == sentence_no:
                break
            if sentence_no % report_frequency == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types" %
                    (sentence_no, total_words, len(vocab)))
            for word in sentence:
                if word in vocab:
                    get_from_vocab(word).count += 1
                else:
                    assign_to_vocab(word, Vocab(count=1))
            total_words += len(sentence)
            prev_sentence_no = sentence_no
        logger.info(
            "collected %i word types from a corpus of %i words and %i sentences" %
            (len(vocab), total_words, sentence_no + 1))
        # assign a unique index to each word
        self.vocab, self.index2word = {}, []
        append = self.index2word.append
        assign_to_vocab = self.vocab.__setitem__
        for word, v in vocab.items():
            if v.count >= self.min_count:
                v.index = len(self.vocab)
                append(word)
                assign_to_vocab(word, v)
        # add the special out of vocabulary word **UNKNOWN**:
        if oov_word:
            self.add_oov_word(count=len(vocab) - len(self.vocab))
        logger.info("total %i word types after removing those with count<%s" %
                    (len(self.vocab), self.min_count))
        # add info about each word's Huffman encoding
        self.create_binary_tree()
        self.reset_weights()
        if path != None:
            logger.info("saving vocab list in \"%s\"" % (path))
            with gzip.open(path, 'wb') as file:
                pickle.dump({"vocab": self.vocab, "index2word": self.index2word}, file, 1)
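# A standalone sketch of the gzip+pickle vocabulary cache that build_vocab() above
# writes and reloads; the cache file name is illustrative, and Vocab is assumed to be
# importable from gensim.models.word2vec as in the other snippets on this page.
import gzip
import pickle
from gensim.models.word2vec import Vocab

vocab = {"the": Vocab(count=120, index=0), "cat": Vocab(count=7, index=1)}
index2word = ["the", "cat"]

with gzip.open("corpus(mc=5).vocab", "wb") as f:
    pickle.dump({"vocab": vocab, "index2word": index2word}, f, 1)

with gzip.open("corpus(mc=5).vocab", "rb") as f:
    saved = pickle.load(f)
print(saved["index2word"])  # ['the', 'cat']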
def scale_vocab(self, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None, update=False):
    """
    Apply vocabulary settings for `min_count` (discarding less-frequent words) and
    `sample` (controlling the downsampling of more-frequent words).

    Calling with `dry_run=True` will only simulate the provided settings and report
    the size of the retained vocabulary, effective corpus length, and estimated memory
    requirements. Results are both printed via logging and returned as a dict.

    Delete the raw vocabulary after the scaling is done to free up RAM, unless
    `keep_raw_vocab` is set.
    """
    sample = sample or self.sample
    logger.info("Loading a fresh vocabulary")

    # Discard words less-frequent than min_count
    if not dry_run:
        self.index2word = []
        # make stored settings match these applied settings
        self.sample = sample
        self.vocab = {}

    for word, v in iteritems(self.raw_vocab):
        if not dry_run:
            self.vocab[word] = Vocab(count=v, index=len(self.index2word))
            self.index2word.append(word)
    retain_total = self.total_words

    # Precalculate each vocabulary item's threshold for sampling
    if not sample:
        # no words downsampled
        threshold_count = retain_total
    elif sample < 1.0:
        # traditional meaning: set parameter as proportion of total
        threshold_count = sample * retain_total
    else:
        # new shorthand: sample >= 1 means downsample all words with
        # higher count than sample
        threshold_count = int(sample * (3 + sqrt(5)) / 2)

    downsample_total, downsample_unique = 0, 0
    for w in self.raw_vocab.iterkeys():
        v = self.raw_vocab[w]
        word_probability = (sqrt(v / threshold_count) + 1) * (threshold_count / v)
        if word_probability < 1.0:
            downsample_unique += 1
            downsample_total += word_probability * v
        else:
            word_probability = 1.0
            downsample_total += v
        if not dry_run:
            self.vocab[w].sample_int = int(round(word_probability * 2**32))

    if not dry_run and not keep_raw_vocab:
        logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab))
        self.raw_vocab = defaultdict(int)

    logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique)
    logger.info(
        "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)",
        downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total)

    # print extra memory estimates
    memory = self.estimate_memory(vocab_size=len(self.vocab))
    return memory
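# A worked example of the downsampling rule used in scale_vocab() above:
#   word_probability = (sqrt(count / threshold_count) + 1) * (threshold_count / count)
# and the 32-bit sample_int threshold derived from it. The numbers are illustrative.
from math import sqrt

sample = 1e-3                              # sample < 1.0: interpreted as a proportion of the corpus
retain_total = 10000000                    # total retained word count
threshold_count = sample * retain_total    # 10000.0
count = 250000.0                           # a very frequent word
word_probability = (sqrt(count / threshold_count) + 1) * (threshold_count / count)
print(word_probability)                      # ~0.24 -> keep roughly a quarter of its occurrences
print(int(round(word_probability * 2**32)))  # value stored in Vocab.sample_int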
def load_word2vec_format_filtered(fname, vocab, fvocab=None, binary=False, norm_only=True):
    """
    Like Word2Vec's loader, but allows you to restrict to a limited vocabulary.
    """
    vocab = set(vocab)
    counts = None
    if fvocab is not None:
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline())
        vocab_size, layer1_size = map(int, header.split())  # throws for invalid file format
        # We know we only need to store the number of things in the vocab
        vocab_size = len(vocab)
        result = Word2Vec(size=layer1_size)
        result.syn0 = zeros((vocab_size, layer1_size), dtype=REAL)
        word_num = 0

        if binary:
            binary_len = dtype(REAL).itemsize * layer1_size
            while word_num < vocab_size:
                # mixed text and binary: read text first, then binary
                word = read_word(fin)
                if word is None:
                    # Reached EOF
                    break
                # Only store the vectors for words in the given vocabulary
                if word in vocab:
                    vocab.remove(word)
                    if counts is None:
                        result.vocab[word] = Vocab(index=word_num, count=vocab_size - word_num)
                    elif word in counts:
                        result.vocab[word] = Vocab(index=word_num, count=counts[word])
                    else:
                        result.vocab[word] = Vocab(index=word_num, count=None)
                    result.index2word.append(word)
                    result.syn0[word_num] = fromstring(fin.read(binary_len), dtype=REAL)
                    word_num += 1
                else:
                    # Skip this vector
                    fin.read(binary_len)
        else:
            for line_no, line in enumerate(fin):
                parts = utils.to_unicode(line).split()
                if len(parts) != layer1_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                word, weights = parts[0], map(REAL, parts[1:])
                if word in vocab:
                    vocab.remove(word)
                    if counts is None:
                        result.vocab[word] = Vocab(index=word_num, count=vocab_size - word_num)
                    elif word in counts:
                        result.vocab[word] = Vocab(index=word_num, count=counts[word])
                    else:
                        result.vocab[word] = Vocab(index=word_num, count=None)
                    result.index2word.append(word)
                    result.syn0[word_num] = weights
                    word_num += 1
                    if word_num >= vocab_size:
                        # Got all we need: don't carry on reading
                        break

    # Get rid of the empty vectors at the end if not all words were found
    if word_num < vocab_size:
        result.syn0 = result.syn0[:word_num].copy()

    result.init_sims(norm_only)
    return result
s = corpus \
    .flatMap(lambda s: [(w, 1) for w in s]) \
    .reduceByKey(lambda a, b: a + b) \
    .filter(lambda x: x[1] >= 5) \
    .map(lambda x: (x[1], x[0])) \
    .collect()
    #.sortByKey(False) \
    #.collect()

vocab = {}
for i, (c, w) in enumerate(s):
    if i >= 1000000:
        break
    if (i + 1) % 100000 == 0:
        print i + 1
    vocab[w] = Vocab(count=c)


def build_vocab(model, vocab):
    model.word_count = long(0)
    model.total_words = long(0)
    model.vocab, model.index2word = {}, []
    for word, v in vocab.iteritems():
        if v.count >= model.min_count:
            v.index = len(model.vocab)
            model.index2word.append(word)
            model.vocab[word] = v
            model.total_words += v.count
    print "total %i word types after removing those with count<%s" % (len(model.vocab), model.min_count)
    if model.hs: