def load_word2vec_format(cls, fname, fvocab=None, binary=False, norm_only=True):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `fname` is the file to read. Its first line must consist of two integers:
    the vocabulary size and the vector dimensionality.

    `binary` is a boolean indicating whether the data is in binary word2vec format.

    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors
    in memory (it is passed through to `init_sims`).

    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool). Words missing from that file get
    a count of `None`; without `fvocab`, word number `i` gets the placeholder
    count `vocab_size - i`.

    Returns the newly created `Word2Vec` instance.

    Raises `ValueError` if the header or a text-format line is malformed, or if a
    binary vector is shorter than expected, and `EOFError` if a binary file ends
    in the middle of a word.

    """
    counts = None
    if fvocab is not None:
        #logger.info("loading word counts from %s" % (fvocab))
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    #logger.info("loading projection weights from %s" % (fname))
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline())
        vocab_size, layer1_size = map(int, header.split())  # throws for invalid file format
        result = Word2Vec(size=layer1_size)
        result.syn0 = zeros((vocab_size, layer1_size), dtype=REAL)

        def add_word(word, index):
            """Register `word` as vocabulary entry number `index` of `result`."""
            if counts is None:
                # no counts file given: use a placeholder that decreases with the row index
                count = vocab_size - index
            elif word in counts:
                count = counts[word]
            else:
                #logger.warning("vocabulary file is incomplete")
                count = None
            result.vocab[word] = Vocab(index=index, count=count)
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * layer1_size
            for line_no in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                chars = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if not ch:
                        # read() returns an empty string at EOF; without this check
                        # a truncated file would keep this loop spinning forever
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have newline, some don't)
                        chars.append(ch)
                word = utils.to_unicode(b''.join(chars), encoding='latin-1')
                add_word(word, line_no)
                data = fin.read(binary_len)
                if len(data) != binary_len:
                    # fail loudly: a short buffer could otherwise be broadcast into the row
                    raise ValueError("truncated vector for word number %s (expected %s bytes, got %s)" %
                                     (line_no, binary_len, len(data)))
                result.syn0[line_no] = fromstring(data, dtype=REAL)
        else:
            for line_no, line in enumerate(fin):
                parts = utils.to_unicode(line).split()
                if len(parts) != layer1_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                # build a real list: under Python 3 `map` is lazy and cannot be stored in an array row
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                add_word(word, line_no)
                result.syn0[line_no] = weights

    #logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
    result.init_sims(norm_only)
    return result