def read_embeddings(embeddings_filename, max_rank_emb):
    """Reads a .vector or .bin file and modifies it to include <OOV>, <PADDING> and <SENTROOT>."""
    binary = embeddings_filename.endswith(".bin")
    gensim_vectors = KeyedVectors.load_word2vec_format(
        embeddings_filename, binary=binary, limit=max_rank_emb,
        unicode_errors='replace')
    # Shift the pre-trained words by three positions first, then register the
    # special tokens so their indices are not shifted as well.
    for word_record in gensim_vectors.vocab.values():
        word_record.index += 3
    gensim_vectors.vocab["<PADDING>"] = Vocab(index=0)
    gensim_vectors.vocab["<OOV>"] = Vocab(index=1)
    gensim_vectors.vocab["<SENTROOT>"] = Vocab(index=2)
    # Three small random rows for the special tokens, stacked on top of the
    # pre-trained embedding matrix.
    special_rows = numpy.random.uniform(
        low=-0.01, high=0.01, size=(3, gensim_vectors.vectors.shape[1]))
    gensim_vectors.vectors = numpy.vstack([special_rows, gensim_vectors.vectors])
    gensim_vectors.vectors = keras.utils.normalize(gensim_vectors.vectors, axis=0)
    gensim_vectors.vectors = keras.utils.normalize(gensim_vectors.vectors)
    return gensim_vectors
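# Hypothetical invocation of read_embeddings above; "embeddings.bin" is a
# placeholder path to a binary word2vec file, 100000 merely caps how many
# pre-trained rows are loaded, and the module-level imports the function
# relies on (KeyedVectors, Vocab, numpy, keras) are assumed to be present.
vectors = read_embeddings("embeddings.bin", max_rank_emb=100000)
print(vectors.vocab["<OOV>"].index, vectors.vectors.shape)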
def add_word(word, weights):
    # Nested helper: mutates `result` and reads `vocab_size` from the enclosing scope.
    word_id = len(result.vocab)
    if word in result.vocab:
        return
    result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
    result.syn0[word_id] = weights
    result.index2word.append(word)
def create_keyedvector_from_matrix(self, embedding_matrix, word2id):
    """
    Imports the necessary attributes for the Embedding object from an
    embedding matrix and a word2id vocabulary. Can be used for custom
    pre-trained embeddings.

    Parameters
    ----------
    embedding_matrix: numpy.ndarray
        Embedding matrix as a numpy object
    word2id: dict
        Word vocabulary (key: word, value: word_index)
    """
    vocab = {
        word: word2id[word]
        for word in sorted(word2id, key=word2id.__getitem__)
    }
    vector_size = embedding_matrix.shape[1]
    kv = KeyedVectors(vector_size)
    kv.vector_size = vector_size
    kv.vectors = embedding_matrix
    kv.index2word = list(vocab.keys())
    kv.vocab = {
        word: Vocab(index=word_id, count=0)
        for word, word_id in vocab.items()
    }
    self.embedding = kv
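# A minimal sketch of calling the method above; the SimpleNamespace stand-in
# for `self` and the toy matrix/vocabulary are made up, and gensim 3.x
# (where KeyedVectors still exposes `vocab` and `index2word`) is assumed.
from types import SimpleNamespace

import numpy as np
from gensim.models import KeyedVectors
from gensim.models.keyedvectors import Vocab

matrix = np.random.rand(3, 5).astype(np.float32)
word2id = {"cat": 0, "dog": 1, "bird": 2}
holder = SimpleNamespace()
create_keyedvector_from_matrix(holder, matrix, word2id)
print(holder.embedding["cat"])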
def __init__(self, model_path, model_params_path, dim, dtype="float32",
             dtype_size=4, header_size=32):
    self.header_size_bites = header_size * dtype_size
    self.dtype_size = dtype_size
    self.dtype = dtype
    self.dim = dim
    with open(model_params_path) as fd:
        params = json.load(fd)
    self.model = gensim.models.keyedvectors.FastTextKeyedVectors(
        vector_size=dim,
        min_n=params['hash_params']['minn'],
        max_n=params['hash_params']['maxn'],
        bucket=params['hash_params']['num_buckets'],
        compatible_hash=params['hash_params']['fb_compatible'])
    self.model.vectors_vocab = self.load_matrix(f'{model_path}.vectors_vocab.npy')
    self.model.vectors = self.load_matrix(f'{model_path}.vectors.npy')
    self.model.vectors_ngrams = self.load_matrix(f'{model_path}.vectors_ngrams.npy')
    self.model.vocab = dict(
        (word, Vocab(index=idx, count=1))
        for word, idx in params['vocab'].items())
def _load_tree(self, tree: dict) -> None:
    self.__dict__.update(tree)
    self.tokens = split_strings(self.tokens)
    self.frequencies = {
        w: self.frequencies["vals"][i]
        for i, w in enumerate(split_strings(self.frequencies["keys"]))}
    self.checker = SymSpell(max_dictionary_edit_distance=self.max_distance)
    self.checker.__dict__.update(tree["checker"])
    deletes = {}
    words = split_strings(self.checker._deletes["strings"])
    lengths = self.checker._deletes["lengths"]
    data = self.checker._deletes["data"]
    offset = 0
    for i, delindex in enumerate(self.checker._deletes["indexes"]):
        length = lengths[i]
        deletes[delindex] = [words[j] for j in data[offset:offset + length]]
        offset += length
    self.checker._deletes = deletes
    self.checker._words = {w: self.checker._words[i] for i, w in enumerate(words)}
    vectors = self.wv["vectors"]
    wv = FastTextKeyedVectors(vectors.shape[1], self.wv["min_n"], self.wv["max_n"],
                              self.wv["bucket"], True)
    wv.vectors = numpy.array(vectors)
    vocab = split_strings(self.wv["vocab"]["strings"])
    wv.vocab = {
        s: Vocab(index=i, count=self.wv["vocab"]["counts"][i])
        for i, s in enumerate(vocab)}
    wv.bucket = self.wv["bucket"]
    wv.index2word = wv.index2entity = vocab
    wv.num_ngram_vectors = self.wv["num_ngram_vectors"]
    wv.vectors_ngrams = numpy.array(self.wv["vectors_ngrams"])
    wv.hash2index = {k: v for v, k in enumerate(self.wv["hash2index"])}
    self.wv = wv
def load_vocab(wv, vocab_file, use_glove_format, restrict_vocab):
    # Read the vocabulary file.
    vocab_size = 0
    with open(vocab_file, "r") as f:
        wv.index2freq = []
        all_lines = f.readlines()[:restrict_vocab] if restrict_vocab > 0 else f.readlines()
        for index, line in enumerate(all_lines):
            if use_glove_format:
                # The GloVe vocab file has no index column; keep the 0-based enumeration index.
                word, count = line.strip().split(" ")
            else:
                index, word, count = line.strip().split("\t")
                index = int(index) - 1  # indexing starts at 1 in the file; for co-occ we use 0-based indexing
            wv.index2word.append(word)
            wv.vocab[word] = Vocab(index=index, count=int(count))
            wv.index2freq.append(count)
            vocab_size += 1
    wv.index2freq = array(wv.index2freq).astype(uint32)
    # Unused member from VanillaWordEmbeddingsKeyedVectors.
    wv.vectors_norm = None
    print("Loaded vocabulary with {} words".format(vocab_size))
    return vocab_size
def convert_graph_embedding_to_gensim(
        model_path: Union[str, Path],
        dataset_path: Union[str, Path],
        embedding_path: Union[str, Path] = None) -> Path:
    """
    Example usage:

        convert_graph_embedding_to_gensim(
            model_path=(
                DEFAULT_OUTPUT_PATH
                / 'reviews_Cell_Phones_and_Accessories-50000-docs'
                / 'our'
                / 'aspect_2_aspect_graph-en_core_web_lg.en_core_web_lg.model').as_posix(),
            dataset_path=(
                DEFAULT_OUTPUT_PATH
                / 'reviews_Cell_Phones_and_Accessories-50000-docs'
                / 'our'
                / 'aspect_2_aspect_graph-en_core_web_lg.en_core_web_lg.dataset').as_posix()
        )
    """
    if embedding_path is None:
        embedding_path = Path(model_path).with_suffix('.word2vec_format.bin')
    model = torch.load(model_path)
    dataset = torch.load(dataset_path)
    _save_word2vec_format(
        embedding_path,
        vocab={
            word.replace(' ', '_'): Vocab(index=index, count=1)
            for word, index in dataset[0].nodes_mapping[0].items()
        },
        vectors=model.embedding.weight.detach().numpy(),
        binary=True)
    return embedding_path
def add_word(word, weights):
    word_id = len(result.vocab)
    if word in result.vocab:
        logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
        return
    if counts is None:
        # Most common scenario: no vocab file given. Just make up some bogus
        # counts, in descending order.
        result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
    elif word in counts:
        # Use the count from the vocab file.
        result.vocab[word] = Vocab(index=word_id, count=counts[word])
    else:
        # Vocab file given, but the word is missing -- set count to None (TODO: or raise?)
        logger.warning("vocabulary file is incomplete: '%s' is missing", word)
        result.vocab[word] = Vocab(index=word_id, count=None)
    result.vectors[word_id] = weights
    result.index2word.append(word)
def generate_vocab(walks):
    index2word = []
    raw_vocab = defaultdict(int)

    for walk in walks:
        for word in walk:
            raw_vocab[word] += 1

    vocab = {}
    for word, v in iteritems(raw_vocab):
        vocab[word] = Vocab(count=v, index=len(index2word))
        index2word.append(word)

    index2word.sort(key=lambda word: vocab[word].count, reverse=True)
    for i, word in enumerate(index2word):
        vocab[word].index = i

    return vocab, index2word
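# A small usage sketch for generate_vocab above with made-up random walks;
# the imports mirror what the function itself relies on (defaultdict,
# six.iteritems, gensim 3.x Vocab).
from collections import defaultdict

from gensim.models.keyedvectors import Vocab
from six import iteritems

walks = [["a", "b", "c"], ["b", "c", "b"]]
vocab, index2word = generate_vocab(walks)
print(index2word)  # most frequent node first, e.g. ['b', 'c', 'a']
print({w: (v.index, v.count) for w, v in vocab.items()})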
def add_embeddings(keyed_vectors, *words, init=None):
    from gensim.models.keyedvectors import Vocab
    if init is None:
        init = np.zeros
    first_new_index = keyed_vectors.syn0.shape[0]
    for word in words:
        # Re-read syn0 each iteration so every new word gets its own row and index.
        syn0 = keyed_vectors.syn0
        keyed_vectors.vocab[word] = Vocab(count=0, index=syn0.shape[0])
        keyed_vectors.syn0 = np.concatenate([syn0, init((1, syn0.shape[1]))])
        keyed_vectors.index2word.append(word)
    # Index of the first word that was added.
    return first_new_index
def save_gensim_model(words, word_reprs, output_file, binary=True):
    """Save word representations in w2v format. Word order is not preserved."""
    vocab = dict()
    for word in words:
        vocab[word] = Vocab(index=len(vocab))
    model = Word2VecKeyedVectors(word_reprs.shape[1])
    model.vocab = vocab
    model.vectors = word_reprs
    model.save_word2vec_format(fname=output_file, binary=binary)
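# Hypothetical call to save_gensim_model above: two random 4-dimensional
# vectors written to the made-up path "toy_vectors.bin" (gensim 3.x assumed).
import numpy as np
from gensim.models.keyedvectors import Word2VecKeyedVectors, Vocab

words = ["hello", "world"]
word_reprs = np.random.rand(2, 4).astype(np.float32)
save_gensim_model(words, word_reprs, "toy_vectors.bin", binary=True)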
def generate_vocab(all_walks):
    index2word = []
    raw_vocab = defaultdict(int)

    # Count how many times each word (node) appears across the walks of every layer.
    for walks in all_walks:
        for walk in walks:
            for word in walk:
                raw_vocab[word] += 1

    vocab = {}
    # Build the vocabulary, giving each word a provisional index.
    for word, v in iteritems(raw_vocab):
        vocab[word] = Vocab(count=v, index=len(index2word))
        index2word.append(word)

    # Sort words by count in descending order and re-assign indices accordingly.
    index2word.sort(key=lambda word: vocab[word].count, reverse=True)
    for i, word in enumerate(index2word):
        vocab[word].index = i

    return vocab, index2word
def load(root_model):
    with open(root_model + '.txt', encoding='utf-8') as f:
        index_vocab = [t.strip() for t in f]
    vocab = {w: Vocab(index=i) for (i, w) in enumerate(index_vocab)}
    vectors = np.load(root_model + '.npy')
    m = Word2Vec()
    m.wv.index2word = index_vocab
    m.wv.vectors = vectors
    m.wv.vocab = vocab
    return m
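# Hypothetical round trip for load above: write a word list and a matching
# vector matrix under the made-up prefix "toy_model", then reload them
# (gensim 3.x Word2Vec/Vocab assumed).
import numpy as np
from gensim.models import Word2Vec
from gensim.models.keyedvectors import Vocab

with open("toy_model.txt", "w", encoding="utf-8") as f:
    f.write("cat\ndog\n")
np.save("toy_model.npy", np.random.rand(2, 4).astype(np.float32))

m = load("toy_model")
print(m.wv.index2word, m.wv.vectors.shape)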
def add_embeddings(keyed_vectors, *words, init=None):
    import numpy as np
    from gensim.models.keyedvectors import Vocab
    if init is None:
        init = np.zeros
    first_new_index = keyed_vectors.vectors.shape[0]
    for word in words:
        # Re-read vectors each iteration so every new word gets its own row and index.
        vectors = keyed_vectors.vectors
        keyed_vectors.vocab[word] = Vocab(count=0, index=vectors.shape[0])
        keyed_vectors.vectors = np.concatenate(
            [vectors, init((1, vectors.shape[1]))])
        keyed_vectors.index2word.append(word)
    # Index of the first word that was added.
    return first_new_index
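# Hypothetical usage of add_embeddings above on a tiny gensim 3.x
# KeyedVectors instance; "<pad>" and "<unk>" are example tokens that start
# out as zero vectors.
import numpy as np
from gensim.models import KeyedVectors
from gensim.models.keyedvectors import Vocab

kv = KeyedVectors(vector_size=3)
kv.vocab = {"cat": Vocab(index=0, count=1)}
kv.index2word = ["cat"]
kv.vectors = np.random.rand(1, 3).astype(np.float32)

first_new_index = add_embeddings(kv, "<pad>", "<unk>")
print(first_new_index, kv.index2word, kv.vectors.shape)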
def add(self, wwc: Collection[Tuple[str, np.ndarray, int]]) -> Iterator[Vocab]:
    """`wwc` is a collection of (word, weights, count) tuples."""
    vocab_size, vector_size = self.vectors.shape
    self.vectors.resize((vocab_size + len(wwc), vector_size))
    for i, (word, weights, count) in enumerate(wwc):
        word_id = vocab_size + i
        voc = Vocab(index=word_id, count=count)
        self.vocab[word] = voc
        self.vectors[word_id] = weights
        self.index2word.append(word)
        yield voc
def generate_vocab(all_walks):
    index2word = []
    raw_vocab = defaultdict(int)

    for walks in all_walks:
        for walk in walks:
            for word in walk:
                raw_vocab[word] += 1

    vocab = {}
    for word, v in iteritems(raw_vocab):
        vocab[word] = Vocab(count=v, index=len(index2word))
        index2word.append(word)

    # Sort the words by how often they appear, from most to least frequent.
    index2word.sort(key=lambda word: vocab[word].count, reverse=True)
    for i, word in enumerate(index2word):
        vocab[word].index = i

    # vocab maps each node to its count and its rank in the frequency ordering;
    # index2word is the list of nodes in that order.
    return vocab, index2word
def _set_keyedvector(self, attrname, keys, dim, vec=None):
    keyed_vec = KeyedVectors(dim)

    dummy_max_count = len(keys) + 1
    for i, key in enumerate(keys):
        key = str(key)
        keyed_vec.vocab[key] = Vocab(index=i, count=dummy_max_count - i)  # dummy count
        keyed_vec.index2word.append(key)

    if vec is not None:
        keyed_vec.vectors = vec
        keyed_vec.init_sims()

    setattr(self, attrname, keyed_vec)
def load_emb(fp):
    n_vocab, dim = map(int, fp.readline().split())
    emb = KeyedVectors(dim)
    emb.vectors = np.empty((n_vocab, dim), dtype=np.float32)
    for i, line in tqdm(enumerate(fp), total=n_vocab, unit='word'):
        word, vec_str = line.split(' ', 1)
        emb.vectors[i] = np.fromstring(vec_str, sep=' ')
        emb.vocab[word] = Vocab(index=i)
        emb.index2word.append(word)
    return emb
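# Sketch of calling load_emb above on an in-memory, word2vec-style text file;
# the two-word vocabulary is made up, and the imports cover everything the
# function itself assumes at module level.
import io

import numpy as np
from gensim.models import KeyedVectors
from gensim.models.keyedvectors import Vocab
from tqdm import tqdm

fake_file = io.StringIO(
    "2 3\n"
    "cat 0.1 0.2 0.3\n"
    "dog 0.4 0.5 0.6\n")
emb = load_emb(fake_file)
print(emb.index2word, emb.vectors.shape)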
def _create_pv(self):
    try:
        vectors = KeyedVectors()
    except TypeError:
        # Newer versions of gensim require a constructor argument.
        vectors = KeyedVectors(self._wv.shape[1])
    phrases = []
    for name in self.terminology.iter_names():  # This iterates over unique names.
        vectors.vocab[name] = Vocab(index=len(vectors.vocab), count=None)
        vectors.index2word.append(name)
        phrases.append(self._phrase_vector(name))
    vectors.syn0 = vectors.syn0norm = np.array(phrases)
    return vectors
def generate_vocab(all_walks):
    index2word = []
    raw_vocab = defaultdict(int)  # stores the count of each word appearing in the walks

    for walks in all_walks:
        for walk in walks:
            for word in walk:
                raw_vocab[word] += 1

    # Compute counts, sort by count, then re-assign indices after sorting.
    vocab = {}
    for word, v in iteritems(raw_vocab):
        vocab[word] = Vocab(count=v, index=len(index2word))  # vocab[word] = (count, index)
        index2word.append(word)  # index2word[index] == word

    index2word.sort(key=lambda word: vocab[word].count, reverse=True)  # descending order
    for i, word in enumerate(index2word):
        vocab[word].index = i  # final word -> index mapping after sorting

    return vocab, index2word
def update(self):
    wv = self.word_vectors_file.get_word_vectors()
    voc = self.vocabs_file.get_vocabs()['word']
    words_in_vocab = [
        k for k, _ in sorted(voc.items(), key=lambda i: i[1][0])
    ]
    word_embs = wv[words_in_vocab[1:]]
    unk_emb = np.mean(word_embs, 0, keepdims=True)
    embs = np.concatenate((unk_emb, word_embs), 0)
    kv = KeyedVectors(embs.shape[1])
    kv.syn0 = embs
    kv.vocab = dict(
        (k, Vocab(index=v[0], count=v[1])) for k, v in voc.items())
    kv.index2word = words_in_vocab
    kv.save(self.path)
def __create_keyed_vector(matrix, orig_vocab):
    vocab = dict()
    index_to_word = []
    for word, word_id in sorted(orig_vocab.token2id.items(), key=itemgetter(1)):
        index_to_word.append(word)
        vocab[word] = Vocab(index=word_id, count=orig_vocab.word_freq[word_id])

    vector_size = matrix.shape[1]
    keyed_vector = KeyedVectors(vector_size)
    keyed_vector.vector_size = vector_size
    keyed_vector.vocab = vocab
    keyed_vector.index2word = index_to_word
    keyed_vector.vectors = matrix
    assert (len(vocab), vector_size) == keyed_vector.vectors.shape
    return keyed_vector
def _load_dict(self, file_handle, encoding='utf8'):
    """Load a previously saved dictionary from disk, stored in Facebook's native fasttext format.

    Parameters
    ----------
    file_handle : file-like object
        The opened file handle to the persisted dictionary.
    encoding : str
        Specifies the encoding.

    """
    vocab_size, nwords, nlabels = self.struct_unpack(file_handle, '@3i')

    # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
    if nlabels > 0:
        raise NotImplementedError("Supervised fastText models are not supported")
    logger.info("loading %s words for fastText model from %s", vocab_size, self.file_name)

    self.struct_unpack(file_handle, '@1q')  # number of tokens
    if self.new_format:
        pruneidx_size, = self.struct_unpack(file_handle, '@q')
    for i in range(vocab_size):
        word_bytes = b''
        char_byte = file_handle.read(1)
        # Read vocab word
        while char_byte != b'\x00':
            word_bytes += char_byte
            char_byte = file_handle.read(1)
        word = word_bytes.decode(encoding)
        count, _ = self.struct_unpack(file_handle, '@qb')

        self.wv.vocab[word] = Vocab(index=i, count=count)
        self.wv.index2word.append(word)

    assert len(self.wv.vocab) == nwords, (
        'mismatch between final vocab size ({} words), '
        'and expected number of words ({} words)'.format(len(self.wv.vocab), nwords))
    if len(self.wv.vocab) != vocab_size:
        # expecting to log this warning only for pretrained french vector, wiki.fr
        logger.warning(
            "mismatch between final vocab size (%s words), and expected vocab size (%s words)",
            len(self.wv.vocab), vocab_size)

    if self.new_format:
        for j in range(pruneidx_size):
            self.struct_unpack(file_handle, '@2i')
def load_dict(self, file_handle, encoding='utf8'):
    vocab_size, nwords, _ = self.struct_unpack(file_handle, '@3i')
    # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
    logger.info("loading %s words for fastText model from %s", vocab_size, self.file_name)
    self.struct_unpack(file_handle, '@1q')  # number of tokens
    if self.new_format:
        pruneidx_size, = self.struct_unpack(file_handle, '@q')
    for i in range(vocab_size):
        word_bytes = b''
        char_byte = file_handle.read(1)
        # Read vocab word
        while char_byte != b'\x00':
            word_bytes += char_byte
            char_byte = file_handle.read(1)
        word = word_bytes.decode(encoding)
        count, _ = self.struct_unpack(file_handle, '@qb')

        if i == nwords and i < vocab_size:
            # To handle the error in pretrained vector wiki.fr (French).
            # For more info: https://github.com/facebookresearch/fastText/issues/218
            assert word == "__label__", (
                'mismatched vocab_size ({}) and nwords ({}), extra word "{}"'
                .format(vocab_size, nwords, word))
            continue  # don't add word to vocab

        self.wv.vocab[word] = Vocab(index=i, count=count)
        self.wv.index2word.append(word)

    assert len(self.wv.vocab) == nwords, (
        'mismatch between final vocab size ({} words), '
        'and expected number of words ({} words)'.format(
            len(self.wv.vocab), nwords))
    if len(self.wv.vocab) != vocab_size:
        # expecting to log this warning only for pretrained french vector, wiki.fr
        logger.warning(
            "mismatch between final vocab size (%s words), and expected vocab size (%s words)",
            len(self.wv.vocab), vocab_size)
    if self.new_format:
        for j in range(pruneidx_size):
            self.struct_unpack(file_handle, '@2i')
def load_a_format(cls, fin, vocab_size, vector_size, datatype=np.float32, discard=None):
    # type: (Iterable[str], int, int, Any, Optional[Callable[[str], bool]]) -> KeyedVectorsOriginal
    discard_given = discard is not None
    discard = discard or (lambda word: False)
    result = cls(vector_size)
    result.vector_size = vector_size
    result.vectors = np.zeros((vocab_size, vector_size), dtype=datatype)
    for line in fin:
        word, vect = line.rstrip().split(" ", 1)
        if discard(word):
            continue
        weights = np.fromstring(vect, sep=" ", dtype=np.float32)
        # raise ValueError("invalid vector on line %s; is vector_size incorrect or file otherwise damaged?" % (i+1,))
        if word in result.vocab:
            raise DuplicateEntry(word)
        word_id = len(result.index2word)
        result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
        result.vectors[word_id] = weights
        result.index2word.append(word)
    real_size = len(result.index2word)
    if real_size != vocab_size:
        if not discard_given:
            raise EOFError(
                "unexpected end of input; is vocab_size incorrect or file otherwise damaged?")
        else:
            result.vectors.resize((real_size, vector_size))  # this should be no-copy
    return result
def prep_embeddings_fast(ft, emb_func, limit_vocab_size=30000):
    w2v = Word2Vec()
    emb = None
    if type(emb_func) is np.ndarray:
        size = min(limit_vocab_size, emb_func.shape[0])
        emb = emb_func[:size]
    else:
        emb = np.zeros((limit_vocab_size, ft.D))
        for i in range(limit_vocab_size):
            emb[i], _ = emb_func(ft.id2word[i])
        size = emb.shape[0]
    w2v.index2word = ft.id2word[:size]
    w2v.vector_size = ft.D
    w2v.syn0 = emb
    dvocab = {}
    for word_id, word in enumerate(w2v.index2word):
        dvocab[word] = Vocab(index=word_id, count=ft.nwords - word_id)
    w2v.vocab = dvocab
    return w2v
def main():
    """Entry point."""
    parser = argparse.ArgumentParser("AWD-LSTM Embeddings to Word Vectors")
    parser.add_argument("--model", required=True)
    parser.add_argument("--dictionary", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    dictionary = torch.load(args.dictionary)
    model = torch.load(args.model, map_location='cpu')
    embeddings = model[0].encoder.weight.data.cpu().numpy()

    kv = KeyedVectors(embeddings.shape[1])
    kv.syn0 = embeddings
    kv.vocab = {
        w: Vocab(index=i)
        for i, w in enumerate(dictionary.dictionary.idx2word)
    }
    kv.index2word = dictionary.dictionary.idx2word
    kv.save(args.output)
def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False,
                  trim_rule=None, min_count=None, sample=None, dry_run=False):
    min_count = min_count or self.min_count
    sample = sample or self.sample
    drop_total = drop_unique = 0

    if not update:
        raise Exception('Nonce2Vec can only update a pre-existing vocabulary')

    logger.info('Updating model with new vocabulary')
    new_total = pre_exist_total = 0
    # New words and pre-existing words are two separate lists
    new_words = []
    pre_exist_words = []
    if self.nonce is not None:
        # if self.nonce is not None and self.nonce in wv.vocab:
        if self.nonce in wv.vocab:
            gold_nonce = '{}_true'.format(self.nonce)
            nonce_index = wv.vocab[self.nonce].index
            wv.vocab[gold_nonce] = wv.vocab[self.nonce]
            wv.index2word[nonce_index] = gold_nonce
            # del wv.index2word[wv.vocab[self.nonce].index]
            del wv.vocab[self.nonce]

    for word, v in iteritems(self.raw_vocab):
        # Update count of all words already in vocab
        if word in wv.vocab:
            pre_exist_words.append(word)
            pre_exist_total += v
            if not dry_run:
                wv.vocab[word].count += v
        else:
            # For new words, keep the ones above the min count
            # AND the nonce (regardless of count)
            if keep_vocab_item(word, v, min_count, trim_rule=trim_rule) or word == self.nonce:
                new_words.append(word)
                new_total += v
                if not dry_run:
                    wv.vocab[word] = Vocab(count=v, index=len(wv.index2word))
                    wv.index2word.append(word)
            else:
                drop_unique += 1
                drop_total += v

    original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique
    pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1)
    new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1)
    logger.info('New added %i unique words (%i%% of original %i) '
                'and increased the count of %i pre-existing words '
                '(%i%% of original %i)', len(new_words), new_unique_pct,
                original_unique_total, len(pre_exist_words),
                pre_exist_unique_pct, original_unique_total)
    retain_words = new_words + pre_exist_words
    retain_total = new_total + pre_exist_total

    # Precalculate each vocabulary item's threshold for sampling
    if not sample:
        # no words downsampled
        threshold_count = retain_total
    # Only retaining one subsampling notion from original gensim implementation
    else:
        threshold_count = sample * retain_total

    downsample_total, downsample_unique = 0, 0
    for w in retain_words:
        v = wv.vocab[w].count
        word_probability = (np.sqrt(v / threshold_count) + 1) * (threshold_count / v)
        if word_probability < 1.0:
            downsample_unique += 1
            downsample_total += word_probability * v
        else:
            word_probability = 1.0
            downsample_total += v
        if not dry_run:
            wv.vocab[w].sample_int = int(round(word_probability * 2**32))

    if not dry_run and not keep_raw_vocab:
        logger.info('deleting the raw counts dictionary of %i items',
                    len(self.raw_vocab))
        self.raw_vocab = defaultdict(int)

    logger.info('sample=%g downsamples %i most-common words', sample,
                downsample_unique)
    logger.info('downsampling leaves estimated %i word corpus '
                '(%.1f%% of prior %i)', downsample_total,
                downsample_total * 100.0 / max(retain_total, 1), retain_total)

    # return from each step: words-affected, resulting-corpus-size,
    # extra memory estimates
    report_values = {
        'drop_unique': drop_unique,
        'retain_total': retain_total,
        'downsample_unique': downsample_unique,
        'downsample_total': int(downsample_total),
        'num_retained_words': len(retain_words)
    }

    if self.null_word:
        # create null pseudo-word for padding when using concatenative
        # L1 (run-of-words); this word is only ever input, never predicted,
        # so count, huffman-point, etc. doesn't matter
        self.add_null_word(wv)

    if self.sorted_vocab and not update:
        self.sort_vocab(wv)
    if hs:
        # add info about each word's Huffman encoding
        self.create_binary_tree(wv)
    if negative:
        # build the table for drawing random words (for negative sampling)
        self.make_cum_table(wv)

    return report_values, pre_exist_words