def create_keyedvector_from_matrix(self, embedding_matrix, word2id):
    """
    Imports the necessary attributes for the Embedding object from an
    embedding matrix and a word2id vocabulary. Can be used for custom
    pre-trained embeddings.

    Parameters
    ----------
    embedding_matrix: numpy.ndarray
        Embedding matrix as a numpy object
    word2id: dict
        Word vocabulary (key: word, value: word_index)
    """
    # Order the vocabulary by word index so that kv.index2word lines up
    # with the corresponding rows of the embedding matrix.
    vocab = {
        word: word2id[word]
        for word in sorted(word2id, key=word2id.__getitem__)
    }
    vector_size = embedding_matrix.shape[1]
    kv = KeyedVectors(vector_size)
    kv.vector_size = vector_size
    kv.vectors = embedding_matrix
    kv.index2word = list(vocab.keys())
    # Word counts are unknown for custom pre-trained embeddings, so each
    # vocabulary entry is stored with count=0.
    kv.vocab = {
        word: Vocab(index=word_id, count=0)
        for word, word_id in vocab.items()
    }
    self.embedding = kv
def update(self):
    """
    Rebuild the keyed vectors from the stored word-vector and vocabulary
    files and save them to ``self.path``.

    The embedding for the first vocabulary entry (presumably the unknown
    token — TODO confirm against the vocabulary file format) is synthesized
    as the mean of all remaining word embeddings.
    """
    wv = self.word_vectors_file.get_word_vectors()
    voc = self.vocabs_file.get_vocabs()['word']
    # Each vocab value is a pair: v[0] is the word index, v[1] its count.
    # Order words by index so rows of the matrix match index2word.
    words_in_vocab = [
        k for k, _ in sorted(voc.items(), key=lambda i: i[1][0])
    ]
    # Look up embeddings for every word except the first entry, then
    # prepend the mean embedding as that entry's vector.
    word_embs = wv[words_in_vocab[1:]]
    unk_emb = np.mean(word_embs, 0, keepdims=True)
    embs = np.concatenate((unk_emb, word_embs), 0)
    kv = KeyedVectors(embs.shape[1])
    # Use the canonical `vectors` attribute instead of the deprecated
    # `syn0` alias, for consistency with the rest of this file.
    kv.vectors = embs
    kv.vocab = {
        k: Vocab(index=v[0], count=v[1]) for k, v in voc.items()
    }
    kv.index2word = words_in_vocab
    kv.save(self.path)