def _load(path, vocab, normalize=False) -> Tuple[Vocab, Union[np.ndarray, None]]:
    """Load pretrained word vectors from ``path``, extend ``vocab`` with every pretrained token,
    and return an embedding matrix aligned with the vocabulary. Tokens missing from the
    pretrained file fall back to their lowercase form, then to a zero vector. ``normalize``
    is accepted but not used here.
    """
    if not vocab:
        vocab = Vocab()
    if not path:
        return vocab, None
    assert vocab.unk_idx is not None
    word2vec, dim = load_word2vec(path)
    for word in word2vec:
        vocab.get_idx(word)
    pret_embs = np.zeros(shape=(len(vocab), dim), dtype=np.float32)
    # Seed the RNG locally so the (currently disabled) random fallbacks stay reproducible,
    # then restore the global state afterwards.
    state = np.random.get_state()
    np.random.seed(0)
    bias = np.random.uniform(low=-0.001, high=0.001, size=dim).astype(dtype=np.float32)
    scale = np.sqrt(3.0 / dim)
    for word, idx in vocab.token_to_idx.items():
        vec = word2vec.get(word, None)
        if vec is None:
            vec = word2vec.get(word.lower(), None)
            # if vec is not None:
            #     vec += bias
        if vec is None:
            # vec = np.random.uniform(-scale, scale, [dim])
            vec = np.zeros([dim], dtype=np.float32)
        pret_embs[idx] = vec
    # noinspection PyTypeChecker
    np.random.set_state(state)
    return vocab, pret_embs
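# Illustrative usage sketch (not part of the original module). It assumes a word2vec-format
# file at the placeholder path 'data/embeddings/example.300.txt' with 300-dim vectors, and
# that Vocab() registers an unk token by default:
#
#   vocab, matrix = _load('data/embeddings/example.300.txt', Vocab())
#   assert matrix.shape == (len(vocab), 300)
#   # Tokens absent from the pretrained file (even after lowercasing) get zero vectors.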
def index_word2vec_with_vocab(filepath: str,
                              vocab: Vocab,
                              extend_vocab=True,
                              unk=None,
                              lowercase=False,
                              init='uniform',
                              normalize=None) -> torch.Tensor:
    """Build an embedding matrix aligned with ``vocab`` by indexing into a pretrained word2vec file.

    Args:
        filepath: The path to the pretrained embedding file.
        vocab: The vocabulary built from the training set.
        extend_vocab: Unlock the training-set vocabulary so that tokens from the pretrained
            embedding file can be added to it.
        unk: The UNK token used in the pretrained embedding file.
        lowercase: Convert words in the pretrained embeddings to lowercase.
        init: Initialization for out-of-vocabulary tokens, either ``'uniform'`` or ``'zeros'``.
        normalize: ``'norm'`` to L2-normalize each row, or ``'std'`` to divide the matrix
            by its standard deviation.

    Returns:
        An embedding matrix whose rows follow the order of ``vocab.token_to_idx``.
    """
    pret_vocab, pret_matrix = load_word2vec_as_vocab_tensor(filepath)
    if unk and unk in pret_vocab:
        pret_vocab[vocab.safe_unk_token] = pret_vocab.pop(unk)
    if extend_vocab:
        vocab.unlock()
        for word in pret_vocab:
            vocab.get_idx(word.lower() if lowercase else word)
    vocab.lock()
    ids = []
    unk_id_offset = 0
    for word, idx in vocab.token_to_idx.items():
        word_id = pret_vocab.get(word, None)
        # Retry lower case
        if word_id is None:
            word_id = pret_vocab.get(word.lower(), None)
        # Still missing: reserve a fresh row to be appended after the pretrained matrix
        if word_id is None:
            word_id = len(pret_vocab) + unk_id_offset
            unk_id_offset += 1
        ids.append(word_id)
    if unk_id_offset:
        unk_embeds = torch.zeros(unk_id_offset, pret_matrix.size(1))
        if init and init != 'zeros':
            if init == 'uniform':
                init = embedding_uniform
            else:
                raise ValueError(f'Unsupported init {init}')
            unk_embeds = init(unk_embeds)
        pret_matrix = torch.cat([pret_matrix, unk_embeds])
    ids = torch.LongTensor(ids)
    embedding = pret_matrix.index_select(0, ids)
    if normalize == 'norm':
        embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)
    elif normalize == 'std':
        embedding /= torch.std(embedding)
    return embedding
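# Illustrative usage sketch (not part of the original module). The file path and tokens are
# placeholders; the resulting rows follow the order of vocab.token_to_idx:
#
#   vocab = Vocab()
#   vocab.update(['the', 'cat', 'sat'])
#   embed = index_word2vec_with_vocab('data/embeddings/example.100.txt', vocab,
#                                     extend_vocab=True, lowercase=True,
#                                     init='uniform', normalize='std')
#   assert embed.size(0) == len(vocab)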
def __init__(self, filepath: str = None, vocab: Vocab = None, expand_vocab=True, lowercase=True,
             input_dim=None, output_dim=None, unk=None, normalize=False,
             embeddings_initializer='VarianceScaling', embeddings_regularizer=None,
             activity_regularizer=None, embeddings_constraint=None, mask_zero=True,
             input_length=None, name=None, **kwargs):
    filepath = get_resource(filepath)
    word2vec, _output_dim = load_word2vec(filepath)
    if output_dim:
        assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
    output_dim = _output_dim
    # if the `unk` token exists in the pretrained,
    # then replace it with a self-defined one, usually the one in word vocab
    if unk and unk in word2vec:
        word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
    if vocab is None:
        vocab = Vocab()
        vocab.update(word2vec.keys())
    if expand_vocab and vocab.mutable:
        for word in word2vec:
            vocab.get_idx(word.lower() if lowercase else word)
    if input_dim:
        assert input_dim == len(vocab), f'input_dim = {input_dim} does not match {filepath}'
    input_dim = len(vocab)
    # init matrix
    self._embeddings_initializer = embeddings_initializer
    embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
    with tf.device('cpu:0'):
        pret_embs = embeddings_initializer(shape=[input_dim, output_dim]).numpy()
    # insert to pret_embs
    for word, idx in vocab.token_to_idx.items():
        vec = word2vec.get(word, None)
        # Retry lower case
        if vec is None and lowercase:
            vec = word2vec.get(word.lower(), None)
        if vec is not None:
            pret_embs[idx] = vec
    if normalize:
        pret_embs /= np.std(pret_embs)
    if not name:
        name = os.path.splitext(os.path.basename(filepath))[0]
    super().__init__(input_dim, output_dim, tf.keras.initializers.Constant(pret_embs),
                     embeddings_regularizer, activity_regularizer, embeddings_constraint,
                     mask_zero, input_length, name=name, **kwargs)
    self.filepath = filepath
    self.expand_vocab = expand_vocab
    self.lowercase = lowercase
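# Illustrative usage sketch (not part of the original module). It assumes this __init__ belongs
# to a tf.keras.layers.Embedding subclass, called Word2VecEmbedding here purely for the example,
# and uses a placeholder embedding path:
#
#   layer = Word2VecEmbedding(filepath='data/embeddings/example.300.txt',
#                             vocab=train_vocab, expand_vocab=True, lowercase=True)
#   outputs = layer(tf.constant([[1, 2, 3]]))  # shape: (batch, seq_len, 300)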