Beispiel #1
0
    def _load(path, vocab, normalize=False) -> Tuple[Vocab, Union[np.ndarray, None]]:
        if not vocab:
            vocab = Vocab()
        if not path:
            return vocab, None
        assert vocab.unk_idx is not None

        word2vec, dim = load_word2vec(path)
        for word in word2vec:
            vocab.get_idx(word)

        pret_embs = np.zeros(shape=(len(vocab), dim), dtype=np.float32)
        state = np.random.get_state()
        np.random.seed(0)
        bias = np.random.uniform(low=-0.001, high=0.001, size=dim).astype(dtype=np.float32)
        scale = np.sqrt(3.0 / dim)
        for word, idx in vocab.token_to_idx.items():
            vec = word2vec.get(word, None)
            if vec is None:
                vec = word2vec.get(word.lower(), None)
                # if vec is not None:
                #     vec += bias
            if vec is None:
                # vec = np.random.uniform(-scale, scale, [dim])
                vec = np.zeros([dim], dtype=np.float32)
            pret_embs[idx] = vec
        # noinspection PyTypeChecker
        np.random.set_state(state)
        return vocab, pret_embs
Beispiel #2
0
def index_word2vec_with_vocab(filepath: str,
                              vocab: Vocab,
                              extend_vocab=True,
                              unk=None,
                              lowercase=False,
                              init='uniform',
                              normalize=None) -> torch.Tensor:
    """

    Args:
        filepath: The path to pretrained embedding.
        vocab: The vocabulary from training set.
        extend_vocab: Unlock vocabulary of training set to add those tokens in pretrained embedding file.
        unk: UNK token.
        lowercase: Convert words in pretrained embeddings into lowercase.
        init: Indicate which initialization to use for oov tokens.
        normalize: ``True`` or a method to normalize the embedding matrix.

    Returns:
        An embedding matrix.

    """
    pret_vocab, pret_matrix = load_word2vec_as_vocab_tensor(filepath)
    if unk and unk in pret_vocab:
        pret_vocab[vocab.safe_unk_token] = pret_vocab.pop(unk)
    if extend_vocab:
        vocab.unlock()
        for word in pret_vocab:
            vocab.get_idx(word.lower() if lowercase else word)
    vocab.lock()
    ids = []

    unk_id_offset = 0
    for word, idx in vocab.token_to_idx.items():
        word_id = pret_vocab.get(word, None)
        # Retry lower case
        if word_id is None:
            word_id = pret_vocab.get(word.lower(), None)
        if word_id is None:
            word_id = len(pret_vocab) + unk_id_offset
            unk_id_offset += 1
        ids.append(word_id)
    if unk_id_offset:
        unk_embeds = torch.zeros(unk_id_offset, pret_matrix.size(1))
        if init and init != 'zeros':
            if init == 'uniform':
                init = embedding_uniform
            else:
                raise ValueError(f'Unsupported init {init}')
            unk_embeds = init(unk_embeds)
        pret_matrix = torch.cat([pret_matrix, unk_embeds])
    ids = torch.LongTensor(ids)
    embedding = pret_matrix.index_select(0, ids)
    if normalize == 'norm':
        embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12)
    elif normalize == 'std':
        embedding /= torch.std(embedding)
    return embedding
Beispiel #3
0
 def __init__(self, filepath: str = None, vocab: Vocab = None, expand_vocab=True, lowercase=True,
              input_dim=None, output_dim=None, unk=None, normalize=False,
              embeddings_initializer='VarianceScaling',
              embeddings_regularizer=None,
              activity_regularizer=None, embeddings_constraint=None, mask_zero=True, input_length=None,
              name=None, **kwargs):
     filepath = get_resource(filepath)
     word2vec, _output_dim = load_word2vec(filepath)
     if output_dim:
         assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
     output_dim = _output_dim
     # if the `unk` token exists in the pretrained,
     # then replace it with a self-defined one, usually the one in word vocab
     if unk and unk in word2vec:
         word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
     if vocab is None:
         vocab = Vocab()
         vocab.update(word2vec.keys())
     if expand_vocab and vocab.mutable:
         for word in word2vec:
             vocab.get_idx(word.lower() if lowercase else word)
     if input_dim:
         assert input_dim == len(vocab), f'input_dim = {input_dim} does not match {filepath}'
     input_dim = len(vocab)
     # init matrix
     self._embeddings_initializer = embeddings_initializer
     embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
     with tf.device('cpu:0'):
         pret_embs = embeddings_initializer(shape=[input_dim, output_dim]).numpy()
     # insert to pret_embs
     for word, idx in vocab.token_to_idx.items():
         vec = word2vec.get(word, None)
         # Retry lower case
         if vec is None and lowercase:
             vec = word2vec.get(word.lower(), None)
         if vec is not None:
             pret_embs[idx] = vec
     if normalize:
         pret_embs /= np.std(pret_embs)
     if not name:
         name = os.path.splitext(os.path.basename(filepath))[0]
     super().__init__(input_dim, output_dim, tf.keras.initializers.Constant(pret_embs), embeddings_regularizer,
                      activity_regularizer, embeddings_constraint, mask_zero, input_length, name=name, **kwargs)
     self.filepath = filepath
     self.expand_vocab = expand_vocab
     self.lowercase = lowercase