Exemple #1
0
    def _load(path, vocab, normalize=False) -> Tuple[Vocab, Union[np.ndarray, None]]:
        if not vocab:
            vocab = Vocab()
        if not path:
            return vocab, None
        assert vocab.unk_idx is not None

        word2vec, dim = load_word2vec(path)
        for word in word2vec:
            vocab.get_idx(word)

        pret_embs = np.zeros(shape=(len(vocab), dim), dtype=np.float32)
        state = np.random.get_state()
        np.random.seed(0)
        bias = np.random.uniform(low=-0.001, high=0.001, size=dim).astype(dtype=np.float32)
        scale = np.sqrt(3.0 / dim)
        for word, idx in vocab.token_to_idx.items():
            vec = word2vec.get(word, None)
            if vec is None:
                vec = word2vec.get(word.lower(), None)
                # if vec is not None:
                #     vec += bias
            if vec is None:
                # vec = np.random.uniform(-scale, scale, [dim])
                vec = np.zeros([dim], dtype=np.float32)
            pret_embs[idx] = vec
        # noinspection PyTypeChecker
        np.random.set_state(state)
        return vocab, pret_embs
Exemple #2
0
 def __init__(self, filepath: str = None, vocab: Vocab = None, expand_vocab=True, lowercase=True,
              input_dim=None, output_dim=None, unk=None, normalize=False,
              embeddings_initializer='VarianceScaling',
              embeddings_regularizer=None,
              activity_regularizer=None, embeddings_constraint=None, mask_zero=True, input_length=None,
              name=None, **kwargs):
     filepath = get_resource(filepath)
     word2vec, _output_dim = load_word2vec(filepath)
     if output_dim:
         assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
     output_dim = _output_dim
     # if the `unk` token exists in the pretrained,
     # then replace it with a self-defined one, usually the one in word vocab
     if unk and unk in word2vec:
         word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
     if vocab is None:
         vocab = Vocab()
         vocab.update(word2vec.keys())
     if expand_vocab and vocab.mutable:
         for word in word2vec:
             vocab.get_idx(word.lower() if lowercase else word)
     if input_dim:
         assert input_dim == len(vocab), f'input_dim = {input_dim} does not match {filepath}'
     input_dim = len(vocab)
     # init matrix
     self._embeddings_initializer = embeddings_initializer
     embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
     with tf.device('cpu:0'):
         pret_embs = embeddings_initializer(shape=[input_dim, output_dim]).numpy()
     # insert to pret_embs
     for word, idx in vocab.token_to_idx.items():
         vec = word2vec.get(word, None)
         # Retry lower case
         if vec is None and lowercase:
             vec = word2vec.get(word.lower(), None)
         if vec is not None:
             pret_embs[idx] = vec
     if normalize:
         pret_embs /= np.std(pret_embs)
     if not name:
         name = os.path.splitext(os.path.basename(filepath))[0]
     super().__init__(input_dim, output_dim, tf.keras.initializers.Constant(pret_embs), embeddings_regularizer,
                      activity_regularizer, embeddings_constraint, mask_zero, input_length, name=name, **kwargs)
     self.filepath = filepath
     self.expand_vocab = expand_vocab
     self.lowercase = lowercase