def __init__(self,
             config: SerializableDict = None,
             map_x=True,
             map_y=True,
             lower=False,
             **kwargs) -> None:
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.token_vocab = VocabTF()
    # label vocabs disable pad (and usually unk) so they hold only real tags/relations
    self.pos_vocab = VocabTF(pad_token=None, unk_token=None)
    self.ner_vocab = VocabTF(pad_token=None)
    self.deprel_vocab = VocabTF(pad_token=None, unk_token=None)
    self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
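# A quick sketch (not from the original file) of why the label vocabs above pass
# pad_token=None / unk_token=None: as the constructors in this file suggest, VocabTF
# reserves entries for padding and unknown tokens by default, while label vocabs
# should contain only observed labels.
tag_vocab = VocabTF(pad_token=None, unk_token=None)
tag_vocab.update(['NN', 'VB', 'JJ'])
print(len(tag_vocab))  # expected 3: no pad/unk slots were reserved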
def vocab_from_txt(txt_file_path,
                   bigram_only=False,
                   window_size=4,
                   **kwargs) -> Tuple[VocabTF, VocabTF, VocabTF]:
    char_vocab = VocabTF()
    ngram_vocab = VocabTF()
    tag_vocab = VocabTF(pad_token=None, unk_token=None)
    for X, Y in generate_ngram_bmes(txt_file_path,
                                    bigram_only,
                                    window_size,
                                    gold=True):
        char_vocab.update(X[0])
        for ngram in X[1:]:
            ngram_vocab.update(filter(lambda x: x, ngram))
        tag_vocab.update(Y)
    return char_vocab, ngram_vocab, tag_vocab
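# Usage sketch for vocab_from_txt (the training-file path below is hypothetical;
# generate_ngram_bmes is assumed to come from the surrounding codebase):
char_vocab, ngram_vocab, tag_vocab = vocab_from_txt('data/cws/train.txt',
                                                    bigram_only=False,
                                                    window_size=4)
print(len(char_vocab), len(ngram_vocab), len(tag_vocab))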
def _load(path,
          vocab,
          normalize=False) -> Tuple[VocabTF, Union[np.ndarray, None]]:
    if not vocab:
        vocab = VocabTF()
    if not path:
        return vocab, None
    assert vocab.unk_idx is not None

    # grow the vocab so that every pretrained word gets an index
    word2vec, dim = load_word2vec(path)
    for word in word2vec:
        vocab.get_idx(word)

    pret_embs = np.zeros(shape=(len(vocab), dim), dtype=np.float32)
    # save and restore the global RNG state so the (currently unused) random
    # initialization below stays reproducible without affecting callers
    state = np.random.get_state()
    np.random.seed(0)
    bias = np.random.uniform(low=-0.001, high=0.001,
                             size=dim).astype(dtype=np.float32)
    scale = np.sqrt(3.0 / dim)
    for word, idx in vocab.token_to_idx.items():
        vec = word2vec.get(word, None)
        if vec is None:
            # retry with the lowercased form
            vec = word2vec.get(word.lower(), None)
            # if vec is not None:
            #     vec += bias
        if vec is None:
            # out-of-vocabulary words fall back to zero vectors
            # vec = np.random.uniform(-scale, scale, [dim])
            vec = np.zeros([dim], dtype=np.float32)
        pret_embs[idx] = vec
    np.random.set_state(state)
    return vocab, pret_embs
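# Usage sketch for _load (the embedding path is hypothetical; load_word2vec is
# assumed to return a {token: vector} mapping plus its dimensionality):
vocab, pret_embs = _load('data/embeddings/glove.6B.100d.txt', VocabTF())
# one row per vocab entry; tokens absent from the file keep zero vectors
print(pret_embs.shape)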
def __init__(self,
             filepath: str = None,
             vocab: VocabTF = None,
             expand_vocab=True,
             lowercase=False,
             input_dim=None,
             output_dim=None,
             unk=None,
             normalize=False,
             embeddings_initializer='VarianceScaling',
             embeddings_regularizer=None,
             activity_regularizer=None,
             embeddings_constraint=None,
             mask_zero=True,
             input_length=None,
             name=None,
             **kwargs):
    if vocab is None:
        # fall back to an empty vocab that the parent class can populate
        vocab = VocabTF()
    self.vocab = vocab
    super().__init__(filepath, vocab, expand_vocab, lowercase, input_dim,
                     output_dim, unk, normalize, embeddings_initializer,
                     embeddings_regularizer, activity_regularizer,
                     embeddings_constraint, mask_zero, input_length, name,
                     **kwargs)
def fit(self, trn_path: str, **kwargs) -> int:
    self.vocab = VocabTF()
    num_samples = 0
    for x, y in self.file_to_inputs(trn_path):
        self.vocab.update(x)
        num_samples += 1
    return num_samples
def fit(self, trn_path: str, **kwargs) -> int:
    self.word_vocab = VocabTF()
    self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
    num_samples = 0
    for words, tags in self.file_to_inputs(trn_path, True):
        self.word_vocab.update(words)
        self.tag_vocab.update(tags)
        num_samples += 1
    if self.char_vocab:
        # rebuild the char vocab from every word seen during training,
        # skipping the reserved pad/unk entries of the word vocab
        self.char_vocab = VocabTF()
        for word in self.word_vocab.token_to_idx.keys():
            if word in (self.word_vocab.pad_token,
                        self.word_vocab.unk_token):
                continue
            self.char_vocab.update(list(word))
    return num_samples
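# The char-vocab derivation above can be read as a small standalone helper; the
# name char_vocab_from_words is hypothetical and only used for illustration here:
def char_vocab_from_words(word_vocab: VocabTF) -> VocabTF:
    char_vocab = VocabTF()
    for word in word_vocab.token_to_idx.keys():
        # skip the reserved pad/unk entries inherited from the word vocab
        if word in (word_vocab.pad_token, word_vocab.unk_token):
            continue
        char_vocab.update(list(word))
    return char_vocab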
def load_vocabs(self, save_dir, filename='vocabs.json'):
    save_dir = get_resource(save_dir)
    vocabs = SerializableDict()
    vocabs.load_json(os.path.join(save_dir, filename))
    # restore each serialized vocab onto the transform by attribute name
    for key, value in vocabs.items():
        vocab = VocabTF()
        vocab.copy_from(value)
        setattr(self.transform, key, vocab)
def vocab_from_tsv(tsv_file_path,
                   lower=False,
                   lock_word_vocab=False,
                   lock_char_vocab=True,
                   lock_tag_vocab=True) -> Tuple[VocabTF, VocabTF, VocabTF]:
    word_vocab = VocabTF()
    char_vocab = VocabTF()
    tag_vocab = VocabTF(unk_token=None)
    with open(tsv_file_path, encoding='utf-8') as tsv_file:
        for line in tsv_file:
            cells = line.strip().split()
            if cells:
                word, tag = cells
                if lower:
                    word_vocab.add(word.lower())
                else:
                    word_vocab.add(word)
                char_vocab.update(list(word))
                tag_vocab.add(tag)
    if lock_word_vocab:
        word_vocab.lock()
    if lock_char_vocab:
        char_vocab.lock()
    if lock_tag_vocab:
        tag_vocab.lock()
    return word_vocab, char_vocab, tag_vocab
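# Usage sketch for vocab_from_tsv (the path is hypothetical; each line of the file
# is expected to hold a word and a tag separated by whitespace, as parsed above):
word_vocab, char_vocab, tag_vocab = vocab_from_tsv('data/pos/train.tsv', lower=True)
# with the default flags above, the char and tag vocabs come back locked
print(len(word_vocab), len(char_vocab), len(tag_vocab))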
def __init__(self,
             filepath: str = None,
             vocab: VocabTF = None,
             expand_vocab=True,
             lowercase=True,
             input_dim=None,
             output_dim=None,
             unk=None,
             normalize=False,
             embeddings_initializer='VarianceScaling',
             embeddings_regularizer=None,
             activity_regularizer=None,
             embeddings_constraint=None,
             mask_zero=True,
             input_length=None,
             name=None,
             cpu=True,
             **kwargs):
    filepath = get_resource(filepath)
    word2vec, _output_dim = load_word2vec(filepath)
    if output_dim:
        assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
    output_dim = _output_dim
    if vocab is None:
        # no vocab supplied: build one from the pretrained vocabulary
        # (done before the unk remap so vocab.safe_unk_token is always available)
        vocab = VocabTF()
        vocab.update(word2vec.keys())
    # if the `unk` token exists in the pretrained file, remap its vector to the
    # self-defined unk token, usually the one in the word vocab
    if unk and unk in word2vec:
        word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
    if expand_vocab and vocab.mutable:
        for word in word2vec:
            vocab.get_idx(word.lower() if lowercase else word)
    if input_dim:
        assert input_dim == len(vocab), f'input_dim = {input_dim} does not match {filepath}'
    input_dim = len(vocab)
    # initialize the embedding matrix, then overwrite rows with pretrained vectors
    self._embeddings_initializer = embeddings_initializer
    embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
    with tf.device('cpu:0') if cpu else DummyContext():
        pret_embs = embeddings_initializer(
            shape=[input_dim, output_dim]).numpy()
    for word, idx in vocab.token_to_idx.items():
        vec = word2vec.get(word, None)
        # retry with the lowercased form
        if vec is None and lowercase:
            vec = word2vec.get(word.lower(), None)
        if vec is not None:
            pret_embs[idx] = vec
    if normalize:
        pret_embs /= np.std(pret_embs)
    if not name:
        name = os.path.splitext(os.path.basename(filepath))[0]
    super().__init__(input_dim,
                     output_dim,
                     tf.keras.initializers.Constant(pret_embs),
                     embeddings_regularizer,
                     activity_regularizer,
                     embeddings_constraint,
                     mask_zero,
                     input_length,
                     name=name,
                     **kwargs)
    self.filepath = filepath
    self.expand_vocab = expand_vocab
    self.lowercase = lowercase
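# A standalone sketch of the row-filling step above, using plain zeros for rows
# that have no pretrained vector (the constructor above uses a Keras initializer
# instead); word2vec is assumed to be a {token: np.ndarray} dict as returned by
# load_word2vec, and build_pretrained_matrix is a hypothetical helper name:
def build_pretrained_matrix(word2vec, vocab: VocabTF, output_dim: int,
                            lowercase: bool = False) -> np.ndarray:
    matrix = np.zeros((len(vocab), output_dim), dtype=np.float32)
    for word, idx in vocab.token_to_idx.items():
        vec = word2vec.get(word)
        if vec is None and lowercase:
            vec = word2vec.get(word.lower())
        if vec is not None:
            matrix[idx] = vec
    return matrix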