def fit(self, trn_path: str, **kwargs) -> int:
    """Build ``self.vocab`` from the training file and return the sample count."""
    self.vocab = VocabTF()
    sample_count = 0
    for features, _ in self.file_to_inputs(trn_path):
        self.vocab.update(features)
        sample_count += 1
    return sample_count
    def _load(path,
              vocab,
              normalize=False) -> Tuple[VocabTF, Union[np.ndarray, None]]:
        """Load pretrained word2vec vectors into a matrix aligned with ``vocab``.

        Every pretrained word is added to the vocab; rows without a pretrained
        vector (even after a lowercase retry) are left as zeros.

        Args:
            path: Path to a word2vec file; falsy means "no pretrained file".
            vocab: Vocabulary to align with; a fresh ``VocabTF`` is created
                when falsy.
            normalize: NOTE(review): accepted but never used in this body —
                confirm whether normalization was meant to happen here.

        Returns:
            ``(vocab, embeddings)``; ``embeddings`` is ``None`` when ``path``
            is falsy.
        """
        if not vocab:
            vocab = VocabTF()
        if not path:
            return vocab, None
        # the zero rows below implicitly serve as the unk vector, so an unk
        # index must exist
        assert vocab.unk_idx is not None

        word2vec, dim = load_word2vec(path)
        # register every pretrained word so the matrix covers all of them
        for word in word2vec:
            vocab.get_idx(word)

        pret_embs = np.zeros(shape=(len(vocab), dim), dtype=np.float32)
        # save and later restore the global RNG state so the fixed seed used
        # here does not perturb randomness elsewhere in the program
        state = np.random.get_state()
        np.random.seed(0)
        # NOTE(review): `bias` and `scale` are computed but only used by the
        # commented-out branches below — dead code kept for reference.
        bias = np.random.uniform(low=-0.001, high=0.001,
                                 size=dim).astype(dtype=np.float32)
        scale = np.sqrt(3.0 / dim)
        for word, idx in vocab.token_to_idx.items():
            vec = word2vec.get(word, None)
            if vec is None:
                # retry with the lowercased form
                vec = word2vec.get(word.lower(), None)
                # if vec is not None:
                #     vec += bias
            if vec is None:
                # vec = np.random.uniform(-scale, scale, [dim])
                vec = np.zeros([dim], dtype=np.float32)
            pret_embs[idx] = vec
        # noinspection PyTypeChecker
        np.random.set_state(state)
        return vocab, pret_embs
 def __init__(self,
              path: str = None,
              vocab: VocabTF = None,
              normalize: bool = False,
              load_all=True,
              mask_zero=True,
              trainable=False,
              name=None,
              dtype=None,
              dynamic=False,
              **kwargs):
     """Build a Keras embedding layer initialized from a pretrained file.

     Args:
         path: Path to the pretrained word2vec file; may be ``None``.
         vocab: Vocabulary to align with; created by ``_load`` when ``None``.
         normalize: Forwarded to ``_load``.
         load_all: Unlock a locked vocab so every pretrained word can be added.
         mask_zero: Treat index 0 as padding in the embedding layer.
         trainable: Whether the embedding weights are updated during training.
     """
     super().__init__(trainable, name, dtype, dynamic, **kwargs)
     # loading the full pretrained file requires a mutable vocab, so unlock a
     # locked one if the caller asked for load_all
     if load_all and vocab and vocab.locked:
         vocab.unlock()
     self.vocab, self.array_np = self._load(path, vocab, normalize)
     # freeze the vocab once the embedding matrix is aligned with it
     self.vocab.lock()
     # NOTE(review): `self.dim` is not assigned in this method — presumably
     # provided by the superclass or a property; confirm.
     self.array_ks = tf.keras.layers.Embedding(
         input_dim=len(self.vocab),
         output_dim=self.dim,
         trainable=trainable,
         embeddings_initializer=tf.keras.initializers.Constant(
             self.array_np),
         mask_zero=mask_zero)
     self.mask_zero = mask_zero
     self.supports_masking = mask_zero
# Exemple #4
# 0
 def load_vocabs(self, save_dir, filename='vocabs.json'):
     """Restore vocabularies from a JSON file and attach them to the transform."""
     save_dir = get_resource(save_dir)
     saved = SerializableDict()
     saved.load_json(os.path.join(save_dir, filename))
     for attr_name, serialized in saved.items():
         restored = VocabTF()
         restored.copy_from(serialized)
         setattr(self.transform, attr_name, restored)
 def __init__(self,
              config: SerializableDict = None,
              map_x=True,
              map_y=True,
              lower=False,
              **kwargs) -> None:
     """Initialize the transform and create its (empty) vocabularies.

     Args:
         config: Optional configuration dictionary.
         map_x: Whether inputs are mapped to indices.
         map_y: Whether labels are mapped to indices.
         lower: Whether to lowercase tokens.
     """
     # merge_locals_kwargs(locals(), kwargs) captures the locals visible at
     # this point, so no new local may be introduced before this call.
     super().__init__(**merge_locals_kwargs(locals(), kwargs))
     # token vocab keeps pad/unk; label-like vocabs omit them
     self.token_vocab = VocabTF()
     self.pos_vocab = VocabTF(pad_token=None, unk_token=None)
     self.ner_vocab = VocabTF(pad_token=None)
     self.deprel_vocab = VocabTF(pad_token=None, unk_token=None)
     self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
 def __init__(self,
              filepath: str = None,
              vocab: VocabTF = None,
              expand_vocab=True,
              lowercase=False,
              input_dim=None,
              output_dim=None,
              unk=None,
              normalize=False,
              embeddings_initializer='VarianceScaling',
              embeddings_regularizer=None,
              activity_regularizer=None,
              embeddings_constraint=None,
              mask_zero=True,
              input_length=None,
              name=None,
              **kwargs):
     """Ensure a vocab exists, remember it on the instance, then defer to the parent."""
     self.vocab = VocabTF() if vocab is None else vocab
     super().__init__(filepath, self.vocab, expand_vocab, lowercase, input_dim,
                      output_dim, unk, normalize, embeddings_initializer,
                      embeddings_regularizer, activity_regularizer,
                      embeddings_constraint, mask_zero, input_length, name,
                      **kwargs)
def vocab_from_txt(txt_file_path,
                   bigram_only=False,
                   window_size=4,
                   **kwargs) -> Tuple[VocabTF, VocabTF, VocabTF]:
    """Build character, n-gram and tag vocabularies from a text corpus.

    The tag vocab is label-like and therefore carries no pad/unk entries.
    """
    char_vocab = VocabTF()
    ngram_vocab = VocabTF()
    tag_vocab = VocabTF(pad_token=None, unk_token=None)
    for features, labels in generate_ngram_bmes(txt_file_path,
                                                bigram_only,
                                                window_size,
                                                gold=True):
        # features[0] holds the characters; the rest are n-gram channels
        char_vocab.update(features[0])
        for gram in features[1:]:
            # filter(None, ...) drops empty/falsy n-grams
            ngram_vocab.update(filter(None, gram))
        tag_vocab.update(labels)
    return char_vocab, ngram_vocab, tag_vocab
 def fit(self, trn_path: str, **kwargs) -> int:
     """Build word/tag (and, if enabled, char) vocabularies; return the sample count."""
     self.word_vocab = VocabTF()
     # tags are labels, so neither pad nor unk is needed
     self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
     count = 0
     for words, tags in self.file_to_inputs(trn_path, True):
         self.word_vocab.update(words)
         self.tag_vocab.update(tags)
         count += 1
     if self.char_vocab:
         # before fit() char_vocab is just a truthy flag; replace it with a
         # real vocab built from every non-special word
         self.char_vocab = VocabTF()
         reserved = (self.word_vocab.pad_token, self.word_vocab.unk_token)
         for word in self.word_vocab.token_to_idx.keys():
             if word not in reserved:
                 self.char_vocab.update(list(word))
     return count
 def __init__(self,
              filepath: str = None,
              vocab: VocabTF = None,
              expand_vocab=True,
              lowercase=True,
              input_dim=None,
              output_dim=None,
              unk=None,
              normalize=False,
              embeddings_initializer='VarianceScaling',
              embeddings_regularizer=None,
              activity_regularizer=None,
              embeddings_constraint=None,
              mask_zero=True,
              input_length=None,
              name=None,
              cpu=True,
              **kwargs):
     """Build an embedding layer initialized from a pretrained word2vec file.

     Args:
         filepath: Resource path of the pretrained word2vec file.
         vocab: Vocabulary to align the matrix with; built from the
             pretrained words when ``None``.
         expand_vocab: Add every pretrained word to a mutable ``vocab``.
         lowercase: Retry lookups with the lowercased word on a miss.
         input_dim: Optional sanity check against ``len(vocab)``.
         output_dim: Optional sanity check against the pretrained dimension.
         unk: Token in the pretrained file to remap onto the vocab's unk token.
         normalize: Divide the matrix by its standard deviation.
         mask_zero: Treat index 0 as padding.
         cpu: Build the initial matrix on CPU to avoid device memory pressure.
         name: Layer name; defaults to the embedding file's basename.
     """
     filepath = get_resource(filepath)
     word2vec, _output_dim = load_word2vec(filepath)
     if output_dim:
         assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
     output_dim = _output_dim
     # Bug fix: the unk-remapping branch below reads `vocab.safe_unk_token`,
     # so the vocab must exist first.  The original created it only
     # afterwards and raised AttributeError when `unk` was passed together
     # with `vocab=None`.
     vocab_was_none = vocab is None
     if vocab_was_none:
         vocab = VocabTF()
     # if the `unk` token exists in the pretrained,
     # then replace it with a self-defined one, usually the one in word vocab
     if unk and unk in word2vec:
         word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
     if vocab_was_none:
         vocab.update(word2vec.keys())
     if expand_vocab and vocab.mutable:
         for word in word2vec:
             vocab.get_idx(word.lower() if lowercase else word)
     if input_dim:
         assert input_dim == len(
             vocab), f'input_dim = {input_dim} does not match {filepath}'
     input_dim = len(vocab)
     # init matrix with the configured initializer, then overwrite the rows
     # that have a pretrained vector
     self._embeddings_initializer = embeddings_initializer
     embeddings_initializer = tf.keras.initializers.get(
         embeddings_initializer)
     with tf.device('cpu:0') if cpu else DummyContext():
         pret_embs = embeddings_initializer(
             shape=[input_dim, output_dim]).numpy()
     # insert to pret_embs
     for word, idx in vocab.token_to_idx.items():
         vec = word2vec.get(word, None)
         # Retry lower case
         if vec is None and lowercase:
             vec = word2vec.get(word.lower(), None)
         if vec is not None:
             pret_embs[idx] = vec
     if normalize:
         pret_embs /= np.std(pret_embs)
     if not name:
         # default the layer name to the embedding file's basename
         name = os.path.splitext(os.path.basename(filepath))[0]
     super().__init__(input_dim,
                      output_dim,
                      tf.keras.initializers.Constant(pret_embs),
                      embeddings_regularizer,
                      activity_regularizer,
                      embeddings_constraint,
                      mask_zero,
                      input_length,
                      name=name,
                      **kwargs)
     self.filepath = filepath
     self.expand_vocab = expand_vocab
     self.lowercase = lowercase
class TextTransform(Transform):
    """Transform for language modeling over plain text files.

    Slides a ``seq_len``-token window over the whole file and yields
    ``(x, y)`` pairs where ``y`` is ``x`` shifted one token ahead.
    """

    def __init__(self,
                 forward=True,
                 seq_len=10,
                 tokenizer='char',
                 config: SerializableDict = None,
                 map_x=True,
                 map_y=True,
                 **kwargs) -> None:
        super().__init__(config,
                         map_x,
                         map_y,
                         seq_len=seq_len,
                         tokenizer=tokenizer,
                         forward=forward,
                         **kwargs)
        # Fix: annotate as Optional — the attribute stays None until fit()
        self.vocab: Optional[VocabTF] = None

    def tokenize_func(self):
        """Return a callable that splits one line of text into tokens."""
        if self.config.tokenizer == 'char':
            return list
        elif self.config.tokenizer == 'whitespace':
            return lambda x: x.split()
        else:
            # any other tokenizer value is treated as a literal separator
            return lambda x: x.split(self.config.tokenizer)

    def fit(self, trn_path: str, **kwargs) -> int:
        """Build ``self.vocab`` from the training file; return the sample count."""
        self.vocab = VocabTF()
        num_samples = 0
        for x, y in self.file_to_inputs(trn_path):
            self.vocab.update(x)
            num_samples += 1
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Dataset spec: two variable-length string sequences padded with pad_token."""
        types = tf.string, tf.string
        shapes = [None], [None]
        defaults = self.vocab.pad_token, self.vocab.pad_token
        return types, shapes, defaults

    def file_to_inputs(self, filepath: str, gold=True):
        """Yield sliding windows ``(x, y)`` with ``y`` shifted one token ahead."""
        forward = self.config.forward
        seq_len = self.config.seq_len
        buffer = []
        tokenizer = self.tokenize_func()
        # a backward LM reads the file from the last line to the first
        with open(filepath,
                  encoding='utf-8') if forward else FileReadBackwards(
                      filepath, encoding="utf-8") as src:
            for line in src:
                buffer += tokenizer(line)
                while len(buffer) > seq_len:
                    yield buffer[:seq_len], buffer[1:1 + seq_len]
                    buffer.pop(0)

    def inputs_to_samples(self, inputs, gold=False):
        """Pair inputs with targets; reverse both for a backward LM."""
        forward = self.config.forward
        for t in inputs:
            if gold:
                x, y = t
            else:
                x, y = t, t
            if not forward:
                x = list(reversed(x))
                y = list(reversed(y))
            yield x, y

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        # inputs and targets share one vocabulary in a language model
        return self.x_to_idx(y)

    def Y_to_outputs(self,
                     Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                     gold=False,
                     inputs=None,
                     **kwargs) -> Iterable:
        """Decode model scores back to token strings."""
        pred = tf.argmax(Y, axis=-1)
        # Fix: second zip element was unused; zip still truncates to len(inputs)
        for ys, _ in zip(pred, inputs):
            yield [self.vocab.idx_to_token[int(y)] for y in ys]

    def input_is_single_sample(self, input: Any) -> bool:
        return isinstance(input[0], str)
class TSVTaggingTransform(TsvTaggingFormat, Transform):
    """Transform for tab-separated word/tag sequence-tagging corpora."""

    def __init__(self,
                 config: SerializableDict = None,
                 map_x=True,
                 map_y=True,
                 use_char=False,
                 **kwargs) -> None:
        # merge_locals_kwargs(locals(), kwargs) captures the locals visible at
        # this point, so no new local may be introduced before this call.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.word_vocab: Optional[VocabTF] = None
        self.tag_vocab: Optional[VocabTF] = None
        self.char_vocab: Optional[VocabTF] = None

    def fit(self, trn_path: str, **kwargs) -> int:
        """Build word/tag (and optionally char) vocabularies; return #samples."""
        self.word_vocab = VocabTF()
        # tags are labels, so neither pad nor unk is needed
        self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
        num_samples = 0
        for words, tags in self.file_to_inputs(trn_path, True):
            self.word_vocab.update(words)
            self.tag_vocab.update(tags)
            num_samples += 1
        if self.char_vocab:
            # before fit() char_vocab is just a truthy flag; replace it with
            # a real vocab built from every non-special word
            self.char_vocab = VocabTF()
            for word in self.word_vocab.token_to_idx.keys():
                if word in (self.word_vocab.pad_token,
                            self.word_vocab.unk_token):
                    continue
                self.char_vocab.update(list(word))
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Dataset spec: string word/tag sequences with their padding values."""
        types = tf.string, tf.string
        shapes = [None], [None]
        values = self.word_vocab.pad_token, self.tag_vocab.first_token
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        """Yield (words, tags); non-gold inputs get dummy padding tags."""
        lower = self.config.get('lower', False)
        if gold:
            if lower:
                # NOTE(review): `x` looks like a sequence of words elsewhere in
                # this class, yet `.lower()` is called on `x` itself — confirm
                # this path receives strings.
                for x, y in inputs:
                    yield x.lower(), y
            else:
                yield from inputs
        else:
            for x in inputs:
                yield x.lower() if lower else x, [self.padding_values[-1]
                                                  ] * len(x)

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.word_vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.tag_vocab.lookup(y)

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        """Decode input tensors back to word strings."""
        for xs in X:
            # with a char vocab, x is a string tensor; otherwise a word index
            yield [
                str_tensor_to_str(x)
                if self.char_vocab else self.word_vocab.idx_to_token[int(x)]
                for x in xs
            ]

    def Y_to_outputs(self,
                     Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                     gold=False,
                     inputs=None,
                     X=None,
                     **kwargs) -> Iterable:
        """Decode tag scores (or gold tag ids) back to tag strings."""
        if not gold:
            Y = tf.argmax(Y, axis=2)
        for ys, xs in zip(Y, inputs):
            # Fix: x was unused — zip against xs only to truncate padding
            yield [self.tag_vocab.idx_to_token[int(y)]
                   for y, _ in zip(ys, xs)]

    def input_is_single_sample(
            self, input: Union[List[str], List[List[str]]]) -> bool:
        return isinstance(input[0], str)

    def input_truth_output_to_str(self, input: List[str], truth: List[str],
                                  output: List[str]):
        """Render aligned word/gold/pred triples, one per line, plus a blank line."""
        # Fix: linear-time join instead of quadratic string concatenation
        lines = [' '.join((word, gold_tag, pred_tag))
                 for word, gold_tag, pred_tag in zip(input, truth, output)]
        return ''.join(line + '\n' for line in lines) + '\n'
def vocab_from_tsv(tsv_file_path, lower=False, lock_word_vocab=False, lock_char_vocab=True, lock_tag_vocab=True) \
        -> Tuple[VocabTF, VocabTF, VocabTF]:
    """Build word, character and tag vocabularies from a two-column TSV file.

    Blank lines (sentence separators) are skipped; each vocab is locked
    according to its corresponding flag before being returned.
    """
    word_vocab = VocabTF()
    char_vocab = VocabTF()
    tag_vocab = VocabTF(unk_token=None)
    with open(tsv_file_path, encoding='utf-8') as src:
        for raw in src:
            cells = raw.strip().split()
            if not cells:
                continue
            word, tag = cells
            word_vocab.add(word.lower() if lower else word)
            char_vocab.update(list(word))
            tag_vocab.add(tag)
    for should_lock, voc in ((lock_word_vocab, word_vocab),
                             (lock_char_vocab, char_vocab),
                             (lock_tag_vocab, tag_vocab)):
        if should_lock:
            voc.lock()
    return word_vocab, char_vocab, tag_vocab
class TACREDTransform(Transform):
    """Transform for the TACRED relation-extraction dataset (JSON format)."""

    def __init__(self,
                 config: SerializableDict = None,
                 map_x=True,
                 map_y=True,
                 lower=False,
                 **kwargs) -> None:
        # merge_locals_kwargs(locals(), kwargs) captures the locals visible at
        # this point, so no new local may be introduced before this call.
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.token_vocab = VocabTF()
        # label-like vocabs omit pad/unk entries
        self.pos_vocab = VocabTF(pad_token=None, unk_token=None)
        self.ner_vocab = VocabTF(pad_token=None)
        self.deprel_vocab = VocabTF(pad_token=None, unk_token=None)
        self.rel_vocab = VocabTF(pad_token=None, unk_token=None)

    def fit(self, trn_path: str, **kwargs) -> int:
        """Populate all vocabularies from the training set; return #samples."""
        count = 0
        for (tokens, pos, ner, head, deprel, subj_positions, obj_positions,
             subj_type, obj_type), relation in self.file_to_samples(trn_path,
                                                                    gold=True):
            count += 1
            self.token_vocab.update(tokens)
            self.pos_vocab.update(pos)
            self.ner_vocab.update(ner)
            self.deprel_vocab.update(deprel)
            self.rel_vocab.add(relation)
        return count

    def file_to_inputs(self, filepath: str, gold=True):
        """Yield raw samples from a TACRED JSON file."""
        data = load_json(filepath)
        for d in data:
            tokens = list(d['token'])
            # Fix: renamed the `ss/se/os/oe` locals — `os` shadowed the
            # stdlib `os` module used elsewhere in this file.
            subj_start, subj_end = d['subj_start'], d['subj_end']
            obj_start, obj_end = d['obj_start'], d['obj_end']
            pos = d['stanford_pos']
            ner = d['stanford_ner']
            deprel = d['stanford_deprel']
            head = [int(x) for x in d['stanford_head']]
            # at least one token must be the root (head == 0)
            assert any(x == 0 for x in head)
            relation = d['relation']
            yield (tokens, pos, ner, head, deprel, subj_start, subj_end,
                   obj_start, obj_end), relation

    def inputs_to_samples(self, inputs, gold=False):
        """Anonymize entity spans and compute position features per sample."""
        for sample in inputs:
            if gold:
                (tokens, pos, ner, head, deprel, subj_start, subj_end,
                 obj_start, obj_end), relation = sample
            else:
                (tokens, pos, ner, head, deprel, subj_start, subj_end,
                 obj_start, obj_end) = sample
                relation = self.rel_vocab.safe_pad_token
            length = len(tokens)
            subj_positions = get_positions(subj_start, subj_end, length)
            obj_positions = get_positions(obj_start, obj_end, length)
            subj_type = ner[subj_start]
            obj_type = ner[obj_start]
            # anonymize tokens so the model cannot memorize entity strings
            tokens[subj_start:subj_end + 1] = \
                ['SUBJ-' + subj_type] * (subj_end - subj_start + 1)
            tokens[obj_start:obj_end + 1] = \
                ['OBJ-' + obj_type] * (obj_end - obj_start + 1)
            # min head is 0, but root is not included in tokens, so take 1 off from each head
            head = [h - 1 for h in head]
            yield (tokens, pos, ner, head, deprel, subj_positions,
                   obj_positions, subj_type, obj_type), relation

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        """Dataset spec for the nine input fields plus the relation label."""
        # (tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type), relation
        types = (tf.string, tf.string, tf.string, tf.int32, tf.string,
                 tf.int32, tf.int32, tf.string, tf.string), tf.string
        shapes = ([None], [None], [None], [None], [None], [None], [None], [],
                  []), []
        pads = (self.token_vocab.safe_pad_token, self.pos_vocab.safe_pad_token,
                self.ner_vocab.safe_pad_token, 0,
                self.deprel_vocab.safe_pad_token, 0, 0,
                self.ner_vocab.safe_pad_token,
                self.ner_vocab.safe_pad_token), self.rel_vocab.safe_pad_token
        return types, shapes, pads

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        """Map string fields of x to vocab indices; integer fields pass through."""
        tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type = x
        tokens = self.token_vocab.lookup(tokens)
        pos = self.pos_vocab.lookup(pos)
        ner = self.ner_vocab.lookup(ner)
        deprel = self.deprel_vocab.lookup(deprel)
        # entity types are NER labels, so they reuse the NER vocab
        subj_type = self.ner_vocab.lookup(subj_type)
        obj_type = self.ner_vocab.lookup(obj_type)
        return tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type

    def y_to_idx(self, y) -> tf.Tensor:
        return self.rel_vocab.lookup(y)