Ejemplo n.º 1
0
class TSVTaggingTransform(TsvTaggingFormat, Transform):
    def __init__(self,
                 config: SerializableDict = None,
                 map_x=True,
                 map_y=True,
                 use_char=False,
                 **kwargs) -> None:
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.word_vocab: Optional[Vocab] = None
        self.tag_vocab: Optional[Vocab] = None
        self.char_vocab: Optional[Vocab] = None

    def fit(self, trn_path: str, **kwargs) -> int:
        self.word_vocab = Vocab()
        self.tag_vocab = Vocab(pad_token=None, unk_token=None)
        num_samples = 0
        for words, tags in generator_words_tags(trn_path,
                                                gold=True,
                                                lower=self.config.get(
                                                    'lower', False)):
            self.word_vocab.update(words)
            self.tag_vocab.update(tags)
            num_samples += 1
        if self.char_vocab:
            self.char_vocab = Vocab()
            for word in self.word_vocab.token_to_idx.keys():
                if word in (self.word_vocab.pad_token,
                            self.word_vocab.unk_token):
                    continue
                self.char_vocab.update(list(word))
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = tf.string, tf.string
        shapes = [None], [None]
        values = self.word_vocab.pad_token, self.tag_vocab.first_token
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        lower = self.config.get('lower', False)
        if gold:
            if lower:
                for x, y in inputs:
                    yield x.lower(), y
            else:
                yield from inputs
        else:
            for x in inputs:
                yield x.lower() if lower else x, [self.padding_values[-1]
                                                  ] * len(x)

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.word_vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.tag_vocab.lookup(y)

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        for xs in X:
            words = []
            for x in xs:
                words.append(
                    str_tensor_to_str(x) if self.char_vocab else self.
                    word_vocab.idx_to_token[int(x)])
            yield words

    def Y_to_outputs(self,
                     Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                     gold=False,
                     inputs=None,
                     X=None,
                     **kwargs) -> Iterable:
        masks = Y._keras_mask if hasattr(Y, '_keras_mask') else tf.ones_like(Y)
        if not gold:
            Y = tf.argmax(Y, axis=2)
        for ys, mask in zip(Y, masks):
            tags = []
            for y, m in zip(ys, mask):
                if not m:
                    break
                tags.append(self.tag_vocab.idx_to_token[int(y)])
            yield tags

    def input_is_single_sample(
            self, input: Union[List[str], List[List[str]]]) -> bool:
        return isinstance(input[0], str)

    def input_truth_output_to_str(self, input: List[str], truth: List[str],
                                  output: List[str]):
        text = ''
        for word, gold_tag, pred_tag in zip(input, truth, output):
            text += ' '.join([word, gold_tag, pred_tag]) + '\n'

        text += '\n'
        return text
Ejemplo n.º 2
0
class TextTransform(Transform):
    def __init__(self,
                 forward=True,
                 seq_len=10,
                 tokenizer='char',
                 config: SerializableDict = None,
                 map_x=True,
                 map_y=True,
                 **kwargs) -> None:
        super().__init__(config,
                         map_x,
                         map_y,
                         seq_len=seq_len,
                         tokenizer=tokenizer,
                         forward=forward,
                         **kwargs)
        self.vocab: Vocab = None

    def tokenize_func(self):
        if self.config.tokenizer == 'char':
            return list
        elif self.config.tokenizer == 'whitespace':
            return lambda x: x.split()
        else:
            return lambda x: x.split(self.config.tokenizer)

    def fit(self, trn_path: str, **kwargs) -> int:
        self.vocab = Vocab()
        num_samples = 0
        for x, y in self.file_to_inputs(trn_path):
            self.vocab.update(x)
            num_samples += 1
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = tf.string, tf.string
        shapes = [None], [None]
        defaults = self.vocab.pad_token, self.vocab.pad_token
        return types, shapes, defaults

    def file_to_inputs(self, filepath: str, gold=True):
        forward = self.config.forward
        seq_len = self.config.seq_len
        buffer = []
        tokenizer = self.tokenize_func()
        with open(filepath,
                  encoding='utf-8') if forward else FileReadBackwards(
                      filepath, encoding="utf-8") as src:
            for line in src:
                tokens = tokenizer(line)
                buffer += tokens
                while len(buffer) > seq_len:
                    yield buffer[:seq_len], buffer[1:1 + seq_len]
                    buffer.pop(0)

    def inputs_to_samples(self, inputs, gold=False):
        forward = self.config.forward
        for t in inputs:
            if gold:
                x, y = t
            else:
                x, y = t, t
            if not forward:
                x = list(reversed(x))
                y = list(reversed(y))
            yield x, y

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.x_to_idx(y)

    def Y_to_outputs(self,
                     Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                     gold=False) -> Iterable:
        mask = Y._keras_mask
        pred = tf.argmax(Y, axis=-1)
        for ys, ms in zip(pred, mask):
            ret = []
            for y, m in zip(ys, ms):
                if not m:
                    break
                ret.append(self.vocab.idx_to_token[int(y)])
            yield ret

    def input_is_single_sample(self, input: Any) -> bool:
        return isinstance(input[0], str)