Example #1
class CoNLL_SDP_Transform(CoNLLTransform):
    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32,
                 n_tokens_per_batch=5000, min_freq=2, **kwargs) -> None:
        super().__init__(config, map_x, map_y, lower, n_buckets, n_tokens_per_batch, min_freq, **kwargs)
        self.orphan_relation = ROOT

    def lock_vocabs(self):
        super().lock_vocabs()
        # heuristic to find the orphan relation
        for rel in self.rel_vocab.idx_to_token:
            if 'root' in rel.lower():
                self.orphan_relation = rel
                break

    def file_to_inputs(self, filepath: str, gold=True):
        assert gold, 'only gold files are supported for now'
        for i, sent in enumerate(read_conll(filepath)):
            prev_cells = None
            parsed_sent = []
            heads = []
            rels = []
            for j, cell in enumerate(sent):
                ID = cell[0]
                form = cell[1]
                cpos = cell[3]
                head = cell[6]
                deprel = cell[7]
                if prev_cells and ID != prev_cells[0]:  # found end of token
                    parsed_sent.append([prev_cells[1], prev_cells[2], heads, rels])
                    heads = []
                    rels = []
                heads.append(head)
                rels.append(deprel)
                prev_cells = [ID, form, cpos, head, deprel]
            parsed_sent.append([prev_cells[1], prev_cells[2], heads, rels])
            yield parsed_sent

    def fit(self, trn_path: str, **kwargs) -> int:
        self.form_vocab = Vocab()
        self.form_vocab.add(ROOT)  # make root the 2nd element, while the 0th is pad and the 1st is unk
        self.cpos_vocab = Vocab(pad_token=None, unk_token=None)
        self.rel_vocab = Vocab(pad_token=None, unk_token=None)
        num_samples = 0
        counter = Counter()
        for sent in self.file_to_samples(trn_path, gold=True):
            num_samples += 1
            for idx, (form, cpos, head, deprel) in enumerate(sent):
                if idx == 0:
                    root = form
                else:
                    counter[form] += 1
                self.cpos_vocab.add(cpos)
                self.rel_vocab.update(deprel)

        for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]:
            self.form_vocab.add(token)
        return num_samples

    def inputs_to_samples(self, inputs, gold=False):
        for sent in inputs:
            sample = []
            if self.config['lower']:
                for i, cell in enumerate(sent):
                    cell = list(sent[i])
                    cell[0] = cell[0].lower()
                    if not gold:
                        cell += [[0], [self.rel_vocab.safe_pad_token]]
                    sample.append(cell)
            # insert a root token with arbitrary fields; it will be masked out anyway
            form, cpos, head, deprel = sample[0]
            sample.insert(0, [self.bos, self.bos, [0], deprel])
            yield sample

    def samples_to_dataset(self, samples: Generator, map_x=None, map_y=None, batch_size=5000, shuffle=None, repeat=None,
                           drop_remainder=False, prefetch=1, cache=True) -> tf.data.Dataset:
        def generator():
            # custom bucketing, load corpus into memory
            corpus = list(x for x in (samples() if callable(samples) else samples))
            lengths = [1 + len(i) for i in corpus]
            if len(corpus) < 32:
                n_buckets = 1
            else:
                n_buckets = min(self.config.n_buckets, len(corpus))
            buckets = dict(zip(*kmeans(lengths, n_buckets)))
            sizes, buckets = zip(*[
                (size, bucket) for size, bucket in buckets.items()
            ])
            # the number of chunks in each bucket, which is clipped by
            # range [1, len(bucket)]
            chunks = [min(len(bucket), max(round(size * len(bucket) / batch_size), 1)) for size, bucket in
                      zip(sizes, buckets)]
            range_fn = randperm if shuffle else arange
            for i in tolist(range_fn(len(buckets))):
                split_sizes = [(len(buckets[i]) - j - 1) // chunks[i] + 1
                               for j in range(chunks[i])]
                for batch_indices in tf.split(range_fn(len(buckets[i])), split_sizes):
                    indices = [buckets[i][j] for j in tolist(batch_indices)]
                    raw_batch = [[], [], [], []]
                    max_len = len(max([corpus[i] for i in indices], key=len))
                    for idx in indices:
                        arc = np.zeros((max_len, max_len), dtype=bool)
                        rel = np.zeros((max_len, max_len), dtype=np.int64)
                        for b in raw_batch[:2]:
                            b.append([])
                        for m, cells in enumerate(corpus[idx]):
                            for b, c, v in zip(raw_batch, cells,
                                               [self.form_vocab, self.cpos_vocab]):
                                b[-1].append(v.get_idx_without_add(c))
                            for n, r in zip(cells[2], cells[3]):
                                arc[m, n] = True
                                rid = self.rel_vocab.get_idx_without_add(r)
                                if rid is None:
                                    logger.warning(f'Relation OOV: {r} does not exist in training data')
                                    continue
                                rel[m, n] = rid
                        raw_batch[-2].append(arc)
                        raw_batch[-1].append(rel)
                    batch = []
                    for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab]):
                        b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post',
                                                                          value=v.safe_pad_token_idx,
                                                                          dtype='int64')
                        batch.append(b)
                    batch += raw_batch[2:]
                    assert len(batch) == 4
                    yield (batch[0], batch[1]), (batch[2], batch[3])

        # for x in generator():
        #     print(len(x[-1][-1]))
        return super().samples_to_dataset(generator, False, False, 0, False, repeat, drop_remainder, prefetch,
                                          cache)

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = (tf.int64, tf.int64), (tf.bool, tf.int64)
        shapes = ([None, None], [None, None]), ([None, None, None], [None, None, None])
        values = (self.form_vocab.safe_pad_token_idx, self.cpos_vocab.safe_pad_token_idx), (
            False, self.rel_vocab.safe_pad_token_idx)
        return types, shapes, values

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
        arc_preds, rel_preds, mask = Y
        sents = []

        for arc_sent, rel_sent, length in zip(arc_preds, rel_preds,
                                              tf.math.count_nonzero(mask, axis=-1)):
            sent = []
            for arc, rel in zip(tolist(arc_sent[1:, 1:]), tolist(rel_sent[1:, 1:])):
                ar = []
                for idx, (a, r) in enumerate(zip(arc, rel)):
                    if a:
                        ar.append((idx + 1, self.rel_vocab.idx_to_token[r]))
                if not ar:
                    # orphan
                    ar.append((0, self.orphan_relation))
                sent.append(ar)
            sents.append(sent)

        return sents

    def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                             gold=False, inputs=None, conll=True) -> Iterable:
        (words, feats, mask), (arc_preds, rel_preds) = X, Y
        xs = inputs
        ys = self.Y_to_outputs((arc_preds, rel_preds, mask))
        sents = []
        for x, y in zip(xs, ys):
            sent = CoNLLSentence()
            for idx, ((form, cpos), pred) in enumerate(zip(x, y)):
                head = [p[0] for p in pred]
                deprel = [p[1] for p in pred]
                if conll:
                    sent.append(CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel))
                else:
                    sent.append([head, deprel])
            sents.append(sent)
        return sents
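The inner generator in samples_to_dataset packs each token's (possibly multiple) heads and relations into dense square matrices before padding and batching. Below is a minimal, self-contained sketch of that packing step in plain numpy; the toy sentence and the rel_to_idx dict are invented for illustration and stand in for the transform's rel_vocab.

import numpy as np

# toy relation vocabulary standing in for self.rel_vocab
rel_to_idx = {'root': 0, 'nsubj': 1, 'obj': 2}

# one parsed sentence: (form, cpos, heads, rels) per token;
# token 0 is the inserted pseudo-root, masked later in the real pipeline
sent = [
    ('<bos>', '<bos>', [0], ['root']),
    ('dogs', 'NOUN', [2], ['nsubj']),
    ('bark', 'VERB', [0], ['root']),
]

n = len(sent)
arc = np.zeros((n, n), dtype=bool)      # arc[m, h] is True if token h heads token m
rel = np.zeros((n, n), dtype=np.int64)  # relation id attached to each arc
for m, (form, cpos, heads, rels) in enumerate(sent):
    for h, r in zip(heads, rels):
        arc[m, h] = True
        rel[m, h] = rel_to_idx[r]

print(arc.astype(int))
print(rel)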
Example #2
class TransformerTransform(TsvTaggingFormat, Transform):
    def __init__(self,
                 tokenizer=None,
                 config: SerializableDict = None,
                 map_x=False,
                 map_y=False,
                 **kwargs) -> None:
        super().__init__(config, map_x, map_y, **kwargs)
        self._tokenizer = tokenizer
        self.tag_vocab: Vocab = None
        self.special_token_ids = None

    @property
    def tokenizer(self):
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, tokenizer):
        self._tokenizer = tokenizer
        self.special_token_ids = tf.constant(
            [tokenizer.vocab[token] for token in ['[PAD]', '[CLS]', '[SEP]']],
            dtype=tf.int32)

    def fit(self, trn_path: str, **kwargs) -> int:
        self.tag_vocab = Vocab(unk_token=None)
        num_samples = 0
        for words, tags in self.file_to_inputs(trn_path, gold=True):
            num_samples += 1
            self.tag_vocab.update(tags)
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        max_seq_length = self.config.get('max_seq_length', 128)
        types = (tf.int32, tf.int32, tf.int32), tf.int32
        # (input_ids, input_mask, segment_ids), label_ids
        shapes = ([max_seq_length], [max_seq_length], [max_seq_length]), [None]
        values = (0, 0, 0), self.tag_vocab.pad_idx
        return types, shapes, values

    def lock_vocabs(self):
        super().lock_vocabs()

    def inputs_to_samples(self, inputs, gold=False):
        max_seq_length = self.config.get('max_seq_length', 128)
        tokenizer = self._tokenizer
        xlnet = False
        roberta = False
        pad_token = '[PAD]'
        cls_token = '[CLS]'
        sep_token = '[SEP]'
        unk_token = '[UNK]'

        pad_label_idx = self.tag_vocab.pad_idx
        pad_token = tokenizer.convert_tokens_to_ids([pad_token])[0]
        for sample in inputs:
            if gold:
                words, tags = sample
            else:
                words, tags = sample, [self.tag_vocab.pad_token] * len(sample)

            input_ids, input_mask, segment_ids, label_ids = convert_examples_to_features(
                words,
                tags,
                self.tag_vocab.token_to_idx,
                max_seq_length,
                tokenizer,
                cls_token_at_end=xlnet,
                # xlnet has a cls token at the end
                cls_token=cls_token,
                cls_token_segment_id=2 if xlnet else 0,
                sep_token=sep_token,
                sep_token_extra=roberta,
                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                pad_on_left=xlnet,
                # pad on the left for xlnet
                pad_token=pad_token,
                pad_token_segment_id=4 if xlnet else 0,
                pad_token_label_id=pad_label_idx,
                unk_token=unk_token)

            if None in input_ids:
                print(input_ids)
            if None in input_mask:
                print(input_mask)
            if None in segment_ids:
                print(segment_ids)
            yield (input_ids, input_mask, segment_ids), label_ids

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        raise NotImplementedError(
            'transformers has its own tagger, no need to convert idx for x')

    def y_to_idx(self, y) -> tf.Tensor:
        raise NotImplementedError(
            'transformers has its own tagger, no need to convert idx for y')

    def input_is_single_sample(
            self, input: Union[List[str], List[List[str]]]) -> bool:
        return isinstance(input[0], str)

    def Y_to_outputs(self,
                     Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                     gold=False,
                     X=None,
                     inputs=None,
                     **kwargs) -> Iterable:
        assert X is not None, 'Need the X to know actual length of Y'
        input_ids, input_mask, segment_ids = X

        mask = tf.reduce_all(tf.not_equal(tf.expand_dims(input_ids, axis=-1),
                                          self.special_token_ids),
                             axis=-1)
        Y = tf.argmax(Y, axis=-1)
        Y = Y[mask]
        tags = [self.tag_vocab.idx_to_token[tid] for tid in Y]
        offset = 0
        for words in inputs:
            yield tags[offset:offset + len(words)]
            offset += len(words)
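Y_to_outputs above recovers tags by masking out every position whose input id equals a special token id ([PAD], [CLS], [SEP]) and mapping the surviving argmax ids back through the tag vocabulary. A standalone sketch of just that masking trick follows; the ids, shapes and random logits are made up for illustration.

import tensorflow as tf

special_token_ids = tf.constant([0, 101, 102], dtype=tf.int32)  # e.g. [PAD], [CLS], [SEP]
input_ids = tf.constant([[101, 7592, 2088, 102, 0, 0]], dtype=tf.int32)
logits = tf.random.uniform([1, 6, 3])  # (batch, seq_len, n_tags)

# keep a position only if it differs from every special token id
mask = tf.reduce_all(
    tf.not_equal(tf.expand_dims(input_ids, axis=-1), special_token_ids),
    axis=-1)
tag_ids = tf.boolean_mask(tf.argmax(logits, axis=-1), mask)  # flat ids of real tokens
print(tag_ids.numpy())  # two ids, one per non-special position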
Example #3
 def __init__(self,
              filepath: str = None,
              vocab: Vocab = None,
              expand_vocab=True,
              lowercase=True,
              input_dim=None,
              output_dim=None,
              unk=None,
              normalize=False,
              embeddings_initializer='VarianceScaling',
              embeddings_regularizer=None,
              activity_regularizer=None,
              embeddings_constraint=None,
              mask_zero=True,
              input_length=None,
              name=None,
              **kwargs):
     filepath = get_resource(filepath)
     word2vec, _output_dim = load_word2vec(filepath)
     if output_dim:
         assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
     output_dim = _output_dim
     # if the `unk` token exists in the pretrained,
     # then replace it with a self-defined one, usually the one in word vocab
     if unk and unk in word2vec:
         word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
     if vocab is None:
         vocab = Vocab()
         vocab.update(word2vec.keys())
     if expand_vocab and vocab.mutable:
         for word in word2vec:
             vocab.get_idx(word.lower() if lowercase else word)
     if input_dim:
         assert input_dim == len(
             vocab), f'input_dim = {input_dim} does not match {filepath}'
     input_dim = len(vocab)
     # init matrix
     self._embeddings_initializer = embeddings_initializer
     embeddings_initializer = tf.keras.initializers.get(
         embeddings_initializer)
     with tf.device('cpu:0'):
         pret_embs = embeddings_initializer(
             shape=[input_dim, output_dim]).numpy()
     # insert to pret_embs
     for word, idx in vocab.token_to_idx.items():
         vec = word2vec.get(word, None)
         # Retry lower case
         if vec is None and lowercase:
             vec = word2vec.get(word.lower(), None)
         if vec is not None:
             pret_embs[idx] = vec
     if normalize:
         pret_embs /= np.std(pret_embs)
     if not name:
         name = os.path.splitext(os.path.basename(filepath))[0]
     super().__init__(input_dim,
                      output_dim,
                      tf.keras.initializers.Constant(pret_embs),
                      embeddings_regularizer,
                      activity_regularizer,
                      embeddings_constraint,
                      mask_zero,
                      input_length,
                      name=name,
                      **kwargs)
     self.filepath = filepath
     self.expand_vocab = expand_vocab
     self.lowercase = lowercase
Example #4
class TSVTaggingTransform(TsvTaggingFormat, Transform):
    def __init__(self,
                 config: SerializableDict = None,
                 map_x=True,
                 map_y=True,
                 use_char=False,
                 **kwargs) -> None:
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.word_vocab: Optional[Vocab] = None
        self.tag_vocab: Optional[Vocab] = None
        self.char_vocab: Optional[Vocab] = None

    def fit(self, trn_path: str, **kwargs) -> int:
        self.word_vocab = Vocab()
        self.tag_vocab = Vocab(pad_token=None, unk_token=None)
        num_samples = 0
        for words, tags in self.file_to_inputs(trn_path, True):
            self.word_vocab.update(words)
            self.tag_vocab.update(tags)
            num_samples += 1
        if self.char_vocab:
            self.char_vocab = Vocab()
            for word in self.word_vocab.token_to_idx.keys():
                if word in (self.word_vocab.pad_token,
                            self.word_vocab.unk_token):
                    continue
                self.char_vocab.update(list(word))
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = tf.string, tf.string
        shapes = [None], [None]
        values = self.word_vocab.pad_token, self.tag_vocab.first_token
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        lower = self.config.get('lower', False)
        if gold:
            if lower:
                for x, y in inputs:
                    yield [w.lower() for w in x], y
            else:
                yield from inputs
        else:
            for x in inputs:
                if lower:
                    x = [w.lower() for w in x]
                yield x, [self.padding_values[-1]] * len(x)

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.word_vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.tag_vocab.lookup(y)

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        for xs in X:
            words = []
            for x in xs:
                words.append(
                    str_tensor_to_str(x) if self.char_vocab
                    else self.word_vocab.idx_to_token[int(x)])
            yield words

    def Y_to_outputs(self,
                     Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                     gold=False,
                     inputs=None,
                     X=None,
                     **kwargs) -> Iterable:
        if not gold:
            Y = tf.argmax(Y, axis=2)
        for ys, xs in zip(Y, inputs):
            tags = []
            for y, x in zip(ys, xs):
                tags.append(self.tag_vocab.idx_to_token[int(y)])
            yield tags

    def input_is_single_sample(
            self, input: Union[List[str], List[List[str]]]) -> bool:
        return isinstance(input[0], str)

    def input_truth_output_to_str(self, input: List[str], truth: List[str],
                                  output: List[str]):
        text = ''
        for word, gold_tag, pred_tag in zip(input, truth, output):
            text += ' '.join([word, gold_tag, pred_tag]) + '\n'

        text += '\n'
        return text
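fit in this transform only accumulates word and tag vocabularies from the training file; at prediction time unseen words fall back to the unknown token. A dictionary-only sketch of that fit-then-lookup logic (plain Python; the special tokens and sample sentence are illustrative):

def fit(sentences):
    # sentences: iterable of (words, tags) pairs, as file_to_inputs yields
    word_vocab = {'<pad>': 0, '<unk>': 1}
    tag_vocab = {}
    for words, tags in sentences:
        for w in words:
            word_vocab.setdefault(w, len(word_vocab))
        for t in tags:
            tag_vocab.setdefault(t, len(tag_vocab))
    return word_vocab, tag_vocab

word_vocab, tag_vocab = fit([(['I', 'like', 'tea'], ['PRP', 'VBP', 'NN'])])
word_ids = [word_vocab.get(w, word_vocab['<unk>']) for w in ['I', 'love', 'tea']]
print(word_ids, [tag_vocab[t] for t in ['PRP', 'VBP', 'NN']])  # 'love' maps to <unk>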
Example #5
class TextTransform(Transform):

    def __init__(self,
                 forward=True,
                 seq_len=10,
                 tokenizer='char',
                 config: SerializableDict = None, map_x=True, map_y=True, **kwargs) -> None:
        super().__init__(config, map_x, map_y, seq_len=seq_len, tokenizer=tokenizer, forward=forward, **kwargs)
        self.vocab: Vocab = None

    def tokenize_func(self):
        if self.config.tokenizer == 'char':
            return list
        elif self.config.tokenizer == 'whitespace':
            return lambda x: x.split()
        else:
            return lambda x: x.split(self.config.tokenizer)

    def fit(self, trn_path: str, **kwargs) -> int:
        self.vocab = Vocab()
        num_samples = 0
        for x, y in self.file_to_inputs(trn_path):
            self.vocab.update(x)
            num_samples += 1
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = tf.string, tf.string
        shapes = [None], [None]
        defaults = self.vocab.pad_token, self.vocab.pad_token
        return types, shapes, defaults

    def file_to_inputs(self, filepath: str, gold=True):
        forward = self.config.forward
        seq_len = self.config.seq_len
        buffer = []
        tokenizer = self.tokenize_func()
        with open(filepath, encoding='utf-8') if forward else FileReadBackwards(filepath, encoding="utf-8") as src:
            for line in src:
                tokens = tokenizer(line)
                buffer += tokens
                while len(buffer) > seq_len:
                    yield buffer[:seq_len], buffer[1:1 + seq_len]
                    buffer.pop(0)

    def inputs_to_samples(self, inputs, gold=False):
        forward = self.config.forward
        for t in inputs:
            if gold:
                x, y = t
            else:
                x, y = t, t
            if not forward:
                x = list(reversed(x))
                y = list(reversed(y))
            yield x, y

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.x_to_idx(y)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, **kwargs) -> Iterable:
        pred = tf.argmax(Y, axis=-1)
        for ys, ms in zip(pred, inputs):
            ret = []
            for y in ys:
                ret.append(self.vocab.idx_to_token[int(y)])
            yield ret

    def input_is_single_sample(self, input: Any) -> bool:
        return isinstance(input[0], str)
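file_to_inputs slides a seq_len-sized window over the token stream and pairs it with the same window shifted one step ahead, the usual next-token language-modelling setup. The buffering logic in isolation (pure Python, toy input):

def sliding_samples(tokens, seq_len):
    # yield (x, y) where y is x shifted one token ahead, as the transform does
    buffer = []
    for token in tokens:
        buffer.append(token)
        while len(buffer) > seq_len:
            yield buffer[:seq_len], buffer[1:1 + seq_len]
            buffer.pop(0)

for x, y in sliding_samples(list('language'), seq_len=3):
    print(x, y)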