class CoNLL_SDP_Transform(CoNLLTransform):

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True, n_buckets=32,
                 n_tokens_per_batch=5000, min_freq=2, **kwargs) -> None:
        super().__init__(config, map_x, map_y, lower, n_buckets, n_tokens_per_batch, min_freq, **kwargs)
        self.orphan_relation = ROOT

    def lock_vocabs(self):
        super().lock_vocabs()
        # heuristic to find the orphan relation
        for rel in self.rel_vocab.idx_to_token:
            if 'root' in rel.lower():
                self.orphan_relation = rel
                break

    def file_to_inputs(self, filepath: str, gold=True):
        assert gold, 'Only gold files are supported for now'
        for i, sent in enumerate(read_conll(filepath)):
            prev_cells = None
            parsed_sent = []
            heads = []
            rels = []
            for j, cell in enumerate(sent):
                ID = cell[0]
                form = cell[1]
                cpos = cell[3]
                head = cell[6]
                deprel = cell[7]
                if prev_cells and ID != prev_cells[0]:  # found end of token
                    parsed_sent.append([prev_cells[1], prev_cells[2], heads, rels])
                    heads = []
                    rels = []
                heads.append(head)
                rels.append(deprel)
                prev_cells = [ID, form, cpos, head, deprel]
            parsed_sent.append([prev_cells[1], prev_cells[2], heads, rels])
            yield parsed_sent

    def fit(self, trn_path: str, **kwargs) -> int:
        self.form_vocab = Vocab()
        self.form_vocab.add(ROOT)  # make root the 2nd element while 0th is pad, 1st is unk
        self.cpos_vocab = Vocab(pad_token=None, unk_token=None)
        self.rel_vocab = Vocab(pad_token=None, unk_token=None)
        num_samples = 0
        counter = Counter()
        for sent in self.file_to_samples(trn_path, gold=True):
            num_samples += 1
            for idx, (form, cpos, head, deprel) in enumerate(sent):
                if idx == 0:
                    root = form
                else:
                    counter[form] += 1
                self.cpos_vocab.add(cpos)
                self.rel_vocab.update(deprel)

        for token in [token for token, freq in counter.items() if freq >= self.config.min_freq]:
            self.form_vocab.add(token)
        return num_samples

    def inputs_to_samples(self, inputs, gold=False):
        for sent in inputs:
            sample = []
            if self.config['lower']:
                for i, cell in enumerate(sent):
                    cell = list(sent[i])
                    cell[0] = cell[0].lower()
                    if not gold:
                        cell += [[0], [self.rel_vocab.safe_pad_token]]
                    sample.append(cell)
            # insert a root word with arbitrary fields; it will be masked out anyway
            form, cpos, head, deprel = sample[0]
            sample.insert(0, [self.bos, self.bos, [0], deprel])
            yield sample

    def samples_to_dataset(self, samples: Generator, map_x=None, map_y=None, batch_size=5000, shuffle=None,
                           repeat=None, drop_remainder=False, prefetch=1, cache=True) -> tf.data.Dataset:
        def generator():
            # custom bucketing: load the corpus into memory
            corpus = list(x for x in (samples() if callable(samples) else samples))
            lengths = [1 + len(i) for i in corpus]
            if len(corpus) < 32:
                n_buckets = 1
            else:
                n_buckets = min(self.config.n_buckets, len(corpus))
            buckets = dict(zip(*kmeans(lengths, n_buckets)))
            sizes, buckets = zip(*[
                (size, bucket) for size, bucket in buckets.items()
            ])
            # the number of chunks in each bucket, clipped to the range [1, len(bucket)]
            chunks = [min(len(bucket), max(round(size * len(bucket) / batch_size), 1))
                      for size, bucket in zip(sizes, buckets)]
            range_fn = randperm if shuffle else arange
            for i in tolist(range_fn(len(buckets))):
                split_sizes = [(len(buckets[i]) - j - 1) // chunks[i] + 1
                               for j in range(chunks[i])]
                for batch_indices in tf.split(range_fn(len(buckets[i])), split_sizes):
                    indices = [buckets[i][j] for j in tolist(batch_indices)]
                    raw_batch = [[], [], [], []]
                    max_len = len(max([corpus[i] for i in indices], key=len))
                    for idx in indices:
                        arc = np.zeros((max_len, max_len), dtype=bool)
                        rel = np.zeros((max_len, max_len), dtype=np.int64)
                        for b in raw_batch[:2]:
                            b.append([])
                        for m, cells in enumerate(corpus[idx]):
                            for b, c, v in zip(raw_batch, cells, [self.form_vocab, self.cpos_vocab]):
                                b[-1].append(v.get_idx_without_add(c))
                            for n, r in zip(cells[2], cells[3]):
                                arc[m, n] = True
                                rid = self.rel_vocab.get_idx_without_add(r)
                                if rid is None:
                                    logger.warning(f'Relation OOV: {r} does not exist in training data')
                                    continue
                                rel[m, n] = rid
                        raw_batch[-2].append(arc)
                        raw_batch[-1].append(rel)
                    batch = []
                    for b, v in zip(raw_batch, [self.form_vocab, self.cpos_vocab]):
                        b = tf.keras.preprocessing.sequence.pad_sequences(b, padding='post',
                                                                          value=v.safe_pad_token_idx,
                                                                          dtype='int64')
                        batch.append(b)
                    batch += raw_batch[2:]
                    assert len(batch) == 4
                    yield (batch[0], batch[1]), (batch[2], batch[3])

        return super().samples_to_dataset(generator, False, False, 0, False, repeat, drop_remainder, prefetch, cache)

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = (tf.int64, tf.int64), (tf.bool, tf.int64)
        shapes = ([None, None], [None, None]), ([None, None, None], [None, None, None])
        values = (self.form_vocab.safe_pad_token_idx, self.cpos_vocab.safe_pad_token_idx), (
            False, self.rel_vocab.safe_pad_token_idx)
        return types, shapes, values

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None) -> Iterable:
        arc_preds, rel_preds, mask = Y
        sents = []

        for arc_sent, rel_sent, length in zip(arc_preds, rel_preds,
                                              tf.math.count_nonzero(mask, axis=-1)):
            sent = []
            for arc, rel in zip(tolist(arc_sent[1:, 1:]), tolist(rel_sent[1:, 1:])):
                ar = []
                for idx, (a, r) in enumerate(zip(arc, rel)):
                    if a:
                        ar.append((idx + 1, self.rel_vocab.idx_to_token[r]))
                if not ar:
                    # orphan token: attach it to root with the orphan relation
                    ar.append((0, self.orphan_relation))
                sent.append(ar)
            sents.append(sent)

        return sents

    def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                             gold=False, inputs=None, conll=True) -> Iterable:
        (words, feats, mask), (arc_preds, rel_preds) = X, Y
        xs = inputs
        ys = self.Y_to_outputs((arc_preds, rel_preds, mask))
        sents = []
        for x, y in zip(xs, ys):
            sent = CoNLLSentence()
            for idx, ((form, cpos), pred) in enumerate(zip(x, y)):
                head = [p[0] for p in pred]
                deprel = [p[1] for p in pred]
                if conll:
                    sent.append(CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel))
                else:
                    sent.append([head, deprel])
            sents.append(sent)
        return sents
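
# Usage sketch (an assumption, not library code): the hypothetical file path and the call
# sequence below illustrate how the transform above is typically driven. `fit`, `lock_vocabs`,
# `file_to_samples` and `samples_to_dataset` are defined above or inherited from CoNLLTransform.
if __name__ == '__main__':
    transform = CoNLL_SDP_Transform()
    transform.fit('data/sdp/train.conllu')  # hypothetical path; builds form/cpos/rel vocabs
    transform.lock_vocabs()                 # also resolves the orphan relation heuristically
    dataset = transform.samples_to_dataset(
        lambda: transform.file_to_samples('data/sdp/train.conllu', gold=True),
        batch_size=5000, shuffle=True)
    (forms, cpos), (arcs, rels) = next(iter(dataset))
    print(forms.shape, arcs.shape)          # [batch, len] and [batch, len, len]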
class TransformerTransform(TsvTaggingFormat, Transform):

    def __init__(self,
                 tokenizer=None,
                 config: SerializableDict = None,
                 map_x=False, map_y=False, **kwargs) -> None:
        super().__init__(config, map_x, map_y, **kwargs)
        self._tokenizer = tokenizer
        self.tag_vocab: Vocab = None
        self.special_token_ids = None

    @property
    def tokenizer(self):
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, tokenizer):
        self._tokenizer = tokenizer
        self.special_token_ids = tf.constant(
            [tokenizer.vocab[token] for token in ['[PAD]', '[CLS]', '[SEP]']],
            dtype=tf.int32)

    def fit(self, trn_path: str, **kwargs) -> int:
        self.tag_vocab = Vocab(unk_token=None)
        num_samples = 0
        for words, tags in self.file_to_inputs(trn_path, gold=True):
            num_samples += 1
            self.tag_vocab.update(tags)
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        max_seq_length = self.config.get('max_seq_length', 128)
        # (input_ids, input_mask, segment_ids), label_ids
        types = (tf.int32, tf.int32, tf.int32), tf.int32
        shapes = ([max_seq_length], [max_seq_length], [max_seq_length]), [None]
        values = (0, 0, 0), self.tag_vocab.pad_idx
        return types, shapes, values

    def lock_vocabs(self):
        super().lock_vocabs()

    def inputs_to_samples(self, inputs, gold=False):
        max_seq_length = self.config.get('max_seq_length', 128)
        tokenizer = self._tokenizer
        xlnet = False
        roberta = False
        pad_token = '[PAD]'
        cls_token = '[CLS]'
        sep_token = '[SEP]'
        unk_token = '[UNK]'
        pad_label_idx = self.tag_vocab.pad_idx
        pad_token = tokenizer.convert_tokens_to_ids([pad_token])[0]
        for sample in inputs:
            if gold:
                words, tags = sample
            else:
                words, tags = sample, [self.tag_vocab.pad_token] * len(sample)
            input_ids, input_mask, segment_ids, label_ids = convert_examples_to_features(
                words,
                tags,
                self.tag_vocab.token_to_idx,
                max_seq_length,
                tokenizer,
                cls_token_at_end=xlnet,  # xlnet has a cls token at the end
                cls_token=cls_token,
                cls_token_segment_id=2 if xlnet else 0,
                sep_token=sep_token,
                # roberta uses an extra separator b/w pairs of sentences,
                # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                sep_token_extra=roberta,
                pad_on_left=xlnet,  # pad on the left for xlnet
                pad_token=pad_token,
                pad_token_segment_id=4 if xlnet else 0,
                pad_token_label_id=pad_label_idx,
                unk_token=unk_token)
            if None in input_ids:
                print(input_ids)
            if None in input_mask:
                print(input_mask)
            if None in segment_ids:
                print(segment_ids)
            yield (input_ids, input_mask, segment_ids), label_ids

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        raise NotImplementedError(
            'The transformer tokenizer handles x itself; no need to convert x to idx')

    def y_to_idx(self, y) -> tf.Tensor:
        raise NotImplementedError(
            'The transformer tokenizer handles y itself; no need to convert y to idx')

    def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
        return isinstance(input[0], str)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, X=None, inputs=None,
                     **kwargs) -> Iterable:
        assert X is not None, 'Need X to know the actual length of Y'
        input_ids, input_mask, segment_ids = X
        mask = tf.reduce_all(tf.not_equal(tf.expand_dims(input_ids, axis=-1), self.special_token_ids), axis=-1)
        Y = tf.argmax(Y, axis=-1)
        Y = Y[mask]
        tags = [self.tag_vocab.idx_to_token[tid] for tid in Y]
        offset = 0
        for words in inputs:
            yield tags[offset:offset + len(words)]
            offset += len(words)
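
# Usage sketch (an assumption, not library code): the setter above expects a wordpiece
# tokenizer exposing `.vocab` and `.convert_tokens_to_ids`, e.g. `BertTokenizer` from the
# `transformers` package. Assigning it caches the ids of [PAD], [CLS] and [SEP] so that
# Y_to_outputs can mask them out later.
if __name__ == '__main__':
    from transformers import BertTokenizer

    transform = TransformerTransform()
    transform.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    print(transform.special_token_ids.numpy())  # ids of [PAD], [CLS], [SEP]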
def __init__(self, filepath: str = None, vocab: Vocab = None, expand_vocab=True, lowercase=True,
             input_dim=None, output_dim=None, unk=None, normalize=False,
             embeddings_initializer='VarianceScaling',
             embeddings_regularizer=None,
             activity_regularizer=None, embeddings_constraint=None, mask_zero=True, input_length=None,
             name=None, **kwargs):
    filepath = get_resource(filepath)
    word2vec, _output_dim = load_word2vec(filepath)
    if output_dim:
        assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
    output_dim = _output_dim
    # if the `unk` token exists in the pretrained embeddings, replace it with a
    # self-defined one, usually the one in the word vocab
    if unk and unk in word2vec:
        word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
    if vocab is None:
        vocab = Vocab()
        vocab.update(word2vec.keys())
    if expand_vocab and vocab.mutable:
        for word in word2vec:
            vocab.get_idx(word.lower() if lowercase else word)
    if input_dim:
        assert input_dim == len(vocab), f'input_dim = {input_dim} does not match {filepath}'
    input_dim = len(vocab)
    # init matrix
    self._embeddings_initializer = embeddings_initializer
    embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
    with tf.device('cpu:0'):
        pret_embs = embeddings_initializer(shape=[input_dim, output_dim]).numpy()
    # copy pretrained vectors into pret_embs
    for word, idx in vocab.token_to_idx.items():
        vec = word2vec.get(word, None)
        # retry lower case
        if vec is None and lowercase:
            vec = word2vec.get(word.lower(), None)
        if vec is not None:
            pret_embs[idx] = vec
    if normalize:
        pret_embs /= np.std(pret_embs)
    if not name:
        name = os.path.splitext(os.path.basename(filepath))[0]
    super().__init__(input_dim, output_dim, tf.keras.initializers.Constant(pret_embs),
                     embeddings_regularizer,
                     activity_regularizer, embeddings_constraint, mask_zero, input_length,
                     name=name,
                     **kwargs)
    self.filepath = filepath
    self.expand_vocab = expand_vocab
    self.lowercase = lowercase
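
# Standalone sketch (illustrative only, not part of the layer above) of the matrix-building
# step: start from a randomly initialised [len(vocab), dim] matrix, overwrite the rows that
# have a pretrained vector (falling back to the lowercased form), then optionally scale by
# the standard deviation. The toy `word2vec` dict and `token_to_idx` mapping are hypothetical.
def _pretrained_matrix_sketch(normalize=True):
    import numpy as np
    import tensorflow as tf

    word2vec = {'apple': np.array([1., 0.]), 'banana': np.array([0., 1.])}  # toy pretrained vectors
    token_to_idx = {'<pad>': 0, '<unk>': 1, 'Apple': 2, 'banana': 3}        # toy vocab
    init = tf.keras.initializers.get('VarianceScaling')
    pret_embs = init(shape=[len(token_to_idx), 2]).numpy()
    for word, idx in token_to_idx.items():
        vec = word2vec.get(word)
        if vec is None:
            vec = word2vec.get(word.lower())  # retry lower case, mirroring the loop above
        if vec is not None:
            pret_embs[idx] = vec
    if normalize:
        pret_embs /= np.std(pret_embs)        # the `normalize=True` branch
    return pret_embs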
class TSVTaggingTransform(TsvTaggingFormat, Transform):

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, use_char=False,
                 **kwargs) -> None:
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.word_vocab: Optional[Vocab] = None
        self.tag_vocab: Optional[Vocab] = None
        self.char_vocab: Optional[Vocab] = None

    def fit(self, trn_path: str, **kwargs) -> int:
        self.word_vocab = Vocab()
        self.tag_vocab = Vocab(pad_token=None, unk_token=None)
        num_samples = 0
        for words, tags in self.file_to_inputs(trn_path, True):
            self.word_vocab.update(words)
            self.tag_vocab.update(tags)
            num_samples += 1
        if self.char_vocab:
            self.char_vocab = Vocab()
            for word in self.word_vocab.token_to_idx.keys():
                if word in (self.word_vocab.pad_token, self.word_vocab.unk_token):
                    continue
                self.char_vocab.update(list(word))
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = tf.string, tf.string
        shapes = [None], [None]
        values = self.word_vocab.pad_token, self.tag_vocab.first_token
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        lower = self.config.get('lower', False)
        if gold:
            if lower:
                for x, y in inputs:
                    yield x.lower(), y
            else:
                yield from inputs
        else:
            for x in inputs:
                yield x.lower() if lower else x, [self.padding_values[-1]] * len(x)

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.word_vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.tag_vocab.lookup(y)

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        for xs in X:
            words = []
            for x in xs:
                words.append(str_tensor_to_str(x) if self.char_vocab else self.word_vocab.idx_to_token[int(x)])
            yield words

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
                     **kwargs) -> Iterable:
        if not gold:
            Y = tf.argmax(Y, axis=2)
        for ys, xs in zip(Y, inputs):
            tags = []
            for y, x in zip(ys, xs):
                tags.append(self.tag_vocab.idx_to_token[int(y)])
            yield tags

    def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
        return isinstance(input[0], str)

    def input_truth_output_to_str(self, input: List[str], truth: List[str], output: List[str]):
        text = ''
        for word, gold_tag, pred_tag in zip(input, truth, output):
            text += ' '.join([word, gold_tag, pred_tag]) + '\n'
        text += '\n'
        return text
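
# Usage sketch (an assumption, not library code): TsvTaggingFormat is assumed here to read
# one "token<TAB>tag" pair per line with blank lines separating sentences, so a tiny
# temporary file is enough to fit the vocabularies.
if __name__ == '__main__':
    import os
    import tempfile

    sents = [[('I', 'PRP'), ('love', 'VBP'), ('parsing', 'NN')],
             [('It', 'PRP'), ('works', 'VBZ')]]
    with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False, encoding='utf-8') as f:
        for sent in sents:
            for word, tag in sent:
                f.write(f'{word}\t{tag}\n')
            f.write('\n')
    transform = TSVTaggingTransform()
    num_samples = transform.fit(f.name)
    print(num_samples, len(transform.word_vocab), len(transform.tag_vocab))
    os.unlink(f.name)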
class TextTransform(Transform):

    def __init__(self,
                 forward=True,
                 seq_len=10,
                 tokenizer='char',
                 config: SerializableDict = None, map_x=True, map_y=True, **kwargs) -> None:
        super().__init__(config, map_x, map_y, seq_len=seq_len, tokenizer=tokenizer, forward=forward, **kwargs)
        self.vocab: Vocab = None

    def tokenize_func(self):
        if self.config.tokenizer == 'char':
            return list
        elif self.config.tokenizer == 'whitespace':
            return lambda x: x.split()
        else:
            return lambda x: x.split(self.config.tokenizer)

    def fit(self, trn_path: str, **kwargs) -> int:
        self.vocab = Vocab()
        num_samples = 0
        for x, y in self.file_to_inputs(trn_path):
            self.vocab.update(x)
            num_samples += 1
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = tf.string, tf.string
        shapes = [None], [None]
        defaults = self.vocab.pad_token, self.vocab.pad_token
        return types, shapes, defaults

    def file_to_inputs(self, filepath: str, gold=True):
        forward = self.config.forward
        seq_len = self.config.seq_len
        buffer = []
        tokenizer = self.tokenize_func()
        with open(filepath, encoding='utf-8') if forward else FileReadBackwards(filepath, encoding='utf-8') as src:
            for line in src:
                tokens = tokenizer(line)
                buffer += tokens
                while len(buffer) > seq_len:
                    yield buffer[:seq_len], buffer[1:1 + seq_len]
                    buffer.pop(0)

    def inputs_to_samples(self, inputs, gold=False):
        forward = self.config.forward
        for t in inputs:
            if gold:
                x, y = t
            else:
                x, y = t, t
            if not forward:
                x = list(reversed(x))
                y = list(reversed(y))
            yield x, y

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.x_to_idx(y)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, **kwargs) -> Iterable:
        pred = tf.argmax(Y, axis=-1)
        for ys, ms in zip(pred, inputs):
            ret = []
            for y in ys:
                ret.append(self.vocab.idx_to_token[int(y)])
            yield ret

    def input_is_single_sample(self, input: Any) -> bool:
        return isinstance(input[0], str)
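
# Minimal standalone sketch (illustrative only) of the sliding-window pairing performed by
# TextTransform.file_to_inputs above: every window of `seq_len` tokens becomes x, and the
# same window shifted right by one token becomes y, which is the usual language-model setup.
def _sliding_lm_pairs(tokens, seq_len=10):
    buffer = []
    for token in tokens:
        buffer.append(token)
        while len(buffer) > seq_len:
            yield buffer[:seq_len], buffer[1:1 + seq_len]
            buffer.pop(0)

# e.g. next(_sliding_lm_pairs(list('hello world'), seq_len=4))
# -> (['h', 'e', 'l', 'l'], ['e', 'l', 'l', 'o'])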