def fit(self, trn_path: str, **kwargs) -> int:
    self.vocab = VocabTF()
    num_samples = 0
    for x, y in self.file_to_inputs(trn_path):
        self.vocab.update(x)
        num_samples += 1
    return num_samples
def _load(path, vocab, normalize=False) -> Tuple[VocabTF, Union[np.ndarray, None]]:
    if not vocab:
        vocab = VocabTF()
    if not path:
        return vocab, None
    assert vocab.unk_idx is not None
    word2vec, dim = load_word2vec(path)
    for word in word2vec:
        vocab.get_idx(word)
    pret_embs = np.zeros(shape=(len(vocab), dim), dtype=np.float32)
    # Fix the seed locally and restore the previous RNG state afterwards.
    state = np.random.get_state()
    np.random.seed(0)
    # `bias` and `scale` are only used by the commented-out initializations below.
    bias = np.random.uniform(low=-0.001, high=0.001, size=dim).astype(dtype=np.float32)
    scale = np.sqrt(3.0 / dim)
    for word, idx in vocab.token_to_idx.items():
        vec = word2vec.get(word, None)
        if vec is None:
            vec = word2vec.get(word.lower(), None)
        # if vec is not None:
        #     vec += bias
        if vec is None:
            # vec = np.random.uniform(-scale, scale, [dim])
            vec = np.zeros([dim], dtype=np.float32)
        pret_embs[idx] = vec
    # noinspection PyTypeChecker
    np.random.set_state(state)
    return vocab, pret_embs
def __init__(self, path: str = None, vocab: VocabTF = None, normalize: bool = False, load_all=True, mask_zero=True,
             trainable=False, name=None, dtype=None, dynamic=False, **kwargs):
    super().__init__(trainable, name, dtype, dynamic, **kwargs)
    if load_all and vocab and vocab.locked:
        vocab.unlock()
    self.vocab, self.array_np = self._load(path, vocab, normalize)
    self.vocab.lock()
    self.array_ks = tf.keras.layers.Embedding(input_dim=len(self.vocab), output_dim=self.dim, trainable=trainable,
                                              embeddings_initializer=tf.keras.initializers.Constant(self.array_np),
                                              mask_zero=mask_zero)
    self.mask_zero = mask_zero
    self.supports_masking = mask_zero
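# Standalone sketch (illustrative values only, not part of the original module) of the
# Keras embedding created above: a constant 3x2 matrix with mask_zero=True, so index 0
# (the pad id) produces a masked timestep for downstream layers.
import numpy as np
import tensorflow as tf

toy_matrix = np.arange(6, dtype=np.float32).reshape(3, 2)
toy_layer = tf.keras.layers.Embedding(input_dim=3, output_dim=2, trainable=False,
                                      embeddings_initializer=tf.keras.initializers.Constant(toy_matrix),
                                      mask_zero=True)
toy_ids = tf.constant([[1, 2, 0]])
print(toy_layer(toy_ids).numpy())               # rows 1, 2 and 0 of toy_matrix
print(toy_layer.compute_mask(toy_ids).numpy())  # [[ True  True False]]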
def load_vocabs(self, save_dir, filename='vocabs.json'):
    save_dir = get_resource(save_dir)
    vocabs = SerializableDict()
    vocabs.load_json(os.path.join(save_dir, filename))
    for key, value in vocabs.items():
        vocab = VocabTF()
        vocab.copy_from(value)
        setattr(self.transform, key, vocab)
def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=False, **kwargs) -> None:
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.token_vocab = VocabTF()
    self.pos_vocab = VocabTF(pad_token=None, unk_token=None)
    self.ner_vocab = VocabTF(pad_token=None)
    self.deprel_vocab = VocabTF(pad_token=None, unk_token=None)
    self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
def __init__(self, filepath: str = None, vocab: VocabTF = None, expand_vocab=True, lowercase=False, input_dim=None,
             output_dim=None, unk=None, normalize=False, embeddings_initializer='VarianceScaling',
             embeddings_regularizer=None, activity_regularizer=None, embeddings_constraint=None, mask_zero=True,
             input_length=None, name=None, **kwargs):
    if vocab is None:
        vocab = VocabTF()
    self.vocab = vocab
    super().__init__(filepath, vocab, expand_vocab, lowercase, input_dim, output_dim, unk, normalize,
                     embeddings_initializer, embeddings_regularizer, activity_regularizer, embeddings_constraint,
                     mask_zero, input_length, name, **kwargs)
def vocab_from_txt(txt_file_path, bigram_only=False, window_size=4, **kwargs) -> Tuple[VocabTF, VocabTF, VocabTF]:
    char_vocab, ngram_vocab, tag_vocab = VocabTF(), VocabTF(), VocabTF(pad_token=None, unk_token=None)
    for X, Y in generate_ngram_bmes(txt_file_path, bigram_only, window_size, gold=True):
        # X[0] holds the characters; X[1:] holds the n-gram feature sequences
        char_vocab.update(X[0])
        for ngram in X[1:]:
            ngram_vocab.update(filter(lambda x: x, ngram))
        tag_vocab.update(Y)
    return char_vocab, ngram_vocab, tag_vocab
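# Hedged usage sketch: the corpus path below is hypothetical. vocab_from_txt reads a
# gold-segmented text file and returns character, n-gram and BMES tag vocabularies,
# which can then be frozen before training.
char_vocab, ngram_vocab, tag_vocab = vocab_from_txt('data/cws/train.txt', window_size=4)
for v in (char_vocab, ngram_vocab, tag_vocab):
    v.lock()
print(len(char_vocab), len(ngram_vocab), len(tag_vocab))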
def fit(self, trn_path: str, **kwargs) -> int:
    self.word_vocab = VocabTF()
    self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
    num_samples = 0
    for words, tags in self.file_to_inputs(trn_path, True):
        self.word_vocab.update(words)
        self.tag_vocab.update(tags)
        num_samples += 1
    if self.char_vocab:
        self.char_vocab = VocabTF()
        for word in self.word_vocab.token_to_idx.keys():
            if word in (self.word_vocab.pad_token, self.word_vocab.unk_token):
                continue
            self.char_vocab.update(list(word))
    return num_samples
def __init__(self, filepath: str = None, vocab: VocabTF = None, expand_vocab=True, lowercase=True, input_dim=None,
             output_dim=None, unk=None, normalize=False, embeddings_initializer='VarianceScaling',
             embeddings_regularizer=None, activity_regularizer=None, embeddings_constraint=None, mask_zero=True,
             input_length=None, name=None, cpu=True, **kwargs):
    filepath = get_resource(filepath)
    word2vec, _output_dim = load_word2vec(filepath)
    if output_dim:
        assert output_dim == _output_dim, f'output_dim = {output_dim} does not match {filepath}'
    output_dim = _output_dim
    # If the `unk` token exists in the pretrained embeddings, replace it with a
    # self-defined one, usually the unk token of the word vocab.
    if unk and unk in word2vec:
        word2vec[vocab.safe_unk_token] = word2vec.pop(unk)
    if vocab is None:
        vocab = VocabTF()
        vocab.update(word2vec.keys())
    if expand_vocab and vocab.mutable:
        for word in word2vec:
            vocab.get_idx(word.lower() if lowercase else word)
    if input_dim:
        assert input_dim == len(vocab), f'input_dim = {input_dim} does not match {filepath}'
    input_dim = len(vocab)
    # Initialize the embedding matrix (placed on CPU when cpu=True).
    self._embeddings_initializer = embeddings_initializer
    embeddings_initializer = tf.keras.initializers.get(embeddings_initializer)
    with tf.device('cpu:0') if cpu else DummyContext():
        pret_embs = embeddings_initializer(shape=[input_dim, output_dim]).numpy()
    # Copy pretrained vectors into pret_embs; words without a vector keep their initialized rows.
    for word, idx in vocab.token_to_idx.items():
        vec = word2vec.get(word, None)
        # Retry lower case
        if vec is None and lowercase:
            vec = word2vec.get(word.lower(), None)
        if vec is not None:
            pret_embs[idx] = vec
    if normalize:
        pret_embs /= np.std(pret_embs)
    if not name:
        name = os.path.splitext(os.path.basename(filepath))[0]
    super().__init__(input_dim, output_dim, tf.keras.initializers.Constant(pret_embs), embeddings_regularizer,
                     activity_regularizer, embeddings_constraint, mask_zero, input_length, name=name, **kwargs)
    self.filepath = filepath
    self.expand_vocab = expand_vocab
    self.lowercase = lowercase
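# Standalone sketch of the row-filling loop above with a toy embedding table; the
# dictionary and index mapping here are illustrative only. Words without a pretrained
# vector keep their randomly initialized rows, and the lowercase retry recovers case
# mismatches such as 'Cat' -> 'cat'.
import numpy as np

toy_word2vec = {'cat': np.array([0.1, 0.2], dtype=np.float32)}
toy_token_to_idx = {'<pad>': 0, '<unk>': 1, 'Cat': 2}
toy_embs = np.random.uniform(-0.1, 0.1, size=(len(toy_token_to_idx), 2)).astype(np.float32)
for word, idx in toy_token_to_idx.items():
    vec = toy_word2vec.get(word)
    if vec is None:
        vec = toy_word2vec.get(word.lower())  # retry lower case, as in the loop above
    if vec is not None:
        toy_embs[idx] = vec
print(toy_embs)  # row 2 now equals the pretrained vector for 'cat'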
class TextTransform(Transform):

    def __init__(self, forward=True, seq_len=10, tokenizer='char', config: SerializableDict = None, map_x=True,
                 map_y=True, **kwargs) -> None:
        super().__init__(config, map_x, map_y, seq_len=seq_len, tokenizer=tokenizer, forward=forward, **kwargs)
        self.vocab: VocabTF = None

    def tokenize_func(self):
        if self.config.tokenizer == 'char':
            return list
        elif self.config.tokenizer == 'whitespace':
            return lambda x: x.split()
        else:
            return lambda x: x.split(self.config.tokenizer)

    def fit(self, trn_path: str, **kwargs) -> int:
        self.vocab = VocabTF()
        num_samples = 0
        for x, y in self.file_to_inputs(trn_path):
            self.vocab.update(x)
            num_samples += 1
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = tf.string, tf.string
        shapes = [None], [None]
        defaults = self.vocab.pad_token, self.vocab.pad_token
        return types, shapes, defaults

    def file_to_inputs(self, filepath: str, gold=True):
        forward = self.config.forward
        seq_len = self.config.seq_len
        buffer = []
        tokenizer = self.tokenize_func()
        with open(filepath, encoding='utf-8') if forward else FileReadBackwards(filepath, encoding="utf-8") as src:
            for line in src:
                tokens = tokenizer(line)
                buffer += tokens
                while len(buffer) > seq_len:
                    yield buffer[:seq_len], buffer[1:1 + seq_len]
                    buffer.pop(0)

    def inputs_to_samples(self, inputs, gold=False):
        forward = self.config.forward
        for t in inputs:
            if gold:
                x, y = t
            else:
                x, y = t, t
            if not forward:
                x = list(reversed(x))
                y = list(reversed(y))
            yield x, y

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.x_to_idx(y)

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, **kwargs) -> Iterable:
        pred = tf.argmax(Y, axis=-1)
        for ys, ms in zip(pred, inputs):
            ret = []
            for y in ys:
                ret.append(self.vocab.idx_to_token[int(y)])
            yield ret

    def input_is_single_sample(self, input: Any) -> bool:
        return isinstance(input[0], str)
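# Isolated sketch of the sliding-window pairing performed by file_to_inputs above:
# every window of seq_len tokens is paired with the same window shifted right by one
# token, the usual next-token language-modeling target. Values are illustrative only.
toy_seq_len = 3
toy_buffer = list('abcdef')
toy_pairs = []
while len(toy_buffer) > toy_seq_len:
    toy_pairs.append((toy_buffer[:toy_seq_len], toy_buffer[1:1 + toy_seq_len]))
    toy_buffer.pop(0)
print(toy_pairs)  # [(['a','b','c'], ['b','c','d']), (['b','c','d'], ['c','d','e']), (['c','d','e'], ['d','e','f'])]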
class TSVTaggingTransform(TsvTaggingFormat, Transform):

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, use_char=False, **kwargs) -> None:
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.word_vocab: Optional[VocabTF] = None
        self.tag_vocab: Optional[VocabTF] = None
        self.char_vocab: Optional[VocabTF] = None

    def fit(self, trn_path: str, **kwargs) -> int:
        self.word_vocab = VocabTF()
        self.tag_vocab = VocabTF(pad_token=None, unk_token=None)
        num_samples = 0
        for words, tags in self.file_to_inputs(trn_path, True):
            self.word_vocab.update(words)
            self.tag_vocab.update(tags)
            num_samples += 1
        if self.char_vocab:
            self.char_vocab = VocabTF()
            for word in self.word_vocab.token_to_idx.keys():
                if word in (self.word_vocab.pad_token, self.word_vocab.unk_token):
                    continue
                self.char_vocab.update(list(word))
        return num_samples

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        types = tf.string, tf.string
        shapes = [None], [None]
        values = self.word_vocab.pad_token, self.tag_vocab.first_token
        return types, shapes, values

    def inputs_to_samples(self, inputs, gold=False):
        lower = self.config.get('lower', False)
        if gold:
            if lower:
                for x, y in inputs:
                    yield x.lower(), y
            else:
                yield from inputs
        else:
            for x in inputs:
                yield x.lower() if lower else x, [self.padding_values[-1]] * len(x)

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        return self.word_vocab.lookup(x)

    def y_to_idx(self, y) -> tf.Tensor:
        return self.tag_vocab.lookup(y)

    def X_to_inputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]]) -> Iterable:
        for xs in X:
            words = []
            for x in xs:
                words.append(str_tensor_to_str(x) if self.char_vocab else self.word_vocab.idx_to_token[int(x)])
            yield words

    def Y_to_outputs(self, Y: Union[tf.Tensor, Tuple[tf.Tensor]], gold=False, inputs=None, X=None,
                     **kwargs) -> Iterable:
        if not gold:
            Y = tf.argmax(Y, axis=2)
        for ys, xs in zip(Y, inputs):
            tags = []
            for y, x in zip(ys, xs):
                tags.append(self.tag_vocab.idx_to_token[int(y)])
            yield tags

    def input_is_single_sample(self, input: Union[List[str], List[List[str]]]) -> bool:
        return isinstance(input[0], str)

    def input_truth_output_to_str(self, input: List[str], truth: List[str], output: List[str]):
        text = ''
        for word, gold_tag, pred_tag in zip(input, truth, output):
            text += ' '.join([word, gold_tag, pred_tag]) + '\n'
        text += '\n'
        return text
def vocab_from_tsv(tsv_file_path, lower=False, lock_word_vocab=False, lock_char_vocab=True, lock_tag_vocab=True) \
        -> Tuple[VocabTF, VocabTF, VocabTF]:
    word_vocab = VocabTF()
    char_vocab = VocabTF()
    tag_vocab = VocabTF(unk_token=None)
    with open(tsv_file_path, encoding='utf-8') as tsv_file:
        for line in tsv_file:
            cells = line.strip().split()
            if cells:
                word, tag = cells
                if lower:
                    word_vocab.add(word.lower())
                else:
                    word_vocab.add(word)
                char_vocab.update(list(word))
                tag_vocab.add(tag)
    if lock_word_vocab:
        word_vocab.lock()
    if lock_char_vocab:
        char_vocab.lock()
    if lock_tag_vocab:
        tag_vocab.lock()
    return word_vocab, char_vocab, tag_vocab
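# Hedged usage sketch: the path below is hypothetical. vocab_from_tsv expects a
# whitespace-separated word/tag file (one token per line, blank lines between
# sentences) and returns word, character and tag vocabularies, with the character
# and tag vocabularies locked by default.
word_vocab, char_vocab, tag_vocab = vocab_from_tsv('data/pos/train.tsv', lower=True)
print(len(word_vocab), len(char_vocab), len(tag_vocab))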
class TACREDTransform(Transform):

    def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=False, **kwargs) -> None:
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.token_vocab = VocabTF()
        self.pos_vocab = VocabTF(pad_token=None, unk_token=None)
        self.ner_vocab = VocabTF(pad_token=None)
        self.deprel_vocab = VocabTF(pad_token=None, unk_token=None)
        self.rel_vocab = VocabTF(pad_token=None, unk_token=None)

    def fit(self, trn_path: str, **kwargs) -> int:
        count = 0
        for (tokens, pos, ner, head, deprel, subj_positions, obj_positions,
             subj_type, obj_type), relation in self.file_to_samples(trn_path, gold=True):
            count += 1
            self.token_vocab.update(tokens)
            self.pos_vocab.update(pos)
            self.ner_vocab.update(ner)
            self.deprel_vocab.update(deprel)
            self.rel_vocab.add(relation)
        return count

    def file_to_inputs(self, filepath: str, gold=True):
        data = load_json(filepath)
        for d in data:
            tokens = list(d['token'])
            ss, se = d['subj_start'], d['subj_end']
            os, oe = d['obj_start'], d['obj_end']
            pos = d['stanford_pos']
            ner = d['stanford_ner']
            deprel = d['stanford_deprel']
            head = [int(x) for x in d['stanford_head']]
            assert any([x == 0 for x in head])
            relation = d['relation']
            yield (tokens, pos, ner, head, deprel, ss, se, os, oe), relation

    def inputs_to_samples(self, inputs, gold=False):
        for input in inputs:
            if gold:
                (tokens, pos, ner, head, deprel, ss, se, os, oe), relation = input
            else:
                tokens, pos, ner, head, deprel, ss, se, os, oe = input
                relation = self.rel_vocab.safe_pad_token
            l = len(tokens)
            subj_positions = get_positions(ss, se, l)
            obj_positions = get_positions(os, oe, l)
            subj_type = ner[ss]
            obj_type = ner[os]
            # anonymize tokens: replace the subject and object spans with NER-typed placeholders
            tokens[ss:se + 1] = ['SUBJ-' + subj_type] * (se - ss + 1)
            tokens[os:oe + 1] = ['OBJ-' + obj_type] * (oe - os + 1)
            # heads are 1-based with 0 denoting the root; the root is not a token, so shift every head down by one
            head = [h - 1 for h in head]
            yield (tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type), relation

    def create_types_shapes_values(self) -> Tuple[Tuple, Tuple, Tuple]:
        # (tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type), relation
        types = (tf.string, tf.string, tf.string, tf.int32, tf.string, tf.int32, tf.int32, tf.string,
                 tf.string), tf.string
        shapes = ([None], [None], [None], [None], [None], [None], [None], [], []), []
        pads = (self.token_vocab.safe_pad_token, self.pos_vocab.safe_pad_token, self.ner_vocab.safe_pad_token, 0,
                self.deprel_vocab.safe_pad_token, 0, 0, self.ner_vocab.safe_pad_token,
                self.ner_vocab.safe_pad_token), self.rel_vocab.safe_pad_token
        return types, shapes, pads

    def x_to_idx(self, x) -> Union[tf.Tensor, Tuple]:
        tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type = x
        tokens = self.token_vocab.lookup(tokens)
        pos = self.pos_vocab.lookup(pos)
        ner = self.ner_vocab.lookup(ner)
        deprel = self.deprel_vocab.lookup(deprel)
        subj_type = self.ner_vocab.lookup(subj_type)
        obj_type = self.ner_vocab.lookup(obj_type)
        return tokens, pos, ner, head, deprel, subj_positions, obj_positions, subj_type, obj_type

    def y_to_idx(self, y) -> tf.Tensor:
        return self.rel_vocab.lookup(y)
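# Isolated sketch of the entity anonymization step in inputs_to_samples above: the
# subject and object spans are overwritten with NER-typed placeholders before
# vocabulary lookup. The sentence, NER tags and spans below are illustrative only.
toy_tokens = ['Bill', 'Gates', 'founded', 'Microsoft', '.']
toy_ner = ['PERSON', 'PERSON', 'O', 'ORGANIZATION', 'O']
subj_s, subj_e, obj_s, obj_e = 0, 1, 3, 3
toy_tokens[subj_s:subj_e + 1] = ['SUBJ-' + toy_ner[subj_s]] * (subj_e - subj_s + 1)
toy_tokens[obj_s:obj_e + 1] = ['OBJ-' + toy_ner[obj_s]] * (obj_e - obj_s + 1)
print(toy_tokens)  # ['SUBJ-PERSON', 'SUBJ-PERSON', 'founded', 'OBJ-ORGANIZATION', '.']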