import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from bert.tokenization import FullTokenizer


class BERTVectorizer:
    def __init__(self, sess, bert_model_hub_path):
        self.sess = sess
        self.bert_model_hub_path = bert_model_hub_path
        self.create_tokenizer_from_hub_module()

    def create_tokenizer_from_hub_module(self):
        # Get the vocabulary file and casing information directly from the
        # BERT TF Hub module.
        bert_module = hub.Module(self.bert_model_hub_path)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        vocab_file, do_lower_case = self.sess.run([
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ])
        self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)
        # print(self.tokenizer.tokenize('hello world!'))  # ['hello', 'world', '!']

    def tokenize(self, text: str):
        # Whitespace-split the sentence, then WordPiece-tokenize each word,
        # marking which subword positions begin a word.
        # text:  add leah kauffman to my uncharted 4 nathan drake playlist
        # words: ['add', 'leah', 'kauffman', 'to', 'my', 'uncharted', '4', 'nathan', 'drake', 'playlist']
        words = text.split()
        tokens = []
        # tokens: ['add', 'leah', 'ka', '##uf', '##fm', '##an', 'to', 'my', 'un', '##cha', '##rted', '4', 'nathan', 'drake', 'play', '##list']
        valid_positions = []
        # valid_positions: [1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0]
        for word in words:
            token = self.tokenizer.tokenize(word)
            tokens.extend(token)
            for i in range(len(token)):
                valid_positions.append(1 if i == 0 else 0)
        return tokens, valid_positions

    def transform(self, text_arr):
        input_ids = []
        input_mask = []
        segment_ids = []
        valid_positions = []
        for text in text_arr:
            ids, mask, seg_ids, valid_pos = self.__vectorize(text)
            input_ids.append(ids)
            input_mask.append(mask)
            segment_ids.append(seg_ids)
            valid_positions.append(valid_pos)
        sequence_length = np.array([len(i) for i in input_ids])
        # Pad (or truncate) every sequence to a fixed maximum length of 50.
        input_ids = tf.keras.preprocessing.sequence.pad_sequences(
            input_ids, maxlen=50, truncating='post', padding='post')
        input_mask = tf.keras.preprocessing.sequence.pad_sequences(
            input_mask, maxlen=50, truncating='post', padding='post')
        segment_ids = tf.keras.preprocessing.sequence.pad_sequences(
            segment_ids, maxlen=50, truncating='post', padding='post')
        valid_positions = tf.keras.preprocessing.sequence.pad_sequences(
            valid_positions, maxlen=50, truncating='post', padding='post')
        return input_ids, input_mask, segment_ids, valid_positions, sequence_length

    def __vectorize(self, text: str):
        tokens, valid_positions = self.tokenize(text)
        # Insert the first token "[CLS]".
        tokens.insert(0, '[CLS]')
        valid_positions.insert(0, 1)
        # Append the last token "[SEP]".
        tokens.append('[SEP]')
        valid_positions.append(1)
        # tokens:          ['[CLS]', 'add', 'leah', 'ka', '##uf', '##fm', '##an', 'to', 'my', 'un', '##cha', '##rted', '4', 'nathan', 'drake', 'play', '##list', '[SEP]']
        # valid_positions: [1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1]
        #
        # (a) For sequence pairs:
        #   tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #   type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #   tokens:   [CLS] the dog is hairy . [SEP]
        #   type_ids: 0     0   0   0  0     0 0
        # The "type_ids" indicate whether a token belongs to the first or the
        # second sequence. The embedding vectors for type=0 and type=1 were
        # learned during pre-training and are added to the wordpiece embedding
        # vector (and position vector). This is not *strictly* necessary since
        # the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        segment_ids = [0] * len(tokens)
        # segment_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        # input_ids: [101, 5587, 14188, 10556, 16093, 16715, 2319, 2000, 2026,
        #             4895, 7507, 17724, 1018, 7150, 7867, 2377, 9863, 102]
        # The first id is always 101 ([CLS]) and the last is always 102 ([SEP]).
        input_mask = [1] * len(input_ids)
        # The mask has 1 for real tokens and 0 for padding tokens.
        # input_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
        return input_ids, input_mask, segment_ids, valid_positions
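# A minimal usage sketch, assuming TF 1.x graph mode and network access to
# TF Hub; the module URL and the sample utterance are illustrative, not part
# of the class above.
import tensorflow as tf

if __name__ == '__main__':
    with tf.Session() as sess:
        vectorizer = BERTVectorizer(
            sess, 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1')
        ids, mask, seg, valid, lengths = vectorizer.transform(
            ['add leah kauffman to my uncharted 4 nathan drake playlist'])
        print(ids.shape)  # (1, 50): every sequence padded/truncated to 50
        print(lengths)    # unpadded length of each input sequence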
from bert.tokenization import FullTokenizer
from sequence_transfer.sequence import CharSequence, TokenSequence
from sequence_transfer.magic_transfer import MagicTransfer

text = "She lives in Lindström, Minnesota"
tokenizer = FullTokenizer('../vocab.txt')
tokens = tokenizer.tokenize(text)

# 01 - Create the sequences
s1 = CharSequence.new(text)
s2 = TokenSequence.new(tokens)

# 02 - Create a magic transfer that tries to match "similar" subsequences
#      between s1 and s2
transfer = MagicTransfer(s1, s2)

# 03 - Use the transfer object to find the tokens that correspond to the
#      word `Lindström`
sub1 = s1[13, 22]  # `Lindström` in s1
sub2 = transfer.apply(sub1)
for token in sub2:
    print(token.text)

# 04 - Debug the transfer
transfer.debug()
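# Sanity check on the character offsets used above (an illustrative aside,
# not part of the original example): the span 13..22 covers exactly the word
# being transferred.
assert text[13:22] == "Lindström"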
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub


class BERTVectorizer:
    def __init__(
            self,
            sess,
            is_bert,
            # bert_model_hub_path='https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'
            bert_model_hub_path="https://tfhub.dev/google/albert_base/1"):
        self.sess = sess
        self.is_bert = is_bert
        self.bert_model_hub_path = bert_model_hub_path
        self.create_tokenizer_from_hub_module(is_bert=is_bert)

    def create_tokenizer_from_hub_module(self, is_bert):
        """Get the vocab file and casing info from the Hub module."""
        bert_module = hub.Module(self.bert_model_hub_path)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        vocab_file, do_lower_case = self.sess.run([
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ])
        if is_bert:
            from bert.tokenization import FullTokenizer
            self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            # The ALBERT tokenizer additionally needs the SentencePiece model file.
            from opennlu.services.tensorflow_JointBERT.vectorizers.albert_tokenization import FullTokenizer
            self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case,
                                           spm_model_file=vocab_file)

    def tokenize(self, text: str):
        words = text.split()  # whitespace tokenizer
        tokens = []
        valid_positions = []
        for word in words:
            token = self.tokenizer.tokenize(word)
            tokens.extend(token)
            for i in range(len(token)):
                valid_positions.append(1 if i == 0 else 0)
        return tokens, valid_positions

    def transform(self, text_arr):
        input_ids = []
        input_mask = []
        segment_ids = []
        valid_positions = []
        for text in text_arr:
            ids, mask, seg_ids, valid_pos = self.__vectorize(text)
            input_ids.append(ids)
            input_mask.append(mask)
            segment_ids.append(seg_ids)
            valid_positions.append(valid_pos)
        sequence_lengths = np.array([len(i) for i in input_ids])
        # Pad every sequence to the length of the longest one in the batch.
        input_ids = tf.keras.preprocessing.sequence.pad_sequences(
            input_ids, padding='post')
        input_mask = tf.keras.preprocessing.sequence.pad_sequences(
            input_mask, padding='post')
        segment_ids = tf.keras.preprocessing.sequence.pad_sequences(
            segment_ids, padding='post')
        valid_positions = tf.keras.preprocessing.sequence.pad_sequences(
            valid_positions, padding='post')
        return input_ids, input_mask, segment_ids, valid_positions, sequence_lengths

    def __vectorize(self, text: str):
        tokens, valid_positions = self.tokenize(text)
        # Insert "[CLS]" at the start and "[SEP]" at the end.
        tokens.insert(0, '[CLS]')
        valid_positions.insert(0, 1)
        tokens.append('[SEP]')
        valid_positions.append(1)
        segment_ids = [0] * len(tokens)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        return input_ids, input_mask, segment_ids, valid_positions
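# An illustrative sketch of the is_bert switch (inputs are hypothetical):
# with is_bert=False the default ALBERT hub module and its SentencePiece
# tokenizer are used, and padding follows the longest sequence in the batch
# rather than a fixed maxlen of 50.
import tensorflow as tf

with tf.Session() as sess:
    vectorizer = BERTVectorizer(sess, is_bert=False)
    ids, mask, seg, valid, lengths = vectorizer.transform(
        ['play some jazz', 'add this song to my running playlist'])
    print(ids.shape, lengths)  # second dim == max(lengths), not 50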
import numpy as np
import tensorflow as tf
from bert.tokenization import FullTokenizer, convert_to_unicode
# BertConfig, make_bert_graph, SentenceEmbeddingEvaluator and
# visualize_self_attention_scores are project-local helpers imported from
# the surrounding codebase.


class BERTEmbeddingEvaluator(SentenceEmbeddingEvaluator):
    def __init__(
            self,
            model_fname="/notebooks/embedding/data/sentence-embeddings/bert/tune-ckpt",
            bertconfig_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/bert_config.json",
            vocab_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/vocab.txt",
            max_seq_length=32,
            dimension=768,
            num_labels=2):
        super().__init__("bert", dimension)
        config = BertConfig.from_json_file(bertconfig_fname)
        self.max_seq_length = max_seq_length
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
        self.model, self.input_ids, self.input_mask, self.segment_ids, self.probs = make_bert_graph(
            config, max_seq_length, 1.0, num_labels, tune=False)
        saver = tf.train.Saver(tf.global_variables())
        self.sess = tf.Session()
        checkpoint_path = tf.train.latest_checkpoint(model_fname)
        saver.restore(self.sess, checkpoint_path)

    def predict(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        probs = self.sess.run(self.probs, model_input)
        return probs

    def get_token_vector_sequence(self, sentence):
        """Given a sentence, return its tokens and the token vector sequence.

        shape: [[# of tokens], [batch size, max seq length, dimension]]
        """
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        return [
            tokens,
            self.sess.run(self.model.get_sequence_output()[0],
                          model_input)[:len(tokens) + 2]
        ]

    def get_sentence_vector(self, sentence):
        """Given a sentence, return its tokens and the [CLS] vector.

        shape: [[# of tokens], [batch size, dimension]]
        """
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        return [
            tokens,
            self.sess.run(self.model.pooled_output, model_input)[0]
        ]

    def get_self_attention_score(self, sentence):
        """Given a sentence, return its tokens and the self-attention score matrix.

        shape: [[# of tokens], [batch size, # of tokens, # of tokens]]
        """
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        # raw_score shape: [# of layers, batch_size, num_attention_heads,
        #                   max_seq_length, max_seq_length]
        raw_score = self.sess.run(self.model.attn_probs_for_visualization_list,
                                  model_input)
        # Take the last layer, then sum over the attention heads (axis=0).
        scores = np.sum(raw_score[-1][0], axis=0)
        # Keep only as many rows/columns of the score matrix as there are tokens.
        scores = scores[:len(tokens), :len(tokens)]
        return [tokens, scores]

    def tokenize(self, sentence):
        return self.tokenizer.tokenize(convert_to_unicode(sentence))

    def make_input(self, tokens):
        tokens = tokens[:(self.max_seq_length - 2)]
        token_sequence = ["[CLS]"] + tokens + ["[SEP]"]
        segment = [0] * len(token_sequence)
        sequence = self.tokenizer.convert_tokens_to_ids(token_sequence)
        current_length = len(sequence)
        padding_length = self.max_seq_length - current_length
        input_feed = {
            self.input_ids: np.array([sequence + [0] * padding_length]),
            self.segment_ids: np.array([segment + [0] * padding_length]),
            self.input_mask: np.array([[1] * current_length + [0] * padding_length])
        }
        return input_feed

    def visualize_self_attention_scores(self, sentence, palette="Viridis256"):
        tokens, scores = self.get_self_attention_score(sentence)
        visualize_self_attention_scores(tokens, scores, palette)
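# A hedged usage sketch: it assumes the default paths above exist and point
# at a fine-tuned checkpoint plus the matching multilingual BERT config and
# vocab; the Korean sample sentence is illustrative.
model = BERTEmbeddingEvaluator()
tokens, vecs = model.get_token_vector_sequence("이 영화 정말 재밌다")
print(tokens)
print(vecs.shape)  # (len(tokens) + 2, 768): token vectors incl. [CLS]/[SEP]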
import tensorflow as tf
from bert.tokenization import FullTokenizer
# VOCAB_FILE, DO_LOWER_CASE and mapping() are module-level constants/helpers
# defined elsewhere in the source project.


def _int64_feature_list(values):
    """Wrap a list of ints as a tf.train.FeatureList of scalar Int64 features."""
    return tf.train.FeatureList(feature=[
        tf.train.Feature(int64_list=tf.train.Int64List(value=[v]))
        for v in values
    ])


def build_dataset(conll_file, tfrecord_file, pos2id, dep2id, path2id, truncate=False):
    max_len = 0
    tokenizer = FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)
    with open(conll_file, 'r') as reader:
        text = reader.read().strip()
    sentences = text.split('\n\n')
    tf_writer = tf.python_io.TFRecordWriter(tfrecord_file)
    for sent in sentences:
        # Per-sentence feature lists, initialised with the [CLS] position.
        subword_list = ["[CLS]"]
        span_list = [0]
        mask_list = [0]
        cue_list = [0]
        pos_list = [0]
        dep_list = [0]
        path_list = [0]
        lpath_list = [-1]
        cp_list = [-1]
        subword_id_list = tokenizer.convert_tokens_to_ids(["[CLS]"])
        for token in sent.split('\n'):
            if len(token) >= 8:  # skip blank/malformed lines
                token = token.split('\t')
                subword = tokenizer.tokenize(token[0])
                n = len(subword)
                # Replicate every token-level annotation onto each subword.
                span = [int(token[8])] * n
                cue = [int(token[7])] * n
                pos = [int(mapping(pos2id, token[2]))] * n
                dep = [int(mapping(dep2id, token[3]))] * n
                path = [int(mapping(path2id, token[4]))] * n
                lpath = [int(token[5])] * n
                cp = [int(token[6])] * n
                # Only the first subword of each token is marked in the mask.
                mask = [0] * n
                mask[0] = 1
                sub_id = tokenizer.convert_tokens_to_ids(subword)
                subword_list.extend(subword)
                mask_list.extend(mask)
                subword_id_list.extend(sub_id)
                pos_list.extend(pos)
                dep_list.extend(dep)
                path_list.extend(path)
                lpath_list.extend(lpath)
                cp_list.extend(cp)
                cue_list.extend(cue)
                span_list.extend(span)
        # Close the sentence with the [SEP] position.
        subword_list.append("[SEP]")
        span_list.append(0)
        cue_list.append(0)
        mask_list.append(0)
        subword_id_list.extend(tokenizer.convert_tokens_to_ids(["[SEP]"]))
        pos_list.append(0)
        dep_list.append(0)
        path_list.append(0)
        lpath_list.append(-1)
        cp_list.append(-1)
        assert len(subword_list) == len(span_list) == len(mask_list) == len(subword_id_list)
        max_len = max(max_len, len(subword_id_list))
        # Skip empty sentences; optionally drop sequences longer than 64.
        if len(subword_list) > 2 and ((not truncate) or len(subword_id_list) <= 64):
            # Write one tf.train.SequenceExample per sentence.
            feature_list = {
                'token_id': _int64_feature_list(subword_id_list),
                'span': _int64_feature_list(span_list),
                'masks': _int64_feature_list(mask_list),
                'cue': _int64_feature_list(cue_list),
                'pos': _int64_feature_list(pos_list),
                'dep': _int64_feature_list(dep_list),
                'path': _int64_feature_list(path_list),
                'lpath': _int64_feature_list(lpath_list),
                'cp': _int64_feature_list(cp_list),
            }
            context = tf.train.Features(feature={
                "length": tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[len(subword_id_list)])),
            })
            ex = tf.train.SequenceExample(
                context=context,
                feature_lists=tf.train.FeatureLists(feature_list=feature_list))
            tf_writer.write(ex.SerializeToString())
    tf_writer.close()
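# A hedged sketch of reading the records back (TF1 API, mirroring the writer
# above): each per-timestep feature was written as a scalar Int64, so
# FixedLenSequenceFeature([], tf.int64) recovers it; parse_example is a
# hypothetical helper name.
import tensorflow as tf


def parse_example(serialized):
    context_features = {
        "length": tf.FixedLenFeature([], dtype=tf.int64),
    }
    sequence_features = {
        name: tf.FixedLenSequenceFeature([], dtype=tf.int64)
        for name in ('token_id', 'span', 'masks', 'cue',
                     'pos', 'dep', 'path', 'lpath', 'cp')
    }
    context, sequence = tf.parse_single_sequence_example(
        serialized,
        context_features=context_features,
        sequence_features=sequence_features)
    return context["length"], sequence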
from bert.tokenization import FullTokenizer
from sacremoses import MosesTokenizer
from sequence_transfer.sequence import TokenSequence
from sequence_transfer.magic_transfer import MagicTransfer

# 01 - Create the tokenizers
moses_tokenizer = MosesTokenizer('en')
bert_tokenizer = FullTokenizer('../vocab.txt')

# 02 - Create tokens
text = "She lives in Lindström, Minnesota"
moses_tokens = moses_tokenizer.tokenize(text)
bert_tokens = bert_tokenizer.tokenize(text)

# 03 - Create sequences
s1 = TokenSequence.new(moses_tokens)
s2 = TokenSequence.new(bert_tokens)

# 04 - Create the transfer
transfer = MagicTransfer(s1, s2)

# 05 - Use the transfer object to find the BERT tokens that correspond to
#      the Moses token at index 3 (`Lindström`)
sub1 = s1[3]  # `Lindström`
sub2 = transfer.apply(sub1)
for token in sub2:
    print(token.text)

# 06 - Debug the transfer
transfer.debug()
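# Sanity check (an illustrative aside, not part of the original example):
# sacremoses splits the comma into its own token, so index 3 is the word we
# want to transfer.
assert moses_tokens[3] == 'Lindström'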