class BertTokenizer:
    """Turn raw sentences into fixed-length BERT token-id sequences.

    The vocabulary file and the lower-casing flag are read from the TF-Hub
    BERT layer at ``bert_path`` and handed to the tokenizer class.
    """

    def __init__(self, bert_path, tokenizer_cls=FullTokenizer, maxlen=512):
        """Load tokenization assets from the hub layer and build the tokenizer.

        Args:
            bert_path: Handle or path given to ``hub.KerasLayer``.
            tokenizer_cls: Callable invoked as
                ``tokenizer_cls(vocab_file, do_lower_case)``.
            maxlen: Default length of every produced id sequence.
        """
        self.maxlen = maxlen
        bert_layer = hub.KerasLayer(bert_path, trainable=True)
        vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
        do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
        # Build through the injectable class so callers can swap tokenizers.
        self.tokenizer = tokenizer_cls(vocab_file, do_lower_case)

    def convert_sentences_to_ids(self, sentences):
        """Return an array with one padded id row per sentence."""
        return np.array(
            [self.convert_single_sentence_to_ids(s) for s in sentences])

    def convert_single_sentence_to_ids(self, sentence):
        """Encode one sentence as ``[CLS] tokens [SEP] [PAD]...`` ids.

        The sentence is truncated so that the result never exceeds
        ``self.maxlen`` entries; otherwise rows of different length would
        make ``convert_sentences_to_ids`` produce a ragged array.
        """
        tokens = self.tokenize(sentence)[:max(self.maxlen - 2, 0)]
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        tokens += (self.maxlen - len(tokens)) * ['[PAD]']
        return self.tokenizer.convert_tokens_to_ids(tokens)

    def convert_two_sentence_to_ids(self, sent1, sent2, maxlen=None,
                                    return_tokens=False):
        """Encode a sentence pair as ``[CLS] s1 [SEP] s2 [SEP] [PAD]...``.

        Args:
            sent1: First sentence.
            sent2: Second sentence; truncated first when the pair is too long.
            maxlen: Target length; falls back to ``self.maxlen`` when falsy.
            return_tokens: When true, also return the (possibly truncated)
                token lists of both sentences.

        Returns:
            ``ids`` or ``(tokens1, tokens2, ids)`` when ``return_tokens``.
        """
        if not maxlen:
            maxlen = self.maxlen
        # Three slots are reserved for [CLS] and the two [SEP] markers.
        budget = max(maxlen - 3, 0)
        # Cap the first sentence too: a negative slice bound on the second
        # sentence would otherwise keep tokens and overflow ``maxlen``.
        tokens1 = self.tokenize(sent1)[:budget]
        tokens2 = self.tokenize(sent2)[:budget - len(tokens1)]
        tokens = ['[CLS]'] + tokens1 + ['[SEP]'] + tokens2 + ['[SEP]']
        tokens += (maxlen - len(tokens)) * ['[PAD]']
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if return_tokens:
            return tokens1, tokens2, ids
        return ids

    def convert_sentence_to_features(self, sent1, sent2, maxlen=None):
        """Return ``(token_ids, input_mask, segment_ids)`` for a sentence pair.

        Segment ids are 0 for ``[CLS] s1 [SEP]``, 1 for ``s2 [SEP]`` and 0 for
        padding; the mask is 1 for real tokens and 0 for padding.
        """
        if not maxlen:
            maxlen = self.maxlen
        tokens1, tokens2, token_ids = self.convert_two_sentence_to_ids(
            sent1, sent2, maxlen, return_tokens=True)
        segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
        input_mask = [1] * len(segment_ids)
        segment_ids += (maxlen - len(segment_ids)) * [0]
        input_mask += (maxlen - len(input_mask)) * [0]
        return token_ids, input_mask, segment_ids

    def tokenize(self, sent):
        """Split ``sent`` into tokens with the wrapped tokenizer."""
        return self.tokenizer.tokenize(sent)
def predict_input_fn_generator(input_file_or_list, config: Params, mode='predict'):
    """Yield one feature dict per input document for prediction.

    Args:
        input_file_or_list: Path to a UTF-8 text file (one document per line)
            or an iterable of documents.
        config: Provides ``vocab_file`` and ``max_seq_len``.
        mode: Unused; kept for interface compatibility.

    Yields:
        A new dict with keys ``input_ids``, ``input_mask`` and
        ``segment_ids`` for every document.
    """
    # A string is treated as a path; close the file as soon as it is read.
    if isinstance(input_file_or_list, str):
        with open(input_file_or_list, 'r', encoding='utf8') as f:
            inputs = f.readlines()
    else:
        inputs = input_file_or_list

    tokenizer = FullTokenizer(config.vocab_file)

    for doc in inputs:
        # Each document is split into single characters before tokenization.
        inputs_a = list(doc)
        tokens, target = tokenize_text_with_seqs(tokenizer, inputs_a, None)

        tokens_a, tokens_b, target = truncate_seq_pair(
            tokens, None, target, config.max_seq_len)

        tokens, segment_ids, target = add_special_tokens_with_seqs(
            tokens_a, tokens_b, target)

        input_mask, tokens, segment_ids, target = create_mask_and_padding(
            tokens, segment_ids, target, config.max_seq_len)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Yield a fresh dict each time so consumers that keep the yielded
        # items do not end up with aliases of one mutated object.
        yield {
            'input_ids': input_ids,
            'input_mask': input_mask,
            'segment_ids': segment_ids,
        }
def prepare_train_dataset(input_file, output_data_dir, output_filename,
                          sliding_window_size, config, tokenizer=None,
                          vocab_file=None, language="english",
                          max_doc_length: int = None, is_training=True,
                          demo=False, lowercase=False):
    """Convert a CoNLL file into a TFRecord file plus a document-key map.

    Writes ``<output_filename>.<language>.tfrecord`` and
    ``<output_filename>.<language>.map`` (JSON: document index -> doc key)
    into ``output_data_dir``.

    Args:
        input_file: Path passed to ``read_conll_file``.
        output_data_dir: Directory receiving both output files.
        output_filename: Stem of the output file names.
        sliding_window_size: Window size passed to
            ``convert_to_sliding_window``.
        config: Mapping read for ``"max_training_sentences"`` and forwarded
            to the tokenization and writing helpers.
        tokenizer: Optional ready-made tokenizer; built from ``vocab_file``
            when omitted.
        vocab_file: Optional vocabulary path; defaults to the repository's
            upper- or lower-case vocabulary depending on ``lowercase``.
        language: Language tag used in file names and document parsing.
        max_doc_length: Forwarded to ``tokenize_document``.
        is_training: Flag stored in every written instance.
        demo: When true, stop after the document with index 4.
        lowercase: Selects the vocabulary and the tokenizer's casing mode.
    """
    if vocab_file is None:
        vocab_name = "lowercase_vocab.txt" if lowercase else "uppercase_vocab.txt"
        vocab_file = os.path.join(REPO_PATH, "data_utils", vocab_name)

    if tokenizer is None:
        tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=lowercase)

    record_path = os.path.join(
        output_data_dir, "{}.{}.tfrecord".format(output_filename, language))
    doc_map = {}

    # The context manager closes the writer so buffered records reach disk
    # even when a document raises part-way through.
    with tf.python_io.TFRecordWriter(record_path) as writer:
        documents = read_conll_file(input_file)
        for doc_idx, document in enumerate(documents):
            doc_info = parse_document(document, language)
            tokenized_document = tokenize_document(
                config, doc_info, tokenizer, max_doc_length=max_doc_length)
            doc_key = tokenized_document['doc_key']

            token_windows, mask_windows, text_len = convert_to_sliding_window(
                tokenized_document, sliding_window_size)
            input_id_windows = [
                tokenizer.convert_tokens_to_ids(tokens)
                for tokens in token_windows
            ]
            span_start, span_end, _mention_span, cluster_ids = flatten_clusters(
                tokenized_document['clusters'])

            # Speaker ids are written as an all-zero placeholder; every row
            # is its own list so later mutation cannot leak across rows.
            tmp_speaker_ids = [
                [0] * 130 for _ in range(config["max_training_sentences"])
            ]

            instance = (input_id_windows, mask_windows, text_len,
                        tmp_speaker_ids, tokenized_document["genre"],
                        is_training, span_start, span_end, cluster_ids,
                        tokenized_document['sentence_map'])
            write_instance_to_example_file(writer, instance, doc_key, config)
            doc_map[doc_idx] = doc_key

            if demo and doc_idx > 3:
                break

    map_path = os.path.join(
        output_data_dir, "{}.{}.map".format(output_filename, language))
    with open(map_path, 'w') as fo:
        json.dump(doc_map, fo, indent=2)
def prepare_training_data(data_dir: str, language: str, vocab_file: str,
                          sliding_window_size: int):
    """Write a TFRecord file and a doc-key map for the train/dev/test splits.

    For each split, ``<split>.<language>.v4_gold_conll`` is read from
    ``data_dir`` and ``<split>.<language>.tfrecord`` plus
    ``<split>.<language>.map`` (JSON: document index -> doc key) are written
    back to the same directory.

    Args:
        data_dir: Directory holding the CoNLL inputs and receiving outputs.
        language: Language tag used in the file names and document parsing.
        vocab_file: Vocabulary for the (case-preserving) tokenizer.
        sliding_window_size: Window size passed to
            ``convert_to_sliding_window``.
    """
    tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=False)

    for dataset in ['train', 'dev', 'test']:
        conll_file_path = os.path.join(
            data_dir, F"{dataset}.{language}.v4_gold_conll")
        record_path = os.path.join(data_dir, F"{dataset}.{language}.tfrecord")
        doc_map = {}

        # Close the writer per split so its records are flushed to disk.
        with tf.python_io.TFRecordWriter(record_path) as writer:
            documents = read_conll_file(conll_file_path)
            for doc_idx, document in enumerate(documents):
                doc_info = parse_document(document, language)
                checkout_clusters(doc_info)
                tokenized_document = tokenize_document(doc_info, tokenizer)
                doc_map[doc_idx] = tokenized_document['doc_key']

                token_windows, mask_windows = convert_to_sliding_window(
                    tokenized_document, sliding_window_size)
                input_id_windows = [
                    tokenizer.convert_tokens_to_ids(tokens)
                    for tokens in token_windows
                ]
                span_starts, span_ends, cluster_ids = flatten_clusters(
                    tokenized_document['clusters'])

                instance = (doc_idx,
                            tokenized_document['sentence_map'],
                            tokenized_document['subtoken_map'],
                            input_id_windows, mask_windows,
                            span_starts, span_ends, cluster_ids)
                write_instance_to_example_file(writer, instance)

        with open(os.path.join(data_dir, F"{dataset}.{language}.map"), 'w') as fo:
            json.dump(doc_map, fo, indent=2)
class Inferer:
    """Restore a checkpointed model and map text sequences to attribute values."""

    def __init__(self, checkpoint, attr_values_file, vocab_file):
        """Validate the input paths, build the graph and restore the weights.

        Args:
            checkpoint: Directory containing the TensorFlow checkpoint state.
            attr_values_file: Pickle file holding ``(attr_values, attr_values_r)``.
            vocab_file: Vocabulary file for the tokenizer.

        Raises:
            Exception: If a path does not exist or no checkpoint is found.
        """
        self.checkpoint = checkpoint
        self.attr_values_file = attr_values_file
        self.vocab_file = vocab_file

        if not os.path.exists(self.checkpoint):
            raise Exception("local checkpoint %s not exists" % self.checkpoint)
        if not os.path.exists(self.attr_values_file):
            raise Exception("local attr_values_file %s not exists" % self.attr_values_file)
        if not os.path.exists(self.vocab_file):
            raise Exception("local vocab_file %s not exists" % self.vocab_file)

        self.config = InferConfig()
        self.tokenizer = FullTokenizer(self.vocab_file)

        # NOTE: pickle.load can execute arbitrary code; only point this at
        # attribute files that come from a trusted source.
        with open(self.attr_values_file, 'rb') as fr:
            _attr_values, attr_values_r = pickle.load(fr)
        self.attr_values_r = attr_values_r
        self.config.output_dim = len(attr_values_r)

        self.graph = tf.Graph()
        with self.graph.as_default():
            seq_shape = [None, self.config.max_seq_length]
            self.input_ids_p = tf.placeholder(tf.int32, seq_shape)
            self.token_type_ids_p = tf.placeholder(tf.int32, seq_shape)
            self.input_mask_p = tf.placeholder(tf.int32, seq_shape)

            model = Model(self.config)
            self.inference = model.infer(
                self.input_ids_p, self.token_type_ids_p, self.input_mask_p)

            ckpt_state = tf.train.get_checkpoint_state(self.checkpoint)
            if not (ckpt_state and ckpt_state.model_checkpoint_path):
                raise Exception('No model to eval yet at: ' + self.checkpoint)

            self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
            saver = tf.train.Saver()
            saver.restore(self.sess, ckpt_state.model_checkpoint_path)

    def infer(self, sequences):
        """Return the attribute value predicted for every non-empty sequence.

        Empty strings are skipped, so the result can be shorter than
        ``sequences``. When nothing is left to score an empty list is
        returned instead of failing while unpacking the feature columns.
        """
        transforms = [self._transform(s) for s in sequences if s != '']
        if not transforms:
            return []

        input_ids, token_type_ids, input_mask = (
            list(column) for column in zip(*transforms))

        with self.graph.as_default():
            result = self.sess.run(self.inference, feed_dict={
                self.input_ids_p: input_ids,
                self.token_type_ids_p: token_type_ids,
                self.input_mask_p: input_mask,
            })

        return [self.attr_values_r[e] for e in result]

    def _transform(self, sequence):
        """Encode one sequence as ``(input_ids, token_type_ids, input_mask)``.

        Tokens are truncated to leave room for ``[CLS]`` and ``[SEP]``; all
        three lists are padded with zeros up to ``config.max_seq_length``.
        """
        max_len = self.config.max_seq_length

        tokens = self.tokenizer.tokenize(sequence)
        if len(tokens) > max_len - 2:
            tokens = tokens[0:max_len - 2]
        tokens = ['[CLS]'] + tokens + ['[SEP]']

        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        padding = [0] * (max_len - len(token_ids))

        input_ids_1 = token_ids[0:max_len] + padding
        token_type_ids_1 = [0] * max_len
        input_mask_1 = [1] * len(token_ids) + padding
        return input_ids_1, token_type_ids_1, input_mask_1
def test_compare(self):
    """Compare stock and Keras model outputs on a mini BERT checkpoint.

    Both predictions must agree within an absolute tolerance of 1e-6.
    """
    # Hold on to the TemporaryDirectory object so the directory exists for
    # the whole test and is removed afterwards.
    tmp_dir = tempfile.TemporaryDirectory()
    self.addCleanup(tmp_dir.cleanup)
    model_dir = tmp_dir.name

    MiniBertFactory.create_mini_bert_weights(model_dir)

    tokenizer = FullTokenizer(vocab_file=os.path.join(model_dir, "vocab.txt"),
                              do_lower_case=True)

    # prepare input
    max_seq_len = 16
    input_str = "hello, bert!"
    input_tokens = tokenizer.tokenize(input_str)
    input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]
    padding = [0] * (max_seq_len - len(input_tokens))

    input_ids = tokenizer.convert_tokens_to_ids(input_tokens) + padding
    input_mask = [1] * len(input_tokens) + padding
    token_type_ids = [0] * len(input_tokens) + padding

    # The models are fed a batch of size one.
    input_ids = np.array([input_ids], dtype=np.int32)
    input_mask = np.array([input_mask], dtype=np.int32)
    token_type_ids = np.array([token_type_ids], dtype=np.int32)

    print(" tokens:", input_tokens)
    print("input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len, input_ids),
          input_ids.shape, token_type_ids)

    bert_1_seq_out = CompareBertActivationsTest.predict_on_stock_model(
        model_dir, input_ids, input_mask, token_type_ids)
    bert_2_seq_out = CompareBertActivationsTest.predict_on_keras_model(
        model_dir, input_ids, input_mask, token_type_ids)

    np.set_printoptions(precision=9, threshold=20, linewidth=200,
                        sign="+", floatmode="fixed")

    print("stock bert res", bert_1_seq_out.shape)
    print("keras bert res", bert_2_seq_out.shape)

    print("stock bert res:\n {}".format(bert_1_seq_out[0, :2, :10]),
          bert_1_seq_out.dtype)
    print("keras bert_res:\n {}".format(bert_2_seq_out[0, :2, :10]),
          bert_2_seq_out.dtype)

    abs_diff = np.abs(bert_1_seq_out - bert_2_seq_out).flatten()
    print("abs diff:", np.max(abs_diff), np.argmax(abs_diff))
    self.assertTrue(np.allclose(bert_1_seq_out, bert_2_seq_out, atol=1e-6))
def test_finetune(self):
    """Smoke-test that the Keras mini BERT model can be compiled and fitted."""
    # Hold on to the TemporaryDirectory object so the directory exists for
    # the whole test and is removed afterwards.
    tmp_dir = tempfile.TemporaryDirectory()
    self.addCleanup(tmp_dir.cleanup)
    model_dir = tmp_dir.name

    MiniBertFactory.create_mini_bert_weights(model_dir)

    tokenizer = FullTokenizer(vocab_file=os.path.join(model_dir, "vocab.txt"),
                              do_lower_case=True)

    # prepare input
    max_seq_len = 24
    input_str_batch = ["hello, bert!", "how are you doing!"]

    input_ids_batch = []
    token_type_ids_batch = []
    for input_str in input_str_batch:
        input_tokens = tokenizer.tokenize(input_str)
        input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]
        print("input_tokens len:", len(input_tokens))

        padding = [0] * (max_seq_len - len(input_tokens))
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens) + padding
        token_type_ids = [0] * len(input_tokens) + padding

        input_ids_batch.append(input_ids)
        token_type_ids_batch.append(token_type_ids)

    input_ids = np.array(input_ids_batch, dtype=np.int32)
    token_type_ids = np.array(token_type_ids_batch, dtype=np.int32)

    # ``input_tokens`` here is the tokenization of the last sentence.
    print(" tokens:", input_tokens)
    print("input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len, input_ids),
          input_ids.shape, token_type_ids)

    model = CompareBertActivationsTest.load_keras_model(model_dir, max_seq_len)
    model.compile(optimizer=keras.optimizers.Adam(),
                  loss=keras.losses.mean_squared_error)

    # Predict once only to learn the output shape for the dummy targets.
    pres = model.predict([input_ids, token_type_ids])
    print("pres:", pres.shape)

    model.fit(x=(input_ids, token_type_ids),
              y=np.zeros_like(pres),
              batch_size=2,
              epochs=2)
def test_direct_keras_to_stock_compare(self):
    """Check that the stock and Keras predictions agree for one sentence.

    The two results must match within an absolute tolerance of 1e-6.
    """
    from tests.ext.modeling import BertModel, BertConfig, get_assignment_map_from_checkpoint

    bert_config = BertConfig.from_json_file(self.bert_config_file)

    vocab_path = os.path.join(self.bert_ckpt_dir, "vocab.txt")
    tokenizer = FullTokenizer(vocab_file=vocab_path)

    # prepare input
    max_seq_len = 6
    input_str = "Hello, Bert!"
    input_tokens = ["[CLS]"] + tokenizer.tokenize(input_str) + ["[SEP]"]
    n_tokens = len(input_tokens)
    zero_pad = [0] * (max_seq_len - n_tokens)

    # Every feature is wrapped in an outer list to form a batch of one.
    input_ids = np.array(
        [tokenizer.convert_tokens_to_ids(input_tokens) + zero_pad],
        dtype=np.int32)
    input_mask = np.array([[1] * n_tokens + zero_pad], dtype=np.int32)
    token_type_ids = np.array([[0] * n_tokens + zero_pad], dtype=np.int32)

    print(" tokens:", input_tokens)
    summary = "input_ids:{}/{}:{}".format(n_tokens, max_seq_len, input_ids)
    print(summary, input_ids.shape, token_type_ids)

    s_res = self.predict_on_stock_model(input_ids, input_mask, token_type_ids)
    k_res = self.predict_on_keras_model(input_ids, input_mask, token_type_ids)

    np.set_printoptions(precision=9, threshold=20, linewidth=200,
                        sign="+", floatmode="fixed")
    print("s_res", s_res.shape)
    print("k_res", k_res.shape)

    print("s_res:\n {}".format(s_res[0, :2, :10]), s_res.dtype)
    print("k_res:\n {}".format(k_res[0, :2, :10]), k_res.dtype)

    adiff = np.abs(s_res - k_res).flatten()
    print("diff:", np.max(adiff), np.argmax(adiff))

    self.assertTrue(np.allclose(s_res, k_res, atol=1e-6))
def tokenize_single_input(text, tokenizer: btk.FullTokenizer, max_input_length):
    """Encode ``text`` as zero-padded ids, mask and segment ids.

    A ``[CLS]`` marker is prepended; no ``[SEP]`` marker is appended.

    Args:
        text: Raw input text.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_input_length: Length of each returned list.

    Returns:
        ``(token_ids, token_masks, segment_ids)``; the mask is 1 for real
        tokens and 0 for padding, segment ids are all zero.

    Raises:
        ValueError: If the encoded text is longer than ``max_input_length``.
    """
    wordpieces = ['[CLS]', *tokenizer.tokenize(text)]
    token_ids = tokenizer.convert_tokens_to_ids(wordpieces)
    n_real = len(token_ids)

    if n_real > max_input_length:
        raise ValueError(
            'The input is %i while the maximum input can be only %i.' %
            (n_real, max_input_length))

    n_pad = max_input_length - n_real
    token_masks = [1] * n_real + [0] * n_pad
    token_ids.extend([0] * n_pad)
    segment_ids = [0] * max_input_length
    return token_ids, token_masks, segment_ids
def tokenize_data(input_str_batch, max_seq_len, model_dir):
    """Encode a batch of strings as fixed-length BERT ids and segment ids.

    Args:
        input_str_batch: Iterable of raw input strings.
        max_seq_len: Length of every returned row.
        model_dir: Directory containing ``vocab.txt``.

    Returns:
        ``(input_ids_batch, token_type_ids_batch)`` — two lists with one row
        of length ``max_seq_len`` per input string; segment ids are all zero.
    """
    tokenizer = FullTokenizer(vocab_file=os.path.join(model_dir, "vocab.txt"),
                              do_lower_case=True)
    # Two slots are reserved for the [CLS] and [SEP] markers.
    body_budget = max(max_seq_len - 2, 0)

    input_ids_batch = []
    token_type_ids_batch = []
    for input_str in input_str_batch:
        # Truncate before adding the markers; cutting the finished id list
        # instead would drop the trailing [SEP] of over-long inputs.
        input_tokens = tokenizer.tokenize(input_str)[:body_budget]
        input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]
        print("input_tokens len:", len(input_tokens))

        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_ids = input_ids + [0] * (max_seq_len - len(input_ids))
        # Final guard for degenerate ``max_seq_len`` values below two.
        input_ids = input_ids[:max_seq_len]
        token_type_ids = [0] * max_seq_len

        input_ids_batch.append(input_ids)
        token_type_ids_batch.append(token_type_ids)

    return input_ids_batch, token_type_ids_batch
def predict_input_fn(input_file_or_list, config: Params, mode='predict'):
    """Build a batched ``tf.data.Dataset`` of prediction features.

    Args:
        input_file_or_list: Path to a UTF-8 text file (one document per line)
            or an iterable of documents.
        config: Provides ``vocab_file``, ``max_seq_len`` and ``batch_size``.
        mode: Unused; kept for interface compatibility.

    Returns:
        A dataset of dicts with keys ``input_ids``, ``input_mask`` and
        ``segment_ids``, batched with ``config.batch_size * 2``.
    """
    # A string is treated as a path; close the file as soon as it is read.
    if isinstance(input_file_or_list, str):
        with open(input_file_or_list, 'r', encoding='utf8') as f:
            inputs = f.readlines()
    else:
        inputs = input_file_or_list

    tokenizer = FullTokenizer(config.vocab_file)

    data_dict = {
        'input_ids': [],
        'input_mask': [],
        'segment_ids': [],
    }

    for doc in inputs:
        # Each document is split into single characters before tokenization.
        inputs_a = list(doc)
        tokens, target = tokenize_text_with_seqs(tokenizer, inputs_a, None)

        tokens_a, tokens_b, target = truncate_seq_pair(
            tokens, None, target, config.max_seq_len)

        tokens, segment_ids, target = add_special_tokens_with_seqs(
            tokens_a, tokens_b, target)

        input_mask, tokens, segment_ids, target = create_mask_and_padding(
            tokens, segment_ids, target, config.max_seq_len)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        data_dict['input_ids'].append(input_ids)
        data_dict['input_mask'].append(input_mask)
        data_dict['segment_ids'].append(segment_ids)

    dataset = tf.data.Dataset.from_tensor_slices(data_dict)
    dataset = dataset.batch(config.batch_size * 2)
    return dataset
class BERTEmbeddingEvaluator(SentenceEmbeddingEvaluator):
    """Sentence-embedding evaluator backed by a fine-tuned BERT checkpoint."""

    def __init__(
            self,
            model_fname="/notebooks/embedding/data/sentence-embeddings/bert/tune-ckpt",
            bertconfig_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/bert_config.json",
            vocab_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/vocab.txt",
            max_seq_length=32,
            dimension=768,
            num_labels=2,
            use_notebook=False):
        """Build the BERT graph and restore the latest checkpoint.

        Args:
            model_fname: Directory searched for the latest checkpoint.
            bertconfig_fname: Path to the BERT config JSON file.
            vocab_fname: Path to the vocabulary file.
            max_seq_length: Fixed input length of the graph.
            dimension: Forwarded to the base evaluator.
            num_labels: Number of labels passed to ``make_bert_graph``.
            use_notebook: Forwarded to the base evaluator.
        """
        super().__init__("bert", dimension, use_notebook)
        bert_config = BertConfig.from_json_file(bertconfig_fname)
        self.max_seq_length = max_seq_length
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)

        graph_parts = make_bert_graph(
            bert_config, max_seq_length, 1.0, num_labels, tune=False)
        (self.model, self.input_ids, self.input_mask,
         self.segment_ids, self.probs) = graph_parts

        saver = tf.train.Saver(tf.global_variables())
        self.sess = tf.Session()
        saver.restore(self.sess, tf.train.latest_checkpoint(model_fname))

    def predict(self, sentence):
        """Return the class probabilities the model outputs for ``sentence``."""
        feed = self.make_input(self.tokenize(sentence))
        return self.sess.run(self.probs, feed)

    def get_token_vector_sequence(self, sentence):
        """Return the tokenization of ``sentence`` and its token vectors.

        The vectors are the first ``len(tokens) + 2`` rows of the first
        sequence output. The original note gives the shape as
        ``[[# of tokens], [batch size, max seq length, dimension]]``.
        """
        tokens = self.tokenize(sentence)
        feed = self.make_input(tokens)
        sequence_vectors = self.sess.run(
            self.model.get_sequence_output()[0], feed)
        return [tokens, sequence_vectors[:len(tokens) + 2]]

    def get_sentence_vector(self, sentence):
        """Return the tokenization of ``sentence`` and its pooled vector.

        The original note calls this the ``[CLS]`` vector and gives the shape
        as ``[[# of tokens], [batch size, dimension]]``.
        """
        tokens = self.tokenize(sentence)
        feed = self.make_input(tokens)
        pooled = self.sess.run(self.model.pooled_output, feed)
        return [tokens, pooled[0]]

    def get_self_attention_score(self, sentence):
        """Return the tokenization of ``sentence`` and a self-attention matrix.

        The original note gives the shape as
        ``[[# of tokens], [batch size, # of tokens, # of tokens]]``.
        """
        tokens = self.tokenize(sentence)
        feed = self.make_input(tokens)
        # raw_score : shape=[# of layers, batch_size, num_attention_heads,
        #                    max_seq_length, max_seq_length]
        raw_score = self.sess.run(
            self.model.attn_probs_for_visualization_list, feed)
        # Take the last layer, then sum over attention heads (axis=0).
        head_sum = np.sum(raw_score[-1][0], axis=0)
        # Keep as many rows/columns as there are tokens.
        # NOTE(review): index 0 presumably corresponds to [CLS], so this
        # window may be shifted by one relative to ``tokens`` — confirm.
        n_tokens = len(tokens)
        return [tokens, head_sum[:n_tokens, :n_tokens]]

    def tokenize(self, sentence):
        """Tokenize ``sentence`` after converting it to unicode."""
        return self.tokenizer.tokenize(convert_to_unicode(sentence))

    def make_input(self, tokens):
        """Build the feed dict (ids, segment ids, mask) for one token list.

        Tokens are cut to ``max_seq_length - 2`` so ``[CLS]`` and ``[SEP]``
        fit; every feature is zero-padded to ``max_seq_length`` and wrapped
        in a batch of one.
        """
        kept = tokens[:(self.max_seq_length - 2)]
        token_sequence = ["[CLS]"] + kept + ["[SEP]"]
        segment = [0] * len(token_sequence)
        sequence = self.tokenizer.convert_tokens_to_ids(token_sequence)

        current_length = len(sequence)
        zero_pad = [0] * (self.max_seq_length - current_length)

        return {
            self.input_ids: np.array([sequence + zero_pad]),
            self.segment_ids: np.array([segment + zero_pad]),
            self.input_mask: np.array([[1] * current_length + zero_pad]),
        }

    def visualize_self_attention_scores(self, sentence):
        """Plot the self-attention matrix of ``sentence``."""
        tokens, scores = self.get_self_attention_score(sentence)
        visualize_self_attention_scores(tokens, scores,
                                        use_notebook=self.use_notebook)
class BERTVectorizer:
    """Vectorize whitespace-split text with a TF-Hub BERT or ALBERT tokenizer."""

    def __init__(
            self,
            sess,
            is_bert,
            # bert_model_hub_path='https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'
            bert_model_hub_path="https://tfhub.dev/google/albert_base/1"):
        """Store the session and build the tokenizer from the hub module.

        Args:
            sess: TensorFlow session used to evaluate the tokenization info.
            is_bert: True for the BERT tokenizer, False for the ALBERT one.
            bert_model_hub_path: TF-Hub handle of the model.
        """
        self.sess = sess
        self.is_bert = is_bert
        self.bert_model_hub_path = bert_model_hub_path
        self.create_tokenizer_from_hub_module(is_bert=is_bert)

    def create_tokenizer_from_hub_module(self, is_bert):
        """Get the vocab file and casing info from the Hub module."""
        bert_module = hub.Module(self.bert_model_hub_path)
        tokenization_info = bert_module(signature="tokenization_info",
                                        as_dict=True)
        fetches = [
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ]
        vocab_file, do_lower_case = self.sess.run(fetches)

        if not is_bert:
            from vectorizers.albert_tokenization import FullTokenizer
            self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case,
                                           spm_model_file=vocab_file)
        else:
            from bert.tokenization import FullTokenizer
            self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)

    def tokenize(self, text: str):
        """Split ``text`` on whitespace and tokenize every word.

        Returns:
            ``(tokens, valid_positions)`` where ``valid_positions`` holds 1
            for the first piece of each word and 0 for the following pieces.
        """
        tokens = []
        valid_positions = []
        for word in text.split():  # whitespace tokenizer
            pieces = self.tokenizer.tokenize(word)
            tokens.extend(pieces)
            valid_positions.extend(
                int(piece_idx == 0) for piece_idx in range(len(pieces)))
        return tokens, valid_positions

    def transform(self, text_arr):
        """Vectorize every text and post-pad the features to a common length.

        Returns:
            ``(input_ids, input_mask, segment_ids, valid_positions,
            sequence_lengths)``; the lengths are measured before padding.
        """
        vectorized = [self.__vectorize(text) for text in text_arr]
        input_ids = [features[0] for features in vectorized]
        input_mask = [features[1] for features in vectorized]
        segment_ids = [features[2] for features in vectorized]
        valid_positions = [features[3] for features in vectorized]

        sequence_lengths = np.array([len(ids) for ids in input_ids])

        pad = tf.keras.preprocessing.sequence.pad_sequences
        input_ids = pad(input_ids, padding='post')
        input_mask = pad(input_mask, padding='post')
        segment_ids = pad(segment_ids, padding='post')
        valid_positions = pad(valid_positions, padding='post')
        return (input_ids, input_mask, segment_ids, valid_positions,
                sequence_lengths)

    def __vectorize(self, text: str):
        """Encode one text as ids, mask, segment ids and valid positions.

        ``[CLS]`` and ``[SEP]`` are added around the tokens and both are
        marked as valid positions.
        """
        body_tokens, body_valid = self.tokenize(text)
        tokens = ['[CLS]'] + body_tokens + ['[SEP]']
        valid_positions = [1] + body_valid + [1]

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        segment_ids = [0] * len(tokens)
        input_mask = [1] * len(input_ids)
        return input_ids, input_mask, segment_ids, valid_positions
class BERTModel:
    """Pretrained BERT wrapper returning per-word-piece embeddings."""

    def __init__(self):
        """Build the BERT graph from the global ``args`` and restore weights.

        Reads ``args.pretrain_models_path`` and ``args.bert_model_name``;
        lower-casing is enabled when the model name starts with ``uncased``.
        """
        bert_pretrained_dir = args.pretrain_models_path + args.bert_model_name
        self.do_lower_case = args.bert_model_name.startswith('uncased')
        self.vocab_file = os.path.join(bert_pretrained_dir, 'vocab.txt')
        self.config_file = os.path.join(bert_pretrained_dir, 'bert_config.json')
        self.tokenizer = FullTokenizer(vocab_file=self.vocab_file,
                                       do_lower_case=self.do_lower_case)

        self.input_id = tf.placeholder(tf.int64, [None, None], 'input_ids')
        self.input_mask = tf.placeholder(tf.int64, [None, None], 'input_mask')
        self.segment_ids = tf.placeholder(tf.int64, [None, None], 'segment_ids')

        bert_config = BertConfig.from_json_file(self.config_file)
        model = BertModel(config=bert_config,
                          is_training=False,
                          input_ids=self.input_id,
                          input_mask=self.input_mask,
                          token_type_ids=self.segment_ids,
                          use_one_hot_embeddings=True,
                          scope='bert')
        self.output_layer = model.get_sequence_output()
        self.embedding_layer = model.get_embedding_output()

        saver = tf.train.Saver()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        saver.restore(self.session, bert_pretrained_dir + '/bert_model.ckpt')

    def _wordpieces(self, token):
        """Return the word pieces of ``token``, never an empty list.

        A token the tokenizer reduces to nothing (for example pure
        whitespace) is represented by ``[UNK]`` so every input token keeps at
        least one position and stays aligned with its attributes.
        """
        pieces = self.tokenizer.tokenize(token)
        return pieces if pieces else ['[UNK]']

    def tokenize(self, token_list, attributes_list):
        """Tokenize pre-split tokens and project their attributes onto pieces.

        Args:
            token_list: List of word-level tokens.
            attributes_list: List of attribute sequences, each aligned with
                ``token_list``.

        Returns:
            ``(last_layer, embedding, token_ids, output_list, masks)`` where
            ``token_ids`` excludes ``[CLS]``/``[SEP]``, ``output_list`` holds
            each attribute repeated once per word piece, and ``masks`` is 1
            on the first piece of every token and 0 elsewhere.

        Raises:
            AssertionError: If the number of returned vectors differs from
                the number of projected attribute entries.
        """
        num_attributes = len(attributes_list)
        output_list = [[] for _ in range(num_attributes)]
        wordpieces = ["[CLS]"]
        masks = []

        for token_id, token in enumerate(token_list):
            new_tokens = self._wordpieces(token)
            wordpieces.extend(new_tokens)
            for att_id in range(num_attributes):
                output_list[att_id].extend(
                    [attributes_list[att_id][token_id]] * len(new_tokens))
            # Only the first piece of a token is flagged.
            masks.extend([1] + [0] * (len(new_tokens) - 1))

        wordpieces.append("[SEP]")
        token_ids = self.tokenizer.convert_tokens_to_ids(wordpieces)
        last_layer, embedding = self.get_embeddings(token_ids)

        # The alignment check needs at least one attribute sequence.
        if output_list:
            if len(last_layer) != len(output_list[0]):
                print(token_list)
                print(token_ids)
                for list_i in output_list:
                    print(list_i)
            assert len(last_layer) == len(output_list[0])

        return last_layer, embedding, token_ids[1:-1], output_list, masks

    def get_embeddings(self, token_ids):
        """Run BERT on one id sequence.

        Returns:
            ``(outputs, embeddings)`` of the single batch item with the first
            and last positions (``[CLS]`` and ``[SEP]``) removed.
        """
        input_mask = [[1] * len(token_ids)]
        segment_ids = [[0] * len(token_ids)]
        input_id = [token_ids]

        outputs, emb = self.session.run(
            [self.output_layer, self.embedding_layer],
            feed_dict={
                self.input_mask: input_mask,
                self.segment_ids: segment_ids,
                self.input_id: input_id,
            })
        return outputs[0][1:-1], emb[0][1:-1]

    def tokenize_sentence(self, token_list):
        """Return the word-piece ids of ``token_list`` without the markers.

        Uses the same piece expansion as ``tokenize`` so both methods yield
        sequences of equal length for the same input.
        """
        wordpieces = ["[CLS]"]
        for token in token_list:
            wordpieces.extend(self._wordpieces(token))
        wordpieces.append("[SEP]")
        token_ids = self.tokenizer.convert_tokens_to_ids(wordpieces)
        return token_ids[1:-1]
def texts_to_X(texts: List[List[str]], max_sentence_length: int, data_name: str, path_to_bert: str) -> np.ndarray:
    """Return BERT embeddings for tokenized texts, cached in a pickle file.

    If ``data_name`` exists, the array is loaded from it and validated;
    otherwise the embeddings are computed with the BERT model found in
    ``path_to_bert`` and then pickled to ``data_name``.

    Args:
        texts: One list of words per text.
        max_sentence_length: Number of word slots kept per text.
        data_name: Path of the pickle cache file.
        path_to_bert: BERT model directory; its base name must contain
            ``_uncased_`` or ``_cased_`` to select the tokenization mode.

    Returns:
        Array of shape ``(len(texts), max_sentence_length, EMBEDDING_SIZE)``.
        Each filled slot is the mean of the BERT output vectors over the
        word's token span; unfilled slots stay zero.

    Raises:
        ValueError: If the cache holds something other than an array of the
            expected shape, if ``path_to_bert`` fails ``check_path_to_bert``,
            or if the casing mode cannot be detected.
    """
    if os.path.isfile(data_name):
        # NOTE(review): pickle.load can execute arbitrary code — only use
        # cache files from a trusted location.
        with open(data_name, 'rb') as fp:
            X = pickle.load(fp)
        if not isinstance(X, np.ndarray):
            raise ValueError('The file `{0}` does not contain a `{1}` object.'.format(
                data_name, type(np.array([1, 2]))))
        if X.shape != (len(texts), max_sentence_length, EMBEDDING_SIZE):
            raise ValueError(
                'The file `{0}` contains an inadmissible `{1}` object. Shapes are wrong. Expected {2}, got {3}.'.format(
                    data_name, type(np.array([1, 2])), (len(texts), max_sentence_length, EMBEDDING_SIZE), X.shape)
            )
    else:
        path_to_bert_ = os.path.normpath(path_to_bert)
        if not check_path_to_bert(path_to_bert_):
            raise ValueError('`path_to_bert` is wrong! There are no BERT files into the directory `{0}`.'.format(
                path_to_bert))
        # The casing mode is inferred from the model directory name.
        if os.path.basename(path_to_bert_).find('_uncased_') >= 0:
            do_lower_case = True
        else:
            if os.path.basename(path_to_bert_).find('_cased_') >= 0:
                do_lower_case = False
            else:
                do_lower_case = None
        if do_lower_case is None:
            raise ValueError('`{0}` is bad path to the BERT model, because a tokenization mode (lower case or no) '
                             'cannot be detected.'.format(path_to_bert))
        X = np.zeros((len(texts), max_sentence_length, EMBEDDING_SIZE), dtype=np.float32)
        batch_size = 4
        n_batches = int(math.ceil(len(texts) / float(batch_size)))
        max_seq_length_for_bert = 512
        with tf.Graph().as_default():
            # Placeholders have a fixed batch dimension.
            # NOTE(review): presumably texts_to_batch_for_bert pads the last,
            # possibly smaller, batch up to batch_size — confirm.
            input_ids_ = tf.placeholder(shape=(batch_size, max_seq_length_for_bert), dtype=tf.int32,
                                        name='input_ids')
            input_mask_ = tf.placeholder(shape=(batch_size, max_seq_length_for_bert), dtype=tf.int32,
                                         name='input_mask')
            segment_ids_ = tf.placeholder(shape=(batch_size, max_seq_length_for_bert), dtype=tf.int32,
                                          name='segment_ids')
            bert_config = BertConfig.from_json_file(os.path.join(path_to_bert, 'bert_config.json'))
            tokenizer = FullTokenizer(vocab_file=os.path.join(path_to_bert, 'vocab.txt'),
                                      do_lower_case=do_lower_case)
            bert_model = BertModel(config=bert_config, is_training=False, input_ids=input_ids_,
                                   input_mask=input_mask_, token_type_ids=segment_ids_,
                                   use_one_hot_embeddings=False)
            sequence_output = bert_model.sequence_output
            tvars = tf.trainable_variables()
            init_checkpoint = os.path.join(path_to_bert_, 'bert_model.ckpt')
            (assignment_map, initialized_variable_names) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                sess.run(tf.tables_initializer())
                tokenized_texts = []
                bert2tokens = []
                for cur_text in texts:
                    new_text = []
                    new_bert2tokens = []
                    start_pos = 0
                    for word_idx, cur_word in enumerate(cur_text):
                        bert_tokens = tokenizer.tokenize(cur_word)
                        new_text += bert_tokens
                        # Span of this word inside the final sequence; the +1
                        # accounts for the [CLS] token prepended below.
                        new_bert2tokens.append((start_pos + 1, start_pos + len(bert_tokens) + 1))
                        start_pos += len(bert_tokens)
                    if len(new_text) > (max_seq_length_for_bert - 2):
                        new_text = new_text[:(max_seq_length_for_bert - 2)]
                        # NOTE(review): this trims the per-word span list by
                        # count, not by token position, so kept spans may
                        # still point past the truncated sequence; an empty
                        # slice would make the mean below NaN — confirm.
                        new_bert2tokens = new_bert2tokens[:(max_seq_length_for_bert - 2)]
                    new_text = ['[CLS]'] + new_text + ['[SEP]']
                    tokenized_texts.append(tokenizer.convert_tokens_to_ids(new_text))
                    bert2tokens.append(tuple(new_bert2tokens))
                del tokenizer
                for batch_idx in range(n_batches):
                    start_pos = batch_idx * batch_size
                    end_pos = min(len(texts), (batch_idx + 1) * batch_size)
                    embeddings_of_texts_as_numpy = sess.run(
                        sequence_output,
                        feed_dict={
                            ph: x for ph, x in zip(
                                [input_ids_, input_mask_, segment_ids_],
                                texts_to_batch_for_bert(tokenized_texts[start_pos:end_pos], batch_size,
                                                        max_seq_length_for_bert)
                            )
                        }
                    )
                    for idx in range(end_pos - start_pos):
                        text_idx = start_pos + idx
                        for token_idx in range(min(len(texts[text_idx]), max_sentence_length)):
                            token_start, token_end = bert2tokens[text_idx][token_idx]
                            # A word's vector is the mean over its token span.
                            # NOTE(review): a word that yields no tokens has
                            # an empty span here — confirm this cannot occur.
                            X[text_idx][token_idx] = embeddings_of_texts_as_numpy[idx][token_start:token_end].mean(
                                axis=0)
                    del embeddings_of_texts_as_numpy
                for k in list(sess.graph.get_all_collection_keys()):
                    sess.graph.clear_collection(k)
        # Cache the freshly computed array for later calls.
        with open(data_name, mode='wb') as fp:
            pickle.dump(X, fp, protocol=2)
        tf.reset_default_graph()
    return X
class PredicateInfer(LoadModelBase):
    """Predicate classifier served either over gRPC or from a local export."""

    def __init__(self, vocab_file, export_dir=None, url=None,
                 model_name='models', signature_name=None,
                 do_lower_case=True):
        """Set up the tokenizer and the configured prediction backend.

        Args:
            vocab_file: Vocabulary file for the BERT tokenizer.
            export_dir: When truthy, a local model is loaded via
                ``load_pb_model``.
            url: When truthy, a gRPC connection is opened via
                ``load_grpc_connect``.
            model_name: Forwarded to the base class.
            signature_name: Forwarded to the base class.
            do_lower_case: Casing mode of the tokenizer.
        """
        super().__init__(export_dir, url, model_name, signature_name)

        # Load the BERT tokenizer.
        self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)

        # Remote backend (gRPC).
        if url:
            self.stub, self.request = self.load_grpc_connect()
        # Local backend (exported model).
        if export_dir:
            self.predict_fn = self.load_pb_model()

        self.id_map_predicate = self.id_to_label(model_config.PREDICATE_LABEL)

    def process(self, sentences, max_seq_length=64):
        """Convert sentences into serialized ``tf.train.Example`` strings.

        Raises:
            ValueError: If ``sentences`` is empty or not a list.
        """
        if not (sentences and isinstance(sentences, list)):
            raise ValueError(
                '`sentences` must be list object and not a empty list !')

        return [
            self.convert_single_feature(
                self.convert_single_example(sentence, max_seq_length))
            for sentence in sentences
        ]

    def convert_single_example(self, sentence, max_seq_length):
        """Convert a single sentence into padded input features.

        Args:
            sentence: str, the sentence to predict on.
            max_seq_length: int, maximum sequence length.

        Returns:
            An ``InputFeatures`` object whose three lists all have length
            ``max_seq_length``.
        """
        wordpieces = self.tokenizer.tokenize(sentence)
        if len(wordpieces) > max_seq_length - 2:
            wordpieces = wordpieces[0:(max_seq_length - 2)]

        tokens = ["[CLS]"] + list(wordpieces) + ["[SEP]"]
        segment_ids = [0] * len(tokens)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        zero_pad = [0] * (max_seq_length - len(input_ids))
        input_ids.extend(zero_pad)
        input_mask.extend(zero_pad)
        segment_ids.extend(zero_pad)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        return InputFeatures(input_ids=input_ids,
                             input_mask=input_mask,
                             segment_ids=segment_ids)

    def convert_single_feature(self, feature):
        """Serialize one feature object as a ``tf.train.Example`` string."""

        def _int64(values):
            return tf.train.Feature(
                int64_list=tf.train.Int64List(value=values))

        features = dict()
        features['input_ids'] = _int64(feature.input_ids)
        features['input_mask'] = _int64(feature.input_mask)
        features['segment_ids'] = _int64(feature.segment_ids)

        example = tf.train.Example(
            features=tf.train.Features(feature=features))
        return example.SerializeToString()

    def infer(self, sentences, max_seq_length, top_n=3):
        """Predict the ``top_n`` labels for each sentence.

        Args:
            sentences: list, a batch of sentences to predict on.
            max_seq_length: int, maximum input length.
            top_n: int, how many top categories to return.

        Returns:
            A list with one ``[(label, score), ...]`` list per sentence,
            ordered by descending score.
        """
        examples = self.process(sentences, max_seq_length)

        if not self.url:
            started = time.time()
            predictions = self.local_infer(examples)
            print('predicate:', time.time() - started)
        else:
            predictions = self.tf_serving_infer(examples)

        result = []
        for p in predictions['predictions']:
            top_n_idx = p.argsort()[::-1][0:top_n]
            result.append(
                [(self.id_map_predicate[x], p[x]) for x in top_n_idx])
        return result

    def tf_serving_infer(self, examples):
        """Send the examples to the serving stub and return arrays by key."""
        self.request.inputs['examples'].CopyFrom(
            tf.make_tensor_proto(examples, dtype=types_pb2.DT_STRING))
        response = self.stub.Predict(self.request, 5.0)

        return {
            key: tf.contrib.util.make_ndarray(response.outputs[key])
            for key in response.outputs
        }

    def local_infer(self, examples):
        """Run prediction locally; ``examples`` is as in ``tf_serving_infer``."""
        return self.predict_fn({'examples': examples})

    def id_to_label(self, labels):
        """Return a dict mapping each label's index to the label."""
        return dict(enumerate(labels))
class BERTVectorizer:
    """Convert whitespace-separated sentences into padded BERT input arrays.

    The WordPiece tokenizer is built from the vocabulary stored in a TF-Hub
    BERT module. Besides the usual ids / mask / segment arrays, a
    ``valid_positions`` array marks the first word-piece of every word.
    """

    def __init__(self, sess, bert_model_hub_path):
        """Remember the session and hub path, then build the tokenizer.

        :param sess: object whose ``run`` evaluates the module's
            tokenization-info tensors (presumably a TF1 session).
        :param bert_model_hub_path: value handed to ``hub.Module``.
        """
        self.sess = sess
        self.bert_model_hub_path = bert_model_hub_path
        self.create_tokenizer_from_hub_module()

    def create_tokenizer_from_hub_module(self):
        """Read vocab file and casing flag from the hub module and create
        ``self.tokenizer`` from them."""
        bert_module = hub.Module(self.bert_model_hub_path)
        tokenization_info = bert_module(signature="tokenization_info",
                                        as_dict=True)
        vocab_file, do_lower_case = self.sess.run([
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ])
        self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)

    def tokenize(self, text: str):
        """Split ``text`` on whitespace and word-piece tokenize every word.

        Example (from the original author's notes)::

            text:   add leah kauffman to my uncharted 4 nathan drake playlist
            tokens: ['add', 'leah', 'ka', '##uf', '##fm', '##an', 'to', 'my',
                     'un', '##cha', '##rted', '4', 'nathan', 'drake', 'play',
                     '##list']
            valid:  [1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0]

        :param text: sentence whose words are separated by whitespace.
        :return: ``(tokens, valid_positions)``; ``valid_positions`` holds 1
            for the first piece of each word and 0 for continuation pieces.
        """
        tokens = []
        valid_positions = []
        for word in text.split():
            pieces = self.tokenizer.tokenize(word)
            tokens.extend(pieces)
            # A word that yields no pieces contributes nothing to either list.
            if pieces:
                valid_positions.append(1)
                valid_positions.extend([0] * (len(pieces) - 1))
        return tokens, valid_positions

    def transform(self, text_arr, max_len=50):
        """Vectorize a batch of sentences and pad every array to ``max_len``.

        :param text_arr: iterable of sentences.
        :param max_len: width of the returned arrays; longer sequences are
            cut at the end, shorter ones are zero-padded at the end.
            Defaults to 50, the value that used to be hard-coded.
        :return: ``(input_ids, input_mask, segment_ids, valid_positions,
            sequence_length)``.
        """
        input_ids = []
        input_mask = []
        segment_ids = []
        valid_positions = []
        for text in text_arr:
            ids, mask, seg_ids, valid_pos = self.__vectorize(text)
            input_ids.append(ids)
            input_mask.append(mask)
            segment_ids.append(seg_ids)
            valid_positions.append(valid_pos)

        # NOTE(review): lengths are measured before truncation, so an entry
        # can exceed ``max_len`` -- confirm that consumers expect this.
        sequence_length = np.array([len(i) for i in input_ids])

        def pad(rows):
            return tf.keras.preprocessing.sequence.pad_sequences(
                rows, maxlen=max_len, truncating='post', padding='post')

        return (pad(input_ids), pad(input_mask), pad(segment_ids),
                pad(valid_positions), sequence_length)

    def __vectorize(self, text: str):
        """Build the unpadded BERT inputs for one sentence.

        :return: ``(input_ids, input_mask, segment_ids, valid_positions)``,
            all of the same length and including [CLS] and [SEP].
        """
        tokens, valid_positions = self.tokenize(text)
        # Wrap the pieces in [CLS] ... [SEP]; both markers count as valid.
        tokens.insert(0, '[CLS]')
        valid_positions.insert(0, 1)
        tokens.append('[SEP]')
        valid_positions.append(1)

        # BERT convention for type (segment) ids:
        # (a) sequence pairs:
        #     tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #     type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) single sequences:
        #     tokens:   [CLS] the dog is hairy . [SEP]
        #     type_ids: 0     0   0   0  0     0 0
        # Only single sequences are produced here, so every id is 0.
        segment_ids = [0] * len(tokens)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        # The mask has 1 for real tokens; padding (added later) gets 0.
        input_mask = [1] * len(input_ids)
        return input_ids, input_mask, segment_ids, valid_positions
class BertPreprocessor(Preprocessor):
    """Preprocessor for BERT embedding.

    This class can be used to do all the work to create the inputs (and
    outputs) of a Neural Network using BERT as embedding. Currently only
    single sequence classification is supported.

    NOTE(review): ``self._max_seq_len`` is read throughout but never set
    here -- presumably the ``Preprocessor`` base class sets it from
    ``kwargs``; confirm.
    """

    def __init__(self, pretrained_model_path: str, **kwargs):
        """Build the tokenizer from the hub module's vocabulary.

        :param pretrained_model_path: spec handed to ``hub.Module``.
        :param kwargs: forwarded unchanged to the base class.
        """
        super().__init__(**kwargs)

        info = hub.Module(spec=pretrained_model_path)(
            signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [info["vocab_file"], info["do_lower_case"]])

        # Create the tokenizer with the vocabulary of the pretrained model
        self._tokenizer = FullTokenizer(vocab_file=vocab_file,
                                        do_lower_case=do_lower_case)

        basic_tokens = self._tokenizer.convert_tokens_to_ids(
            ["[CLS]", "[SEP]"])
        self._CLS_token = basic_tokens[0]
        self._SEP_token = basic_tokens[1]

    def _padding_sentence(self):
        """Return a zero length sentence to pad last batch.

        :return: Three sequences of zeros (tokens, masks, segment ids).
        """
        width = self._max_seq_len
        return [0] * width, [0] * width, [0] * width

    def tokenize(self, text: str):
        """Convert a text into token ids plus masking- and segment ids.

        The text is cut so that ``[CLS] ... [SEP]`` fits into
        ``self._max_seq_len`` positions; the remainder is zero padding.

        :param text: The sequence of words.
        :return: ``(input_ids, input_mask, input_segment_ids)``, three lists
            of length ``self._max_seq_len``. Segment ids are all 0.
        """
        tokens_input = self._tokenizer.tokenize(text)
        # If too long cut to size (first token is [CLS], last is [SEP]).
        if len(tokens_input) > self._max_seq_len - 2:
            tokens_input = tokens_input[0:(self._max_seq_len - 2)]

        sequence = [self._CLS_token]
        sequence.extend(self._tokenizer.convert_tokens_to_ids(tokens_input))
        sequence.append(self._SEP_token)

        pad_len = self._max_seq_len - len(sequence)
        input_ids = sequence + [0] * pad_len
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(sequence) + [0] * pad_len
        input_segment_ids = [0] * self._max_seq_len

        # safety check
        assert len(input_ids) == self._max_seq_len
        assert len(input_mask) == self._max_seq_len
        assert len(input_segment_ids) == self._max_seq_len

        return input_ids, input_mask, input_segment_ids

    def fit(self, texts: List[str]) -> 'BertPreprocessor':
        """This function does nothing in case of BERT but must be implemented.

        :param texts: -
        :return: self
        """
        return self

    def transform(self, texts: List[str]) -> list:
        """Tokenize all texts in a pool of 8 worker processes.

        Masks separate valid from padding tokens. Segment ids are always
        zero since the whole sequence belongs together.

        :param texts: The sequences of texts.
        :return: ``[input_ids, input_masks, segment_ids]``; each entry is a
            tuple with one list per text (three empty tuples for no texts).
        """
        # zip(*[]) cannot be unpacked into three names, and there is no
        # point in starting worker processes for nothing.
        if len(texts) == 0:
            return [(), (), ()]

        # The context manager shuts the workers down; the original created
        # a Pool per call and never closed it.
        with Pool(processes=8) as pool:
            rows = pool.map(self.tokenize, texts)

        input_ids, input_masks, segment_ids = zip(*rows)
        return [input_ids, input_masks, segment_ids]

    def inverse_transform(self, sequences: np.ndarray):
        """Transform sequences of tokens back to sequences of words.

        :param sequences: The sequences of tokens.
        :return: The sequences of words
        """
        return self._tokenizer.convert_ids_to_tokens(sequences)
class EntityInfer(LoadModelBase):
    """Run the sequence-labelling model and assemble relation triplets.

    Each sentence is paired with a predicate label, serialized as a
    ``tf.train.Example`` and sent either to TF Serving (when ``url`` is
    given) or to a locally loaded model (when ``export_dir`` is given).

    NOTE(review): ``self.url``, ``load_grpc_connect`` and ``load_pb_model``
    are not defined here -- presumably provided by ``LoadModelBase``.
    """

    def __init__(self,
                 vocab_file,
                 export_dir=None,
                 url=None,
                 model_name='models',
                 signature_name=None,
                 do_lower_case=True):
        """Create the tokenizer, the model handles and the label maps.

        :param vocab_file: vocabulary file for ``FullTokenizer``.
        :param export_dir: if truthy, a local model is loaded.
        :param url: if truthy, a gRPC stub and request are created.
        :param model_name: forwarded to the base class.
        :param signature_name: forwarded to the base class.
        :param do_lower_case: forwarded to ``FullTokenizer``.
        """
        super(EntityInfer, self).__init__(export_dir, url, model_name,
                                          signature_name)
        self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)
        # via gRPC
        if url:
            self.stub, self.request = self.load_grpc_connect()
        if export_dir:
            self.predict_fn = self.load_pb_model()

        self.id_map_predicate = self.id_to_label(model_config.PREDICATE_LABEL)
        self.predicate_map_id = self.label_to_id(model_config.PREDICATE_LABEL)
        self.id_map_sequence = self.id_to_label(model_config.SEQ_LABEL)

    def id_to_label(self, labels):
        """Return ``{position: label}`` for ``labels``."""
        return dict(enumerate(labels))

    def label_to_id(self, labels):
        """Return ``{label: position}`` for ``labels``."""
        return {label: i for i, label in enumerate(labels)}

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""
        # This is a simple heuristic which will always truncate the longer
        # sequence one token at a time. This makes more sense than truncating
        # an equal percent of tokens from each, since if one sequence is very
        # short then each token that's truncated likely contains more
        # information than a longer sequence.
        while len(tokens_a) + len(tokens_b) > max_length:
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def process(self, sentences, predicate_labels, max_seq_length=64):
        """Serialize every (sentence, predicate label) pair.

        :param sentences: non-empty list or tuple of sentences.
        :param predicate_labels: one predicate label per sentence.
        :param max_seq_length: fixed feature length.
        :return: list of serialized ``tf.train.Example`` strings.
        :raises ValueError: if ``sentences`` is empty or not a list/tuple.
        """
        if not sentences or not isinstance(sentences, (list, tuple)):
            raise ValueError(
                '`sentences` must be list object and not a empty list !')
        examples = []
        for sentence, predicate_label in zip(sentences, predicate_labels):
            feature = self.convert_single_example(sentence, predicate_label,
                                                  max_seq_length)
            examples.append(self.convert_single_feature(feature))
        return examples

    def convert_single_example(self, sentence, predicate_label,
                               max_seq_length):
        """Build the ``InputFeatures`` for one sentence/predicate pair.

        Layout: ``[CLS] sentence pieces [SEP]`` (segment 0) followed by the
        predicate id repeated once per sentence piece and a closing
        ``[SEP]`` (segment 1), zero-padded to ``max_seq_length``.

        :param sentence: iterable whose items are tokenized one by one.
        :param predicate_label: key of ``self.predicate_map_id``.
        :param max_seq_length: fixed feature length.
        :return: ``InputFeatures`` with ids, mask and segment ids.
        """
        tokens = []
        for token in sentence:
            tokens.extend(self.tokenizer.tokenize(token))
        tokens_b = [predicate_label] * len(tokens)
        predicate_label_id = self.predicate_map_id[predicate_label]
        # Truncate both lists so that their combined length fits into
        # max_seq_length - 3 (room for [CLS], [SEP], [SEP]).
        self._truncate_seq_pair(tokens, tokens_b, max_seq_length - 3)

        tokens_a = ["[CLS]"] + tokens + ["[SEP]"]
        segment_ids = [0] * len(tokens_a)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens_a)

        # The original author notes that vocabulary entries 1-100 are not
        # used; the predicate id is shifted by `bias` so it differs from
        # the word ids.
        bias = 1
        input_ids.extend([predicate_label_id + bias] * len(tokens_b))
        segment_ids.extend([1] * len(tokens_b))
        # Closing [SEP] (the original comment gives its vocabulary id as 102).
        input_ids.append(self.tokenizer.convert_tokens_to_ids(["[SEP]"])[0])
        segment_ids.append(1)

        input_mask = [1] * len(input_ids)
        pad_len = max_seq_length - len(input_ids)
        input_ids.extend([0] * pad_len)
        input_mask.extend([0] * pad_len)
        segment_ids.extend([0] * pad_len)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        return InputFeatures(input_ids=input_ids,
                             input_mask=input_mask,
                             segment_ids=segment_ids)

    def convert_single_feature(self, feature):
        """Serialize an ``InputFeatures`` object as a ``tf.train.Example``.

        :return: the serialized example string.
        """
        features = dict()
        features['input_ids'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=feature.input_ids))
        features['input_mask'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=feature.input_mask))
        features['segment_ids'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=feature.segment_ids))
        example = tf.train.Example(features=tf.train.Features(
            feature=features))
        return example.SerializeToString()

    def infer(self,
              sentences,
              predicate_labels,
              max_seq_length,
              predicate_probabilities=None):
        """Predict triplets for a batch of sentences.

        :param sentences: list of sentences, ['xxxx', 'xxxx', ...].
        :param predicate_labels: list of labels, one per sentence.
        :param max_seq_length: int, fixed feature length.
        :param predicate_probabilities: optional list such as
            [0.92, 0.01, ...] from the relation classifier. When given, the
            more confident of (classifier label, this model's label) is
            used; otherwise this model's own predicate prediction is used.
        :return: list with one entry per sentence that produced triplets
            (sentences without triplets are skipped)::

                [[{'predicate': p, 'subject': s, 'object': o}, ...], ...]

        :raises ValueError: if a predicted label row contains no '[SEP]'.
        """
        examples = self.process(sentences, predicate_labels, max_seq_length)
        if self.url:
            predictions = self.tf_serving_infer(examples)
        else:
            s = time.time()
            predictions = self.local_infer(examples)
            print('sequence:', time.time() - s)

        token_label_predictions = predictions['token_label_predictions']
        predicate_predictions = predictions['predicate_predictions']
        predicate_labels_index = np.argmax(predicate_predictions, -1)

        result = []
        for i in range(len(sentences)):
            token_label = [
                self.id_map_sequence[x] for x in token_label_predictions[i]
            ]
            # Drop the leading [CLS] position and everything from [SEP] on.
            entities = self.entity_extract(
                sentences[i], token_label[1:token_label.index('[SEP]')])

            predicate_label_index = predicate_labels_index[i]
            model_choice = (
                self.id_map_predicate[predicate_label_index],
                predicate_predictions[i][predicate_label_index])
            # Compare the relation classifier's output with this model's.
            if predicate_probabilities:
                predicate_label = max(
                    [(predicate_labels[i], predicate_probabilities[i]),
                     model_choice],
                    key=lambda x: x[1])
            else:
                # Previously a bare probability was stored here and then
                # indexed with [0], which failed on every call that did not
                # pass predicate_probabilities.
                predicate_label = model_choice

            triplets = self.organize_triplet(entities, predicate_label[0])
            if triplets:
                result.append(triplets)
        return result

    def organize_triplet(self, entities, predicate):
        """Turn entities into triplet dicts (one subject, many objects).

        :param entities: list such as [('xx', 'SUB'), ('yy', 'OBJ')].
        :param predicate: str, the relation.
        :return: one dict per 'OBJ' entity, each carrying the first 'SUB'
            entity as subject (``None`` if there is none).
        """
        subj = None
        for entity, tag in entities:
            if tag == 'SUB':
                subj = entity
                break
        return [{
            'predicate': predicate,
            'subject': subj,
            'object': entity
        } for entity, tag in entities if tag == 'OBJ']

    def entity_extract(self, sentence, tags):
        """Extract entities from ``sentence`` according to ``tags``.

        :param sentence: str, the sentence (one tag per character).
        :param tags: list of sequence tags, e.g. ['O', 'B-SUB', 'I-SUB', ...].
        :return: list such as [('xx', 'SUB'), ('yy', 'OBJ')].
        """
        entities = []
        sentence_len = len(sentence)
        if sentence_len != len(tags):
            warnings.warn(
                'Token and tags have different lengths.\ndetails:\n{}\n{}'.
                format(sentence, tags))

        entity = Entity(None)
        for i, (token, tag) in enumerate(zip(sentence, tags)):
            if tag == 'O':
                if entity.types:
                    entities.append(entity.get_entity_types())
                entity = Entity(None)
                continue
            elif tag[0] == 'B':
                if entity.types:
                    entities.append(entity.get_entity_types())
                entity = Entity(tag[2:])
                entity.begin = token
            elif tag[0] == 'I':
                if i == sentence_len - 1:
                    entity.intermediate = token
                    entities.append(entity.get_entity_types())
                    break
                try:
                    entity.intermediate = token
                except Exception as e:
                    print(e)
        else:
            # The loop ended without the explicit last-token flush above,
            # e.g. the final tag is 'B-...' or `tags` is shorter than the
            # sentence. Emit the entity that is still open instead of
            # silently dropping it. (Uses the same "entity.types is truthy"
            # test the loop uses for an open entity.)
            if entity.types:
                entities.append(entity.get_entity_types())
        return entities

    def tf_serving_infer(self, examples):
        """Send the serialized examples to TF Serving (5 second timeout).

        :return: dict mapping every output key to a numpy array.
        """
        self.request.inputs['examples'].CopyFrom(
            tf.make_tensor_proto(examples, dtype=types_pb2.DT_STRING))
        response = self.stub.Predict(self.request, 5.0)
        predictions = {}
        for key in response.outputs:
            tensor_proto = response.outputs[key]
            predictions[key] = tf.contrib.util.make_ndarray(tensor_proto)
        return predictions

    def local_infer(self, examples):
        """Predict with the locally loaded model; same input as above."""
        predictions = self.predict_fn({'examples': examples})
        return predictions
class TrainDataReader():
    """Prepare and read training data stored under a category directory.

    ``transform`` converts ``train_data/raw.csv`` (tab-separated, 10
    columns) into ``attr_values.pkl`` and ``train_data/transform.csv``;
    ``read`` builds a TF input pipeline over the transformed file.
    """

    def __init__(self, config, category_dir, vocab_file):
        """Validate the input files, then create the tokenizer.

        :param config: object providing ``max_seq_length`` and
            ``batch_size``.
        :param category_dir: directory containing ``train_data/raw.csv``.
        :param vocab_file: vocabulary file for ``FullTokenizer``.
        :raises Exception: if the raw data or the vocab file is missing.
        """
        self.config = config
        self.category_dir = category_dir
        # Check the files before using them: the tokenizer used to be
        # built first, so a missing vocab file never reached the intended
        # error below.
        if not os.path.exists(
                os.path.join(self.category_dir, 'train_data', 'raw.csv')):
            raise Exception("local raw train data not exists!!")
        if not os.path.exists(vocab_file):
            raise Exception("local vocab_file not exists")
        self.tokenizer = FullTokenizer(vocab_file)

    @staticmethod
    def _iter_segments(handle):
        """Yield the 10 tab-separated fields of every well-formed row."""
        for row in handle:
            segment = row.strip().split('\t')
            # Blank lines split into a single field, so they are skipped
            # by the same test as malformed rows.
            if len(segment) == 10:
                yield segment

    def transform(self):
        """Write ``attr_values.pkl`` and ``train_data/transform.csv``.

        Columns 8 and 9 of a row form its (attribute, value) class; classes
        are numbered in order of first appearance. Column 7 is tokenized,
        wrapped in [CLS]/[SEP], cut and zero-padded to
        ``config.max_seq_length``. Each output line is
        ``label,length,id_0,...,id_{max_seq_length-1}``.

        :return: number of distinct (attribute, value) classes.
        """
        raw_path = os.path.join(self.category_dir, 'train_data', 'raw.csv')
        max_len = self.config.max_seq_length

        with open(raw_path) as fr, \
                open(os.path.join(self.category_dir, 'attr_values.pkl'),
                     'wb') as fwa:
            attr_values = {}
            for segment in self._iter_segments(fr):
                attr_values.setdefault((segment[8], segment[9]),
                                       len(attr_values))
            attr_values_r = {i: k for k, i in attr_values.items()}
            print('start to write local attr_values.pkl!!')
            pickle.dump((attr_values, attr_values_r), fwa)

        with open(raw_path) as fr, \
                open(os.path.join(self.category_dir, 'train_data',
                                  'transform.csv'), 'w') as fwt:
            print('start to write local train_data transform.csv!!')
            for segment in self._iter_segments(fr):
                label = attr_values[(segment[8], segment[9])]
                tokens = self.tokenizer.tokenize(segment[7])
                # Leave room for [CLS] and [SEP].
                if len(tokens) > max_len - 2:
                    tokens = tokens[0:max_len - 2]
                tokens = ['[CLS]'] + tokens + ['[SEP]']
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                token_ids_patch = token_ids[0:max_len] + [0] * (
                    max_len - len(token_ids))
                length = min(len(token_ids), len(token_ids_patch))
                fwt.write(
                    str(label) + ',' + str(length) + ',' +
                    ','.join(str(x) for x in token_ids_patch) + '\n')
        return len(attr_values)

    def read(self):
        """Build a shuffled batch pipeline over ``transform.csv``.

        :return: result of ``tf.train.shuffle_batch`` over
            ``[label, sequence, mask]``; ``mask`` is derived from the
            stored sequence length.
        """
        transform = os.path.join(self.category_dir, 'train_data',
                                 'transform.csv')
        queue = tf.train.string_input_producer([transform])
        reader = tf.TextLineReader()
        _, value = reader.read(queue)
        # One int column for label, one for length, then the token ids.
        row = tf.decode_csv(value,
                            [[0]] * (self.config.max_seq_length + 2))
        label = tf.stack(row[0])
        length = tf.stack(row[1])
        mask = tf.cast(
            tf.sequence_mask(length, self.config.max_seq_length), tf.int32)
        sequence = tf.stack(row[2:self.config.max_seq_length + 2])
        return tf.train.shuffle_batch([label, sequence, mask],
                                      self.config.batch_size, 50000, 10000)
from bert.tokenization import FullTokenizer

# Exported SavedModel directory and the vocabulary used for tokenizing.
export_dir = '/home/CAIL/bert_text/examples/SequenceLabel/saved_models'
vocab_file = "/home/CAIL/bert_text/examples/SequenceLabel/vocab.txt"

# Sample input sentence.
text = "这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"

tokenizer = FullTokenizer(vocab_file=vocab_file)
tokens = tokenizer.tokenize(text=text)
max_seq_length = 256

# Clip to max_seq_length so ids and masks can never be longer than
# segment_ids, which is always built with exactly max_seq_length entries.
input_ids = tokenizer.convert_tokens_to_ids(tokens)[:max_seq_length]
input_masks = [1] * len(input_ids)
# Zero-pad ids and masks up to the fixed length.
if len(input_ids) < max_seq_length:
    input_ids += [0] * (max_seq_length - len(input_ids))
    input_masks += [0] * (max_seq_length - len(input_masks))
segment_ids = [0] * max_seq_length

with tf.Session() as sess:
    meta_graph_def = tf.saved_model.loader.load(sess,
                                                [tag_constants.SERVING],
                                                export_dir)
    signature = meta_graph_def.signature_def
    # Names of the three input tensors of the 'predict' signature.
    x1_tensor_name = signature['predict'].inputs['input_ids'].name
    x2_tensor_name = signature['predict'].inputs['input_masks'].name
    x3_tensor_name = signature['predict'].inputs['segment_ids'].name
class BertNer(object):
    """Named-entity recognition with a frozen BERT graph.

    The model directory must contain ``vocab.txt``, ``ner_model.pb`` and
    ``label2id.pkl``. Input texts are split into chunks that fit the fixed
    model width (``ner_sq_len``), predicted, and merged back.
    """

    def __init__(self, **kwargs):
        """Load tokenizer, graph and label map.

        Required keyword arguments: ``gpu_no``, ``verbose``, ``log_dir``
        and ``ner_model`` (the model directory).
        """
        self.tf = import_tf(kwargs['gpu_no'], kwargs['verbose'])
        self.logger = set_logger('BertNer', kwargs['log_dir'],
                                 kwargs['verbose'])
        self.model_dir = kwargs['ner_model']

        from bert.tokenization import FullTokenizer
        self.tokenizer = FullTokenizer(
            os.path.join(self.model_dir, 'vocab.txt'))

        self.ner_sq_len = 128
        self.input_ids = self.tf.placeholder(self.tf.int32,
                                             (None, self.ner_sq_len),
                                             'input_ids')
        self.input_mask = self.tf.placeholder(self.tf.int32,
                                              (None, self.ner_sq_len),
                                              'input_mask')
        # init graph
        self._init_graph()
        # init ner assist data
        self._init_predict_var()

        # Single characters that are reported as 'PER' when the model tags
        # them 'O' (see _combine_ner).
        self.per_proun = [
            '甲', '乙', '丙', '丁', '戊', '己', '庚', '辛', '壬', '癸', '子',
            '丑', '寅', '卯', '辰', '巳', '午', '未', '申', '酉', '戌', '亥'
        ]

    def _init_graph(self):
        """Import ``ner_model.pb`` and open a session on its graph.

        Any failure is logged, not raised; ``self.sess`` / ``self.pred_ids``
        may then be missing.
        """
        try:
            with self.tf.gfile.GFile(
                    os.path.join(self.model_dir, 'ner_model.pb'),
                    'rb') as f:
                graph_def = self.tf.GraphDef()
                graph_def.ParseFromString(f.read())
                input_map = {
                    "input_ids:0": self.input_ids,
                    'input_mask:0': self.input_mask
                }
                self.pred_ids = self.tf.import_graph_def(
                    graph_def,
                    name='',
                    input_map=input_map,
                    return_elements=['pred_ids:0'])[0]
                graph = self.pred_ids.graph
                sess_config = self.tf.ConfigProto(allow_soft_placement=True)
                sess_config.gpu_options.allow_growth = True
                self.sess = self.tf.Session(graph=graph, config=sess_config)
                self.sess.run(self.tf.global_variables_initializer())
                self.tf.reset_default_graph()
        except Exception as e:
            self.logger.error(e)

    def _init_predict_var(self):
        """Load ``label2id.pkl`` and store its inverse as ``self.id2label``."""
        # NOTE: pickle executes code on load; the model directory must be
        # trusted.
        with open(os.path.join(self.model_dir, 'label2id.pkl'), 'rb') as rf:
            self.id2label = {
                value: key
                for key, value in pickle.load(rf).items()
            }

    def _convert_lst_to_features(self,
                                 lst_str,
                                 is_tokenized=True,
                                 mask_cls_sep=False):
        """Yield one ``InputFeatures`` per input string.

        :param lst_str: list of strings.
        :param is_tokenized: if true, examples are read with
            ``read_tokenized_examples`` and unknown tokens are only marked;
            otherwise the strings are tokenized.
        :param mask_cls_sep: mask [CLS] and [SEP] positions with zero.
        :return: generator of input feature instances.
        """
        from bert.extract_features import read_tokenized_examples, read_examples, InputFeatures

        examples = read_tokenized_examples(
            lst_str) if is_tokenized else read_examples(lst_str)

        def _tokenize(x):
            if is_tokenized:
                return self.tokenizer.mark_unk_tokens(x)
            return self.tokenizer.tokenize(x)

        special_mask = int(not mask_cls_sep)
        for (ex_index, example) in enumerate(examples):
            tokens_a = _tokenize(example.text_a)

            tokens_b = None
            if example.text_b:
                tokens_b = _tokenize(example.text_b)

            if tokens_b:
                # Modifies `tokens_a` and `tokens_b` in place so that the
                # total length fits; accounts for [CLS], [SEP], [SEP].
                self._truncate_seq_pair(tokens_a, tokens_b)
            else:
                # Account for [CLS] and [SEP] with "- 2"
                if len(tokens_a) > self.ner_sq_len - 2:
                    tokens_a = tokens_a[0:(self.ner_sq_len - 2)]

            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids: 0     0   0   0  0     0 0
            tokens = ['[CLS]'] + tokens_a + ['[SEP]']
            input_type_ids = [0] * len(tokens)
            input_mask = [special_mask] + [1] * len(tokens_a) + [special_mask]

            if tokens_b:
                tokens += tokens_b + ['[SEP]']
                input_type_ids += [1] * (len(tokens_b) + 1)
                input_mask += [1] * len(tokens_b) + [special_mask]

            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

            # Zero-pad up to the sequence length.
            pad_len = self.ner_sq_len - len(input_ids)
            input_ids += [0] * pad_len
            input_mask += [0] * pad_len
            input_type_ids += [0] * pad_len

            assert len(input_ids) == self.ner_sq_len
            assert len(input_mask) == self.ner_sq_len
            assert len(input_type_ids) == self.ner_sq_len

            yield InputFeatures(input_ids=input_ids,
                                input_mask=input_mask,
                                input_type_ids=input_type_ids)

    def _truncate_seq_pair(self, tokens_a, tokens_b):
        """Truncate a sequence pair in place to ``ner_sq_len - 3`` tokens.

        The longer list loses one token at a time.

        :param tokens_a: text a
        :param tokens_b: text b
        """
        max_tokens = self.ner_sq_len - 3
        try:
            while len(tokens_a) + len(tokens_b) > max_tokens:
                if len(tokens_a) > len(tokens_b):
                    tokens_a.pop()
                else:
                    tokens_b.pop()
        except IndexError as e:
            # Both lists are empty but the limit is still exceeded (only
            # possible for a negative limit). The old handler called
            # logger.error() without a message, which raised TypeError.
            self.logger.error(e)

    def _convert_id_to_label(self, pred_ids_result, batch_size):
        """Turn predicted ids into labels.

        A row is read until the first id 0; ids whose label is '[CLS]' or
        '[SEP]' are skipped.

        :param pred_ids_result: predict result, one id sequence per row
        :param batch_size: number of rows to convert
        :return: ``(labels, ids)``, each a list with one list per row
        """
        result = []
        index_result = []
        for row in range(batch_size):
            curr_seq = []
            curr_idx = []
            ids = pred_ids_result[row]
            for idx, label_id in enumerate(ids):
                if label_id == 0:
                    break
                curr_label = self.id2label[label_id]
                if curr_label in ['[CLS]', '[SEP]']:
                    # Stop at a marker with id 102 that is directly followed
                    # by padding. The bounds test used to be
                    # `idx < len(ids)`, which is always true, so a marker in
                    # the last position raised IndexError.
                    # NOTE(review): 102 is compared against a *label* id --
                    # confirm it matches label2id.
                    if (label_id == 102 and idx + 1 < len(ids)
                            and ids[idx + 1] == 0):
                        break
                    continue
                curr_seq.append(curr_label)
                curr_idx.append(label_id)
            result.append(curr_seq)
            index_result.append(curr_idx)
        return result, index_result

    def _split_content(self, content):
        """Cut ``content`` into pieces of at most ``ner_sq_len - 2`` items.

        Always returns at least one piece, so an empty content still maps
        to one (empty) model input.
        """
        chunk = self.ner_sq_len - 2
        # Ceiling division. The previous test
        # `content_len % self.ner_sq_len - 2 == 0` parsed as
        # `(content_len % ner_sq_len) - 2`, which lost the tail of e.g. a
        # 130-character text and added an empty chunk for exact multiples.
        terms = max(1, (len(content) + chunk - 1) // chunk)
        return [content[i * chunk:(i + 1) * chunk] for i in range(terms)]

    def predict(self, contents):
        """Run NER on a list of texts.

        :param contents: list of texts
        :return: one list of ``[entity, type]`` pairs per text, or ``[[]]``
            if anything fails (the error is logged).
        """
        try:
            splited_contents = []
            all_terms = []
            for content in contents:
                chunks = self._split_content(content)
                all_terms.append(len(chunks))
                splited_contents.extend(chunks)

            tmp_f = list(self._convert_lst_to_features(splited_contents))
            input_ids = [f.input_ids for f in tmp_f]
            input_masks = [f.input_mask for f in tmp_f]
            pred_result = self.sess.run(self.pred_ids,
                                        feed_dict={
                                            self.input_ids: input_ids,
                                            self.input_mask: input_masks
                                        })

            # restore to original string: concatenate the rows that belong
            # to the same content
            tmp = []
            index = 0
            for terms in all_terms:
                sub_preds = []
                for i in range(terms):
                    sub_preds.extend(pred_result[index + i])
                tmp.append(sub_preds)
                index += terms

            pred_result = self._convert_id_to_label(tmp, len(tmp))[0]

            # pair every character with its predicted label
            str_pred = [[[char, label]
                         for char, label in zip(list(content), labels)]
                        for content, labels in zip(contents, pred_result)]

            # get ner
            return [self._combine_ner(s) for s in str_pred]
        except Exception as e:
            self.logger.error(e)
            return [[]]

    def _combine_ner(self, pred_result):
        """Merge per-character B-/I-/O labels into entities.

        :param pred_result: list of ``[character, label]`` pairs
        :return: list of ``[entity text, entity type]`` pairs
        """
        words_len = len(pred_result)
        tmp = ''
        _ner_list = []
        for i, word in enumerate(pred_result):
            kind = word[1][0]
            # add personal pronoun
            if word[0] in self.per_proun and kind == 'O':
                _ner_list.append([word[0], 'PER'])

            # `tmp` is compared by truthiness; the old `tmp is not ''` was
            # an identity test against a literal.
            if kind == 'O' and tmp:
                _ner_list.append([tmp, pred_result[i - 1][1][2:]])
                tmp = ''
            elif kind == 'I':
                tmp = tmp + word[0]
                if i == words_len - 1:
                    _ner_list.append([tmp, word[1][2:]])
            elif kind == 'B':
                if tmp:
                    _ner_list.append([tmp, pred_result[i - 1][1][2:]])
                tmp = word[0]
                if i == words_len - 1:
                    _ner_list.append([tmp, word[1][2:]])
        return _ner_list
class DisasterDetector:
    """Binary text classifier built on top of a BERT Keras layer."""

    def __init__(self, bert_layer, max_sql, lr, batch_size, epochs):
        """Keep the hyper-parameters and build the tokenizer.

        :param bert_layer: BERT layer; its resolved object supplies the
            vocab file and the lower-casing flag.
        :param max_sql: fixed sequence length of the encoded inputs.
        :param lr: learning rate for Adam.
        :param batch_size: training batch size.
        :param epochs: number of training epochs.
        """
        self.bert_layer = bert_layer
        self.max_sql = max_sql

        resolved = self.bert_layer.resolved_object
        vocab = resolved.vocab_file.asset_path.numpy()
        lowercase = resolved.do_lower_case.numpy()
        self.token = FullTokenizer(vocab, lowercase)

        self.lr = lr
        self.batch_size = batch_size
        self.epochs = epochs
        self.models = []
        self.scores = {}

    def encode(self, texts):
        """Encode texts as (token ids, masks, segment ids) arrays.

        Every text is tokenized, cut to ``max_sql - 2`` pieces, wrapped in
        [CLS]/[SEP] and zero-padded to ``max_sql``.

        :param texts: iterable of strings.
        :return: three numpy arrays, one row per text.
        """
        id_rows, mask_rows, segment_rows = [], [], []
        for raw in texts:
            pieces = self.token.tokenize(raw)[:self.max_sql - 2]
            sequence = ['[CLS]'] + pieces + ['[SEP]']
            padding = self.max_sql - len(sequence)

            ids = self.token.convert_tokens_to_ids(sequence)
            ids.extend([0] * padding)

            id_rows.append(ids)
            mask_rows.append([1] * len(sequence) + [0] * padding)
            segment_rows.append([0] * self.max_sql)

        return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

    def build_model(self):
        """Create and compile the classifier.

        The vector at position 0 of the sequence output feeds a single
        sigmoid unit; the model is compiled with binary cross-entropy,
        Adam and accuracy.

        :return: the compiled Keras model.
        """
        shape = (self.max_sql, )
        input_words = Input(shape=shape, dtype=tf.int32, name='input_words')
        input_mask = Input(shape=shape, dtype=tf.int32, name='input_mask')
        segmentids = Input(shape=shape, dtype=tf.int32, name='segment_ids')
        model_inputs = [input_words, input_mask, segmentids]

        # Only the sequence output is used; the first output is discarded.
        _, sequence_output = self.bert_layer(model_inputs)
        first_position = sequence_output[:, 0, :]
        out = Dense(1, activation='sigmoid')(first_position)

        model = Model(inputs=model_inputs, outputs=out)
        model.compile(loss='binary_crossentropy',
                      optimizer=Adam(learning_rate=self.lr),
                      metrics=['accuracy'])
        return model

    def fit(self, x):
        """Train on an 80/20 split of ``x`` and checkpoint the best model.

        :param x: frame-like object with ``cleaned`` (text) and
            ``target_relabeled`` (label) columns.
        """
        # The label outputs of the split are not used; labels are taken
        # from the split frames below.
        train_part, val_part, _, _ = train_test_split(x,
                                                      x.target_relabeled,
                                                      test_size=0.2,
                                                      random_state=878)

        ytrain = train_part.target_relabeled
        xtrain = self.encode(train_part.cleaned.str.lower())
        yval = val_part.target_relabeled
        xval = self.encode(val_part.cleaned.str.lower())

        metrics = ClassificationReport(train=(xtrain, ytrain),
                                       val=(xval, yval))
        checkpoint = ModelCheckpoint('model_BERT.h5',
                                     monitor='val_loss',
                                     save_best_only=True)

        model = self.build_model()
        model.fit(xtrain,
                  ytrain,
                  validation_data=(xval, yval),
                  callbacks=[metrics, checkpoint],
                  epochs=self.epochs,
                  batch_size=self.batch_size)

    def predict(self, x):
        """Rebuild the model, load ``model_BERT.h5`` and predict.

        :param x: frame-like object with a ``cleaned`` text column.
        :return: the model's predictions.
        """
        model = self.build_model()
        model.load_weights('model_BERT.h5')
        encoded = self.encode(x.cleaned.str.lower())
        return model.predict(encoded)
class BertPreprocessor(Preprocessor):
    """Preprocessor for BERT embedding.

    This class can be used to do all the work to create the inputs (and
    outputs) of a Neural Network using BERT as embedding. Single sequences
    and sequence pairs are supported.

    Source: https://github.com/google-research/bert_keras

    NOTE(review): ``self._max_seq_len`` is read throughout but never set
    here -- presumably the ``Preprocessor`` base class sets it from
    ``kwargs``; confirm.
    """

    def __init__(self, pretrained_model_path: str, **kwargs):
        """Build the tokenizer from the hub module's vocabulary.

        :param pretrained_model_path: spec handed to ``hub.Module``.
        :param kwargs: forwarded unchanged to the base class.
        """
        super().__init__(**kwargs)

        info = hub.Module(spec=pretrained_model_path)(
            signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [info["vocab_file"], info["do_lower_case"]])

        # Create the tokenizer with the vocabulary of the pretrained model
        self._tokenizer = FullTokenizer(vocab_file=vocab_file,
                                        do_lower_case=do_lower_case)

        basic_tokens = self._tokenizer.convert_tokens_to_ids(
            ["[CLS]", "[SEP]"])
        self._CLS_token = basic_tokens[0]
        self._SEP_token = basic_tokens[1]

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""
        # This is a simple heuristic which will always truncate the longer
        # sequence one token at a time. This makes more sense than truncating
        # an equal percent of tokens from each, since if one sequence is very
        # short then each token that's truncated likely contains more
        # information than a longer sequence.
        while len(tokens_a) + len(tokens_b) > max_length:
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def _padding_sentence(self):
        """Return a zero length sentence to pad last batch.

        :return: Three sequences of zeros (tokens, masks, segment ids).
        """
        width = self._max_seq_len
        return [0] * width, [0] * width, [0] * width

    def tokenize(self, text_a: str, text_b: str = None):
        """Convert one or two texts into ids, mask and segment ids.

        Layout for a single text: ``[CLS] A [SEP]``. Layout for a pair:
        ``[CLS] A [SEP] B [SEP]``, where ``B`` and the closing ``[SEP]``
        carry segment id 1. Everything after that is zero padding.

        :param text_a: First sequence
        :param text_b: Second sequence (optional)
        :return: ``(input_ids, input_mask, input_segment_ids)``, three lists
            of length ``self._max_seq_len``.
        """
        tokens_a = self._tokenizer.tokenize(text_a)
        tokens_b = self._tokenizer.tokenize(text_b) if text_b else None

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length fits. Account for [CLS], [SEP], [SEP] with "- 3".
            self._truncate_seq_pair(tokens_a, tokens_b,
                                    self._max_seq_len - 3)
        elif len(tokens_a) > self._max_seq_len - 2:
            # Account for [CLS] and [SEP] with "- 2"
            tokens_a = tokens_a[0:(self._max_seq_len - 2)]

        sequence = [self._CLS_token]
        sequence.extend(self._tokenizer.convert_tokens_to_ids(tokens_a))
        # Separator after the first text. For pairs it used to be missing,
        # although the truncation above already reserves a slot for it.
        sequence.append(self._SEP_token)
        segments = [0] * len(sequence)

        if tokens_b:
            ids_b = list(self._tokenizer.convert_tokens_to_ids(tokens_b))
            sequence.extend(ids_b)
            sequence.append(self._SEP_token)
            # The second text and its closing [SEP] belong to segment 1.
            segments.extend([1] * (len(ids_b) + 1))

        pad_len = self._max_seq_len - len(sequence)
        input_ids = sequence + [0] * pad_len
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(sequence) + [0] * pad_len
        input_segment_ids = segments + [0] * pad_len

        # safety check
        assert len(input_ids) == self._max_seq_len
        assert len(input_mask) == self._max_seq_len
        assert len(input_segment_ids) == self._max_seq_len

        return input_ids, input_mask, input_segment_ids

    def fit(self, texts: List[str]) -> 'BertPreprocessor':
        """This function does nothing in case of BERT but must be implemented.

        :param texts: -
        :return: self
        """
        return self

    def transform(self, examples: List[InputExample]) -> list:
        """Transform examples into arrays of ids, masks and segment ids.

        Masks separate valid from padding tokens. Segment ids are 0 for
        ``text_a`` and 1 for ``text_b``.

        :param examples: objects providing ``text_a`` and ``text_b``.
        :return: ``[input_ids, input_masks, segment_ids]`` as numpy arrays
            with one row per example.
        """
        input_ids, input_masks, segment_ids = [], [], []
        for example in examples:
            input_id, input_mask, segment_id = self.tokenize(
                text_a=example.text_a, text_b=example.text_b)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        return [
            np.array(input_ids),
            np.array(input_masks),
            np.array(segment_ids)
        ]

    def inverse_transform(self, sequences: np.ndarray):
        """Transform sequences of tokens back to sequences of words.

        :param sequences: The sequences of tokens.
        :return: The sequences of words
        """
        return self._tokenizer.convert_ids_to_tokens(sequences)
class BertInputProcessor(InputProcessor):
    """Convert word sequences into padded BERT inputs.

    Words are split into word pieces with the tokenizer described by the
    TF-Hub module at ``params.bert_path``. Every sentence is wrapped in
    ``[CLS]`` / ``[SEP]``, cut to ``params.max_sent_len`` and post-padded.
    Optionally a per-piece user-dictionary tag sequence is produced as well.
    """

    # Entries of a mapping sequence that do not index a word of the input:
    # the two service tokens and 'X' for non-initial word pieces.
    _NON_WORD_MARKERS = frozenset({"[CLS]", "[SEP]", "X"})
    _MODES = ('predict', 'evaluate')

    def __init__(self, params):
        """Load the tokenizer, the label vocabulary and the optional user dict.

        Reads ``max_sent_len``, ``use_dict``, ``bert_path``, ``data_dir``,
        ``vocab_pad`` and ``vocab_unk`` from *params* and writes back
        ``params.num_labels`` (and ``params.dict_vocab_size`` when
        ``use_dict`` is set).
        """
        self.name = 'bert_input_processor'
        self._max_sent_len = params.max_sent_len
        self._use_dict = params.use_dict

        # The hub module exposes its vocabulary file and casing flag through
        # the "tokenization_info" signature; a session is needed to read them.
        bert_module = hub.Module(params.bert_path)
        tokenization_info = bert_module(signature="tokenization_info",
                                        as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"]
            ])
        self._do_lower_case = do_lower_case
        self._tokenizer = FullTokenizer(vocab_file, do_lower_case)

        label_vocab_path = params.data_dir + '/vocab/label_vocab.txt'
        logging.info('load label vocab from :{}'.format(label_vocab_path))
        self.label_vocab = Vocab(params.vocab_pad, params.vocab_unk)
        self.label_vocab.load(label_vocab_path)
        params.num_labels = len(self.label_vocab)
        self._num_labels = params.num_labels

        if params.use_dict:
            dict_vocab_path = params.data_dir + '/vocab/dict_vocab.txt'
            logging.info(
                'load user dict vocab from : {}'.format(dict_vocab_path))
            self.dict_vocab = Vocab(params.vocab_pad, params.vocab_unk)
            self.dict_vocab.load(dict_vocab_path)
            params.dict_vocab_size = len(self.dict_vocab)

            dict_path = params.data_dir + '/user_dict.json'
            logging.info('load user dict form : {}'.format(dict_path))
            with open(dict_path, 'r') as file:
                self.user_dict = json.load(file)

    def _split_words(self, word_seq):
        """Return ``(word_pieces, mapping)`` for one sentence.

        ``mapping`` holds, for every word piece, the index of the source word
        if the piece starts that word and ``'X'`` otherwise. Both lists are
        cut to ``max_sent_len - 2`` and wrapped in ``[CLS]`` / ``[SEP]``.
        """
        pieces, mapping = [], []
        for index, word in enumerate(word_seq):
            # The word-piece tokenizer is called directly, so the lowercasing
            # normally done by the basic tokenizer has to happen here.
            if self._do_lower_case:
                word = word.lower()
            word_pieces = self._tokenizer.wordpiece_tokenizer.tokenize(word)
            for position, piece in enumerate(word_pieces):
                pieces.append(piece)
                mapping.append(index if position == 0 else 'X')
        room = self._max_sent_len - 2
        return (["[CLS]"] + pieces[:room] + ["[SEP]"],
                ["[CLS]"] + mapping[:room] + ["[SEP]"])

    def _pad(self, id_seqs):
        """Post-pad (or cut) every sequence to ``max_sent_len``."""
        return pad_sequences(id_seqs,
                             padding='post',
                             maxlen=self._max_sent_len)

    @classmethod
    def _project(cls, mapping_seq, resolve):
        """Map word indices through *resolve*, keeping markers unchanged."""
        return [
            entry if entry in cls._NON_WORD_MARKERS else resolve(entry)
            for entry in mapping_seq
        ]

    def transform(self, data, mode="predict", verbose=False):
        """Build model inputs (and one-hot labels) from ``data``.

        :param data: mapping with a ``'word'`` entry (sequences of words) and,
            for ``mode='evaluate'``, a ``'label'`` entry aligned with it.
            ``data['bert_mapping']`` is set to the word-piece alignment.
        :param mode: ``'predict'`` or ``'evaluate'``.
        :param verbose: accepted for interface compatibility; unused.
        :return: ``x_inputs`` (padded ids, mask, segment ids and, with
            ``use_dict``, dict tag ids) for ``'predict'``;
            ``(x_inputs, y_seqs)`` for ``'evaluate'``.
        :raises ValueError: if *mode* is neither of the two.
        """
        # Fail fast: reject a bad mode before doing any work or mutating data.
        if mode not in self._MODES:
            raise ValueError('mode must be predict or evaluate')

        word_seqs = data['word']
        bert_word_seqs, mapping_seqs = [], []
        for word_seq in word_seqs:
            bert_word_seq, mapping_seq = self._split_words(word_seq)
            bert_word_seqs.append(bert_word_seq)
            mapping_seqs.append(mapping_seq)
        data['bert_mapping'] = mapping_seqs

        padded_bert_word_id_seqs = self._pad([
            self._tokenizer.convert_tokens_to_ids(sent)
            for sent in bert_word_seqs
        ])
        # Mask is 1 on real pieces; padding adds the zeros.
        padded_bert_mask_id_seqs = self._pad(
            [[1] * len(sent) for sent in bert_word_seqs])
        # Single-segment input: every position belongs to segment 0.
        padded_bert_segment_id_seqs = self._pad(
            [[0] * len(sent) for sent in bert_word_seqs])
        x_inputs = [
            padded_bert_word_id_seqs, padded_bert_mask_id_seqs,
            padded_bert_segment_id_seqs
        ]

        if self._use_dict:
            unknown_tag = self.dict_vocab._unknown
            bert_dict_id_seqs = []
            for word_seq, mapping_seq in zip(word_seqs, mapping_seqs):
                dict_tags = [
                    entry if entry in self._NON_WORD_MARKERS else
                    self.user_dict.get(word_seq[entry], unknown_tag)
                    for entry in mapping_seq
                ]
                bert_dict_id_seqs.append(
                    [self.dict_vocab.encode(tag) for tag in dict_tags])
            x_inputs.append(self._pad(bert_dict_id_seqs))

        assert len(padded_bert_word_id_seqs[0]) == self._max_sent_len
        assert len(padded_bert_mask_id_seqs[0]) == self._max_sent_len
        assert len(padded_bert_segment_id_seqs[0]) == self._max_sent_len

        if mode == 'predict':
            return x_inputs

        # mode == 'evaluate': project word labels onto word pieces.
        bert_label_id_seqs = []
        for label_seq, mapping_seq in zip(data['label'], mapping_seqs):
            piece_labels = self._project(mapping_seq, label_seq.__getitem__)
            bert_label_id_seqs.append([
                self.label_vocab.encode(label, allow_oov=False)
                for label in piece_labels
            ])
        y_seqs = to_categorical(self._pad(bert_label_id_seqs),
                                self._num_labels).astype(int)
        if len(y_seqs.shape) != 3:
            # Restore the batch axis when the result is not 3-D.
            y_seqs = np.expand_dims(y_seqs, axis=0)
        return x_inputs, y_seqs
class BertEmbeddingsResolver:
    """Compute BERT embeddings for pre-tokenized sentences (TF 1.x graph mode).

    The model is built from ``bert_config.json`` and ``vocab.txt`` found in
    ``model_folder`` and its weights are restored from a checkpoint in the
    same folder.
    """

    # Checkpoint prefixes looked for inside ``model_folder``, in this order.
    _CHECKPOINT_NAMES = ('bert_model.ckpt', 'model.ckpt-1000000')

    def __init__(self,
                 model_folder,
                 max_length=256,
                 lowercase=True,
                 checkpoint_name=None):
        """Build the graph and restore the weights.

        :param model_folder: folder with ``vocab.txt``, ``bert_config.json``
            and the checkpoint files.
        :param max_length: fixed sequence length fed to the model.
        :param lowercase: passed to the tokenizer as ``do_lower_case``.
        :param checkpoint_name: checkpoint prefix inside *model_folder*;
            when ``None`` the first of the known names that exists is used.
        """
        # 1. Create tokenizer
        self.max_length = max_length
        vocab_file = os.path.join(model_folder, 'vocab.txt')
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case=lowercase)

        # 2. Read Config
        config_file = os.path.join(model_folder, 'bert_config.json')
        self.config = BertConfig.from_json_file(config_file)

        # 3. Create Model
        self.session = tf.Session()
        self.token_ids_op = tf.placeholder(tf.int32,
                                           shape=(None, max_length),
                                           name='token_ids')
        self.model = BertModel(config=self.config,
                               is_training=False,
                               input_ids=self.token_ids_op,
                               use_one_hot_embeddings=False)

        # 4. Restore Trained Model
        self.saver = tf.train.Saver()
        ckpt_file = self._locate_checkpoint(model_folder, checkpoint_name)
        self.saver.restore(self.session, ckpt_file)

        # The output tensor is looked up by name from the layer count.
        # NOTE(review): presumably the final encoder output; confirm against
        # the BertModel graph.
        hidden_layers = self.config.num_hidden_layers
        self.embeddings_op = tf.get_default_graph().get_tensor_by_name(
            "bert/encoder/Reshape_{}:0".format(hidden_layers + 1))

    @classmethod
    def _locate_checkpoint(cls, model_folder, checkpoint_name=None):
        """Return the checkpoint prefix to restore from.

        An explicit *checkpoint_name* wins. Otherwise the known names are
        probed on disk (``<prefix>.index`` or the bare prefix); if none
        exists the first known name is returned so that ``restore`` reports
        the missing file.
        """
        if checkpoint_name is not None:
            return os.path.join(model_folder, checkpoint_name)
        candidates = [
            os.path.join(model_folder, name)
            for name in cls._CHECKPOINT_NAMES
        ]
        for prefix in candidates:
            if os.path.exists(prefix + '.index') or os.path.exists(prefix):
                return prefix
        return candidates[0]

    def close(self):
        """Release the TensorFlow session held by this resolver."""
        self.session.close()

    def tokenize_sentence(self, tokens, add_service_tokens=True):
        """Split word tokens into word pieces.

        :param tokens: iterable of word strings.
        :param add_service_tokens: wrap the result in ``[CLS]`` / ``[SEP]``.
        :return: ``(pieces, is_word_start)``; the flag is ``True`` on the
            first piece of every input token. The result never exceeds
            ``max_length`` entries.
        """
        result = []
        is_word_start = []
        for token in tokens:
            # A token may tokenize to nothing; keep one placeholder piece so
            # that every input token still has a word start.
            pieces = self.tokenizer.tokenize(token) or ['[UNK]']
            result.extend(pieces)
            is_word_start.extend([True] + [False] * (len(pieces) - 1))

        # Leave room for the two service tokens when they are requested.
        room = self.max_length - 2 if add_service_tokens else self.max_length
        result = result[:room]
        is_word_start = is_word_start[:room]
        if add_service_tokens:
            result = ['[CLS]'] + result + ['[SEP]']
            is_word_start = [False] + is_word_start + [False]
        return (result, is_word_start)

    def resolve_sentences(self, sentences):
        """Embed a batch of sentences.

        :param sentences: sequence of sentences, each a sequence of words.
        :return: list with one ``TokenEmbeddings.create_sentence`` result per
            sentence; an empty list for an empty batch.
        """
        if len(sentences) == 0:
            # An empty feed would not match the (None, max_length) placeholder.
            return []

        batch_tokens = []
        batch_is_word_start = []
        batch_token_ids = []
        for sentence in sentences:
            tokens, is_word_start = self.tokenize_sentence(sentence)
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            # Right-pad with zeros up to the fixed model length.
            padded = np.pad(token_ids,
                            [(0, self.max_length - len(token_ids))],
                            mode='constant')
            batch_token_ids.append(padded)
            batch_tokens.append(tokens)
            batch_is_word_start.append(is_word_start)

        embeddings = self.session.run(
            self.embeddings_op,
            feed_dict={self.token_ids_op: batch_token_ids})

        result = []
        for i, (tokens, is_word_start) in enumerate(
                zip(batch_tokens, batch_is_word_start)):
            # Drop the rows that belong to padding positions.
            item_embeddings = embeddings[i, :len(tokens), :]
            result.append(
                TokenEmbeddings.create_sentence(tokens, is_word_start,
                                                item_embeddings))
        return result

    def resolve_sentence(self, sentence):
        """Embed a single sentence (a sequence of words)."""
        return self.resolve_sentences([sentence])[0]
def _int64_feature_list(values):
    """Return a ``FeatureList`` with one single-value int64 feature per item."""
    return tf.train.FeatureList(feature=[
        tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
        for value in values
    ])


def build_dataset(conll_file,
                  tfrecod_file,
                  pos2id,
                  dep2id,
                  path2id,
                  truncate=False,
                  max_subwords=64):
    """Convert a tab-separated, blank-line-delimited file into a TFRecord file.

    Each sentence becomes one ``tf.train.SequenceExample`` whose feature
    lists are aligned with the BERT sub-words of the sentence, wrapped in
    ``[CLS]`` / ``[SEP]``. Per input line the columns are used as follows:
    0 -> word (tokenized into sub-words), 2 -> ``'pos'`` via ``pos2id``,
    3 -> ``'dep'`` via ``dep2id``, 4 -> ``'path'`` via ``path2id``,
    5 -> ``'lpath'``, 6 -> ``'cp'``, 7 -> ``'cue'``, 8 -> ``'span'``.
    Word-level values are repeated for every sub-word of the word;
    ``'masks'`` is 1 on the first sub-word of each word and 0 elsewhere.

    :param conll_file: path of the input file.
    :param tfrecod_file: path of the TFRecord file to write.
    :param pos2id: lookup passed to ``mapping`` for column 2.
    :param dep2id: lookup passed to ``mapping`` for column 3.
    :param path2id: lookup passed to ``mapping`` for column 4.
    :param truncate: when true, sentences longer than *max_subwords*
        (service tokens included) are skipped; they are not shortened.
    :param max_subwords: length limit applied when *truncate* is set.
    """
    tokenizer = FullTokenizer(vocab_file=VOCAB_FILE,
                              do_lower_case=DO_LOWER_CASE)
    with open(conll_file, 'r') as reader:
        sentences = reader.read().strip().split('\n\n')

    cls_id, sep_id = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]"])
    # Value stored at the [CLS] / [SEP] position of every per-sub-word column.
    boundary = {
        'span': 0,
        'masks': 0,
        'cue': 0,
        'pos': 0,
        'dep': 0,
        'path': 0,
        'lpath': -1,
        'cp': -1,
    }

    # The context manager closes the writer even if a malformed line raises.
    with tf.python_io.TFRecordWriter(tfrecod_file) as tf_writer:
        for sent in sentences:
            token_ids = [cls_id]
            columns = {name: [value] for name, value in boundary.items()}

            for line in sent.split('\n'):
                fields = line.split('\t')
                # Columns 0..8 are read below, so nine fields are required.
                if len(fields) < 9:
                    continue
                # A word can tokenize to nothing; keep a placeholder so its
                # annotations are not lost.
                subwords = tokenizer.tokenize(fields[0]) or ['[UNK]']
                count = len(subwords)
                word_values = {
                    'span': int(fields[8]),
                    'cue': int(fields[7]),
                    'pos': int(mapping(pos2id, fields[2])),
                    'dep': int(mapping(dep2id, fields[3])),
                    'path': int(mapping(path2id, fields[4])),
                    'lpath': int(fields[5]),
                    'cp': int(fields[6]),
                }
                token_ids.extend(tokenizer.convert_tokens_to_ids(subwords))
                for name, value in word_values.items():
                    columns[name].extend([value] * count)
                columns['masks'].extend([1] + [0] * (count - 1))

            token_ids.append(sep_id)
            for name, value in boundary.items():
                columns[name].append(value)
            assert len(token_ids) == len(columns['span']) == len(
                columns['masks'])

            # Only [CLS] and [SEP]: the sentence had no usable line.
            if len(token_ids) <= 2:
                continue
            if truncate and len(token_ids) > max_subwords:
                continue

            feature_list = {'token_id': _int64_feature_list(token_ids)}
            for name, values in columns.items():
                feature_list[name] = _int64_feature_list(values)
            context = tf.train.Features(
                feature={
                    "length":
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[len(token_ids)])),
                })
            feature_lists = tf.train.FeatureLists(feature_list=feature_list)
            ex = tf.train.SequenceExample(feature_lists=feature_lists,
                                          context=context)
            tf_writer.write(ex.SerializeToString())
class BertNerPreprocessor:
    """Takes tokens and splits them into bert subtokens, encodes subtokens with their indices.

    Creates a mask of subtokens (one for the first subtoken of a word, zero
    for later subtokens and for ``[CLS]`` / ``[SEP]``). If tags are provided,
    tags for subtokens are calculated as well; non-initial subtokens and the
    service tokens get the tag ``'X'``.

    Args:
        max_seq_length: max sequence length in subtokens, including [SEP] and
            [CLS] tokens; ``None`` disables the check
        max_subword_length: a token that splits into more subtokens than this
            is replaced by a single ``[UNK]``; ``None`` disables the limit
        token_maksing_prob: probability of replacing the subtokens of a token
            with ``[MASK]``; only applied when ``mode == 'train'``
        provide_subword_tags: output tags for subwords or for words
        **kwargs: only ``mode`` is read

    Attributes:
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        max_subword_length: max number of subtokens a single token may produce
        tokenizer: instance of Bert FullTokenizer, built from ``VOCAB_PATH``
            with ``do_lower_case=False``
    """

    def __init__(self,
                 max_seq_length: int = 4096,
                 max_subword_length: int = 15,
                 token_maksing_prob: float = 0.0,
                 provide_subword_tags: bool = False,
                 **kwargs):
        self._re_tokenizer = re.compile(r"[\w']+|[^\w ]")
        self.provide_subword_tags = provide_subword_tags
        self.mode = kwargs.get('mode')
        self.max_seq_length = max_seq_length
        self.max_subword_length = max_subword_length
        self.tokenizer = FullTokenizer(vocab_file=VOCAB_PATH,
                                       do_lower_case=False)
        self.token_maksing_prob = token_maksing_prob
        self.log = getLogger(__name__)

    def __call__(self,
                 tokens: Union[List[List[str]], List[str]],
                 tags: List[List[str]] = None,
                 **kwargs):
        """Encode a batch of sentences.

        Args:
            tokens: batch of token lists, or batch of raw strings (which are
                split with the internal regex first)
            tags: optional batch of per-token tags; ``'X'`` marks a token
                that must be masked out

        Returns:
            ``(tokens, subword_tokens, subword_tok_ids, subword_masks)`` and,
            when *tags* is given, a fifth element: subword tags if
            ``provide_subword_tags`` is set, otherwise the input tags with
            ``'X'`` entries removed. Ids and masks are zero-padded arrays.

        Raises:
            RuntimeError: a sentence exceeds ``max_seq_length`` subtokens.
        """
        # Guard the peek at the first element so an empty batch is accepted.
        if len(tokens) > 0 and isinstance(tokens[0], str):
            tokens = [self._re_tokenizer.findall(s) for s in tokens]

        subword_tokens, subword_tok_ids, subword_masks, subword_tags = [], [], [], []
        for i, toks in enumerate(tokens):
            ys = ['O'] * len(toks) if tags is None else tags[i]
            mask = [int(y != 'X') for y in ys]
            assert len(toks) == len(ys) == len(mask), \
                f"toks({len(toks)}) should have the same length as " \
                f" ys({len(ys)}) and mask({len(mask)}), tokens = {toks}."

            sw_toks, sw_mask, sw_ys = self._ner_bert_tokenize(
                toks,
                mask,
                ys,
                self.tokenizer,
                self.max_subword_length,
                mode=self.mode,
                token_maksing_prob=self.token_maksing_prob)

            if self.max_seq_length is not None and len(
                    sw_toks) > self.max_seq_length:
                self.log.error('Oversized subword sequence (%d tokens): %s',
                               len(sw_toks), sw_toks)
                raise RuntimeError(
                    f"input sequence after bert tokenization"
                    f" shouldn't exceed {self.max_seq_length} tokens. {len(sw_toks)} is"
                )

            subword_tokens.append(sw_toks)
            subword_tok_ids.append(
                self.tokenizer.convert_tokens_to_ids(sw_toks))
            subword_masks.append(sw_mask)
            subword_tags.append(sw_ys)
            assert len(sw_mask) == len(sw_toks) == len(subword_tok_ids[-1]) == len(sw_ys), \
                f"length of mask({len(sw_mask)}), tokens({len(sw_toks)})," \
                f" token ids({len(subword_tok_ids[-1])}) and ys({len(sw_ys)})" \
                f" for tokens = `{toks}` should match"

        subword_tok_ids = self.zero_pad(subword_tok_ids, dtype=int, padding=0)
        subword_masks = self.zero_pad(subword_masks, dtype=int, padding=0)

        if tags is None:
            return tokens, subword_tokens, subword_tok_ids, subword_masks
        if self.provide_subword_tags:
            return tokens, subword_tokens, subword_tok_ids, subword_masks, subword_tags

        # Word-level tags: every unmasked word must own exactly one mask bit.
        nonmasked_tags = [[t for t in ts if t != 'X'] for ts in tags]
        for swts, swids, swms, ts in zip(subword_tokens, subword_tok_ids,
                                         subword_masks, nonmasked_tags):
            if (len(swids) != len(swms)) or (len(ts) != sum(swms)):
                self.log.warning('Not matching lengths of the tokenization!')
                self.log.warning(f'Tokens len: {len(swts)}\n Tokens: {swts}')
                self.log.warning(f'Masks len: {len(swms)}, sum: {sum(swms)}')
                self.log.warning(f'Masks: {swms}')
                self.log.warning(f'Tags len: {len(ts)}\n Tags: {ts}')
        return tokens, subword_tokens, subword_tok_ids, subword_masks, nonmasked_tags

    @staticmethod
    def _ner_bert_tokenize(
            tokens: List[str],
            mask: List[int],
            tags: List[str],
            tokenizer: 'FullTokenizer',
            max_subword_len: int = None,
            mode: str = None,
            token_maksing_prob: float = 0.0
    ) -> Tuple[List[str], List[int], List[str]]:
        """Split one sentence into subtokens with aligned mask and tags.

        Returns ``(subtokens, mask, tags)``, all wrapped in the ``[CLS]`` /
        ``[SEP]`` positions (mask 0, tag ``'X'``).
        """
        tokens_subword = ['[CLS]']
        mask_subword = [0]
        tags_subword = ['X']
        for token, flag, tag in zip(tokens, mask, tags):
            subwords = tokenizer.tokenize(token)
            too_long = (max_subword_len is not None
                        and len(subwords) > max_subword_len)
            if not subwords or too_long:
                # Empty or over-fragmented token: one [UNK] stands in for it.
                tokens_subword.append('[UNK]')
                mask_subword.append(flag)
                tags_subword.append(tag)
                continue

            if (mode == 'train' and token_maksing_prob > 0.0
                    and np.random.rand() < token_maksing_prob):
                tokens_subword.extend(['[MASK]'] * len(subwords))
            else:
                tokens_subword.extend(subwords)
            # Only the first subtoken carries the word's flag and tag.
            mask_subword.extend([flag] + [0] * (len(subwords) - 1))
            tags_subword.extend([tag] + ['X'] * (len(subwords) - 1))

        tokens_subword.append('[SEP]')
        mask_subword.append(0)
        tags_subword.append('X')
        return tokens_subword, mask_subword, tags_subword

    def zero_pad(self, batch, zp_batch=None, dtype=np.float32, padding=0):
        """Copy a ragged nested *batch* into a rectangular array.

        When *zp_batch* is ``None`` an array sized by the per-level maximum
        lengths is allocated and filled with *padding*; otherwise *zp_batch*
        is filled in place. The filled array is returned.
        """
        if zp_batch is None:
            dims = self.get_dimensions(batch)
            zp_batch = np.ones(dims, dtype=dtype) * padding
        if zp_batch.ndim == 1:
            zp_batch[:len(batch)] = batch
        else:
            # Each row is a view, so the recursive call writes into zp_batch.
            for b, zp in zip(batch, zp_batch):
                self.zero_pad(b, zp)
        return zp_batch

    def get_dimensions(self, batch) -> List[int]:
        """Return the maximum length found at every nesting level of *batch*."""
        return list(map(max, self.get_all_dimensions(batch)))

    def get_all_dimensions(
            self,
            batch: Sequence,
            level: int = 0,
            res: Optional[List[List[int]]] = None) -> List[List[int]]:
        """Collect, per nesting level, the lengths of all containers in *batch*.

        Strings are treated as leaves. *level* and *res* are used by the
        recursion; callers pass *batch* only.
        """
        if not level:
            res = [[len(batch)]]
        if len(batch) and isinstance(
                batch[0], Sized) and not isinstance(batch[0], str):
            level += 1
            if len(res) <= level:
                res.append([])
            for item in batch:
                res[level].append(len(item))
                self.get_all_dimensions(item, level, res)
        return res