def parse_sequence(tokenizer: t10n.FullTokenizer, sequence: str) -> SequenceParseResult:
    """Tokenize ``sequence`` and build fixed-length model inputs.

    The word pieces are wrapped in ``[CLS]`` / ``[SEP]``, converted to ids and
    right-padded with zeros up to 128 entries. Inputs that already produce
    more than 128 ids are returned unpadded and untruncated.

    Args:
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
        sequence: Raw text to encode.

    Returns:
        A ``SequenceParseResult`` carrying the unpadded ``tokens`` and the
        padded ``token_type_ids``, ``attention_mask`` and ``input_ids``.
    """
    # Default for our model
    max_seq_length = 128

    tokens = ['[CLS]'] + list(tokenizer.tokenize(sequence)) + ['[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Could be 0 or 1, not sure which index is *supposed* to represent a
    # first segment
    token_type_ids = [0] * len(tokens)

    # Every real token is attended to except position 0 ([CLS]), which the
    # mask zeroes out.
    attention_mask = [0] + [1] * (len(tokens) - 1)

    # Pad all three arrays with zeros. Zero is the [PAD]-token for the
    # BERT-vocab, and padding is excluded from the attention mask. A
    # non-positive difference yields an empty list, i.e. no padding.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    token_type_ids += padding
    attention_mask += padding

    return SequenceParseResult(
        tokens=tokens,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        input_ids=input_ids,
    )
def preprocess(data):
    """Encode sentence pairs into fixed-width (128) integer arrays.

    Args:
        data: Sized sequence of pairs; ``text[0]`` and ``text[1]`` of each
            item are tokenized and concatenated.

    Returns:
        Tuple ``(tok_ip, sent_ip, pos_ip, masks)``:
        ``tok_ip`` holds token ids padded with 0, ``sent_ip`` the segment ids
        (0 for the first text, 1 for the second, 0 for padding), ``pos_ip``
        the positions 0..127, and ``masks`` the 1/0 padding mask expanded to
        shape ``(len(data), 1, 1, 128)``.
    """
    max_len = 128
    tokenizer = FullTokenizer(vocab_file)
    num_rows = len(data)
    tok_ip = np.zeros((num_rows, max_len), dtype="int32")
    sent_ip = np.zeros((num_rows, max_len), dtype="int8")
    pos_ip = np.zeros((num_rows, max_len), dtype="int8")
    masks = np.zeros((num_rows, max_len), dtype="int8")
    pos_val = range(max_len)

    for pos, text in tqdm.tqdm_notebook(enumerate(data)):
        tok0 = tokenizer.tokenize(text[0])
        tok1 = tokenizer.tokenize(text[1])
        tok = tok0 + tok1
        if len(tok) > max_len:
            # Keep the first 127 pieces and close the sequence with [SEP].
            tok = tok[:max_len - 1] + ["[SEP]"]
        tok_len = len(tok)
        pad_len = max_len - tok_len
        # After truncation the first text may be longer than what was kept;
        # clamp so the segment row always has exactly ``max_len`` entries.
        tok0_len = min(len(tok0), tok_len)

        tok_ip[pos] = tokenizer.convert_tokens_to_ids(tok) + [0] * pad_len
        # Fix: the segment ids used to be computed and then dropped, leaving
        # ``sent_ip`` all zeros.
        sent_ip[pos] = ([0] * tok0_len
                        + [1] * (tok_len - tok0_len)
                        + [0] * pad_len)
        pos_ip[pos] = pos_val
        masks[pos] = [1] * tok_len + [0] * pad_len

    # Add two singleton axes so the mask broadcasts against 4-D tensors.
    masks = masks[:, None, None, :]
    return tok_ip, sent_ip, pos_ip, masks
def __init__(self, bert_config_file, init_checkpoint, max_seq_length, vocab_file, num_labels, use_gpu=False):
    """Build the prediction graph and bind checkpoint variables to it.

    Args:
        bert_config_file: JSON file passed to ``BertConfig.from_json_file``.
        init_checkpoint: Checkpoint to initialise from; falsy to skip.
        max_seq_length: Width of the three input placeholders.
        vocab_file: Vocabulary file for the tokenizer.
        num_labels: Forwarded to ``create_predict_model``.
        use_gpu: Stored on the instance; not used inside this method.
    """
    # Needed to import the pre-trained parameters.
    self.bert_config = modeling.BertConfig.from_json_file(bert_config_file)
    self.init_checkpoint = init_checkpoint

    # Needed for the dataset and the computation.
    self.max_seq_length = max_seq_length
    self.num_labels = num_labels

    # Needed for data preprocessing; a cased model is the default.
    self.vocab_file = vocab_file
    self.tokenizer = FullTokenizer(self.vocab_file, do_lower_case=False)

    # gpu
    self.use_gpu = use_gpu

    # Declare the computation graph.
    self.graph = tf.Graph()
    with self.graph.as_default():
        # Define the placeholders.
        batch_shape = (None, self.max_seq_length)
        self.input_ids = tf.placeholder(dtype=tf.int64, shape=batch_shape)
        self.input_mask = tf.placeholder(dtype=tf.int64, shape=batch_shape)
        self.segment_ids = tf.placeholder(dtype=tf.int64, shape=batch_shape)

        # Define the computation.
        outputs = create_predict_model(
            self.bert_config,
            self.input_ids,
            self.input_mask,
            self.segment_ids,
            self.num_labels)
        self.logits, self.probabilities = outputs

        # Import the pre-trained parameters. The trainable variables exist
        # once the graph above has been created.
        self.tvars = tf.trainable_variables()
        self.initialized_variable_names = {}
        if self.init_checkpoint:
            # The checkpoint is a pre-trained BERT or an earlier training
            # run; only variables present in both the checkpoint and this
            # graph are mapped.
            mapping = modeling.get_assignment_map_from_checkpoint(
                self.tvars, self.init_checkpoint)
            self.assignment_map, self.initialized_variable_names = mapping
            tf.train.init_from_checkpoint(self.init_checkpoint,
                                          self.assignment_map)
def test_tokenize(self):
    """The default tokenizer yields one CharToken per character, flagging
    the expected first-of-word positions."""
    sentence = '実質的変化はなかった'
    first_positions = [0, 2, 3, 5, 6, 9]

    actual = FullTokenizer().tokenize(sentence)

    expected = []
    for index, char in enumerate(sentence):
        expected.append(CharToken(char, is_first=index in first_positions))
    self.assertEqual(actual, expected)
def get_lens(data):
    """Return the combined token count of each text pair in ``data``.

    Args:
        data: Iterable of pairs; items ``[0]`` and ``[1]`` are tokenized.

    Returns:
        A NumPy array with one length per pair.
    """
    tokenizer = FullTokenizer(vocab_file)
    lengths = []
    for _, pair in tqdm.tqdm(enumerate(data)):
        first_tokens = tokenizer.tokenize(pair[0])
        second_tokens = tokenizer.tokenize(pair[1])
        lengths.append(len(first_tokens + second_tokens))
    return np.array(lengths)
def convert_single_example(ex_index, example: InputExample, tag_list: list, label_list: list, max_seq_length,
                           tokenizer: tokenization.FullTokenizer):
    """Turn one example into padded ids for joint tagging/classification.

    Args:
        ex_index: Position of the example; the first five are logged.
        example: Provides ``text``, ``tag`` (indexed per token), ``label``
            and ``guid``.
        tag_list: Tag vocabulary; must contain ``[CLS]`` and ``[SEP]`` since
            those are looked up for the boundary positions.
        label_list: Label vocabulary for ``example.label``.
        max_seq_length: Output length including ``[CLS]`` and ``[SEP]``.
        tokenizer: Provides ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        An ``InputFeature`` with zero-padded ``input_ids``, ``input_mask``,
        ``segment_ids`` and ``tag_ids`` plus the scalar ``label_id``.
    """
    # Leave room for the two boundary tokens.
    query = tokenizer.tokenize(example.text)[:max_seq_length - 2]

    tokens = ["[CLS]"] + list(query) + ["[SEP]"]
    # Tags are read by token position, so ``example.tag`` must have at
    # least as many entries as there are kept tokens.
    tags = ["[CLS]"] + [example.tag[position] for position in range(len(query))] + ["[SEP]"]

    tag_map = {tag: index for index, tag in enumerate(tag_list)}
    label_map = {label: index for index, label in enumerate(label_list)}

    tag_ids = [tag_map[tag] for tag in tags]
    label_id = label_map[example.label]

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(tokens)

    # Zero-pad every per-token list by the same amount.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding
    tag_ids += padding

    if ex_index < 5:
        logger.info("*** Example ***")
        logger.info("guid: %s" % example.guid)
        printable = [tokenization.printable_text(x) for x in tokens]
        logger.info("tokens: %s" % " ".join(printable))
        logger.info("tag: %s" % " ".join(tags))
        logger.info("label: %s" % example.label)

    return InputFeature(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        tag_ids=tag_ids,
        label_id=label_id
    )
def test_tokenize_with_nelogd(self):
    """With the user dictionary loaded, the first-of-word positions differ
    from the default tokenizer's.

    Raises:
        ValueError: If the dictionary file is not present on this machine.
    """
    dic_path = "/usr/local/lib/mecab/dic/ipadic/mecab-user-dict-seed.dic"
    if not os.path.isfile(dic_path):
        raise ValueError(
            'NEOLOGD_PATH is invalid. Please set a file path to neologd dic'
        )

    sentence = '実質的変化はなかった'
    tokenizer = FullTokenizer(userdic_path=dic_path)
    first_positions = [0, 3, 5, 6, 9]
    expected = []
    for index, char in enumerate(sentence):
        expected.append(CharToken(char, is_first=index in first_positions))

    actual = tokenizer.tokenize(sentence)
    self.assertEqual(actual, expected)
def __init__(self):
    """Load the 32-label emotion model, its tokenizer and helper tables."""
    self.THRESHOLD = 0.1
    self.PROB_THRESHOLD = 0.8
    self.LABELS_32 = [
        "sentimental", "afraid", "proud", "faithful",
        "terrified", "joyful", "angry", "sad",
        "jealous", "grateful", "prepared", "embarrassed",
        "excited", "annoyed", "lonely", "ashamed",
        "guilty", "surprised", "nostalgic", "confident",
        "furious", "disappointed", "caring", "trusting",
        "disgusted", "anticipating", "anxious", "hopeful",
        "content", "impressed", "apprehensive", "devastated",
    ]
    self.MAX_SEQ_LENGTH = 50
    self.tokenizer = FullTokenizer(vocab_file='vocab.txt',
                                   do_lower_case=True)
    self.model = load_model('model_data/model32')
    self.matrix = np.genfromtxt('emotion_multiplier.csv')

    def _binarize(probability):
        # Reads the threshold at call time, so later changes to
        # ``self.PROB_THRESHOLD`` take effect.
        if probability >= self.PROB_THRESHOLD:
            return 1
        return 0

    # Element-wise 0/1 thresholding of probability arrays.
    self.map_probabilities = np.vectorize(_binarize)
def main(_):
    """Restore the translation Transformer and translate ``FLAGS.inp_sentence``.

    Args:
        _: Unused positional argument supplied by the app runner.
    """
    tokenizer_zh = FullTokenizer(vocab_file=FLAGS.bert_vocab_file,
                                 do_lower_case=True)
    tokenizer_en = load_subword_vocab(FLAGS.vocab_file)
    target_vocab_size = tokenizer_en.vocab_size + 2

    transformer = Transformer(config=FileConfig(FLAGS.config_file),
                              target_vocab_size=target_vocab_size,
                              bert_config_file=FLAGS.bert_config_file)

    # Run the model once on random inputs before loading weights
    # (presumably so its variables exist -- TODO confirm).
    dummy_shape = (1, FLAGS.max_seq_length)
    inp = tf.random.uniform(dummy_shape)
    tar_inp = tf.random.uniform(dummy_shape)
    _, _ = transformer(inp,
                       tar_inp,
                       True,
                       look_ahead_mask=None,
                       dec_padding_mask=None)
    transformer.load_weights(FLAGS.init_checkpoint)
    print(transformer.encoder.weights[0])

    result, _ = evaluate(transformer, tokenizer_zh, tokenizer_en,
                         FLAGS.inp_sentence, FLAGS.max_seq_length)

    # Ids at or above the subword vocabulary size are dropped before decoding.
    decodable_ids = [token_id for token_id in result
                     if token_id < tokenizer_en.vocab_size]
    predicted_sentence = tokenizer_en.decode(decodable_ids)

    print('Input: {}'.format(FLAGS.inp_sentence))
    print('Predicted translation: {}'.format(predicted_sentence))
def adaptERNIEtokenization(all_sentences):
    """Tokenize sentences and glue ``##`` continuation pieces back together.

    Args:
        all_sentences: Iterable of raw sentence strings.

    Returns:
        One list of merged tokens per input sentence.
    """
    tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
    ernie_tokens = [
        tokenizer.tokenize(sentence) for sentence in tqdm(all_sentences)
    ]
    print("Parsed to ERNIE tokens!")

    all_cleaned_tokens = []
    for line in tqdm(ernie_tokens):
        cleaned_tokens = []
        for token in line:
            # A continuation piece is appended to the previous token. If a
            # line starts with one there is nothing to attach it to, so it
            # is kept as-is instead of raising IndexError.
            if token[:2] == "##" and cleaned_tokens:
                cleaned_tokens[-1] += token[2:]
            else:
                cleaned_tokens.append(token)
        all_cleaned_tokens.append(cleaned_tokens)
    return all_cleaned_tokens
def bert_tokenizer(sess):
    """Create a ``FullTokenizer`` from the Hub module's tokenization info.

    Args:
        sess: Session used to evaluate the module's ``vocab_file`` and
            ``do_lower_case`` outputs.

    Returns:
        A ``FullTokenizer`` configured with the evaluated values.
    """
    module = hub.Module(bert_path)
    info = module(signature="tokenization_info", as_dict=True)
    # Evaluated in two separate runs, vocabulary path first.
    vocab_path = sess.run(info["vocab_file"])
    lower_case = sess.run(info["do_lower_case"])
    return FullTokenizer(vocab_file=vocab_path, do_lower_case=lower_case)
def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module.

    Evaluates the module's ``tokenization_info`` signature with the
    module-level ``sess`` and stores the casing flag in the module-level
    ``do_lower_case``.

    Returns:
        A ``FullTokenizer`` built from the evaluated vocab file and casing.
    """
    # The docstring must come first: placed after ``global`` it is only an
    # expression statement and ``__doc__`` stays ``None``.
    global do_lower_case
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                          tokenization_info["do_lower_case"]])
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
def __init__(self, gcn=False, soft_masked=True, lr=5e-5, beta=3.0, layers_num=2, dropout=1.0, bert_max_len=192,
             dir_model='../checkpoint/train/soft-masked-bert/',
             bert_dir='../rest-api/app/models/chinese_L-12_H-768_A-12', train=True):
    """Store hyper-parameters and load the BERT resources.

    Args:
        gcn: When true, also load the word-embedding table, the filter
            dictionary and the two adjacency matrices.
        soft_masked: Forwarded to ``BertModel``; when true the detection tag
            dictionary is defined.
        lr, beta, layers_num, dropout, bert_max_len: Stored unchanged.
        dir_model: Stored unchanged.
        bert_dir: Directory holding ``bert_config.json``, ``bert_model.ckpt``
            and ``vocab.txt``.
        train: Accepted but not used inside this method.
    """
    self.gcn = gcn
    self.soft_masked = soft_masked
    self.lr = lr
    self.dir_model = dir_model
    self.dropout = dropout  # 1
    self.bert_dir = bert_dir
    self.bert_max_len = bert_max_len
    self.beta = beta
    self.layers_num = layers_num

    config_path = bert_dir + '/bert_config.json'
    self.init_checkpoint = bert_dir + '/bert_model.ckpt'
    vocab_path = bert_dir + '/vocab.txt'

    self.vocab = construct_vocab(vocab_path)
    self.tokenizer = FullTokenizer(vocab_file=vocab_path,
                                   do_lower_case=True)
    self.bert_api = BertModel(config=BertConfig.from_json_file(config_path),
                              soft_masked=soft_masked)

    if self.soft_masked:
        self.tags_dict = {'O': 0, 'B-Err': 1}

    if self.gcn:
        # Pull the word-embedding table out of the checkpoint; the first
        # variable whose name contains "word_embeddings" is used.
        reader = pywrap_tensorflow.NewCheckpointReader(self.init_checkpoint)
        shapes_by_name = reader.get_variable_to_shape_map()
        for key in shapes_by_name:
            if "word_embeddings" in key:
                emb_table = reader.get_tensor(key)
                break

        with open("filter_dict.json", 'r') as load_f:
            dict_filter = json.load(load_f)

        # Ids missing from the filter map to ``len(dict_filter)``.
        zero_id = len(dict_filter)
        print("zero_id:", zero_id)

        self.emb_table_filted = []
        # 21128 x 768 are hard-coded table dimensions
        # (presumably vocabulary size x hidden size -- TODO confirm).
        self.w_index = [zero_id] * 21128
        self.emb_mask = np.ones([21128, 768])
        for x in dict_filter:
            vocab_id = int(x)
            self.w_index[vocab_id] = dict_filter[x]
            self.emb_table_filted.append(emb_table[vocab_id])
            # Rows present in the filter are zeroed in the mask.
            self.emb_mask[vocab_id] = np.zeros([768])
        self.emb_table_filted = np.array(self.emb_table_filted)

        adjacency = np.load('spellgcn_adj_norm.npz')
        p_matrix = adjacency['A_p'].astype(np.float32)
        s_matrix = adjacency['A_s'].astype(np.float32)
        self.p_A = tf.constant(p_matrix)
        self.s_A = tf.constant(s_matrix)
def dump_node_feat(args):
    """Convert every term in ``id2str.npy`` to ids and save ``term_ids.npy``.

    Args:
        args: Provides ``outpath`` (directory with ``id2str.npy``; also the
            output directory), ``vocab_file`` and ``max_seqlen``.
    """
    log.info("Dump node feat starting...")
    id2str = np.load(os.path.join(args.outpath, "id2str.npy"), mmap_mode="r")
    pool = multiprocessing.Pool()
    try:
        tokenizer = FullTokenizer(args.vocab_file)
        term_ids = pool.map(
            partial(term2id, tokenizer=tokenizer, max_seqlen=args.max_seqlen),
            id2str)
        np.save(os.path.join(args.outpath, "term_ids.npy"), np.array(term_ids))
        log.info("Dump node feat done.")
    finally:
        # Always stop the workers, including when map/save raises.
        pool.terminate()
def __init__(self, bert_meta):
    """Load the frozen graph, look up its tensors and run a warm-up query.

    Args:
        bert_meta: Provides ``model_file`` (frozen graph) and ``vocab_file``.
    """
    self.graph = self._load_graph(bert_meta.model_file)
    self.tokenizer = FullTokenizer(vocab_file=bert_meta.vocab_file,
                                   do_lower_case=True)
    self.max_seq_length = 128

    tensor = self.graph.get_tensor_by_name
    # Input.
    self.input_ids = tensor('infer/input_ids:0')
    # NOTE: despite its name, this attribute holds the input-mask tensor.
    self.word_ids = tensor('infer/input_mask:0')
    self.segment_ids = tensor('infer/segment_ids:0')
    # Output.
    self.predictions = tensor('infer/loss/Softmax:0')

    self.sess = tf.Session(graph=self.graph)
    # One inference call at construction time; the query text means
    # "warm up a bit".
    self.inference(BertInputPackage(u'预热一下'))
class BERTTextEncoder(TextEncoder):
    """Text encoder backed by a BERT ``FullTokenizer``."""

    def __init__(self, vocab_file: str, do_lower_case: bool = True) -> None:
        """Create the tokenizer and remember the [UNK] and [MASK] ids.

        Args:
            vocab_file: Vocabulary file for the tokenizer.
            do_lower_case: Forwarded to the tokenizer.
        """
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case)
        super().__init__(len(self.tokenizer.vocab))
        vocab = self.tokenizer.vocab
        self.bert_unk_id = vocab['[UNK]']
        self.bert_msk_id = vocab['[MASK]']

    def standardize_ids(self, ids: List[int]) -> List[int]:
        """Remap BERT ids in place and return the same list.

        The [UNK] id becomes 0; every other id is shifted down by the
        [MASK] id.
        """
        for position, token_id in enumerate(ids):
            if token_id == self.bert_unk_id:  # UNK
                ids[position] = 0
            else:  # VOCAB
                ids[position] = token_id - self.bert_msk_id
        return ids

    def encode(self, sent: str) -> List[int]:
        """Tokenize ``sent`` and return its standardized ids."""
        tokens = self.tokenizer.tokenize(sent)
        raw_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        return self.standardize_ids(raw_ids)
def __init__(self, param):
    """Build the estimator described by ``param``.

    Args:
        param: Mapping with ``model_path``, ``bert_config_file``,
            ``vocab_file``, ``use_mul_gpu``, ``max_seq_length``,
            ``max_target_seq_length``, ``batch_size`` and ``mode_type``.

    Raises:
        KeyError: If ``CUDA_VISIBLE_DEVICES`` is not set in the environment
            or a required ``param`` key is missing.
    """
    self.model_path = os.path.abspath(param["model_path"])
    self.bert_config_file = os.path.abspath(param["bert_config_file"])
    bert_config = modeling.BertConfig.from_json_file(self.bert_config_file)

    self.fulltoken = FullTokenizer(os.path.abspath(param["vocab_file"]))
    self.vocab_dict = self.fulltoken.vocab
    # [CLS] / [SEP] ids are handed to the model as target start / end ids.
    target_start_ids = self.vocab_dict["[CLS]"]
    target_end_ids = self.vocab_dict["[SEP]"]

    # One GPU per comma-separated entry of CUDA_VISIBLE_DEVICES.
    visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
    num_gpus = len(visible_devices.split(','))
    tf.logging.info("num_gpus is {}".format(num_gpus))

    distribute = (
        tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)
        if param["use_mul_gpu"] else None
    )

    run_config = tf.estimator.RunConfig(
        model_dir=os.path.abspath(self.model_path),
        save_summary_steps=200,
        keep_checkpoint_max=2,
        save_checkpoints_steps=3000,
        train_distribute=distribute,
        eval_distribute=distribute)

    self.input_max_seq_length = param["max_seq_length"]
    model_fn = model_fn_builder(
        bert_config,
        init_checkpoint=None,
        learning_rate=0.0001,
        num_train_steps=10000,
        num_warmup_steps=100,
        use_one_hot_embeddings=False,  # when use tpu ,it's True
        input_seq_length=param["max_seq_length"],
        target_seq_length=param["max_target_seq_length"],
        target_start_ids=target_start_ids,
        target_end_ids=target_end_ids,
        batch_size=param["batch_size"],
        mode_type=param["mode_type"])

    self.estimator = tf.estimator.Estimator(model_fn=model_fn,
                                            config=run_config)
def parse_line(self, line, max_seq_len=512):
    """Parse one line into token_ids, sentence_ids, pos_ids and mask labels.

    The line must contain exactly three comma-separated fields: left text,
    right text and the text to be masked. The masked text is replaced by one
    ``[MASK]`` token per word piece and placed between left and right text.

    Args:
        line: Raw input line.
        max_seq_len: Maximum number of tokens allowed.

    Returns:
        ``[token_ids, sent_ids, pos_ids, input_mask, mask_pos, mask_label]``,
        or ``None`` when the sample is longer than ``max_seq_len``.

    Raises:
        AssertionError: If the line does not have exactly three fields.
    """
    line = line.strip().split(",")
    assert len(line) == 3, \
        "One sample must have %d fields!" % 3
    text_left, text_right, masklabel = line

    # Building a tokenizer for every line is expensive (the BERT
    # FullTokenizer loads its vocabulary in the constructor), so keep one per
    # instance and rebuild it only if ``vocab_path`` changes.
    cached = getattr(self, "_parse_line_tokenizer", None)
    if cached is None or cached[0] != self.vocab_path:
        cached = (self.vocab_path, FullTokenizer(self.vocab_path))
        self._parse_line_tokenizer = cached
    tokenizer = cached[1]

    text_left = tokenizer.tokenize(text_left)
    masklabel = tokenizer.tokenize(masklabel)
    masklabel_ = len(masklabel) * ["[MASK]"]
    text_right = tokenizer.tokenize(text_right)

    all_tokens = text_left + masklabel_ + text_right
    token_ids = tokenizer.convert_tokens_to_ids(all_tokens)
    sent_ids = [0] * len(all_tokens)
    pos_ids = list(range(len(all_tokens)))
    input_mask = [1.0] * len(all_tokens)

    # Positions whose id equals the mask id.
    mask_pos = [idx for idx, token_id in enumerate(token_ids)
                if token_id == self.mask_id]

    # Ids of the original (pre-masking) word pieces.
    mask_label = list(tokenizer.convert_tokens_to_ids(masklabel))

    assert len(token_ids) == len(sent_ids) == len(pos_ids) == len(
        input_mask
    ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids) == len(seg_labels)"

    if len(token_ids) > max_seq_len:
        return None
    return [token_ids, sent_ids, pos_ids, input_mask, mask_pos, mask_label]
def dump_node_feat(args):
    """Convert every term in ``terms.txt`` to ids and save ``term_ids.npy``.

    Each line of ``terms.txt`` is tab-separated; the second column is the
    term text.

    Args:
        args: Provides ``outpath`` (directory with ``terms.txt``; also the
            output directory), ``encoding``, ``vocab_file`` and ``max_seqlen``.
    """
    log.info("Dump node feat starting...")
    terms_path = os.path.join(args.outpath, "terms.txt")
    # Read inside a context manager so the handle is closed; it used to be
    # left open.
    with io.open(terms_path, encoding=args.encoding) as terms_file:
        id2str = [line.strip("\n").split("\t")[1] for line in terms_file]

    pool = multiprocessing.Pool()
    try:
        tokenizer = FullTokenizer(args.vocab_file)
        term_ids = pool.map(
            partial(term2id, tokenizer=tokenizer, max_seqlen=args.max_seqlen),
            id2str)
        np.save(os.path.join(args.outpath, "term_ids.npy"),
                np.array(term_ids, np.uint16))
        log.info("Dump node feat done.")
    finally:
        # Always stop the workers, including when map/save raises.
        pool.terminate()
def setUp(self):
    """Create a tokenizer from a throw-away vocabulary file and the token
    fixture shared by the tests."""
    with NamedTemporaryFile(mode='w') as vocab_tmp:
        vocab_tmp.write("a\n[CLS]\nb\n[SEP]c\nd\ne\nf\ng\nh\n")
        # Rewind before the tokenizer opens the file by name.
        vocab_tmp.seek(0)
        tokenizer = FullTokenizer(vocab_file=vocab_tmp.name)
        self.vocab_words = list(tokenizer.vocab.keys())

    # Characters a..h, each paired with its first-of-word flag.
    first_flags = (True, False, False, True, False, True, False, True)
    self.tokens = [
        CharToken(char, flag) for char, flag in zip('abcdefgh', first_flags)
    ]
def load_model(self, model_dir: str, model_config: str = "model_config.json"):
    """Load the NER model, its tokenizer and its configuration.

    Args:
        model_dir: Directory containing the model config, ``bert_config.json``,
            ``model.h5`` and ``vocab.txt``.
        model_config: File name of the model configuration inside ``model_dir``.

    Returns:
        Tuple ``(model, tokenizer, model_config)`` where ``model_config`` is
        the parsed configuration dictionary.
    """
    # Both JSON files are read through context managers so the handles are
    # closed; ``json.load(open(...))`` left them open.
    with open(os.path.join(model_dir, model_config)) as config_file:
        model_config = json.load(config_file)
    with open(os.path.join(model_dir, "bert_config.json")) as bert_config_file:
        bert_config = json.load(bert_config_file)

    model = BertNer(bert_config, tf.float32, model_config['num_labels'],
                    model_config['max_seq_length'])
    # Call the model once on dummy inputs before loading the weights
    # (presumably so its variables exist -- TODO confirm).
    ids = tf.ones((1, 128), dtype=tf.int64)
    _ = model(ids, ids, ids, ids, training=False)
    model.load_weights(os.path.join(model_dir, "model.h5"))

    vocab = os.path.join(model_dir, "vocab.txt")
    tokenizer = FullTokenizer(vocab_file=vocab,
                              do_lower_case=model_config["do_lower"])
    return model, tokenizer, model_config
def gen_data(in_file, out_file, tagType):
    """Preprocess every line of ``in_file`` and write the rows as TSV.

    Args:
        in_file: UTF-8 text file, one sample per line.
        out_file: Destination of the tab-separated output.
        tagType: Passed to ``preprocess2dict`` and echoed in the final message.
    """
    with open(in_file, 'r', encoding='utf8') as f:
        raw_data = [line.strip() for line in f]

    vocab_file = '../models/vocab.txt'
    full_tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    basic_tokenizer = BasicTokenizer(do_lower_case=True)

    data_all = []
    for sample in tqdm(raw_data):
        data_all.append(
            preprocess2dict(sample, tagType, full_tokenizer, basic_tokenizer))

    # separate with \t
    pd.DataFrame(data_all).to_csv(out_file, sep='\t', encoding='utf-8',
                                  index=False)
    message = 'Finish writing generated ' + tagType + ' data in ' + out_file
    print(message)
def gen(self):
    """Yield model input dicts built from ``self.text`` until closed.

    Yields:
        Dict with ``input_ids``, ``input_mask`` and ``input_type_ids`` lists,
        one entry per converted feature.
    """
    from extract_features import convert_lst_to_features
    from tokenization import FullTokenizer

    vocab_path = os.path.join(self.bert_model_dir, 'vocab.txt')
    tokenizer = FullTokenizer(vocab_file=vocab_path)
    # Windows does not support logger in MP environment, thus get a new logger
    # This while loop keeps the generator alive so that estimator.predict
    # does not have to reload.
    while not self.closed:
        # The input counts as pre-tokenized only if every element is a list.
        is_tokenized = all(isinstance(el, list) for el in self.text)
        features = list(
            convert_lst_to_features(self.text,
                                    self.seq_length,
                                    tokenizer,
                                    is_tokenized,
                                    mask_cls_sep=True))
        input_ids = [feature.input_ids for feature in features]
        input_mask = [feature.input_mask for feature in features]
        input_type_ids = [feature.input_type_ids for feature in features]
        yield {
            'input_ids': input_ids,
            'input_mask': input_mask,
            'input_type_ids': input_type_ids
        }
clf_output = sequence_output[:, 0, :] out = keras.layers.LSTM(128) out = keras.layers.Dense(1, activation='sigmoid')(clf_output) model = keras.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out) model.compile(keras.optimizers.Adam(lr=0.00001), loss='binary_crossentropy', metrics=['accuracy']) return model vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy() do_lower_case = bert_layer.resolved_object.do_lower_case.numpy() tokenizer = FullTokenizer(vocab_file, do_lower_case) train_input = bert_encode(train.text1.values, tokenizer, maxlen) test_input = bert_encode(test.text1.values, tokenizer, maxlen) model = build_model(bert_layer, maxlen) model.summary() # model = keras.Sequential([ # bert_layer([input_word_ids, input_mask, segment_ids]), # # keras.layers.Dropout(0.3), # keras.layers.LSTM(128), # # keras.layers.Dropout(0.3), # keras.layers.Dense(64), # keras.layers.Dense(1, activation = 'sigmoid') # ]
help="Maximum number of contexts to output for an example.") parser.add_argument( "--max_position", type=int, default=50, help="Maximum context position for which to generate special tokens.") parser.add_argument( "--skip_nested_contexts", type=bool, default=True, help= "Completely ignore context that are not top level nodes in the page.") args = parser.parse_args() tokenizer = FullTokenizer( 'check_points/bert-large-wwm-finetuned-squad/vocab.txt', do_lower_case=True) # train preprocess import ipdb output_file = os.path.join( args.output_dir, 'train_data_maxlen{}.bin'.format(args.max_seq_length)) ipdb.set_trace() examples = read_nq_examples(input_file=args.train_file, is_training=True, args=args) num_spans_to_ids, features = convert_examples_to_features( examples=examples, tokenizer=tokenizer, is_training=True, args=args) torch.save((features, examples), output_file) for spans, ids in num_spans_to_ids.items():
def onSetup(self):
    """Load the saved BERT model, its serving signature and its tokenizer
    from the ``Scripts/BertModel`` folder of the content directory."""
    model_dir = os.path.join(ue.get_content_dir(), 'Scripts', 'BertModel')
    vocab_path = os.path.join(model_dir, "assets", "vocab.txt")

    self.imported = tf.saved_model.load(model_dir)
    self.f = self.imported.signatures["serving_default"]
    self.tokenizer = FullTokenizer(vocab_path)
for x, y in imdb["train"].batch(128): imdb_reviews_train.extend(x.numpy()) y_train.extend(y.numpy()) for x, y in imdb["test"].batch(128): imdb_reviews_test.extend(x.numpy()) y_test.extend(y.numpy()) y_train = np.array(y_train) y_test = np.array(y_test) # Extract pre-trained BERT as a Keras layer. bert_model_path = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1" bert_layer = hub.KerasLayer(bert_model_path, trainable=False) # Build tokenizer from pre-trained BERT vocabulary. bert_tokenizer = FullTokenizer( vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy(), do_lower_case=bert_layer.resolved_object.do_lower_case.numpy()) # TODO: # Document longer than 512 words wont be able to be encoded by BERT, # since its positional encoding has a hard limit for 512 words. # For better results we may need to summarize the document into <= 512 tokens, # or encode sentence by sentence then pool together. maxlen = 256 # TODO: # We need to manually handle CLS and SEP special token for sentence beginning and ending. # Encode text with padding, masking, and segmentation (required by BERT even if we don't use it). tok_seq_train = [bert_tokenizer.tokenize(text) for text in imdb_reviews_train] wid_seq_train = [
class BertInference(object):
    """
    The bert model.
    """

    def __init__(self, bert_meta):
        """Load the frozen graph, look up its tensors and run a warm-up query.

        Args:
            bert_meta: Provides ``model_file`` and ``vocab_file``.
        """
        self.graph = self._load_graph(bert_meta.model_file)
        self.tokenizer = FullTokenizer(vocab_file=bert_meta.vocab_file,
                                       do_lower_case=True)
        self.max_seq_length = 128

        tensor = self.graph.get_tensor_by_name
        # Input.
        self.input_ids = tensor('infer/input_ids:0')
        # NOTE: despite its name, this attribute holds the input-mask tensor.
        self.word_ids = tensor('infer/input_mask:0')
        self.segment_ids = tensor('infer/segment_ids:0')
        # Output.
        self.predictions = tensor('infer/loss/Softmax:0')

        self.sess = tf.Session(graph=self.graph)
        # One inference call at construction time; the query text means
        # "warm up a bit".
        self.inference(BertInputPackage(u'预热一下'))

    def inference(self, bert_input):
        """
        Call model.
        """
        input_ids, input_mask, segment_ids = self._convert_single_example(
            bert_input.query)
        # Each input is wrapped in a list, i.e. a batch of one.
        feed = {
            self.input_ids: [input_ids],
            self.word_ids: [input_mask],
            self.segment_ids: [segment_ids]
        }
        return self.sess.run(self.predictions, feed_dict=feed)

    def _load_graph(self, frozen_graph_filename):
        """Parse a serialized ``GraphDef`` and import it under ``infer``."""
        graph_def = tf.GraphDef()
        with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
            graph_def.ParseFromString(f.read())

        with tf.Graph().as_default() as graph:
            tf.import_graph_def(graph_def,
                                input_map=None,
                                return_elements=None,
                                name="infer",
                                op_dict=None,
                                producer_op_list=None)
        return graph

    def _convert_single_example(self, text_a):
        """Encode one text as padded ``(input_ids, input_mask, segment_ids)``.

        The text is truncated so that, with ``[CLS]`` and ``[SEP]`` added, it
        fits in ``self.max_seq_length``; shorter inputs are zero-padded.
        """
        tokens_a = self.tokenizer.tokenize(text_a)
        room = self.max_seq_length - 2
        if len(tokens_a) > room:
            tokens_a = tokens_a[0:room]

        tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"]
        # Single-sentence input: every position belongs to segment 0.
        segment_ids = [0] * len(tokens)

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        # A non-positive difference gives an empty list, i.e. no padding.
        padding = [0] * (self.max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding
        return input_ids, input_mask, segment_ids
def main(_):
    """Score every available validation slice and write one output file each.

    For slice numbers 0..139 the input file is read line by line, converted
    to features, scored in batches of 60 and written as
    ``score<TAB>first three columns``. Slices whose input is missing or whose
    output already exists are skipped.

    Args:
        _: Unused positional argument supplied by the app runner.
    """
    import time

    os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.gpu_id)
    current_path = os.path.dirname(os.path.abspath(__file__))
    tokenizer = FullTokenizer(
        os.path.join(current_path, './model/chinese_L-12_H-768_A-12/vocab.txt'))

    Configuration = namedtuple(
        'Configuration',
        ['fp16', 'bert_config', 'checkpoint_path', 'graph_tmp_dir', 'max_seq_length'])
    fp16 = False
    bert_config = './model/chinese_L-12_H-768_A-12/bert_config.json'
    # Earlier checkpoints, kept for reference:
    #   './model/ad/model_0622/model.ckpt-610194'
    #   './model/ad/model_0626/model.ckpt-610194'
    checkpoint_path = './model/ad/model_pretrain_ctr_0826/model.ckpt-16352'
    graph_tmp_dir = './model/ad/tmp/'
    max_seq_length = 70
    configuration = Configuration(fp16, bert_config, checkpoint_path,
                                  graph_tmp_dir, max_seq_length)

    graph_path, bert_config = optimize_graph(configuration)
    worker = BertWorker(0, graph_path, configuration)
    batch_size = 60
    data_dir = '/data1/zhangpengpeng/ad_data/eval_ins2_20190701/'

    # NOTE(review): the timer starts once, so the per-slice "qps" below is
    # divided by the time elapsed since the first slice -- confirm intent.
    start = time.time()
    for no in range(140):
        # Four-digit, zero-padded slice number.
        slice_id = str(10000 + no)[1:]
        slice_path = '/data1/zhangpengpeng/ad_data/eval_ins2_20190701/lm_validset_%s' % slice_id
        slice_output_path = os.path.join(
            data_dir, 'lm_validset_mask_output_%s' % slice_id)

        if not tf.gfile.Exists(slice_path):
            continue
        if tf.gfile.Exists(slice_output_path):
            continue
        print(slice_path, slice_output_path)

        count = 0
        # Fix: open the output only after the skip checks and close it via
        # the context manager; it used to be opened first and leaked on
        # every skipped slice.
        with tf.gfile.Open(slice_path, 'r') as f, \
                tf.gfile.Open(slice_output_path, 'w') as slice_output_file:
            input_ids_list = []
            input_mask_list = []
            segment_ids_list = []
            rows = []

            def flush_batch():
                """Score the buffered rows and append them to the output."""
                features = (input_ids_list, input_mask_list, segment_ids_list)
                _, scores = worker.predict(features)
                for i in range(len(input_ids_list)):
                    slice_output_file.write(
                        '%f\t%s\n' % (scores[i][1], '\t'.join(rows[i][:3])))

            for line in f:
                row = line.split('\t', 4)
                if len(row) != 5:
                    continue
                text_a = row[3]
                text_b = row[-1].strip()
                text_c = row[1]
                start_position = text_b.find(text_c)
                feature = get_example(tokenizer, text_a, text_b, text_c,
                                      start_position, max_seq_length)
                input_ids_list.append(feature[0])
                input_mask_list.append(feature[1])
                segment_ids_list.append(feature[2])
                rows.append(row)
                if len(input_ids_list) == batch_size:
                    flush_batch()
                    input_ids_list = []
                    input_mask_list = []
                    segment_ids_list = []
                    rows = []
                count += 1

            # Score whatever is left over after the last full batch.
            if len(rows) > 0:
                flush_batch()

        end = time.time()
        print("filename: %s\tqps: %d" % (slice_path, count / (end - start)))
def convert_single_example(example, max_seq_length=256, tokenizer=None):
    """Converts a single `InputExample` into a single `InputFeatures`.

    Args:
        example: Provides ``text_a``, optional ``text_b`` and ``label``.
        max_seq_length: Length of every returned id list.
        tokenizer: Provides ``tokenize`` and ``convert_tokens_to_ids``. When
            omitted, a ``FullTokenizer()`` is created on first use and reused
            by later calls.

    Returns:
        An ``InputFeatures`` with zero-padded ``input_ids``, ``input_mask``
        and ``segment_ids``; ``label_id`` is 0 when the example has no label.
    """
    # Fix: the default used to be ``FullTokenizer()`` in the signature, which
    # is evaluated once at definition time. Build it lazily and cache it on
    # the function so calls relying on the default still share one instance.
    if tokenizer is None:
        tokenizer = getattr(convert_single_example, "_default_tokenizer", None)
        if tokenizer is None:
            tokenizer = FullTokenizer()
            convert_single_example._default_tokenizer = tokenizer

    label_map = {label: i for i, label in enumerate(label_list)}

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)

    if tokens_b:
        tokens += tokens_b + ["[SEP]"]
        segment_ids += [1] * (len(tokens_b) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    if len(input_ids) < max_seq_length:
        input_ids += [0] * (max_seq_length - len(input_ids))
        input_mask += [0] * (max_seq_length - len(input_mask))
        segment_ids += [0] * (max_seq_length - len(segment_ids))

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label] if example.label else 0

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            is_real_example=True)
    return feature