def report_results(strings, predicts, goldens, id_to_char, id_to_tag, output_path, verbose=False): results = [] for i in range(len(strings)): result = [] string = [x for x in strings[i]] pred = iobes_iob([id_to_tag[int(x)] for x in predicts[i]]) gold = iobes_iob([id_to_tag[int(x)] for x in goldens[i]]) for char, gold_tag, pred_tag in zip(string, gold, pred): result.append(" ".join([char, gold_tag, pred_tag])) results.append(result) with codecs.open(output_path, 'w', 'utf-8') as f: for sentence in results: for line in sentence: f.write(line + '\n') f.write('\n') eval_lines = return_report(output_path) if verbose: for line in eval_lines[1:]: print(line.strip()) f1 = float(eval_lines[1].strip().split()[-1]) return f1
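For reference, the `results` argument consumed by the writer variants below is a list of sentence blocks, where each block is a list of "char gold_tag pred_tag" strings. A minimal sketch of the shared on-disk format, with toy characters and hypothetical tags (not taken from any of the original snippets):

# Toy illustration of the shared prediction-file layout: one "char gold pred"
# line per character, with a blank line between sentences.
results = [
    ["我 O O", "爱 O O", "北 B-LOC B-LOC", "京 I-LOC I-LOC"],
    ["你 O O", "好 O O"],
]
with open("ner_predict.utf8", "w", encoding="utf8") as f:
    for block in results:
        for line in block:
            f.write(line + "\n")
        f.write("\n")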
def result_write_evaluate(results, path, name, size_train_data): """ For the dev set, write the predictions back alongside the original data, output the file, and then compute recognition performance; for the test set, only write the predictions back and output the file. :param results: :param path: :param name: :param size_train_data: :return: """ if name == "dev": output_file = os.path.join(path, str(size_train_data) + "_predict_dev.utf8") with open(output_file, "w", encoding="utf8") as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines elif name == "test": output_file = os.path.join(path, "ner_predict_test.utf8") with open(output_file, "w", encoding="utf8") as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write)
def test_ner(results, path, filename): output_file = os.path.join(path, filename) with open(output_file, "w", encoding='utf-8') as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines
def evaluate_results(results, result_path): output_file = os.path.join(result_path, "ner_predict.utf8") with open(output_file, "w", encoding="utf8") as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines
def test_ner(results, path): output_file = os.path.join(path, "ner_predict.utf8") with open(output_file, "w", encoding='utf8') as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) # return the evaluation report eval_lines = return_report(output_file) return eval_lines
def evaluate_ner(results, conf): with open(conf.result_file, "w", encoding='utf-8') as f: to_write = [] for block in results: print(block) for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines = return_report(conf.result_file) for line in eval_lines: print(line) f1 = float(eval_lines[1].strip().split()[-1]) return f1
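For context, `return_report` in these snippets wraps the CoNLL-2000 `conlleval` evaluation; its second output line carries the overall metrics, and the chunk F1 (FB1) is the last whitespace-separated token on that line, which is what `float(eval_lines[1].strip().split()[-1])` extracts above. A hedged sketch with invented numbers (not real evaluator output):

# Illustrative conlleval-style report lines (values made up) showing why the
# last field of the second line is parsed as the overall F1 (FB1).
eval_lines = [
    "processed 10000 tokens with 1200 phrases; found: 1180 phrases; correct: 1000.",
    "accuracy:  97.84%; precision:  84.75%; recall:  83.33%; FB1:  84.03",
]
f1 = float(eval_lines[1].strip().split()[-1])
print(f1)  # 84.03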
def report_ner(self, results, output_file): """ Run perl script to evaluate model """ with open(output_file, "w", encoding='utf8') as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines
def test_ner(results, path): """ Run perl script to evaluate model """ output_file = os.path.join(path, "ner_predict.utf8") with open(output_file, "w", encoding="utf-8") as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines
def test_srl(results, path): """ Evaluate using the perl script """ output_file = os.path.join(path, "srl_predict.utf8") with open(output_file, "w", encoding='utf-8') as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines
def test_skill(results, path): """ Run perl script to evaluate model """ output_file = os.path.join(path, "skill_predict.utf8") with codecs.open(output_file, "w", encoding="utf-8") as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines, results_skill = return_report(output_file) return eval_lines, results_skill
def test_ner(results, path): """ Report the performance. """ output_file = os.path.join(path, 'Brands_ner_predict.utf8') with open(output_file, 'w', encoding='utf-8') as f: to_write = [] for block in results: for line in block: to_write.append(line + '\n') to_write.append('\n') f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines
def test_ner(results, path): """ Run perl script to evaluate model """ output_file = path + "_predict.utf8" with open(output_file, "w", encoding='utf8') as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines
def test_ner(results, path): """ Run perl script to evaluate model """ output_file = os.path.join(path, "ner_predict.utf8") with open(output_file, "w") as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines
def test_ner(results, path): """ Evaluate the model on the validation set """ output_file = os.path.join(path, "ner_predict.utf8") with open(output_file, "w") as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines
def test_ner(results, path, epoch, name): """ Run perl script to evaluate model """ output_file = os.path.join(path, "result_" + str(epoch) + "_" + name + ".txt") with codecs.open(output_file, "w", 'utf8') as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines
def test_ner(results, path): """ param results dimension: (number of eval/test samples) * sentence length; each element: "char gold_label pred_label" """ output_file = os.path.join(path, "ner_predict.utf8") with open(output_file, "w") as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) # to_write dimension: (total number of characters in the samples) * 1; each item is a str: "char gold_label pred_label" eval_lines = return_report(output_file) return eval_lines
def test_ner(results, path): """ Evaluate the model with the CoNLL-2000 entity recognition evaluation script; the script requires the predictions to be saved to a file and then read back. """ output_file = os.path.join(path, "ner_predict.utf8") with open(output_file, "w", encoding='utf8') as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines
def test_ner(results, path): """ Run perl script to evaluate model """ output_file = os.path.join(path, "ner_predict.utf8") # write the validation results to a file with open(output_file, "w", encoding='utf8') as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines = return_report(output_file) # compute metrics such as the model's precision and F1 score return eval_lines
def test_ner(results, path): """ :param results: :param path: :return: """ output_file = os.path.join(path, 'ner_predict.utf8') with codecs.open(output_file, "w", encoding="utf-8") as f_write: to_write = [] for line in results: for iner_line in line: to_write.append(iner_line + "\n") to_write.append("\n") f_write.writelines(to_write) eval_lines = return_report(output_file) return eval_lines
def test_ner(results, path): """ :param results: :param path: :return: """ output_file = os.path.join(path, 'ner_predict.utf8') with open(output_file, 'w', encoding='UTF-8') as f: to_write = [] for line in results: for iner_line in line: to_write.append(iner_line + '\n') to_write.append('\n') f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines
def test_ner(results, path): """ Run perl script to evaluate model """ output_file = os.path.join(path, "ner_predict.utf8") with open(output_file, "w", encoding='utf8') as f: for block in results: for line in block: f.write(line) f.write('\n') f.write('\n') eval_lines = return_report(output_file) return eval_lines
def test_ner(results, path): """ results: list of sentence blocks, each a list of "char true_label pred_label" strings path: directory where the prediction file is saved """ output_file = os.path.join(path, "ner_predict.utf-8") with open(output_file, "w") as f: to_write = [] for block in results: for line in block: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines
def test_ner(results, path): """ :param results: :param path: :return: """ output_file = os.path.join(path, 'ner_predict.utf8') with open(output_file, 'w', encoding='UTF-8') as f: to_write = [] for line in results: for iner_line in line: to_write.append(iner_line + '\n') to_write.append('\n') f.writelines(to_write) eval_lines = return_report(output_file) # output_file = os.path.join(path, 'ner_predict.utf8') # with open(output_file, 'w', encoding='UTF-8') as f: # to_write = [] # for line in results: # for iner_line in line: # to_write.append(iner_line + '\n') # to_write.append('\n') # f.writelines(to_write) # golden_lists = [] # predic_lists = [] # glod = [] # pred = [] # with open(output_file, 'r', encoding='UTF-8') as f: # for line in f: # if len(line) > 1: # line = line.strip().split() # glod.append(line[1]) # pred.append(line[2]) # else: # golden_lists.append(glod) # predic_lists.append(pred) # glod = [] # pred = [] # accuracy, precision, recall, f_score = get_ner_measure(golden_lists=golden_lists, predict_lists=predic_lists, label_type='BIO') # print('acc: {}, precision: {}, recall: {}, f_score: {}'.format(accuracy, precision, recall, f_score)) # return accuracy, precision, recall, f_score return eval_lines
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { "ner": NerProcessor } # if not FLAGS.do_train and not FLAGS.do_eval: # raise ValueError("At least one of `do_train` or `do_eval` must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) # 加载BERT模型参数 if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) # 在train 的时候,才删除上一轮产出的文件,在predicted 的时候不做clean if FLAGS.clean and FLAGS.do_train: if os.path.exists(FLAGS.output_dir): def del_file(path): ls = os.listdir(path) for i in ls: c_path = os.path.join(path, i) if os.path.isdir(c_path): del_file(c_path) else: os.remove(c_path) try: del_file(FLAGS.output_dir) except Exception as e: print(e) print('pleace remove the files of output dir and data.conf') exit(-1) if os.path.exists(FLAGS.data_config_path): try: os.remove(FLAGS.data_config_path) except Exception as e: print(e) print('pleace remove the files of output dir and data.conf') exit(-1) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() # print(label_list) # exit() if not os.path.exists(os.path.join(FLAGS.output_dir, 'label_list.pkl')): with open(os.path.join(FLAGS.output_dir, 'label_list.pkl'), 'wb') as fd: pickle.dump(label_list, fd) tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) # TPU参数配置 tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if os.path.exists(FLAGS.data_config_path): with codecs.open(FLAGS.data_config_path) as fd: data_config = json.load(fd) else: data_config = {} # print(data_config) #空的 # exit() if FLAGS.do_train: # 加载训练数据 if len(data_config) == 0: train_examples = processor.get_train_examples(FLAGS.data_dir) # 训练步数 num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) # 预热step步 num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) data_config['num_train_steps'] = num_train_steps # 训练步数 data_config['num_warmup_steps'] = num_warmup_steps # 热身步数 data_config['num_train_size'] = len(train_examples) # 训练样本大小 example对象 else: num_train_steps = int(data_config['num_train_steps']) num_warmup_steps = int(data_config['num_warmup_steps']) # print(data_config) #{'num_train_steps': 4890, 'num_warmup_steps': 489, 'num_train_size': 20864} # exit() # print(bert_config) # exit() # 目前为止,数据处理完毕 # 返回的model_dn 是一个函数,其定义了模型,训练,评测方法,并且使用钩子参数,加载了BERT模型的参数进行了自己模型的参数初始化过程 # tf 新的架构方法,通过定义model_fn 函数,定义模型,然后通过EstimatorAPI进行模型的其他工作,Es就可以控制模型的训练,预测,评估工作等。 model_fn = model_fn_builder( bert_config=bert_config, num_labels=len(label_list) + 1, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, 
num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # 如果用哑编码则tpu比较快,其他GPU比较快 estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: # 1. 将数据转化为tf_record 数据 if data_config.get('train.tf_record_path', '') == '': train_file = os.path.join(FLAGS.output_dir, "train.tf_record") filed_based_convert_examples_to_features( train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) # 将数据写到tf里 else: train_file = data_config.get('train.tf_record_path') num_train_size = num_train_size = int(data_config['num_train_size']) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", num_train_size) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) # 2.读取record 数据,组成batch train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: if data_config.get('eval.tf_record_path', '') == '': eval_examples = processor.get_dev_examples(FLAGS.data_dir) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") filed_based_convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) data_config['eval.tf_record_path'] = eval_file data_config['num_eval_size'] = len(eval_examples) else: eval_file = data_config['eval.tf_record_path'] # 打印验证集数据信息 num_eval_size = data_config.get('num_eval_size', 0) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d", num_eval_size) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_steps = None if FLAGS.use_tpu: eval_steps = int(num_eval_size / FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with codecs.open(output_eval_file, "w", encoding='utf-8') as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # 保存数据的配置文件,避免在以后的训练过程中多次读取训练以及测试数据集,消耗时间 if not os.path.exists(FLAGS.data_config_path): with codecs.open(FLAGS.data_config_path, 'a', encoding='utf-8') as fd: json.dump(data_config, fd) if FLAGS.do_predict: token_path = os.path.join(FLAGS.output_dir, "test_token.txt") if os.path.exists(token_path): os.remove(token_path) with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf: label2id = pickle.load(rf) id2label = {value: key for key, value in label2id.items()} predict_examples = processor.get_test_examples(FLAGS.data_dir) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") filed_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file, mode="test") tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d", len(predict_examples)) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) if FLAGS.use_tpu: # 
Warning: According to tpu_estimator.py Prediction on TPU is an # experimental feature and hence not supported here raise ValueError("Prediction in TPU not supported") predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) # predicted_result = estimator.evaluate(input_fn=predict_input_fn) # output_eval_file = os.path.join(FLAGS.output_dir, "predicted_results.txt") # with codecs.open(output_eval_file, "w", encoding='utf-8') as writer: # tf.logging.info("***** Predict results *****") # for key in sorted(predicted_result.keys()): # tf.logging.info(" %s = %s", key, str(predicted_result[key])) # writer.write("%s = %s\n" % (key, str(predicted_result[key]))) result = estimator.predict(input_fn=predict_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, "test_label.txt") def result_to_pair(writer): for predict_line, prediction in zip(predict_examples, result): idx = 0 line = '' line_token = str(predict_line.text).split(' ') label_token = str(predict_line.label).split(' ') len_seq = len(label_token) if len(line_token) != len(label_token): tf.logging.info(predict_line.text) tf.logging.info(predict_line.label) for id in prediction: if idx > len_seq: break if id == 0: continue curr_labels = id2label[id] if curr_labels in ['[CLS]', '[SEP]']: if curr_labels == '[SEP]': break continue # 不知道为什么,这里会出现idx out of range 的错误。。。do not know why here cache list out of range exception! try: line += line_token[idx] + ' ' + label_token[idx] + ' ' + curr_labels + '\n' except Exception as e: tf.logging.info(e) tf.logging.info(predict_line.text) tf.logging.info(predict_line.label) line = '' break idx += 1 writer.write(line + '\n') # 将模型预测的结果和原始标签写入到文件中,以空格分开,使用conevel.py脚本来预测entity level 的结果并且输出 with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer: result_to_pair(writer) from conlleval import return_report eval_result = return_report(output_predict_file) print(''.join(eval_result)) with codecs.open(os.path.join(FLAGS.output_dir, 'entity_level_predicted_result.txt'), 'a', encoding='utf-8') as fd: fd.write(''.join(eval_result))
def iob_iobes(tags): "I - inside character; B - beginning character; E - ending character; S - single character; O - not part of an entity" new_tags = [] for i, tag in enumerate(tags): if tag == 'O': new_tags.append(tag) elif tag.split('-')[0] == 'B': if (i+1) != len(tags) and tags[i+1].split('-')[0] == 'I': new_tags.append(tag) else: new_tags.append(tag.replace('B-', 'S-')) elif tag.split('-')[0] == 'I': if (i+1) != len(tags) and tags[i+1].split('-')[0] == 'I': new_tags.append(tag) else: new_tags.append(tag.replace('I-', 'E-')) else: raise Exception('# >>> Invalid format !!! <<< #') return new_tags def iobes_iob(tags): new_tags = [] for i, tag in enumerate(tags): tag_prefix = tag.split('-')[0] if tag_prefix == 'B': new_tags.append(tag) elif tag_prefix == 'I': new_tags.append(tag) elif tag_prefix == 'S': new_tags.append(tag.replace('S-', 'B-')) elif tag_prefix == 'E': new_tags.append(tag.replace('E-', 'I-')) elif tag_prefix == 'O': new_tags.append(tag) else: raise Exception('Invalid format!') return new_tags def load_word2vec(emb_path, id_to_word, word_dim, weights): "Load pretrained word embeddings; note that the dimensions must match" # 1. read pretrained weights print('=> Loading pretrained embeddings from {}...'.format(emb_path)) pre_trained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(emb_path, 'r', 'utf-8')): line = line.rstrip().split() if len(line) == word_dim + 1: pre_trained[line[0]] = np.array( [float(x) for x in line[1:]]).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print('=> WARNING: %i invalid lines' % emb_invalid) # 2. weights assignment c_found = 0 n_words = len(id_to_word) for i in range(n_words): word = id_to_word[i] if word in pre_trained: weights[i] = pre_trained[word] c_found += 1 elif word.lower() in pre_trained: weights[i] = pre_trained[word.lower()] c_found += 1 elif re.sub(r'\d', '0', word.lower()) in pre_trained: weights[i] = pre_trained[re.sub(r'\d', '0', word.lower())] c_found += 1 print('=> Loaded %i pretrained embeddings.' % len(pre_trained)) print('=> %i / %i words have been initialized with pretrained embeddings.' % (c_found, n_words)) return weights def test_ner(results, path): """ Run perl script to evaluate model """ output_file = os.path.join(path, "ner_predict.utf8") with open(output_file, "w", encoding='utf8') as f: to_write = [] for res in results: for line in res: to_write.append(line + "\n") to_write.append("\n") f.writelines(to_write) eval_lines = return_report(output_file) return eval_lines def input_from_line(line, char_to_id): """ Take sentence data and return an input for the training or the evaluation function.
""" line = full_to_half(line) line = replace_html(line) inputs = list() inputs.append([line]) line.replace(" ", "$") inputs.append([[char_to_id[char] if char in char_to_id else char_to_id["<UNK>"] for char in line]]) inputs.append([get_seg_features(line)]) inputs.append([[]]) return inputs def full_to_half(s): """ Convert full-width character to half-width one """ n = [] for char in s: num = ord(char) if num == 0x3000: num = 32 elif 0xFF01 <= num <= 0xFF5E: num -= 0xfee0 char = chr(num) n.append(char) return ''.join(n) def replace_html(s): s = s.replace('"','"') s = s.replace('&','&') s = s.replace('<','<') s = s.replace('>','>') s = s.replace(' ',' ') s = s.replace("“", "") s = s.replace("”", "") s = s.replace("—","") s = s.replace("\xa0", " ") return(s) def get_seg_features(string): """ Segment text with jieba features are represented in bies format s donates single word 注意:若jieba中的词表没有永久添加目标NER词汇,需要在使用前进行手动添加。 """ seg_feature = [] for word in jieba.cut(string): if len(word) == 1: seg_feature.append(0) else: tmp = [2] * len(word) tmp[0] = 1 tmp[-1] = 3 seg_feature.extend(tmp) return seg_feature def jieba_dict_prepare(dict_path="./source_data/DICT_NOW.csv"): "根据语料资源,向jieba中添加自定义的词。在使用jieba进行分词前可使用。" import jieba, csv # 数据说明: # DICT_NOW.csv: # 所有标记对应的语言组成的dictionary # 这一部分只需要将***.txt中的entity mention和entity category进行对应输出即可 # 得到DICT_NOW.csv文件。(实际中可以在医药网站或者医学百科中爬取一些医学类entity) dics = csv.reader(open(dict_path, 'r', encoding='utf8')) # 利用jieba自定义分词,进行专有名词输入 # 将识别对象加入jieba识别词表,标记视为词性 for row in dics: if len(row) == 2: jieba.add_word(row[0].strip(), tag=row[1].strip()) # 强制加入词为一个joined整体 jieba.suggest_freq(row[0].strip()) def result_to_json(string, tags): item = {"string": string, "entities": []} entity_name = "" entity_start = 0 idx = 0 for char, tag in zip(string, tags): prefix = tag[0] if prefix == "S": item["entities"].append({"word": char, "start": idx, "end": idx+1, "type":tag[2:]}) elif prefix == "B": entity_name += char entity_start = idx elif prefix == "I": entity_name += char elif prefix == "E": entity_name += char item["entities"].append({"word": entity_name, "start": entity_start, "end": idx + 1, "type": tag[2:]}) entity_name = "" else: entity_name = "" entity_start = idx idx += 1 return item
def main(): tf.logging.set_verbosity(tf.logging.INFO) processors = { "ner": NerProcessor } # if not FLAGS.do_train and not FLAGS.do_eval: # raise ValueError("At least one of `do_train` or `do_eval` must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) # 在train 的时候,才删除上一轮产出的文件,在predicted 的时候不做clean if FLAGS.clean and FLAGS.do_train: if os.path.exists(FLAGS.output_dir): def del_file(path): ls = os.listdir(path) for i in ls: c_path = os.path.join(path, i) if os.path.isdir(c_path): del_file(c_path) else: os.remove(c_path) try: del_file(FLAGS.output_dir) except Exception as e: print(e) print('pleace remove the files of output dir and data.conf') exit(-1) if os.path.exists(FLAGS.data_config_path): try: os.remove(FLAGS.data_config_path) except Exception as e: print(e) print('pleace remove the files of output dir and data.conf') exit(-1) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if os.path.exists(FLAGS.data_config_path): with codecs.open(FLAGS.data_config_path) as fd: data_config = json.load(fd) else: data_config = {} if FLAGS.do_train: # 加载训练数据 if len(data_config) == 0: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) data_config['num_train_steps'] = num_train_steps data_config['num_warmup_steps'] = num_warmup_steps data_config['num_train_size'] = len(train_examples) else: num_train_steps = int(data_config['num_train_steps']) num_warmup_steps = int(data_config['num_warmup_steps']) # 返回的model_dn 是一个函数,其定义了模型,训练,评测方法,并且使用钩子参数,加载了BERT模型的参数进行了自己模型的参数初始化过程 # tf 新的架构方法,通过定义model_fn 函数,定义模型,然后通过EstimatorAPI进行模型的其他工作,Es就可以控制模型的训练,预测,评估工作等。 model_fn = model_fn_builder( bert_config=bert_config, num_labels=len(label_list) + 1, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: # 1. 
将数据转化为tf_record 数据 if data_config.get('train.tf_record_path', '') == '': train_file = os.path.join(FLAGS.output_dir, "train.tf_record") filed_based_convert_examples_to_features( train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) else: train_file = data_config.get('train.tf_record_path') num_train_size = num_train_size = int(data_config['num_train_size']) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", num_train_size) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) # 2.读取record 数据,组成batch train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: if data_config.get('eval.tf_record_path', '') == '': eval_examples = processor.get_dev_examples(FLAGS.data_dir) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") filed_based_convert_examples_to_features( eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) data_config['eval.tf_record_path'] = eval_file data_config['num_eval_size'] = len(eval_examples) else: eval_file = data_config['eval.tf_record_path'] # 打印验证集数据信息 num_eval_size = data_config.get('num_eval_size', 0) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d", num_eval_size) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_steps = None if FLAGS.use_tpu: eval_steps = int(num_eval_size / FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with codecs.open(output_eval_file, "w", encoding='utf-8') as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # 保存数据的配置文件,避免在以后的训练过程中多次读取训练以及测试数据集,消耗时间 if not os.path.exists(FLAGS.data_config_path): with codecs.open(FLAGS.data_config_path, 'a', encoding='utf-8') as fd: json.dump(data_config, fd) if FLAGS.do_predict: token_path = os.path.join(FLAGS.output_dir, "token_test.txt") if os.path.exists(token_path): os.remove(token_path) with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf: label2id = pickle.load(rf) id2label = {value: key for key, value in label2id.items()} predict_examples = processor.get_test_examples(FLAGS.data_dir) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") filed_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file, mode="test") tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d", len(predict_examples)) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) if FLAGS.use_tpu: # Warning: According to tpu_estimator.py Prediction on TPU is an # experimental feature and hence not supported here raise ValueError("Prediction in TPU not supported") predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) predicted_result = 
estimator.evaluate(input_fn=predict_input_fn) output_eval_file = os.path.join(FLAGS.output_dir, "predicted_results.txt") with codecs.open(output_eval_file, "w", encoding='utf-8') as writer: tf.logging.info("***** Predict results *****") for key in sorted(predicted_result.keys()): tf.logging.info(" %s = %s", key, str(predicted_result[key])) writer.write("%s = %s\n" % (key, str(predicted_result[key]))) result = estimator.predict(input_fn=predict_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt") def result_to_pair(writer): for predict_line, prediction in zip(predict_examples, result): idx = 0 line = '' line_token = str(predict_line.text).split(' ') label_token = str(predict_line.label).split(' ') if len(line_token) != len(label_token): tf.logging.info(predict_line.text) tf.logging.info(predict_line.label) for id in prediction: if id == 0: continue curr_labels = id2label[id] if curr_labels in ['[CLS]', '[SEP]']: continue # 不知道为什么,这里会出现idx out of range 的错误。。。do not know why here cache list out of range exception! try: line += line_token[idx] + ' ' + label_token[idx] + ' ' + curr_labels + '\n' except Exception as e: tf.logging.info(e) tf.logging.info(predict_line.text) tf.logging.info(predict_line.label) line = '' break idx += 1 writer.write(line + '\n') with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer: result_to_pair(writer) from conlleval import return_report eval_result = return_report(output_predict_file) print(eval_result)
def train(args): os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map processors = {"ner": NerProcessor} bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (args.max_seq_length, bert_config.max_position_embeddings)) # 在re train 的时候,才删除上一轮产出的文件,在predicted 的时候不做clean if args.clean and args.do_train: if os.path.exists(args.output_dir): def del_file(path): ls = os.listdir(path) for i in ls: c_path = os.path.join(path, i) if os.path.isdir(c_path): del_file(c_path) else: os.remove(c_path) try: del_file(args.output_dir) except Exception as e: print(e) print('pleace remove the files of output dir and data.conf') exit(-1) # check output dir exists if not os.path.exists(args.output_dir): os.mkdir(args.output_dir) processor = processors[args.ner](args.output_dir) logger.info(args.data_dir) # 加载字典 tokenizer = FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) session_config = tf.ConfigProto(log_device_placement=False, inter_op_parallelism_threads=0, intra_op_parallelism_threads=0, allow_soft_placement=True) run_config = tf.estimator.RunConfig( model_dir=args.output_dir, save_summary_steps=500, # 这里写死了 前面定义无用 save_checkpoints_steps=500, session_config=session_config) train_examples = None eval_examples = None num_train_steps = None num_warmup_steps = None if args.do_train and args.do_eval: # 加载训练数据 train_examples = processor.get_train_examples(args.data_dir) logger.info(len(train_examples)) num_train_steps = int( len(train_examples) * 1.0 / args.batch_size * args.num_train_epochs) if num_train_steps < 1: raise AttributeError('training data is so small...') num_warmup_steps = int(num_train_steps * args.warmup_proportion) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.batch_size) logger.info(" Num steps = %d", num_train_steps) # 加载测试数据 eval_examples = processor.get_dev_examples(args.data_dir) # 打印验证集数据信息 logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.batch_size) # labels = ["B_at", "I_at", "B_ot", "I_ot", "O"] # label_list = processor.get_labels(labels) label_list = processor.get_labels() # 返回的model_dn 是一个函数,其定义了模型,训练,评测方法,并且使用钩子参数,加载了BERT模型的参数进行了自己模型的参数初始化过程 # tf 新的架构方法,通过定义model_fn 函数,定义模型,然后通过EstimatorAPI进行模型的其他工作,Es就可以控制模型的训练,预测,评估工作等。 model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list) + 1, init_checkpoint=args.init_checkpoint, learning_rate=args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, args=args) params = {'batch_size': args.batch_size} # 不同场景的dropout设置????, 如何实现 estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config) if args.do_train and args.do_eval: # 1. 
将数据转化为tf_record 数据 train_file = os.path.join(args.output_dir, "train.tf_record") if not os.path.exists(train_file): filed_based_convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, train_file, args.output_dir) # 2.读取record 训练数据,组成batch train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=args.max_seq_length, is_training=True, drop_remainder=True) # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) # eval的record eval_file = os.path.join(args.output_dir, "eval.tf_record") if not os.path.exists(eval_file): filed_based_convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, eval_file, args.output_dir) eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=args.max_seq_length, is_training=False, drop_remainder=False) # train and eval togither # early stop hook early_stopping_hook = tf.contrib.estimator.stop_if_no_decrease_hook( estimator=estimator, metric_name='loss', # loss没有提升的时候提前结束, 为啥不合适dev loss??? max_steps_without_decrease=num_train_steps, # 这里设置了最大值????? eval_dir=None, min_steps=0, run_every_secs=None, run_every_steps=args.save_checkpoints_steps) train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=num_train_steps, hooks=[early_stopping_hook]) eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn) tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) if args.do_predict: token_path = os.path.join(args.output_dir, "token_test.txt") if os.path.exists(token_path): os.remove(token_path) with codecs.open(os.path.join(args.output_dir, 'label2id.pkl'), 'rb') as rf: label2id = pickle.load(rf) id2label = {value: key for key, value in label2id.items()} predict_examples = processor.get_test_examples(args.data_dir) predict_file = os.path.join(args.output_dir, "predict.tf_record") filed_based_convert_examples_to_features(predict_examples, label_list, args.max_seq_length, tokenizer, predict_file, args.output_dir, mode="test") logger.info("***** Running prediction*****") logger.info(" Num examples = %d", len(predict_examples)) logger.info(" Batch size = %d", args.batch_size) predict_drop_remainder = False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=args.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder) # 这里没有进行维特比解码 如何获取序列化标注的结果???? 
result = estimator.predict(input_fn=predict_input_fn) logger.info(result) output_predict_file = os.path.join(args.output_dir, "label_test.txt") def result_to_pair(writer): for predict_line, prediction in zip(predict_examples, result): idx = 0 line = '' line_token = str(predict_line.text).split(' ') label_token = str(predict_line.label).split(' ') len_seq = len(label_token) if len(line_token) != len(label_token): logger.info(predict_line.text) logger.info(predict_line.label) break for id in prediction: if idx >= len_seq: break if id == 0: continue curr_labels = id2label[id] if curr_labels in ['[CLS]', '[SEP]', 'X']: continue try: line += line_token[idx] + ' ' + label_token[ idx] + ' ' + curr_labels + '\n' except Exception as e: logger.info(e) logger.info(predict_line.text) logger.info(predict_line.label) line = '' break idx += 1 writer.write(line + '\n') with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer: result_to_pair(writer) eval_result = return_report(output_predict_file) print(''.join(eval_result)) # 写结果到文件中 with codecs.open(os.path.join(args.output_dir, 'predict_score.txt'), 'a', encoding='utf-8') as fd: fd.write(''.join(eval_result)) # filter model if args.filter_adam_var: adam_filter(args.output_dir)
def main(): ''' PrePare and check file''' # 检查checkpoint配置的准确性 tokenization.validate_case_matches_checkpoint(arg_dic['do_lower_case'], arg_dic['init_checkpoint']) if not arg_dic['do_train'] and not arg_dic['do_eval'] and not arg_dic[ 'do_predict']: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True." ) # 导入Bert配置 bert_config = modeling.BertConfig.from_json_file( arg_dic['bert_config_file']) if arg_dic['max_seq_length'] > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (arg_dic['max_seq_length'], bert_config.max_position_embeddings)) ''' Estimator Config ''' processors = {"ner": SelfProcessor} processor = processors[arg_dic["ner"]]() tokenizer = tokenization.FullTokenizer( vocab_file=arg_dic["vocab_file"], do_lower_case=arg_dic["do_lower_case"]) ''' 配置tf.Session的运算方式: log_device_placement: 打印出TensorFlow使用了那种操作 inter_op_parallelism_threads: 设置线程一个操作内部并行运算的线程数,比如矩阵乘法,如果设置为0,则表示以最优的线程数处理 intra_op_parallelism_threads: 设置多个操作并行运算的线程数,比如 c = a + b,d = e + f . 可以并行运算 allow_soft_placement: 那么当运行设备不满足要求时,会自动分配GPU或者CPU ''' session_config = tf.ConfigProto(log_device_placement=False, inter_op_parallelism_threads=0, intra_op_parallelism_threads=0, allow_soft_placement=True) ''' Estimator Config: model_dir: 存储模型参数,graph等的路径 save_summary_steps: 每隔这么多步骤保存摘要 save_checkpoints_steps: 每隔多少个step就存一次checkpoint ''' run_config = tf.estimator.RunConfig( model_dir=arg_dic["ckpt_dir"], save_summary_steps=arg_dic["save_summary_steps"], save_checkpoints_steps=arg_dic["save_checkpoints_steps"], session_config=session_config) train_examples = None eval_examples = None num_train_steps = None num_warmup_steps = None ''' Load Data and Model about train and eval ''' if arg_dic["do_train"] and arg_dic["do_eval"]: # train train_examples = processor.get_train_examples(arg_dic["data_dir"]) num_train_steps = int( len(train_examples) * 1.0 / arg_dic["train_batch_size"] * arg_dic["num_train_epochs"]) if num_train_steps < 1: raise AttributeError('training data is so small...') num_warmup_steps = int(num_train_steps * arg_dic["warmup_proportion"]) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", arg_dic["train_batch_size"]) logger.info(" Num steps = %d", num_train_steps) # eval eval_examples = processor.get_dev_examples(arg_dic["data_dir"]) # 打印验证集数据信息 logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", arg_dic["train_batch_size"]) label_list = processor.get_labels(arg_dic["data_dir"] + "label.txt") ''' Model of Estimator''' model_fn = model_fn_builder(bert_config=bert_config, num_labels=len(label_list), init_checkpoint=arg_dic["init_checkpoint"], learning_rate=arg_dic["learning_rate"], num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps) params = {'batch_size': arg_dic["train_batch_size"]} estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config) ''' Train of Estimator''' if arg_dic["do_train"] and arg_dic["do_eval"]: '''data input_fn''' # 1. 
将数据转化为tf_record 数据 train_file = os.path.join(arg_dic["tfrecord_dir"], "train.tf_record") # 如果不存在train_record则生成 if not os.path.exists(train_file): filed_based_convert_examples_to_features(train_examples, label_list, arg_dic["max_seq_length"], tokenizer, train_file, arg_dic["tfrecord_dir"]) # 2.读取record 数据,组成batch train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=arg_dic["max_seq_length"], is_training=True, drop_remainder=True) # 1. eval eval_file = os.path.join(arg_dic["tfrecord_dir"], "eval.tf_record") if not os.path.exists(eval_file): filed_based_convert_examples_to_features(eval_examples, label_list, arg_dic["max_seq_length"], tokenizer, eval_file, arg_dic["tfrecord_dir"]) # 2. eval read eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=arg_dic["max_seq_length"], is_training=False, drop_remainder=False) '''estimator train''' ''' max_steps_without_increase:如果没有增加的最大长是多少,如果超过了这个最大步长metric还是没有增加那么就会停止。 eval_dir:默认是使用estimator.eval_dir目录,用于存放评估的summary file。 run_every_secs:表示多长时间调用一次should_stop_fn ''' early_stopping_hook = tf.estimator.experimental.stop_if_no_decrease_hook( estimator=estimator, metric_name='loss', max_steps_without_decrease=num_train_steps, eval_dir=None, min_steps=0, run_every_secs=None, run_every_steps=arg_dic["save_checkpoints_steps"]) train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=num_train_steps, hooks=[early_stopping_hook]) ''' throttle_secs:多少秒后又开始评估,如果没有新的 checkpoints 产生,则不评估,所以这个间隔是最小值。 ''' eval_spec = tf.estimator.EvalSpec( input_fn=eval_input_fn, throttle_secs=arg_dic["eval_model_steps"]) tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) # 进行预测 if arg_dic["do_predict"]: token_path = os.path.join(arg_dic["output_dir"], "token_test.txt") if os.path.exists(token_path): os.remove(token_path) with codecs.open(os.path.join(arg_dic["tfrecord_dir"], 'label2id.pkl'), 'rb') as rf: label2id = pickle.load(rf) id2label = {value: key for key, value in label2id.items()} # 数据 predict_examples = processor.get_test_examples(arg_dic["data_dir"]) predict_file = os.path.join(arg_dic["output_dir"], "predict.tf_record") filed_based_convert_examples_to_features(predict_examples, label_list, arg_dic["max_seq_length"], tokenizer, predict_file, arg_dic["output_dir"], mode="test") logger.info("***** Running prediction*****") logger.info(" Num examples = %d", len(predict_examples)) logger.info(" Batch size = %d", arg_dic["train_batch_size"]) predict_drop_remainder = False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=arg_dic["max_seq_length"], is_training=False, drop_remainder=predict_drop_remainder) result = estimator.predict( input_fn=predict_input_fn, checkpoint_path=".\output\ckpt\model.ckpt-30") output_predict_file = os.path.join(arg_dic["output_dir"], "label_test.txt") def result_to_pair(writer): print("********") print(predict_examples) for predict_line, prediction in zip(predict_examples, result): idx = 0 line = '' line_token = str(predict_line.text).split(' ') label_token = str(predict_line.label).split(' ') len_seq = len(label_token) if len(line_token) != len(label_token): logger.info(predict_line.text) logger.info(predict_line.label) break for id in prediction: if idx >= len_seq: break if id == 0: continue curr_labels = id2label[id] if curr_labels in ['[CLS]', '[SEP]']: continue try: line += line_token[idx] + ' ' + label_token[ idx] + ' ' + curr_labels + '\n' except Exception as e: logger.info(e) logger.info(predict_line.text) 
logger.info(predict_line.label) line = '' break idx += 1 writer.write(line + '\n') # 预测结果写入文件 with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer: result_to_pair(writer) import conlleval # predict的项 eval_result = conlleval.return_report(output_predict_file) print(''.join(eval_result)) # 写结果到文件中 with codecs.open(os.path.join(arg_dic["output_dir"], 'predict_score.txt'), 'a', encoding='utf-8') as fd: fd.write(''.join(eval_result))
def main(): tf.logging.set_verbosity(tf.logging.INFO) processors = {"ner": NerProcessor} bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError("Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) is_train() task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) # is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 is_per_host=3 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=tf.contrib.tpu.TPUConfig(iterations_per_loop=FLAGS.iterations_per_loop,num_shards=FLAGS.num_tpu_cores,per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if os.path.exists(FLAGS.data_config_path): with codecs.open(FLAGS.data_config_path) as fd: data_config = json.load(fd) else: data_config = {} if FLAGS.do_train: # 加载训练数据 if len(data_config) == 0: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int(len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) data_config['num_train_steps'] = num_train_steps data_config['num_warmup_steps'] = num_warmup_steps data_config['num_train_size'] = len(train_examples) else: num_train_steps = int(data_config['num_train_steps']) num_warmup_steps = int(data_config['num_warmup_steps']) model_fn = model_fn_builder( bert_config=bert_config, num_labels=len(label_list) + 1, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: if data_config.get('train.tf_record_path', '') == '': train_file = os.path.join(FLAGS.output_dir, "train.tf_record") filed_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) else: train_file = data_config.get('train.tf_record_path') num_train_size = num_train_size = int(data_config['num_train_size']) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", num_train_size) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder(input_file=train_file,seq_length=FLAGS.max_seq_length,is_training=True,drop_remainder=True) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: if data_config.get('eval.tf_record_path', '') == '': eval_examples = processor.get_dev_examples(FLAGS.data_dir) eval_file = os.path.join(FLAGS.output_dir, 
"eval.tf_record") filed_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)#将数据写入tf_record data_config['eval.tf_record_path'] = eval_file data_config['num_eval_size'] = len(eval_examples) else: eval_file = data_config['eval.tf_record_path'] num_eval_size = data_config.get('num_eval_size', 0) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d", num_eval_size) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_steps = None if FLAGS.use_tpu: eval_steps = int(num_eval_size / FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder(input_file=eval_file,seq_length=FLAGS.max_seq_length,is_training=False,drop_remainder=eval_drop_remainder) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with codecs.open(output_eval_file, "w", encoding='utf-8') as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if not os.path.exists(FLAGS.data_config_path): with codecs.open(FLAGS.data_config_path, 'a', encoding='utf-8') as fd: json.dump(data_config, fd) if FLAGS.do_predict: token_path = os.path.join(FLAGS.output_dir, "token_test.txt") if os.path.exists(token_path): os.remove(token_path) with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf: label2id = pickle.load(rf) id2label = {value: key for key, value in label2id.items()} predict_examples = processor.get_test_examples(FLAGS.data_dir) predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") filed_based_convert_examples_to_features(predict_examples, label_list,FLAGS.max_seq_length, tokenizer,predict_file, mode="test") tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d", len(predict_examples)) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) if FLAGS.use_tpu: raise ValueError("Prediction in TPU not supported") predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder(input_file=predict_file,seq_length=FLAGS.max_seq_length,is_training=False,drop_remainder=predict_drop_remainder) result = estimator.predict(input_fn=predict_input_fn) output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt") def result_to_pair(writer): for predict_line, prediction in zip(predict_examples, result): print(prediction) idx = 0 line = '' line_token = str(predict_line.text).split(' ') label_token = str(predict_line.label).split(' ') if len(line_token) != len(label_token): tf.logging.info(predict_line.text) tf.logging.info(predict_line.label) for id in prediction: if id == 0: continue curr_labels = id2label[id] if curr_labels in ['[CLS]', '[SEP]']: continue try: line += line_token[idx] + ' ' + label_token[idx] + ' ' + curr_labels + '\n' except Exception as e: tf.logging.info(e) tf.logging.info(predict_line.text) tf.logging.info(predict_line.label) line = '' break idx += 1 writer.write(line + '\n') with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer: result_to_pair(writer) from conlleval import return_report eval_result = return_report(output_predict_file) print(eval_result)
def predict(args, processor, tokenizer, bert_config, sess_config, label_list):
    """
    Prediction function.
    """
    # Build the three sets of examples (test, train, dev)
    predict_examples = processor.get_test_examples(args.data_dir)
    predict_file = os.path.join(args.output_dir, "predict.tf_record")
    filed_based_convert_examples_to_features(
        predict_examples, label_list, args.max_seq_length, tokenizer,
        predict_file, args.output_dir, mode="test")
    train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_dev_examples(args.data_dir)
    train_file = os.path.join(args.output_dir, "train.tf_record")
    eval_file = os.path.join(args.output_dir, "eval.tf_record")

    # Build the datasets
    train_data = file_based_dataset(input_file=train_file,
                                    batch_size=args.batch_size,
                                    seq_length=args.max_seq_length,
                                    is_training=False,
                                    drop_remainder=False)
    eval_data = file_based_dataset(input_file=eval_file,
                                   batch_size=args.batch_size,
                                   seq_length=args.max_seq_length,
                                   is_training=False,
                                   drop_remainder=False)
    predict_data = file_based_dataset(input_file=predict_file,
                                      batch_size=args.batch_size,
                                      seq_length=args.max_seq_length,
                                      is_training=False,
                                      drop_remainder=False)
    train_iter = train_data.make_one_shot_iterator().get_next()
    eval_iter = eval_data.make_one_shot_iterator().get_next()
    predict_iter = predict_data.make_one_shot_iterator().get_next()

    # Start the session
    with tf.Session(config=sess_config) as sess:
        # Checkpoint directory of the trained model
        save_dir = os.path.join(args.output_dir, 'model')
        # saver = tf.train.import_meta_graph(
        #     tf.train.latest_checkpoint(save_dir) + ".meta")
        # sess.run(tf.global_variables_initializer())

        # Print tensor names
        # tensor_list = [
        #     n.name for n in tf.get_default_graph().as_graph_def().node if 'older' in n.name]
        # print(tensor_list)

        # Build the model
        input_ids = tf.placeholder(shape=[None, args.max_seq_length],
                                   dtype=tf.int32, name='input_ids')
        input_mask = tf.placeholder(shape=[None, args.max_seq_length],
                                    dtype=tf.int32, name='input_mask')
        segment_ids = tf.placeholder(shape=[None, args.max_seq_length],
                                     dtype=tf.int32, name='segment_ids')
        label_ids = tf.placeholder(shape=[None, args.max_seq_length],
                                   dtype=tf.int32, name='label_ids')
        is_training = tf.get_variable("is_training", shape=[],
                                      dtype=tf.bool, trainable=False)

        total_loss, logits, trans, pred_ids = create_model(
            bert_config, is_training, input_ids, input_mask, segment_ids,
            label_ids, len(label_list), False, args.dropout_rate,
            args.lstm_size, args.cell, args.num_layers)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(save_dir))

        # Fetch the model's placeholders and parameters by tensor name
        # input_ids = tf.get_default_graph().get_tensor_by_name('input_ids:0')
        # input_mask = tf.get_default_graph().get_tensor_by_name('input_mask:0')
        # segment_ids = tf.get_default_graph().get_tensor_by_name('segment_ids:0')
        # label_ids = tf.get_default_graph().get_tensor_by_name('label_ids:0')
        # sess.run(tf.assign(tf.get_default_graph().get_tensor_by_name(
        #     'is_training:0'), tf.constant(False, dtype=tf.bool)))
        # # Find the CRF output; its tensor name comes from the crf_decode source
        # # and can be looked up in the graph
        # pred_ids = tf.get_default_graph().get_tensor_by_name('ReverseSequence_1:0')

        sess.run(tf.assign(is_training, tf.constant(False, dtype=tf.bool)))

        # Predict on the test set
        # Placeholder row; assumes max_seq_length == 150
        predict_total = np.array([[0] * 150], dtype=np.int32)
        for _ in range(0, int(len(predict_examples) / args.batch_size) + 1):
            # Prediction feed
            predict_batch = sess.run(predict_iter)
            predict_res = sess.run(pred_ids, feed_dict={
                input_ids: predict_batch['input_ids'],
                input_mask: predict_batch['input_mask'],
                segment_ids: predict_batch['segment_ids'],
                label_ids: predict_batch['label_ids']
            })
            predict_total = np.concatenate((predict_total, predict_res),
                                           axis=0)

        # Post-process the results and write the test-set predictions
        predict_total = predict_total[1:]
        output_predict_file = os.path.join(args.output_dir, "label_test.txt")
        with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
            result_to_pair(args, writer, predict_examples, predict_total)

        # Predict on the training set
        train_total = np.array([[0] * 150], dtype=np.int32)
        for _ in range(0, int(len(train_examples) / args.batch_size) + 1):
            # Prediction feed
            train_batch = sess.run(train_iter)
            train_res = sess.run(pred_ids, feed_dict={
                input_ids: train_batch['input_ids'],
                input_mask: train_batch['input_mask'],
                segment_ids: train_batch['segment_ids'],
                label_ids: train_batch['label_ids']
            })
            train_total = np.concatenate((train_total, train_res), axis=0)

        # Post-process the results and compute recall / F1
        train_total = train_total[1:]
        output_train_file = os.path.join(args.output_dir, "label_train.txt")
        with codecs.open(output_train_file, 'w', encoding='utf-8') as writer:
            result_to_pair(args, writer, train_examples, train_total)
        train_score, _ = conlleval.return_report(output_train_file)
        print(''.join(train_score))

        # Predict on the dev set
        eval_total = np.array([[0] * 150], dtype=np.int32)
        for _ in range(0, int(len(eval_examples) / args.batch_size) + 1):
            # Prediction feed
            eval_batch = sess.run(eval_iter)
            eval_res = sess.run(pred_ids, feed_dict={
                input_ids: eval_batch['input_ids'],
                input_mask: eval_batch['input_mask'],
                segment_ids: eval_batch['segment_ids'],
                label_ids: eval_batch['label_ids']
            })
            eval_total = np.concatenate((eval_total, eval_res), axis=0)

        # Post-process the results and compute recall / F1
        eval_total = eval_total[1:]
        output_eval_file = os.path.join(args.output_dir, "label_dev.txt")
        with codecs.open(output_eval_file, 'w', encoding='utf-8') as writer:
            result_to_pair(args, writer, eval_examples, eval_total)
        eval_score, _ = conlleval.return_report(output_eval_file)
        print(''.join(eval_score))
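# predict() and train_and_eval() call a module-level
# result_to_pair(args, writer, examples, predictions) that is defined elsewhere
# in the project. The sketch below is a hypothetical reconstruction modelled on
# the nested result_to_pair in the estimator block above: it loads the
# label2id.pkl mapping assumed to be pickled into args.output_dir, aligns each
# predicted id sequence with the raw tokens and gold labels, and writes one
# "token gold pred" row per token. All of it is an assumption about the missing
# helper, not the original implementation.
import codecs
import os
import pickle

def result_to_pair(args, writer, examples, predictions):
    # Invert the pickled label-to-id mapping
    with codecs.open(os.path.join(args.output_dir, 'label2id.pkl'), 'rb') as rf:
        label2id = pickle.load(rf)
    id2label = {value: key for key, value in label2id.items()}

    for example, prediction in zip(examples, predictions):
        idx = 0
        line = ''
        line_token = str(example.text).split(' ')
        label_token = str(example.label).split(' ')
        for label_id in prediction:
            if label_id == 0:
                continue  # padding position
            curr_label = id2label[int(label_id)]
            if curr_label in ['[CLS]', '[SEP]']:
                continue  # skip the special markers
            try:
                line += line_token[idx] + ' ' + label_token[idx] + ' ' + curr_label + '\n'
            except IndexError:
                # prediction is longer than the raw sentence; drop the sentence
                line = ''
                break
            idx += 1
        writer.write(line + '\n')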
def train_and_eval(args, processor, tokenizer, bert_config, sess_config, label_list):
    """
    Training and evaluation function.
    """
    # Build the examples and compute the training schedule
    train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_dev_examples(args.data_dir)
    num_train_steps = int(
        len(train_examples) * 1.0 / args.batch_size * args.num_train_epochs)
    if num_train_steps < 1:
        raise AttributeError('training data is too small...')
    num_warmup_steps = int(num_train_steps * args.warmup_proportion)

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", args.batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d", len(eval_examples))
    tf.logging.info("  Batch size = %d", args.batch_size)

    # Write the tfrecord files
    train_file = os.path.join(args.output_dir, "train.tf_record")
    if not os.path.exists(train_file):
        filed_based_convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer,
            train_file, args.output_dir)
    eval_file = os.path.join(args.output_dir, "eval.tf_record")
    if not os.path.exists(eval_file):
        filed_based_convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer,
            eval_file, args.output_dir)

    """ ------------- separator ------------- """

    # Output paths
    log_dir = os.path.join(args.output_dir, 'log')
    save_dir = os.path.join(args.output_dir, 'model')
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # # Load the data
    # train_file = os.path.join(args.output_dir, "train.tf_record")
    # eval_file = os.path.join(args.output_dir, "eval.tf_record")
    # if not os.path.exists(train_file) or not os.path.exists(eval_file):
    #     raise ValueError

    # Build the datasets
    train_data = file_based_dataset(input_file=train_file,
                                    batch_size=args.batch_size,
                                    seq_length=args.max_seq_length,
                                    is_training=True,
                                    drop_remainder=False)
    eval_data = file_based_dataset(input_file=eval_file,
                                   batch_size=args.batch_size,
                                   seq_length=args.max_seq_length,
                                   is_training=False,
                                   drop_remainder=False)
    train_iter = train_data.make_one_shot_iterator().get_next()

    # Start the session
    with tf.Session(config=sess_config) as sess:
        # Build the model
        input_ids = tf.placeholder(shape=[None, args.max_seq_length],
                                   dtype=tf.int32, name='input_ids')
        input_mask = tf.placeholder(shape=[None, args.max_seq_length],
                                    dtype=tf.int32, name='input_mask')
        segment_ids = tf.placeholder(shape=[None, args.max_seq_length],
                                     dtype=tf.int32, name='segment_ids')
        label_ids = tf.placeholder(shape=[None, args.max_seq_length],
                                   dtype=tf.int32, name='label_ids')
        is_training = tf.get_variable("is_training", shape=[],
                                      dtype=tf.bool, trainable=False)

        total_loss, logits, trans, pred_ids = create_model(
            bert_config, is_training, input_ids, input_mask, segment_ids,
            label_ids, len(label_list), False, args.dropout_rate,
            args.lstm_size, args.cell, args.num_layers)

        # Optimizer
        train_op = optimization.create_optimizer(
            total_loss, args.learning_rate, num_train_steps, num_warmup_steps,
            False)
        sess.run(tf.global_variables_initializer())

        # Load the pre-trained BERT weights
        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        if args.init_checkpoint:
            (assignment_map, initialized_variable_names) = \
                modeling.get_assignment_map_from_checkpoint(
                    tvars, args.init_checkpoint)
            tf.train.init_from_checkpoint(args.init_checkpoint, assignment_map)

        # Log which variables were initialized from the checkpoint
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        # Set up the summary writer and saver
        writer = tf.summary.FileWriter(log_dir, sess.graph)
        saver = tf.train.Saver()

        # State for early stopping
        best_eval_loss = 1000000.0
        patience = 0

        # Start training
        sess.run(tf.assign(is_training, tf.constant(True, dtype=tf.bool)))
        for go in range(1, num_train_steps + 1):
            # Training feed
            train_batch = sess.run(train_iter)
            loss, preds, op = sess.run(
                [total_loss, pred_ids, train_op],
                feed_dict={
                    input_ids: train_batch['input_ids'],
                    input_mask: train_batch['input_mask'],
                    segment_ids: train_batch['segment_ids'],
                    label_ids: train_batch['label_ids']
                })

            if go % args.save_summary_steps == 0:
                # Training log
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="loss/train_loss",
                                         simple_value=loss / args.batch_size),
                    ]), sess.run(tf.train.get_global_step()))
                writer.flush()

            if go % args.save_checkpoints_steps == 0:
                # Evaluate on the dev set
                sess.run(
                    tf.assign(is_training, tf.constant(False, dtype=tf.bool)))
                eval_loss_total = 0.0
                # Placeholder rows; assumes max_seq_length == 150
                eval_preds_total = np.array([[0] * 150], dtype=np.int32)
                eval_truth_total = np.array([[0] * 150], dtype=np.int32)
                # Rebuild the dev-set iterator for this evaluation pass
                eval_data = eval_data.repeat()
                eval_iter = eval_data.make_one_shot_iterator().get_next()
                for _ in range(0, int(len(eval_examples) / args.batch_size) + 1):
                    # Evaluation feed
                    eval_batch = sess.run(eval_iter)
                    eval_loss, eval_preds, eval_truth = sess.run(
                        [total_loss, pred_ids, label_ids],
                        feed_dict={
                            input_ids: eval_batch['input_ids'],
                            input_mask: eval_batch['input_mask'],
                            segment_ids: eval_batch['segment_ids'],
                            label_ids: eval_batch['label_ids']
                        })
                    # Accumulate the results
                    eval_loss_total += eval_loss
                    eval_preds_total = np.concatenate(
                        (eval_preds_total, eval_preds), axis=0)
                    eval_truth_total = np.concatenate(
                        (eval_truth_total, eval_truth), axis=0)

                # Post-process the results and compute token-level metrics
                eval_preds_total = eval_preds_total[1:]
                eval_truth_total = eval_truth_total[1:]
                eval_f1 = metrics.f1_score(eval_truth_total.reshape(-1),
                                           eval_preds_total.reshape(-1),
                                           average='macro')
                eval_recall = metrics.recall_score(
                    eval_truth_total.reshape(-1),
                    eval_preds_total.reshape(-1),
                    average='macro')
                eval_acc = metrics.accuracy_score(
                    eval_truth_total.reshape(-1),
                    eval_preds_total.reshape(-1))
                eval_loss_aver = eval_loss_total / len(eval_examples)

                # Entity-level NER metrics via conlleval
                output_eval_file = os.path.join(args.output_dir,
                                                "label_eval.txt")
                with codecs.open(output_eval_file, 'w',
                                 encoding='utf-8') as writer_1:
                    result_to_pair(args, writer_1, eval_examples,
                                   eval_preds_total)
                eval_score, over_all = conlleval.return_report(output_eval_file)
                print(''.join(eval_score))

                # Evaluation log
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="loss/eval_loss",
                                         simple_value=eval_loss_aver),
                    ]), sess.run(tf.train.get_global_step()))
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="eval/f1", simple_value=eval_f1),
                    ]), sess.run(tf.train.get_global_step()))
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="eval/recall",
                                         simple_value=eval_recall),
                    ]), sess.run(tf.train.get_global_step()))
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="eval/acc", simple_value=eval_acc),
                    ]), sess.run(tf.train.get_global_step()))
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="ner/f1",
                                         simple_value=over_all.fscore),
                    ]), sess.run(tf.train.get_global_step()))
                writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag="ner/recall",
                                         simple_value=over_all.rec),
                    ]), sess.run(tf.train.get_global_step()))
                writer.flush()

                # Early stopping and checkpointing
                if eval_loss_aver >= best_eval_loss:
                    patience += 1
                    if patience >= 5:
                        print("early stopping!")
                        return
                if eval_loss_aver < best_eval_loss:
                    patience = 0
                    best_eval_loss = eval_loss_aver
                    saver.save(
                        sess,
                        os.path.join(
                            save_dir,
                            "model_{}_loss_{:.4f}.ckpt".format(
                                sess.run(tf.train.get_global_step()),
                                best_eval_loss)))

                # Switch back to training mode for the following steps
                sess.run(
                    tf.assign(is_training, tf.constant(True, dtype=tf.bool)))
with codecs.open(output_predict_file, 'r', encoding='utf-8') as f:
    # The raw prediction file groups each sentence into three tab-separated
    # lines (tokens, gold labels, predicted labels) followed by a blank line.
    counter = 0
    line_1 = []
    line_2 = []
    line_3 = []
    lines = ''
    for line in f:
        if line.strip():
            content = line.strip()
            tokens = content.split('\t')
            if counter == 0:
                line_1 = tokens
                counter += 1
            elif counter == 1:
                line_2 = tokens
                counter += 1
            elif counter == 2:
                line_3 = tokens
        else:
            # Blank line: flush the sentence, dropping special tokens and
            # sub-word / padding gold labels
            for a, b, c in zip(line_1, line_2, line_3):
                if a not in ["[PAD]", "[CLS]", "[SEP]"] and b not in ["X", "APAD"]:
                    lines += a + " " + b + " " + c + '\n'
            counter = 0

with codecs.open(output_predict_file_processed, 'w', encoding='utf-8') as writer:
    writer.write(lines + '\n')

eval_result = conlleval.return_report(output_predict_file_processed)
print(''.join(eval_result))
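# Illustrative (made-up) example of the format the post-processing above
# expects and of the "token gold pred" rows it produces; this is not output
# from the project, only a sketch of the assumed transformation.
example_input = (
    "[CLS]\t我\t在\t北\t京\t[SEP]\n"      # tokens
    "[CLS]\tO\tO\tB-LOC\tI-LOC\t[SEP]\n"  # gold labels
    "[CLS]\tO\tO\tB-LOC\tI-LOC\t[SEP]\n"  # predicted labels
    "\n"                                   # blank line ends the sentence
)
# Expected processed output (one "token gold pred" row per real token):
#   我 O O
#   在 O O
#   北 B-LOC B-LOC
#   京 I-LOC I-LOC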