# Shared imports for the snippets below (TensorFlow 1.x API).
import itertools
import os
import pickle
import time

import numpy as np
import openpyxl
import pandas as pd
import tensorflow as tf

# Project-local names used throughout (model_utils, data_utils, data_loader,
# Model, load_word2vec, FLAGS) come from this repo and are assumed importable.


def evaluate_line():
    config = model_utils.load_config(FLAGS.config_file)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path, config)
        # Read the input workbook ('输入文件.xlsx' = "input file.xlsx"),
        # skipping the header row; column 0 is the ID, column 1 the text.
        book = openpyxl.load_workbook(r'输入文件.xlsx')
        sh = book.active
        arr = []
        for r in list(sh.rows)[1:]:
            line = r[1].value
            result = model.evaluate_line(
                sess, data_utils.input_from_line(line, word_to_id), id_to_tag)
            arr.append({"id": r[0].value, "data": result})
        print(arr)
        # Build the output workbook with 20 columns per entity type. The keys
        # must match the model's entity tags; '症状的type' is a placeholder
        # for whatever tag the model uses for symptoms.
        newbook = openpyxl.Workbook()
        sh = newbook.active
        headtype = {
            'DIS': {"name": '疾病', "num": 20},         # disease
            '症状的type': {"name": '症状', "num": 20},  # symptom
            'DURG': {"name": '药物', "num": 20},        # drug
        }
        harr = ['ID']
        for n, t in headtype.items():
            t['start'] = len(harr)
            for i in range(1, t['num'] + 1):
                harr.append(t['name'] + str(i))
        sh.append(harr)
        # One output row per record: each entity is written into the next
        # free column of its type, starting at that type's start offset.
        for t in arr:
            larr = [''] * len(harr)
            larr[0] = t['id']
            htc = {k: obj['start'] for k, obj in headtype.items()}
            for entity in t['data']['entities']:
                larr[htc[entity['type']]] = entity['word']
                htc[entity['type']] += 1
            sh.append(larr)
        newbook.save(r'输出文件.xlsx')  # "output file.xlsx"
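With this headtype (and Python 3.7+ dict insertion ordering), the header row harr comes out as ['ID', '疾病1' … '疾病20', '症状1' … '症状20', '药物1' … '药物20'], so the per-type start offsets are 1, 21, and 41, and a record's n-th entity of a given type lands in that type's n-th column.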
def evaluate_line():
    config = model_utils.load_config(FLAGS.config_file)
    logger = model_utils.get_logger(FLAGS.log_file)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, 'rb') as f:
        word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path,
                                   load_word2vec, config, id_to_word,
                                   logger, FLAGS.train)
        # Interactive demo; the prompt reads "enter a test sentence
        # ('q' to quit)". See the sample result shape below.
        while True:
            line = input('请输入测试句子(输入q退出):')
            if line == 'q':
                return
            result = model.evaluate_line(
                sess, data_utils.input_from_line(line, word_to_id), id_to_tag)
            print(result)
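The printed result is the same structure the batch variants below consume. From how they index it, it holds an 'entities' list whose items carry at least 'word' and 'type'; the 'string', 'start', and 'end' keys in this sample are illustrative assumptions, not confirmed by the code in this section:

# Illustrative result shape only; values are made up.
result = {
    'string': '患者有高血压病史',  # input sentence
    'entities': [
        {'word': '高血压', 'start': 3, 'end': 6, 'type': 'DIS'},
    ],
}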
def evaluate_line():
    config = model_utils.load_config(FLAGS.config_file)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path, config)
        out = []
        # local_file, result_file, handle, types_index and types_num are
        # defined elsewhere in the script; '病历' ("medical record") is the
        # text column. A sketch of handle follows this function.
        df = pd.read_excel(local_file)
        for line in df['病历']:
            result = model.evaluate_line(
                sess, data_utils.input_from_line(line, word_to_id), id_to_tag)
            out.append(handle(result))
        # types_num numbered columns per entity type, e.g. DIS1 .. DIS20.
        columns = []
        for item in types_index:
            for i in range(1, types_num + 1):
                columns.append(item + str(i))
        df_out = pd.DataFrame(out, columns=columns)
        df_out.insert(loc=0, column='ID', value=df['ID'])
        df_out.to_excel(result_file, index=None)
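handle is not shown in this snippet. A minimal sketch consistent with the column layout above and with the openpyxl variant's row-filling logic (types_index and types_num are the same assumed globals):

# Hypothetical sketch of handle: flatten the model output into one
# fixed-width row. Assumes types_index lists the entity tags (for example
# ['DIS', 'SYM', 'DURG']) and types_num is the column count per type,
# matching the `columns` built above.
def handle(result):
    row = [''] * (len(types_index) * types_num)
    counts = {t: 0 for t in types_index}
    for entity in result['entities']:
        t = entity['type']
        # Drop entities of unknown type and any overflow past types_num
        if t in counts and counts[t] < types_num:
            row[types_index.index(t) * types_num + counts[t]] = entity['word']
            counts[t] += 1
    return row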
def evaluate_line():
    config = model_utils.load_config(FLAGS.config_file)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(FLAGS.map_file, "rb") as f:
        word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path, config)
        df = pd.read_excel('1.xlsx')
        for line in df['文本']:  # '文本' = text column
            result = model.evaluate_line(
                sess, data_utils.input_from_line(line, word_to_id), id_to_tag)
            print(result)
def train():
    # Load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # Convert the tag scheme (BIO to BIOES)
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # Build the word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_words_train = data_loader.word_mapping(train_sentences)[0]
            dico_word, word_to_id, id_to_word = data_utils.augment_with_pretrained(
                dico_words_train.copy(), FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _, word_to_id, id_to_word = data_loader.word_mapping(train_sentences)
        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)
        with open(FLAGS.map_file, 'wb') as f:
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    # Prepare the data
    train_data = data_loader.prepare_dataset(train_sentences, word_to_id, tag_to_id)
    dev_data = data_loader.prepare_dataset(dev_sentences, word_to_id, tag_to_id)
    test_data = data_loader.prepare_dataset(test_sentences, word_to_id, tag_to_id)

    # Batch the data
    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, FLAGS.batch_size)
    test_manager = data_utils.BatchManager(test_data, FLAGS.batch_size)

    # Create any missing directories
    model_utils.make_path(FLAGS)

    # Load the config file, or create it on first run
    if os.path.isfile(FLAGS.config_file):
        config = model_utils.load_config(FLAGS.config_file)
    else:
        config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
        model_utils.save_config(config, FLAGS.config_file)

    # Set up the logger
    log_path = os.path.join('log', FLAGS.log_file)
    logger = model_utils.get_logger(log_path)
    model_utils.print_config(config, logger)

    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    step_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path,
                                   load_word2vec, config, id_to_word, logger)
        logger.info('开始训练')  # "start training"
        loss = []
        start = time.time()
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                # Report the average loss every FLAGS.setps_chech steps
                if step % FLAGS.setps_chech == 0:
                    iteration = step // step_per_epoch + 1
                    logger.info(
                        "iteration{}: step{}/{}, NER loss:{:>9.6f}".format(
                            iteration, step % step_per_epoch,
                            step_per_epoch, np.mean(loss)))
                    loss = []
            # Checkpoint whenever the dev score improves
            # (a sketch of evaluate follows this function)
            best = evaluate(sess, model, 'dev', dev_manager, id_to_tag, logger)
            if best:
                model_utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, 'test', test_manager, id_to_tag, logger)
        t = time.time() - start
        logger.info('cost time: %f' % t)
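Both train() variants call an evaluate helper that is not shown here. A minimal sketch, assuming the model exposes an evaluate method yielding CoNLL-style scored lines, a best_dev_f1 variable, a test_ner scoring helper, and a FLAGS.result_path flag (all of these names are assumptions, not confirmed by the code above):

# Hypothetical sketch; model.evaluate, test_ner, model.best_dev_f1, and
# FLAGS.result_path are assumed and not confirmed by this section.
def evaluate(sess, model, name, data, id_to_tag, logger):
    logger.info('evaluate: {}'.format(name))
    ner_results = model.evaluate(sess, data, id_to_tag)
    eval_lines = test_ner(ner_results, FLAGS.result_path)
    for line in eval_lines:
        logger.info(line)
    f1 = float(eval_lines[1].strip().split()[-1])
    if name == 'dev':
        best_dev_f1 = model.best_dev_f1.eval()
        if f1 > best_dev_f1:
            tf.assign(model.best_dev_f1, f1).eval()
            logger.info('new best dev f1 score: {:>.3f}'.format(f1))
        # True signals the caller to save a checkpoint
        return f1 > best_dev_f1
    return False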
def train():
    # Load the datasets
    train_sentences = data_loader.load_sentences(FLAGS.train_file)
    dev_sentences = data_loader.load_sentences(FLAGS.dev_file)
    test_sentences = data_loader.load_sentences(FLAGS.test_file)

    # Convert the tag scheme from BIO to BIOES
    # (see the conversion sketch after this function)
    data_loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)
    data_loader.update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # Build the word and tag mappings
    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_words_train = data_loader.word_mapping(train_sentences)[0]
            dico_word, word_to_id, id_to_word = data_utils.augment_with_pretrained(
                dico_words_train.copy(), FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _, word_to_id, id_to_word = data_loader.word_mapping(train_sentences)
        _, tag_to_id, id_to_tag = data_loader.tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([word_to_id, id_to_word, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            word_to_id, id_to_word, tag_to_id, id_to_tag = pickle.load(f)

    train_data = data_loader.prepare_dataset(train_sentences, word_to_id, tag_to_id)
    dev_data = data_loader.prepare_dataset(dev_sentences, word_to_id, tag_to_id)
    test_data = data_loader.prepare_dataset(test_sentences, word_to_id, tag_to_id)

    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, FLAGS.batch_size)
    test_manager = data_utils.BatchManager(test_data, FLAGS.batch_size)

    print('train_data_num %i, dev_data_num %i, test_data_num %i'
          % (len(train_data), len(dev_data), len(test_data)))

    model_utils.make_path(FLAGS)

    if os.path.isfile(FLAGS.config_file):
        config = model_utils.load_config(FLAGS.config_file)
    else:
        config = model_utils.config_model(FLAGS, word_to_id, tag_to_id)
        model_utils.save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = model_utils.get_logger(log_path)
    model_utils.print_config(config, logger)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = model_utils.create(sess, Model, FLAGS.ckpt_path,
                                   load_word2vec, config, id_to_word, logger)
        logger.info("开始训练")  # "start training"
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                # Report the average loss every FLAGS.setps_chech steps
                if step % FLAGS.setps_chech == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info(
                        "iteration:{} step {}/{}, NER loss:{:>9.6f}".format(
                            iteration, step % steps_per_epoch,
                            steps_per_epoch, np.mean(loss)))
                    loss = []
            # Checkpoint whenever the dev score improves
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                model_utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
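update_tag_scheme's internals are not shown; this standalone sketch illustrates the standard BIO-to-BIOES conversion it is named after (not the project's actual implementation, which also validates malformed tag sequences):

# Standard BIO -> BIOES conversion, assuming a well-formed BIO sequence.
def bio_to_bioes(tags):
    new_tags = []
    for i, tag in enumerate(tags):
        nxt = tags[i + 1] if i + 1 < len(tags) else 'O'
        if tag.startswith('B-'):
            # A single-token entity becomes S-; otherwise B- is kept
            new_tags.append(tag if nxt.startswith('I-') else 'S-' + tag[2:])
        elif tag.startswith('I-'):
            # The last token of a multi-token entity becomes E-
            new_tags.append(tag if nxt.startswith('I-') else 'E-' + tag[2:])
        else:
            new_tags.append(tag)
    return new_tags

# e.g. ['B-DIS', 'I-DIS', 'I-DIS', 'O', 'B-DIS']
#   -> ['B-DIS', 'I-DIS', 'E-DIS', 'O', 'S-DIS']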