def train():
    # ----------------------------------- Data preparation -------------------------------------
    train_manager = BatchManager(batch_size=20, name='train')
    test_manager = BatchManager(batch_size=100, name='test')
    # ----------------------------------- Load the dictionary ----------------------------------
    mapping_dict = get_dict(dict_file)
    # ----------------------------------- Build the model --------------------------------------
    model = Model(mapping_dict)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for i in range(5):
            j = 1
            for batch in train_manager.iter_batch(shuffle=True):
                start = time.time()
                loss = model.run_step(sess, batch)
                end = time.time()
                if j % 5 == 0:
                    print('epoch:{}, step:{}/{}, loss:{}, elapsed:{}, estimate:{}'.format(
                        i + 1, j, train_manager.len_data, loss,
                        end - start, (end - start) * (train_manager.len_data - j)))
                j += 1
        for batch in test_manager.iter_batch(shuffle=True):
            test_result = model.predict(sess, batch, istrain=False, istest=True)
            print('precision rate: {} %'.format(test_result[1]))
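# The training and test loops above rely on a BatchManager that reports how many
# batches it holds (len_data) and yields them one at a time. The real class lives
# elsewhere in this repo; the following is only a minimal sketch of the interface
# these loops assume (field and argument names here are assumptions, not the actual code).
import random

class BatchManagerSketch:
    def __init__(self, data, batch_size):
        # pre-slice the dataset into batches of `batch_size` items
        self.batch_data = [data[i:i + batch_size]
                           for i in range(0, len(data), batch_size)]
        self.len_data = len(self.batch_data)

    def iter_batch(self, shuffle=False):
        # optionally shuffle the batch order, then yield batches one by one
        if shuffle:
            random.shuffle(self.batch_data)
        for batch in self.batch_data:
            yield batch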
def predict_line(param):
    # Initialize the logger
    logger = get_logger(param.test_log_file)
    tf_config = tf.ConfigProto()
    # Load the dictionary
    mapping_dict = get_dict(param.dict_file)
    # Build the model so the saved parameters can be restored into it
    model = Model(param, mapping_dict)
    # Start predicting
    with tf.Session(config=tf_config) as sess:
        # First check whether a trained model exists
        ckpt_path = param.ckpt_path
        ckpt = tf.train.get_checkpoint_state(ckpt_path)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            logger.info("Reading model parameters from {}".format(
                ckpt.model_checkpoint_path))
            # Restore the saved weights
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("Cannot find the ckpt files!")
        while True:
            # Repeatedly read a sentence from stdin and predict its tags
            line = input("Enter a test sentence: ")
            raw_inputs, model_inputs = input_from_line_with_feature(line)
            tag = model.evaluate_line(sess, model_inputs)
            result = result_to_json(raw_inputs, tag)
            result = js.dumps(result, ensure_ascii=False, indent=4,
                              separators=(',', ': '))
            with open('./result/result.json', 'w', encoding='utf-8') as f:
                f.write(result)
            print("Prediction result: {}".format(result))
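# result_to_json() is defined elsewhere in the repo; predict_line() only relies on it
# returning a JSON-serializable dict of the sentence plus its entities. A minimal
# sketch under the assumption of BIO-style tags ("B-Per", "I-Per", "O", ...) and a
# {"string", "entities": [{"word", "start", "end", "type"}]} layout could look like this:
def result_to_json_sketch(string, tags):
    entities = []
    entity_word, entity_start, entity_type = "", 0, ""
    for idx, (char, tag) in enumerate(zip(string, tags)):
        if tag.startswith("B-"):
            # a previous entity ends when a new one begins
            if entity_word:
                entities.append({"word": entity_word, "start": entity_start,
                                 "end": idx, "type": entity_type})
            entity_word, entity_start, entity_type = char, idx, tag[2:]
        elif tag.startswith("I-") and entity_word:
            entity_word += char
        else:
            # "O" tag: close any open entity
            if entity_word:
                entities.append({"word": entity_word, "start": entity_start,
                                 "end": idx, "type": entity_type})
            entity_word, entity_type = "", ""
    if entity_word:
        entities.append({"word": entity_word, "start": entity_start,
                         "end": len(string), "type": entity_type})
    return {"string": string, "entities": entities}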
def test(param):
    # Sanity-check the hyperparameters
    assert param.clip < 5.1, "gradient clip shouldn't be too large"
    assert 0 <= param.dropout < 1, "dropout rate must be between 0 and 1"
    assert param.lr > 0, "learning rate must be larger than zero"
    # Build the batch manager
    test_manager = BatchManager(param.test_batch_size, name='test')
    number_dataset = test_manager.len_data
    print("total number of test data is {}".format(number_dataset))
    # Configure logging
    logger = get_logger(param.test_log_file)
    # Load the dictionary
    mapping_dict = get_dict(param.dict_file)
    # Build the model
    model = Model(param, mapping_dict)
    # Configure the GPU session
    gpu_config = tf.ConfigProto()
    with tf.Session(config=gpu_config) as sess:
        logger.info("start testing...")
        start = time.time()
        # First check whether a trained model exists
        ckpt_path = param.ckpt_path
        ckpt = tf.train.get_checkpoint_state(ckpt_path)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            logger.info("Reading model parameters from {}".format(
                ckpt.model_checkpoint_path))
            # Restore the saved weights
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("Cannot find the ckpt files!")
        # Run the evaluation
        evaluate(sess, param, model, "test", test_manager, logger)
        logger.info("The best_f1 on test_dataset is {:.2f}".format(
            model.best_test_f1.eval()))
        logger.info('Time to test with batch size {} is {:.2f} sec\n'.format(
            param.test_batch_size, time.time() - start))
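# test() expects a `param` object exposing the hyperparameters checked above. A small
# usage sketch with argparse.Namespace; the attribute values below are placeholders,
# not the project's actual defaults:
from argparse import Namespace

example_param = Namespace(
    clip=5.0,                        # gradient clipping threshold
    dropout=0.5,                     # dropout rate, must be in [0, 1)
    lr=0.001,                        # learning rate, must be > 0
    test_batch_size=100,             # batch size used by the test BatchManager
    test_log_file='log/test.log',    # where get_logger() writes test logs
    dict_file='data/dict.pkl',       # mapping dictionary produced during training
    ckpt_path='ckpt/',               # directory holding the trained checkpoint
)
# test(example_param)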
def annotation():
    # train()
    # Read the whole CSV file
    try:
        whole_data = pd.read_csv(raw_data, sep=',', encoding='UTF-8')
    except UnicodeDecodeError:
        whole_data = pd.read_csv(raw_data, sep=',', encoding='GBK')
    except Exception as e:
        print(e)
    row_num = whole_data.shape[0]
    print(row_num)
    # ----------------------------------- Load the dictionary ----------------------------------
    mapping_dict = get_dict(dict_file)
    # ----------------------------------- Build the model --------------------------------------
    model = Model(mapping_dict)
    feature_list = ['Per', 'Com', 'Time', 'Job', 'Nat', 'Bir', 'Age', 'Gdr', 'Uni',
                    'Edu', 'Sch', 'Col', 'Maj', 'Zhi', 'Hon']
    feature_dataframe = pd.DataFrame(columns=feature_list)
    # Recreate the Test folder and run prediction over every row of the data
    for i in range(row_num):
        # shutil.rmtree clears the previous run's files; os.makedirs (unlike os.mkdir)
        # can create intermediate directories
        if os.path.exists('data/Test'):
            shutil.rmtree('data/Test')
        if not os.path.exists('data/Test'):
            os.makedirs('data/Test')
        cur_data = whole_data['ManagerResume'][i]
        print(cur_data)
        filename = 'data/Test/need_annotation.txt'
        # open(..., 'w') creates the file if needed and truncates any existing content
        with open(filename, 'w') as f:
            f.write(cur_data)
        task_process(split_text)
        get_data('task')
        # ----------------------------------- Data preparation ---------------------------------
        task_manager = BatchManager(batch_size=1, name='task')
        # ----------------------------------- Run the model ------------------------------------
        item_T = pd.DataFrame({})
        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            for _ in range(1):
                for batch in task_manager.iter_batch(shuffle=True):
                    task_result, item = model.predict(sess, batch,
                                                      istrain=False, istest=False)
                    # item_Entity = pd.DataFrame(item['entities'])
                    # item_T = item_T.append(item_Entity)
                    item_T = pd.DataFrame(item['entities'])
                    # print('predict result:{} %', task_result)
                    print(item_T)
                    # num_samples = len(item)  # number of sentences, i.e. samples
                    # print(num_samples)
        # ------------------------------- Store the annotated data -----------------------------
        f_Key = {}
        for feature in feature_list:
            l_type = []
            for j in range(item_T.shape[0]):
                if item_T['type'].iloc[j] == feature:
                    return_word = [item_T['word'].iloc[j]]
                    l_type = l_type + return_word
            f_Key.update({feature: l_type})
        feature_dataframe = feature_dataframe.append(f_Key, ignore_index=True)
    FinalResult = pd.concat([whole_data, feature_dataframe], axis=1)
    fpath = 'FinalResult.csv'
    pd.DataFrame(FinalResult).to_csv(fpath)
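# The grouping step above assumes model.predict() returns `item` with an "entities"
# list of {"word", "type", ...} dicts (the same layout result.json uses). A toy
# example of how one resume's entities collapse into one row of feature_dataframe;
# the entity values below are invented for illustration:
import pandas as pd

toy_entities = [{"word": "张三", "type": "Per"},
                {"word": "某某科技有限公司", "type": "Com"},
                {"word": "算法工程师", "type": "Job"}]
toy_item_T = pd.DataFrame(toy_entities)

# for each feature type, collect every predicted word of that type into a list
toy_row = {feature: toy_item_T.loc[toy_item_T['type'] == feature, 'word'].tolist()
           for feature in ['Per', 'Com', 'Job']}
# toy_row == {'Per': ['张三'], 'Com': ['某某科技有限公司'], 'Job': ['算法工程师']}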
def train(param):
    # Sanity-check the hyperparameters
    assert param.clip < 5.1, "gradient clip shouldn't be too large"
    assert 0 <= param.dropout < 1, "dropout rate must be between 0 and 1"
    assert param.lr > 0, "learning rate must be larger than zero"
    # Data preparation
    train_manager = BatchManager(param.batch_size, name='train')
    number_dataset = train_manager.len_data
    print("total number of train data is {}".format(number_dataset))
    # Create the required folders
    make_path(param)
    # Configure logging
    logger = get_logger(param.train_log_file)
    # Load the dictionary
    mapping_dict = get_dict(param.dict_file)
    # Load senc_tag, needed below for loading the pretrained embeddings
    senc_tag = get_sent_tag(param.sent_tag_file)
    # Augment the character mapping with the pretrained embeddings
    dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        mapping_dict['word'][2].copy(), param.emb_file,
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in senc_tag])))
    # Total number of training batches per epoch
    steps_per_epoch = train_manager.len_data
    # Configure the GPU session
    gpu_config = tf.ConfigProto()
    with tf.Session(config=gpu_config) as sess:
        # Initialize (or restore) the model
        model = creat_model(sess, Model, param.ckpt_path, load_word2vec, param,
                            id_to_char, logger, map_all=mapping_dict)
        for i in range(param.max_epoch):
            loss = []
            total_loss = 0
            # Start the epoch timer
            start = time.time()
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, batch)
                # Running list of batch losses, used for the average in the log line
                loss.append(batch_loss)
                # Accumulated loss, averaged over the epoch at the end
                total_loss += batch_loss
                if step % 5 == 0:
                    logger.info(
                        "epoch:{}, step:{}/{}, avg_loss:{:>9.4f}".format(
                            i + 1, step % steps_per_epoch, steps_per_epoch,
                            np.mean(loss)))
            # Save the model after each epoch
            model.save_model(sess, logger, i)
            logger.info('Epoch {}, total Loss {:.4f}'.format(
                i + 1, total_loss / train_manager.len_data))
            logger.info(
                'Time taken for one epoch {:.4f} min, about {:.2f} h for the remaining epochs\n'
                .format((time.time() - start) / 60,
                        ((param.max_epoch - i - 1) * (time.time() - start)) / 3600))
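# creat_model() is defined elsewhere in the repo; train() only relies on it returning
# a ready-to-use Model, restored from param.ckpt_path when a checkpoint exists and
# freshly initialized (with pretrained embeddings loaded via load_word2vec) otherwise.
# A minimal sketch of that restore-or-initialize pattern; attribute names such as
# char_lookup are assumptions, not the project's actual code:
import tensorflow as tf

def creat_model_sketch(session, Model_class, ckpt_path, load_vec, config,
                       id_to_char, logger, map_all=None):
    model = Model_class(config, map_all)
    ckpt = tf.train.get_checkpoint_state(ckpt_path)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        logger.info("Reading model parameters from {}".format(
            ckpt.model_checkpoint_path))
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        logger.info("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
        # overwrite the randomly initialized character embeddings with pretrained ones
        emb_weights = session.run(model.char_lookup.read_value())
        emb_weights = load_vec(config.emb_file, id_to_char, emb_weights)
        session.run(model.char_lookup.assign(emb_weights))
    return model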