def produce_embedding_vec_file(filename, path="../data/simple_questions/fb_0_2m_files"):
    dh = data_helper.DataClass("sq")
    model = models.Word2Vec.load(filename)
    # Walk the vocabulary, look up each word's word2vec vector, and write it out.
    v_base = model['end']
    ct.print(v_base)
    for word in dh.converter.vocab:
        try:
            v = model[word]
        except Exception as e1:
            # Out-of-vocabulary word: log it and fall back to the 'end' vector.
            msg1 = "%s : %s " % (word, e1)
            ct.print(msg1)
            ct.just_log(path + "/wiki.vector.log", msg1)
            v = model['end']
        m_v = ' '.join([str(x) for x in list(v)])
        msg = "%s %s" % (word, m_v)
        ct.just_log(path + "/wiki.vector", msg)
    # Record one extra word: the 'end' padding/terminator token.
    word = 'end'
    v = model[word]
    m_v = ' '.join([str(x) for x in list(v)])
    msg = "%s %s" % (word, m_v)
    ct.just_log(path + "/wiki.vector", msg)

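
# A minimal sketch (not part of the original pipeline) of reading the
# wiki.vector file produced above back into a numpy weight matrix, assuming
# one "word v1 v2 ..." line per vocabulary entry; load_vector_file and
# word2index are hypothetical names.
import numpy as np

def load_vector_file(vec_path, word2index, dim):
    """Build an embedding matrix from the plain-text vector file."""
    weight = np.zeros((len(word2index), dim), dtype=np.float32)
    with open(vec_path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, values = parts[0], parts[1:]
            if word in word2index and len(values) == dim:
                weight[word2index[word]] = np.array(values, dtype=np.float32)
    return weight
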
def main(_):
    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    model = 'ner'
    dh = data_helper.DataClass(model)
    train_batch_size = 1
    # g = dh.batch_iter_char_rnn(train_batch_size)  # (FLAGS.num_seqs, FLAGS.num_steps)
    embedding_weight = dh.embeddings
    # with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
    #     text = f.read()
    # converter = TextConverter(text, FLAGS.max_vocab)
    # converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    #
    # arr = converter.text_to_arr(text)
    # g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    model = CharRNN(dh.converter.vocab_size,  # vocabulary size; all candidates are generated from it
                    num_seqs=train_batch_size,  # FLAGS.num_seqs: number of sentences per batch
                    num_steps=dh.max_document_length,  # FLAGS.num_steps: length of one sentence
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size,
                    embedding_weight=embedding_weight,
                    sampling=False,
                    dh=dh)
    model.train(FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)

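
# A sketch of the tf.app.flags definitions this training entry point relies
# on; the flag names match the FLAGS references above, but every default
# value here is an assumption.
import tensorflow as tf

tf.app.flags.DEFINE_string('name', 'char_rnn', 'model directory name under model/')
tf.app.flags.DEFINE_integer('lstm_size', 128, 'hidden size of each LSTM layer')
tf.app.flags.DEFINE_integer('num_layers', 2, 'number of stacked LSTM layers')
tf.app.flags.DEFINE_float('learning_rate', 0.001, 'optimizer learning rate')
tf.app.flags.DEFINE_float('train_keep_prob', 0.5, 'dropout keep probability')
tf.app.flags.DEFINE_boolean('use_embedding', True, 'use a pre-trained embedding matrix')
tf.app.flags.DEFINE_integer('embedding_size', 128, 'word vector dimension')
tf.app.flags.DEFINE_integer('max_steps', 100000, 'total number of training steps')
tf.app.flags.DEFINE_integer('save_every_n', 1000, 'checkpoint every n steps')
tf.app.flags.DEFINE_integer('log_every_n', 10, 'log every n steps')
FLAGS = tf.app.flags.FLAGS
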
def main():
    with tf.device("/gpu"):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        now = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        # Important: whether to restore the model, the loss part, and the number of relations.
        model = FLAGS.mode
        test_style = True
        ct.print("tf:%s should be 1.2.1 model:%s " % (str(tf.__version__), model))  # 1.2.1
        ct.print("mark:%s " % config.cc_par('mark'), 'mark')
        ct.just_log2("info", now)
        ct.just_log2("result", now)
        ct.just_log2("info", get_config_msg())
        ct.print(get_config_msg(), "mark")
        ct.just_log3(
            "test_check",
            "mode\tid\tglobal_id\tglobal_id_in_origin\tquestion\tentity\tpos\tanswer\tr1\tr2\tr3\n")
        ct.log3(now)
        msg1 = "t_relation_num:%d train_part:%s loss_part:%s" % \
               (config.cc_par('t_relation_num'), config.cc_par('train_part'), config.cc_par('loss_part'))
        ct.print(msg1)
        msg1 = 'restore:%s use_alias_dict:%s' % (
            config.cc_par('restore_model'), config.cc_par('use_alias_dict'))
        ct.print(msg1)
        if config.cc_par('restore_model'):
            ct.print(config.cc_par('restore_path'))

        embedding_weight = None
        error_test_dict = dict()
        valid_test_dict = dict()
        # 1. Read all the data; returns batches of labeled data {data.x, data.label}.
        dh = data_helper.DataClass(model, "test")
        if FLAGS.word_model == "word2vec_train":
            embedding_weight = dh.embeddings

        # 3. Build the LSTM model.
        # loss_type = "pair"
        discriminator = Discriminator(
            max_document_length=dh.max_document_length,  # timesteps
            word_dimension=FLAGS.word_dimension,  # dimension of a single word
            vocab_size=dh.converter.vocab_size,  # embedding_size, i.e. the size of W at embedding time
            rnn_size=FLAGS.rnn_size,  # hidden layer size
            model=model,
            need_cal_attention=config.cc_par('d_need_cal_attention'),
            need_max_pooling=FLAGS.need_max_pooling,
            word_model=FLAGS.word_model,
            embedding_weight=embedding_weight,
            need_gan=True, first=True)
        # generator = Generator(
        #     max_document_length=dh.max_document_length,  # timesteps
        #     word_dimension=FLAGS.word_dimension,  # dimension of a single word
        #     vocab_size=dh.converter.vocab_size,  # embedding_size, i.e. the size of W at embedding time
        #     rnn_size=FLAGS.rnn_size,  # hidden layer size
        #     model=model,
        #     need_cal_attention=config.cc_par('g_need_cal_attention'),  # run without attention
        #     need_max_pooling=FLAGS.need_max_pooling,
        #     word_model=FLAGS.word_model,
        #     embedding_weight=embedding_weight,
        #     need_gan=True, first=False)
        ct.print("max_document_length=%s,vocab_size=%s " %
                 (str(dh.max_document_length), str(dh.converter.vocab_size)))

        # Initialization.
        init = tf.global_variables_initializer()
        merged = tf.summary.merge_all()
        with sess.as_default():
            writer = tf.summary.FileWriter(ct.log_path() + "\\log\\", sess.graph)
            sess.run(init)
            loss_dict = dict()
            loss_dict['loss'] = 0
            loss_dict['pos'] = 0
            loss_dict['neg'] = 0
            # Restore the model if requested.
            if config.cc_par('restore_model'):
                saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)
                save_path = config.cc_par('restore_path')
                ct.print('restore:%s' % save_path, 'model')
                saver.restore(sess, save_path)

            # 1. NER, part 1.
            print('Loading the alias dictionary:')
            dh.bh.stat_dict('../data/nlpcc2016/4-ner/extract_entitys_all.txt')
            dh.bh.init_ner(f_in2='../data/nlpcc2016/4-ner/extract_e/e1.tj.txt')
            print('input:')
            line = '红楼梦的作者是谁?'  # input()  ("Who is the author of Dream of the Red Chamber?")
            _best_p, _best_s = ner_rel_analyisis(dh, discriminator, line, sess)
            # 2. NER LSTM recognition.
            hh_dh = dh
            hh_discriminator = discriminator
            hh_sess = sess
            print(_best_s)
            print(_best_p)
            return hh_dh, hh_discriminator, hh_sess

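
# A hypothetical interactive loop built on the (dh, discriminator, sess)
# triple that main() returns above; ner_rel_analyisis is the same scoring
# routine called inside main(), everything else here is illustrative only.
if __name__ == '__main__':
    hh_dh, hh_discriminator, hh_sess = main()
    while True:
        line = input('question> ')
        if not line:
            break
        best_p, best_s = ner_rel_analyisis(hh_dh, hh_discriminator, line, hh_sess)
        print('subject: %s  predicate: %s' % (best_s, best_p))
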
def main():
    time.sleep(0.5)  # Sleep 0.5 s so the previous process can exit.
    now = "\n\n\n" + str(datetime.datetime.now().isoformat())
    # 'test' is the full run; 'small' is a small sample; 'debug' is a single pass.
    model = FLAGS.mode
    ct.print("tf:%s should be 1.2.1 model:%s " % (str(tf.__version__), model))  # 1.2.1
    ct.just_log2("info", now)
    ct.just_log2("valid", now)
    ct.just_log2("test", now)
    ct.just_log2("info", get_config_msg())
    ct.log3(now)
    embedding_weight = None
    error_test_dict = dict()
    valid_test_dict = dict()
    # 1. Read all the data; returns batches of labeled data {data.x, data.label}.
    dh = data_helper.DataClass(model)
    if FLAGS.word_model == "word2vec_train":
        embedding_weight = dh.embeddings
    # 3. Build the LSTM model.
    ct.print("max_document_length=%s,vocab_size=%s " %
             (str(dh.max_document_length), str(dh.converter.vocab_size)))
    lstm = mynn.CustomNetwork(
        max_document_length=dh.max_document_length,  # timesteps
        word_dimension=FLAGS.word_dimension,  # dimension of a single word
        vocab_size=dh.converter.vocab_size,  # embedding_size, i.e. the size of W at embedding time
        rnn_size=FLAGS.rnn_size,  # hidden layer size
        model=model,
        need_cal_attention=FLAGS.need_cal_attention,
        need_max_pooling=FLAGS.need_max_pooling,
        word_model=FLAGS.word_model,
        embedding_weight=embedding_weight,
        need_gan=False)

    # 4. ----------------------------------- Set up the loss -----------------------------------
    global_step = tf.Variable(0, name="global_step", trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(lstm.loss, tvars), FLAGS.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(1e-1)
    train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)

    # Initialization.
    init = tf.global_variables_initializer()
    merged = tf.summary.merge_all()
    with tf.Session().as_default() as sess:
        writer = tf.summary.FileWriter("log/", sess.graph)
        sess.run(init)
        embeddings = []
        use_error = False
        error_test_q_list = []
        error_test_pos_r_list = []
        error_test_neg_r_list = []
        # Dump all training and test questions (for inspection).
        # dh.build_train_test_q()

        train_step = 0
        max_acc = 0
        for step in range(FLAGS.epoches):
            toggle_line = ">>>>>>>>>>>>>>>>>>>>>>>>>step=%d,total_train_step=%d " % (
                step, len(dh.q_neg_r_tuple))
            ct.log3(toggle_line)
            ct.just_log2("info", toggle_line)
            # Prepare the data.
            my_generator = ''
            if FLAGS.fix_model and len(error_test_q_list) != 0:
                my_generator = dh.batch_iter_wq_debug_fix_model(
                    error_test_q_list, error_test_pos_r_list,
                    error_test_neg_r_list, FLAGS.batch_size)
                use_error = True
                toggle_line = "\n\n\n\n\n------------------use_error to train"
                ct.log3(toggle_line)
                ct.just_log2("info", toggle_line)
                ct.just_log2("valid", 'use_error to train')
                ct.just_log2("test", 'use_error to train')
            elif ct.is_debug_few():
                toggle_line = "\n------------------is_debug_few to train"
                ct.log3(toggle_line)
                ct.just_log2("info", toggle_line)
                train_part = config.cc_par('train_part')
                model = 'train'
                # For relations, generate the questions on the fly; otherwise read them.
                shuffle_indices = get_shuffle_indices_train(
                    len(dh.q_neg_r_tuple_train), step, train_part, model, train_step)
                if train_part == 'relation':
                    my_generator = dh.batch_iter_wq_debug(
                        dh.train_question_list_index,
                        dh.train_relation_list_index,
                        shuffle_indices, FLAGS.batch_size, train_part)
                else:
                    my_generator = dh.batch_iter_wq_debug(
                        dh.train_question_list_index,
                        dh.train_answer_list_index,
                        shuffle_indices, FLAGS.batch_size, train_part)
            else:  # Unused path.
                train_q, train_cand, train_neg = \
                    dh.batch_iter_wq(dh.train_question_list_index,
                                     dh.train_relation_list_index,
                                     FLAGS.batch_size)
            toggle_line = "\n==============================train_step=%d\n" % train_step
            ct.just_log2("info", toggle_line)
            ct.log3(toggle_line)
            # Train on the data.
            for gen in my_generator:
                toggle_line = "\n==============================train_step=%d\n" % train_step
                ct.just_log2("info", toggle_line)
                ct.log3(toggle_line)
                if not use_error:
                    train_step += 1
                train_q = gen[0]
                train_cand = gen[1]
                train_neg = gen[2]
                run_step2(sess, lstm, step, train_step, train_op, train_q,
                          train_cand, train_neg, merged, writer, dh, use_error)
                if use_error:
                    continue

                # ------------------------- test
                # 1. Source data: training data OR validation data OR test data.
                # Validate.
                valid_test_dict, error_test_dict, max_acc, all_right, \
                    error_test_q_list, error_test_pos_r_list, error_test_neg_r_list \
                    = valid_test_checkpoint(train_step, dh, step, sess, lstm, merged,
                                            writer, train_op, valid_test_dict,
                                            error_test_dict, max_acc)
                if config.cc_par('keep_run') and all_right and step > 2:
                    del lstm  # Release resources.
                    del sess
                    return True
            if use_error:
                error_test_q_list.clear()
                error_test_pos_r_list.clear()
                error_test_neg_r_list.clear()
                use_error = False
            toggle_line = "<<<<<<<<<<<<<<<<<<<<<<<<<<<<step=%d\n" % step
            # ct.just_log2("test", toggle_line)
            ct.just_log2("info", toggle_line)
            ct.log3(toggle_line)

def build_all_q_r_tuple():
    dh = data_helper.DataClass("sq")
    dh.build_all_q_r_tuple(99999, 9999999, is_record=True)

def main(_):
    # prepare_data()
    # FLAGS.start_string = FLAGS.start_string.decode('utf-8')
    # converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    model = 'ner'
    dh = data_helper.DataClass(model)
    train_batch_size = 1
    # g = dh.batch_iter_char_rnn(train_batch_size)  # (FLAGS.num_seqs, FLAGS.num_steps)
    embedding_weight = dh.embeddings
    model = CharRNN(dh.converter.vocab_size,  # vocabulary size; all candidates are generated from it
                    num_seqs=train_batch_size,  # FLAGS.num_seqs: number of sentences per batch
                    num_steps=dh.max_document_length,  # FLAGS.num_steps: length of one sentence
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size,
                    embedding_weight=embedding_weight,
                    sampling=True,
                    dh=dh)
    model.load(FLAGS.checkpoint_path)

    # cs = []
    # cs.append('♠是什么类型的产品')
    # cs.append('♠是谁')
    # cs.append('♠是哪个公司的长度')
    f1 = '../data/nlpcc2016/6-answer/q.rdf.ms.re.v1.txt'
    f3 = '../data/nlpcc2016/4-ner/extract_entitys_all_tj.v1.txt'
    f4 = '../data/nlpcc2016/4-ner/extract_entitys_all_tj.sort_by_ner_lstm.v1.txt'
    f1s = ct.file_read_all_lines_strip(f1)
    f3s = ct.file_read_all_lines_strip(f3)
    f1s_new = []
    f3s_new = []
    for i in range(len(f1s)):
        # Filter out NULL lines.
        # if str(f1s[i]).__contains__('NULL'):
        #     continue
        f1s_new.append(f1s[i])
        f3s_new.append(f3s[i])

    # For each question, substitute in every candidate entity and judge it.
    # cs.append('立建候时么什是♠')
    # Read all candidate entities, score them, and take the top 3 to check accuracy.
    f4s = []
    _index = -1
    for l1 in f1s_new:  # Iterate over the questions.
        _index += 1
        replace_qs = []
        for l3 in f3s_new[_index].split('\t'):
            q_1 = str(l1).split('\t')[0].replace(l3, '♠')
            replace_qs.append((q_1, l3))
        entitys = []
        for content, l3 in replace_qs:
            # content = input("input:")
            start = dh.convert_str_to_indexlist_2(content, False)
            # arr = model.sample(FLAGS.max_length, start, dh.converter.vocab_size, dh.get_padding_num())
            r1, score_list = model.judge(start, dh.converter.vocab_size)
            entitys.append((l3, r1))
            # print(content)
            # print(r1)
            # print(score_list)
            ct.print("%s\t%s\t%s" % (content, l3, r1), 'debug_process')
        entitys.sort(key=lambda x: x[1])
        entitys_new = [x[0] for x in entitys]
        ct.print('\t'.join(entitys_new))
        f4s.append('\t'.join(entitys_new))
    ct.file_wirte_list(f4, f4s)
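
# A toy illustration of the replace-and-rank step above: each candidate
# entity is masked with '♠' and the masked question is scored; the dummy
# score_fn stands in for model.judge, so the ranking printed below only
# demonstrates the mechanics, not real model scores.
def rank_candidates(question, candidates, score_fn):
    scored = []
    for cand in candidates:
        masked = question.replace(cand, '♠')
        scored.append((cand, score_fn(masked)))
    scored.sort(key=lambda x: x[1])  # ascending, matching entitys.sort above
    return [cand for cand, _ in scored]

# Example with a dummy length-based scorer:
print(rank_candidates('红楼梦的作者是谁?', ['红楼梦', '作者'], lambda q: len(q)))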