def sentence2vec(w2v_model, s, max_length):
    if isinstance(s, str):
        words = word_tokenize(remove_punc(s.lower()))
    else:
        words = s
    vec = []
    if len(words) > max_length:
        words = words[:max_length]
    for word in words:
        if word in w2v_model.wv.vocab:
            vec.append(w2v_model.wv[word])
    # Guard against sentences with no in-vocabulary words: fall back to the
    # model's vector size so the zero-padding below still works.
    dim = len(vec[0]) if vec else w2v_model.vector_size
    # print("dim", dim)
    print("len(vec)", len(vec))
    for i in range(max_length - len(vec)):
        vec.append(np.zeros(dim))
    return np.array(vec)
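
# --- Usage sketch (not part of the original pipeline) --------------------------
# A minimal example of how sentence2vec pads/truncates a token list into a fixed
# (max_length, embedding_dim) matrix. It assumes a gensim 3.x Word2Vec model (the
# `wv.vocab` lookup above is the pre-4.0 API); the toy corpus and max_length
# below are illustrative assumptions only.
def _sentence2vec_usage_example():
    from gensim.models import Word2Vec
    toy_model = Word2Vec([["what", "is", "negative", "sampling"]],
                         size=100, min_count=1)
    mat = sentence2vec(toy_model, ["what", "is", "negative", "sampling"], 20)
    print(mat.shape)  # -> (20, 100): in-vocab word vectors first, zero rows as padding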
def get_train_data(data_type, w2v_model, qa_file, doc_file, to_file_path, args):
    logger.info("preprocessing...")
    ns_amount = args.ns_amount
    questions = []
    answers = []
    # compute a vector for each question
    input_length = 0
    with open(qa_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "" and i % 2 == 0:
                words = word_tokenize(remove_punc(line))
                input_length = max(len(words), input_length)
                questions.append(words)
            elif line != "" and i % 2 == 1:
                arr = line.strip().split(" ")
                ans = []
                for a in arr:
                    if a != "":
                        ans.append(int(a) - 1)  # the original data is 1-indexed; subtract 1 to make it 0-indexed.
                answers.append(ans)
    question_vecs = []
    for q_words in questions:
        question_vecs.append(sentence2vec(w2v_model, q_words, input_length))
    print("len(question_vecs)", len(question_vecs))
    # compute a vector for each document
    docs = []
    output_length = 0
    with open(doc_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "":
                words = word_tokenize(remove_punc(line))
                output_length = max(len(words), output_length)
                docs.append(words)
    doc_vecs = []
    output_length = args.output_length
    for d_words in docs:
        doc_vecs.append(sentence2vec(w2v_model, d_words, output_length))
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" % (input_length, output_length))
    # count how often each doc appears as an answer
    doc_count = {}
    for ans in answers:
        for a in ans:
            if a in doc_count.keys():
                doc_count[a] += 1
            else:
                doc_count[a] = 1
    # compute a weight for each doc
    doc_weight = {}
    t_max = 0
    for k in doc_count.keys():
        t_max = max(t_max, doc_count[k])
    for k in doc_count.keys():
        doc_weight[k] = doc_count[k] / t_max
    total = len(question_vecs)
    train_num = int(total * 0.9)
    logger.info("train_num:%d, total:%d" % (train_num, total))
    # shuffle the data
    qa_index = list(range(total))
    random.shuffle(qa_index)
    step = 0
    while step * 200 <= train_num:
        # [q_encoder_input, r_decoder_input, w_decoder_input, weight_data_r, weight_data_w]
        q_encoder_input = []
        r_decoder_input = []
        w_decoder_input = []
        weight_data_r = []
        weight_data_w = []
        y_data = []
        qid_list = []
        label_list = []
        aid_list = []
        logger.info("step: %d" % step)
        end = min(train_num, (step + 1) * 200)
        for ss in range(step * 200, end):
            i = qa_index[ss]
            logger.info("question: %d" % i)
            qid_list.append(i)
            label_list.append(1)
            y = [1] + [0] * ns_amount
            y_data.append(y)
            # question
            q_encoder_input.append(question_vecs[i])
            # one correct answer per question
            aid = answers[i][0]
            aid_list.append(aid)
            r_decoder_input.append(doc_vecs[aid])
            weight_data_r.append(doc_weight[aid])
            # 10 unrelated answers
            aids = get_randoms(list(doc_weight.keys()), [aid], 10)
            w_decoder = []
            w_weight = []
            for aid in aids:
                w_decoder.append(doc_vecs[aid])
                w_weight.append(doc_weight[aid])
            w_decoder = np.array(w_decoder).reshape(output_length, args.input_dim, ns_amount)
            w_weight = np.array(w_weight).reshape((1, ns_amount))
            w_decoder_input.append(w_decoder)
            weight_data_w.append(w_weight)
            for aaid in aids:
                qid_list.append(i)
                label_list.append(0)
                aid_list.append(aaid)
                # these answers are all unrelated
                y = [0] * (1 + ns_amount)
                y_data.append(y)
                # question
                q_encoder_input.append(question_vecs[i])
                r_decoder_input.append(doc_vecs[aaid])
                weight_data_r.append(doc_weight[aaid])
                # 10 unrelated answers
                aids = get_randoms(list(doc_weight.keys()), [aid], 10)
                w_decoder = []
                w_weight = []
                for aid in aids:
                    w_decoder.append(doc_vecs[aid])
                    w_weight.append(doc_weight[aid])
                w_decoder = np.array(w_decoder).reshape(output_length, args.input_dim, ns_amount)
                w_weight = np.array(w_weight).reshape((1, ns_amount))
                w_decoder_input.append(w_decoder)
                weight_data_w.append(w_weight)

        logger.info("loading weights: ckpt/nn_weights_%s.h5" % data_type)
        model = negative_samples(input_length=input_length,
                                 input_dim=args.input_dim,
                                 output_length=output_length,
                                 output_dim=args.output_dim,
                                 hidden_dim=args.hidden_dim,
                                 ns_amount=ns_amount,
                                 learning_rate=args.learning_rate,
                                 drop_rate=args.drop_rate)
        model.load_weights("ckpt/nn_weights_%s.h5" % data_type)
        new_dnn_model = Model(inputs=model.input,
                              outputs=model.get_layer('dropout_con').output)
        logger.info("predicting...")
        res = new_dnn_model.predict([
            q_encoder_input, r_decoder_input, w_decoder_input, weight_data_r,
            weight_data_w
        ])
        # print(res)
        with open(to_file_path, "a") as f:
            for i in range(len(res)):
                row = res[i]
                feature_str = ''
                for j in range(len(row)):
                    feature_str = feature_str + (" %d:%.9f" % (j + 1, row[j]))
                label = label_list[i]
                id = qid_list[i]
                doc_id = aid_list[i]
                line = "%d qid:%d%s # doc-%d \n" % (label, id, feature_str, doc_id)
                f.write(line)
        print("saved to:", to_file_path)
        logger.info("step:%d added" % step)
        step += 1
    logger.info("saved to: %s" % to_file_path)
def get_train_data(data_type, w2v_model, qa_file, doc_file, to_file_path, args, step=0):
    logger.info("preprocessing...")
    questions = []
    answers = []
    # compute a vector for each question
    input_length = 0
    with open(qa_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "" and i % 2 == 0:
                words = word_tokenize(remove_punc(line))
                input_length = max(len(words), input_length)
                questions.append(words)
            elif line != "" and i % 2 == 1:
                arr = line.strip().split(" ")
                ans = []
                for a in arr:
                    if a != "":
                        ans.append(int(a) - 1)  # the original data is 1-indexed; subtract 1 to make it 0-indexed.
                answers.append(ans)
    input_length = args.input_length
    question_vecs = []
    for q_words in questions:
        question_vecs.append(sentence2vec(w2v_model, q_words, input_length))
    print("len(question_vecs)", len(question_vecs))
    # compute a vector for each document
    docs = []
    output_length = 0
    with open(doc_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "":
                words = word_tokenize(remove_punc(line))
                output_length = max(len(words), output_length)
                docs.append(words)
    doc_vecs = []
    output_length = args.output_length
    for d_words in docs:
        doc_vecs.append(sentence2vec(w2v_model, d_words, output_length))
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" % (input_length, output_length))
    # count how often each doc appears as an answer
    doc_count = {}
    for ans in answers:
        for a in ans:
            if a in doc_count.keys():
                doc_count[a] += 1
            else:
                doc_count[a] = 1
    # compute a weight for each doc
    doc_weight = {}
    t_max = 0
    for k in doc_count.keys():
        t_max = max(t_max, doc_count[k])
    for k in doc_count.keys():
        doc_weight[k] = doc_count[k] / t_max
    model = DNN(args.input_lenth,
                args.input_dim,
                filters_num=args.filters_num,
                kernel_val=args.kernel_val,
                pool_s=args.pool_s,
                pool_stride=args.pool_stride,
                hidden1_dim=args.hidden1_dim,
                hidden2_dim=args.hidden2_dim,
                activation=args.activation)
    model.load_weights("ckpt/dnn_weights_v2_%s.h5" % data_type)
    new_dnn_model = Model(inputs=model.input,
                          outputs=model.get_layer('hidden_layer').output)
    total = len(question_vecs)
    train_num = int(total * 0.9)
    for i in range(train_num):
        q_encoder_input = []
        r_decoder_input = []
        label_list = []
        aid_list = []
        logger.info("get all documents for question: %d" % i)
        print("get all documents for question: %d" % i)
        # qid_list.append(i)
        # label_list.append(1)
        cur_answers = answers[i]
        doc_list_ordered = [a for a in cur_answers]
        for aid in list(doc_weight.keys()):
            if aid not in doc_list_ordered:
                doc_list_ordered.append(aid)
        print("len(doc_list_ordered):", len(doc_list_ordered))
        print("len(cur_answers):", len(cur_answers))
        for aid in doc_list_ordered:
            aid_list.append(aid)
            if aid in cur_answers:
                label_list.append(1)
            else:
                label_list.append(0)
            # question
            q_encoder_input.append(question_vecs[i])
            r_decoder_input.append(doc_vecs[aid])
        logger.info("predicting question: %d" % i)
        print("predicting question: %d" % i)
        res = new_dnn_model.predict([q_encoder_input, r_decoder_input])
        # print(res)
        with open(to_file_path, "a") as f:
            for j in range(len(res)):
                row = res[j]
                feature_str = ''
                for k in range(len(row)):
                    feature_str = feature_str + (" %d:%.9f" % (k + 1, row[k]))
                label = label_list[j]
                doc_id = aid_list[j]
                line = "%d qid:%d%s # doc-%d \n" % (label, i, feature_str, doc_id)
                f.write(line)
        print("saved to:", to_file_path)
    logger.info("total:%d" % total)
    logger.info("saved to: %s" % to_file_path)
def train(w2v_model, qa_file, doc_file, to_model_file, to_ckpt_file, args):
    logger.info("preprocessing...")
    ns_amount = args.ns_amount
    questions = []
    answers = []
    # question vector
    input_length = 0
    with open(qa_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "" and i % 2 == 0:
                words = word_tokenize(remove_punc(line))
                input_length = max(len(words), input_length)
                questions.append(words)
            elif line != "" and i % 2 == 1:
                arr = line.strip().split(" ")
                ans = []
                for a in arr:
                    if a != "":
                        ans.append(int(a) - 1)  # the index starts from 1 in the QA_list file, make it start from 0.
                answers.append(ans)
    question_vecs = []
    for q_words in questions:
        question_vecs.append(sentence2vec(w2v_model, q_words, input_length))
    print("len(question_vecs)", len(question_vecs))
    # document vector
    docs = []
    output_length = 0
    with open(doc_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "":
                words = word_tokenize(remove_punc(line))
                output_length = max(len(words), output_length)
                docs.append(words)
    doc_vecs = []
    output_length = args.output_length
    for d_words in docs:
        doc_vecs.append(sentence2vec(w2v_model, d_words, output_length))
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" % (input_length, output_length))
    # weights for each doc
    doc_count = {}
    for ans in answers:
        for a in ans:
            if a in doc_count.keys():
                doc_count[a] += 1
            else:
                doc_count[a] = 1
    doc_weight = {}
    t_max = 0
    for k in doc_count.keys():
        t_max = max(t_max, doc_count[k])
    for k in doc_count.keys():
        doc_weight[k] = doc_count[k] / t_max
    # [q_encoder_input, r_decoder_input, w_decoder_input, weight_data_r, weight_data_w]
    q_encoder_input = []
    r_decoder_input = []
    w_decoder_input = []
    weight_data_r = []
    weight_data_w = []
    y_data = []
    total = len(question_vecs)
    qa_index = list(range(total))
    random.shuffle(qa_index)
    for i in qa_index:
        y = [1] + [0] * ns_amount
        y_data.append(y)
        # question
        q_encoder_input.append(question_vecs[i])
        aid = answers[i][0]
        r_decoder_input.append(doc_vecs[aid])
        weight_data_r.append(doc_weight[aid])
        aids = get_randoms(list(doc_weight.keys()), [aid], 10)
        w_decoder = []
        w_weight = []
        for aid in aids:
            w_decoder.append(doc_vecs[aid])
            w_weight.append(doc_weight[aid])
        w_decoder = np.array(w_decoder).reshape(output_length, args.input_dim, ns_amount)
        w_weight = np.array(w_weight).reshape((1, ns_amount))
        w_decoder_input.append(w_decoder)
        weight_data_w.append(w_weight)
    y_data = np.array(y_data).reshape(total, (1 + ns_amount))
    train_num = int(total * 0.9)
    model = negative_samples(input_length=input_length,
                             input_dim=args.input_dim,
                             output_length=output_length,
                             output_dim=args.output_dim,
                             hidden_dim=args.hidden_dim,
                             ns_amount=ns_amount,
                             learning_rate=args.learning_rate,
                             drop_rate=args.drop_rate)
    print(model.summary())
    print("start training...")
    logger.info("start training...")
    model.fit(
        [
            q_encoder_input[:train_num], r_decoder_input[:train_num],
            w_decoder_input[:train_num], weight_data_r[:train_num],
            weight_data_w[:train_num]
        ],
        y_data[:train_num],
        batch_size=args.batch_size,
        epochs=args.epochs,
        verbose=1,
        validation_data=([
            q_encoder_input[train_num:], r_decoder_input[train_num:],
            w_decoder_input[train_num:], weight_data_r[train_num:],
            weight_data_w[train_num:]
        ], y_data[train_num:]))
    res = model.evaluate(
        [
            q_encoder_input[train_num:], r_decoder_input[train_num:],
            w_decoder_input[train_num:], weight_data_r[train_num:],
            weight_data_w[train_num:]
        ],
        y_data[train_num:],
        verbose=1)
    print("training over.")
    logger.info("training over")
    print(model.metrics_names)
    print(res)
    print(model.summary())
    model.save(to_model_file)
    print("saved model to:", to_model_file)
    model.save_weights(to_ckpt_file)
    print("saved weights to:", to_ckpt_file)
if __name__ == '__main__':
    qa_path = "%s/QA_list.txt" % "twitter"
    qa_list = []
    # read questions and answers
    input_length = 0
    qid = 0
    with open(qa_path, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "" and i % 2 == 0:
                words = word_tokenize(remove_punc(line))
                input_length = max(len(words), input_length)
            elif line != "" and i % 2 == 1:
                arr = line.strip().split(" ")
                ans = []
                for a in arr:
                    if a != "":
                        ans.append(int(a) - 1)  # the original data is 1-indexed; subtract 1 to make it 0-indexed.
                qa_list.append({"qid": qid, "question": words, "answers": ans})
                qid += 1
    qa_index = list(range(len(qa_list)))
    random.shuffle(qa_index)
    parser = argparse.ArgumentParser(description='Test for argparse')
    parser.add_argument('--data_type', help='data_type',
def get_train_data(data_type, w2v_model, qa_file, doc_file, to_file_path, args, step=0):
    logger.info("preprocessing...")
    questions = []
    answers = []
    # compute a vector for each question
    input_length = 0
    with open(qa_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            if i >= 2000:  # the data is too large to fit in memory
                break
            line = line.strip().lower()
            if line != "" and i % 2 == 0:
                words = word_tokenize(remove_punc(line))
                input_length = max(len(words), input_length)
                questions.append(words)
            elif line != "" and i % 2 == 1:
                arr = line.strip().split(" ")
                ans = []
                for a in arr:
                    if a != "":
                        ans.append(int(a) - 1)  # the original data is 1-indexed; subtract 1 to make it 0-indexed.
                answers.append(ans)
    input_length = args.input_length
    question_vecs = []
    for q_words in questions:
        question_vecs.append(sentence2vec(w2v_model, q_words, input_length))
    print("len(question_vecs)", len(question_vecs))
    # compute a vector for each document
    docs = []
    output_length = 0
    with open(doc_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "":
                words = word_tokenize(remove_punc(line))
                output_length = max(len(words), output_length)
                docs.append(words)
    doc_vecs = []
    output_length = args.output_length
    for d_words in docs:
        doc_vecs.append(sentence2vec(w2v_model, d_words, output_length))
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" % (input_length, output_length))
    # count how often each doc appears as an answer
    doc_count = {}
    for ans in answers:
        for a in ans:
            if a in doc_count.keys():
                doc_count[a] += 1
            else:
                doc_count[a] = 1
    # compute a weight for each doc
    doc_weight = {}
    t_max = 0
    for k in doc_count.keys():
        t_max = max(t_max, doc_count[k])
    for k in doc_count.keys():
        doc_weight[k] = doc_count[k] / t_max
    total = len(question_vecs)
    train_num = int(total * 0.9)
    logger.info("train_num:%d, total:%d" % (train_num, total))
    # shuffle the data
    qa_index = list(range(total))
    random.shuffle(qa_index)
    # [q_encoder_input, r_decoder_input, w_decoder_input, weight_data_r, weight_data_w]
    q_encoder_input = []
    r_decoder_input = []
    y_data = []
    label_list = []
    qid_list = []
    aid_list = []
    total = len(question_vecs)
    for i in range(total):
        # question
        q_encoder_input.append(question_vecs[i])
        qid_list.append(i)
        # one correct answer per question
        aid = answers[i][0]
        r_decoder_input.append(doc_vecs[aid])
        y_data.append([1, 0])
        label_list.append(1)
        aid_list.append(aid)
        # 10 unrelated answers
        aids = get_randoms(list(doc_weight.keys()), [aid], 10)
        for aaid in aids:
            q_encoder_input.append(question_vecs[i])
            r_decoder_input.append(doc_vecs[aaid])
            y_data.append([0, 1])
            label_list.append(0)
            qid_list.append(i)
            aid_list.append(aaid)
    y_data = np.array(y_data)
    train_num = int(total * 0.9) * 11
    model = DNN(args.input_lenth,
                args.input_dim,
                filters_num=args.filters_num,
                kernel_val=args.kernel_val,
                pool_s=args.pool_s,
                pool_stride=args.pool_stride,
                hidden1_dim=args.hidden1_dim,
                hidden2_dim=args.hidden2_dim,
                activation=args.activation)
    model.load_weights("ckpt/dnn_weights_v2_%s.h5" % data_type)
    new_dnn_model = Model(inputs=model.input,
                          outputs=model.get_layer('hidden_layer').output)
    logger.info("predicting...")
    res = new_dnn_model.predict(
        [r_decoder_input[:train_num], q_encoder_input[:train_num]])
    print("len(res)", len(res))
    print("train_num", train_num)
    print("len(r_decoder_input[:train_num])", len(r_decoder_input[:train_num]))
    with open(to_file_path, "w") as f:
        for i in range(len(res)):
            row = res[i]
            feature_str = ''
            for j in range(len(row)):
                feature_str = feature_str + (" %d:%.9f" % (j + 1, row[j]))
            label = label_list[i]
            id = qid_list[i]
            doc_id = aid_list[i]
            line = "%d qid:%d%s # doc-%d \n" % (label, id, feature_str, doc_id)
            f.write(line)
    print("saved to:", to_file_path)
    logger.info("total:%d" % total)
    logger.info("saved to: %s" % to_file_path)
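
# --- Helper assumption (sketch only) --------------------------------------------
# get_randoms() is defined elsewhere in this repo. Judging from how it is called
# above -- get_randoms(candidates, ids_to_exclude, amount) -- it is assumed to
# draw `amount` distinct ids from `candidates` while skipping the excluded ones.
# The function below is only an illustrative stand-in, not the repo's code.
def _get_randoms_sketch(candidates, exclude, amount):
    import random
    pool = [c for c in candidates if c not in exclude]
    return random.sample(pool, amount)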
def get_train_data(data_type, w2v_model, ckpt_path, qa_file, doc_file, to_file_path, args, step=0):
    if os.path.exists(to_file_path):
        logger.info("file exists: %s" % to_file_path)
        return
    logger.info("preprocessing...")
    ns_amount = 10
    questions = []
    answers = []
    # compute a vector for each question
    input_length = 0
    with open(qa_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            if i >= 2000:
                break
            line = line.strip().lower()
            if line != "" and i % 2 == 0:
                words = word_tokenize(remove_punc(line))
                input_length = max(len(words), input_length)
                questions.append(words)
            elif line != "" and i % 2 == 1:
                arr = line.strip().split(" ")
                ans = []
                for a in arr:
                    if a != "":
                        ans.append(int(a) - 1)  # the original data is 1-indexed; subtract 1 to make it 0-indexed.
                answers.append(ans)
    question_vecs = []
    for q_words in questions:
        question_vecs.append(sentence2vec(w2v_model, q_words, input_length))
    print("len(question_vecs)", len(question_vecs))
    # compute a vector for each document
    docs = []
    output_length = 0
    with open(doc_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "":
                words = word_tokenize(remove_punc(line))
                output_length = max(len(words), output_length)
                docs.append(words)
    doc_vecs = []
    output_length = 1000
    for d_words in docs:
        doc_vecs.append(sentence2vec(w2v_model, d_words, output_length))
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" % (input_length, output_length))
    # count how often each doc appears as an answer
    doc_count = {}
    for ii in range(len(docs)):
        doc_count[ii] = 0
    for ans in answers:
        for a in ans:
            if a in doc_count.keys():
                doc_count[a] += 1
    # compute a weight for each doc
    doc_weight = {}
    t_max = 0
    for k in doc_count.keys():
        t_max = max(t_max, doc_count[k])
    for k in doc_count.keys():
        doc_weight[k] = doc_count[k] / t_max
    logger.info("loading weights...")
    model = negative_samples(input_length=input_length,
                             input_dim=args.input_dim,
                             output_length=output_length,
                             output_dim=args.output_dim,
                             hidden_dim=args.hidden_dim,
                             ns_amount=ns_amount,
                             learning_rate=args.learning_rate,
                             drop_rate=args.drop_rate)
    model.load_weights(ckpt_path)
    new_dnn_model = Model(inputs=model.input,
                          outputs=model.get_layer('dropout_con').output)
    total = len(question_vecs)
    train_num = int(total * 0.9)
    qid_list = []
    # shuffle the data
    qa_index = list(range(total))
    # random.shuffle(qa_index)
    for ss in range(train_num, total):
        i = qa_index[ss]
        # [q_encoder_input, r_decoder_input, w_decoder_input, weight_data_r, weight_data_w]
        q_encoder_input = []
        r_decoder_input = []
        w_decoder_input = []
        weight_data_r = []
        weight_data_w = []
        logger.info("get all documents for question: %d" % i)
        print("get all documents for question: %d" % i)
        # qid_list.append(i)
        # label_list.append(1)
        cur_answers = answers[i]
        doc_list_ordered = [a for a in cur_answers]
        for aid in list(doc_weight.keys()):
            if aid not in doc_list_ordered:
                doc_list_ordered.append(aid)
        label_list = []
        aid_list = []
        print("len(doc_list_ordered):", len(doc_list_ordered))
        print("len(cur_answers):", len(cur_answers))
        for aid in doc_list_ordered:
            aid_list.append(aid)
            if aid in cur_answers:
                label_list.append(1)
            else:
                label_list.append(0)
            # question
            q_encoder_input.append(question_vecs[i])
            r_decoder_input.append(doc_vecs[aid])
            weight_data_r.append(doc_weight[aid])
            # 10 unrelated answers
            aids = get_randoms(list(doc_weight.keys()), cur_answers, ns_amount)
            w_decoder = []
            w_weight = []
            for aid in aids:
                w_decoder.append(doc_vecs[aid])
                w_weight.append(doc_weight[aid])
            w_decoder = np.array(w_decoder).reshape(output_length, args.input_dim, ns_amount)
            w_weight = np.array(w_weight).reshape((1, ns_amount))
            w_decoder_input.append(w_decoder)
            weight_data_w.append(w_weight)
        logger.info("now:%d , predicting question: %d" % (ss, i))
        print("now:%d , predicting question: %d" % (ss, i))
        start = 0
        end = len(q_encoder_input)
        for cur in range(0, end, 1000):
            print("cur:%d / %d" % (cur, end))
            a = q_encoder_input[cur:cur + 1000]
            b = r_decoder_input[cur:cur + 1000]
            c = w_decoder_input[cur:cur + 1000]
            d = weight_data_r[cur:cur + 1000]
            e = weight_data_w[cur:cur + 1000]
            res = new_dnn_model.predict([a, b, c, d, e])
            # print(res)
            with open(to_file_path, "a") as f:
                for j in range(len(res)):
                    row = res[j]
                    feature_str = ''
                    for k in range(len(row)):
                        feature_str = feature_str + (" %d:%.9f" % (k + 1, row[k]))
                    # index into the full per-question lists, offset by the chunk start
                    label = label_list[cur + j]
                    doc_id = aid_list[cur + j]
                    line = "%d qid:%d%s # doc-%d \n" % (label, i, feature_str, doc_id)
                    f.write(line)
        print("saved to:", to_file_path)
    logger.info("total:%d" % total)
    logger.info("saved to: %s" % to_file_path)
def train(w2v_model, qa_file, doc_file, to_model_file, to_ckpt_file, args):
    logger.info("preprocessing...")
    questions = []
    answers = []
    # compute a vector for each question
    input_length = 0
    with open(qa_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            if i >= 2000:  # the data is too large to fit in memory
                break
            line = line.strip().lower()
            if line != "" and i % 2 == 0:
                words = word_tokenize(remove_punc(line))
                input_length = max(len(words), input_length)
                questions.append(words)
            elif line != "" and i % 2 == 1:
                arr = line.strip().split(" ")
                ans = []
                for a in arr:
                    if a != "":
                        ans.append(int(a) - 1)  # the original data is 1-indexed; subtract 1 to make it 0-indexed.
                answers.append(ans)
    input_length = args.input_length
    question_vecs = []
    for q_words in questions:
        question_vecs.append(sentence2vec(w2v_model, q_words, input_length))
    print("len(question_vecs)", len(question_vecs))
    # compute a vector for each document
    docs = []
    output_length = 0
    with open(doc_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "":
                words = word_tokenize(remove_punc(line))
                output_length = max(len(words), output_length)
                docs.append(words)
    doc_vecs = []
    output_length = args.output_length
    for d_words in docs:
        doc_vecs.append(sentence2vec(w2v_model, d_words, output_length))
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" % (input_length, output_length))
    # count how often each doc appears as an answer
    doc_count = {}
    for ans in answers:
        for a in ans:
            if a in doc_count.keys():
                doc_count[a] += 1
            else:
                doc_count[a] = 1
    # compute a weight for each doc
    doc_weight = {}
    t_max = 0
    for k in doc_count.keys():
        t_max = max(t_max, doc_count[k])
    for k in doc_count.keys():
        doc_weight[k] = doc_count[k] / t_max
    # [q_encoder_input, r_decoder_input, w_decoder_input, weight_data_r, weight_data_w]
    q_encoder_input = []
    r_decoder_input = []
    y_data = []
    total = len(question_vecs)
    for i in range(total):
        # question
        q_encoder_input.append(question_vecs[i])
        # one correct answer per question
        aid = answers[i][0]
        r_decoder_input.append(doc_vecs[aid])
        y_data.append([1, 0])
        # 10 unrelated answers
        aids = get_randoms(list(doc_weight.keys()), [aid], 10)
        for aaid in aids:
            q_encoder_input.append(question_vecs[i])
            r_decoder_input.append(doc_vecs[aaid])
            y_data.append([0, 1])
    y_data = np.array(y_data)
    train_num = int(total * 0.9) * 11
    model = DNN(args.input_lenth,
                args.input_dim,
                filters_num=args.filters_num,
                kernel_val=args.kernel_val,
                pool_s=args.pool_s,
                pool_stride=args.pool_stride,
                hidden1_dim=args.hidden1_dim,
                hidden2_dim=args.hidden2_dim,
                activation=args.activation)
    print("start training...")
    logger.info("start training...")
    model.fit([r_decoder_input[:train_num], q_encoder_input[:train_num]],
              y_data[:train_num],
              batch_size=args.batch_size,
              epochs=args.epochs,
              verbose=1,
              validation_data=([
                  r_decoder_input[train_num:], q_encoder_input[train_num:]
              ], y_data[train_num:]))
    res = model.evaluate(
        [r_decoder_input[train_num:], q_encoder_input[train_num:]],
        y_data[train_num:],
        verbose=1)
    print("training over.")
    logger.info("training over")
    print(model.metrics_names)
    print(res)
    print(model.summary())
    model.save(to_model_file)
    print("saved model to:", to_model_file)
    model.save_weights(to_ckpt_file)
    print("saved weights to:", to_ckpt_file)
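
# --- Note on the train/validation split above (illustration) --------------------
# Each question contributes exactly 11 consecutive rows to q_encoder_input,
# r_decoder_input and y_data: one positive pair labelled [1, 0] followed by ten
# negative pairs labelled [0, 1]. Multiplying int(total * 0.9) by 11 therefore
# keeps the split aligned on question boundaries; e.g. with 1000 questions the
# first 900 * 11 = 9900 rows go to training and the remaining 1100 to validation.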