def get_validation_data(input_texts, target_texts, char2id, maxlen=400):
    """Encode the whole validation set as one padded batch.

    Despite living next to ``data_generator``, this is not a generator: it
    processes every sample once and returns a single batch.

    NOTE(review): a later function in this file reuses the name
    ``get_validation_data`` with extra pinyin parameters; if both live in the
    same module, that later definition shadows this one.

    Args:
        input_texts: Indexable collection of source texts.
        target_texts: Indexable collection of target texts, read at the same
            indices as ``input_texts``.
        char2id: Vocabulary passed through to ``str2id`` and ``padding``.
        maxlen: Length limit forwarded to ``str2id``.

    Returns:
        A tuple ``([X, Y], None)`` where ``X`` and ``Y`` are numpy arrays built
        from the padded encoded inputs and targets. The second element is
        always ``None`` (presumably because the model computes its loss
        internally -- confirm against the training code).
    """
    encoded_sources = []
    encoded_targets = []
    for idx in range(len(input_texts)):
        source_ids = str2id(input_texts[idx], char2id, maxlen)
        target_ids = str2id(target_texts[idx], char2id, maxlen)
        encoded_sources.append(source_ids)
        encoded_targets.append(target_ids)

    # Pad each stream separately, then freeze into arrays.
    batch = [
        np.array(padding(encoded_sources, char2id)),
        np.array(padding(encoded_targets, char2id)),
    ]
    return batch, None
def data_generator(input_texts, target_texts, char2id, batch_size, maxlen=400):
    """Yield padded training batches forever, cycling over the corpus.

    Each pass walks the samples in order and emits a batch every time
    ``batch_size`` samples have been collected. Samples left over at the end
    of a pass (fewer than ``batch_size``) are discarded before the next pass
    starts.

    NOTE(review): a later function in this file reuses the name
    ``data_generator`` with extra pinyin parameters; if both live in the same
    module, that later definition shadows this one.

    Args:
        input_texts: Indexable, sized collection of source texts.
        target_texts: Indexable collection of target texts, read at the same
            indices as ``input_texts``.
        char2id: Vocabulary passed through to ``str2id`` and ``padding``.
        batch_size: Number of samples per yielded batch.
        maxlen: Length limit forwarded to ``str2id``.

    Yields:
        ``([X, Y], None)`` where ``X`` and ``Y`` are numpy arrays built from
        the padded encoded inputs and targets of one batch.

    Raises:
        ValueError: On the first ``next()`` call, if ``batch_size`` is less
            than 1 or there are fewer than ``batch_size`` samples. Without
            this check no batch could ever be completed and the generator
            would loop forever without yielding.
    """
    sample_count = len(input_texts)
    if batch_size < 1 or sample_count < batch_size:
        raise ValueError(
            "data_generator needs batch_size >= 1 and at least batch_size "
            "samples (got batch_size=%r, samples=%d)" % (batch_size, sample_count)
        )

    while True:
        # Starting each pass with empty buffers drops any incomplete tail
        # batch from the previous pass.
        X, Y = [], []
        for i in range(sample_count):
            X.append(str2id(input_texts[i], char2id, maxlen))
            Y.append(str2id(target_texts[i], char2id, maxlen))
            if len(X) == batch_size:
                X = np.array(padding(X, char2id))
                Y = np.array(padding(Y, char2id))
                yield [X, Y], None
                X, Y = [], []
def get_validation_data(input_texts, target_texts, char2id, input_pinyins, output_pinyins, pingyin2id, maxlen=400):
    """Encode the whole validation set (characters and pinyin) as one batch.

    This is not a generator: every sample is encoded once and a single batch
    is returned.

    Args:
        input_texts: Indexable, sized collection of source texts.
        target_texts: Indexable collection of target texts.
        char2id: Vocabulary used to encode and pad the two text streams.
        input_pinyins: Indexable collection of source pinyin sequences.
        output_pinyins: Indexable collection of target pinyin sequences.
        pingyin2id: Vocabulary used to encode and pad the two pinyin streams.
        maxlen: Length limit forwarded to ``str2id``.

    All four collections are read at the same indices, ``0`` to
    ``len(input_texts) - 1``.

    Returns:
        A tuple ``([X, Y, X_p, Y_p], None)``: numpy arrays built from the
        padded source text, target text, source pinyin and target pinyin
        encodings, in that order. The second element is always ``None``
        (presumably because the model computes its loss internally -- confirm
        against the training code).
    """
    char_sources, char_targets = [], []
    pinyin_sources, pinyin_targets = [], []

    for idx in range(len(input_texts)):
        char_sources.append(str2id(input_texts[idx], char2id, maxlen))
        char_targets.append(str2id(target_texts[idx], char2id, maxlen))
        pinyin_sources.append(str2id(input_pinyins[idx], pingyin2id, maxlen))
        pinyin_targets.append(str2id(output_pinyins[idx], pingyin2id, maxlen))

    # Text streams are padded with the character vocabulary, pinyin streams
    # with the pinyin vocabulary.
    batch = [
        np.array(padding(char_sources, char2id)),
        np.array(padding(char_targets, char2id)),
        np.array(padding(pinyin_sources, pingyin2id)),
        np.array(padding(pinyin_targets, pingyin2id)),
    ]
    return batch, None
def data_generator(input_texts, target_texts, char2id, input_pinyins, output_pinyins, pingyin2id, batch_size, maxlen=400):
    """Yield padded character + pinyin training batches forever.

    Each pass walks the samples in order and emits a batch every time
    ``batch_size`` samples have been collected. Samples left over at the end
    of a pass (fewer than ``batch_size``) are discarded before the next pass
    starts.

    Args:
        input_texts: Indexable, sized collection of source texts.
        target_texts: Indexable collection of target texts.
        char2id: Vocabulary used to encode and pad the two text streams.
        input_pinyins: Indexable collection of source pinyin sequences.
        output_pinyins: Indexable collection of target pinyin sequences.
        pingyin2id: Vocabulary used to encode and pad the two pinyin streams.
        batch_size: Number of samples per yielded batch.
        maxlen: Length limit forwarded to ``str2id``.

    All four collections are read at the same indices, ``0`` to
    ``len(input_texts) - 1``.

    Yields:
        ``([X, Y, X_p, Y_p], None)``: numpy arrays built from the padded
        source text, target text, source pinyin and target pinyin encodings
        of one batch, in that order.

    Raises:
        ValueError: On the first ``next()`` call, if ``batch_size`` is less
            than 1 or there are fewer than ``batch_size`` samples. Without
            this check no batch could ever be completed and the generator
            would loop forever without yielding.
    """
    sample_count = len(input_texts)
    if batch_size < 1 or sample_count < batch_size:
        raise ValueError(
            "data_generator needs batch_size >= 1 and at least batch_size "
            "samples (got batch_size=%r, samples=%d)" % (batch_size, sample_count)
        )

    while True:
        # Starting each pass with empty buffers drops any incomplete tail
        # batch from the previous pass.
        X, Y = [], []
        X_p, Y_p = [], []
        for i in range(sample_count):
            X.append(str2id(input_texts[i], char2id, maxlen))
            Y.append(str2id(target_texts[i], char2id, maxlen))
            X_p.append(str2id(input_pinyins[i], pingyin2id, maxlen))
            Y_p.append(str2id(output_pinyins[i], pingyin2id, maxlen))
            if len(X) == batch_size:
                X = np.array(padding(X, char2id))
                Y = np.array(padding(Y, char2id))
                X_p = np.array(padding(X_p, pingyin2id))
                Y_p = np.array(padding(Y_p, pingyin2id))
                yield [X, Y, X_p, Y_p], None
                X, Y, X_p, Y_p = [], [], [], []
def gen_target(input_text, model, char2id, id2char, pinyin2id, id2pinyin, maxlen=400, topk=3, max_target_len=50):
    """Decode a target string for ``input_text`` with beam search.

    Only the ``topk`` best partial sequences are kept after each step; with
    ``topk=1`` this reduces to greedy search. Decoding stops as soon as a kept
    candidate ends with the EOS token, or after ``max_target_len`` steps.

    Args:
        input_text: Source text to decode from.
        model: Object whose ``predict([xid, yid])`` result is indexed as
            ``[:, i, :]`` and treated as per-token probabilities for step
            ``i``.
        char2id: Character vocabulary; must contain ``GO_TOKEN`` and
            ``EOS_TOKEN``.
        id2char: Inverse vocabulary forwarded to ``id2str``.
        pinyin2id: Unused by this function; kept for interface compatibility.
        id2pinyin: Unused by this function; kept for interface compatibility.
        maxlen: Length limit forwarded to ``str2id`` for the input.
        topk: Beam width.
        max_target_len: Maximum number of decoding steps.

    Returns:
        The decoded string from ``id2str``, with the leading GO token (and the
        trailing EOS token, when one was produced) stripped.
    """
    go_id = char2id[GO_TOKEN]
    eos_id = char2id[EOS_TOKEN]

    # The same encoded input is repeated once per beam.
    xid = np.array([str2id(input_text, char2id, maxlen)] * topk)
    yid = np.array([[go_id]] * topk)  # every beam starts with GO
    scores = [0] * topk  # cumulative log-probability per beam

    for i in range(max_target_len):  # cap the target at max_target_len steps
        proba = model.predict([xid, yid])[:, i, :]
        log_proba = np.log(proba + 1e-6)  # epsilon avoids log(0)
        arg_topk = log_proba.argsort(axis=1)[:, -topk:]  # top-k tokens per beam

        _yid = []  # candidate sequences for this step
        _scores = []  # their cumulative scores
        if i == 0:
            # All beams are identical at the first step, so only row 0 is
            # expanded; expanding every row would fill the beam with
            # duplicates.
            for j in range(topk):
                _yid.append(list(yid[j]) + [arg_topk[0][j]])
                _scores.append(scores[j] + log_proba[0][arg_topk[0][j]])
        else:
            # Expand every beam with each of its top-k tokens (topk * topk).
            for j in range(len(xid)):
                for k in range(topk):
                    _yid.append(list(yid[j]) + [arg_topk[j][k]])
                    _scores.append(scores[j] + log_proba[j][arg_topk[j][k]])

        # Keep the best topk candidates; argsort leaves them in ascending
        # score order, so the best one is last.
        _arg_topk = np.argsort(_scores)[-topk:]
        _yid = [_yid[k] for k in _arg_topk]
        _scores = [_scores[k] for k in _arg_topk]

        # Scan best-first so that, when several candidates finish on the same
        # step, the highest-scoring one is returned rather than the lowest.
        for k in reversed(range(len(_yid))):
            if _yid[k][-1] == eos_id:
                return id2str(_yid[k][1:-1], id2char)  # strip GO and EOS

        yid = np.array(_yid)
        scores = _scores

    # No EOS within max_target_len steps: return the best beam. Only GO is
    # stripped here, because the last token is a real character, not EOS.
    return id2str(yid[np.argmax(scores)][1:], id2char)