Example #1
0
def get_validation_data(input_texts, target_texts, char2id, maxlen=400):
    """Build the whole validation set as padded id arrays.

    Every (input, target) pair is converted with ``str2id``; padding and the
    array conversion happen once, after all samples have been collected, so
    the result covers the full data set rather than only the first sample.

    Args:
        input_texts: Indexable collection of source strings.
        target_texts: Indexable collection of target strings, aligned by
            index with ``input_texts``.
        char2id: Character-to-id mapping forwarded to ``str2id`` and
            ``padding``.
        maxlen: Length limit forwarded to ``str2id``.

    Returns:
        A tuple ``([X, Y], None)`` where ``X`` and ``Y`` are the arrays built
        from the padded source and target id sequences.
    """
    X, Y = [], []
    for i in range(len(input_texts)):
        X.append(str2id(input_texts[i], char2id, maxlen))
        Y.append(str2id(target_texts[i], char2id, maxlen))
    # Pad only after every sample is encoded: padding inside the loop would
    # turn X/Y into arrays and end the function after the first sample.
    X = np.array(padding(X, char2id))
    Y = np.array(padding(Y, char2id))
    return [X, Y], None
Example #2
0
def data_generator(input_texts, target_texts, char2id, batch_size, maxlen=400):
    """Yield padded training batches forever.

    The data set is walked in index order, over and over. Each time
    ``batch_size`` samples have been encoded with ``str2id`` they are padded,
    converted to arrays and yielded as ``([X, Y], None)``. Samples left over
    at the end of a pass (fewer than ``batch_size``) are discarded when the
    next pass starts.
    """
    while True:
        src_ids, tgt_ids = [], []
        for idx in range(len(input_texts)):
            src_ids.append(str2id(input_texts[idx], char2id, maxlen))
            tgt_ids.append(str2id(target_texts[idx], char2id, maxlen))
            if len(src_ids) != batch_size:
                # Batch not full yet; keep collecting.
                continue
            yield [
                np.array(padding(src_ids, char2id)),
                np.array(padding(tgt_ids, char2id)),
            ], None
            src_ids, tgt_ids = [], []
Example #3
0
def get_validation_data(input_texts,
                        target_texts,
                        char2id,
                        input_pinyins,
                        output_pinyins,
                        pingyin2id,
                        maxlen=400):
    """Build the whole validation set (characters and pinyin) as padded arrays.

    Every sample is converted with ``str2id``; padding and the array
    conversion happen once, after all samples have been collected, so the
    result covers the full data set rather than only the first sample.

    Args:
        input_texts: Indexable collection of source strings.
        target_texts: Indexable collection of target strings, aligned by
            index with ``input_texts``.
        char2id: Character-to-id mapping used for the text sequences.
        input_pinyins: Indexable collection of source pinyin sequences,
            aligned by index with ``input_texts``.
        output_pinyins: Indexable collection of target pinyin sequences,
            aligned by index with ``input_texts``.
        pingyin2id: Pinyin-to-id mapping used for the pinyin sequences.
        maxlen: Length limit forwarded to ``str2id``.

    Returns:
        A tuple ``([X, Y, X_p, Y_p], None)`` holding the padded arrays for
        source text, target text, source pinyin and target pinyin.
    """
    X, Y = [], []
    X_p, Y_p = [], []
    for i in range(len(input_texts)):
        X.append(str2id(input_texts[i], char2id, maxlen))
        Y.append(str2id(target_texts[i], char2id, maxlen))
        X_p.append(str2id(input_pinyins[i], pingyin2id, maxlen))
        Y_p.append(str2id(output_pinyins[i], pingyin2id, maxlen))
    # Pad only after every sample is encoded: padding inside the loop would
    # turn the lists into arrays and end the function after the first sample.
    X = np.array(padding(X, char2id))
    Y = np.array(padding(Y, char2id))
    X_p = np.array(padding(X_p, pingyin2id))
    Y_p = np.array(padding(Y_p, pingyin2id))
    return [X, Y, X_p, Y_p], None
Example #4
0
def data_generator(input_texts,
                   target_texts,
                   char2id,
                   input_pinyins,
                   output_pinyins,
                   pingyin2id,
                   batch_size,
                   maxlen=400):
    """Yield padded training batches (characters and pinyin) forever.

    The data set is walked in index order, over and over. Each time
    ``batch_size`` samples have been encoded with ``str2id`` they are padded,
    converted to arrays and yielded as ``([X, Y, X_p, Y_p], None)``. Samples
    left over at the end of a pass (fewer than ``batch_size``) are discarded
    when the next pass starts.
    """
    while True:
        char_in, char_out = [], []
        pinyin_in, pinyin_out = [], []
        for idx in range(len(input_texts)):
            char_in.append(str2id(input_texts[idx], char2id, maxlen))
            char_out.append(str2id(target_texts[idx], char2id, maxlen))
            pinyin_in.append(str2id(input_pinyins[idx], pingyin2id, maxlen))
            pinyin_out.append(str2id(output_pinyins[idx], pingyin2id, maxlen))
            if len(char_in) != batch_size:
                # Batch not full yet; keep collecting.
                continue
            batch = [
                np.array(padding(char_in, char2id)),
                np.array(padding(char_out, char2id)),
                np.array(padding(pinyin_in, pingyin2id)),
                np.array(padding(pinyin_out, pingyin2id)),
            ]
            yield batch, None
            char_in, char_out = [], []
            pinyin_in, pinyin_out = [], []
Example #5
0
def gen_target(input_text,
               model,
               char2id,
               id2char,
               pinyin2id,
               id2pinyin,
               maxlen=400,
               topk=3,
               max_target_len=50):
    """Decode a target string for ``input_text`` with beam search.

    Only the ``topk`` best candidates are kept after every step; with
    ``topk=1`` this is greedy search. Decoding stops as soon as one of the
    kept candidates ends with ``EOS_TOKEN``, or after ``max_target_len``
    steps.

    Args:
        input_text: Source string to decode from.
        model: Object whose ``predict([xid, yid])`` result is indexed as
            ``[:, i, :]`` to obtain the next-token probabilities per beam.
        char2id: Character-to-id mapping; must contain ``GO_TOKEN`` and
            ``EOS_TOKEN``.
        id2char: Id-to-character mapping forwarded to ``id2str``.
        pinyin2id: Not used by this function; kept in the signature.
        id2pinyin: Not used by this function; kept in the signature.
        maxlen: Length limit forwarded to ``str2id`` for the input.
        topk: Beam width.
        max_target_len: Maximum number of decoding steps.

    Returns:
        The result of ``id2str`` on the decoded ids, with the leading GO id
        removed and, when decoding ended on EOS, the trailing EOS id removed.
    """
    xid = np.array([str2id(input_text, char2id, maxlen)] * topk)  # input ids, one copy per beam
    yid = np.array([[char2id[GO_TOKEN]]] * topk)  # every beam starts with GO
    scores = [0] * topk  # cumulative log-probability of each beam
    for i in range(max_target_len):  # the target is capped at max_target_len steps
        proba = model.predict([xid, yid])[:, i, :]  # probabilities for the next position
        log_proba = np.log(proba + 1e-6)  # log space so scores add; epsilon avoids log(0)
        arg_topk = log_proba.argsort(axis=1)[:, -topk:]  # topk token ids for every beam
        _yid = []  # candidate target sequences for this step
        _scores = []  # scores of those candidates
        if i == 0:
            # All beams are identical on the first step, so only the first
            # row is expanded into topk distinct candidates.
            for j in range(topk):
                _yid.append(list(yid[j]) + [arg_topk[0][j]])
                _scores.append(scores[j] + log_proba[0][arg_topk[0][j]])
        else:
            for j in range(len(xid)):
                for k in range(topk):  # walk the topk * topk combinations
                    _yid.append(list(yid[j]) + [arg_topk[j][k]])
                    _scores.append(scores[j] + log_proba[j][arg_topk[j][k]])
            _arg_topk = np.argsort(_scores)[-topk:]  # keep the new topk
            _yid = [_yid[k] for k in _arg_topk]
            _scores = [_scores[k] for k in _arg_topk]
        yid = []
        scores = []
        for k in range(len(xid)):
            if _yid[k][-1] == char2id[EOS_TOKEN]:  # return as soon as EOS shows up
                # Strip the leading GO and the trailing EOS.
                return id2str(_yid[k][1:-1], id2char)
            else:
                yid.append(_yid[k])
                scores.append(_scores[k])
        yid = np.array(yid)
    # No EOS within max_target_len steps: return the best-scoring beam. Only
    # the leading GO is stripped here, because the last id is a real decoded
    # token rather than EOS.
    return id2str(yid[np.argmax(scores)][1:], id2char)