Example #1
def sentence2vec(w2v_model, s, max_length):
    if isinstance(s, str):
        words = word_tokenize(remove_punc(s.lower()))
    else:
        words = s
    vec = []
    if len(words) > max_length:
        words = words[:max_length]
    for word in words:
        if word in w2v_model.wv.vocab:
            vec.append(w2v_model.wv[word])
    # fall back to the model's vector size when no in-vocabulary word was found
    dim = len(vec[0]) if vec else w2v_model.wv.vector_size
    # print("dim", dim)
    print("len(vec)", len(vec))
    for i in range(max_length - len(vec)):
        vec.append(np.zeros(dim))
    return np.array(vec)
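A minimal usage sketch of sentence2vec, assuming a gensim 3.x Word2Vec model (the snippet relies on the pre-4.0 wv.vocab attribute) and the numpy import from the definition above; the toy corpus, vector size, and sentence are illustrative only:

from gensim.models import Word2Vec

# Hypothetical toy corpus; in gensim 3.x the embedding size is set via `size`.
corpus = [["what", "is", "negative", "sampling"],
          ["documents", "are", "ranked", "by", "relevance"]]
w2v_model = Word2Vec(corpus, size=50, min_count=1)

# Passing a token list skips the word_tokenize/remove_punc branch.
vec = sentence2vec(w2v_model, ["what", "is", "negative", "sampling"], 20)
print(vec.shape)  # (20, 50): in-vocabulary word vectors first, zero rows as padding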
Example #2
def get_train_data(data_type, w2v_model, qa_file, doc_file, to_file_path,
                   args):
    logger.info("preprocessing...")
    ns_amount = args.ns_amount

    questions = []
    answers = []

    # compute a vector for each question
    input_length = 0
    with open(qa_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "" and i % 2 == 0:
                words = word_tokenize(remove_punc(line))
                input_length = max(len(words), input_length)
                questions.append(words)
            elif line != "" and i % 2 == 1:
                arr = line.strip().split(" ")
                ans = []
                for a in arr:
                    if a != "":
                        ans.append(int(a) - 1)  # the index starts from 1 in the QA_list file; subtract 1 to make it 0-based
                answers.append(ans)

    question_vecs = []
    for q_words in questions:
        question_vecs.append(sentence2vec(w2v_model, q_words, input_length))
    print("len(question_vecs)", len(question_vecs))

    # compute a vector for each document
    docs = []
    output_length = 0
    with open(doc_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "":
                words = word_tokenize(remove_punc(line))
                output_length = max(len(words), output_length)
                docs.append(words)
    doc_vecs = []
    output_length = args.output_length
    for d_words in docs:
        doc_vecs.append(sentence2vec(w2v_model, d_words, output_length))
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" %
                (input_length, output_length))

    # count how often each doc appears as an answer
    doc_count = {}
    for ans in answers:
        for a in ans:
            if a in doc_count.keys():
                doc_count[a] += 1
            else:
                doc_count[a] = 1

    # compute a weight for each doc (frequency normalized by the max count)
    doc_weight = {}
    t_max = 0
    for k in doc_count.keys():
        t_max = max(t_max, doc_count[k])
    for k in doc_count.keys():
        doc_weight[k] = doc_count[k] / t_max

    total = len(question_vecs)
    train_num = int(total * 0.9)
    logger.info("train_num:%d, total:%d" % (train_num, total))

    # shuffle the data
    qa_index = list(range(total))
    random.shuffle(qa_index)

    step = 0
    while step * 200 < train_num:
        # [q_encoder_input, r_decoder_input, w_decoder_input, weight_data_r, weight_data_w]
        q_encoder_input = []
        r_decoder_input = []
        w_decoder_input = []
        weight_data_r = []
        weight_data_w = []
        y_data = []

        qid_list = []
        label_list = []
        aid_list = []

        logger.info("step: %d" % step)

        end = min(train_num, (step + 1) * 200)
        for ss in range(step * 200, end):
            i = qa_index[ss]
            logger.info("question: %d" % i)
            qid_list.append(i)
            label_list.append(1)

            y = [1] + [0] * ns_amount
            y_data.append(y)
            # question
            q_encoder_input.append(question_vecs[i])
            # one correct answer per question
            aid = answers[i][0]
            aid_list.append(aid)
            r_decoder_input.append(doc_vecs[aid])
            weight_data_r.append(doc_weight[aid])
            # ns_amount unrelated answers
            aids = get_randoms(list(doc_weight.keys()), [aid], ns_amount)
            w_decoder = []
            w_weight = []
            for aid in aids:
                w_decoder.append(doc_vecs[aid])
                w_weight.append(doc_weight[aid])

            w_decoder = np.array(w_decoder).reshape(output_length,
                                                    args.input_dim, ns_amount)
            w_weight = np.array(w_weight).reshape((1, ns_amount))
            w_decoder_input.append(w_decoder)
            weight_data_w.append(w_weight)

            for aaid in aids:
                qid_list.append(i)
                label_list.append(0)
                aid_list.append(aaid)

                # these answers are all unrelated
                y = [0] * (1 + ns_amount)
                y_data.append(y)
                # question
                q_encoder_input.append(question_vecs[i])

                r_decoder_input.append(doc_vecs[aaid])
                weight_data_r.append(doc_weight[aaid])
                # ns_amount unrelated answers, excluding the current document
                aids = get_randoms(list(doc_weight.keys()), [aaid], ns_amount)
                w_decoder = []
                w_weight = []
                for aid in aids:
                    w_decoder.append(doc_vecs[aid])
                    w_weight.append(doc_weight[aid])

                w_decoder = np.array(w_decoder).reshape(
                    output_length, args.input_dim, ns_amount)
                w_weight = np.array(w_weight).reshape((1, ns_amount))
                w_decoder_input.append(w_decoder)
                weight_data_w.append(w_weight)

        logger.info("loading weights: ckpt/nn_weights_%s.h5" % data_type)
        model = negative_samples(input_length=input_length,
                                 input_dim=args.input_dim,
                                 output_length=output_length,
                                 output_dim=args.output_dim,
                                 hidden_dim=args.hidden_dim,
                                 ns_amount=ns_amount,
                                 learning_rate=args.learning_rate,
                                 drop_rate=args.drop_rate)
        model.load_weights("ckpt/nn_weights_%s.h5" % data_type)
        new_dnn_model = Model(inputs=model.input,
                              outputs=model.get_layer('dropout_con').output)

        logger.info("predicting...")
        res = new_dnn_model.predict([
            q_encoder_input, r_decoder_input, w_decoder_input, weight_data_r,
            weight_data_w
        ])
        # print(res)

        with open(to_file_path, "a") as f:
            for i in range(len(res)):
                row = res[i]
                feature_str = ''
                for j in range(len(row)):
                    feature_str = feature_str + (" %d:%.9f" % (j + 1, row[j]))
                label = label_list[i]
                qid = qid_list[i]
                doc_id = aid_list[i]

                line = "%d qid:%d%s # doc-%d \n" % (label, qid, feature_str,
                                                    doc_id)
                f.write(line)
        print("saved to:", to_file_path)
        logger.info("step:%d added" % step)
        step += 1

    logger.info("saved to: %s" % to_file_path)
Example #3
def get_train_data(data_type,
                   w2v_model,
                   qa_file,
                   doc_file,
                   to_file_path,
                   args,
                   step=0):
    logger.info("preprocessing...")

    questions = []
    answers = []

    # compute a vector for each question
    input_length = 0
    with open(qa_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "" and i % 2 == 0:
                words = word_tokenize(remove_punc(line))
                input_length = max(len(words), input_length)
                questions.append(words)
            elif line != "" and i % 2 == 1:
                arr = line.strip().split(" ")
                ans = []
                for a in arr:
                    if a != "":
                        ans.append(int(a) - 1)  # the index starts from 1 in the QA_list file; subtract 1 to make it 0-based
                answers.append(ans)
    input_length = args.input_length
    question_vecs = []
    for q_words in questions:
        question_vecs.append(sentence2vec(w2v_model, q_words, input_length))
    print("len(question_vecs)", len(question_vecs))

    # compute a vector for each document
    docs = []
    output_length = 0
    with open(doc_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "":
                words = word_tokenize(remove_punc(line))
                output_length = max(len(words), output_length)
                docs.append(words)
    doc_vecs = []
    output_length = args.output_length
    for d_words in docs:
        doc_vecs.append(sentence2vec(w2v_model, d_words, output_length))
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" %
                (input_length, output_length))

    # count how often each doc appears as an answer
    doc_count = {}
    for ans in answers:
        for a in ans:
            if a in doc_count.keys():
                doc_count[a] += 1
            else:
                doc_count[a] = 1

    # compute a weight for each doc (frequency normalized by the max count)
    doc_weight = {}
    t_max = 0
    for k in doc_count.keys():
        t_max = max(t_max, doc_count[k])
    for k in doc_count.keys():
        doc_weight[k] = doc_count[k] / t_max

    model = DNN(args.input_lenth,
                args.input_dim,
                filters_num=args.filters_num,
                kernel_val=(args.kernel_val),
                pool_s=args.pool_s,
                pool_stride=args.pool_stride,
                hidden1_dim=args.hidden1_dim,
                hidden2_dim=args.hidden2_dim,
                activation=args.activation)
    model.load_weights("ckpt/dnn_weights_v2_%s.h5" % data_type)
    new_dnn_model = Model(inputs=model.input,
                          outputs=model.get_layer('hidden_layer').output)

    total = len(question_vecs)
    train_num = int(total * 0.9)

    for i in range(train_num):
        q_encoder_input = []
        r_decoder_input = []

        label_list = []
        aid_list = []

        logger.info("get all documents for question: %d" % i)
        print("get all documents for question: %d" % i)
        # qid_list.append(i)
        # label_list.append(1)

        cur_answers = answers[i]
        doc_list_ordered = [a for a in cur_answers]
        for aid in list(doc_weight.keys()):
            if aid not in doc_list_ordered:
                doc_list_ordered.append(aid)

        print("len(doc_list_ordered):", len(doc_list_ordered))
        print("len(cur_answers):", len(cur_answers))

        for aid in doc_list_ordered:
            aid_list.append(aid)
            if aid in cur_answers:
                label_list.append(1)
            else:
                label_list.append(0)

            # question
            q_encoder_input.append(question_vecs[i])
            r_decoder_input.append(doc_vecs[aid])

        logger.info("predicting question: %d" % i)
        print("predicting question: %d" % i)
        res = new_dnn_model.predict([q_encoder_input, r_decoder_input])
        # print(res)

        with open(to_file_path, "a") as f:
            for j in range(len(res)):
                row = res[j]
                feature_str = ''
                for k in range(len(row)):
                    feature_str = feature_str + (" %d:%.9f" % (k + 1, row[k]))
                label = label_list[j]
                doc_id = aid_list[j]

                line = "%d qid:%d%s # doc-%d \n" % (label, i, feature_str,
                                                    doc_id)
                f.write(line)
    print("saved to:", to_file_path)
    logger.info("total:%d" % total)
    logger.info("saved to: %s" % to_file_path)
Example #4
def train(w2v_model, qa_file, doc_file, to_model_file, to_ckpt_file, args):
    logger.info("preprocessing...")
    ns_amount = args.ns_amount

    questions = []
    answers = []

    # question vector
    input_length = 0
    with open(qa_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "" and i % 2 == 0:
                words = word_tokenize(remove_punc(line))
                input_length = max(len(words), input_length)
                questions.append(words)
            elif line != "" and i % 2 == 1:
                arr = line.strip().split(" ")
                ans = []
                for a in arr:
                    if a != "":
                        ans.append(
                            int(a) - 1
                        )  # the index starts from 1 in the QA_list file, make it start from 0.
                answers.append(ans)

    question_vecs = []
    for q_words in questions:
        question_vecs.append(sentence2vec(w2v_model, q_words, input_length))
    print("len(question_vecs)", len(question_vecs))

    # document vector
    docs = []
    output_length = 0
    with open(doc_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "":
                words = word_tokenize(remove_punc(line))
                output_length = max(len(words), output_length)
                docs.append(words)
    doc_vecs = []
    output_length = args.output_length
    for d_words in docs:
        doc_vecs.append(sentence2vec(w2v_model, d_words, output_length))
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" %
                (input_length, output_length))

    # weights for each doc
    doc_count = {}
    for ans in answers:
        for a in ans:
            if a in doc_count.keys():
                doc_count[a] += 1
            else:
                doc_count[a] = 1

    doc_weight = {}
    t_max = 0
    for k in doc_count.keys():
        t_max = max(t_max, doc_count[k])
    for k in doc_count.keys():
        doc_weight[k] = doc_count[k] / t_max

    # [q_encoder_input, r_decoder_input, w_decoder_input, weight_data_r, weight_data_w]
    q_encoder_input = []
    r_decoder_input = []
    w_decoder_input = []
    weight_data_r = []
    weight_data_w = []
    y_data = []

    total = len(question_vecs)
    qa_index = list(range(total))
    random.shuffle(qa_index)

    for i in qa_index:
        y = [1] + [0] * ns_amount
        y_data.append(y)
        # question
        q_encoder_input.append(question_vecs[i])

        aid = answers[i][0]
        r_decoder_input.append(doc_vecs[aid])
        weight_data_r.append(doc_weight[aid])

        aids = get_randoms(list(doc_weight.keys()), [aid], ns_amount)
        w_decoder = []
        w_weight = []
        for aid in aids:
            w_decoder.append(doc_vecs[aid])
            w_weight.append(doc_weight[aid])
        w_decoder = np.array(w_decoder).reshape(output_length, args.input_dim,
                                                ns_amount)
        w_weight = np.array(w_weight).reshape((1, ns_amount))
        w_decoder_input.append(w_decoder)
        weight_data_w.append(w_weight)
    y_data = np.array(y_data).reshape(total, (1 + ns_amount))

    train_num = int(total * 0.9)
    model = negative_samples(input_length=input_length,
                             input_dim=args.input_dim,
                             output_length=output_length,
                             output_dim=args.output_dim,
                             hidden_dim=args.hidden_dim,
                             ns_amount=ns_amount,
                             learning_rate=args.learning_rate,
                             drop_rate=args.drop_rate)
    print(model.summary())

    print("start training...")
    logger.info("start training...")
    model.fit([
        q_encoder_input[:train_num], r_decoder_input[:train_num],
        w_decoder_input[:train_num], weight_data_r[:train_num],
        weight_data_w[:train_num]
    ],
              y_data[:train_num],
              batch_size=args.batch_size,
              epochs=args.epochs,
              verbose=1,
              validation_data=([
                  q_encoder_input[train_num:], r_decoder_input[train_num:],
                  w_decoder_input[train_num:], weight_data_r[train_num:],
                  weight_data_w[train_num:]
              ], y_data[train_num:]))

    res = model.evaluate([
        q_encoder_input[train_num:], r_decoder_input[train_num:],
        w_decoder_input[train_num:], weight_data_r[train_num:],
        weight_data_w[train_num:]
    ],
                         y_data[train_num:],
                         verbose=1)
    print("training over.")
    logger.info("training over")
    print(model.metrics_names)
    print(res)
    print(model.summary())

    model.save(to_model_file)
    print("saved model to:", )

    model.save_weights(to_ckpt_file)
    print("saved weights to:", to_ckpt_file)
Example #5

if __name__ == '__main__':
    qa_path = "%s/QA_list.txt" % "twitter"

    qa_list = []

    # read questions and answers
    input_length = 0
    qid = 0
    with open(qa_path, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "" and i % 2 == 0:
                words = word_tokenize(remove_punc(line))
                input_length = max(len(words), input_length)
            elif line != "" and i % 2 == 1:
                arr = line.strip().split(" ")
                ans = []
                for a in arr:
                    if a != "":
                        ans.append(int(a) - 1)  # the index starts from 1 in the QA_list file; subtract 1 to make it 0-based
                qa_list.append({"qid": qid, "question": words, "answers": ans})
                qid += 1
    qa_index = list(range(len(qa_list)))
    random.shuffle(qa_index)

    parser = argparse.ArgumentParser(description='Test for argparse')
    parser.add_argument('--data_type',
                        help='data_type',
Example #6
def get_train_data(data_type,
                   w2v_model,
                   qa_file,
                   doc_file,
                   to_file_path,
                   args,
                   step=0):
    logger.info("preprocessing...")

    questions = []
    answers = []

    # compute a vector for each question
    input_length = 0
    with open(qa_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            if i >= 2000:  # the dataset is too large to fit in memory
                break
            line = line.strip().lower()
            if line != "" and i % 2 == 0:
                words = word_tokenize(remove_punc(line))
                input_length = max(len(words), input_length)
                questions.append(words)
            elif line != "" and i % 2 == 1:
                arr = line.strip().split(" ")
                ans = []
                for a in arr:
                    if a != "":
                        ans.append(int(a) - 1)  # the index starts from 1 in the QA_list file; subtract 1 to make it 0-based
                answers.append(ans)
    input_length = args.input_length
    question_vecs = []
    for q_words in questions:
        question_vecs.append(sentence2vec(w2v_model, q_words, input_length))
    print("len(question_vecs)", len(question_vecs))

    # compute a vector for each document
    docs = []
    output_length = 0
    with open(doc_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "":
                words = word_tokenize(remove_punc(line))
                output_length = max(len(words), output_length)
                docs.append(words)
    doc_vecs = []
    output_length = args.output_length
    for d_words in docs:
        doc_vecs.append(sentence2vec(w2v_model, d_words, output_length))
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" %
                (input_length, output_length))

    # count how often each doc appears as an answer
    doc_count = {}
    for ans in answers:
        for a in ans:
            if a in doc_count.keys():
                doc_count[a] += 1
            else:
                doc_count[a] = 1

    # compute a weight for each doc (frequency normalized by the max count)
    doc_weight = {}
    t_max = 0
    for k in doc_count.keys():
        t_max = max(t_max, doc_count[k])
    for k in doc_count.keys():
        doc_weight[k] = doc_count[k] / t_max

    total = len(question_vecs)
    train_num = int(total * 0.9)
    logger.info("train_num:%d, total:%d" % (train_num, total))

    # shuffle the data
    qa_index = list(range(total))
    random.shuffle(qa_index)

    # [q_encoder_input, r_decoder_input, w_decoder_input, weight_data_r, weight_data_w]
    q_encoder_input = []
    r_decoder_input = []
    y_data = []

    label_list = []
    qid_list = []
    aid_list = []

    total = len(question_vecs)

    for i in range(total):
        # question
        q_encoder_input.append(question_vecs[i])
        qid_list.append(i)
        # one correct answer per question
        aid = answers[i][0]
        r_decoder_input.append(doc_vecs[aid])
        y_data.append([1, 0])
        label_list.append(1)
        aid_list.append(aid)

        # 10 unrelated answers
        aids = get_randoms(list(doc_weight.keys()), [aid], 10)
        for aaid in aids:
            q_encoder_input.append(question_vecs[i])
            r_decoder_input.append(doc_vecs[aaid])
            y_data.append([0, 1])
            label_list.append(0)
            qid_list.append(i)
            aid_list.append(aaid)

    y_data = np.array(y_data)

    train_num = int(total * 0.9) * 11
    model = DNN(args.input_lenth,
                args.input_dim,
                filters_num=args.filters_num,
                kernel_val=(args.kernel_val),
                pool_s=args.pool_s,
                pool_stride=args.pool_stride,
                hidden1_dim=args.hidden1_dim,
                hidden2_dim=args.hidden2_dim,
                activation=args.activation)

    model.load_weights("ckpt/dnn_weights_v2_%s.h5" % data_type)
    new_dnn_model = Model(inputs=model.input,
                          outputs=model.get_layer('hidden_layer').output)

    logger.info("predicting...")
    res = new_dnn_model.predict(
        [r_decoder_input[:train_num], q_encoder_input[:train_num]])
    print("len(res)", len(res))
    print("train_num", train_num)
    print("len(r_decoder_input[:train_num])", len(r_decoder_input[:train_num]))

    with open(to_file_path, "w") as f:
        for i in range(len(res)):
            row = res[i]
            feature_str = ''
            for j in range(len(row)):
                feature_str = feature_str + (" %d:%.9f" % (j + 1, row[j]))
            label = label_list[i]
            qid = qid_list[i]
            doc_id = aid_list[i]

            line = "%d qid:%d%s # doc-%d \n" % (label, qid, feature_str, doc_id)
            f.write(line)
    print("saved to:", to_file_path)
    logger.info("total:%d" % total)
    logger.info("saved to: %s" % to_file_path)
Example #7
def get_train_data(data_type,
                   w2v_model,
                   ckpt_path,
                   qa_file,
                   doc_file,
                   to_file_path,
                   args,
                   step=0):

    if os.path.exists(to_file_path):
        logger.info("file exists: %s" % to_file_path)
        return

    logger.info("preprocessing...")
    ns_amount = 10

    questions = []
    answers = []

    # compute a vector for each question
    input_length = 0
    with open(qa_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            if i >= 2000:
                break
            line = line.strip().lower()
            if line != "" and i % 2 == 0:
                words = word_tokenize(remove_punc(line))
                input_length = max(len(words), input_length)
                questions.append(words)
            elif line != "" and i % 2 == 1:
                arr = line.strip().split(" ")
                ans = []
                for a in arr:
                    if a != "":
                        ans.append(int(a) - 1)  # the index starts from 1 in the QA_list file; subtract 1 to make it 0-based
                answers.append(ans)

    question_vecs = []
    for q_words in questions:
        question_vecs.append(sentence2vec(w2v_model, q_words, input_length))
    print("len(question_vecs)", len(question_vecs))

    # compute a vector for each document
    docs = []
    output_length = 0
    with open(doc_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "":
                words = word_tokenize(remove_punc(line))
                output_length = max(len(words), output_length)
                docs.append(words)
    doc_vecs = []
    output_length = 1000
    for d_words in docs:
        doc_vecs.append(sentence2vec(w2v_model, d_words, output_length))
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" %
                (input_length, output_length))

    # count how often each doc appears as an answer
    doc_count = {}
    for ii in range(len(docs)):
        doc_count[ii] = 0
    for ans in answers:
        for a in ans:
            if a in doc_count.keys():
                doc_count[a] += 1

    # compute a weight for each doc (frequency normalized by the max count)
    doc_weight = {}
    t_max = 0
    for k in doc_count.keys():
        t_max = max(t_max, doc_count[k])
    for k in doc_count.keys():
        doc_weight[k] = doc_count[k] / t_max

    logger.info("loading weights...")
    model = negative_samples(input_length=input_length,
                             input_dim=args.input_dim,
                             output_length=output_length,
                             output_dim=args.output_dim,
                             hidden_dim=args.hidden_dim,
                             ns_amount=ns_amount,
                             learning_rate=args.learning_rate,
                             drop_rate=args.drop_rate)
    model.load_weights(ckpt_path)
    new_dnn_model = Model(inputs=model.input,
                          outputs=model.get_layer('dropout_con').output)

    total = len(question_vecs)
    train_num = int(total * 0.9)

    qid_list = []

    # shuffle the data
    qa_index = list(range(total))
    # random.shuffle(qa_index)

    for ss in range(train_num, total):
        i = qa_index[ss]

        # [q_encoder_input, r_decoder_input, w_decoder_input, weight_data_r, weight_data_w]
        q_encoder_input = []
        r_decoder_input = []
        w_decoder_input = []
        weight_data_r = []
        weight_data_w = []

        logger.info("get all documents for question: %d" % i)
        print("get all documents for question: %d" % i)
        # qid_list.append(i)
        # label_list.append(1)

        cur_answers = answers[i]
        doc_list_ordered = [a for a in cur_answers]
        for aid in list(doc_weight.keys()):
            if aid not in doc_list_ordered:
                doc_list_ordered.append(aid)

        label_list = []
        aid_list = []

        print("len(doc_list_ordered):", len(doc_list_ordered))
        print("len(cur_answers):", len(cur_answers))

        for aid in doc_list_ordered:
            aid_list.append(aid)
            if aid in cur_answers:
                label_list.append(1)
            else:
                label_list.append(0)

            # question
            q_encoder_input.append(question_vecs[i])
            r_decoder_input.append(doc_vecs[aid])
            weight_data_r.append(doc_weight[aid])
            # ns_amount unrelated answers
            aids = get_randoms(list(doc_weight.keys()), cur_answers, ns_amount)
            w_decoder = []
            w_weight = []
            for aid in aids:
                w_decoder.append(doc_vecs[aid])
                w_weight.append(doc_weight[aid])

            w_decoder = np.array(w_decoder).reshape(output_length,
                                                    args.input_dim, ns_amount)
            w_weight = np.array(w_weight).reshape((1, ns_amount))
            w_decoder_input.append(w_decoder)
            weight_data_w.append(w_weight)

        logger.info("now:%d , predicting question: %d" % (ss, i))
        print("now:%d , predicting question: %d" % (ss, i))

        start = 0
        end = len(q_encoder_input)
        for cur in range(0, end, 1000):
            print("cur:%d / %d" % (cur, end))
            a = q_encoder_input[cur:cur + 1000]
            b = r_decoder_input[cur:cur + 1000]
            c = w_decoder_input[cur:cur + 1000]
            d = weight_data_r[cur:cur + 1000]
            e = weight_data_w[cur:cur + 1000]

            res = new_dnn_model.predict([a, b, c, d, e])
            # print(res)

            with open(to_file_path, "a") as f:
                for j in range(len(res)):
                    row = res[j]
                    feature_str = ''
                    for k in range(len(row)):
                        feature_str = feature_str + (" %d:%.9f" %
                                                     (k + 1, row[k]))
                    label = label_list[j]
                    doc_id = aid_list[j]

                    line = "%d qid:%d%s # doc-%d \n" % (label, i, feature_str,
                                                        doc_id)
                    f.write(line)
    print("saved to:", to_file_path)
    logger.info("total:%d" % total)
    logger.info("saved to: %s" % to_file_path)
Example #8
def train(w2v_model, qa_file, doc_file, to_model_file, to_ckpt_file, args):
    logger.info("preprocessing...")

    questions = []
    answers = []

    # compute a vector for each question
    input_length = 0
    with open(qa_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            if i >= 2000:  # the dataset is too large to fit in memory
                break

            line = line.strip().lower()
            if line != "" and i % 2 == 0:
                words = word_tokenize(remove_punc(line))
                input_length = max(len(words), input_length)
                questions.append(words)
            elif line != "" and i % 2 == 1:
                arr = line.strip().split(" ")
                ans = []
                for a in arr:
                    if a != "":
                        ans.append(int(a) - 1)  # the index starts from 1 in the QA_list file; subtract 1 to make it 0-based
                answers.append(ans)
    input_length = args.input_length
    question_vecs = []
    for q_words in questions:
        question_vecs.append(sentence2vec(w2v_model, q_words, input_length))
    print("len(question_vecs)", len(question_vecs))

    # compute a vector for each document
    docs = []
    output_length = 0
    with open(doc_file, "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            line = line.strip().lower()
            if line != "":
                words = word_tokenize(remove_punc(line))
                output_length = max(len(words), output_length)
                docs.append(words)
    doc_vecs = []
    output_length = args.output_length
    for d_words in docs:
        doc_vecs.append(sentence2vec(w2v_model, d_words, output_length))
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" %
                (input_length, output_length))

    # count how often each doc appears as an answer
    doc_count = {}
    for ans in answers:
        for a in ans:
            if a in doc_count.keys():
                doc_count[a] += 1
            else:
                doc_count[a] = 1

    # compute a weight for each doc (frequency normalized by the max count)
    doc_weight = {}
    t_max = 0
    for k in doc_count.keys():
        t_max = max(t_max, doc_count[k])
    for k in doc_count.keys():
        doc_weight[k] = doc_count[k] / t_max

    # [q_encoder_input, r_decoder_input, w_decoder_input, weight_data_r, weight_data_w]
    q_encoder_input = []
    r_decoder_input = []
    y_data = []

    total = len(question_vecs)

    for i in range(total):
        # question
        q_encoder_input.append(question_vecs[i])
        # one correct answer per question
        aid = answers[i][0]
        r_decoder_input.append(doc_vecs[aid])
        y_data.append([1, 0])

        # 10 unrelated answers
        aids = get_randoms(list(doc_weight.keys()), [aid], 10)
        for aaid in aids:
            q_encoder_input.append(question_vecs[i])
            r_decoder_input.append(doc_vecs[aaid])
            y_data.append([0, 1])

    y_data = np.array(y_data)

    train_num = int(total * 0.9) * 11
    model = DNN(args.input_lenth,
                args.input_dim,
                filters_num=args.filters_num,
                kernel_val=(args.kernel_val),
                pool_s=args.pool_s,
                pool_stride=args.pool_stride,
                hidden1_dim=args.hidden1_dim,
                hidden2_dim=args.hidden2_dim,
                activation=args.activation)

    print("start training...")
    logger.info("start training...")

    model.fit([r_decoder_input[:train_num], q_encoder_input[:train_num]],
              y_data[:train_num],
              batch_size=args.batch_size,
              epochs=args.epochs,
              verbose=1,
              validation_data=([
                  r_decoder_input[train_num:], q_encoder_input[train_num:]
              ], y_data[train_num:]))

    res = model.evaluate(
        [r_decoder_input[train_num:], q_encoder_input[train_num:]],
        y_data[train_num:],
        verbose=1)
    print("training over.")
    logger.info("training over")
    print(model.metrics_names)
    print(res)
    print(model.summary())

    model.save(to_model_file)
    print("saved model to:", )

    model.save_weights(to_ckpt_file)
    print("saved weights to:", to_ckpt_file)