Ejemplo n.º 1
0
def raw_que_processing(l_min=5, l_max=100, del_flag=True):
    """Load the LSH question file and return ids, titles and details.

    Every entry of the file after the first one is stripped of double
    quotes and split on tabs. A row is kept when the length of its second
    column (the title, measured before stripping) lies in
    ``[l_min, l_max]`` and, if ``del_flag`` is true, its id also appears
    among the ids returned by ``raw_reply_processing()``.

    Title and detail are passed through ``zhengze`` and segmented with
    ``jieba.cut``; the tokens are joined with single spaces.

    Rows that lack a required column are skipped as a whole, so the three
    returned lists always have the same length and stay index-aligned.

    :param l_min: minimum accepted title length (inclusive)
    :param l_max: maximum accepted title length (inclusive)
    :param del_flag: when true, drop questions that have no reply
    :return: question ids, question titles, question details
    """
    raw_q = read_file("data/LSH_questions.txt", 0, 0)  # count 24294+1

    # The reply ids are only needed when unanswered questions are dropped.
    answered_ids = None
    if del_flag:
        index_r_in, _ = raw_reply_processing()
        answered_ids = set(index_r_in)

    index_q_in = []
    q0_in = []  # question titles
    q1_in = []  # question details
    # The first entry is skipped (presumably a header row -- confirm).
    for line in raw_q[1:]:
        q = line.replace('"', '').strip().split('\t')
        try:
            if not l_min <= len(q[1]) <= l_max:
                continue
            q_id_in = int(q[0])
            if answered_ids is not None and q_id_in not in answered_ids:
                continue
            # Build both texts before touching the output lists: a missing
            # detail column must not leave an id/title without a detail.
            title_text = ' '.join(jieba.cut(zhengze(q[1].strip())))
            detail_text = ' '.join(jieba.cut(zhengze(q[2].strip())))
        except IndexError:
            # Row has too few columns; skip it entirely.
            continue
        index_q_in.append(q_id_in)
        q0_in.append(title_text)
        q1_in.append(detail_text)
    return index_q_in, q0_in, q1_in
Ejemplo n.º 2
0
def TQR_import():
    """Read ``CORPUS_FILE`` and split it into title, question and reply.

    Each line is stripped and split on tabs. The first field (title) and
    the second field (question) are further split on single spaces into
    token lists; the third field (reply) is kept as a plain string.

    :return: titles (token lists), questions (token lists), replies (str)
    :raises IndexError: if a line has fewer than three tab-separated fields
    """
    title, question, reply = [], [], []
    for record in read_file(CORPUS_FILE, 0, 0):
        fields = record.strip().split('\t')
        title_tokens = fields[0].split(' ')
        question_tokens = fields[1].split(' ')
        reply_text = fields[2]
        title.append(title_tokens)
        question.append(question_tokens)
        reply.append(reply_text)
    return title, question, reply
def task101_LSH_reply_processing():
    """Segment the LSHR reply file and write ids and replies to Excel.

    Every entry of ``data/0208/LSHR.txt`` after the first one is stripped
    of double quotes and split on tabs. The second column is parsed as the
    integer id; the third column is passed through ``zhengze``, segmented
    with ``jieba.cut`` and joined with single spaces.

    Rows lacking either column are skipped as a whole, so the id column
    and the text column written to the ``'LSHR'`` sheet stay aligned.
    The result is written via ``write2excel``; nothing is returned.
    """
    raw_r = read_file("data/0208/LSHR.txt", 0, 0)  # count 44729+1
    index_r = []
    r0 = []
    # The first entry is skipped (presumably a header row -- confirm).
    for line in raw_r[1:]:
        r = line.replace('"', '').strip().split('\t')
        try:
            # Resolve both values before appending: a row with an id but
            # no text must not leave an unmatched id in index_r.
            reply_id = int(r[1])
            reply_text = ' '.join(jieba.cut(zhengze(r[2].strip())))
        except IndexError:
            continue
        index_r.append(reply_id)
        r0.append(reply_text)
    write2excel(RESULT_FILE, 'LSHR', index_r, r0)
Ejemplo n.º 4
0
def raw_reply_processing(l_min=4, l_max=100):
    """Load the LSH reply file and return reply ids with their texts.

    Every entry of the file after the first one is stripped of double
    quotes and split on tabs. A row is kept when the length of its third
    column (measured before stripping) lies in ``[l_min, l_max]``; its
    second column is parsed as the integer id and the third column is
    passed through ``zhengze``. Rows with too few columns are ignored.

    :param l_min: minimum accepted reply length (inclusive)
    :param l_max: maximum accepted reply length (inclusive)
    :return: reply ids, reply texts
    """
    rows = read_file("data/LSH_reply.txt", 0, 0)  # count 44729+1
    index_r_in, r0_in = [], []
    for record in rows[1:]:
        try:
            fields = record.replace('"', '').strip().split('\t')
            body = fields[2]
            if len(body) < l_min or len(body) > l_max:
                continue
            index_r_in.append(int(fields[1]))
            r0_in.append(zhengze(body.strip()))
        except IndexError:
            # Too few columns in this row; ignore it.
            continue
    return index_r_in, r0_in
def task102_LSH_question_processing():
    """Segment the LSHQ question file and write the result to Excel.

    Every entry of ``data/0208/LSHQ.txt`` after the first one is stripped
    of double quotes and split on tabs. The first column is parsed as the
    integer id; the second (title) and third (detail) columns are passed
    through ``zhengze``, segmented with ``jieba.cut`` and joined with
    single spaces.

    Rows with fewer than three columns (including blank lines) are skipped
    as a whole, so the three columns written to the ``'LSHQ'`` sheet stay
    aligned. The result is written via ``write2excel``; nothing is
    returned.
    """
    raw_q = read_file("data/0208/LSHQ.txt", 0, 0)  # count 24294+1
    q0 = []  # question titles
    q1 = []  # question details
    index_q = []
    # The first entry is skipped (presumably a header row -- confirm).
    for line in raw_q[1:]:
        q = line.replace('"', '').strip().split('\t')
        try:
            # Read the text columns first so that a short row raises
            # IndexError before anything is converted or appended.
            title_raw, detail_raw = q[1], q[2]
        except IndexError:
            continue
        question_id = int(q[0])
        title_text = ' '.join(jieba.cut(zhengze(title_raw.strip())))
        detail_text = ' '.join(jieba.cut(zhengze(detail_raw.strip())))
        index_q.append(question_id)
        q0.append(title_text)
        q1.append(detail_text)
    write2excel(RESULT_FILE, 'LSHQ', index_q, q0, q1)
def task103_WXL1_processing():
    """Segment the WXL1 title/question/reply file and write it to Excel.

    Every line of ``data/0208/WXL1.txt`` is stripped and split on tabs.
    The third, fourth and fifth columns (title, question, reply) are each
    passed through ``zhengze``, segmented with ``jieba.cut`` and joined
    with single spaces.

    Rows with fewer than five columns are skipped as a whole, so the three
    columns written to the ``'WXL1'`` sheet stay aligned. The result is
    written via ``write2excel``; nothing is returned.
    """
    raw_qr = read_file("data/0208/WXL1.txt", 0, 0)
    title = []
    que = []
    reply = []
    for line in raw_qr:
        tqr = line.strip().split('\t')
        try:
            # Build all three texts before appending: a row that is missing
            # a later column must not leave a title without its partners.
            title_text = ' '.join(jieba.cut(zhengze(tqr[2].strip())))
            que_text = ' '.join(jieba.cut(zhengze(tqr[3].strip())))
            reply_text = ' '.join(jieba.cut(zhengze(tqr[4].strip())))
        except IndexError:
            continue
        title.append(title_text)
        que.append(que_text)
        reply.append(reply_text)
    write2excel(RESULT_FILE, 'WXL1', title, que, reply)