def raw_que_processing(l_min=5, l_max=100, del_flag=True):
    """Load the LSH question file and return aligned, tokenized columns.

    Every line after the first has its double quotes removed and is split
    on tabs; column 0 is read as the question id, column 1 as the topic and
    column 2 as the detailed content. A row is kept only when the length of
    its topic column lies in ``[l_min, l_max]`` and, if ``del_flag`` is
    true, its id is among the ids returned by ``raw_reply_processing()``.

    A row that lacks any of the required columns is skipped as a whole, so
    the three returned lists always have equal length and stay aligned by
    position.

    :param l_min: minimum accepted length of the topic column.
    :param l_max: maximum accepted length of the topic column.
    :param del_flag: when true, drop questions that have no reply.
    :return: question ids, question topics, question details. Both text
        lists hold ``zhengze``-cleaned strings whose ``jieba.cut`` tokens
        are joined with single spaces.
    """
    raw_q = read_file("data/LSH_questions.txt", 0, 0)  # original note: count 24294+1

    # None means "keep every question"; otherwise only ids that have a reply.
    answered_ids = None
    if del_flag:
        index_r_in, _ = raw_reply_processing()
        answered_ids = set(index_r_in)

    index_q_in = []
    q0_in = []  # question topics
    q1_in = []  # question details
    for line in raw_q[1:]:  # the first line is skipped (presumably a header)
        try:
            q = line.replace('"', '').strip().split('\t')
            if not l_min <= len(q[1]) <= l_max:
                continue
            q_id_in = int(q[0])
            if answered_ids is not None and q_id_in not in answered_ids:
                continue
            topic = ' '.join(jieba.cut(zhengze(q[1].strip())))
            detail = ' '.join(jieba.cut(zhengze(q[2].strip())))
        except IndexError:
            # Malformed row: nothing has been appended yet, so skipping it
            # cannot leave the output lists with different lengths.
            continue
        index_q_in.append(q_id_in)
        q0_in.append(topic)
        q1_in.append(detail)
    return index_q_in, q0_in, q1_in
def TQR_import():
    """Read CORPUS_FILE and return its three tab-separated columns.

    Each line is stripped and split on tabs. The first two columns are
    further split on single spaces into token lists; the third column is
    returned as the raw string.

    :return: (title token lists, question token lists, reply strings)
    """
    rows = [record.strip().split('\t')
            for record in read_file(CORPUS_FILE, 0, 0)]
    title = [columns[0].split(' ') for columns in rows]
    question = [columns[1].split(' ') for columns in rows]
    reply = [columns[2] for columns in rows]
    return title, question, reply
def task101_LSH_reply_processing():
    """Tokenize the LSHR reply file and hand the result to ``write2excel``.

    Every line after the first has its double quotes removed and is split
    on tabs. Column 1 is parsed as an integer id and column 2 is cleaned
    with ``zhengze`` and tokenized with ``jieba.cut`` (tokens joined by
    single spaces).

    A row that lacks any of the required columns is skipped as a whole, so
    the id list and the text list passed to ``write2excel`` always have the
    same length.
    """
    raw_r = read_file("data/0208/LSHR.txt", 0, 0)  # original note: count 44729+1
    index_r = []
    r0 = []
    for line in raw_r[1:]:  # the first line is skipped (presumably a header)
        try:
            r = line.replace('"', '').strip().split('\t')
            r_id = int(r[1])
            r_text = ' '.join(jieba.cut(zhengze(r[2].strip())))
        except IndexError:
            # Malformed row: skip it before anything is appended so the two
            # output columns cannot drift apart.
            continue
        index_r.append(r_id)
        r0.append(r_text)
    write2excel(RESULT_FILE, 'LSHR', index_r, r0)
def raw_reply_processing(l_min=4, l_max=100):
    """Load the LSH reply file and return the filtered replies.

    Every line after the first has its double quotes removed and is split
    on tabs. A row is kept when the length of its third column lies in
    ``[l_min, l_max]``; rows that raise ``IndexError`` are ignored.

    :param l_min: minimum accepted length of the reply text column.
    :param l_max: maximum accepted length of the reply text column.
    :return: (ids parsed from the second column, ``zhengze``-cleaned reply
        texts). The original docstring calls the first list "reply ids".
    """
    reply_ids = []
    reply_texts = []
    rows = read_file("data/LSH_reply.txt", 0, 0)  # original note: count 44729+1
    for row in rows[1:]:
        columns = row.replace('"', '').strip().split('\t')
        try:
            body = columns[2]
            if not l_min <= len(body) <= l_max:
                continue
            reply_ids.append(int(columns[1]))
            reply_texts.append(zhengze(body.strip()))
        except IndexError:
            continue
    return reply_ids, reply_texts
def task102_LSH_question_processing():
    """Tokenize the LSHQ question file and hand the result to ``write2excel``.

    Every line after the first has its double quotes removed and is split
    on tabs. Column 0 is parsed as an integer id; columns 1 (topic) and 2
    (detailed content) are cleaned with ``zhengze`` and tokenized with
    ``jieba.cut`` (tokens joined by single spaces).

    A row that lacks any of the required columns is skipped as a whole, so
    the three lists passed to ``write2excel`` always have the same length.
    """
    raw_q = read_file("data/0208/LSHQ.txt", 0, 0)  # original note: count 24294+1
    index_q = []
    q0 = []  # question topics
    q1 = []  # question details
    for line in raw_q[1:]:  # the first line is skipped (presumably a header)
        try:
            q = line.replace('"', '').strip().split('\t')
            q_id = int(q[0])
            topic = ' '.join(jieba.cut(zhengze(q[1].strip())))
            detail = ' '.join(jieba.cut(zhengze(q[2].strip())))
        except IndexError:
            # Malformed row: skip it before anything is appended so the
            # output columns cannot drift apart.
            continue
        index_q.append(q_id)
        q0.append(topic)
        q1.append(detail)
    write2excel(RESULT_FILE, 'LSHQ', index_q, q0, q1)
def task103_WXL1_processing():
    """Tokenize the WXL1 file and hand the result to ``write2excel``.

    Every line (including the first) is stripped and split on tabs.
    Columns 2, 3 and 4 are read as title, question and reply; each is
    cleaned with ``zhengze`` and tokenized with ``jieba.cut`` (tokens
    joined by single spaces).

    A row that lacks any of the required columns is skipped as a whole, so
    the three lists passed to ``write2excel`` always have the same length.
    """
    raw_qr = read_file("data/0208/WXL1.txt", 0, 0)
    title = []
    que = []
    reply = []
    for line in raw_qr:
        try:
            tqr = line.strip().split('\t')
            title_text = ' '.join(jieba.cut(zhengze(tqr[2].strip())))
            que_text = ' '.join(jieba.cut(zhengze(tqr[3].strip())))
            reply_text = ' '.join(jieba.cut(zhengze(tqr[4].strip())))
        except IndexError:
            # Malformed row: skip it before anything is appended so the
            # output columns cannot drift apart.
            continue
        title.append(title_text)
        que.append(que_text)
        reply.append(reply_text)
    write2excel(RESULT_FILE, 'WXL1', title, que, reply)