def correlation_negative():
    """Relevance + article-level sentiment + organization-tag endpoint.

    Reads ``request.json['record']`` — a list of dicts with ``id``,
    ``title`` and ``content`` — classifies each article's relevance, and
    for relevant articles extracts organization tags (sentence level) and
    a chapter-level sentiment.

    :return: Flask JSON response ``{'docs': [{'id', 'sec', 'tendency',
        'org_list'}, ...]}`` where ``tendency`` follows the legacy
        contract: -1 = negative, 0 = positive.
    """
    start_time = datetime.now()
    records = request.json['record']
    # Lazy %-style args: the message is only formatted if the level is enabled.
    logger.info('starting correlation_negative, {list_size: %d}', len(records))

    # Relevance classification over the preprocessed contents.
    words_list = pre.handle_contents([record['content'] for record in records])
    result_list = predict.predict_corpus(words_list)

    ret_list = []
    for record, relevance in zip(records, result_list):
        doc_id = int(record['id'])  # renamed: don't shadow builtin id()
        full_text = record['title'] + '。' + record['content']
        sec = int(relevance)
        tendency = 0
        org_list = []
        if sec == 1:
            # Sentence-level pass: only org_list is kept here — the
            # chapter-level model below decides the final tendency
            # (the original code overwrote this value immediately).
            _, org_list = sa.evaluate_article(full_text)
            # Chapter-level sentiment.
            # Legacy contract: -1 = negative, 0 = positive.
            # Model output:    1 = positive, 0 = negative -> subtract 1 to map.
            processed = pre.handle_contents([full_text])
            tendency = int(chapter_pipeline.predict(processed)[0]) - 1
        ret_list.append({
            'id': doc_id,
            'sec': sec,
            'tendency': tendency,
            'org_list': org_list
        })

    logger.info('end correlation_negative: {ret_list: %d, lost_seconds: %ds}',
                len(ret_list), (datetime.now() - start_time).seconds)
    return jsonify({'docs': ret_list})
def handle_test_excel(test_file, mid_file, all_file, tag_file, right_flag):
    """Evaluate the relevance classifier against a labelled excel file.

    Reads (title, content) rows from every sheet of ``test_file``,
    preprocesses the contents, predicts with ``predict.predict_corpus``
    and prints accuracy against ``right_flag``.  Side-effect files:

    :param test_file: input .xlsx; row 1 is a header, col 1 = title, col 2 = content.
    :param mid_file:  output .xlsx with the preprocessed word strings.
    :param all_file:  output .xlsx with title/content/predicted flag for every row.
    :param tag_file:  output .xlsx with title/content of the mispredicted rows.
    :param right_flag: the expected prediction value for this test set.
    """
    from openpyxl import Workbook
    from openpyxl import load_workbook

    # Read and preprocess every data row from every sheet.
    # NOTE: values are kept as str — the old .encode('utf-8') dance left
    # title/words_str as bytes, which openpyxl cannot write to a cell.
    test_corpus = []
    workbook = load_workbook(test_file)
    for sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
        for row in range(2, sheet.max_row + 1):  # row 1 is the header
            title = sheet.cell(row=row, column=1).value
            content = sheet.cell(row=row, column=2).value
            words_str = handle_content(content)
            test_corpus.append((title, content, words_str))
    workbook.close()

    # Predict and report accuracy.
    import predict
    result = predict.predict_corpus([item[2] for item in test_corpus])
    print('测试文件: ', test_file)
    print('总条数: ', len(result))
    right_result = [flag for flag in result if flag == right_flag]
    print('正确条数; ', len(right_result))
    # Guard against an empty workbook (len(result) == 0).
    print('准确度: ', len(right_result) / len(result) if result else 0.0)

    # Save the preprocessed word strings to the intermediate file.
    mid_wb = Workbook()
    mid_ws = mid_wb.active
    mid_ws.cell(row=1, column=1).value = 'words_str'
    for idx in range(len(test_corpus)):
        mid_ws.cell(row=idx + 2, column=1).value = test_corpus[idx][2]
    mid_wb.save(mid_file)
    print('保存中间数据: ', mid_file)

    # Save every row together with its predicted flag.
    all_wb = Workbook()
    all_ws = all_wb.active
    all_ws.cell(row=1, column=1).value = 'title'
    all_ws.cell(row=1, column=2).value = 'content'
    all_ws.cell(row=1, column=3).value = 'predict_flag'
    all_list = list(zip(test_corpus, result))
    for idx in range(len(all_list)):
        all_ws.cell(row=idx + 2, column=1).value = all_list[idx][0][0]
        all_ws.cell(row=idx + 2, column=2).value = all_list[idx][0][1]
        all_ws.cell(row=idx + 2, column=3).value = all_list[idx][1]
    all_wb.save(all_file)
    print('保存所有数据: ', all_file)

    # Save the mispredicted rows to tag_file.
    # BUGFIX: the old loop was `for row in range(2, len(error_list))` and
    # indexed error_list[row] while also using `row` as the sheet row —
    # it skipped the first two errors and wrote the wrong items.
    wb = Workbook()
    ws = wb.active
    ws.cell(row=1, column=1).value = 'title'
    ws.cell(row=1, column=2).value = 'content'
    error_list = [pair[0] for pair in zip(test_corpus, result)
                  if pair[1] != right_flag]
    for idx, (title, content, _) in enumerate(error_list):
        ws.cell(row=idx + 2, column=1).value = title
        ws.cell(row=idx + 2, column=2).value = content
    wb.save(tag_file)
    print('保存错误数据: ', tag_file)
else: length = len(x) data.append([ len(x), self.getcnt(x), self.getcnt(x) / length, self.getnegcnt(x), self.getnegcnt(x) / length ]) return data if __name__ == '__main__': import pre import predict record_ = [{ 'id': 1, 'content': '一个恐怖的数字[怒]据国家癌症中心发布☞全国每天约有10000人确诊为癌症,平均每分钟就有7人确诊[怒]不过放心,癌症是可以治愈的,但……需要很多' }] # predict corpus = [] files = open("corpus/test_new.txt", "r", encoding="utf-8").readlines() for item in files: corpus.append(item.strip()) contents = [pre.handle_content(i_content) for i_content in corpus] correlations = predict.predict_corpus(contents) print(correlations)