def calculate_index(file, topcategory_pr_file):
    """Collect algorithm/manual labels from a sheet and print top-category micro P/R.

    Reads ``file`` through ``read_xlrd``, normalises the four label columns
    with ``get_standard_label``, prints how often each manual top-level label
    occurs, and hands that count to ``calculate_topcategory_pr`` together with
    ``topcategory_pr_file``.

    Args:
        file: Workbook containing the columns ``algFirstCategory``,
            ``algSecondCategory``, ``ditingFirstCategory`` and
            ``ditingSecondCategory``.
        topcategory_pr_file: Workbook forwarded unchanged to
            ``calculate_topcategory_pr``.

    Raises:
        ValueError: If one of the four expected columns is missing from the
            header (raised by ``list.index``).
    """
    head, table = read_xlrd(file)

    predicted_top_col = head.index("algFirstCategory")
    predicted_sub_col = head.index("algSecondCategory")
    manual_top_col = head.index("ditingFirstCategory")
    manual_sub_col = head.index("ditingSecondCategory")

    top_true, top_pred = [], []
    sub_true, sub_pred = [], []

    # (source column, destination list); the "diting" columns feed the *_true
    # lists, i.e. they are treated as the reference labels.
    routing = (
        (predicted_top_col, top_pred),
        (manual_top_col, top_true),
        (predicted_sub_col, sub_pred),
        (manual_sub_col, sub_true),
    )

    # Row 0 is skipped; data rows start at index 1.
    for row_idx in range(1, table.nrows):
        cells = table.row_values(row_idx)
        for column, bucket in routing:
            bucket.append(get_standard_label(cells[column]))

    # The reports are not used further in this function.
    # NOTE(review): calls are kept in case get_sorted_report has side effects
    # (not visible from here) - confirm before removing.
    get_sorted_report(top_true, top_pred)
    get_sorted_report(sub_true, sub_pred)

    # A manual label cell may hold several comma-separated labels; count each
    # non-empty one individually.
    true_label_count = Counter(
        piece
        for joined in top_true
        for piece in joined.split(",")
        if piece
    )
    print(true_label_count)
    calculate_topcategory_pr(topcategory_pr_file, true_label_count)
def calculate_topcategory_pr(file, true_label_count):
    """Print micro precision and micro recall derived from a per-category sheet.

    Each data row of ``file`` contributes one entry ``name -> (p, r)``. The
    resulting mapping and ``true_label_count`` are passed to
    ``_calculate_micro_precision`` and ``_calculate_micro_recall``, whose
    results are printed in that order.

    Args:
        file: Workbook with the columns ``name``, ``p`` and ``r``.
        true_label_count: Per-label counts forwarded to the two micro-average
            helpers.

    Raises:
        ValueError: If a required column is missing from the header, or a
            ``p``/``r`` cell cannot be converted to ``float``.
    """
    head, table = read_xlrd(file)

    # Column positions do not change between rows, so resolve them once
    # instead of scanning the header three times per row.
    name_col = head.index("name")
    p_col = head.index("p")
    r_col = head.index("r")

    topcategory_dict = {}
    # Row 0 is skipped; data rows start at index 1.
    for row_num in range(1, table.nrows):
        row_value = table.row_values(row_num)
        name = row_value[name_col]
        p = float(row_value[p_col])
        r = float(row_value[r_col])
        # A later row with the same name overwrites the earlier one.
        topcategory_dict[name] = (p, r)

    print(_calculate_micro_precision(topcategory_dict, true_label_count))
    print(_calculate_micro_recall(topcategory_dict, true_label_count))
def write_validation_set_to_file(excel_file, outfile):
    """Convert a spot-check workbook into a JSON-lines text file.

    For every data row, each ``manual_*`` cell is replaced by the result of
    ``process_manual_category`` (tag/subtag columns) or
    ``process_manual_tonality`` (vulgar/gossip/clickbait/advert columns),
    called with the matching ``alg_*`` cell and the current ``manual_*`` cell.
    The row is then written as one JSON object keyed by the header names.

    Args:
        excel_file: Workbook read through ``read_xlrd``.
        outfile: Path of the UTF-8 text file to create or overwrite.

    Raises:
        ValueError: If any expected ``alg_*``/``manual_*`` column is missing
            from the header. This happens before ``outfile`` is opened.
    """
    head, table = read_xlrd(excel_file)

    # First- and second-level category columns (versions 2.0 and 3.0).
    category_suffixes = ("tag-2.0", "subtag-2.0", "tag-3.0", "subtag-3.0")
    # Tonality columns.
    tonality_suffixes = ("vulgar", "gossip", "clickbait", "advert")

    # Each step is (alg column, manual column, function producing the new
    # manual value). Columns are resolved up front so a malformed header
    # fails before any output is produced.
    steps = []
    for suffix in category_suffixes:
        steps.append((
            head.index("alg_" + suffix),
            head.index("manual_" + suffix),
            process_manual_category,
        ))
    for suffix in tonality_suffixes:
        steps.append((
            head.index("alg_" + suffix),
            head.index("manual_" + suffix),
            process_manual_tonality,
        ))

    # The context manager closes the file even if a row fails to process or
    # serialise.
    with open(outfile, "w", encoding="utf-8") as out:
        # Row 0 is skipped; data rows start at index 1.
        for row_num in range(1, table.nrows):
            row_value = table.row_values(row_num)
            for alg_col, manual_col, normalise in steps:
                row_value[manual_col] = normalise(
                    row_value[alg_col], row_value[manual_col])
            row_line = dict(zip(head, row_value))
            out.write(json.dumps(row_line, ensure_ascii=False) + "\n")
def calculate(file):
    """Compute classification metrics and add them to an Excel workbook.

    The category results returned by ``get_category_result`` are compared
    against the manual results with ``get_df_result``. The precision values are
    gathered into a summary sheet named ``precision``, followed by one report
    sheet per comparison, all passed to ``add_excel_sheet`` with a writer
    bound to ``./spot_check_result_26.xlsx``.

    Args:
        file: Workbook read through ``read_xlrd``.
    """
    head, table = read_xlrd(file)
    ai_pair, alg2_pair, alg3_pair, manual2_pair, manual3_pair = \
        get_category_result(head, table)
    ai_tag_result, ai_subtag_result = ai_pair
    alg_tag2_result, alg_subtag2_result = alg2_pair
    alg_tag3_result, alg_subtag3_result = alg3_pair
    manual_tag2_result, manual_subtag2_result = manual2_pair
    manual_tag3_result, manual_subtag3_result = manual3_pair

    file_path = "./spot_check_result_26.xlsx"
    # Start from an empty workbook at the target path, then open a writer on it.
    pd.DataFrame().to_excel(file_path)
    writer = pd.ExcelWriter(file_path, engine="openpyxl")
    # NOTE(review): the writer is never saved/closed in this function;
    # presumably add_excel_sheet persists the workbook - confirm.

    # Keys used inside each row of the precision summary. Both levels share
    # the same three keys.
    summary_keys = ("ai-tag", "alg-tag-2.0", "alg-tag-3.0")

    # (sheet name, manual reference, prediction). The ai results are scored
    # against the 2.0 manual labels.
    first_level = (
        ("ai-tag", manual_tag2_result, ai_tag_result),
        ("alg-tag-2.0", manual_tag2_result, alg_tag2_result),
        ("alg-tag-3.0", manual_tag3_result, alg_tag3_result),
    )
    second_level = (
        ("ai-subtag", manual_subtag2_result, ai_subtag_result),
        ("alg-subtag-2.0", manual_subtag2_result, alg_subtag2_result),
        ("alg-subtag-3.0", manual_subtag3_result, alg_subtag3_result),
    )

    report_sheets = []
    precision = {}
    for level_name, comparisons in (("一级分类", first_level),
                                    ("二级分类", second_level)):
        level_scores = {}
        for key, (sheet_name, reference, predicted) in zip(summary_keys,
                                                           comparisons):
            report_df, score = get_df_result(reference, predicted)
            report_sheets.append((sheet_name, report_df))
            level_scores[key] = score
        precision[level_name] = level_scores

    # Summary sheet first: one row per category level after the transpose.
    precision_df = pd.DataFrame(precision).T
    add_excel_sheet(precision_df, writer, "precision")

    # Then the per-comparison reports: first-level sheets, then second-level.
    for sheet_name, report_df in report_sheets:
        add_excel_sheet(report_df, writer, sheet_name)
def analysis_example(excel_file):
    """Print per-label check statistics from a reviewed workbook as JSON.

    Rows whose ``true/false`` cell is the empty string are ignored. For every
    other row, the statistics of its ``label`` are updated:

    * ``right_count``: sum of the integer ``true/false`` values.
    * ``predict_count``: rows whose ``manual_check_one`` value appears in the
      comma-separated ``predict_label`` cell.
    * ``all_count``: number of counted rows.
    * ``manual_label``: occurrences of each ``manual_check_one`` value.

    ``percent`` and ``predict_percent`` (right/predict count over all count,
    formatted with two decimals) are added before printing.

    Args:
        excel_file: Workbook read through ``read_xlrd``.

    Raises:
        ValueError: If an expected column is missing from the header, or a
            non-empty ``true/false`` cell cannot be converted to ``int``.
    """
    head, table = read_xlrd(excel_file)
    if_real_index = head.index("true/false")
    label_index = head.index("label")
    manual_label_index = head.index("manual_check_one")
    predict_label_index = head.index("predict_label")

    label_count = {}
    # Row 0 is skipped; data rows start at index 1.
    for row_num in range(1, table.nrows):
        row_value = table.row_values(row_num)
        if row_value[if_real_index] == "":
            continue
        if_real = int(row_value[if_real_index])
        label = row_value[label_index]
        manual_label = row_value[manual_label_index]
        predict_label = row_value[predict_label_index].split(",")

        stats = label_count.get(label)
        if stats is None:
            # Key order is kept as-is because it determines the order of the
            # fields in the printed JSON.
            stats = {
                "right_count": 0,
                "predict_count": 0,
                "all_count": 0,
                "manual_label": {},
            }
            label_count[label] = stats

        # One update path for both new and already-seen labels.
        stats["right_count"] += if_real
        stats["all_count"] += 1
        if manual_label in predict_label:
            stats["predict_count"] += 1
        manual_counts = stats["manual_label"]
        manual_counts[manual_label] = manual_counts.get(manual_label, 0) + 1

    # all_count is at least 1 for every stored label, so the divisions are safe.
    for stats in label_count.values():
        stats["percent"] = "%.2f%%" % (
            stats["right_count"] / stats["all_count"] * 100)
        stats["predict_percent"] = "%.2f%%" % (
            stats["predict_count"] / stats["all_count"] * 100)

    print(json.dumps(label_count, ensure_ascii=False, indent=4))