def calculate_index(file, topcategory_pr_file):
    """
    Compute the micro-averaged precision and recall of machine labelling.

    Loads the sheet from ``file``, normalises the algorithm labels and the
    "diting" labels (used here as the reference / true labels) for both the
    first and second category level, counts the individual true first-level
    labels, prints that count and forwards it to
    ``calculate_topcategory_pr`` together with ``topcategory_pr_file``.

    Rows are read starting at index 1 (row 0 is presumably the header).
    """
    header, sheet = read_xlrd(file)
    pred_top_col = header.index("algFirstCategory")
    pred_sub_col = header.index("algSecondCategory")
    true_top_col = header.index("ditingFirstCategory")
    true_sub_col = header.index("ditingSecondCategory")

    top_label_true, top_label_pred = [], []
    sub_label_true, sub_label_pred = [], []
    # (source column, destination list), in the order each row is processed
    targets = (
        (pred_top_col, top_label_pred),
        (true_top_col, top_label_true),
        (pred_sub_col, sub_label_pred),
        (true_sub_col, sub_label_true),
    )
    for row_idx in range(1, sheet.nrows):
        cells = sheet.row_values(row_idx)
        for col, bucket in targets:
            bucket.append(get_standard_label(cells[col]))

    # The reports are built for both levels but their results are not used here
    get_sorted_report(top_label_true, top_label_pred)
    get_sorted_report(sub_label_true, sub_label_pred)

    # A true label may hold several comma-separated values; count each
    # non-empty one separately
    true_label_count = Counter()
    for joined in top_label_true:
        for single in joined.split(","):
            if single != "":
                true_label_count[single] += 1
    print(true_label_count)
    calculate_topcategory_pr(topcategory_pr_file, true_label_count)
def calculate_topcategory_pr(file, true_label_count):
    """
    Print micro-averaged precision and recall from a per-category P/R sheet.

    Args:
        file: Spreadsheet handed to ``read_xlrd``; its header must contain
            the columns ``name``, ``p`` and ``r``.
        true_label_count: Per-label counts, forwarded together with the
            ``{name: (p, r)}`` table to ``_calculate_micro_precision`` and
            ``_calculate_micro_recall``.

    Raises:
        ValueError: If a ``p`` / ``r`` cell cannot be converted to float, or
            (assuming ``head`` is a list) a required column is missing.
    """
    head, table = read_xlrd(file)
    # Resolve the column positions once, before the loop, rather than
    # searching the header again for every row
    name_index = head.index("name")
    p_index = head.index("p")
    r_index = head.index("r")

    topcategory_dict = dict()
    # Start at row 1; row 0 is presumably the header row
    for row_num in range(1, table.nrows):
        row_value = table.row_values(row_num)
        topcategory_dict[row_value[name_index]] = (
            float(row_value[p_index]),
            float(row_value[r_index]),
        )
    print(_calculate_micro_precision(topcategory_dict, true_label_count))
    print(_calculate_micro_recall(topcategory_dict, true_label_count))
def write_validation_set_to_file(excel_file, outfile):
    """
    Convert spot-check records to text, one JSON object per line.

    For every data row the manual category and manual tonality cells are
    replaced by the result of ``process_manual_category`` /
    ``process_manual_tonality`` (each called with the algorithm value and
    the manual value), then the row is written as ``{header: value}`` JSON.

    Args:
        excel_file: Spreadsheet handed to ``read_xlrd``.
        outfile: Path of the UTF-8 text file to (over)write.
    """
    head, table = read_xlrd(excel_file)
    # First- and second-level categories: (algorithm column, manual column)
    category_columns = [
        (head.index("alg_tag-2.0"), head.index("manual_tag-2.0")),
        (head.index("alg_subtag-2.0"), head.index("manual_subtag-2.0")),
        (head.index("alg_tag-3.0"), head.index("manual_tag-3.0")),
        (head.index("alg_subtag-3.0"), head.index("manual_subtag-3.0")),
    ]
    # Tonality classifications: (algorithm column, manual column)
    tonality_columns = [
        (head.index("alg_vulgar"), head.index("manual_vulgar")),
        (head.index("alg_gossip"), head.index("manual_gossip")),
        (head.index("alg_clickbait"), head.index("manual_clickbait")),
        (head.index("alg_advert"), head.index("manual_advert")),
    ]

    # The context manager closes the file even if processing a row raises
    with open(outfile, "w", encoding="utf-8") as out:
        # Start at row 1; row 0 is presumably the header row
        for row_num in range(1, table.nrows):
            row_value = table.row_values(row_num)
            for alg_index, manual_index in category_columns:
                row_value[manual_index] = process_manual_category(
                    row_value[alg_index], row_value[manual_index])
            for alg_index, manual_index in tonality_columns:
                row_value[manual_index] = process_manual_tonality(
                    row_value[alg_index], row_value[manual_index])

            row_line = dict(zip(head, row_value))
            out.write(json.dumps(row_line, ensure_ascii=False) + "\n")
def calculate(file, file_path="./spot_check_result_26.xlsx"):
    """
    Compute classification metrics and write them to an Excel workbook.

    The ai-tag predictions are scored against the manual 2.0 labels; the
    alg 2.0 / 3.0 predictions are scored against the manual labels of the
    same version. One ``precision`` summary sheet is written, followed by
    one sheet per first-level and per second-level result.

    Args:
        file: Spreadsheet handed to ``read_xlrd``.
        file_path: Destination workbook. Defaults to the previously
            hard-coded ``./spot_check_result_26.xlsx``.
    """
    head, table = read_xlrd(file)
    (
        (ai_tag_result, ai_subtag_result),
        (alg_tag2_result, alg_subtag2_result),
        (alg_tag3_result, alg_subtag3_result),
        (manual_tag2_result, manual_subtag2_result),
        (manual_tag3_result, manual_subtag3_result),
    ) = get_category_result(head, table)

    # Create an empty Excel file first, then open a writer on the same path
    nan_excel = pd.DataFrame()
    nan_excel.to_excel(file_path)
    # NOTE(review): the writer is never closed in this function; presumably
    # add_excel_sheet persists each sheet itself - confirm.
    writer = pd.ExcelWriter(file_path, engine="openpyxl")

    # Each call yields (report dataframe, precision value)
    ai_tag_df, ai_precision = get_df_result(manual_tag2_result, ai_tag_result)
    alg_tag2_df, alg_tag2_precision = get_df_result(manual_tag2_result, alg_tag2_result)
    alg_tag3_df, alg_tag3_precision = get_df_result(manual_tag3_result, alg_tag3_result)
    ai_subtag_df, ai_subtag_precision = get_df_result(manual_subtag2_result, ai_subtag_result)
    alg_subtag2_df, alg_subtag2_precision = get_df_result(manual_subtag2_result, alg_subtag2_result)
    alg_subtag3_df, alg_subtag3_precision = get_df_result(manual_subtag3_result, alg_subtag3_result)

    # Classification precision: one row per category level after transposing
    precision = {
        "一级分类": {
            "ai-tag": ai_precision,
            "alg-tag-2.0": alg_tag2_precision,
            "alg-tag-3.0": alg_tag3_precision,
        },
        "二级分类": {
            "ai-tag": ai_subtag_precision,
            "alg-tag-2.0": alg_subtag2_precision,
            "alg-tag-3.0": alg_subtag3_precision,
        },
    }
    precision_df = pd.DataFrame(precision).T
    add_excel_sheet(precision_df, writer, "precision")
    # First-level classification reports
    add_excel_sheet(ai_tag_df, writer, "ai-tag")
    add_excel_sheet(alg_tag2_df, writer, "alg-tag-2.0")
    add_excel_sheet(alg_tag3_df, writer, "alg-tag-3.0")
    # Second-level classification reports
    add_excel_sheet(ai_subtag_df, writer, "ai-subtag")
    add_excel_sheet(alg_subtag2_df, writer, "alg-subtag-2.0")
    add_excel_sheet(alg_subtag3_df, writer, "alg-subtag-3.0")
# Example #5
# 0
def analysis_example(excel_file):
    """
    Print per-label spot-check statistics as indented JSON.

    Rows whose ``true/false`` cell is empty are skipped. For every value of
    the ``label`` column the following are accumulated:

    * ``right_count``   - sum of the ``true/false`` values (cast to int)
    * ``predict_count`` - rows whose ``manual_check_one`` value is among the
      comma-separated ``predict_label`` values
    * ``all_count``     - number of rows counted for the label
    * ``manual_label``  - occurrences of each ``manual_check_one`` value

    ``percent`` and ``predict_percent`` are then derived as
    ``right_count / all_count`` and ``predict_count / all_count``.

    Args:
        excel_file: Spreadsheet handed to ``read_xlrd``.
    """
    head, table = read_xlrd(excel_file)
    if_real_index = head.index("true/false")
    label_index = head.index("label")
    manual_label_index = head.index("manual_check_one")
    predict_label_index = head.index("predict_label")

    label_count = dict()
    # Start at row 1; row 0 is presumably the header row
    for row_num in range(1, table.nrows):
        row_value = table.row_values(row_num)
        if row_value[if_real_index] == "":
            continue
        if_real = int(row_value[if_real_index])
        label = row_value[label_index]
        manual_label = row_value[manual_label_index]
        predict_label = row_value[predict_label_index].split(",")

        stats = label_count.get(label)
        if stats is None:
            # Single initialisation path; the key order here fixes the key
            # order of the printed JSON
            stats = label_count[label] = {
                "right_count": 0,
                "predict_count": 0,
                "all_count": 0,
                "manual_label": dict(),
            }

        stats["right_count"] += if_real
        stats["all_count"] += 1
        if manual_label in predict_label:
            stats["predict_count"] += 1
        manual_counts = stats["manual_label"]
        manual_counts[manual_label] = manual_counts.get(manual_label, 0) + 1

    # all_count is at least 1 for every stored label, so the divisions are safe
    for stats in label_count.values():
        stats["percent"] = "%.2f%%" % (stats["right_count"] / stats["all_count"] * 100)
        stats["predict_percent"] = "%.2f%%" % (stats["predict_count"] / stats["all_count"] * 100)
    print(json.dumps(label_count, ensure_ascii=False, indent=4))