Esempio n. 1
0
def merge_all_sockshop():
    """Merge the four sockshop extract CSVs into one faulty-trace dataset.

    Reads the cpu/mem/instance/sequence extracts, concatenates them via
    preprocessing_set.append_data, marks unlabeled traces as "Success",
    keeps only faulty traces, shuffles, balances the classes on the fault
    service label and writes the result to "ss_tpds_total.csv".
    """
    part_files = [
        "ss/trace_y_config_cpu_sockshop_extract_1.csv",
        "ss/trace_y_config_mem_sockshop_extract_1.csv",
        "ss/trace_y_instance_sockshop_extract_1.csv",
        "ss/trace_y_sequence_sockshop_extract_1.csv",
    ]
    parts = [
        pd.read_csv(name, header=0, index_col=None) for name in part_files
    ]
    ss_total = parts[0]
    for part in parts[1:]:
        ss_total = preprocessing_set.append_data(ss_total, part)

    # Missing labels mean the trace completed without a fault.
    # Assign instead of fillna(..., inplace=True): the inplace form on a
    # column selection is deprecated in pandas 2.x (chained assignment).
    ss_total["y_issue_ms"] = ss_total["y_issue_ms"].fillna("Success")
    ss_total["y_issue_dim_type"] = ss_total["y_issue_dim_type"].fillna(
        "Success")

    # Keep only faulty traces.
    ss_total = ss_total.loc[ss_total["y_issue_dim_type"] != "Success"]

    ss_total = shuffle(ss_total)
    print("总数据量:", len(ss_total))

    # Balance the classes on the fault-service label.
    # NOTE(review): sampling semantics live in preprocessing_set —
    # presumed to re-balance per y_issue_ms class; confirm there.
    ss_total = preprocessing_set.sampling(ss_total, "y_issue_ms")

    ss_total.to_csv("ss_tpds_total.csv")
Esempio n. 2
0
def cross_validation_knn(df: DataFrame, y_name, n_neighbors, n_splits):
    """K-fold cross-validation for the multi-label KNN model.

    Args:
        df: full dataset including the label column ``y_name``.
        y_name: name of the label column to predict.
        n_neighbors: neighbour count forwarded to the KNN model.
        n_splits: number of folds.

    Returns:
        Mean accuracy over the ``n_splits`` folds.
    """
    folds = KFold(n_splits=n_splits, shuffle=True)
    accuracy = 0.0
    for train_indices, test_indices in folds.split(df):
        # Re-balance each fold before training/evaluation.
        # NOTE(review): sampling the *test* fold changes the evaluation
        # distribution — looks intentional, but worth confirming.
        train = preprocessing_set.sampling(df.iloc[train_indices], y_name)
        test = preprocessing_set.sampling(df.iloc[test_indices], y_name)
        fold_accuracy = knn_multi_label_provided_train_test_given_params(
            df_train=train,
            df_test=test,
            y_name=y_name,
            n_neighbors=n_neighbors)
        print(fold_accuracy)
        accuracy += fold_accuracy
    return accuracy / n_splits
Esempio n. 3
0
def evaluation_3():
    """Train a RandomForest pass/fail classifier on the production training
    set and evaluate it on ten per-fault production files (printed F0..F9).
    """
    file_list = [
        "production/f1/f1.csv", "production/f2/f2.csv", "production/f3/f3.csv",
        "production/f4/f4.csv", "production/f5/f5.csv", "production/f7/f7.csv",
        "production/f8/f8.csv", "production/f11/f11.csv",
        "production/f12/f12.csv", "production/f13/f13.csv"
    ]

    df_train = pd.read_csv("production/train_total.csv",
                           header=0,
                           index_col="trace_id")
    # Keep only traces with a definite pass(0)/fail(1) label; any other
    # value (e.g. unknown) is dropped.
    df_train = df_train.loc[df_train["y_final_result"].isin([0, 1])]
    # Drop labels that are not features of this binary classifier.
    df_train.pop("y_issue_ms")
    df_train.pop("y_issue_dim_type")

    print("train")
    df_train = preprocessing_set.sampling(df_train, "y_final_result")
    x, y = df_train, df_train.pop("y_final_result")

    clf = RandomForestClassifier(min_samples_leaf=6000, n_estimators=3)
    clf.fit(x, y)

    for i, file_name in enumerate(file_list):
        print("F", i)
        df_test = pd.read_csv(file_name, header=0, index_col="trace_id")
        df_test = df_test.loc[df_test["y_final_result"].isin([0, 1])]

        df_test.pop("y_issue_ms")
        df_test.pop("y_issue_dim_type")

        print("predict")
        df_test = preprocessing_set.sampling(df_test, "y_final_result")

        real_x, real_y = df_test, df_test.pop("y_final_result")

        pred_y = clf.predict(real_x)

        calculate(real_y, pred_y)
Esempio n. 4
0
def train_version_2():
    """Train and evaluate the multi-label KNN model on the prepared
    max-feature dataset, predicting the fault dimension type.
    """
    # Load the previously prepared dataset.
    df = pd.read_csv("ready_use_max_without_sampling.csv",
                     header=0,
                     index_col="trace_id")
    # Drop every label/meta column except the y_issue_dim_type target.
    for unused_column in ("y_final_result", "y_issue_ms", "trace_api",
                          "trace_service"):
        df.pop(unused_column)
    raw_train, df_test = preprocessing_set.split_data(df, 0.8)
    balanced_train = preprocessing_set.sampling(raw_train,
                                                "y_issue_dim_type")
    multi_label_model.knn_multi_label_provided_train_test(
        balanced_train, df_test, "y_issue_dim_type")
Esempio n. 5
0
def eval_1_part_2_ft_ts_ss():
    """Train a fault-type KNN classifier on the TrainTicket span data and
    evaluate fault-vs-success (LE) precision/recall/F1 on Sockshop spans.
    """
    df_ts = pd.read_csv("ts_model2_total.csv", header=0, index_col=0)
    df_ss = pd.read_csv("ss_model2_total.csv", header=0, index_col=0)

    # Keep only rows with a definite pass(0)/fail(1) final result.
    df_ts = df_ts.loc[df_ts["final_result"].isin([0, 1])]
    df_ss = df_ss.loc[df_ss["final_result"].isin([0, 1])]

    # Drop id/meta columns that must not leak into the features.
    for col in ("trace_id", "test_trace_id", "issue_ms"):
        df_ts.pop(col)
        df_ss.pop(col)
    df_ts.pop("final_result")  # training only needs the fault type
    df_ss.pop("final_result")  # must not be a feature; value unused here
    df_ts = preprocessing_set.sampling(df_ts, "issue_type")
    train_ft = df_ts.pop("issue_type")
    df_ss = preprocessing_set.sampling(df_ss, "issue_type")
    real_ft = df_ss.pop("issue_type")

    # Alternatives tried during tuning: MLPClassifier([3, 3], max_iter=200)
    # and RandomForestClassifier(min_samples_leaf=500, n_estimators=5).
    clf = KNeighborsClassifier(n_neighbors=570)

    # Train on TrainTicket, predict fault types for Sockshop.
    clf.fit(X=df_ts, y=train_ft)
    pred = clf.predict(X=df_ss)
    real_ft_value = real_ft.values

    # LE precision/recall/F1: any concrete fault type counts as "fault"
    # ([1, 0]); everything else (Success) counts as "no fault" ([0, 1]).
    fault_types = ("config", "seq", "instance")
    le_result_set = []
    le_real_set = []
    for real_label, pred_label in zip(real_ft_value, pred):
        le_real_set.append([1, 0] if real_label in fault_types else [0, 1])
        le_result_set.append([1, 0] if pred_label in fault_types else [0, 1])
    calculation.calculate_a_p_f_single_label(le_real_set, le_result_set)
Esempio n. 6
0
def big_model(tf_file_path, fault_file_path, model_2_file_path,
              test_trace_file_path, test_spans_file_path, ml_name):
    """Train four sub-models (LE, MS, FT, Model-2) and run the combined
    two-level prediction pipeline over a test trace set.

    LE predicts trace-level pass/fail, MS the faulty microservice, FT the
    fault dimension type; Model-2 is a span-level classifier used as a
    fallback when the trace-level LE prediction has low confidence.

    Args:
        tf_file_path: trace-feature CSV used to train the LE model.
        fault_file_path: faulty-trace CSV used to train MS and FT models.
        model_2_file_path: span-level CSV used to train Model-2.
        test_trace_file_path: trace-level test CSV.
        test_spans_file_path: span-level test CSV (trace ids match the
            trace-level test CSV).
        ml_name: "rf", "knn", or anything else for MLP; selects the
            classifier family used by all four sub-models.
    """

    # Classifier selection: one family for all four sub-models.
    clf_le = None
    clf_ms = None
    clf_ft = None
    clf_model2 = None
    if ml_name == "rf":
        print("Big Model", "RF")
        clf_le = RandomForestClassifier(min_samples_leaf=6000, n_estimators=3)
        clf_ms = RandomForestClassifier(min_samples_leaf=1200, n_estimators=5)
        clf_ft = RandomForestClassifier(min_samples_leaf=1500, n_estimators=3)
        clf_model2 = RandomForestClassifier(min_samples_leaf=500,
                                            n_estimators=3)
    elif ml_name == "knn":
        print("Big Model", "KNN")
        clf_le = KNeighborsClassifier(n_neighbors=200)
        clf_ms = KNeighborsClassifier(n_neighbors=200)
        clf_ft = KNeighborsClassifier(n_neighbors=200)
        clf_model2 = KNeighborsClassifier(n_neighbors=200)
    else:
        print("Big Model", "MLP")
        clf_le = MLPClassifier(hidden_layer_sizes=[5, 5], max_iter=200)
        clf_ms = MLPClassifier(hidden_layer_sizes=[5, 5], max_iter=200)
        clf_ft = MLPClassifier(hidden_layer_sizes=[5, 5], max_iter=200)
        clf_model2 = MLPClassifier(hidden_layer_sizes=[5, 5], max_iter=200)

    # Train the LE (pass/fail) model.
    print("LE Model训练开始")
    y_le = "y_final_result"
    df_tf_all = pd.read_csv(tf_file_path, header=0, index_col="trace_id")
    # Drop unused columns and keep only rows whose final_result is a
    # definite pass(0)/fail(1); unknown rows are discarded.
    df_tf_all.pop("y_issue_ms")
    df_tf_all.pop("y_issue_dim_type")
    df_tf_all.pop("trace_api")
    df_tf_all.pop("trace_service")
    df_tf_all = df_tf_all.loc[(df_tf_all["y_final_result"] == 0) |
                              (df_tf_all["y_final_result"] == 1)]
    df_tf_all = preprocessing_set.sampling(df_tf_all, "y_final_result")
    le_train_x, le_train_y = preprocessing_set.convert_y_multi_label_by_name(
        df_tf_all, y_le)
    # Fit the model.
    clf_le.fit(le_train_x, le_train_y)
    print("LE Model训练完毕")

    # Train the MS (faulty microservice) model.
    print("MS Model训练开始")
    y_ms = "y_issue_ms"
    df_fault_all_ms = pd.read_csv(fault_file_path,
                                  header=0,
                                  index_col="trace_id")
    # Lower-case the target service names, drop unused columns, and keep
    # only rows whose final_result marks a failure.
    df_fault_all_ms["y_issue_ms"] = df_fault_all_ms["y_issue_ms"].str.lower()
    df_fault_all_ms = df_fault_all_ms.loc[df_fault_all_ms["y_final_result"] ==
                                          1]
    df_fault_all_ms.pop("y_final_result")
    df_fault_all_ms.pop("y_issue_dim_type")
    df_fault_all_ms.pop("trace_api")
    df_fault_all_ms.pop("trace_service")
    ms_train_x, ms_train_y = preprocessing_set.convert_y_multi_label_by_name(
        df_fault_all_ms, y_ms)
    # Fit the model.
    clf_ms.fit(X=ms_train_x, y=ms_train_y)
    print("MS Model训练结束")

    # Train the FT (fault dimension type) model.
    print("FT Model训练开始")
    y_ft = "y_issue_dim_type"
    df_fault_all_ft = pd.read_csv(fault_file_path,
                                  header=0,
                                  index_col="trace_id")
    # Lower-case the fault types, drop unused columns, and keep only rows
    # whose final_result marks a failure.
    df_fault_all_ft["y_issue_dim_type"] = df_fault_all_ft[
        "y_issue_dim_type"].str.lower()
    df_fault_all_ft = df_fault_all_ft.loc[df_fault_all_ft["y_final_result"] ==
                                          1]
    df_fault_all_ft.pop("y_final_result")
    df_fault_all_ft.pop("y_issue_ms")
    df_fault_all_ft.pop("trace_api")
    df_fault_all_ft.pop("trace_service")
    ft_train_x, ft_train_y = preprocessing_set.convert_y_multi_label_by_name(
        df_fault_all_ft, y_ft)
    # Fit the model.
    clf_ft.fit(X=ft_train_x, y=ft_train_y)
    print("FT Model训练结束")

    # Train Model-2 — uses span-level (MS) data only.
    print("Model2 Model训练开始")
    y_model2 = "issue_type"
    df_model2_all = pd.read_csv(model_2_file_path, header=0, index_col=None)
    # Drop unused columns.
    df_model2_all.pop("issue_ms")
    df_model2_all.pop("trace_id")
    df_model2_all.pop("test_trace_id")
    df_model2_all.pop("final_result")
    df_model2_all = preprocessing_set.sampling(df_model2_all, y_model2)
    model2_train_x, model2_train_y = preprocessing_set.convert_y_multi_label_by_name(
        df_model2_all, y_model2)
    # Fit the model.
    clf_model2.fit(X=model2_train_x, y=model2_train_y)
    print("Model2 Model训练结束")

    print("四个小模型训练完成,开始进行测试集读取,每条测试集需要抽取")

    # ====================== prediction part
    # Collections for the final prediction result sets.
    le_test_result = []
    ms_test_result = []
    ft_test_result = []

    # Read the test data; the real final_result, ms and dim_type labels
    # are separated out below.
    df_test_trace = pd.read_csv(test_trace_file_path, header=0, index_col=0)

    # Lower-case inconsistently cased service names and fault types up
    # front to avoid trouble later.
    df_test_trace["y_issue_ms"] = df_test_trace["y_issue_ms"].str.lower()
    df_test_trace["y_issue_dim_type"] = df_test_trace[
        "y_issue_dim_type"].str.lower()

    # df_test_trace = preprocessing_set.sampling(df_test_trace, "y_issue_ms")
    # df_test_trace = shuffle(df_test_trace)
    # print("测试集维度分布", df_test_trace["y_issue_dim_type"].value_counts())

    # Keep the real faulty-ms labels for the later P/R/F1 statistics and
    # drop columns that are not used as features.
    real_ms = df_test_trace.pop("y_issue_ms")
    df_test_trace.pop("trace_api")
    df_test_trace.pop("trace_service")

    # Select faulty traces only (instead of keeping all traces).
    df_test_trace = df_test_trace.loc[(df_test_trace["y_final_result"] == 1)]

    df_test_trace, real_dim_type = preprocessing_set.convert_y_multi_label_by_name(
        df_test_trace, "y_issue_dim_type")
    df_test_trace, real_result = preprocessing_set.convert_y_multi_label_by_name(
        df_test_trace, "y_final_result")

    # Read the Model-2 test data. Its trace ids match the trace-level test
    # data read above — these rows are just the spans each trace splits
    # into.
    df_test_spans = pd.read_csv(test_spans_file_path, header=0, index_col=None)
    df_test_spans.pop("issue_type")
    df_test_spans.pop("test_trace_id")
    df_test_spans.pop("final_result")

    # Statistics: how many traces model_1/model_2 each handled, and how
    # many samples top1/top3/top5 hit.
    model_2_count = 0
    model_1_count = 0
    count_top1 = 0
    count_top3 = 0
    count_top5 = 0

    # Record all trace indexes for later lookup and extraction.
    indexs = df_test_trace.index.tolist()
    # The list of trace_ids present in the Model-2 span table.
    spans_indexs = df_test_spans["trace_id"].tolist()

    # Predict each test trace in turn (the LE result decides which
    # prediction model is used).
    for temp_trace_index in indexs:
        # Extract the single trace to predict.
        temp_trace = df_test_trace.loc[temp_trace_index, :]
        temp_trace = [temp_trace]
        # Predict the LE result of this trace, plus its confidence.
        temp_trace_result = clf_le.predict(temp_trace)
        temp_trace_proba = clf_le.predict_proba(temp_trace)
        # If the trace-level LE confidence is below a threshold, fall back
        # to Model-2; otherwise use the Model-1 sub-models.

        # The condition below decides Model-1 vs Model-2 for this trace.
        # [NOTE] MLP's proba output differs from the others: MLP gives
        # [0.2, 0.8] while RF/KNN give [[0.1, 0.9], [0.8, 0.2]].
        # [NOTE] RF/KNN should use the condition below.
        # NOTE(review): `and` binds tighter than `or`, so the final
        # `or`-clause does NOT require temp_trace_index to be present in
        # spans_indexs — confirm whether extra parentheses were intended.
        if spans_indexs.__contains__(temp_trace_index)\
                and (temp_trace_result[0][0] == 0 and temp_trace_result[0][1] == 1 and temp_trace_proba[1][0][1] < 0.7) \
                or (temp_trace_result[0][0] == 1 and temp_trace_result[0][1] == 0 and temp_trace_proba[0][0][1] < 0.7):
            # [NOTE] MLP should use the condition below instead.
            # if spans_indexs.__contains__(temp_trace_index)\
            #         and (temp_trace_result[0][0] == 0 and temp_trace_result[0][1] == 1 and temp_trace_proba[0][1] < 0.1) \
            #         or (temp_trace_result[0][0] == 1 and temp_trace_result[0][1] == 0 and temp_trace_proba[0][0] < 0.1):

            # Pull out the group of spans belonging to this trace_id.
            spans_set = df_test_spans.loc[df_test_spans["trace_id"] ==
                                          temp_trace_index]
            spans_set.pop("trace_id")
            # Record the faulty microservice of each extracted span.
            # NOTE(review): `.iloc` with a string label raises TypeError in
            # pandas — this was presumably meant to be
            # `spans_set["issue_ms"]`; confirm whether this line is ever
            # reached with current data.
            span_set_ms_raw = spans_set.iloc["issue_ms"]
            # Buffers for each span's result, aggregated/converted later.
            span_set_dim_result_collect = []
            span_set_dim_confidence_collect = []
            # Predict and store the result of every span.
            spans_set_size = len(spans_set)
            for i in range(spans_set_size):
                temp_span = spans_set.iloc[i]
                temp_span = [temp_span]
                temp_span_result = clf_model2.predict(temp_span)
                temp_span_proba = clf_model2.predict_proba(temp_span)
                # print("temp_span_result", temp_span_result)
                # print("temp_span_proba", temp_span_proba)
                span_set_dim_result_collect.append(temp_span_result[0])
                # [NOTE] The commented block below is MLP-specific.
                # span_set_dim_confidence_collect.append([
                #     [1 - temp_span_proba[0][0], temp_span_proba[0][0]],
                #     [1 - temp_span_proba[0][1], temp_span_proba[0][1]],
                #     [1 - temp_span_proba[0][2], temp_span_proba[0][2]]
                # ])
                # [NOTE] RF/KNN-specific.
                span_set_dim_confidence_collect.append([
                    temp_span_proba[0][0], temp_span_proba[1][0],
                    temp_span_proba[2][0]
                ])
            # Compute the final result. 1: compute LE.
            # Every span of this trace has been predicted; record whether
            # any span is faulty and which spans were marked faulty.
            temp_trace_model2_le = True
            temp_trace_model2_fault_span_record = []
            for i in range(spans_set_size):
                if span_set_dim_result_collect[i][0] != 0 \
                    or span_set_dim_result_collect[i][1] != 0 \
                        or span_set_dim_result_collect[i][2] != 0:
                    temp_trace_model2_le = False
                    temp_trace_model2_fault_span_record.append(i)
            # If any span is faulty, predict the fault type and the faulty
            # microservice, recording each faulty span's microservice and
            # confidence along the way.
            temp_trace_model_2_ms_set = np.zeros(42)  # per-service fault confidence
            if not temp_trace_model2_le:
                # Some spans failed, so the whole trace is faulty; append
                # a "faulty" conclusion to the LE results.
                le_test_result.append([0, 1])
                # Compute this trace's DIM_TYPE: pick the span with the
                # highest confidence and append its fault-type prediction.
                temp_trace_model_2_max_index = -1
                temp_trace_model_2_max_confidence = -1.0
                for i in temp_trace_model2_fault_span_record:
                    # NOTE(review): the third term mixes indices
                    # ([i][1][0] vs [i][2][1]) unlike the first two terms —
                    # possibly a typo for [i][2][0]/[i][2][1]; confirm.
                    temp_confidence = max(span_set_dim_confidence_collect[i][0][0],
                                          span_set_dim_confidence_collect[i][0][1]) \
                                      + max(span_set_dim_confidence_collect[i][1][0],
                                            span_set_dim_confidence_collect[i][1][1]) \
                                      + max(span_set_dim_confidence_collect[i][1][0],
                                            span_set_dim_confidence_collect[i][2][1])
                    # Record the confidence of the microservice this
                    # faulty span belongs to.
                    local_ms_index = preprocessing_set.service_index_map.get(
                        span_set_ms_raw[i])
                    temp_trace_model_2_ms_set[local_ms_index] = max(
                        temp_trace_model_2_ms_set[local_ms_index],
                        temp_confidence)
                    # Then keep looking for the maximal dim_type.
                    if temp_confidence > temp_trace_model_2_max_confidence:
                        temp_trace_model_2_max_index = i
                        temp_trace_model_2_max_confidence = temp_confidence
                ft_test_result.append(
                    span_set_dim_result_collect[temp_trace_model_2_max_index])
                # The highest-confidence one is also the final predicted
                # faulty microservice; write it into the conclusions.
                # NOTE(review): elsewhere convert_y_multi_label_by_name is
                # unpacked into (x, y); here the raw return value is
                # indexed directly — verify the intended element.
                spans_set_ms_set = preprocessing_set.convert_y_multi_label_by_name(
                    spans_set, "issue_ms")
                ms_test_result.append(
                    spans_set_ms_set[temp_trace_model_2_max_index])
                # Update the TOP1/TOP3/TOP5 statistics.
                top1, top3, top5 = tryTopKMS(
                    temp_trace_model_2_ms_set,
                    real_ms[(model_2_count + model_1_count)])
                if top1:
                    count_top1 += 1
                if top3:
                    count_top3 += 1
                if top5:
                    count_top5 += 1

            else:
                # No span failed, so the whole trace is considered
                # correct; emit "no fault" conclusions.
                le_test_result.append([1.0, 0.0])
                ms_test_result.append(np.zeros(42))
                ft_test_result.append([0.0, 0.0, 0.0])
                # Update top1/top3/top5: "no fault" was predicted here, so
                # if the trace truly had no fault all three tops count a
                # hit; otherwise the prediction is simply wrong — ignore.
                real_svc = real_ms[(model_2_count + model_1_count)]
                if real_svc == "success":
                    count_top1 += 1
                    count_top3 += 1
                    count_top5 += 1

            # Update the counter.
            model_2_count += 1
        else:

            # if temp_trace_result[0][0] == 0 and temp_trace_result[0][1] == 1:
            #     le_test_result.append(temp_trace_result[0])
            #     ms_test_result.append(np.zeros(42))
            #     ft_test_result.append([0, 0, 0])
            # else:
            ms_pred_result = clf_ms.predict(temp_trace)
            ms_proba = clf_ms.predict_proba(temp_trace)

            ft_pred_result = clf_ft.predict(temp_trace)

            le_test_result.append(temp_trace_result[0])
            ms_test_result.append(ms_pred_result[0])
            ft_test_result.append(ft_pred_result[0])

            # Update some statistics.
            # [NOTE] RF/KNN-specific: their proba output has one more
            # level of nesting than MLP's, so a helper unwraps it.
            ms_proba = convert_to_proba_list(ms_proba)
            top1, top3, top5 = tryTopKMS(
                ms_proba, real_ms[(model_2_count + model_1_count)])
            # [NOTE] MLP-specific:
            # top1, top3, top5 = tryTopKMS(ms_proba[0], real_ms[(model_2_count+model_1_count)])

            if top1:
                count_top1 += 1
            if top3:
                count_top3 += 1
            if top5:
                count_top5 += 1
            model_1_count += 1

    # Print the final statistics.
    print("使用Model-1的Trace数量", model_1_count, "使用Model-2的Trace数量",
          model_2_count)
    print("Trace错误种类统计值")
    calculation.calculate_a_p_r_f(real_dim_type, ft_test_result, 3)
    calculation.calculate_a_p_r_f(real_result, le_test_result, 2)
    print("MS Top1 Accuracy", count_top1 / (model_1_count + model_2_count))
    print("MS Top3 Accuracy", count_top3 / (model_1_count + model_2_count))
    print("MS Top5 Accuracy", count_top5 / (model_1_count + model_2_count))