Ejemplo n.º 1
0
def preprocessiong_sockshop_model2_data():
    """Assemble the Sock Shop model-2 dataset and save it as a CSV.

    The config, instance and sequence exports are read, chained together
    with ``preprocessing_set.append_data`` and passed through
    ``preprocessing_set.drop_na_data``.  The ``issue_content`` column is then
    removed, the ``issue_ms`` / ``issue_type`` columns are lower-cased, and
    the model-2 fill and convert helpers are applied before the result is
    written to ``ss_model2_total.csv`` in the working directory.
    """
    config_csv, instance_csv, sequence_csv = (
        "data_for_big_model/sock_shop_model2/sock_shop_config_model2.csv",
        "data_for_big_model/sock_shop_model2/sock_shop_instance_model2.csv",
        "data_for_big_model/sock_shop_model2/sock_shop_sequence_model2.csv",
    )
    config_df = pd.read_csv(config_csv, header=0, index_col=None)
    instance_df = pd.read_csv(instance_csv, header=0, index_col=None)
    sequence_df = pd.read_csv(sequence_csv, header=0, index_col=None)

    # Chain the three sources: (config + instance) + sequence.
    combined = preprocessing_set.append_data(
        preprocessing_set.append_data(config_df, instance_df),
        sequence_df,
    )
    combined = preprocessing_set.drop_na_data(combined)

    # Report how many columns are left at this point.
    column_count = len(combined.keys())
    print(column_count)

    # ``issue_content`` is discarded before the label columns are normalised.
    combined.pop("issue_content")
    for label_column in ("issue_ms", "issue_type"):
        combined[label_column] = combined[label_column].str.lower()

    combined = preprocessing_set.fill_empty_data_model2(combined)
    combined = preprocessing_set.convert_data_model2(combined)
    combined.to_csv("ss_model2_total.csv")
Ejemplo n.º 2
0
def preprocessing_model_2():
    """Assemble the Train Ticket model-2 dataset and save it as a CSV.

    The config, instance and sequence exports are read, chained together
    with ``preprocessing_set.append_data`` and passed through
    ``preprocessing_set.drop_na_data``.  The ``issue_content`` column is then
    removed and the model-2 fill and convert helpers are applied before the
    result is written to ``ts_model2_total.csv`` in the working directory.
    """
    config_csv, instance_csv, sequence_csv = (
        "data_for_big_model/train_ticket_model2/model2_config.csv",
        "data_for_big_model/train_ticket_model2/model2_inst.csv",
        "data_for_big_model/train_ticket_model2/model2_seq.csv",
    )
    config_df = pd.read_csv(config_csv, header=0, index_col=None)
    instance_df = pd.read_csv(instance_csv, header=0, index_col=None)
    sequence_df = pd.read_csv(sequence_csv, header=0, index_col=None)

    # Chain the three sources: (config + instance) + sequence.
    combined = preprocessing_set.append_data(
        preprocessing_set.append_data(config_df, instance_df),
        sequence_df,
    )
    combined = preprocessing_set.drop_na_data(combined)

    # Report how many columns are left at this point.
    column_count = len(combined.keys())
    print(column_count)

    # NOTE: preprocessing_set.drop_all_same_data is intentionally not applied
    # here (the call was disabled in the original pipeline).

    combined.pop("issue_content")

    combined = preprocessing_set.fill_empty_data_model2(combined)
    combined = preprocessing_set.convert_data_model2(combined)
    combined.to_csv("ts_model2_total.csv")
Ejemplo n.º 3
0
def merge_all_sockshop():
    """Merge the Sock Shop trace extracts into ``ss_tpds_total.csv``.

    Steps:
      1. Read the four ``ss/trace_y_*_sockshop_extract_1.csv`` files and chain
         them with ``preprocessing_set.append_data``.
      2. Replace missing ``y_issue_ms`` / ``y_issue_dim_type`` labels with
         ``"Success"``.
      3. Keep only rows whose ``y_issue_dim_type`` is not ``"Success"``.
      4. Shuffle, print the row count, and pass the frame through
         ``preprocessing_set.sampling`` keyed on ``y_issue_ms``
         (presumably class balancing -- confirm against that helper).
      5. Write the result to ``ss_tpds_total.csv`` in the working directory.
    """
    part_paths = (
        "ss/trace_y_config_cpu_sockshop_extract_1.csv",
        "ss/trace_y_config_mem_sockshop_extract_1.csv",
        "ss/trace_y_instance_sockshop_extract_1.csv",
        "ss/trace_y_sequence_sockshop_extract_1.csv",
    )
    parts = [pd.read_csv(path, header=0, index_col=None) for path in part_paths]

    ss_total = parts[0]
    for part in parts[1:]:
        ss_total = preprocessing_set.append_data(ss_total, part)

    # Assign the filled column back instead of calling
    # ``ss_total[col].fillna(..., inplace=True)``: the in-place form operates
    # on a temporary Series and, with pandas Copy-on-Write, leaves the frame
    # untouched, so unlabeled rows would survive the "Success" filter below.
    for label_column in ("y_issue_ms", "y_issue_dim_type"):
        ss_total[label_column] = ss_total[label_column].fillna("Success")

    ss_total = ss_total.loc[ss_total["y_issue_dim_type"] != "Success"]

    ss_total = shuffle(ss_total)
    print("总数据量:", len(ss_total))

    # NOTE: rows with y_issue_dim_type == "unknown" are deliberately kept
    # (the filter for them was disabled in the original pipeline).

    ss_total = preprocessing_set.sampling(ss_total, "y_issue_ms")

    ss_total.to_csv("ss_tpds_total.csv")
Ejemplo n.º 4
0
def merge_all():
    """Merge the Train Ticket trace extracts into ``ts_tpds_total.csv``.

    Steps:
      1. Read the six ``ts/trace_y_*_extract_*.csv`` files and chain them with
         ``preprocessing_set.append_data``.
      2. Replace missing ``y_issue_ms`` / ``y_issue_dim_type`` labels with
         ``"Success"``.
      3. Keep only rows whose ``y_issue_dim_type`` is not ``"Success"``.
      4. Shuffle, print the row count, and write the result to
         ``ts_tpds_total.csv`` in the working directory.
    """
    part_paths = (
        "ts/trace_y_config_cpu_extract_1.csv",
        "ts/trace_y_config_memory_extract_1.csv",
        "ts/trace_y_instance_extract_1.csv",
        "ts/trace_y_instance_extract_2.csv",
        "ts/trace_y_sequence_extract_1.csv",
        "ts/trace_y_sequence_extract_2.csv",
    )
    parts = [pd.read_csv(path, header=0, index_col=None) for path in part_paths]

    ts_total = parts[0]
    for part in parts[1:]:
        ts_total = preprocessing_set.append_data(ts_total, part)

    # Assign the filled column back instead of calling
    # ``ts_total[col].fillna(..., inplace=True)``: the in-place form operates
    # on a temporary Series and, with pandas Copy-on-Write, leaves the frame
    # untouched, so unlabeled rows would survive the "Success" filter below.
    for label_column in ("y_issue_ms", "y_issue_dim_type"):
        ts_total[label_column] = ts_total[label_column].fillna("Success")

    ts_total = ts_total.loc[ts_total["y_issue_dim_type"] != "Success"]

    # NOTE: preprocessing_set.sampling on "y_issue_ms" is intentionally not
    # applied to this dataset (the call was disabled in the original).
    ts_total = shuffle(ts_total)
    print("总数据量:", len(ts_total))
    ts_total.to_csv("ts_tpds_total.csv")
Ejemplo n.º 5
0
def preprocessing_sockshop_model1_data():
    """Prepare the Sock Shop model-1 dataset and save it as a CSV.

    Reads the four ``sockshop_data/trace_verified_*_sock_combined.csv`` files
    indexed by ``trace_id``, chains them with
    ``preprocessing_set.append_data``, filters the result through
    ``get_min_data``, lower-cases the two label columns, then fills and
    converts the values via ``preprocessing_set`` before writing
    ``sockshop_data/ss_total_mms.csv``.
    """
    # Every input file uses the trace_id column as its row index.
    source_paths = (
        "sockshop_data/trace_verified_config_cpu_sock_combined.csv",
        "sockshop_data/trace_verified_config_sock_combined.csv",
        "sockshop_data/trace_verified_instance_sock_combined.csv",
        "sockshop_data/trace_verified_sequence_sock_combined.csv",
    )

    # Load all inputs first, then chain them together in file order.
    frames = [
        pd.read_csv(path, header=0, index_col="trace_id")
        for path in source_paths
    ]
    merged = frames[0]
    for extra in frames[1:]:
        merged = preprocessing_set.append_data(merged, extra)

    # Row/column selection step.
    print("select")
    merged = get_min_data(merged)

    # Normalise both label columns to lower case.
    for label_column in ("y_issue_ms", "y_issue_dim_type"):
        merged[label_column] = merged[label_column].str.lower()

    # Fill in missing values.
    print("fill")
    merged = preprocessing_set.fill_empty_data(merged)

    # Turn irregular values into numbers.
    print("convert")
    merged = preprocessing_set.convert_data(merged)

    # NOTE: the data is deliberately not shuffled here.

    print("处理完的数据有这么多:", len(merged))

    # Persist the processed dataset.
    merged.to_csv("sockshop_data/ss_total_mms.csv")
Ejemplo n.º 6
0
def preprocessing_sockshop_model1_data():
    """Prepare the Sock Shop model-1 dataset and save it as a CSV.

    The four ``trace_verified_*_sock_combined.csv`` inputs are loaded with
    ``trace_id`` as index, appended in order, reduced by ``get_min_data``,
    label-normalised to lower case, filled and converted through
    ``preprocessing_set``, and finally written to
    ``sockshop_data/ss_total_mms.csv``.
    """
    read_options = {"header": 0, "index_col": "trace_id"}

    config_cpu = pd.read_csv(
        "sockshop_data/trace_verified_config_cpu_sock_combined.csv",
        **read_options)
    config = pd.read_csv(
        "sockshop_data/trace_verified_config_sock_combined.csv",
        **read_options)
    instance = pd.read_csv(
        "sockshop_data/trace_verified_instance_sock_combined.csv",
        **read_options)
    sequence = pd.read_csv(
        "sockshop_data/trace_verified_sequence_sock_combined.csv",
        **read_options)

    dataset = preprocessing_set.append_data(config_cpu, config)
    for remaining in (instance, sequence):
        dataset = preprocessing_set.append_data(dataset, remaining)

    print("select")
    dataset = get_min_data(dataset)

    dataset["y_issue_ms"] = dataset["y_issue_ms"].str.lower()
    dataset["y_issue_dim_type"] = dataset["y_issue_dim_type"].str.lower()

    print("fill")
    # Fill in missing values.
    dataset = preprocessing_set.fill_empty_data(dataset)

    print("convert")
    # Turn irregular values into numbers.
    dataset = preprocessing_set.convert_data(dataset)

    # NOTE: the data is deliberately not shuffled here.

    row_count = len(dataset)
    print("总数据:", row_count)

    dataset.to_csv("sockshop_data/ss_total_mms.csv")
Ejemplo n.º 7
0
def preprocessing():
    """Build the joined training dataset from the ``110/`` CSV exports.

    Each of the three sources (sequence, instance, config) provides three
    files: the verified trace table, the seq-seq table and the seq-caller
    table.  For every table kind the three sources are read with ``trace_id``
    as index, appended in order and filtered by ``get_min_data``.  The three
    resulting frames are joined with ``preprocessing_set.merge_data``, then
    filled and converted, and written to
    ``ready_use_max_without_sampling_mms.csv``.
    """
    # One tuple per source; positions are (trace, seq-seq, seq-caller).
    sequence_files = (
        "110/trace_verified_sequence.csv", "110/seq_seq_sequence.csv",
        "110/seq_caller_sequence.csv"
    )
    instance_files = (
        "110/trace_verified_instance3.csv", "110/seq_seq_instance3.csv",
        "110/seq_caller_instance3.csv"
    )
    config_files = (
        "110/trace_verified_config_2.csv", "110/seq_seq_config.csv",
        "110/seq_caller_config.csv"
    )
    sources = (sequence_files, instance_files, config_files)

    # For each table kind: read all three sources, append them, then filter.
    per_kind = []
    for kind_position in range(3):
        loaded = [
            pd.read_csv(files[kind_position], header=0, index_col="trace_id")
            for files in sources
        ]
        stacked = preprocessing_set.append_data(loaded[0], loaded[1])
        stacked = preprocessing_set.append_data(stacked, loaded[2])
        per_kind.append(get_min_data(stacked))
    trace_frame, seq_frame, caller_frame = per_kind

    # Join the three table kinds into one frame.
    dataset = preprocessing_set.merge_data(df_trace=trace_frame,
                                           df_seq=seq_frame,
                                           df_seq_caller=caller_frame)

    # Fill in missing values.
    print("fill")
    dataset = preprocessing_set.fill_empty_data(dataset)

    # NOTE: rows labelled "Success" in y_issue_ms are kept (filter disabled).

    # Turn irregular values into numbers.
    print("convert")
    dataset = preprocessing_set.convert_data(dataset)

    # NOTE: oversampling on "y_final_result" and the follow-up shuffle are
    # both disabled, matching the "without_sampling" output file name.

    # Write the result.
    print("output")
    dataset.to_csv("ready_use_max_without_sampling_mms.csv")
Ejemplo n.º 8
0
def _read_and_append_csvs(paths, drop_columns=()):
    """Read *paths* (indexed by ``trace_id``) and append them in order.

    All files are read first; ``drop_columns`` are then popped from each
    frame individually, and finally the frames are chained left-to-right
    with ``preprocessing_set.append_data``.
    """
    frames = [pd.read_csv(path, header=0, index_col="trace_id")
              for path in paths]
    for frame in frames:
        for column in drop_columns:
            frame.pop(column)

    combined = frames[0]
    for frame in frames[1:]:
        combined = preprocessing_set.append_data(combined, frame)
    return combined


def collect_data():
    """Combine production traces with the training data into one CSV.

    The ten ``production/fN`` exports are appended together and stripped of
    the ``test_case_id`` / ``test_trace_id`` columns.  The ``110/`` training
    exports are appended per table kind (trace, seq-seq, seq-caller) and
    joined with ``preprocessing_set.merge_data``.  Finally the production
    frame and the joined training frame are appended and written to
    ``production/df_total_with_f.csv``.
    """
    test_id_columns = ("test_case_id", "test_trace_id")

    print("准备读取production数据")
    production_paths = (
        "production/f1/trace_verified_sequence_f1_combined.csv",
        "production/f2/trace_verified_sequence_f2_combined.csv",
        "production/f3/trace_verified_config_f3_combined.csv",
        "production/f4/trace_verified_config_f4_combined.csv",
        "production/f5/trace_verified_config_f5_combined.csv",
        "production/f7/trace_verified_config_f7_combined.csv",
        "production/f8/trace_verified_instance_f8_combined.csv",
        "production/f11/trace_verified_instance_f11_combined.csv",
        "production/f12/trace_verified_instance_f12_combined.csv",
        "production/f13/trace_verified_sequence_f13_combined.csv",
    )

    # The per-file index objects that used to be captured here were never
    # read afterwards; the progress message is kept so the console output
    # is unchanged.
    print("获取production数据的index")

    print("连接Production数据")
    df_f_all = _read_and_append_csvs(production_paths)

    # For production data the id columns are dropped after appending.
    for column in test_id_columns:
        df_f_all.pop(column)

    print("读取训练数据")
    # Each tuple lists the sequence, instance and config source, in that order.
    trace_paths = (
        "110/trace_verified_sequence.csv",
        "110/trace_verified_instance3.csv",
        "110/trace_verified_config_2.csv",
    )
    seq_paths = (
        "110/seq_seq_sequence.csv",
        "110/seq_seq_instance3.csv",
        "110/seq_seq_config.csv",
    )
    caller_paths = (
        "110/seq_caller_sequence.csv",
        "110/seq_caller_instance3.csv",
        "110/seq_caller_config.csv",
    )

    df_total_0 = _read_and_append_csvs(trace_paths, test_id_columns)
    df_total_1 = _read_and_append_csvs(seq_paths, test_id_columns)
    # The seq-caller tables are appended without dropping the id columns
    # (that drop was disabled in the original; presumably these files do not
    # carry them -- confirm against the data).
    df_total_2 = _read_and_append_csvs(caller_paths)

    # Join the three table kinds into one frame.
    df_total = preprocessing_set.merge_data(df_trace=df_total_0,
                                            df_seq=df_total_1,
                                            df_seq_caller=df_total_2)

    print("连接训练数据与Production数据")
    df_total_with_f = preprocessing_set.append_data(df_f_all, df_total)

    df_total_with_f.to_csv("production/df_total_with_f.csv")