Exemple #1
0
def collect_data_2():
    """Reduce the production trace table and save the smaller copy.

    Loads ``production/df_total_with_f.csv`` with ``trace_id`` as the index,
    passes it through the ``preprocessing_set`` helpers in a fixed order
    (drop NA, drop all-same, select, fill, convert), printing a progress
    message before each step, and writes the outcome to
    ``production/df_total_with_f_min.csv``. Returns ``None``.
    """
    frame = pd.read_csv(
        "production/df_total_with_f.csv",
        header=0,
        index_col="trace_id",
    )

    # Drop data whose columns are entirely NA.
    print("丢弃NA")
    frame = preprocessing_set.drop_na_data(frame)

    # Drop data whose columns hold one identical value throughout.
    print("丢弃All Same")
    frame = preprocessing_set.drop_all_same_data(frame)

    # Discard the columns that are not needed.
    print("选择需要的列")
    frame = preprocessing_set.select_data(frame)

    # Fill in the missing values.
    print("填补空缺值")
    frame = preprocessing_set.fill_empty_data(frame)

    # Turn irregular values into numbers.
    print("数据转换")
    frame = preprocessing_set.convert_data(frame)

    frame.to_csv("production/df_total_with_f_min.csv")
Exemple #2
0
def preprocessing():
    """Assemble the ``110/`` trace CSVs into one table and write it out.

    Three groups of CSV files are read (all indexed by ``trace_id``). For each
    file position (trace_verified, seq_seq, seq_caller) the three files are
    appended together and reduced with ``get_min_data``. The three resulting
    tables are joined with ``preprocessing_set.merge_data``, then filled,
    converted, and saved to ``ready_use_max_without_sampling_mms.csv``.
    Returns ``None``.
    """
    # Each row is one group of files; the columns are, in order, the
    # trace_verified, seq_seq and seq_caller CSV of that group.
    csv_groups = (
        ("110/trace_verified_sequence.csv",
         "110/seq_seq_sequence.csv",
         "110/seq_caller_sequence.csv"),
        ("110/trace_verified_instance3.csv",
         "110/seq_seq_instance3.csv",
         "110/seq_caller_instance3.csv"),
        ("110/trace_verified_config_2.csv",
         "110/seq_seq_config.csv",
         "110/seq_caller_config.csv"),
    )
    # Name of the column used as the index of every CSV.
    key = "trace_id"

    def load_stacked(position):
        # Read the file at `position` from every group (first, second, third
        # group in that order), append them, then apply get_min_data.
        frames = [
            pd.read_csv(group[position], header=0, index_col=key)
            for group in csv_groups
        ]
        stacked = preprocessing_set.append_data(frames[0], frames[1])
        stacked = preprocessing_set.append_data(stacked, frames[2])
        return get_min_data(stacked)

    trace_part = load_stacked(0)       # trace_verified_* files
    seq_part = load_stacked(1)         # seq_seq_* files
    seq_caller_part = load_stacked(2)  # seq_caller_* files

    # JOIN the individual parts together.
    merged = preprocessing_set.merge_data(
        df_trace=trace_part,
        df_seq=seq_part,
        df_seq_caller=seq_caller_part,
    )

    # Fill in the missing values.
    print("fill")
    merged = preprocessing_set.fill_empty_data(merged)

    # NOTE: three optional steps are intentionally not performed here:
    # dropping rows whose "y_issue_ms" equals "Success" (before conversion),
    # oversampling on the "y_final_result" label, and shuffling afterwards.

    # Turn irregular values into numbers.
    print("convert")
    merged = preprocessing_set.convert_data(merged)

    # Write the result.
    print("output")
    merged.to_csv("ready_use_max_without_sampling_mms.csv")
Exemple #3
0
def preprocessing_sockshop_model1_data():
    """Build ``sockshop_data/ss_total_mms.csv`` from four combined trace CSVs.

    The four ``sockshop_data/trace_verified_*_sock_combined.csv`` files are
    read with ``trace_id`` as the index and appended in order. The combined
    table is reduced with ``get_min_data``, the ``y_issue_ms`` and
    ``y_issue_dim_type`` columns are lower-cased, and the table is then
    filled, converted, and saved. Returns ``None``.
    """
    # The trace_id column serves as the index column.
    key = "trace_id"

    # Read in everything that is needed.
    csv_paths = (
        "sockshop_data/trace_verified_config_cpu_sock_combined.csv",
        "sockshop_data/trace_verified_config_sock_combined.csv",
        "sockshop_data/trace_verified_instance_sock_combined.csv",
        "sockshop_data/trace_verified_sequence_sock_combined.csv",
    )
    parts = [pd.read_csv(path, header=0, index_col=key) for path in csv_paths]

    # Chain all the loaded tables together, one after another.
    combined = parts[0]
    for part in parts[1:]:
        combined = preprocessing_set.append_data(combined, part)

    # Filter the data.
    print("select")
    combined = get_min_data(combined)

    # Lower-case every value of the y_issue_ms and y_issue_dim_type columns.
    for column in ("y_issue_ms", "y_issue_dim_type"):
        combined[column] = combined[column].str.lower()

    # Fill in the missing values.
    print("fill")
    combined = preprocessing_set.fill_empty_data(combined)

    # Turn irregular values into numbers.
    print("convert")
    combined = preprocessing_set.convert_data(combined)

    # NOTE: shuffling the rows is intentionally not done here.

    print("处理完的数据有这么多:", len(combined))

    # Save the file.
    combined.to_csv("sockshop_data/ss_total_mms.csv")
Exemple #4
0
def preprocessing_sockshop_model1_data():
    """Combine the sockshop trace CSVs and save the processed table.

    Reads the config_cpu, config, instance and sequence
    ``trace_verified_*_sock_combined.csv`` files from ``sockshop_data/``
    (indexed by ``trace_id``), appends them in that order, applies
    ``get_min_data``, lower-cases the ``y_issue_ms`` and ``y_issue_dim_type``
    columns, fills and converts the values, prints the row count, and writes
    ``sockshop_data/ss_total_mms.csv``. Returns ``None``.
    """

    def read_part(csv_path):
        # Every input uses its first line as header and trace_id as index.
        return pd.read_csv(csv_path, header=0, index_col="trace_id")

    config_cpu = read_part(
        "sockshop_data/trace_verified_config_cpu_sock_combined.csv")
    config = read_part(
        "sockshop_data/trace_verified_config_sock_combined.csv")
    instance = read_part(
        "sockshop_data/trace_verified_instance_sock_combined.csv")
    sequence = read_part(
        "sockshop_data/trace_verified_sequence_sock_combined.csv")

    # Append the four tables in reading order.
    data = preprocessing_set.append_data(config_cpu, config)
    data = preprocessing_set.append_data(data, instance)
    data = preprocessing_set.append_data(data, sequence)

    print("select")
    data = get_min_data(data)

    # Normalise the two label columns to lower case.
    data["y_issue_ms"] = data["y_issue_ms"].str.lower()
    data["y_issue_dim_type"] = data["y_issue_dim_type"].str.lower()

    print("fill")
    # Fill in the missing values.
    data = preprocessing_set.fill_empty_data(data)

    print("convert")
    # Turn irregular values into numbers.
    data = preprocessing_set.convert_data(data)

    # NOTE: shuffling the rows is intentionally not done here.

    print("总数据:", len(data))

    data.to_csv("sockshop_data/ss_total_mms.csv")