def for_model_2():
    """Merge each CSV group under ``110/`` separately and write one file per group.

    For every group (sequence, instance, config) the trace table, the seq
    table and the seq-caller table are read with ``trace_id`` as the index.
    The ``test_trace_id`` and ``test_case_id`` columns are removed from the
    seq table, the three tables are combined with
    ``preprocessing_set.merge_data`` and the result is written with
    ``DataFrame.to_csv``.

    Returns:
        None. The only effects are the three files written under ``110/``.
    """
    key_column = "trace_id"

    # One entry per group: (trace csv, seq csv, seq-caller csv, output path).
    jobs = (
        (
            "110/trace_verified_sequence.csv",
            "110/seq_seq_sequence.csv",
            "110/seq_caller_sequence.csv",
            "110/model_1_seq_total",
        ),
        (
            "110/trace_verified_instance3.csv",
            "110/seq_seq_instance3.csv",
            "110/seq_caller_instance3.csv",
            "110/model_1_inst_total",
        ),
        (
            "110/trace_verified_config_2.csv",
            "110/seq_seq_config.csv",
            "110/seq_caller_config.csv",
            "110/model_1_config_total",
        ),
    )

    for trace_path, seq_path, caller_path, out_path in jobs:
        trace_frame = pd.read_csv(trace_path, header=0, index_col=key_column)

        seq_frame = pd.read_csv(seq_path, header=0, index_col=key_column)
        # Remove the test identifier columns from the seq table before merging.
        for unwanted in ("test_trace_id", "test_case_id"):
            seq_frame.pop(unwanted)

        caller_frame = pd.read_csv(caller_path, header=0, index_col=key_column)

        # Join the individual parts together.
        merged = preprocessing_set.merge_data(
            df_trace=trace_frame,
            df_seq=seq_frame,
            df_seq_caller=caller_frame,
        )
        merged.to_csv(out_path)
def preprocessing():
    """Build the combined training table and write it to CSV.

    For each file kind (trace, seq, seq-caller) the sequence, instance and
    config CSVs under ``110/`` are read with ``trace_id`` as the index,
    stacked with ``preprocessing_set.append_data`` and passed through
    ``get_min_data``. The three results are joined with
    ``preprocessing_set.merge_data``, then run through
    ``preprocessing_set.fill_empty_data`` and ``preprocessing_set.convert_data``
    before being written to ``ready_use_max_without_sampling_mms.csv``.

    Progress messages ("fill", "convert", "output") are printed to stdout.

    Returns:
        None.
    """
    key_column = "trace_id"

    # One entry per file kind; inside each entry the order is
    # sequence, instance, config.
    paths_by_kind = (
        # Trace tables.
        (
            "110/trace_verified_sequence.csv",
            "110/trace_verified_instance3.csv",
            "110/trace_verified_config_2.csv",
        ),
        # Seq tables.
        (
            "110/seq_seq_sequence.csv",
            "110/seq_seq_instance3.csv",
            "110/seq_seq_config.csv",
        ),
        # Seq-caller tables.
        (
            "110/seq_caller_sequence.csv",
            "110/seq_caller_instance3.csv",
            "110/seq_caller_config.csv",
        ),
    )

    reduced_parts = []
    for sequence_path, instance_path, config_path in paths_by_kind:
        sequence_frame = pd.read_csv(sequence_path, header=0, index_col=key_column)
        instance_frame = pd.read_csv(instance_path, header=0, index_col=key_column)
        config_frame = pd.read_csv(config_path, header=0, index_col=key_column)

        stacked = preprocessing_set.append_data(sequence_frame, instance_frame)
        stacked = preprocessing_set.append_data(stacked, config_frame)
        reduced_parts.append(get_min_data(stacked))

    trace_part, seq_part, caller_part = reduced_parts

    # Join the individual parts together.
    df_total = preprocessing_set.merge_data(
        df_trace=trace_part,
        df_seq=seq_part,
        df_seq_caller=caller_part,
    )

    # Fill in missing values.
    print("fill")
    df_total = preprocessing_set.fill_empty_data(df_total)

    # Disabled step: discard the rows that have no fault.
    # df_total = df_total.loc[df_total["y_issue_ms"] != "Success"]

    # Convert irregular values into numbers.
    print("convert")
    df_total = preprocessing_set.convert_data(df_total)

    # Disabled step: oversample on a label to balance the sample counts.
    # df_total = preprocessing_set.sampling(df_total, "y_final_result")
    # Disabled step: shuffle the data after oversampling.
    # df_total = shuffle(df_total)

    # Write out the data.
    print("output")
    df_total.to_csv("ready_use_max_without_sampling_mms.csv")
def collect_data():
    """Combine the production traces with the training data into one CSV.

    Steps, in order:

    1. Read the ten production CSVs (f1-f5, f7, f8, f11-f13) with
       ``trace_id`` as the index, stack them with
       ``preprocessing_set.append_data`` and remove the ``test_case_id`` and
       ``test_trace_id`` columns from the stacked table.
    2. Read the training CSVs under ``110/``. For each file kind (trace, seq,
       seq-caller) the sequence, instance and config files are read and
       stacked. The two test identifier columns are removed from the trace
       and seq tables; the seq-caller tables are stacked unchanged.
    3. Join the three stacked training tables with
       ``preprocessing_set.merge_data``.
    4. Stack the production table and the merged training table and write the
       result to ``production/df_total_with_f.csv``.

    Progress messages are printed to stdout along the way.

    Returns:
        None.

    Raises:
        KeyError: from ``DataFrame.pop`` if a table that is expected to carry
            ``test_case_id`` / ``test_trace_id`` does not have that column.
    """
    key_column = "trace_id"
    # Removal order matters only for which KeyError surfaces first; keep it.
    test_id_columns = ("test_case_id", "test_trace_id")

    print("准备读取production数据")
    production_paths = (
        "production/f1/trace_verified_sequence_f1_combined.csv",
        "production/f2/trace_verified_sequence_f2_combined.csv",
        "production/f3/trace_verified_config_f3_combined.csv",
        "production/f4/trace_verified_config_f4_combined.csv",
        "production/f5/trace_verified_config_f5_combined.csv",
        "production/f7/trace_verified_config_f7_combined.csv",
        "production/f8/trace_verified_instance_f8_combined.csv",
        "production/f11/trace_verified_instance_f11_combined.csv",
        "production/f12/trace_verified_instance_f12_combined.csv",
        "production/f13/trace_verified_sequence_f13_combined.csv",
    )
    production_frames = [
        pd.read_csv(path, header=0, index_col=key_column)
        for path in production_paths
    ]

    # Nothing downstream reads the per-file indexes, so none are collected;
    # the progress message is kept so the console output stays the same.
    print("获取production数据的index")

    print("连接Production数据")
    df_f_all = production_frames[0]
    for frame in production_frames[1:]:
        df_f_all = preprocessing_set.append_data(df_f_all, frame)
    for column in test_id_columns:
        df_f_all.pop(column)

    print("读取训练数据")
    # One entry per file kind: (sequence / instance / config paths,
    # whether the test identifier columns are removed before stacking).
    training_kinds = (
        # Trace tables.
        (
            (
                "110/trace_verified_sequence.csv",
                "110/trace_verified_instance3.csv",
                "110/trace_verified_config_2.csv",
            ),
            True,
        ),
        # Seq tables.
        (
            (
                "110/seq_seq_sequence.csv",
                "110/seq_seq_instance3.csv",
                "110/seq_seq_config.csv",
            ),
            True,
        ),
        # Seq-caller tables: stacked as read, no columns removed.
        (
            (
                "110/seq_caller_sequence.csv",
                "110/seq_caller_instance3.csv",
                "110/seq_caller_config.csv",
            ),
            False,
        ),
    )

    stacked_parts = []
    for kind_paths, strip_test_ids in training_kinds:
        # Read all three files first, then strip, then stack.
        frames = [
            pd.read_csv(path, header=0, index_col=key_column)
            for path in kind_paths
        ]
        if strip_test_ids:
            for frame in frames:
                for column in test_id_columns:
                    frame.pop(column)

        stacked = frames[0]
        for frame in frames[1:]:
            stacked = preprocessing_set.append_data(stacked, frame)
        stacked_parts.append(stacked)

    trace_part, seq_part, caller_part = stacked_parts

    # Join the individual parts together.
    df_total = preprocessing_set.merge_data(
        df_trace=trace_part,
        df_seq=seq_part,
        df_seq_caller=caller_part,
    )

    print("连接训练数据与Production数据")
    df_total_with_f = preprocessing_set.append_data(df_f_all, df_total)
    df_total_with_f.to_csv("production/df_total_with_f.csv")