def collect_data_2():
    """Clean the production trace table and save the reduced version.

    Reads ``production/df_total_with_f.csv`` (indexed by ``trace_id``), runs
    it through the ``preprocessing_set`` steps below in order, and writes the
    result to ``production/df_total_with_f_min.csv``. A progress message is
    printed before every step.
    """
    frame = pd.read_csv(
        "production/df_total_with_f.csv",
        header=0,
        index_col="trace_id",
    )

    # Discard data that is NA across the whole column.
    print("丢弃NA")
    frame = preprocessing_set.drop_na_data(frame)

    # Discard data whose values are identical across the whole column.
    print("丢弃All Same")
    frame = preprocessing_set.drop_all_same_data(frame)

    # Drop the columns that are not needed.
    print("选择需要的列")
    frame = preprocessing_set.select_data(frame)

    # Fill in missing values.
    print("填补空缺值")
    frame = preprocessing_set.fill_empty_data(frame)

    # Convert irregular values into numbers.
    print("数据转换")
    frame = preprocessing_set.convert_data(frame)

    frame.to_csv("production/df_total_with_f_min.csv")
def preprocessing():
    """Combine the three CSV sets into one table and write it to disk.

    Each set lists three files in the same order: the ``trace_verified_*``
    file, the ``seq_seq_*`` file and the ``seq_caller_*`` file. Files in the
    same position are read (indexed by ``trace_id``), appended together with
    ``preprocessing_set.append_data`` and reduced with ``get_min_data``. The
    three resulting frames are joined with ``preprocessing_set.merge_data``,
    passed through ``fill_empty_data`` and ``convert_data``, and saved to
    ``ready_use_max_without_sampling_mms.csv``.
    """
    # Index column name shared by every input file.
    index_col = "trace_id"

    # First set of CSV files.
    first_set = [
        "110/trace_verified_sequence.csv",
        "110/seq_seq_sequence.csv",
        "110/seq_caller_sequence.csv",
    ]
    # Second set of CSV files.
    second_set = [
        "110/trace_verified_instance3.csv",
        "110/seq_seq_instance3.csv",
        "110/seq_caller_instance3.csv",
    ]
    # Third set of CSV files.
    third_set = [
        "110/trace_verified_config_2.csv",
        "110/seq_seq_config.csv",
        "110/seq_caller_config.csv",
    ]

    # zip() walks the sets position by position, so each iteration handles
    # one file role: trace first, then seq, then seq-caller.
    reduced_frames = []
    for first_csv, second_csv, third_csv in zip(first_set, second_set, third_set):
        first_frame = pd.read_csv(first_csv, header=0, index_col=index_col)
        second_frame = pd.read_csv(second_csv, header=0, index_col=index_col)
        third_frame = pd.read_csv(third_csv, header=0, index_col=index_col)

        stacked = preprocessing_set.append_data(first_frame, second_frame)
        stacked = preprocessing_set.append_data(stacked, third_frame)
        reduced_frames.append(get_min_data(stacked))

    trace_frame, seq_frame, seq_caller_frame = reduced_frames

    # JOIN the individual parts together.
    merged = preprocessing_set.merge_data(
        df_trace=trace_frame,
        df_seq=seq_frame,
        df_seq_caller=seq_caller_frame,
    )

    # Fill in missing values.
    print("fill")
    merged = preprocessing_set.fill_empty_data(merged)

    # Dropping the fault-free rows is currently disabled:
    # merged = merged.loc[merged["y_issue_ms"] != "Success"]

    # Convert irregular values into numbers.
    print("convert")
    merged = preprocessing_set.convert_data(merged)

    # Oversampling by a label to balance the sample counts is currently
    # disabled:
    # merged = preprocessing_set.sampling(merged, "y_final_result")
    # Shuffling after oversampling is currently disabled:
    # merged = shuffle(merged)

    # Write the result.
    print("output")
    merged.to_csv("ready_use_max_without_sampling_mms.csv")
def preprocessing_sockshop_model1_data():
    """Assemble the sockshop trace CSVs into one table and save it.

    Reads the four ``sockshop_data/trace_verified_*_sock_combined.csv`` files
    (indexed by ``trace_id``), appends them in order with
    ``preprocessing_set.append_data``, reduces the result with
    ``get_min_data``, lower-cases the ``y_issue_ms`` and ``y_issue_dim_type``
    columns, then applies ``fill_empty_data`` and ``convert_data``. The row
    count is printed and the frame is written to
    ``sockshop_data/ss_total_mms.csv``.

    This is the single definition of the function. The file previously
    defined it twice under the same name, so the first copy was dead code
    overwritten at import time. The behaviour kept here is that of the
    definition that was actually in effect.
    """
    # The trace_id column serves as the index of every input file.
    index_col = "trace_id"

    source_files = (
        "sockshop_data/trace_verified_config_cpu_sock_combined.csv",
        "sockshop_data/trace_verified_config_sock_combined.csv",
        "sockshop_data/trace_verified_instance_sock_combined.csv",
        "sockshop_data/trace_verified_sequence_sock_combined.csv",
    )

    # Read everything first, then chain the parts together in file order.
    parts = [
        pd.read_csv(path, header=0, index_col=index_col)
        for path in source_files
    ]
    df_total = parts[0]
    for part in parts[1:]:
        df_total = preprocessing_set.append_data(df_total, part)

    # Filter the data.
    print("select")
    df_total = get_min_data(df_total)

    # Lower-case the values of the y_issue_ms and y_issue_dim_type columns.
    df_total["y_issue_ms"] = df_total["y_issue_ms"].str.lower()
    df_total["y_issue_dim_type"] = df_total["y_issue_dim_type"].str.lower()

    # Fill in missing values.
    print("fill")
    df_total = preprocessing_set.fill_empty_data(df_total)

    # Convert irregular values into numbers.
    print("convert")
    df_total = preprocessing_set.convert_data(df_total)

    # Shuffling is currently disabled:
    # df_total = shuffle(df_total)

    print("总数据:", len(df_total))

    # Save the result to disk.
    df_total.to_csv("sockshop_data/ss_total_mms.csv")