def new_WI_19(data, features):
    """Re-encode the WeblogInfo_19 letter codes as a small integer category.

    Appends a "WeblogInfo_19_info_(cat)" column and removes the original
    WeblogInfo_19 column.  Returns the updated (data, features) pair.
    """
    solved_features = ["WeblogInfo_19"]
    fea_indexs = get_known_features_index(features, solved_features)
    feature_name = "WeblogInfo_19_info_(cat)"
    new_features = np.concatenate((features, np.array([feature_name])))
    # Letter -> category code; any value not listed falls into bucket 6.
    letter_codes = {"H": 0, "G": 1, "J": 2, "E": 3, "F": 4, "D": 5}
    feature_data = np.zeros((len(data), 1))
    source_col = fea_indexs[0]
    for row in range(data.shape[0]):
        feature_data[row, 0] = letter_codes.get(data[row, source_col], 6)
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features,
                                                      delete_feas_list=solved_features)
    print("WeblogInfo_19 solved")
    print(deleted)
    return new_data, new_features
def new_EI_5_6_7_8(data, features):
    """Collapse Education_Info5..8 into one categorical feature via hand-made rules.

    Each row's 4-tuple of values is matched against the rule lists below;
    unmatched combinations fall into category 3.  The original columns
    Education_Info6..8 are dropped (Education_Info5 is kept).

    Fix: the original `map_to_one` listed the rule ["1","AM","-1","-1"]
    twice; the duplicate entry is removed (membership tests are unchanged).
    """
    key_features = ["Education_Info5", "Education_Info6",
                    "Education_Info7", "Education_Info8"]
    fea_indexs = get_known_features_index(features, key_features)
    feature_name = "combine_EI_5_6_7_8"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))
    ##### map rules ####
    map_to_zero = [["1", "AQ", "-1", "T"], ["1", "AQ", "-1", "80"],
                   ["1", "U", "-1", "-1"], ["1", "AQ", "-1", "-1"],
                   ["1", "B", "-1", "-1"], ["1", "A", "-1", "-1"],
                   ["1", "AM", "-1", "80"], ["1", "A", "-1", "F"],
                   ["1", "B", "-1", "AE"], ["1", "U", "-1", "AE"],
                   ["1", "AQ", "-1", "V"], ["1", "AM", "-1", "V"]]
    map_to_one = [["1", "A", "-1", "T"], ["1", "AQ", "-1", "F"],
                  ["1", "AM", "-1", "-1"], ["1", "AM", "-1", "F"],
                  ["1", "AM", "-1", "T"]]
    map_to_two = [["0", "E", "E", "E"]]
    feature_data = np.ones((len(data), 1))
    for user in range(data.shape[0]):
        EI_5_6_7_8 = list(data[user, fea_indexs])
        if EI_5_6_7_8 in map_to_zero:
            feature_data[user, 0] = 0
        elif EI_5_6_7_8 in map_to_one:
            feature_data[user, 0] = 1
        elif EI_5_6_7_8 in map_to_two:
            feature_data[user, 0] = 2
        else:
            feature_data[user, 0] = 3
    new_data = np.concatenate((data, feature_data), axis=1)
    # Keep Education_Info5, drop the three columns that were merged in.
    new_data, new_features, deleted = delete_features(new_data, new_features,
                                                      delete_feas_list=key_features[1:])
    print(deleted)
    return new_data, new_features
def new_UserInfo_18(data, features):
    """Bin the string-valued UserInfo_18 (age) column into 5 buckets.

    NOTE(review): this definition is duplicated verbatim later in the file;
    the later `def new_UserInfo_18` shadows this one at import time.
    NOTE(review): the `<` comparisons below are lexicographic string
    comparisons (e.g. "100" < "22" is True, "9" < "22" is False) —
    presumably ages are 2-digit strings; verify against the raw data.
    """
    solved_features = ["UserInfo_18"]
    fea_indexs = get_known_features_index(features, solved_features)
    feature_name = "UserInfo_18_bined"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))
    feature_data = np.zeros((len(data), 1))
    for user in range(data.shape[0]):
        # length-1 index array -> 1-element slice; comparisons broadcast
        user_age = data[user, fea_indexs]
        if user_age < "22":
            feature_data[user, 0] = 0
        elif user_age < "30":
            feature_data[user, 0] = 1
        elif user_age < "40":
            feature_data[user, 0] = 2
        elif user_age < "50":
            feature_data[user, 0] = 3
        else:
            feature_data[user, 0] = 4
    new_data = np.concatenate((data, feature_data), axis = 1)
    new_data, new_features, deleted = delete_features(new_data, new_features, \
        delete_feas_list = solved_features)
    print(deleted)
    return new_data, new_features
def new_UserInfo_18(data, features):
    """Bin the UserInfo_18 (age) column into 5 buckets, replacing the raw column.

    Bug fix: the original compared the raw *string* values lexicographically
    ("100" < "22" is True, "9" < "22" is False), mis-binning any value that
    is not a 2-digit string.  Ages are now compared numerically; values that
    cannot be parsed fall into bucket 0, which matches the original
    behavior for the "-1" missing marker.
    """
    solved_features = ["UserInfo_18"]
    fea_indexs = get_known_features_index(features, solved_features)
    feature_name = "UserInfo_18_bined"
    new_features = np.concatenate((features, np.array([feature_name])))
    feature_data = np.zeros((len(data), 1))
    # Exclusive upper bounds of buckets 0..3; anything >= 50 -> bucket 4.
    bounds = (22, 30, 40, 50)
    for user in range(data.shape[0]):
        try:
            user_age = float(data[user, fea_indexs[0]])
        except ValueError:
            user_age = -1  # unparsable / missing marker -> lowest bucket
        for bin_id, bound in enumerate(bounds):
            if user_age < bound:
                feature_data[user, 0] = bin_id
                break
        else:
            feature_data[user, 0] = 4
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features,
                                                      delete_feas_list=solved_features)
    print(deleted)
    return new_data, new_features
def replace_miss(data, features, label = "", for_train = True):
    """Fill missing values feature-by-feature; in training mode also drop
    features with too many missing entries and persist what was dropped.

    NOTE: `delete_fea_index` and `missing_num` are filled *in place* by
    replace_miss_with_specialV — they are out-parameters, so statement
    order here matters.
    NOTE(review): in the for_train=False branch SAVE_DIR is computed but
    never used (nothing is saved for the test set here).
    """
    delete_fea_index = []
    missing_num = []
    new_data = data.copy()
    new_features = features.copy()
    if for_train:
        SAVE_DIR = "resultData"
    else:
        SAVE_DIR = "resultData/test/"
    #!start from range(1,...) is because the first line of the feature is the id, useless
    for fea_pos in range(1, len(features)):
        fea_val_cla = feature_value_class(data, fea_pos, label)
        # skip features whose last value class has no occurrences at all
        if not fea_val_cla[-1]._present_num == 0:
            new_data = replace_miss_with_specialV(new_data, fea_pos, fea_val_cla, label, \
                delete_fea_index, missing_num)
    if for_train:
        # drop the features flagged during filling and record them on disk
        new_data, new_features, deleted_feas = delete_features(new_data, new_features, \
            delete_fea_pos = delete_fea_index)
        save_result(new_data, "data_after_delete_too_many_missing_features.csv", new_features)
        save_result(np.array(missing_num), "deleted_features_with_too_many_missing.csv", \
            np.array(deleted_feas), dir_name = SAVE_DIR)
    return new_data, new_features
def pipeline_for_features_solved(for_train=True, saved_area="resultData"):
    """Full feature-processing pipeline: load the master file, fill missing
    values, drop features removed during training, then run every per-group
    solver.  Returns the processed (data, features) pair."""
    if for_train:
        banner = "**************** Train ************************"
        data_file_name = "PPD_Training_Master_GBK_3_1_Training_Set.csv"
    else:
        banner = "**************** Test ************************"
        data_file_name = "PPD_Master_GBK_2_Test_Set.csv"
    print(banner)
    data, features, label = load_data_for_solve(data_file_name, for_train)
    data, features = replace_miss(data, features, label, for_train)
    # Remove every feature that was eliminated while training.
    dropped_in_train = load_all_deleted_features_during_train(
        deleted_features_file_label="deleted_")
    data, features, deleted = delete_features(
        data, features, delete_feas_list=dropped_in_train)
    print(deleted)
    # Group-by-group feature engineering.
    for solver in (solve_user_info_package,
                   solve_weblog_info_package,
                   solve_thirdparty_info_package):
        data, features = solver(data, features, saved_dir=saved_area)
    data, features = extract_log_update_package(data, features, for_train)
    return data, features
def new_WI_19(data, features):
    """Re-encode the WeblogInfo_19 letter codes as a small integer category
    and drop the original column.

    NOTE(review): this definition duplicates an earlier `new_WI_19` in the
    file; whichever is defined last shadows the other.
    """
    solved_features = ["WeblogInfo_19"]
    fea_indexs = get_known_features_index(features, solved_features)
    feature_name = "WeblogInfo_19_info_(cat)"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))
    feature_data = np.zeros((len(data), 1))
    for user in range(data.shape[0]):
        # letter -> category code; unknown letters fall into bucket 6
        if data[user, fea_indexs[0]] == "H":
            feature_data[user, 0] = 0
        elif data[user, fea_indexs[0]] == "G":
            feature_data[user, 0] = 1
        elif data[user, fea_indexs[0]] == "J":
            feature_data[user, 0] = 2
        elif data[user, fea_indexs[0]] == "E":
            feature_data[user, 0] = 3
        elif data[user, fea_indexs[0]] == "F":
            feature_data[user, 0] = 4
        elif data[user, fea_indexs[0]] == "D":
            feature_data[user, 0] = 5
        else:
            feature_data[user, 0] = 6
    new_data = np.concatenate((data, feature_data), axis = 1)
    new_data, new_features, deleted = delete_features(new_data, new_features, \
        delete_feas_list = solved_features)
    print("WeblogInfo_19 solved")
    print(deleted)
    return new_data, new_features
def replace_miss(data, features, label = "", for_train = True, is_round_two = False):
    """Round-aware missing-value filler.

    Fills missing values feature-by-feature; only in round-one training does
    it additionally drop features that exceed the missing threshold and
    persist the dropped list.

    NOTE: `delete_fea_index` and `missing_num` are filled *in place* by
    replace_miss_with_specialV (out-parameters).
    NOTE(review): SAVE_DIR is only used in the round-one training branch;
    the other three values are computed but unused here.
    """
    delete_fea_index = []
    missing_num = []
    new_data = data.copy()
    new_features = features.copy()
    if for_train:
        if is_round_two:
            SAVE_DIR = "resultData_two"
        else:
            SAVE_DIR = "resultData"
    else:
        if is_round_two:
            SAVE_DIR = "resultData_two/test/"
        else:
            SAVE_DIR = "resultData/test/"
    # a feature is dropped when more than 2/3 of its values are missing
    threshold = int(data.shape[0] * 2 / 3)
    print("threshold: ", threshold)
    #!start from range(1,...) is because the first line of the feature is the id, useless
    for fea_pos in range(1, len(features)):
        fea_val_cla = feature_value_class(data, fea_pos, label)
        if not fea_val_cla[-1]._present_num == 0:
            new_data = replace_miss_with_specialV(new_data, fea_pos, fea_val_cla, label, \
                delete_fea_index, missing_num, threshold)
    if for_train and not is_round_two:
        new_data, new_features, deleted_feas = delete_features(new_data, new_features, \
            delete_fea_pos = delete_fea_index)
        print("delete while training: ", deleted_feas)
        #save_result(new_data, "data_after_delete_too_many_missing_features.csv", new_features)
        save_result(np.array(missing_num), "deleted_features_with_too_many_missing.csv", \
            np.array(deleted_feas), dir_name = SAVE_DIR)
    return new_data, new_features
def view_each_features(data, features):
    """Scatter-plot every feature's values over all instances and save one
    PNG per feature under view_data_area/csj/.  String-valued features get
    a "(str" tag in the file name so they can be told apart."""
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    str_style_features = np.array(load_result("str_features.csv")[0])
    str_features_index = get_known_features_index(features, str_style_features)
    x = range(len(data))
    for fea_pos, feature_name in enumerate(features):
        if fea_pos in str_features_index:
            tag = "(str" + str(fea_pos)
        else:
            tag = str(fea_pos)
        file_path = "view_data_area/csj/" + tag + ")" + feature_name + ".png"
        plt.scatter(x, data[:, fea_pos])
        plt.xlabel("instances(30000)")
        plt.ylabel("value")
        plt.title(feature_name + " value " + "distributed " + "in instances")
        plt.ylim(-2)
        plt.savefig(file_path)
        plt.close()
def view_each_features(data, features):
    """Scatter-plot every feature's values over all instances and save one
    PNG per feature under view_data_area/csj/.

    NOTE(review): this definition duplicates another `view_each_features`
    in the file; the last one defined wins at import time.
    """
    data, features, deleted = delete_features(data, features,
                                              delete_feas_list=["Idx", "ListingInfo"])
    str_style_features = np.array(load_result("str_features.csv")[0])
    str_features_index = get_known_features_index(features, str_style_features)
    x = range(len(data))
    for fea_pos in range(len(features)):
        feature_name = features[fea_pos]
        # not draw the str style features
        if fea_pos in str_features_index:
            file_path = "view_data_area/csj/" + "(str" + str(fea_pos) + ")" + feature_name + ".png"
            # print(fea_pos)
            # print(features[fea_pos])
        else:
            file_path = "view_data_area/csj/" + str(fea_pos) + ")" + feature_name + ".png"
        y = data[:, fea_pos]
        plt.scatter(x, y)
        plt.xlabel("instances(30000)")
        plt.ylabel("value")
        plt.title(feature_name + " value " + "distributed " + "in instances")
        plt.ylim(-2)
        # rect = plt.bar(left = (0,1),height = (0.5,0.5),width = 0.15)
        # plt.legend((rect,),(feature_name + "`s value",))
        #print(file_path)
        plt.savefig(file_path)
        plt.close()
def new_EI_1_2_3_4(data, features):
    """Collapse Education_Info1..4 into one ordinal feature via hand-made rules.

    Each row's 4-tuple is looked up in a rule table mapping known value
    combinations to categories 0..3; a combination outside the table only
    prints an error (the pre-filled default 1 is then kept).  The columns
    Education_Info2..4 are dropped afterwards.
    """
    key_features = ["Education_Info1", "Education_Info2",
                    "Education_Info3", "Education_Info4"]
    fea_indexs = get_known_features_index(features, key_features)
    feature_name = "combine_EI_1_2_3_4"
    new_features = np.concatenate((features, np.array([feature_name])))
    ##### map rules ####
    rule_groups = (
        [("1", "AQ", "毕业", "T"), ("1", "A", "毕业", "V"), ("1", "AN", "结业", "T"),
         ("1", "AM", "结业", "T"), ("1", "B", "毕业", "AE"), ("1", "A", "结业", "T"),
         ("1", "A", "毕业", "AR")],
        [("1", "U", "毕业", "AE"), ("1", "AM", "毕业", "AR"), ("1", "AM", "毕业", "V"),
         ("1", "AQ", "毕业", "F"), ("1", "A", "毕业", "F"), ("1", "AN", "毕业", "T"),
         ("1", "AQ", "毕业", "V")],
        [("1", "AM", "毕业", "T"), ("1", "A", "毕业", "T"), ("1", "AM", "毕业", "F")],
        [("0", "E", "E", "E")],
    )
    grade_of = {}
    for grade, combos in enumerate(rule_groups):
        for combo in combos:
            grade_of[combo] = grade
    feature_data = np.ones((len(data), 1))
    for user in range(data.shape[0]):
        combo = tuple(data[user, fea_indexs])
        if combo in grade_of:
            feature_data[user, 0] = grade_of[combo]
        else:
            print("error!!!!")
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features,
                                                      delete_feas_list=key_features[1:])
    print(deleted)
    return new_data, new_features
def new_UserInfo_differ(data, features, key_features, feature_name, deleted_all=True):
    """Add a 0/1 column named `feature_name` flagging rows whose values
    across `key_features` are not all identical, then drop either all of
    `key_features` (deleted_all=True) or all but the first one."""
    fea_indexs = get_known_features_index(features, key_features)
    new_features = np.concatenate((features, np.array([feature_name])))
    feature_data = np.zeros((len(data), 1))
    for row in range(data.shape[0]):
        distinct_values = set(list(data[row, fea_indexs]))
        if len(distinct_values) != 1:
            feature_data[row, 0] = 1
    new_data = np.concatenate((data, feature_data), axis=1)
    to_drop = key_features if deleted_all else key_features[1:]
    new_data, new_features, deleted = delete_features(new_data, new_features,
                                                      delete_feas_list=to_drop)
    print(deleted)
    return new_data, new_features
def new_WI_20_by_present(data, features):
    """Bucket WeblogInfo_20 using hand-made category lists; values not in
    any list are bucketed afterwards by how often they occur.

    Bug fix: the fallback loop tested `fea_value[0] == "-1"`, which compares
    the *first character* to a two-character string and is always False —
    and even when intended to fire, its assignment was immediately
    overwritten by the count-based branches.  It now compares the whole
    value and `continue`s so the missing marker keeps bucket 5.
    """
    solved_features = ["WeblogInfo_20"]
    fea_indexs = get_known_features_index(features, solved_features)
    feature_name = "WeblogInfo_20_present_info_(cat)"
    new_features = np.concatenate((features, np.array([feature_name])))
    none_finded_combine = OrderedDict()
    feature_data = np.zeros((len(data), 1))
    known_buckets = [
        (['F3', 'C39', 'F5', 'F2', 'F9', 'F12', 'I8', 'C38', 'F10', 'F11',
          'O', 'C13', 'I6', 'C16', 'I7', 'I10'], 0),
        (['F14', 'F15', 'F13', 'C17', 'C12', 'C18', 'C15'], 1),
        (['F16', 'C11', 'C1', 'C20', 'C19'], 2),
        (['I3', 'U', 'C21', 'I4'], 3),
        (['I5'], 4),
        (['-1'], 5),
    ]
    for user in range(data.shape[0]):
        fea_value = data[user, fea_indexs[0]]
        for members, bucket in known_buckets:
            if fea_value in members:
                feature_data[user, 0] = bucket
                break
        else:
            # unknown value: remember its rows, bucket by frequency below
            if fea_value not in none_finded_combine.keys():
                none_finded_combine[fea_value] = list()
            none_finded_combine[fea_value].append(user)
    for fea_value, users in none_finded_combine.items():
        if fea_value == "-1":  # missing marker keeps its own bucket
            feature_data[users, 0] = 5
            continue
        if len(users) < 20:
            feature_data[users, 0] = 0
        elif len(users) < 100:
            feature_data[users, 0] = 1
        elif len(users) < 1000:
            feature_data[users, 0] = 2
        elif len(users) < 5000:
            feature_data[users, 0] = 3
        else:
            feature_data[users, 0] = 4
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features,
                                                      delete_feas_list=solved_features)
    print("WeblogInfo_20 solved present")
    print(deleted)
    return new_data, new_features
def extract_log_update_package(data, features, for_train = True):
    """Drop the id/date columns, numericize the matrix, then append the
    features derived from the log and update tables."""
    data, features, _dropped = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    for extractor in (add_features_from_log, add_features_from_update):
        data, features = extractor(data, features, for_train)
    return data, features
def extract_log_update_package(data, features, for_train=True):
    """Drop the id/date columns, numericize the matrix, then append the
    features derived from the log and update tables.

    NOTE(review): duplicates another `extract_log_update_package` in this
    file; the last definition wins at import time.
    """
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    data, features = add_features_from_log(data, features, for_train)
    data, features = add_features_from_update(data, features, for_train)
    return data, features
def new_WI_20_by_present(data, features):
    """Bucket WeblogInfo_20 using hand-made category lists; values not in
    any list are bucketed afterwards by how often they occur.

    NOTE(review): duplicates another `new_WI_20_by_present` in this file.
    NOTE(review): below, `fea_value[0] == "-1"` compares the *first
    character* to a two-character string and is always False; even if it
    fired, the assignment would be overwritten by the following if/elif
    chain (no continue).  Harmless in practice only because "-1" is caught
    by map_to_five above — confirm and fix.
    """
    solved_features = ["WeblogInfo_20"]
    fea_indexs = get_known_features_index(features, solved_features)
    feature_name = "WeblogInfo_20_present_info_(cat)"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))
    none_finded_combine = OrderedDict()
    feature_data = np.zeros((len(data), 1))
    map_to_zero = ['F3', 'C39', 'F5', 'F2', 'F9', 'F12', 'I8', 'C38', 'F10',
                   'F11', 'O', 'C13', 'I6', 'C16', 'I7', 'I10']
    map_to_one = ['F14', 'F15', 'F13', 'C17', 'C12', 'C18', 'C15']
    map_to_two = ['F16', 'C11', 'C1', 'C20', 'C19']
    map_to_three = ['I3', 'U', 'C21', 'I4']
    map_to_four = ['I5']
    map_to_five = ['-1']
    for user in range(data.shape[0]):
        fea_value = data[user, fea_indexs[0]]
        if fea_value in map_to_zero:
            feature_data[user, 0] = 0
        elif fea_value in map_to_one:
            feature_data[user, 0] = 1
        elif fea_value in map_to_two:
            feature_data[user, 0] = 2
        elif fea_value in map_to_three:
            feature_data[user, 0] = 3
        elif fea_value in map_to_four:
            feature_data[user, 0] = 4
        elif fea_value in map_to_five:
            feature_data[user, 0] = 5
        else:
            # print("error")
            # print(fea_value)
            # unknown value: collect its rows, bucket by frequency below
            if fea_value not in none_finded_combine.keys():
                none_finded_combine[fea_value] = list()
            none_finded_combine[fea_value].append(user)
    for fea_value, users in none_finded_combine.items():
        if fea_value[0] == "-1":
            feature_data[users, 0] = 5
        if len(users) < 20:
            feature_data[users, 0] = 0
        elif len(users) < 100:
            feature_data[users, 0] = 1
        elif len(users) < 1000:
            feature_data[users, 0] = 2
        elif len(users) < 5000:
            feature_data[users, 0] = 3
        else:
            feature_data[users, 0] = 4
    new_data = np.concatenate((data, feature_data), axis = 1)
    new_data, new_features, deleted = delete_features(new_data, new_features, \
        delete_feas_list = solved_features)
    print("WeblogInfo_20 solved present")
    print(deleted)
    return new_data, new_features
def remove_thirdparty6(data, features):
    """Drop every ThirdParty_Info_Period6 column from the data set and
    report what was removed."""
    target_group = "ThirdParty_Info_Period6"
    period6_cols = find_featuers_index(target_group, features)
    new_data, new_features, deleted = delete_features(
        data, features, delete_fea_pos=period6_cols)
    print("ThirdParty_Info_Period6 all removed")
    print(deleted)
    return new_data, new_features
def pipeline_for_features_solved(for_train = True, is_round_two = False):
    """Round-aware feature-processing pipeline.

    Supported modes: train round one, train round two, test round two.
    Test round one (for_train=False, is_round_two=False) is not supported
    here.  Returns the processed (data, features) pair, or 0 for the
    unsupported mode (return value kept for backward compatibility).

    Fix: the unsupported-mode guard printed an abusive, uninformative
    message; it now explains what is wrong.
    """
    if not for_train and not is_round_two:
        print("unsupported mode: for_train=False with is_round_two=False "
              "(round-one test data is handled by a separate script)")
        return 0
    if for_train:
        print("**************** Train ************************")
        if is_round_two:
            print("******* Round Two *********")
            data_file_name = "Kesci_Master_9w_gbk_3_2.csv"
            saved_area = "resultData_two"
        else:
            data_file_name = "PPD_Training_Master_GBK_3_1_Training_Set.csv"
            saved_area = "resultData"
    else:
        print("**************** Test ************************")
        if is_round_two:
            print("******* Round Two *********")
            data_file_name = "Kesci_Master_9w_gbk_1_test_set.csv"
            saved_area = "resultData_two/test"
        else:
            data_file_name = "PPD_Master_GBK_2_Test_Set.csv"
            saved_area = "resultData/test"
    data, features, label = load_data_for_solve(data_file_name, for_train, is_round_two)
    data, features = replace_miss(data, features, label, for_train, is_round_two)
    # Round-one training builds the deletion lists itself; every other mode
    # replays the deletions recorded during round-one training.
    if not for_train or is_round_two:
        print("all deleted: ")
        # NOTE(review): is_round_two is hard-coded False here — presumably
        # the round-one deletion lists are reused for round two; confirm.
        deleted_features_in_train = load_all_deleted_features_during_train(is_round_two = False,
            deleted_features_file_label = "deleted_")
        data, features, deleted = delete_features(data, features,
            delete_feas_list = deleted_features_in_train)
        print(deleted)
    data, features = solve_user_info_package(data, features, saved_dir = saved_area)
    #save_result(data, "after_solve_user_info.csv", features, dir_name = saved_area)
    data, features = solve_weblog_info_package(data, features, saved_dir = saved_area)
    if for_train and not is_round_two:
        data, features = deleted_web_log_features(data, features, saved_dir = saved_area)
    data, features = solve_thirdparty_info_package(data, features, saved_dir = saved_area)
    data, features = extract_log_update_package(data, features, for_train, is_round_two)
    save_result(data, "data_after_features_processed.csv", features, dir_name = saved_area)
    print("****** all finished *********")
    print("size: (data, features)")
    print(data.shape)
    return data, features
def solve_thirdparty_info_package(data, features, saved_dir = "resultData/"):
    """Run every ThirdParty-related transform in sequence and persist the result.

    NOTE(review): duplicates another `solve_thirdparty_info_package` in this
    file; the last definition wins at import time.
    """
    data, features, deleted = delete_features(data, features,
                                              delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    data, features = sta_start_missing_period(data, features)
    data, features = remove_thirdparty6(data, features)
    data, features = fill_thirdParty_miss(data, features)
    data, features = third_party_stable(data, features)
    data, features = third_party_level(data, features)
    save_result(data, "data_after_thirdparty_solved.csv", features, dir_name = saved_dir)
    return data, features
def solve_thirdparty_info_package(data, features, saved_dir="resultData/"):
    """Run every ThirdParty-related transform in sequence and persist the
    processed matrix under `saved_dir`."""
    data, features, _dropped = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    steps = (sta_start_missing_period, remove_thirdparty6,
             fill_thirdParty_miss, third_party_stable, third_party_level)
    for step in steps:
        data, features = step(data, features)
    save_result(data, "data_after_thirdparty_solved.csv", features,
                dir_name=saved_dir)
    return data, features
def new_UserInfo_24_resident_level(data, features):
    """Replace UserInfo_24 with a digitized residence-detail-level column."""
    resident_detail_level = ["UserInfo_24"]
    digited = digit_resident_features(data, features, resident_detail_level)
    new_features = np.concatenate(
        (features, np.array(["UserInfo_24_resident_detail_level"])))
    # locate the digitized column in the transformed copy
    source_col = np.where(features == resident_detail_level)[0][0]
    feature_data = digited[:, source_col].reshape((-1, 1))
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features,
                                                      delete_feas_list=resident_detail_level)
    print("UserInfo_24_resident_detail_level" + " solved")
    print(deleted)
    return new_data, new_features
def new_UserInfo_19_20(data, features):
    """Flag rows whose UserInfo_19/UserInfo_20 (province/city) pair contains
    the "-1" missing marker, then drop both original columns."""
    solved_features = ["UserInfo_19", "UserInfo_20"]
    fea_indexs = get_known_features_index(features, solved_features)
    feature_name = "UserInfo_19_20_wrong_province_city"
    new_features = np.concatenate((features, np.array([feature_name])))
    feature_data = np.zeros((len(data), 1))
    missing_marker = str(-1)
    for row in range(data.shape[0]):
        if missing_marker in list(data[row, fea_indexs]):
            feature_data[row, 0] = 1
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features,
                                                      delete_feas_list=solved_features)
    print(deleted)
    return new_data, new_features
def new_UserInfo_19_20(data, features):
    """Flag rows whose UserInfo_19/UserInfo_20 pair contains the "-1"
    missing marker, then drop both original columns.

    NOTE(review): duplicates another `new_UserInfo_19_20` in this file.
    """
    solved_features = ["UserInfo_19", "UserInfo_20"]
    fea_indexs = get_known_features_index(features, solved_features)
    feature_name = "UserInfo_19_20_wrong_province_city"
    new_add_feature = np.array([feature_name])
    feature_data = np.zeros((len(data), 1))
    new_features = np.concatenate((features, new_add_feature))
    for user in range(data.shape[0]):
        if str(-1) in list(data[user, fea_indexs]):
            feature_data[user, 0] = 1
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features, \
        delete_feas_list = solved_features)
    print(deleted)
    return new_data, new_features
def new_EI_1_2_3_4(data, features):
    """Collapse Education_Info1..4 into one ordinal feature via hand-made
    rule lists; unmatched combinations only print an error (the pre-filled
    default 1 is then kept).

    NOTE(review): duplicates another `new_EI_1_2_3_4` in this file.
    """
    key_features = [
        "Education_Info1", "Education_Info2", "Education_Info3",
        "Education_Info4"
    ]
    fea_indexs = get_known_features_index(features, key_features)
    feature_name = "combine_EI_1_2_3_4"
    new_add_feature = np.array([feature_name])
    new_features = np.concatenate((features, new_add_feature))
    ##### map rules ####
    map_to_zero = [["1", "AQ", "毕业", "T"], ["1", "A", "毕业", "V"],
                   ["1", "AN", "结业", "T"], ["1", "AM", "结业", "T"],
                   ["1", "B", "毕业", "AE"], ["1", "A", "结业", "T"],
                   ["1", "A", "毕业", "AR"]]
    map_to_one = [["1", "U", "毕业", "AE"], ["1", "AM", "毕业", "AR"],
                  ["1", "AM", "毕业", "V"], ["1", "AQ", "毕业", "F"],
                  ["1", "A", "毕业", "F"], ["1", "AN", "毕业", "T"],
                  ["1", "AQ", "毕业", "V"]]
    map_to_two = [["1", "AM", "毕业", "T"], ["1", "A", "毕业", "T"],
                  ["1", "AM", "毕业", "F"]]
    map_to_three = [["0", "E", "E", "E"]]
    feature_data = np.ones((len(data), 1))
    for user in range(data.shape[0]):
        EI_1_2_3_4 = list(data[user, fea_indexs])
        if EI_1_2_3_4 in map_to_zero:
            feature_data[user, 0] = 0
        elif EI_1_2_3_4 in map_to_one:
            feature_data[user, 0] = 1
        elif EI_1_2_3_4 in map_to_two:
            feature_data[user, 0] = 2
        elif EI_1_2_3_4 in map_to_three:
            feature_data[user, 0] = 3
        else:
            print("error!!!!")
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features, \
        delete_feas_list = key_features[1:])
    print(deleted)
    return new_data, new_features
def new_UserInfo_24_resident_level(data, features):
    """Replace UserInfo_24 with a digitized residence-detail-level column.

    NOTE(review): duplicates another `new_UserInfo_24_resident_level` in
    this file.
    """
    resident_detail_level = ["UserInfo_24"]
    digited_residence_data = digit_resident_features(data, features,
                                                     resident_detail_level)
    new_add_feature = np.array(["UserInfo_24_resident_detail_level"])
    new_features = np.concatenate((features, new_add_feature))
    # pull the digitized column out of the transformed copy
    feature_data = digited_residence_data[:, np.where(
        features == resident_detail_level)[0][0]]
    feature_data = feature_data.reshape((feature_data.size, 1))
    #print(feature_data)
    #print("sdf", feature_data.shape)
    new_data = np.concatenate((data, feature_data), axis=1)
    new_data, new_features, deleted = delete_features(new_data, new_features, \
        delete_feas_list = resident_detail_level)
    print("UserInfo_24_resident_detail_level" + " solved")
    print(deleted)
    return new_data, new_features
def deleted_web_log_features(data, features, saved_dir = "result"):
    """Drop WeblogInfo features: those with strong mutual correlation (as
    computed over the weblog columns) plus a hand-picked useless list.
    Both deletion lists are persisted for replay on the test set.
    (`saved_dir` is accepted for interface symmetry; not used here.)
    """
    from create_new_features import find_featuers_index
    weblog_cols = find_featuers_index("WeblogInfo", features)
    weblog_data = data[:, weblog_cols]
    weblog_features = features[weblog_cols]
    correlation_between_properties(weblog_data, weblog_features)
    correlated = according_properties_correlation_delete()
    save_result(correlated, "deleted_weblog_features_with_strong_correlation.csv")
    hand_picked = ["WeblogInfo_10", "WeblogInfo_11", "WeblogInfo_12",
                   "WeblogInfo_13", "WeblogInfo_23", "WeblogInfo_25",
                   "WeblogInfo_26", "WeblogInfo_28", "WeblogInfo_31",
                   "WeblogInfo_55", "WeblogInfo_58"]
    save_result(hand_picked, "deleted_useless_weblog.csv")
    correlated.extend(hand_picked)
    data, features, deleted = delete_features(data, features,
                                              delete_feas_list=correlated)
    print("Train delete(weblog) : ", deleted)
    return data, features
def new_UserInfo_differ(data, features, key_features, feature_name, deleted_all = True):
    """Add a 0/1 column named `feature_name` flagging rows whose values
    across `key_features` are not all identical, then drop either all of
    `key_features` (deleted_all=True) or all but the first one.

    NOTE(review): duplicates another `new_UserInfo_differ` in this file.
    """
    fea_indexs = get_known_features_index(features, key_features)
    new_add_feature = np.array([feature_name])
    feature_data = np.zeros((len(data), 1))
    new_features = np.concatenate((features, new_add_feature))
    for user in range(data.shape[0]):
        if not len(set(list(data[user, fea_indexs]))) == 1:
            feature_data[user, 0] = 1
    new_data = np.concatenate((data, feature_data), axis = 1)
    delete_feas = key_features[1:]
    if deleted_all:
        delete_feas = key_features
    new_data, new_features, deleted = delete_features(new_data, new_features, \
        delete_feas_list = delete_feas)
    print(deleted)
    return new_data, new_features
# @Author : chensijia ([email protected]) # @Version : 0.0.0 # @Style : Python3.5 # # @Description: from main_for_process_data import load_data_for_solve, replace_miss, strStyle_features_to_digit from save_load_result import save_features_info, save_result, load_all_deleted_features_during_train from solve_data import delete_features import numpy as np if __name__ == '__main__': data, features, label = load_data_for_solve("PPD_Master_GBK_2_Test_Set.csv", for_train = False) data, features = replace_miss(data, features, label, for_train = False) #save_result(data, "test/data_after_filling_missing_.csv", features) deleted_features_in_train = load_all_deleted_features_during_train(deleted_features_file_label = "deleted_features_with_too_many_missing") data, features, deleted = delete_features(data, features, delete_feas_list = deleted_features_in_train) save_result(data, "test_data_after_deleted_features.csv", features, dir_name = "resultData/test/") data = strStyle_features_to_digit(data, features, for_train = False, use_experience = True) save_result(data, "data_after_digited.csv", features, dir_name= "resultData/test/") save_features_info(data, features, label, "info_after_digit_all_features.csv", \ dir_name = "resultData/test/")
    # (tail of a solve_thirdparty_info_package definition — its `def` line
    # lies outside this chunk)
    data, features = third_party_stable(data, features)
    data, features = third_party_level(data, features)
    save_result(data, "data_after_thirdparty_solved.csv", features,
                dir_name=saved_dir)
    return data, features


if __name__ == '__main__':
    # Stand-alone driver: load the weblog-solved matrix and run the
    # third-party pipeline on it.
    contents = load_result("data_after_solved_weblog.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    solve_thirdparty_info_package(data, features)
    # calculate_number = ["17"]
    # users_sta_name, users_stability = calculate_stability(data, features, calculate_number[0])
    # print(users_sta_name)
    # for i in range(10):
    #     print(users_stability[i])
    # from create_new_features import find_featuers_index
    # features_name = "ThirdPart"
    # fea_indexs = find_featuers_index(features_name, features)
    # print(fea_indexs)
# if a value in one features is bigger than 20000, besides # the positive in it is almost equal to the positive in the train data if __name__ == '__main__': #################### used to calculate the correlation between properties ######### contents = load_result("data_after_delete_no_discrimination_features.csv") features = np.array(contents[0]) data = np.array(contents[1:]) from map_features_to_digit import convert_to_numerical from solve_data import delete_features data = convert_to_numerical(data, features) data, features, deleted = delete_features( data, features, delete_feas_list=["Idx", "ListingInfo"]) correlation_between_properties(data, features) delete_result = according_properties_correlation_delete() save_result(delete_result, "deleted_features_with_strong_correlation.csv") data, features, deleted_features = delete_features(data, features, \ delete_feas_list = delete_result) # print(deleted_features) save_result(data, "data_after_delete_strong_correlation_features.csv", features) print(data.shape) ###############3 used pca to delete #####################
def view_each_features_label(data, features, label):
    """For every feature, scatter-plot its values split by label (positive
    red 'o', negative green 'x') and annotate per-value / under-mean
    positive-rate statistics; one PNG is saved per feature.

    Features with more than 30 distinct values are logged to
    complex_value_features.csv; simple non-string ones to
    simple_discrete_value_features(nonestrfeatures).csv.
    """
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    str_style_features = np.array(load_result("str_features.csv")[0])
    str_features_index = get_known_features_index(features, str_style_features)
    new_label = label.reshape((label.size, ))
    x = range(len(data))
    for fea_pos in range(len(features)):
        feature_name = features[fea_pos]
        if fea_pos in str_features_index:
            file_path = "view_data_area/after_all/with_label_under_mean/" + "(str" + str(
                fea_pos) + ")" + feature_name + ".png"
        else:
            file_path = "view_data_area/after_all/with_label_under_mean/" + str(
                fea_pos) + ")" + feature_name + ".png"
        features_info = feature_value_class(data, fea_pos, label, str_features_index)
        if features_info["num_of_value"] > 30:
            save_result([features[fea_pos]], "complex_value_features.csv", style="a+")
        else:
            if fea_pos not in str_features_index:
                save_result(
                    [features[fea_pos]],
                    "simple_discrete_value_features(nonestrfeatures).csv",
                    style="a+")
        # split rows by label for the two scatter series
        y_positive = data[new_label == 1, fea_pos]
        y_negitive = data[new_label == 0, fea_pos]
        positive_index = np.array([
            index for index in range(len(new_label)) if new_label[index] == 1
        ])
        negitive_index = np.array([
            index for index in range(len(new_label)) if new_label[index] == 0
        ])
        plt.scatter(positive_index, y_positive, marker='o', color='r', s=10)
        plt.scatter(negitive_index, y_negitive, marker='x', color='g', s=10)
        plt.xlabel("instances(30000)")
        plt.ylabel("value")
        if features_info["num_of_value"] < 40:
            # few distinct values: annotate each value with its share of all
            # positives (right side) and its within-value positive rate (left)
            plt.title(feature_name + " value - label " + "distributed " + "in instances" + \
                "\n the arrow --> Proportion of positive in that value & in positive")
            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    arrow_data = round(
                        v._respond_positive_num / features_info["num_positive"], 4)
                    arrow_start_position_x = len(data) + 2000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data, \
                        xy=(arrow_start_position_x,arrow_start_position_y), \
                        xytext=(arrow_end_postion_x,arrow_end_postion_y), \
                        arrowprops=dict(facecolor='blue', shrink=0.02))
                    arrow_data = round(
                        v._respond_positive_num / v._present_num, 4)
                    arrow_start_position_x = -4000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data, \
                        xy=(arrow_start_position_x,arrow_start_position_y), \
                        xytext=(arrow_end_postion_x,arrow_end_postion_y), \
                        arrowprops=dict(facecolor='blue', shrink=0.02))
        else:
            # many distinct values: annotate the mean and the positive rate
            # of the rows whose value is at or below the mean
            fea_average = round(np.mean(data[:, fea_pos]), 4)
            fea_std = np.std(data[:, fea_pos])
            fea_oo = round(fea_std / fea_average, 4)  # coefficient of variation
            max_v = np.amax(data[:, fea_pos])
            min_v = np.amin(data[:, fea_pos])
            plt.title(feature_name + " | mean & Proportion of positive under that mean" + \
                "\n degree of fluctuation --> " + str(fea_oo))
            x1 = np.array(range(-5000, 35000))
            y_mean = fea_average * np.ones((x1.size))
            #plt.plot(x1, y_mean, color = 'k', linestyle = "--")
            plt.annotate(fea_average, \
                xy=(-4000,fea_average), \
                xytext=(-4000,fea_average), \
                arrowprops=dict(facecolor='blue', shrink=0.05))
            under_mean_positive = 0
            under_mean_num = 0
            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    if k <= fea_average:
                        under_mean_num += v._present_num
                        under_mean_positive += v._respond_positive_num
            ave_posi = round(
                under_mean_positive / features_info["num_positive"], 4)
            plt.annotate(ave_posi, \
                xy=(31000,fea_average), \
                xytext=(31000,fea_average), \
                arrowprops=dict(facecolor='blue', shrink=0.05))
            # the -1 bucket (missing marker) gets its own pair of annotations
            pos_rat = 0
            pos_rat_whole = 0
            if -1 in features_info.keys():
                pos_rat = features_info[
                    -1]._respond_positive_num / features_info[-1]._present_num
                pos_rat_whole = features_info[
                    -1]._respond_positive_num / features_info["num_positive"]
            plt.annotate(round(pos_rat_whole, 4), \
                xy=(31000,-1), \
                xytext=(31000,-1))
            plt.annotate(round(pos_rat, 4), \
                xy=(-4000,-1), \
                xytext=(-4000,-1))
            plt.ylim(min_v - 10, fea_average * 2)
            #plt.ylim(fea_average - round(fea_average / 10), max_v + round(fea_average / 10))
        plt.savefig(file_path)
        plt.close()
data, features = remove_thirdparty6(data, features) data, features = fill_thirdParty_miss(data, features) data, features = third_party_stable(data, features) data, features = third_party_level(data, features) save_result(data, "data_after_thirdparty_solved.csv", features, dir_name = saved_dir) return data, features if __name__ == '__main__': contents = load_result("data_after_solved_weblog.csv") features = np.array(contents[0]) data = np.array(contents[1:]) data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"]) data = convert_to_numerical(data, features) solve_thirdparty_info_package(data, features) # calculate_number = ["17"] # users_sta_name, users_stability = calculate_stability(data, features, calculate_number[0]) # print(users_sta_name) # for i in range(10): # print(users_stability[i]) # from create_new_features import find_featuers_index # features_name = "ThirdPart" # fea_indexs = find_featuers_index(features_name, features) # print(fea_indexs)
weblog_features = features[fea_indexs] print(weblog_data.shape) print(weblog_features.shape) #save_result(weblog_data, "weblog_data_view.csv", weblog_features) # # label_lines = np.array(load_result("train_label_original.csv")) # # #print(label_lines.shape) # # from save_load_result import convert_to_float # # label = convert_to_float(label_lines) # # label = label.reshape((label.size, )) correlation_between_properties(weblog_data, weblog_features) delete_result = according_properties_correlation_delete() save_result(delete_result, "deleted_weblog_features_with_strong_correlation.csv") weblog_data, weblog_features, deleted_features = delete_features(weblog_data, weblog_features, \ delete_feas_list = delete_result) save_result(weblog_data, "data_after_delete_strong_correlation_weblog.csv", weblog_features) weblog_delete_needed = [ "WeblogInfo_10", "WeblogInfo_11", "WeblogInfo_12", "WeblogInfo_13", "WeblogInfo_23", "WeblogInfo_25", "WeblogInfo_26", "WeblogInfo_28", "WeblogInfo_31", "WeblogInfo_55", "WeblogInfo_58" ] save_result(weblog_delete_needed, "deleted_useless_weblog.csv") new_data, new_features, deleted = delete_features(weblog_data, weblog_features, \ delete_feas_list = weblog_delete_needed) save_result(new_data, "data_after_delete_useless_weblog.csv", new_features)
def new_UserInfo_22_23_combine2(data, features):
    """Combine UserInfo_22 and UserInfo_23 into one categorical feature.

    Each (UserInfo_22, UserInfo_23) value pair is mapped to one of six
    categories via the hand-built lookup tables below (presumably derived
    from training-set statistics — confirm with the author).  Pairs not
    covered by any table are bucketed by how many users share that pair
    (frequency buckets 0-4); the fully-missing pair ('-1', '-1') keeps its
    own category 5.  The two source columns are then deleted and the
    combined column appended.

    Parameters
    ----------
    data : 2-D np.ndarray of strings, one row per user.
    features : 1-D np.ndarray of feature names aligned with data's columns.

    Returns
    -------
    (new_data, new_features) : data with the combined column appended and the
        two source columns removed, plus the matching feature-name array.
    """
    key_features = ["UserInfo_22", "UserInfo_23"]
    print("combine2")
    fea_indexs = get_known_features_index(features, key_features)
    feature_name = "UserInfo_combine2_by_present_22_23"
    new_features = np.concatenate((features, np.array([feature_name])))

    ##### map rules: category -> list of known value pairs ####
    map_to_zero = [['未婚', 'AE'], ['已婚', 'AH'], ['再婚', 'M'], ['未婚', 'X'],
                   ['未婚', 'AJ'], ['-1', 'AC'], ['离婚', 'K'], ['已婚', 'AF'],
                   ['未婚', 'AP'], ['再婚', 'G'], ['未婚', 'R'], ['已婚', 'AL'],
                   ['离婚', '专科毕业'], ['离婚', 'G'], ['离婚', 'AC'],
                   ['未婚', 'AD'], ['-1', 'M'], ['离婚', 'AB'], ['已婚', 'AJ'],
                   ['-1', 'R'], ['已婚', 'Y'], ['离婚', 'H'], ['未婚', 'Q'],
                   ['离婚', 'P'], ['已婚', 'Z'], ['初婚', 'G'], ['-1', 'K'],
                   ['再婚', 'O'], ['-1', 'AI'], ['离婚', '-1'], ['已婚', '-1'],
                   ['再婚', 'H'], ['未婚', 'AH'], ['离婚', '大学本科(简称“大学'],
                   ['离婚', 'M'], ['-1', 'P'], ['已婚', 'AE'], ['-1', '专科毕业'],
                   ['-1', 'AH'], ['已婚', 'P'], ['已婚', 'AI'], ['离婚', 'AH'],
                   ['离婚', 'O'], ['已婚', 'AC'], ['-1', 'H'], ['未婚', 'AC'],
                   ['-1', 'AK']]
    map_to_one = [['已婚', 'K'], ['未婚', 'K'], ['未婚', 'W'],
                  ['-1', '大学本科(简称“大学'], ['已婚', '专科毕业']]
    map_to_two = [['未婚', 'Y'], ['已婚', 'AB'], ['未婚', '专科毕业'],
                  ['已婚', '大学本科(简称“大学'], ['已婚', 'M'], ['-1', 'Y'],
                  ['未婚', 'P'], ['-1', 'O'], ['已婚', 'AK'], ['未婚', 'AI'],
                  ['未婚', 'M'], ['未婚', '-1'], ['-1', 'G'], ['未婚', 'H'],
                  ['已婚', 'H'], ['-1', 'AB'], ['未婚', 'AK'], ['已婚', 'O']]
    map_to_three = [['未婚', '大学本科(简称“大学'], ['已婚', 'G'], ['未婚', 'O'],
                    ['未婚', 'AB'], ['未婚', 'G']]
    map_to_four = [['D', 'D']]
    map_to_five = [['-1', '-1']]
    # Ordered exactly like the original if/elif chain.
    bucket_rules = ((0, map_to_zero), (1, map_to_one), (2, map_to_two),
                    (3, map_to_three), (4, map_to_four), (5, map_to_five))

    unseen_pairs = OrderedDict()  # "v22_v23" -> list of row indexes
    feature_data = np.ones((len(data), 1))
    for user in range(data.shape[0]):
        pair = list(data[user, fea_indexs])
        for category, rules in bucket_rules:
            if pair in rules:
                feature_data[user, 0] = category
                break
        else:
            # Pair not in any table: remember it and bucket by frequency
            # below.  str.join replaces the original functools.reduce
            # concatenation (same result, no functools dependency).
            unseen_pairs.setdefault("_".join(pair), []).append(user)

    for pair_key, users in unseen_pairs.items():
        parts = pair_key.split("_")
        if parts[0] == "-1" and parts[1] == "-1":
            # Both values missing -> dedicated category 5.
            # BUG FIX: the original assigned 5 here but then fell through to
            # the frequency buckets, overwriting it.  (Unreachable in
            # practice because map_to_five already catches ['-1', '-1'] in
            # the main loop, so observable behavior is unchanged.)
            feature_data[users, 0] = 5
            continue
        # Bucket unseen pairs by how many users share them.
        if len(users) < 10:
            feature_data[users, 0] = 0
        elif len(users) < 20:
            feature_data[users, 0] = 1
        elif len(users) < 100:
            feature_data[users, 0] = 2
        elif len(users) < 1000:
            feature_data[users, 0] = 3
        else:
            feature_data[users, 0] = 4

    new_data = np.concatenate((data, feature_data), axis=1)
    # Drop the two source columns now that the combined one exists.
    new_data, new_features, deleted = delete_features(new_data, new_features, \
        delete_feas_list=key_features)
    print(deleted)
    return new_data, new_features
fea_indexs = find_featuers_index(features_name, features) print(fea_indexs) weblog_data = data[:, fea_indexs] weblog_features = features[fea_indexs] print(weblog_data.shape) print(weblog_features.shape) #save_result(weblog_data, "weblog_data_view.csv", weblog_features) # # label_lines = np.array(load_result("train_label_original.csv")) # # #print(label_lines.shape) # # from save_load_result import convert_to_float # # label = convert_to_float(label_lines) # # label = label.reshape((label.size, )) correlation_between_properties(weblog_data, weblog_features) delete_result = according_properties_correlation_delete() save_result(delete_result, "deleted_weblog_features_with_strong_correlation.csv") weblog_data, weblog_features, deleted_features = delete_features(weblog_data, weblog_features, \ delete_feas_list = delete_result) save_result(weblog_data, "data_after_delete_strong_correlation_weblog.csv", weblog_features) weblog_delete_needed = ["WeblogInfo_10", "WeblogInfo_11", "WeblogInfo_12", "WeblogInfo_13", "WeblogInfo_23", "WeblogInfo_25", "WeblogInfo_26", "WeblogInfo_28", "WeblogInfo_31", "WeblogInfo_55", "WeblogInfo_58"] save_result(weblog_delete_needed, "deleted_useless_weblog.csv") new_data, new_features, deleted = delete_features(weblog_data, weblog_features, \ delete_feas_list = weblog_delete_needed) save_result(new_data, "data_after_delete_useless_weblog.csv", new_features)
def view_each_features_label(data, features, label):
    """Save one PNG per feature showing how its values distribute over the
    positive/negative instances.

    Side effects: writes PNGs under
    ``view_data_area/after_all/with_label_under_mean/`` and appends feature
    names to ``complex_value_features.csv`` (more than 30 distinct values) or
    ``simple_discrete_value_features(nonestrfeatures).csv`` (30 or fewer,
    non-string features).

    NOTE(review): duplicate of the identically named function earlier in this
    source; relies on module-level helpers (`delete_features`, `load_result`,
    `get_known_features_index`, `feature_value_class`, `save_result`,
    `FeatureInData`) and on matplotlib ``plt``; `label` is assumed to contain
    0/1 values — confirm against the caller.
    """
    # Id and date columns carry no plottable signal.
    data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])
    # Positions of features that were string-typed in the raw data.
    str_style_features = np.array(load_result("str_features.csv")[0])
    str_features_index = get_known_features_index(features, str_style_features)
    new_label = label.reshape((label.size,))
    x = range(len(data))  # NOTE(review): unused
    for fea_pos in range(len(features)):
        feature_name = features[fea_pos]
        # String-typed features get a "(str<pos>)" prefix in the file name.
        if fea_pos in str_features_index:
            file_path = "view_data_area/after_all/with_label_under_mean/" + "(str" + str(fea_pos) + ")" + feature_name + ".png"
        else:
            file_path = "view_data_area/after_all/with_label_under_mean/" + str(fea_pos) + ")" + feature_name + ".png"
        # Per-value statistics (presence counts, positive counts).
        features_info = feature_value_class(data, fea_pos, label, str_features_index)
        # Bookkeeping: record which features have many / few distinct values.
        if features_info["num_of_value"] > 30:
            save_result([features[fea_pos]], "complex_value_features.csv", style = "a+")
        else:
            if fea_pos not in str_features_index:
                save_result([features[fea_pos]], "simple_discrete_value_features(nonestrfeatures).csv", style = "a+")
        # Scatter the feature values: positives as red 'o', negatives as
        # green 'x', plotted against the instance's row index.
        y_positive = data[new_label == 1, fea_pos]
        y_negitive = data[new_label == 0, fea_pos]
        positive_index = np.array([index for index in range(len(new_label)) if new_label[index] == 1])
        negitive_index = np.array([index for index in range(len(new_label)) if new_label[index] == 0])
        plt.scatter(positive_index, y_positive, marker = 'o', color = 'r', s = 10)
        plt.scatter(negitive_index, y_negitive, marker = 'x', color = 'g', s = 10)
        plt.xlabel("instances(30000)")
        plt.ylabel("value")
        if features_info["num_of_value"] < 40:
            # Few distinct values: annotate every value row with two arrows —
            # right side: positives at this value / all positives,
            # left side: positives at this value / instances at this value.
            plt.title(feature_name + " value - label " + "distributed " + "in instances" + \
                "\n the arrow --> Proportion of positive in that value & in positive")
            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    arrow_data = round(v._respond_positive_num / features_info["num_positive"] , 4)
                    arrow_start_position_x = len(data) + 2000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data, \
                        xy=(arrow_start_position_x,arrow_start_position_y), \
                        xytext=(arrow_end_postion_x,arrow_end_postion_y), \
                        arrowprops=dict(facecolor='blue', shrink=0.02))
                    arrow_data = round(v._respond_positive_num / v._present_num , 4)
                    arrow_start_position_x = -4000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data, \
                        xy=(arrow_start_position_x,arrow_start_position_y), \
                        xytext=(arrow_end_postion_x,arrow_end_postion_y), \
                        arrowprops=dict(facecolor='blue', shrink=0.02))
        else:
            # Many distinct values: annotate the mean and the share of
            # positives whose value lies at or below the mean.
            fea_average = round(np.mean(data[:, fea_pos]), 4)
            fea_std = np.std(data[:, fea_pos])
            # Coefficient of variation, used as a "degree of fluctuation".
            fea_oo = round(fea_std / fea_average, 4)
            max_v = np.amax(data[:, fea_pos])
            min_v = np.amin(data[:, fea_pos])
            plt.title(feature_name + " | mean & Proportion of positive under that mean" + \
                "\n degree of fluctuation --> " + str(fea_oo))
            x1 = np.array(range(-5000, 35000))
            y_mean = fea_average * np.ones((x1.size))
            #plt.plot(x1, y_mean, color = 'k', linestyle = "--")
            plt.annotate(fea_average, \
                xy=(-4000,fea_average), \
                xytext=(-4000,fea_average), \
                arrowprops=dict(facecolor='blue', shrink=0.05))
            under_mean_positive = 0
            under_mean_num = 0
            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    if k <= fea_average:
                        under_mean_num += v._present_num
                        under_mean_positive += v._respond_positive_num
            ave_posi = round(under_mean_positive / features_info["num_positive"], 4)
            plt.annotate(ave_posi, \
                xy=(31000,fea_average), \
                xytext=(31000,fea_average), \
                arrowprops=dict(facecolor='blue', shrink=0.05))
            # -1 presumably encodes "missing"; annotate its positive rates
            # separately when present — TODO confirm the encoding.
            pos_rat = 0
            pos_rat_whole = 0
            if -1 in features_info.keys():
                pos_rat = features_info[-1]._respond_positive_num / features_info[-1]._present_num
                pos_rat_whole = features_info[-1]._respond_positive_num / features_info["num_positive"]
            plt.annotate(round(pos_rat_whole, 4), \
                xy=(31000,-1), \
                xytext=(31000,-1))
            plt.annotate(round(pos_rat, 4), \
                xy=(-4000,-1), \
                xytext=(-4000,-1))
            plt.ylim(min_v - 10, fea_average * 2)
        #plt.ylim(fea_average - round(fea_average / 10), max_v + round(fea_average / 10))
        plt.savefig(file_path)
        plt.close()
# if a value in one features is bigger than 20000, besides # the positive in it is almost equal to the positive in the train data if __name__ == '__main__': #################### used to calculate the correlation between properties ######### contents = load_result("data_after_delete_no_discrimination_features.csv") features = np.array(contents[0]) data = np.array(contents[1:]) from map_features_to_digit import convert_to_numerical from solve_data import delete_features data = convert_to_numerical(data, features) data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"]) correlation_between_properties(data, features) delete_result = according_properties_correlation_delete() save_result(delete_result, "deleted_features_with_strong_correlation.csv") data, features, deleted_features = delete_features(data, features, \ delete_feas_list = delete_result) # print(deleted_features) save_result(data, "data_after_delete_strong_correlation_features.csv", features) print(data.shape) ###############3 used pca to delete #####################
def new_UserInfo_22_23_combine2(data, features):
    """Combine UserInfo_22 and UserInfo_23 into one categorical feature.

    Duplicate of the identically named function earlier in this source.
    Each (UserInfo_22, UserInfo_23) value pair is mapped to one of six
    categories via the hand-built lookup tables below; pairs not covered by
    any table are bucketed by how many users share that pair (frequency
    buckets 0-4), while the fully-missing pair ('-1', '-1') keeps its own
    category 5.  The two source columns are then deleted and the combined
    column appended.

    Parameters
    ----------
    data : 2-D np.ndarray of strings, one row per user.
    features : 1-D np.ndarray of feature names aligned with data's columns.

    Returns
    -------
    (new_data, new_features) : data with the combined column appended and the
        two source columns removed, plus the matching feature-name array.
    """
    key_features = ["UserInfo_22", "UserInfo_23"]
    print("combine2")
    fea_indexs = get_known_features_index(features, key_features)
    feature_name = "UserInfo_combine2_by_present_22_23"
    new_features = np.concatenate((features, np.array([feature_name])))

    ##### map rules: category -> list of known value pairs ####
    map_to_zero = [['未婚', 'AE'], ['已婚', 'AH'], ['再婚', 'M'], ['未婚', 'X'],
                   ['未婚', 'AJ'], ['-1', 'AC'], ['离婚', 'K'], ['已婚', 'AF'],
                   ['未婚', 'AP'], ['再婚', 'G'], ['未婚', 'R'], ['已婚', 'AL'],
                   ['离婚', '专科毕业'], ['离婚', 'G'], ['离婚', 'AC'],
                   ['未婚', 'AD'], ['-1', 'M'], ['离婚', 'AB'], ['已婚', 'AJ'],
                   ['-1', 'R'], ['已婚', 'Y'], ['离婚', 'H'], ['未婚', 'Q'],
                   ['离婚', 'P'], ['已婚', 'Z'], ['初婚', 'G'], ['-1', 'K'],
                   ['再婚', 'O'], ['-1', 'AI'], ['离婚', '-1'], ['已婚', '-1'],
                   ['再婚', 'H'], ['未婚', 'AH'], ['离婚', '大学本科(简称“大学'],
                   ['离婚', 'M'], ['-1', 'P'], ['已婚', 'AE'], ['-1', '专科毕业'],
                   ['-1', 'AH'], ['已婚', 'P'], ['已婚', 'AI'], ['离婚', 'AH'],
                   ['离婚', 'O'], ['已婚', 'AC'], ['-1', 'H'], ['未婚', 'AC'],
                   ['-1', 'AK']]
    map_to_one = [['已婚', 'K'], ['未婚', 'K'], ['未婚', 'W'],
                  ['-1', '大学本科(简称“大学'], ['已婚', '专科毕业']]
    map_to_two = [['未婚', 'Y'], ['已婚', 'AB'], ['未婚', '专科毕业'],
                  ['已婚', '大学本科(简称“大学'], ['已婚', 'M'], ['-1', 'Y'],
                  ['未婚', 'P'], ['-1', 'O'], ['已婚', 'AK'], ['未婚', 'AI'],
                  ['未婚', 'M'], ['未婚', '-1'], ['-1', 'G'], ['未婚', 'H'],
                  ['已婚', 'H'], ['-1', 'AB'], ['未婚', 'AK'], ['已婚', 'O']]
    map_to_three = [['未婚', '大学本科(简称“大学'], ['已婚', 'G'], ['未婚', 'O'],
                    ['未婚', 'AB'], ['未婚', 'G']]
    map_to_four = [['D', 'D']]
    map_to_five = [['-1', '-1']]
    # Ordered exactly like the original if/elif chain.
    bucket_rules = ((0, map_to_zero), (1, map_to_one), (2, map_to_two),
                    (3, map_to_three), (4, map_to_four), (5, map_to_five))

    unseen_pairs = OrderedDict()  # "v22_v23" -> list of row indexes
    feature_data = np.ones((len(data), 1))
    for user in range(data.shape[0]):
        pair = list(data[user, fea_indexs])
        for category, rules in bucket_rules:
            if pair in rules:
                feature_data[user, 0] = category
                break
        else:
            # Pair not in any table: remember it and bucket by frequency
            # below.  str.join replaces the original functools.reduce
            # concatenation (same result, no functools dependency).
            unseen_pairs.setdefault("_".join(pair), []).append(user)

    for pair_key, users in unseen_pairs.items():
        parts = pair_key.split("_")
        if parts[0] == "-1" and parts[1] == "-1":
            # Both values missing -> dedicated category 5.
            # BUG FIX: the original assigned 5 here but then fell through to
            # the frequency buckets, overwriting it.  (Unreachable in
            # practice because map_to_five already catches ['-1', '-1'] in
            # the main loop, so observable behavior is unchanged.)
            feature_data[users, 0] = 5
            continue
        # Bucket unseen pairs by how many users share them.
        if len(users) < 10:
            feature_data[users, 0] = 0
        elif len(users) < 20:
            feature_data[users, 0] = 1
        elif len(users) < 100:
            feature_data[users, 0] = 2
        elif len(users) < 1000:
            feature_data[users, 0] = 3
        else:
            feature_data[users, 0] = 4

    new_data = np.concatenate((data, feature_data), axis=1)
    # Drop the two source columns now that the combined one exists.
    new_data, new_features, deleted = delete_features(new_data, new_features, \
        delete_feas_list=key_features)
    print(deleted)
    return new_data, new_features
from main_for_process_data import load_data_for_solve, replace_miss, strStyle_features_to_digit from save_load_result import save_features_info, save_result, load_all_deleted_features_during_train from solve_data import delete_features import numpy as np if __name__ == '__main__': data, features, label = load_data_for_solve( "PPD_Master_GBK_2_Test_Set.csv", for_train=False) data, features = replace_miss(data, features, label, for_train=False) #save_result(data, "test/data_after_filling_missing_.csv", features) deleted_features_in_train = load_all_deleted_features_during_train( deleted_features_file_label="deleted_features_with_too_many_missing") data, features, deleted = delete_features( data, features, delete_feas_list=deleted_features_in_train) save_result(data, "test_data_after_deleted_features.csv", features, dir_name="resultData/test/") data = strStyle_features_to_digit(data, features, for_train=False, use_experience=True) save_result(data, "data_after_digited.csv", features, dir_name="resultData/test/") save_features_info(data, features, label, "info_after_digit_all_features.csv", \ dir_name = "resultData/test/")