def get_respond_data(for_train=True):
    """Load the first-round competition CSVs for the chosen split.

    Args:
        for_train: True loads the training set, False the test set.

    Returns:
        (data, log_info_data, update_info_data) — `data` keeps the Idx
        column plus the last two master columns for training data
        (ListingInfo/target) and Idx plus the last column for test data;
        the other two arrays are the log/update tables without headers.
    """
    if for_train:
        data_dir = "PPD-First-Round-Data/Training Set/"
        log_name = "PPD_LogInfo_3_1_Training_Set.csv"
        update_name = "PPD_Userupdate_Info_3_1_Training_Set.csv"
        master_name = "PPD_Training_Master_GBK_3_1_Training_Set.csv"
    else:
        data_dir = "PPD-First-Round-Data/Test Set/"
        log_name = "PPD_LogInfo_2_Test_Set.csv"
        update_name = "PPD_Userupdate_Info_2_Test_Set.csv"
        master_name = "PPD_Master_GBK_2_Test_Set.csv"

    # -------- load the needed data --------
    log_table = np.array(load_result(log_name, dir_name=data_dir))
    log_info_data = log_table[1:]  # row 0 is the header

    update_table = np.array(load_result(update_name, dir_name=data_dir))
    update_info_data = update_table[1:]

    master_table = np.array(load_result(master_name, dir_name=data_dir))
    # training master: Idx + last two columns; test master has no target
    wanted_cols = [0, -2, -1] if for_train else [0, -1]
    data = master_table[1:, wanted_cols]

    return data, log_info_data, update_info_data
def view_each_features(data, features):
    """Save one scatter plot per feature (value vs. instance index) under
    view_data_area/csj/.

    String-style features are plotted too; their file names carry a
    "(str<pos>)" prefix so they can be told apart.
    """
    data, features, _removed = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    str_style_features = np.array(load_result("str_features.csv")[0])
    str_indexes = get_known_features_index(features, str_style_features)

    instance_axis = range(len(data))
    for pos, name in enumerate(features):
        # string-style features get a distinguishing "(str<pos>)" prefix
        if pos in str_indexes:
            png_path = "view_data_area/csj/" + "(str" + str(pos) + ")" + name + ".png"
        else:
            png_path = "view_data_area/csj/" + str(pos) + ")" + name + ".png"
        plt.scatter(instance_axis, data[:, pos])
        plt.xlabel("instances(30000)")
        plt.ylabel("value")
        plt.title(name + " value " + "distributed " + "in instances")
        plt.ylim(-2)  # fix only the lower bound; the top auto-scales
        plt.savefig(png_path)
        plt.close()
def view_each_features(data, features):
    """Save a scatter plot of each remaining feature's values over all
    instances to view_data_area/csj/, one PNG per feature.

    String-style features are still drawn, but with a "(str<pos>)" prefix
    in the file name so they can be told apart.
    """
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    str_style_features = np.array(load_result("str_features.csv")[0])
    str_features_index = get_known_features_index(features, str_style_features)
    x = range(len(data))
    for fea_pos in range(len(features)):
        feature_name = features[fea_pos]
        # string-style features get a distinguishing "(str<pos>)" file prefix
        if fea_pos in str_features_index:
            file_path = "view_data_area/csj/" + "(str" + str(
                fea_pos) + ")" + feature_name + ".png"
        else:
            file_path = "view_data_area/csj/" + str(
                fea_pos) + ")" + feature_name + ".png"
        y = data[:, fea_pos]
        plt.scatter(x, y)
        plt.xlabel("instances(30000)")
        plt.ylabel("value")
        plt.title(feature_name + " value " + "distributed " + "in instances")
        plt.ylim(-2)  # only the lower bound is fixed; the top auto-scales
        plt.savefig(file_path)
        plt.close()
def new_UserInfo_7_num(data, features):
    """Append a 'UserInfo_7_province_preIncome' column.

    Each user's UserInfo_7 province string is matched (by substring) against
    the 2014 per-capita disposable-income ranking table; matched users get
    the province income, unmatched users get -1.

    Returns the extended (data, features) pair.
    """
    src_index = np.where(features == "UserInfo_7")[0][0]

    #### create new features --> UserInfo_7_province_preIncome
    features = np.concatenate(
        (features, np.array(["UserInfo_7_province_preIncome"])))

    income_table = load_result("2014中国各省人均可支配收入排行.csv",
                               dir_name="material_data")
    header = np.array(income_table[0])
    rows = np.array(income_table[1:])
    name_col = np.where(header == "省份名")[0][0]
    income_col = np.where(header == "可支配收入")[0][0]
    provinces = list(rows[:, name_col])

    def match_province(value):
        # substring match, e.g. "湖南" matches "湖南省"; row index or -1
        for row, province in enumerate(provinces):
            if value in province:
                return row
        return -1

    new_column = np.zeros((data.shape[0], 1))
    for user in range(data.shape[0]):
        row = match_province(data[user, src_index])
        new_column[user, 0] = int(rows[row, income_col]) if row != -1 else row

    data = np.concatenate((data, new_column), axis=1)
    print("UserInfo_7_province_preIncome" + " solved")
    return data, features
def correlation_between_properties(data, features):
    """Compute the correlation of every feature pair and log it to CSV.

    Numeric pairs use Pearson, any pair involving a string-style feature
    uses Spearman. Pairs with |cor| >= 0.2 go to
    pearsonr_spearmanr_results.csv, pairs with |cor| >= 0.86 additionally
    go to pearsonr_spearmanr_Strong_correlation.csv.
    """
    str_features = np.array(load_result("str_features.csv"))[0]
    str_indexes = get_known_features_index(features, str_features)

    header = ["features1", "features2", "calculate_method", "cor", "pval"]
    save_result(header, "pearsonr_spearmanr_results.csv")
    save_result(header, "pearsonr_spearmanr_Strong_correlation.csv")

    total = len(features)
    for left in range(total):
        for right in range(left + 1, total):
            col_left = data[:, left]
            col_right = data[:, right]
            # Pearson only when neither column is string-style
            if left not in str_indexes and right not in str_indexes:
                method = "pearsonr"
                cor, pval = stats.pearsonr(col_left, col_right)
            else:
                method = "spearmanr"
                cor, pval = stats.spearmanr(col_left, col_right)
            cor = round(cor, 3)
            row = [features[left], features[right], method, cor, pval]
            if abs(cor) >= 0.2:
                save_result(row, "pearsonr_spearmanr_results.csv", style="a+")
            if abs(cor) >= 0.86:
                save_result(row, "pearsonr_spearmanr_Strong_correlation.csv",
                            style="a+")
def map_str_to_digit(data, features, no_map_features, only_map_features=" ",
                     label=" "):
    """Convert string-valued feature columns to numeric codes.

    Args:
        no_map_features: feature names that must never be mapped.
        only_map_features: when given (anything but the default " "),
            restrict the mapping to exactly these features.
        label: optional label column forwarded to feature_value_class.

    Returns:
        (digited_data, features_map_info) where features_map_info maps each
        converted feature name to its value->code OrderedDict.
    """
    skip_indexes = get_known_features_index(features, no_map_features)
    features_map_info = dict()
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    fixed_str_indexes = get_known_features_index(features, fixed_str_features)

    # by default every column is eligible; an explicit list narrows it down
    eligible_indexes = range(len(features))
    if only_map_features != " ":
        eligible_indexes = get_known_features_index(features, only_map_features)

    # column 0 is the user id, so start at 1
    for fea_pos in range(1, len(features)):
        if fea_pos in skip_indexes or fea_pos not in eligible_indexes:
            continue
        map_info = OrderedDict()
        fea_val_cla = feature_value_class(data, fea_pos, label,
                                          fixed_str_indexes)
        # only string-valued features need an explicit value->code table
        if fea_val_cla["str_feature"]:
            data, map_info = map_str_feature_to_value(data, fea_pos,
                                                      fea_val_cla)
            features_map_info[features[fea_pos]] = map_info

    digited_data = convert_to_numerical(data, features)
    return digited_data, features_map_info
def Integrate_Log_Update(for_train=True, is_round_two=False):
    """Load log/update/master CSVs for the chosen round and split.

    Args:
        for_train: True for the training split, False for the test split.
        is_round_two: True for the rematch (second-round) data set.

    Returns:
        (data, log_info_data, update_info_data) — `data` keeps Idx plus the
        last two master columns for training data, Idx plus the last column
        for test data.
    """
    # (dir, log file, update file, master file) per (for_train, is_round_two)
    locations = {
        (True, True): ("PPD-Second-Round-Data/Rematch Train/",
                       "LogInfo_9w_3_2.csv",
                       "Userupdate_Info_9w_3_2.csv",
                       "Kesci_Master_9w_gbk_3_2.csv"),
        (True, False): ("PPD-First-Round-Data/Training Set/",
                        "PPD_LogInfo_3_1_Training_Set.csv",
                        "PPD_Userupdate_Info_3_1_Training_Set.csv",
                        "PPD_Training_Master_GBK_3_1_Training_Set.csv"),
        (False, True): ("PPD-Second-Round-Data/Rematch Test/",
                        "LogInfo_9w_1.csv",
                        "Userupdate_Info_9w_1.csv",
                        "Kesci_Master_9w_gbk_1_test_set.csv"),
        (False, False): ("PPD-First-Round-Data/Test Set/",
                         "PPD_LogInfo_2_Test_Set.csv",
                         "PPD_Userupdate_Info_2_Test_Set.csv",
                         "PPD_Master_GBK_2_Test_Set.csv"),
    }
    data_dir, log_file, update_file, data_file = locations[
        (bool(for_train), bool(is_round_two))]

    # #################### load the needed data ####################
    log_table = np.array(load_result(log_file, dir_name=data_dir))
    log_info_data = log_table[1:]  # row 0 is the header

    update_table = np.array(load_result(update_file, dir_name=data_dir))
    update_info_data = update_table[1:]

    print("for_train: ", for_train)
    print("is_round_two: ", is_round_two)

    master = np.array(load_result(data_file, dir_name=data_dir))
    # test masters have no target column, so keep one column less
    columns = [0, -2, -1] if for_train else [0, -1]
    data = master[1:, columns]
    return data, log_info_data, update_info_data
def digit_city_features(data, features, city_features,
                        use_original_features=False):
    """Map city-name feature columns to numeric values using the 2013 CEI
    city ranking tables.

    Two tables (municipalities/provincial capitals and prefecture-level
    cities) are concatenated into one lookup basis before mapping.
    """
    # get the map basis
    ranking = load_result("2013中国直辖市 省会城市和计划单列市排名榜.csv",
                          dir_name="material_data")
    cei_features = np.array(ranking[0])
    major_cities = np.array(ranking[1:])
    ranking = load_result("2013中国城市商业信用环境指数地级市排名榜.csv",
                          dir_name="material_data")
    prefecture_cities = np.array(ranking[1:])
    cei_records = np.concatenate((major_cities, prefecture_cities))

    # create the map basis
    city_map_basis = create_city_map_basis(cei_records, cei_features)

    if use_original_features:
        # restore the raw (un-digitised) city strings before mapping
        data = replace_with_original(data, features, city_features)
    return use_map_basis_to_digit(data, features, city_map_basis,
                                  city_features)
def map_str_to_digit_with_experience(data, features,
                                     digited_special_str_features,
                                     contain_special_features):
    """Digitise string-style features using a previously saved
    feature -> {value: code} map.

    Rows holding values absent from the saved map are collected in
    `remember` and deleted from `data` before the final numeric conversion.

    Args:
        digited_special_str_features: names already digitised elsewhere,
            skipped here.
        contain_special_features: names that should be mapped in this pass.

    Returns:
        The fully numeric data array (possibly with unmappable rows removed).
    """
    map_experience = load_result(FEATURES_MAP_INFO_FILE_NAME,
                                 dir_name="resultData/features_map")
    print(map_experience)
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    fixed_str_features_index = get_known_features_index(features,
                                                        fixed_str_features)
    digited_special_str_features_index = get_known_features_index(
        features, digited_special_str_features)
    contain_special_features_index = get_known_features_index(
        features, contain_special_features)

    remember = list()  # row indexes whose value could not be mapped
    for fea_pos in range(1, len(features)):
        # str style features + not already digited + selected for mapping
        if fea_pos in fixed_str_features_index and \
                fea_pos not in digited_special_str_features_index and \
                fea_pos in contain_special_features_index:
            # ListingInfo dates may be stored reversed (first component is
            # not a 4-digit year) — normalise before mapping
            if features[fea_pos] == "ListingInfo" and int(
                    data[0, fea_pos].split("/")[0]) < 1000:
                data = reverse_date(data, fea_pos)
            if features[fea_pos] in map_experience.keys():
                for i in range(len(data)):
                    if data[i, fea_pos] == "-1":
                        continue  # missing-value marker, leave untouched
                    try:
                        data[i, fea_pos] = \
                            map_experience[features[fea_pos]][data[i, fea_pos]]
                    except KeyError:
                        # BUG FIX: the old bare-except handler re-evaluated
                        # the failing lookup while printing, so a KeyError
                        # escaped the handler whenever i < 50. Print the
                        # unmapped value itself instead.
                        if i < 50:
                            print(features[fea_pos])
                            print(map_experience[features[fea_pos]])
                            print(data[i, fea_pos])
                        remember.append(i)  # row holds an unmappable value
    data = np.delete(data, remember, 0)
    digited_data = convert_to_numerical(data, features)
    return digited_data
def according_properties_correlation_delete():
    """Pick one feature of every strongly-correlated pair for deletion.

    For each (f1, f2) row of the strong-correlation report, f2 is marked
    for deletion unless f1 itself already appears somewhere as a second
    member (so at least one feature of each correlation chain survives).

    Returns:
        np.ndarray of the unique feature names chosen for deletion.
    """
    report = np.array(load_result("pearsonr_spearmanr_Strong_correlation.csv"))
    first_members = report[1:, 0]
    second_members = report[1:, 1]
    doomed = {fea2 for fea1, fea2 in zip(first_members, second_members)
              if fea1 not in second_members}
    return np.array(list(doomed))
def according_properties_correlation_delete():
    """Select features for deletion from the strong-correlation report,
    then delete the intermediate report files.

    For each (f1, f2) report row, f2 is marked for deletion unless f1
    itself appears somewhere as a second member.

    Returns:
        list of the unique feature names chosen for deletion.
    """
    report = np.array(load_result("pearsonr_spearmanr_Strong_correlation.csv"))
    first_members = report[1:, 0]
    second_members = report[1:, 1]
    doomed = {fea2 for fea1, fea2 in zip(first_members, second_members)
              if fea1 not in second_members}
    # the correlation reports are only intermediate artefacts — remove them
    for leftover in ("resultData/pearsonr_spearmanr_results.csv",
                     "resultData/pearsonr_spearmanr_Strong_correlation.csv"):
        os.remove(os.path.join(os.getcwd(), leftover))
    return list(doomed)
def fill_all_missing(data, features, label=None):
    """Fill the missing values of every feature column and return the pair.

    Args:
        data / features: working matrix and its feature-name header.
        label: optional label column forwarded to the helpers.

    Returns:
        (data, features) with missing values filled.
    """
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    indexs = get_known_features_index(features, fixed_str_features)
    # start from 1: column 0 is the user id, nothing to fill there
    for fea_pos in range(1, len(features)):
        fea_val_cla = feature_value_class(data, fea_pos, label, indexs)
        # fea_val_cla[-1] is presumably the entry for the -1 missing marker
        # (TODO confirm against feature_value_class); only fill columns
        # where that marker actually occurs.
        if not fea_val_cla[-1]._present_num == 0:
            # BUG FIX: removed leftover debug output
            # (`if fea_pos == 5: print(fea_val_cla)`).
            data = fill_the_missing(data, fea_pos, fea_val_cla, label)
    return data, features
def according_coefficient_variation_delete(data, features):
    """Compute the coefficient of variation for each feature listed in
    complex_value_features.csv.

    Columns that cannot be processed numerically are skipped (best effort).

    Returns:
        OrderedDict {feature name: stats.variation(column)}.
    """
    candidates = np.array(load_result("complex_value_features.csv"))
    candidates = candidates.reshape((candidates.size, ))
    indexs = get_known_features_index(features, candidates)
    coefficient_variation_info = OrderedDict()
    for fea_pos in indexs:
        try:
            coefficient_variation_info[features[fea_pos]] = stats.variation(
                data[:, fea_pos])
        except (TypeError, ValueError):
            # BUG FIX: was a bare `except: pass`, which also swallowed
            # KeyboardInterrupt/SystemExit. Keep the best-effort behaviour
            # but only for numeric-conversion failures.
            pass
    return coefficient_variation_info
def digit_province_features(data, features, province_features,
                            use_original_features=False):
    """Map province-name feature columns to numeric values via the 2014
    per-capita disposable-income ranking table."""
    income_table = load_result("2014中国各省人均可支配收入排行.csv",
                               dir_name="material_data")
    header = np.array(income_table[0])
    rows = np.array(income_table[1:])
    basis = create_province_map_basis(rows, header)

    if use_original_features:
        # restore the raw province strings before applying the mapping
        data = replace_with_original(data, features, province_features)
    return use_map_basis_to_digit(data, features, basis, province_features)
def map_str_to_digit_with_experience(data, features,
                                     digited_special_str_features,
                                     contain_special_features):
    """Digitise string-style features using the saved feature->value map.

    Rows holding values absent from the saved map are recorded in
    `remember` and deleted from `data` before the numeric conversion.
    """
    map_experience = load_result(FEATURES_MAP_INFO_FILE_NAME,
                                 dir_name = "resultData/features_map")
    print(map_experience)
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    fixed_str_features_index = get_known_features_index(features, fixed_str_features)
    digited_special_str_features_index = get_known_features_index(features,
                                                                  digited_special_str_features)
    contain_special_features_index = get_known_features_index(features,
                                                              contain_special_features)
    remember = list()  # row indexes whose value could not be mapped
    for fea_pos in range(1, len(features)):
        # str style features + str .. but not digited + the str we want to digit
        if fea_pos in fixed_str_features_index and \
                fea_pos not in digited_special_str_features_index and \
                fea_pos in contain_special_features_index:
            # the ListingInfo date may be stored reversed !!!
            if features[fea_pos] == "ListingInfo" and int(data[0, fea_pos].split("/")[0]) < 1000:
                data = reverse_date(data, fea_pos)
            if features[fea_pos] in map_experience.keys():
                for i in range(len(data)):
                    if data[i, fea_pos] == "-1":
                        continue  # missing-value marker, leave untouched
                    try:
                        data[i, fea_pos] = map_experience[features[fea_pos]][data[i, fea_pos]]
                    except:
                        # NOTE(review): the last print below re-evaluates the
                        # exact lookup that just failed, so a KeyError escapes
                        # this handler whenever i < 50 — confirm and fix.
                        if i < 50:
                            print(features[fea_pos])
                            print(map_experience[features[fea_pos]])
                            print(map_experience[features[fea_pos]][data[i, fea_pos]])
                        remember.append(i)  # this row holds an unmappable value
    data = np.delete(data, remember, 0)
    digited_data = convert_to_numerical(data, features)
    return digited_data
def digit_city_features(data, features, city_features,
                        use_original_features=False):
    """Digitise city-name feature columns using the 2013 CEI city rankings.

    Two ranking tables (municipalities/provincial capitals and
    prefecture-level cities) are concatenated into a single lookup basis.

    Args:
        city_features: names of the columns holding city strings.
        use_original_features: when True, restore the raw city strings from
            the original dump before mapping.
    """
    # get the map basis
    cei_record_content = load_result("2013中国直辖市 省会城市和计划单列市排名榜.csv",
                                     dir_name="material_data")
    cei_features = np.array(cei_record_content[0])
    cei_recored_data1 = np.array(cei_record_content[1:])
    cei_record_content = load_result("2013中国城市商业信用环境指数地级市排名榜.csv",
                                     dir_name="material_data")
    cei_recored_data2 = np.array(cei_record_content[1:])
    cei_recored_data = np.concatenate((cei_recored_data1, cei_recored_data2))
    # create the map basis
    city_map_basis = create_city_map_basis(cei_recored_data, cei_features)
    if use_original_features:
        data = replace_with_original(data, features, city_features)
    digited_city_data = use_map_basis_to_digit(data, features, city_map_basis,
                                               city_features)
    return digited_city_data
def according_coefficient_variation_delete(data, features):
    """Compute the coefficient of variation for the features listed in
    complex_value_features.csv.

    Returns:
        OrderedDict {feature name: stats.variation(column)}.
    """
    waiting_to_delete = np.array(load_result("complex_value_features.csv"))
    waiting_to_delete = waiting_to_delete.reshape((waiting_to_delete.size, ))
    indexs = get_known_features_index(features, waiting_to_delete)
    coefficient_variation_info = OrderedDict()
    for fea_pos in indexs:
        try:
            coefficient_variation_fea = stats.variation(data[:, fea_pos])
            coefficient_variation_info[
                features[fea_pos]] = coefficient_variation_fea
        except:
            # NOTE(review): bare except silently skips any failing column and
            # would also swallow KeyboardInterrupt — narrow it.
            pass
    return coefficient_variation_info
def replace_with_original(data, features, replace_features,
                          original_name="withoutLabel_originalData.csv"):
    """Overwrite the listed feature columns in `data` with the raw values
    from the original (pre-processing) dump, matching columns by name.

    Feature names absent from either header are reported and skipped.

    Returns:
        The (mutated) data array.
    """
    original_contents = load_result(original_name, dir_name=SAVE_DIR)
    original_features = np.array(original_contents[0])
    original_data = np.array(original_contents[1:])
    for fea in replace_features:
        try:
            original_index = np.where(original_features == fea)[0][0]
            index = np.where(features == fea)[0][0]
        except IndexError:
            # BUG FIX: np.where(...)[0][0] raises IndexError when the name is
            # absent; the old bare `except` also hid unrelated errors, and
            # the message ran the name into the text ("...may not existed").
            print(str(fea) + " may not exist in input features")
            continue
        data[:, index] = original_data[:, original_index]
    return data
def replace_with_original(data, features, replace_features,
                          original_name = "withoutLabel_originalData.csv"):
    """Copy the raw values of `replace_features` from the original dump back
    into `data`, matching columns by feature name.

    Names absent from either header are reported and skipped.
    """
    original_contents = load_result(original_name, dir_name = SAVE_DIR)
    original_features = np.array(original_contents[0])
    original_data = np.array(original_contents[1:])
    for fea in replace_features:
        try:
            original_index = np.where(original_features == fea)[0][0]
            index = np.where(features == fea)[0][0]
            data[:, index] = original_data[:, original_index]
        except:
            # NOTE(review): np.where(...)[0][0] raises IndexError on a missing
            # name; this bare except also hides unrelated errors, and the
            # message lacks a separating space after the feature name.
            print(str(fea) + "may not existed in input features")
            continue
    return data
def digit_province_features(data, features, province_features,
                            use_original_features=False):
    """Digitise province-name feature columns via the 2014 per-capita
    disposable-income ranking table.

    Args:
        province_features: names of the columns holding province strings.
        use_original_features: when True, restore the raw province strings
            from the original dump before mapping.
    """
    province_content = load_result("2014中国各省人均可支配收入排行.csv",
                                   dir_name="material_data")
    province_info = np.array(province_content[0])
    province_data = np.array(province_content[1:])
    province_map_basis = create_province_map_basis(province_data, province_info)
    if use_original_features:
        data = replace_with_original(data, features, province_features)
    digited_province_data = use_map_basis_to_digit(data, features,
                                                   province_map_basis,
                                                   province_features)
    return digited_province_data
def combine_results(results_dir="resultData/test/test_result/final_predict"):
    """Average the prediction files whose embedded score exceeds 770 and
    submit the ensemble.

    File names are expected to look like "<name>.<score>…", where the three
    digits right after the first '.' encode the validation score
    (e.g. ".771" -> 771).

    Raises:
        ValueError: when no file in `results_dir` beats the threshold
            (previously this fell through to a ZeroDivisionError).
    """
    upper_count = 0
    # NOTE(review): hard-coded test-set size (19999 rows) — every result
    # file must contain exactly this many predictions; confirm.
    sum_upper_thresh = np.zeros((19999, 1))
    for result_file in os.listdir(results_dir):
        content = load_result(result_file, results_dir)
        test_scores = np.array(content[1:])
        # second column holds the predicted score for each test user
        test_scores = np.array(list(map(float, list(test_scores[:, 1]))))
        test_scores = test_scores.reshape((test_scores.size, 1))
        result_test_score = result_file.split(".")[1][:3]
        if int(result_test_score) > 770:
            print(result_file)
            print(test_scores)
            print(result_test_score)
            print("add+ sum_upper_thresh", result_file)
            sum_upper_thresh += test_scores
            upper_count += 1
    if upper_count == 0:
        raise ValueError(
            "no result file in %s scored above the 770 threshold" % results_dir)
    sum_upper_average = sum_upper_thresh / upper_count
    print("*** average ***")
    print(sum_upper_average)
    print(sum_upper_average.shape)
    sum_upper_average = sum_upper_average.reshape((sum_upper_average.size,))
    submit(sum_upper_average)
def submit(test_predict, save_dir):
    """Write the submission file sublime_data.csv (Idx, score) to `save_dir`.

    NOTE(review): combine_results() calls submit() with a single argument,
    but `save_dir` has no default here — confirm which signature is current.
    """
    ###################################### Idx #########################
    print(test_predict)
    # round each prediction to 4 decimal places
    test_predict = np.array(
        [round(test_predict[i], 4) for i in range(test_predict.shape[0])])
    print(test_predict)
    contents = load_result("withoutLabel_originalData.csv",
                           dir_name = "resultData_All/test")
    features = np.array(contents[0])
    # header row: the Idx column name plus the new "score" column
    sublime_features = np.array([features[0], "score"])
    save_result(sublime_features, "sublime_data.csv", dir_name = save_dir)
    data = np.array(contents[1:])
    test_users = data[:, 0]  # first column is the user Idx
    test_users = test_users.reshape((test_users.size, 1))
    test_predict = test_predict.reshape((test_predict.size, 1))
    sublime_data = np.concatenate((test_users, test_predict), axis = 1)
    save_result(sublime_data, "sublime_data.csv", style = "a+",
                dir_name = save_dir)
def map_str_to_digit(data, features, no_map_features, only_map_features = " ",
                     label = " "):
    """Convert string-valued feature columns to numeric codes.

    Args:
        no_map_features: feature names that must never be mapped.
        only_map_features: when given (anything but " "), restrict mapping
            to exactly these features.
        label: optional label column forwarded to feature_value_class.

    Returns:
        (digited_data, features_map_info) where features_map_info maps each
        converted feature name to its value->code OrderedDict.
    """
    no_map_features_index = get_known_features_index(features, no_map_features)
    features_map_info = dict()
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    fixed_str_features_index = get_known_features_index(features, fixed_str_features)
    only_map_features_index = range(len(features))
    if not only_map_features == " ":
        only_map_features_index = get_known_features_index(features, only_map_features)
    # start at 1: column 0 is the user id
    for fea_pos in range(1, len(features)):
        if not fea_pos in no_map_features_index and fea_pos in only_map_features_index:
            map_info = OrderedDict()
            fea_val_cla = feature_value_class(data, fea_pos, label,
                                              fixed_str_features_index)
            # if this feature is a string value, just convert it to value
            if fea_val_cla["str_feature"]:
                data, map_info = map_str_feature_to_value(data, fea_pos, fea_val_cla)
                features_map_info[features[fea_pos]] = map_info
    digited_data = convert_to_numerical(data, features)
    return digited_data, features_map_info
def correlation_between_properties(data, features):
    """Compute pairwise feature correlations and log them to CSV.

    Numeric pairs use Pearson, any pair involving a string-style feature
    uses Spearman. Pairs with |cor| >= 0.2 go to
    pearsonr_spearmanr_results.csv; |cor| >= 0.86 additionally goes to
    pearsonr_spearmanr_Strong_correlation.csv.
    """
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    indexs = get_known_features_index(features, fixed_str_features)
    title = list()
    title.append("features1")
    title.append("features2")
    title.append("calculate_method")
    title.append("cor")
    title.append("pval")
    save_result(title, "pearsonr_spearmanr_results.csv")
    save_result(title, "pearsonr_spearmanr_Strong_correlation.csv")
    for fea_pos in range(len(features)):
        for fea_pos_add in range(fea_pos + 1, len(features)):
            info_result = list()
            info_result.append(features[fea_pos])
            info_result.append(features[fea_pos_add])
            a1 = data[:, fea_pos]
            a2 = data[:, fea_pos_add]
            # they are all not str style features -> Pearson
            if fea_pos not in indexs and fea_pos_add not in indexs:
                info_result.append("pearsonr")
                cor, pval = stats.pearsonr(a1, a2)
            else:
                # one of them or all of them are str style features
                info_result.append("spearmanr")
                cor, pval = stats.spearmanr(a1, a2)
            cor = round(cor, 3)
            info_result.append(cor)
            info_result.append(pval)
            if abs(cor) >= 0.2:
                save_result(info_result, "pearsonr_spearmanr_results.csv",
                            style = "a+")
            if abs(cor) >= 0.86:
                save_result(info_result,
                            "pearsonr_spearmanr_Strong_correlation.csv",
                            style = "a+")
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Date : 2016-03-06 09:00:08 # @Author : chensijia ([email protected]) # @Version : 0.0.0 # @Style : Python3.5 # # @Description: import os import numpy as np from save_load_result import save_result, load_result if __name__ == '__main__': contents = load_result("after_Str_features_digited_data.csv") features = np.array(contents[0]) data = np.array(contents[1:]) from map_features_to_digit import convert_to_numerical data = convert_to_numerical(data, features)
random_forest = RandomForestClassifier(class_weight = {1: ratio}) #print(random_forest.get_params().keys()) cv = StratifiedKFold(train_target) grid = GridSearchCV(random_forest, parameters,scoring='roc_auc',cv=cv,verbose=10,n_jobs=-1) grid.fit(train_data, train_target) #print best params print (grid.best_params_) print (grid.best_score_) return grid.best_params_, grid.best_score_ if __name__ == '__main__': contents = load_result("all_data_after_features_processed.csv", dir_name = "resultData_All") features = np.array(contents[0]) data = np.array(contents[1:]) print("data: ", data.shape) label_lines = np.array(load_result("all_train_label_original.csv", dir_name = "resultData_All")) print(label_lines.shape) from save_load_result import convert_to_int label = convert_to_int(label_lines) label = label.reshape((label.size, )) print("label: ", label.shape) data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"]) data = convert_to_numerical(data, features)
lr.fit(data, label) #A helper method for pretty-printing linear models def pretty_print_linear(coefs, names = None, sort = False): if names == None: names = ["X%s" % x for x in range(len(coefs))] lst = zip(coefs, names) if sort: lst = sorted(lst, key = lambda x:-np.abs(x[0])) return " + ".join("%s * %s" % (round(coef, 3), name) for coef, name in lst) print("Linear model:", pretty_print_linear(lr.coef_, features[1:])) if __name__ == '__main__': contents = load_result("after_delete_strong_correlation_features_data.csv") features = np.array(contents[0]) data = np.array(contents[1:]) label_lines = np.array(load_result("train_label_original.csv")) data = convert_to_numerical(data, features) label = convert_to_float(label_lines) label = label.reshape((label.size, )) #use_RandomForestRegressor_to_delete(data, features, label) use_LR_to_delete(data, features, label) #################### first example ####################### # testNum = 10 # average = 0 # for i in range(0, testNum): # #加载数据集,切分数据集80%训练,20%测试
    # (tail of the third-party feature pipeline — its `def` lies outside
    # this chunk)
    data = convert_to_numerical(data, features)
    data, features = sta_start_missing_period(data, features)
    data, features = remove_thirdparty6(data, features)
    data, features = fill_thirdParty_miss(data, features)
    data, features = third_party_stable(data, features)
    data, features = third_party_level(data, features)
    save_result(data, "data_after_thirdparty_solved.csv", features,
                dir_name = saved_dir)
    return data, features


if __name__ == '__main__':
    contents = load_result("data_after_solved_weblog.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    solve_thirdparty_info_package(data, features)
    # (tail of the preceding helper — its `def` lies outside this chunk)
    return user_value_info


def compare_features_info2(data, features, key_features):
    """Count how often each combination of `key_features` values occurs.

    Each user's values for the key features are joined with '_' into one
    string key.

    Returns:
        OrderedDict {combined value string: occurrence count}.
    """
    fea_indexs = get_known_features_index(features, key_features)
    compare_result = OrderedDict()
    for user in range(data.shape[0]):
        combine_data = reduce(lambda x, y: str(x) + '_' + str(y),
                              list(data[user, fea_indexs]))
        if combine_data not in compare_result.keys():
            compare_result[combine_data] = 0
        compare_result[combine_data] += 1
    return compare_result


if __name__ == '__main__':
    contents = load_result("withoutLabel_originalData.csv",
                           dir_name = "resultData/")
    features = np.array(contents[0])
    data = np.array(contents[1:])
    print(data.shape)
    print(features.shape)
def view_each_features_label(data, features, label):
    """Plot every feature's values coloured by label (red 'o' = positive,
    green 'x' = negative) with class-proportion annotations; one PNG per
    feature under view_data_area/after_all/with_label_under_mean/.

    Also logs feature names to complex_value_features.csv (> 30 distinct
    values) or simple_discrete_value_features(nonestrfeatures).csv.
    """
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    str_style_features = np.array(load_result("str_features.csv")[0])
    str_features_index = get_known_features_index(features, str_style_features)
    new_label = label.reshape((label.size, ))
    x = range(len(data))
    for fea_pos in range(len(features)):
        feature_name = features[fea_pos]
        # string-style features get a "(str<pos>)" prefix in the file name
        if fea_pos in str_features_index:
            file_path = "view_data_area/after_all/with_label_under_mean/" + "(str" + str(
                fea_pos) + ")" + feature_name + ".png"
        else:
            file_path = "view_data_area/after_all/with_label_under_mean/" + str(
                fea_pos) + ")" + feature_name + ".png"
        features_info = feature_value_class(data, fea_pos, label,
                                            str_features_index)
        # classify the feature by its number of distinct values
        if features_info["num_of_value"] > 30:
            save_result([features[fea_pos]], "complex_value_features.csv",
                        style="a+")
        else:
            if fea_pos not in str_features_index:
                save_result(
                    [features[fea_pos]],
                    "simple_discrete_value_features(nonestrfeatures).csv",
                    style="a+")
        y_positive = data[new_label == 1, fea_pos]
        y_negitive = data[new_label == 0, fea_pos]
        positive_index = np.array([
            index for index in range(len(new_label)) if new_label[index] == 1
        ])
        negitive_index = np.array([
            index for index in range(len(new_label)) if new_label[index] == 0
        ])
        plt.scatter(positive_index, y_positive, marker='o', color='r', s=10)
        plt.scatter(negitive_index, y_negitive, marker='x', color='g', s=10)
        plt.xlabel("instances(30000)")
        plt.ylabel("value")
        if features_info["num_of_value"] < 40:
            # discrete feature: annotate each value with its positive shares
            plt.title(feature_name + " value - label " + "distributed " + "in instances" + \
                "\n the arrow --> Proportion of positive in that value & in positive")
            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    # right arrow: positives at this value / all positives
                    # (presumably — confirm FeatureInData field semantics)
                    arrow_data = round(
                        v._respond_positive_num / features_info["num_positive"],
                        4)
                    arrow_start_position_x = len(data) + 2000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data, \
                        xy=(arrow_start_position_x,arrow_start_position_y), \
                        xytext=(arrow_end_postion_x,arrow_end_postion_y), \
                        arrowprops=dict(facecolor='blue', shrink=0.02))
                    # left arrow: positives at this value / rows with it
                    arrow_data = round(
                        v._respond_positive_num / v._present_num, 4)
                    arrow_start_position_x = -4000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data, \
                        xy=(arrow_start_position_x,arrow_start_position_y), \
                        xytext=(arrow_end_postion_x,arrow_end_postion_y), \
                        arrowprops=dict(facecolor='blue', shrink=0.02))
        else:
            # continuous-ish feature: annotate the mean and the share of
            # positives falling at or under it
            fea_average = round(np.mean(data[:, fea_pos]), 4)
            fea_std = np.std(data[:, fea_pos])
            fea_oo = round(fea_std / fea_average, 4)  # coefficient of variation
            max_v = np.amax(data[:, fea_pos])
            min_v = np.amin(data[:, fea_pos])
            plt.title(feature_name + " | mean & Proportion of positive under that mean" + \
                "\n degree of fluctuation --> " + str(fea_oo))
            x1 = np.array(range(-5000, 35000))
            y_mean = fea_average * np.ones((x1.size))
            #plt.plot(x1, y_mean, color = 'k', linestyle = "--")
            plt.annotate(fea_average, \
                xy=(-4000,fea_average), \
                xytext=(-4000,fea_average), \
                arrowprops=dict(facecolor='blue', shrink=0.05))
            under_mean_positive = 0
            under_mean_num = 0
            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    if k <= fea_average:
                        under_mean_num += v._present_num
                        under_mean_positive += v._respond_positive_num
            ave_posi = round(
                under_mean_positive / features_info["num_positive"], 4)
            plt.annotate(ave_posi, \
                xy=(31000,fea_average), \
                xytext=(31000,fea_average), \
                arrowprops=dict(facecolor='blue', shrink=0.05))
            # the -1 bucket (missing-value marker) gets its own annotations
            pos_rat = 0
            pos_rat_whole = 0
            if -1 in features_info.keys():
                pos_rat = features_info[
                    -1]._respond_positive_num / features_info[-1]._present_num
                pos_rat_whole = features_info[
                    -1]._respond_positive_num / features_info["num_positive"]
            plt.annotate(round(pos_rat_whole, 4), \
                xy=(31000,-1), \
                xytext=(31000,-1))
            plt.annotate(round(pos_rat, 4), \
                xy=(-4000,-1), \
                xytext=(-4000,-1))
            plt.ylim(min_v - 10, fea_average * 2)
        plt.savefig(file_path)
        plt.close()
    # (tail of the preceding helper — its `def` lies outside this chunk)
    return user_value_info


def compare_features_info2(data, features, key_features):
    """Count occurrences of each combined `key_features` value string
    (per-user values joined with '_').

    Returns:
        OrderedDict {combined value string: occurrence count}.
    """
    fea_indexs = get_known_features_index(features, key_features)
    compare_result = OrderedDict()
    for user in range(data.shape[0]):
        combine_data = reduce(lambda x, y: str(x) + '_' + str(y),
                              list(data[user, fea_indexs]))
        if combine_data not in compare_result.keys():
            compare_result[combine_data] = 0
        compare_result[combine_data] += 1
    return compare_result


if __name__ == '__main__':
    contents = load_result("PPD_Userupdate_Info_3_1_Training_Set.csv",
                           dir_name = "PPD-First-Round-Data/Training Set/")
    features = np.array(contents[0])
    data = np.array(contents[1:])
    print(data.shape)
    print(features.shape)
    # (tail of an xgboost training helper — its `def` lies outside this
    # chunk; `plst`, `xgtrain`, `xgtest`, `watchlist`, `num_rounds`,
    # `preds1` come from the enclosing function)
    model = xgb.train(plst, xgtrain, num_boost_round=num_rounds,
                      evals=watchlist, early_stopping_rounds=120)
    preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)
    model.save_model('0001_1.model')
    # combine predictions
    # since the metric only cares about relative rank we don't need to average
    preds = (preds1) * 1.4 + (preds2) * 8.6
    return preds


if __name__ == '__main__':
    contents = load_result("data_after_features_processed.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])
    label_lines = np.array(load_result("train_label_original.csv"))
    from save_load_result import convert_to_int
    label = convert_to_int(label_lines)
    label = label.reshape((label.size, ))
    print(label.shape)
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    # (tail of the weblog feature pipeline — its `def` lies outside this
    # chunk)
    data, features = new_WI_19(data, features)
    data, features = new_WI_20_by_present(data, features)
    #data, features = new_WI_20_by_positive(data, features)
    data, features = new_WI_21(data, features)
    save_result(data, "data_after_solved_weblog.csv", features,
                dir_name=saved_dir)
    return data, features


if __name__ == '__main__':
    contents = load_result("data_after_solved_UserInfo22_23.csv",
                           dir_name=saved_dir)
    features = np.array(contents[0])
    data = np.array(contents[1:])
    print(data.shape)
    print(features.shape)
    from create_new_features import find_featuers_index
    # select all WeblogInfo* columns for inspection
    features_name = "WeblogInfo"
    fea_indexs = find_featuers_index(features_name, features)
    print(fea_indexs)
    weblog_data = data[:, fea_indexs]
    weblog_features = features[fea_indexs]
#A helper method for pretty-printing linear models
def pretty_print_linear(coefs, names=None, sort=False):
    """Format linear-model coefficients as "c0 * name0 + c1 * name1 + ...".

    coefs : sequence of coefficients (each rounded to 3 decimals for display).
    names : optional feature names; defaults to X0, X1, ...
    sort  : if True, order terms by descending absolute coefficient.
    """
    # Use `is None`, not `== None`: comparing a numpy array with == None is
    # element-wise and raises "truth value is ambiguous" in a bool context
    # (this function is called with lr.coef_ / a features array).
    if names is None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst, key=lambda x: -np.abs(x[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name)
                      for coef, name in lst)


# NOTE(review): orphan fragment — references `lr`, which is defined in a
# function whose body lies outside this chunk; re-attach before enabling.
# print("Linear model:", pretty_print_linear(lr.coef_, features[1:]))


if __name__ == '__main__':
    contents = load_result("after_delete_strong_correlation_features_data.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])
    label_lines = np.array(load_result("train_label_original.csv"))
    data = convert_to_numerical(data, features)
    label = convert_to_float(label_lines)
    label = label.reshape((label.size, ))

    #use_RandomForestRegressor_to_delete(data, features, label)
    use_LR_to_delete(data, features, label)

    #################### first example #######################
    # testNum = 10
    # average = 0
    # for i in range(0, testNum):
    #     # load the dataset, split it 80% train / 20% test
# NOTE(review): orphan fragment — tail of the third-party-info solver whose
# `def` line is outside this chunk; re-attach before use.
# data, features = fill_thirdParty_miss(data, features)
# data, features = third_party_stable(data, features)
# data, features = third_party_level(data, features)
# save_result(data, "data_after_thirdparty_solved.csv", features, dir_name=saved_dir)
# return data, features


if __name__ == '__main__':
    contents = load_result("data_after_solved_weblog.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])

    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    solve_thirdparty_info_package(data, features)

    # calculate_number = ["17"]
    # users_sta_name, users_stability = calculate_stability(data, features, calculate_number[0])
    # print(users_sta_name)
    # for i in range(10):
    #     print(users_stability[i])
    # from create_new_features import find_featuers_index
# NOTE(review): orphan fragment — tail of a feature-importance report whose
# enclosing function is outside this chunk; re-attach before use.
# indices = np.argsort(importances)[::-1]
# # Print the feature ranking
# print("Feature ranking:")
# for f in range(data.shape[1]):
#     print("%s. %d (%f)" % (features[indices[f]], indices[f], importances[indices[f]]))

# if a value in one features is bigger than 20000, besides
# the positive in it is almost equal to the positive in the train data

if __name__ == '__main__':
    #################### used to calculate the correlation between properties #########
    contents = load_result("data_after_delete_no_discrimination_features.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])

    from map_features_to_digit import convert_to_numerical
    from solve_data import delete_features
    data = convert_to_numerical(data, features)
    data, features, deleted = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])

    correlation_between_properties(data, features)
    delete_result = according_properties_correlation_delete()
    save_result(delete_result, "deleted_features_with_strong_correlation.csv")
# labels = labels.reshape((labels.size, ))
# print(labels.shape)

# key_words = ["Log_", "Update_"]
# LU_features, LU_data = extract_features(key_words)
# LU_data = convert_to_numerical(LU_data, LU_features)
# print("***** UI analysis data *******")
# print(LU_features)
# print(LU_data.shape)
# save_result(LU_data, "UI_analysis_data.csv", LU_features, dir_name = "resultData_All/")

# NOTE(review): the statements below look like a script body (presumably they
# belong under an `if __name__ == '__main__':` guard that was lost with the
# original formatting) — confirm before importing this module elsewhere.
label_lines = np.array(load_result("all_train_label_original.csv", dir_name="resultData_All"))
#print(label_lines.shape)
from save_load_result import convert_to_int
labels = convert_to_int(label_lines)
labels = labels.reshape((labels.size, ))
print(labels.shape)

contents = load_result("UI_analysis_data.csv", dir_name="resultData_All")
LU_features = np.array(contents[0])
LU_data = np.array(contents[1:])
print("LU_data: ", LU_data.shape)

#LU_analysis_xgboost(LU_data, labels)
# LU_analysis_LR(LU_data, labels)
LU_analysis_SVC(LU_data, labels)
# NOTE(review): orphan fragment — tail of a solve_user_info_package-style
# function whose `def` line is outside this chunk; re-attach before use.
# data, features = new_UserInfo_24_resident_level(data, features)
# # save_result(data, "data_after_solved_UserInfo24.csv", features, dir_name = saved_dir)
# #data, features = new_UserInfo_22_23_combine1(data, features)
# # save_result(data, "data_after_solved1_UserInfo22_23.csv", features, dir_name = saved_dir)
# data, features = new_UserInfo_22_23_combine2(data, features)
# save_result(data, "data_after_solved_user_info.csv", features, dir_name=saved_dir)
# return data, features


if __name__ == '__main__':
    contents = load_result("data_after_delete_too_many_missing_features.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])

    # Re-apply every feature deletion recorded during training.
    deleted_features_in_train = load_all_deleted_features_during_train(
        deleted_features_file_label="deleted_")
    data, features, deleted = delete_features(
        data, features, delete_feas_list=deleted_features_in_train)

    data, features = solve_user_info_package(data, features)
    from create_features_from_weblog import solve_weblog_info_package
    data, features = solve_weblog_info_package(data, features)
# NOTE(review): orphan fragment — tail of a feature-importance report whose
# enclosing function (and the `indices` it uses) is outside this chunk.
# # Print the feature ranking
# print("Feature ranking:")
# for f in range(data.shape[1]):
#     print("%s. %d (%f)" % (features[indices[f]], indices[f], importances[indices[f]]))

# if a value in one features is bigger than 20000, besides
# the positive in it is almost equal to the positive in the train data

if __name__ == '__main__':
    #################### used to calculate the correlation between properties #########
    contents = load_result("data_after_delete_no_discrimination_features.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])

    from map_features_to_digit import convert_to_numerical
    from solve_data import delete_features
    data = convert_to_numerical(data, features)
    data, features, deleted = delete_features(data, features,
                                              delete_feas_list=["Idx", "ListingInfo"])

    correlation_between_properties(data, features)
    delete_result = according_properties_correlation_delete()
    save_result(delete_result, "deleted_features_with_strong_correlation.csv")
# NOTE(review): orphan fragment — tail of an adaptive-binning routine whose
# `def` line is outside this chunk.  BUG while here: `iteral` is never
# incremented, so `while iteral < 100` never advances and the loop can only
# exit via the convergence `break`; add `iteral += 1` when re-attaching.
# print(bins)
# iteral = 0
# while iteral < 100:
#     sta_result = stats.binned_statistic(sorted_to_bin_values, sorted_to_bin_values,
#                                         bin_label, bins)
#     new_bins = compare_and_combine(sta_result)
#     if bins == new_bins:
#         break
#     bins = new_bins
# return sta_result


if __name__ == '__main__':
    contents = load_result("after_Str_features_digited_data.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])
    label_lines = np.array(load_result("train_label_original.csv"))
    print(label_lines.shape)
    label = convert_to_float(label_lines)

    from map_features_to_digit import convert_to_numerical
    data = convert_to_numerical(data, features)

    #index = np.where(features == "ThirdParty_Info_Period4_1")[0][0]
    index = np.where(features == "WeblogInfo_12")[0][0]
    fea_info = feature_value_class(data, index, label)
    #print(fea_info)
# NOTE(review): orphan fragment — tail of solve_user_info_package whose `def`
# line is outside this chunk; re-attach before use.
# data, features = new_UserInfo_23_education_level(data, features)
# # save_result(data, "data_after_solved_UserInfo23.csv", features, dir_name = saved_dir)
# data, features = new_UserInfo_24_resident_level(data, features)
# # save_result(data, "data_after_solved_UserInfo24.csv", features, dir_name = saved_dir)
# #data, features = new_UserInfo_22_23_combine1(data, features)
# # save_result(data, "data_after_solved1_UserInfo22_23.csv", features, dir_name = saved_dir)
# data, features = new_UserInfo_22_23_combine2(data, features)
# #save_result(data, "data_after_solved_user_info.csv", features, dir_name = saved_dir)
# return data, features


# new_UserInfo_7_num
if __name__ == '__main__':
    contents = load_result("withoutLabel_originalData.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])

    data, features = new_UserInfo_7_num(data, features)
    save_result(data, "test.csv", features)

    # deleted_features_in_train = load_all_deleted_features_during_train(deleted_features_file_label = "deleted_")
    # data, features, deleted = delete_features(data, features, delete_feas_list = deleted_features_in_train)
    # data, features = solve_user_info_package(data, features)
    # from create_features_from_weblog import solve_weblog_info_package
    # data, features = solve_weblog_info_package(data, features)
# NOTE(review): orphan fragment — tail of an adaptive-binning routine whose
# `def` line is outside this chunk.  BUG while here: `iteral` is never
# incremented, so `while iteral < 100` never advances and the loop can only
# exit via the convergence `break`; add `iteral += 1` when re-attaching.
# iteral = 0
# while iteral < 100:
#     sta_result = stats.binned_statistic(sorted_to_bin_values, sorted_to_bin_values,
#                                         bin_label, bins)
#     new_bins = compare_and_combine(sta_result)
#     if bins == new_bins:
#         break
#     bins = new_bins
# return sta_result


if __name__ == '__main__':
    contents = load_result("after_Str_features_digited_data.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])
    label_lines = np.array(load_result("train_label_original.csv"))
    print(label_lines.shape)
    label = convert_to_float(label_lines)

    from map_features_to_digit import convert_to_numerical
    data = convert_to_numerical(data, features)

    #index = np.where(features == "ThirdParty_Info_Period4_1")[0][0]
    index = np.where(features == "WeblogInfo_12")[0][0]
    fea_info = feature_value_class(data, index, label)
    #print(fea_info)
# NOTE(review): orphan fragment — middle of an xgboost train/predict function
# whose `def` line is outside this chunk; re-attach before use.
# xgval = xgb.DMatrix(train[:offset, :], label=labels[:offset])
# watchlist = [(xgtrain, 'train'), (xgval, 'eval')]
# model = xgb.train(plst, xgtrain, num_boost_round=num_rounds, evals=watchlist,
#                   early_stopping_rounds=120)
# preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)
# model.save_model('0001_1.model')
# #combine predictions
# #since the metric only cares about relative rank we don't need to average
# preds = (preds1)*1.4 + (preds2)*8.6
# return preds


if __name__ == '__main__':
    contents = load_result("data_after_features_processed.csv")
    features = np.array(contents[0])
    data = np.array(contents[1:])
    label_lines = np.array(load_result("train_label_original.csv"))
    #print(label_lines.shape)
    from save_load_result import convert_to_int
    label = convert_to_int(label_lines)
    label = label.reshape((label.size, ))
    print(label.shape)

    data, features, deleted = delete_features(data, features,
                                              delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    # Hold out the first 2000 rows for a quick sanity check.
    test_data = data[:2000]
# NOTE(review): orphan fragment — interior of combine_land_modify_infos (its
# `def` line and loop header are outside this chunk); re-attach before use.
# for i in id_index_in_land:
#     land_operate_code.append(log_info_data[i, 2])
#     land_operate_style.append(log_info_data[i, 3])
#     land_date.append(log_info_data[i, 4])
# all_id_info[id_name]["land_info"]["land_operate_code"] = land_operate_code
# all_id_info[id_name]["land_info"]["land_operate_style"] = land_operate_style
# all_id_info[id_name]["land_info"]["land_date"] = land_date
#
# # add the modify info
# all_id_info[id_name]["modify_info"] = OrderedDict()
# modify_info = list()
# modify_date = list()
# id_index_in_modify = np.where(update_info_data[:, 0] == id_name)[0]
# for i in id_index_in_modify:
#     modify_info.append(update_info_data[i, 2])
#     modify_date.append(update_info_data[i, 3])
# all_id_info[id_name]["modify_info"]["modify_things"] = modify_info
# all_id_info[id_name]["modify_info"]["modify_date"] = modify_date
#
# save_result(all_id_info, "all_id_info.pickle", dir_name=saved_dir)


if __name__ == '__main__':
    #combine_land_modify_infos(data, log_info_data, update_info_data)
    combined = load_result("all_id_info.pickle", dir_name=SAVE_DIR)
    print(combined["10001"])
def solve_weblog_info_package(data, features, saved_dir="resultData/"):
    """Apply the WeblogInfo feature-engineering steps and return the result.

    data : 2-D array of samples; features : matching 1-D array of column
    names.  saved_dir is kept for interface compatibility — the save_result
    call that used it is currently commented out.
    Returns the transformed (data, features) pair.
    """
    from map_features_to_digit import convert_to_numerical
    from solve_data import delete_features

    data, features = new_WI_19(data, features)
    data, features = new_WI_20_by_present(data, features)
    #data, features = new_WI_20_by_positive(data, features)
    data, features = new_WI_21(data, features)
    # save_result(data, "data_after_solve_WeblogInfo_21.csv", features, dir_name = saved_dir)
    #save_result(data, "data_after_solved_weblog.csv", features, dir_name = saved_dir)
    return data, features


if __name__ == '__main__':
    # NOTE(review): `saved_dir` here is presumably a module-level constant
    # defined outside this chunk — confirm before running.
    contents = load_result("data_after_solved_UserInfo22_23.csv", dir_name=saved_dir)
    features = np.array(contents[0])
    data = np.array(contents[1:])
    #data = convert_to_numerical(data, features)
    print(data.shape)
    print(features.shape)

    from create_new_features import find_featuers_index
    features_name = "WeblogInfo"
    fea_indexs = find_featuers_index(features_name, features)
    print(fea_indexs)
    weblog_data = data[:, fea_indexs]
    weblog_features = features[fea_indexs]
def view_each_features_label(data, features, label):
    """Plot each feature's values split by label and save one PNG per feature.

    For every feature (after dropping Idx/ListingInfo): scatter positive
    instances (red 'o') against negative ones (green 'x'), annotate per-value
    or under-mean positive-rate statistics, and save the figure under
    view_data_area/after_all/with_label_under_mean/.  As a side effect the
    feature name is appended to a "complex" or "simple" value-list CSV
    depending on how many distinct values it has.  Returns None.
    """
    data, features, deleted = delete_features(data, features,
                                              delete_feas_list=["Idx", "ListingInfo"])
    # Indices of the string-valued (categorical) features.
    str_style_features = np.array(load_result("str_features.csv")[0])
    str_features_index = get_known_features_index(features, str_style_features)
    new_label = label.reshape((label.size,))
    x = range(len(data))
    for fea_pos in range(len(features)):
        feature_name = features[fea_pos]
        if fea_pos in str_features_index:
            file_path = "view_data_area/after_all/with_label_under_mean/" + "(str" + str(fea_pos) + ")" + feature_name + ".png"
        else:
            file_path = "view_data_area/after_all/with_label_under_mean/" + str(fea_pos) + ")" + feature_name + ".png"
        features_info = feature_value_class(data, fea_pos, label, str_features_index)
        # Record features by cardinality: > 30 distinct values = "complex".
        if features_info["num_of_value"] > 30:
            save_result([features[fea_pos]], "complex_value_features.csv", style="a+")
        else:
            if fea_pos not in str_features_index:
                save_result([features[fea_pos]], "simple_discrete_value_features(nonestrfeatures).csv", style="a+")
        y_positive = data[new_label == 1, fea_pos]
        y_negitive = data[new_label == 0, fea_pos]
        positive_index = np.array([index for index in range(len(new_label)) if new_label[index] == 1])
        negitive_index = np.array([index for index in range(len(new_label)) if new_label[index] == 0])
        plt.scatter(positive_index, y_positive, marker='o', color='r', s=10)
        plt.scatter(negitive_index, y_negitive, marker='x', color='g', s=10)
        plt.xlabel("instances(30000)")
        plt.ylabel("value")
        if features_info["num_of_value"] < 40:
            # Few distinct values: annotate each value's share of positives.
            plt.title(feature_name + " value - label " + "distributed " + "in instances" + \
                      "\n the arrow --> Proportion of positive in that value & in positive")
            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    # Right-hand arrow: this value's positives / all positives.
                    arrow_data = round(v._respond_positive_num / features_info["num_positive"], 4)
                    arrow_start_position_x = len(data) + 2000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data,
                                 xy=(arrow_start_position_x, arrow_start_position_y),
                                 xytext=(arrow_end_postion_x, arrow_end_postion_y),
                                 arrowprops=dict(facecolor='blue', shrink=0.02))
                    # Left-hand arrow: positives within this value's rows.
                    arrow_data = round(v._respond_positive_num / v._present_num, 4)
                    arrow_start_position_x = -4000
                    arrow_start_position_y = int(k)
                    arrow_end_postion_x = arrow_start_position_x
                    arrow_end_postion_y = int(k)
                    plt.annotate(arrow_data,
                                 xy=(arrow_start_position_x, arrow_start_position_y),
                                 xytext=(arrow_end_postion_x, arrow_end_postion_y),
                                 arrowprops=dict(facecolor='blue', shrink=0.02))
        else:
            # Many distinct values: summarise with the mean and the share of
            # positives falling at or below it.
            fea_average = round(np.mean(data[:, fea_pos]), 4)
            fea_std = np.std(data[:, fea_pos])
            fea_oo = round(fea_std / fea_average, 4)
            max_v = np.amax(data[:, fea_pos])
            min_v = np.amin(data[:, fea_pos])
            plt.title(feature_name + " | mean & Proportion of positive under that mean" + \
                      "\n degree of fluctuation --> " + str(fea_oo))
            x1 = np.array(range(-5000, 35000))
            y_mean = fea_average * np.ones((x1.size))
            #plt.plot(x1, y_mean, color = 'k', linestyle = "--")
            plt.annotate(fea_average,
                         xy=(-4000, fea_average),
                         xytext=(-4000, fea_average),
                         arrowprops=dict(facecolor='blue', shrink=0.05))
            under_mean_positive = 0
            under_mean_num = 0
            for k, v in features_info.items():
                if isinstance(v, FeatureInData):
                    if k <= fea_average:
                        under_mean_num += v._present_num
                        under_mean_positive += v._respond_positive_num
            ave_posi = round(under_mean_positive / features_info["num_positive"], 4)
            plt.annotate(ave_posi,
                         xy=(31000, fea_average),
                         xytext=(31000, fea_average),
                         arrowprops=dict(facecolor='blue', shrink=0.05))
            # -1 is the sentinel bucket (presumably missing values — confirm
            # against feature_value_class).
            pos_rat = 0
            pos_rat_whole = 0
            if -1 in features_info.keys():
                pos_rat = features_info[-1]._respond_positive_num / features_info[-1]._present_num
                pos_rat_whole = features_info[-1]._respond_positive_num / features_info["num_positive"]
            plt.annotate(round(pos_rat_whole, 4),
                         xy=(31000, -1),
                         xytext=(31000, -1))
            plt.annotate(round(pos_rat, 4),
                         xy=(-4000, -1),
                         xytext=(-4000, -1))
            plt.ylim(min_v - 10, fea_average * 2)
        #plt.ylim(fea_average - round(fea_average / 10), max_v + round(fea_average / 10))
        plt.savefig(file_path)
        plt.close()
# NOTE(review): orphan fragment — interior of combine_land_modify_infos (its
# `def` line and outer loop are outside this chunk); re-attach before use.
# id_index_in_land = np.where(log_info_data[:, 0] == id_name)[0]
# for i in id_index_in_land:
#     land_operate_code.append(log_info_data[i, 2])
#     land_operate_style.append(log_info_data[i, 3])
#     land_date.append(log_info_data[i, 4])
# all_id_info[id_name]["land_info"]["land_operate_code"] = land_operate_code
# all_id_info[id_name]["land_info"]["land_operate_style"] = land_operate_style
# all_id_info[id_name]["land_info"]["land_date"] = land_date
#
# # add the modify info
# all_id_info[id_name]["modify_info"] = OrderedDict()
# modify_info = list()
# modify_date = list()
# id_index_in_modify = np.where(update_info_data[:, 0] == id_name)[0]
# for i in id_index_in_modify:
#     modify_info.append(update_info_data[i, 2])
#     modify_date.append(update_info_data[i, 3])
# all_id_info[id_name]["modify_info"]["modify_things"] = modify_info
# all_id_info[id_name]["modify_info"]["modify_date"] = modify_date
#
# save_result(all_id_info, "all_id_info.pickle", dir_name = saved_dir)


if __name__ == '__main__':
    #combine_land_modify_infos(data, log_info_data, update_info_data)
    combined = load_result("all_id_info.pickle", dir_name=SAVE_DIR)
    print(combined["10001"])