def extract_log_update_package(data, features, for_train=True):
    """Drop identifier columns, digitize the data, then append the
    log-derived and update-derived feature groups.

    Parameters are passed straight through to the project helpers;
    returns the augmented ``(data, features)`` pair.

    NOTE(review): this function is defined twice in this file with an
    identical body; at import time the later definition shadows this one.
    """
    data, features, _dropped = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    data, features = add_features_from_log(data, features, for_train)
    data, features = add_features_from_update(data, features, for_train)
    return data, features
def extract_log_update_package(data, features, for_train=True):
    """Strip the Idx/ListingInfo identifier columns, convert the matrix
    to numerical form, and run both feature-builder stages (log, then
    update) over it.

    Returns the augmented ``(data, features)`` pair.
    """
    data, features, _ = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    # Apply the two feature builders in their fixed order.
    for build_stage in (add_features_from_log, add_features_from_update):
        data, features = build_stage(data, features, for_train)
    return data, features
def solve_thirdparty_info_package(data, features, saved_dir="resultData/"):
    """Run the full ThirdParty-feature processing pipeline and persist
    the result to ``data_after_thirdparty_solved.csv`` under *saved_dir*.

    Returns the processed ``(data, features)`` pair.

    NOTE(review): this function is defined twice in this file with an
    identical body; at import time the later definition shadows this one.
    """
    data, features, _dropped = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    data, features = sta_start_missing_period(data, features)
    data, features = remove_thirdparty6(data, features)
    data, features = fill_thirdParty_miss(data, features)
    data, features = third_party_stable(data, features)
    data, features = third_party_level(data, features)
    save_result(data, "data_after_thirdparty_solved.csv", features,
                dir_name=saved_dir)
    return data, features
def solve_thirdparty_info_package(data, features, saved_dir="resultData/"):
    """Drop identifier columns, digitize, apply every ThirdParty
    processing stage in order, and save the result CSV into *saved_dir*.

    Returns the processed ``(data, features)`` pair.
    """
    data, features, _ = delete_features(
        data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    # The ThirdParty stages all share the (data, features) signature,
    # so run them as a fixed pipeline.
    pipeline = (
        sta_start_missing_period,
        remove_thirdparty6,
        fill_thirdParty_miss,
        third_party_stable,
        third_party_level,
    )
    for stage in pipeline:
        data, features = stage(data, features)
    save_result(data, "data_after_thirdparty_solved.csv", features,
                dir_name=saved_dir)
    return data, features
def correlation_between_properties(data, features):
    """Compute pairwise correlations between all feature columns.

    Pairs where both columns are numeric-style use Pearson's r; pairs
    involving at least one string-style feature (listed in
    ``str_features.csv``) use Spearman's rho.  Pairs with
    ``|cor| >= 0.2`` are appended to ``pearsonr_spearmanr_results.csv``,
    and pairs with ``|cor| >= 0.9`` additionally to
    ``pearsonr_spearmanr_Strong_correlation.csv``.

    Returns None; results are written via ``save_result``.
    """
    fixed_str_features = np.array(load_result("str_features.csv"))[0]
    indexs = get_known_features_index(features, fixed_str_features)
    data = convert_to_numerical(data, features)
    # Header row for both result files (fresh write, not append).
    title = ["features1", "features2", "calculate_method", "cor", "pval"]
    save_result(title, "pearsonr_spearmanr_results.csv")
    save_result(title, "pearsonr_spearmanr_Strong_correlation.csv")
    for fea_pos in range(len(features)):
        for fea_pos_add in range(fea_pos + 1, len(features)):
            info_result = [features[fea_pos], features[fea_pos_add]]
            a1 = data[:, fea_pos]
            a2 = data[:, fea_pos_add]
            if fea_pos not in indexs and fea_pos_add not in indexs:
                # Both columns are numeric-style features.
                info_result.append("pearsonr")
                try:
                    cor, pval = stats.pearsonr(a1, a2)
                except Exception:
                    # BUG FIX: the original bare `except:` fell through
                    # without assigning cor/pval, so the code below used
                    # an unbound name (first pair) or stale values from
                    # the previous pair.  Log and skip this pair instead.
                    print("error!!!!:", features[fea_pos])
                    print("error!!!!:", features[fea_pos_add])
                    continue
            else:
                # one of them or all of them are str style features
                info_result.append("spearmanr")
                cor, pval = stats.spearmanr(a1, a2)
            cor = round(cor, 3)
            info_result.append(cor)
            info_result.append(pval)
            if abs(cor) >= 0.2:
                save_result(info_result, "pearsonr_spearmanr_results.csv",
                            style="a+")
                # |cor| >= 0.9 implies >= 0.2, so nesting preserves the
                # original behavior for the strong-correlation file.
                if abs(cor) >= 0.9:
                    save_result(info_result,
                                "pearsonr_spearmanr_Strong_correlation.csv",
                                style="a+")
# NOTE(review): the next five statements are an orphaned fragment — the
# tail of solve_thirdparty_info_package (its `def` line is not part of
# this chunk) — kept verbatim rather than re-wrapped.
data, features = fill_thirdParty_miss(data, features)
data, features = third_party_stable(data, features)
data, features = third_party_level(data, features)
save_result(data, "data_after_thirdparty_solved.csv", features, dir_name = saved_dir)
return data, features

# Script entry point: load the weblog-processed data, strip identifier
# columns, digitize, and run the ThirdParty pipeline over it.
if __name__ == '__main__':
    contents = load_result("data_after_solved_weblog.csv")
    features = np.array(contents[0])   # first row holds the feature names
    data = np.array(contents[1:])      # remaining rows are the samples
    data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    solve_thirdparty_info_package(data, features)
    # Commented-out stability/feature-slicing experiments, kept as found:
    # calculate_number = ["17"]
    # users_sta_name, users_stability = calculate_stability(data, features, calculate_number[0])
    # print(users_sta_name)
    # for i in range(10):
    #     print(users_stability[i])
    # from create_new_features import find_featuers_index
    # features_name = "ThirdPart"
    # fea_indexs = find_featuers_index(features_name, features)
    # print(fea_indexs)
    # data = data[:, fea_indexs]
    # features = features[fea_indexs]
# NOTE(review): orphaned fragment — the tail of a bin-merging /
# discretization routine whose `def` line falls outside this chunk.
# `iteral`, `sorted_to_bin_values`, `bin_label`, and `bins` are bound in
# the missing part; kept verbatim.
while iteral < 100:
    # Re-bin with the current candidate edges, then try to merge bins.
    sta_result = stats.binned_statistic(sorted_to_bin_values, sorted_to_bin_values, \
        bin_label, bins)
    new_bins = compare_and_combine(sta_result)
    # Stop once merging no longer changes the bin edges (fixed point).
    if bins == new_bins:
        break
    bins = new_bins
return sta_result

# Script entry point: discretize one feature column ("WeblogInfo_12")
# against the training labels and print the resulting binning.
if __name__ == '__main__':
    contents = load_result("after_Str_features_digited_data.csv")
    features = np.array(contents[0])   # first row holds the feature names
    data = np.array(contents[1:])      # remaining rows are the samples
    label_lines = np.array(load_result("train_label_original.csv"))
    print(label_lines.shape)
    label = convert_to_float(label_lines)
    from map_features_to_digit import convert_to_numerical
    data = convert_to_numerical(data, features)
    #index = np.where(features == "ThirdParty_Info_Period4_1")[0][0]
    index = np.where(features == "WeblogInfo_12")[0][0]
    fea_info = feature_value_class(data, index, label)
    #print(fea_info)
    result = discretization_feature(fea_info)
    print(result)
# Script entry point: load the fully-processed feature matrix and labels
# from resultData_All, strip identifier columns, and digitize; the
# stratified-split / xgboost evaluation loop below is commented out.
if __name__ == '__main__':
    contents = load_result("all_data_after_features_processed.csv", dir_name = "resultData_All")
    features = np.array(contents[0])   # first row holds the feature names
    data = np.array(contents[1:])      # remaining rows are the samples
    print("data: ", data.shape)
    label_lines = np.array(load_result("all_train_label_original.csv", dir_name = "resultData_All"))
    print(label_lines.shape)
    from save_load_result import convert_to_int
    label = convert_to_int(label_lines)
    label = label.reshape((label.size, ))   # flatten to a 1-D label vector
    print("label: ", label.shape)
    data, features, deleted = delete_features(data, features, delete_feas_list=["Idx", "ListingInfo"])
    data = convert_to_numerical(data, features)
    # Commented-out stratified-shuffle-split xgboost experiment, kept as found:
    # s_d = StratifiedShuffleSplit(label, n_iter = 2, test_size = 0.3, train_size = 0.7)
    # i = 0
    # for train_index, test_index in s_d:
    #     train_data, test_data = data[train_index], data[test_index]
    #     train_label, test_label = label[train_index], label[test_index]
    #     test_preds = module_xgboost_pre(train_data, train_label, test_data)
    #     # print(test_preds)
    #     # print(type(test_preds))
    #     fig_name = "module_xgb_ROC_7" + "_" + str(i) + ".png"
    #     print(fig_name)
    #     i += 1