def main():
    ll_data_2g = utils.gongcan_to_ll()
    train_data = utils.ll_to_grid(ll_data_2g)
    # print(train_data)

    # Drop the original base-station IDs; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)

    # Fill missing signal strengths with 0
    train_data = train_data.fillna(0)

    # Features and labels
    X = train_data.drop(
        ['MRTime', 'Longitude', 'Latitude', 'Num_connected', 'grid_num'],
        axis=1, inplace=False).values
    y = train_data[['grid_num', 'Longitude', 'Latitude']].values

    # Fixing the random seed for every split keeps the data sets identical across classifiers
    # random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    # errors_all = []

    # GradientBoosting parameter tuning
    # start = datetime.datetime.now()
    # errors = []
    # overall_pres = []
    # top10_pres = []
    # top10_recalls = []
    # top10_fs = []
    # print(y[:, 0])
    print("GradientBoosting")
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_states[i])
    param_test1 = {
        'n_estimators': range(10, 61, 10),
        'learning_rate': np.arange(0.01, 0.1, 0.01)
    }
    param_test2 = {'max_depth': range(3, 14, 2)}
    param_test3 = {
        'max_features': range(7, 20, 2),
        'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]
    }
    gsearch1 = GridSearchCV(
        estimator=GradientBoostingClassifier(learning_rate=0.1, min_samples_split=300),
        param_grid=param_test1,
        scoring='f1_micro',
        cv=5)
    gsearch1.fit(np.delete(X, 0, axis=1), y[:, 0])
    print("Best param: {}".format(gsearch1.best_params_))
    print("Best score: {}".format(gsearch1.best_score_))
    print("****************************")
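# param_test2 and param_test3 above are defined but never searched. The sketch
# below shows how the remaining grids could be tuned sequentially after
# gsearch1, reusing its best n_estimators / learning_rate. The helper name and
# argument layout are illustrative only, not part of the original script.
def tune_remaining_params_sketch(X, y, best_params, param_test2, param_test3):
    gsearch2 = GridSearchCV(
        estimator=GradientBoostingClassifier(
            n_estimators=best_params['n_estimators'],
            learning_rate=best_params['learning_rate'],
            min_samples_split=300),
        param_grid=param_test2,   # tune max_depth next
        scoring='f1_micro', cv=5)
    gsearch2.fit(np.delete(X, 0, axis=1), y[:, 0])

    gsearch3 = GridSearchCV(
        estimator=GradientBoostingClassifier(
            n_estimators=best_params['n_estimators'],
            learning_rate=best_params['learning_rate'],
            max_depth=gsearch2.best_params_['max_depth'],
            min_samples_split=300),
        param_grid=param_test3,   # finally tune max_features / subsample
        scoring='f1_micro', cv=5)
    gsearch3.fit(np.delete(X, 0, axis=1), y[:, 0])
    return gsearch2.best_params_, gsearch3.best_params_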
def main():
    ll_data_2g = utils.gongcan_to_ll()
    train_data = utils.ll_to_grid(ll_data_2g)
    # print(train_data)

    # Drop the original base-station IDs; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)

    # Fill missing signal strengths with 0
    train_data = train_data.fillna(0)

    # Features and labels
    X = train_data.drop(
        ['MRTime', 'Longitude', 'Latitude', 'Num_connected', 'grid_num'],
        axis=1, inplace=False).values
    y = train_data[['grid_num', 'Longitude', 'Latitude']].values

    # Fixing the random seed for every split keeps the data sets identical across classifiers
    # random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    # errors_all = []

    # AdaBoost parameter tuning
    # start = datetime.datetime.now()
    # errors = []
    # overall_pres = []
    # top10_pres = []
    # top10_recalls = []
    # top10_fs = []
    # print(y[:, 0])
    print("Adaboost")
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_states[i])
    # param_test1 = {'n_estimators': range(30, 101, 10),
    #                'learning_rate': np.arange(0.01, 0.1, 0.01)}
    # param_test2 = {'algorithm': ['SAMME', 'SAMME.R']}
    param_test3 = {'base_estimator': [GaussianNB(), DecisionTreeClassifier()]}
    gsearch1 = GridSearchCV(
        estimator=AdaBoostClassifier(n_estimators=30, learning_rate=0.01, algorithm='SAMME.R'),
        param_grid=param_test3,
        scoring='f1_micro',
        cv=5)
    gsearch1.fit(np.delete(X, 0, axis=1), y[:, 0])
    print("Best param: {}".format(gsearch1.best_params_))
    print("Best score: {}".format(gsearch1.best_score_))
    print("Best estimator: {}".format(gsearch1.best_estimator_))
    print("****************************")
def main():
    train_data = utils.gongcan_to_ll()

    # Drop the original base-station IDs; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)

    # Fill missing signal strengths with 0
    train_data = train_data.fillna(0)

    # Relative offsets between the MR position and the primary base station
    rel_lon = []
    rel_lat = []
    for index, row in train_data.iterrows():
        rel_lon.append(row['Longitude'] - row['Longitude_1'])
        rel_lat.append(row['Latitude'] - row['Latitude_1'])
    train_data['rel_Longitude'] = np.array(rel_lon)
    train_data['rel_Latitude'] = np.array(rel_lat)

    # Features and labels, grouped by primary base station
    train_data.set_index(['Longitude_1', 'Latitude_1'], inplace=True, drop=False)
    train_data.sort_index(inplace=True)
    ids = list(set(train_data.index.tolist()))

    errors_all = []
    amount = []
    for id in ids:
        MS_datas = train_data.loc[id]
        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1, inplace=False).values
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].values

        # Fixing the random seed for every split keeps the data sets identical across runs
        random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

        # Random forest regression
        print("MS {}".format(id))
        errors = []
        for i in range(10):
            # Split into training and validation sets
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])

            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5], axis=1)).predict(X_test)
            error = utils.pos_error(y_test, y_pred)
            errors.append(error)
            # overall_pre, top10_pre, top10_recall = utils.precision_recall(y_test[:, 0], y_pred)
            # errors.append(utils.pos_error(y_test, y_pred))

        # Plot the data points belonging to this base station
        plt.title("Median error: %.3f" % np.percentile(np.array(errors).mean(axis=0), 50) +
                  " Data amount: {}".format(X.shape[0]))
        ax = plt.gca()
        ax.get_xaxis().get_major_formatter().set_useOffset(False)
        plt.scatter(y[:, 2], y[:, 3])
        plt.xlim([lb_Longitude, rt_Longitude])
        plt.ylim([lb_Latitude, rt_Latitude])
        plt.show()

        # print("Different data amount: {}".format(len(set(X[:, 0]))))
        print("Data amount: {}".format(X.shape[0]))
        print("Median error: {}".format(np.percentile(np.array(errors).mean(axis=0), 50)))
        errors_all.append([id, errors])
        amount.append([X.shape[0], np.percentile(np.array(errors).mean(axis=0), 50)])
        # amount.append([len(set(X[:, 0])), np.percentile(np.array(errors).mean(axis=0), 50)])
        print("****************************")

    utils.cdf_figure(errors_all)
    utils.mean_figure(errors_all)
    # utils.cdf_figure_overall(errors_all)

    # Print the median error and sample count for each base station
    amount = np.array(amount)
    amount = amount[amount[:, 0].argsort()]
    for a in amount:
        print(a)

    return errors_all
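# The scripts above rely on utils.pos_error, but the utils module is not shown
# here. Below is a minimal sketch of what such a helper could look like,
# assuming y_test carries [rel_lon, rel_lat, lon, lat, lon_1, lat_1] and y_pred
# carries the predicted [rel_lon, rel_lat]; the function names are hypothetical
# and only the haversine formula itself is standard.
from math import radians, sin, cos, asin, sqrt


def _haversine_m(lon1, lat1, lon2, lat2):
    """Great-circle distance in meters between two lon/lat points."""
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon, dlat = lon2 - lon1, lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    return 2 * asin(sqrt(a)) * 6371000  # mean Earth radius in meters


def pos_error_sketch(y_test, y_pred):
    """Hypothetical stand-in for utils.pos_error: rebuild absolute positions
    from the predicted offsets and return the sorted list of errors."""
    errors = []
    for truth, pred in zip(y_test, y_pred):
        pred_lon = truth[4] + pred[0]   # base-station lon + predicted offset
        pred_lat = truth[5] + pred[1]   # base-station lat + predicted offset
        errors.append(_haversine_m(truth[2], truth[3], pred_lon, pred_lat))
    errors.sort()
    return errors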
def compare(): """ 将a问与c问结果比较 :return: """ ll_data_2g = utils.gongcan_to_ll() train_data = utils.ll_to_grid(ll_data_2g) # print(train_data) # 删除原有的ID,不作为训练特征 for i in range(1, 8): train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True) train_data.drop(['CellID_' + str(i)], axis=1, inplace=True) # 将空余的信号强度,用0补填补 train_data = train_data.fillna(0) # features和labels X_ = train_data.drop( ['MRTime', 'Longitude', 'Latitude', 'Num_connected', 'grid_num'], axis=1, inplace=False).as_matrix() y_ = train_data[['grid_num', 'Longitude', 'Latitude']].as_matrix() # 通过设置每一次的随机数种子,保证不同分类器每一次的数据集是一样的 random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20] start = datetime.datetime.now() errors_all = [] for i in range(10): # 切分训练集和验证集 X_train, X_test, y_train, y_test = train_test_split( X_, y_, test_size=0.2, random_state=random_states[i]) clf = RandomForestClassifier(max_depth=20, random_state=0) y_pred = clf.fit(np.delete(X_train, 0, axis=1), y_train[:, 0]).predict(np.delete(X_test, 0, axis=1)) ll_pred = [] for y in y_pred: X_box = int(y % X_box_num) y_box = int(y / X_box_num) + 1 if X_box == 0: X_box = X_box_num y_box -= 1 lon = lb_Longitude + per_lon * X_box - 0.5 * per_lon lat = lb_Latitude + per_lat * y_box - 0.5 * per_lat ll_pred.append([lon, lat]) ll_true = np.delete(y_test, 0, axis=1).tolist() errors = [] for (true, pred) in zip(ll_true, ll_pred): error = utils.haversine(true[0], true[1], pred[0], pred[1]) errors.append(error) errors.sort() errors_all.append(errors) print("RandomForest") print("Median error: {}".format( np.percentile(np.array(errors_all).mean(axis=0), 50))) print("Time: {}".format(datetime.datetime.now() - start)) print("****************************") # 获得 c 问结果 start = datetime.datetime.now() c_errors = main() print("Time: {}".format(datetime.datetime.now() - start)) plt.figure('Comparision 2G DATA') plt.xlabel('Comparision 2G DATA - CDF figure') plt.ylabel('Error(meters)') # 绘制 c 问的结果的总体CDF曲线 mean_errors = [] for i in range(len(c_errors)): errors = np.array(c_errors[i][1]) mean_error = errors.mean(axis=0) mean_errors.extend(mean_error) mean_errors.sort() plt.plot( [float(i) / float(len(mean_errors)) for i in range(len(mean_errors))], list(mean_errors), '--', linewidth=1, alpha=0.6, label="c-method median error(m): %.3f" % np.percentile(mean_errors, 50)) # 绘制 a 问的结果的总体CDF曲线 errors = np.array(errors_all) mean_errors = errors.mean(axis=0) # print(mean_errors) plt.plot( [float(i) / float(len(mean_errors)) for i in range(len(mean_errors))], list(mean_errors), '--', linewidth=1, alpha=0.6, label="a-method median error: %.3f" % np.percentile(mean_errors, 50)) plt.legend() plt.show()
def main():
    train_data = utils.gongcan_to_ll()

    # Drop the original base-station IDs; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)

    # Fill missing signal strengths with 0
    train_data = train_data.fillna(0)

    # Relative offsets between the MR position and the primary base station
    rel_lon = []
    rel_lat = []
    # print(train_data)
    for index, row in train_data.iterrows():
        rel_lon.append(row['Longitude'] - row['Longitude_1'])
        rel_lat.append(row['Latitude'] - row['Latitude_1'])
    train_data['rel_Longitude'] = np.array(rel_lon)
    train_data['rel_Latitude'] = np.array(rel_lat)

    # Features and labels, grouped by primary base station
    train_data.set_index(['Longitude_1', 'Latitude_1'], inplace=True, drop=False)
    train_data.sort_index(inplace=True)
    ids = list(set(train_data.index.tolist()))

    # Cluster the base stations by distance with KMeans
    y_pred = KMeans(n_init=1, random_state=0).fit_predict(ids)
    # print(y_pred)

    # Plot the clustering result
    plt.title("Kmeans Result")
    x = [id[0] for id in ids]
    y = [id[1] for id in ids]
    plt.scatter(x, y, c=y_pred)
    ax = plt.gca()
    ax.get_xaxis().get_major_formatter().set_useOffset(False)
    # plt.xlim([lb_Longitude, rt_Longitude])
    # plt.ylim([lb_Latitude, rt_Latitude])
    plt.show()

    ids = [(id, cluster) for (id, cluster) in zip(ids, y_pred)]
    # print(ids)

    errors_all = []
    median_errors = []
    for id in ids:
        MS_datas = train_data.loc[id[0]]
        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1, inplace=False).values
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].values

        # Fixing the random seed for every split keeps the data sets identical across runs
        random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

        # Random forest regression
        print("MS {}".format(id))
        errors = []
        for i in range(10):
            # Split into training and validation sets
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])

            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5], axis=1)).predict(X_test)
            error = utils.pos_error(y_test, y_pred)
            errors.append(error)
            # overall_pre, top10_pre, top10_recall = utils.precision_recall(y_test[:, 0], y_pred)
            # errors.append(utils.pos_error(y_test, y_pred))

        median_error = np.percentile(np.array(errors).mean(axis=0), 50)
        print("Median error: {}".format(median_error))
        median_errors.append([id[0], median_error, id[1]])
        errors_all.append([id, errors])
        print("****************************")

    median_errors = DataFrame(median_errors, columns=['id', 'median_error', 'cluster'])
    median_errors.set_index(['median_error'], inplace=True, drop=False)
    median_errors.sort_index(inplace=True)

    MS_number = median_errors.shape[0]
    topk_worst = median_errors.iloc[int(MS_number * 0.8):][['id', 'cluster']].values

    old_errors = []  # stores all errors of the top k- (worst) base stations before correction
    for error in errors_all:
        if error[0][0] in topk_worst[:, 0].tolist():
            old_errors.append([error[0], error[1]])

    print("\n")
    print("Start correction")
    print("\n")

    new_errors = []  # stores all errors of the top k- (worst) base stations after correction
    for worst in topk_worst:
        similars = median_errors[median_errors['cluster'] == worst[1]].values.tolist()

        MS_datas = worst_data = train_data.loc[worst[0]]
        X_worst = worst_data.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1, inplace=False).values
        y_worst = worst_data[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].values

        # Extend the training data with records from base stations in the same cluster
        for similar in similars:
            MS_datas = pd.concat([MS_datas, train_data.loc[similar[0]]])
        # Random subsampling
        # MS_datas = MS_datas.sample(frac=0.8)

        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1, inplace=False).values
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].values
        # X = []
        # y = []
        #
        # # Filter out records that are too far from the original data set
        # for i, j in zip(X_, y_):
        #     error = utils.haversine(j[4], j[5], worst[0][0], worst[0][1])
        #     if error > 500:
        #         continue
        #     X.append(i)
        #     y.append(j)
        # X = np.array(X)
        # y = np.array(y)

        # Fixing the random seed keeps the splits identical across classifiers and runs,
        # which makes the results more trustworthy
        random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

        # Random forest regression
        print("MS {}".format(worst))
        errors = []
        for i in range(10):
            # Split into training and validation sets
            X_train, _, y_train, _ = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])
            _, X_test, _, y_test = train_test_split(
                X_worst, y_worst, test_size=0.2, random_state=random_states[i])

            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5], axis=1)).predict(X_test)
            error = utils.pos_error(y_test, y_pred)
            errors.append(error)

        # Plot the extended training data together with the original data of this MS base station
        plt.title("Median error: %.3f" % np.percentile(np.array(errors).mean(axis=0), 50))
        ax = plt.gca()
        ax.get_xaxis().get_major_formatter().set_useOffset(False)
        plt.scatter(y[:, 2], y[:, 3], label='new data')
        plt.scatter(y_worst[:, 2], y_worst[:, 3], label='old data')
        plt.xlim([lb_Longitude, rt_Longitude])
        plt.ylim([lb_Latitude, rt_Latitude])
        plt.legend()
        plt.show()

        median_error = np.percentile(np.array(errors).mean(axis=0), 50)
        print("Median error: {}".format(median_error))
        new_errors.append([worst, errors])
        print("****************************")

    utils.cdf_figure(old_errors, new_errors)
    utils.mean_figure(old_errors, new_errors)
def main():
    ll_data_2g = utils.gongcan_to_ll()
    train_data = utils.ll_to_grid(ll_data_2g)
    # print(train_data)

    # Drop the original base-station IDs; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)

    # Fill missing signal strengths with 0
    train_data = train_data.fillna(0)

    # Features and labels
    X = train_data.drop(
        ['MRTime', 'Longitude', 'Latitude', 'Num_connected', 'grid_num'],
        axis=1, inplace=False).values
    y = train_data[['grid_num', 'Longitude', 'Latitude']].values

    # Fixing the random seed for every split keeps the data sets identical across classifiers
    random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

    errors_all = []
    top10_pres_all = []
    top10_recalls_all = []
    top10_fs_all = []
    overall_pres_all = []

    # Every classifier goes through the same evaluation loop: 10 random splits,
    # overall/top-10 precision and recall, the top-10 F-measure, and the positioning error
    classifiers = [
        ("Gaussian", lambda: GaussianNB()),
        ("KNeighbors", lambda: KNeighborsClassifier()),
        ("DecisionTree", lambda: DecisionTreeClassifier()),
        ("RandomForest", lambda: RandomForestClassifier(max_depth=20, random_state=0)),
        ("AdaBoost", lambda: AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(max_depth=20),
            learning_rate=0.01, n_estimators=30, algorithm='SAMME.R')),
        ("Bagging", lambda: BaggingClassifier(n_estimators=20)),
        ("GradientBoosting", lambda: GradientBoostingClassifier(n_estimators=60, learning_rate=0.01)),
    ]

    for name, make_clf in classifiers:
        start = datetime.datetime.now()
        errors = []
        overall_pres = []
        top10_pres = []
        top10_recalls = []
        top10_fs = []
        for i in range(10):
            # Split into training and validation sets
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])

            clf = make_clf()
            y_pred = clf.fit(np.delete(X_train, 0, axis=1),
                             y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
            overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
                y_test[:, 0], y_pred)
            overall_pres.append(overall_pre)
            top10_pres.append(top10_pre)
            top10_recalls.append(top10_recall)
            top10_fs.append(top10_f)
            errors.append(utils.pos_error(y_test, y_pred))

        print(name)
        print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
        print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
        print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
        print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
        print("Median error: {}".format(np.percentile(np.array(errors).mean(axis=0), 50)))
        print("Time spend: {}".format(datetime.datetime.now() - start))

        errors_all.append(errors)
        top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
        top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
        overall_pres_all.append(np.mean(np.array(overall_pres)))
        top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
        print("****************************")

    utils.cdf_figure(errors_all)
    utils.figure(overall_pres_all, top10_pres_all, top10_recalls_all, top10_fs_all)
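# utils.cdf_figure is not shown. Below is a minimal plotting sketch, assuming
# the argument is a list with one entry per classifier, each entry being the
# 10 x n_test error arrays collected above; the function name and labels are
# illustrative only.
import numpy as np
import matplotlib.pyplot as plt


def cdf_figure_sketch(errors_all, labels=None):
    """Plot one empirical CDF curve of the mean positioning error per classifier."""
    plt.figure("CDF figure")
    plt.xlabel("Error (meters)")
    plt.ylabel("CDF")
    for k, errors in enumerate(errors_all):
        mean_errors = np.sort(np.array(errors).mean(axis=0))
        cdf = np.arange(1, len(mean_errors) + 1) / float(len(mean_errors))
        label = labels[k] if labels else "classifier %d" % k
        plt.plot(mean_errors, cdf, linewidth=1,
                 label="%s (median %.1f m)" % (label, np.percentile(mean_errors, 50)))
    plt.legend()
    plt.show()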
def generator():
    ll_data_2g = utils.gongcan_to_ll()
    train_data = utils.ll_to_grid(ll_data_2g)

    # Fill missing signal strengths with 0
    train_data = train_data.fillna(0)
    train_data.to_csv("X.csv")

    # Preprocessing for the CNN
    train_scaled = DataFrame()

    # Normalize each group of columns to [0, 1]
    labels = [
        'RNCID_', 'CellID_', 'AsuLevel_', 'SignalLevel_', 'RSSI_',
        'Latitude_', 'Longitude_'
    ]
    for label in labels:
        tmp = DataFrame()
        for i in range(1, 8):
            tmp = pd.concat([tmp, train_data[label + str(i)]], axis=1)
        tmp_index = tmp.columns.tolist()
        tmp = tmp.values
        # tmp_scaled = scale(tmp)
        min_max_scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
        tmp_scaled = min_max_scaler.fit_transform(tmp)
        tmp_scaled = DataFrame(tmp_scaled, columns=tmp_index)
        train_scaled = pd.concat([train_scaled, tmp_scaled], axis=1)
    # train_scaled.to_csv('X_scaled.csv')

    train_scaled = pd.concat([
        train_scaled,
        train_data[['IMSI', 'MRTime', 'Longitude', 'Latitude', 'grid_num']]
    ], axis=1)

    X_ = []
    y_ = []
    for index, row in train_scaled.iterrows():
        y_.append(row['grid_num'])
        x_ = []
        for i in range(1, 8):
            tmp = []
            for label in labels:
                tmp.append(row[label + str(i)])
            x_.append(tmp)
        X_.append(x_)

    # X holds the generated 7x7 arrays used as features
    # y_ holds the generated labels
    X = np.array(X_)
    y_ = np.array(y_)

    # One-hot encode the generated labels
    y = np.zeros(shape=(y_.shape[0], 13 * 17))
    # print(y.shape)
    for i in range(y_.shape[0]):
        y[i][int(y_[i])] = 1

    return X, y
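# The CNN that consumes generator() is not shown here. Below is a minimal
# sketch of how the (N, 7, 7) features and one-hot grid labels could be fed
# to a small convolutional network; the architecture is illustrative only and
# assumes a standard Keras installation, not the original model.
from tensorflow import keras


def build_cnn_sketch(num_classes=13 * 17):
    model = keras.Sequential([
        keras.layers.Input(shape=(7, 7, 1)),          # one channel per MR "image"
        keras.layers.Conv2D(32, 3, padding='same', activation='relu'),
        keras.layers.Conv2D(64, 3, padding='same', activation='relu'),
        keras.layers.Flatten(),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


# Usage sketch:
# X, y = generator()
# X = X.reshape(-1, 7, 7, 1)                   # add the channel dimension
# model = build_cnn_sketch()
# model.fit(X, y, epochs=10, validation_split=0.2)
# grid_pred = model.predict(X).argmax(axis=1)  # decode one-hot back to grid_num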
def main():
    train_data = utils.gongcan_to_ll()

    # Drop the original base-station IDs; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)

    # Fill missing signal strengths with 0
    train_data = train_data.fillna(0)

    # Relative offsets between the MR position and the primary base station
    rel_lon = []
    rel_lat = []
    # print(train_data)
    for index, row in train_data.iterrows():
        rel_lon.append(row['Longitude'] - row['Longitude_1'])
        rel_lat.append(row['Latitude'] - row['Latitude_1'])
    train_data['rel_Longitude'] = np.array(rel_lon)
    train_data['rel_Latitude'] = np.array(rel_lat)

    # Features and labels, grouped by primary base station
    train_data.set_index(['Longitude_1', 'Latitude_1'], inplace=True, drop=False)
    train_data.sort_index(inplace=True)
    ids = list(set(train_data.index.tolist()))
    # print(ids)

    # Fixing the random seed keeps the splits identical across classifiers and runs,
    # which makes the results more trustworthy
    random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

    errors_all = []
    median_errors = []
    for id in ids:
        MS_datas = train_data.loc[id]
        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1, inplace=False).values
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].values

        # Random forest regression
        print("MS {}".format(id))
        errors = []
        for i in range(10):
            # Split into training and validation sets
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])

            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5], axis=1)).predict(X_test)
            error = utils.pos_error(y_test, y_pred)
            errors.append(error)
            # overall_pre, top10_pre, top10_recall = utils.precision_recall(y_test[:, 0], y_pred)
            # errors.append(utils.pos_error(y_test, y_pred))

        median_error = np.percentile(np.array(errors).mean(axis=0), 50)
        print("Median error: {}".format(median_error))
        median_errors.append([id, median_error])
        errors_all.append([id, errors])
        print("****************************")

    median_errors = DataFrame(median_errors, columns=['id', 'median_error'])
    median_errors.set_index(['median_error'], inplace=True, drop=False)
    median_errors.sort_index(inplace=True)
    # print(median_errors)

    MS_number = median_errors.shape[0]
    topk_best = median_errors.iloc[:int(MS_number * 0.2)]['id'].values.tolist()
    topk_worst = median_errors.iloc[int(MS_number * 0.8):]['id'].values.tolist()

    old_errors = []  # stores all errors of the top k- (worst) base stations before correction
    for error in errors_all:
        if error[0] in topk_worst:
            old_errors.append([error[0], error[1]])

    # Collect the data of the top k+ base stations
    best_data = DataFrame()
    for best in topk_best:
        best_data = pd.concat([best_data, train_data.loc[best]], axis=0)
    # print(best_data)
    # best_data = best_data.sample(frac=0.7)
    # print(best_data)

    print("\n")
    print("Start correction")
    print("\n")

    new_errors = []  # stores all errors of the top k- (worst) base stations after correction
    for worst in topk_worst:
        MS_datas = pd.concat([train_data.loc[worst], best_data])
        # MS_datas = best_data
        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1, inplace=False).values
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].values

        worst_data = train_data.loc[worst]
        X_worst = worst_data.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1, inplace=False).values
        y_worst = worst_data[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].values

        # Random forest regression
        print("MS {}".format(worst))
        errors = []
        for i in range(10):
            # Split into training and validation sets
            X_train, _, y_train, _ = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])
            _, X_test, _, y_test = train_test_split(
                X_worst, y_worst, test_size=0.2, random_state=random_states[i])

            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5], axis=1)).predict(X_test)
            error = utils.pos_error(y_test, y_pred)
            errors.append(error)

        # Plot the extended training data together with the original data of this MS base station
        plt.title("Median error: %.3f" % np.percentile(np.array(errors).mean(axis=0), 50))
        ax = plt.gca()
        ax.get_xaxis().get_major_formatter().set_useOffset(False)
        plt.scatter(y[:, 2], y[:, 3], label='new data')
        plt.scatter(y_worst[:, 2], y_worst[:, 3], label='old data')
        plt.xlim([lb_Longitude, rt_Longitude])
        plt.ylim([lb_Latitude, rt_Latitude])
        plt.legend()
        plt.show()

        new_errors.append([worst, errors])
        median_error = np.percentile(np.array(errors).mean(axis=0), 50)
        print("Median error: {}".format(median_error))
        # median_errors.append([worst, median_error])
        # errors_all.append([id, errors])
        print("****************************")

    utils.cdf_figure(old_errors, new_errors)
    utils.mean_figure(old_errors, new_errors)