Example 1
def main():
    ll_data_2g = utils.gongcan_to_ll()
    train_data = utils.ll_to_grid(ll_data_2g)

    # print(train_data)
    # Drop the original RNCID/CellID columns; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)
    # Fill the missing signal strengths with 0
    train_data = train_data.fillna(0)

    # Features and labels
    X = train_data.drop(
        ['MRTime', 'Longitude', 'Latitude', 'Num_connected', 'grid_num'],
        axis=1,
        inplace=False).as_matrix()
    y = train_data[['grid_num', 'Longitude', 'Latitude']].as_matrix()

    # Fix the random seed of every split so that each classifier sees the same datasets
    # random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

    # errors_all = []
    # GradientBoosting parameter tuning
    # start = datetime.datetime.now()
    # errors = []
    # overall_pres = []
    # top10_pres = []
    # top10_recalls = []
    # top10_fs = []
    # print(y[:,0])
    print("GradientBoosting")

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_states[i])

    param_test1 = {
        'n_estimators': range(10, 61, 10),
        'learning_rate': np.linspace(0.01, 0.1, 10)
    }
    param_test2 = {'max_depth': range(3, 14, 2)}
    param_test3 = {
        'max_features': range(7, 20, 2),
        'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]
    }
    gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(
        learning_rate=0.1, min_samples_split=300),
                            param_grid=param_test1,
                            scoring='f1_micro',
                            cv=5)
    gsearch1.fit(np.delete(X, 0, axis=1), y[:, 0])
    print("Best param: {}".format(gsearch1.best_params_))
    print("Best score: {}".format(gsearch1.best_score_))

    print("****************************")
Example 2
def main():
    ll_data_2g = utils.gongcan_to_ll()
    train_data = utils.ll_to_grid(ll_data_2g)

    # print(train_data)
    # Drop the original RNCID/CellID columns; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_'+str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_'+str(i)], axis=1, inplace=True)
    # Fill the missing signal strengths with 0
    train_data = train_data.fillna(0)

    # Features and labels
    X = train_data.drop(['MRTime', 'Longitude', 'Latitude',
                         'Num_connected', 'grid_num'], axis=1, inplace=False).as_matrix()
    y = train_data[['grid_num', 'Longitude', 'Latitude']].as_matrix()

    # Fix the random seed of every split so that each classifier sees the same datasets
    # random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

    # errors_all = []
    # AdaBoost parameter tuning
    # start = datetime.datetime.now()
    # errors = []
    # overall_pres = []
    # top10_pres = []
    # top10_recalls = []
    # top10_fs = []
    # print(y[:,0])
    print("Adaboost")

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_states[i])

    # param_test1 = {'n_estimators': range(30, 101, 10),
    #                'learning_rate': np.arange(0.01, 0.1, 10)}

    # param_test2 = {'algorithm' : ['SAMME', 'SAMME.R']}
    param_test3 = {'base_estimator': [GaussianNB(), DecisionTreeClassifier()]}
    gsearch1 = GridSearchCV(estimator=AdaBoostClassifier(n_estimators=30, learning_rate=0.01, algorithm='SAMME.R'),
                            param_grid=param_test3, scoring='f1_micro', cv=5)
    gsearch1.fit(np.delete(X, 0, axis=1), y[:, 0])
    print("Best param: {}".format(gsearch1.best_params_))
    print("Best score: {}".format(gsearch1.best_score_))
    print("Best estimator: {}".format(gsearch1.best_estimator_))

    print("****************************")
Example 3
def main():
    train_data = utils.gongcan_to_ll()
    # Drop the original RNCID/CellID columns; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)

    # Fill the missing signal strengths with 0
    train_data = train_data.fillna(0)
    rel_lon = []
    rel_lat = []
    for index, row in train_data.iterrows():
        rel_lon.append(row['Longitude'] - row['Longitude_1'])
        rel_lat.append(row['Latitude'] - row['Latitude_1'])

    train_data['rel_Longitude'] = np.array(rel_lon)
    train_data['rel_Latitude'] = np.array(rel_lat)

    # Features and labels
    train_data.set_index(['Longitude_1', 'Latitude_1'],
                         inplace=True,
                         drop=False)
    train_data.sort_index(inplace=True)
    ids = list(set(train_data.index.tolist()))

    errors_all = []
    amount = []
    for id in ids:
        MS_datas = train_data.loc[id]
        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1,
            inplace=False).as_matrix()
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].as_matrix()

        # Fix the random seed of every split so that each classifier sees the same datasets
        random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

        # Random forest
        print("MS {}".format(id))
        errors = []
        for i in range(10):

            # Split into training and validation sets
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])

            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5],
                                                 axis=1)).predict(X_test)

            error = utils.pos_error(y_test, y_pred)
            errors.append(error)

            # overall_pre, top10_pre, top10_recall = utils.precision_recall(y_test[:, 0], y_pred)
            # errors.append(utils.pos_error(y_test, y_pred))

        # Plot the data points of each base station's dataset
        plt.title("Median error: %.3f" %
                  np.percentile(np.array(errors).mean(axis=0), 50) +
                  " Data amount: {}".format(X.shape[0]))
        ax = plt.gca()
        ax.get_xaxis().get_major_formatter().set_useOffset(False)
        plt.scatter(y[:, 2], y[:, 3])
        plt.xlim([lb_Longitude, rt_Longitude])
        plt.ylim([lb_Latitude, rt_Latitude])
        plt.show()

        # print("Different data amount: {}".format(len(set(X[:,0]))))
        print("Data amount: {}".format(X.shape[0]))
        print("Median error: {}".format(
            np.percentile(np.array(errors).mean(axis=0), 50)))
        errors_all.append([id, errors])
        amount.append(
            [X.shape[0],
             np.percentile(np.array(errors).mean(axis=0), 50)])
        # amount.append([len(set(X[:, 0])), np.percentile(np.array(errors).mean(axis=0), 50)])

        print("****************************")
    utils.cdf_figure(errors_all)
    utils.mean_figure(errors_all)
    # utils.cdf_figure_overall(errors_all)

    # Print each base station's median error together with its total number of samples
    amount = np.array(amount)
    amount = amount[amount[:, 0].argsort()]
    for a in amount:
        print(a)

    return errors_all
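
# utils.pos_error is not shown. A plausible sketch of what it is assumed to
# compute, given the label layout above (y columns: rel_Longitude, rel_Latitude,
# Longitude, Latitude, Longitude_1, Latitude_1) and the fact that the regressor
# predicts only the relative offsets: add the predicted offset back to the
# serving base station position and measure the haversine distance (in meters)
# to the true position.
import math
import numpy as np


def pos_error_sketch(y_test, y_pred):
    def haversine(lon1, lat1, lon2, lat2):
        # Great-circle distance in meters between two (lon, lat) points.
        lon1, lat1, lon2, lat2 = map(math.radians, [lon1, lat1, lon2, lat2])
        a = (math.sin((lat2 - lat1) / 2) ** 2 +
             math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
        return 2 * 6371000 * math.asin(math.sqrt(a))

    errors = []
    for truth, pred in zip(y_test, y_pred):
        pred_lon = truth[4] + pred[0]   # Longitude_1 + predicted rel_Longitude
        pred_lat = truth[5] + pred[1]   # Latitude_1 + predicted rel_Latitude
        errors.append(haversine(truth[2], truth[3], pred_lon, pred_lat))
    errors.sort()
    return np.array(errors)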
Example 4
def compare():
    """
    Compare the results of part (a) with the results of part (c).
    :return:
    """
    ll_data_2g = utils.gongcan_to_ll()
    train_data = utils.ll_to_grid(ll_data_2g)

    # print(train_data)
    # Drop the original RNCID/CellID columns; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)
    # Fill the missing signal strengths with 0
    train_data = train_data.fillna(0)

    # Features and labels
    X_ = train_data.drop(
        ['MRTime', 'Longitude', 'Latitude', 'Num_connected', 'grid_num'],
        axis=1,
        inplace=False).as_matrix()
    y_ = train_data[['grid_num', 'Longitude', 'Latitude']].as_matrix()
    # Fix the random seed of every split so that each classifier sees the same datasets
    random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

    start = datetime.datetime.now()
    errors_all = []

    for i in range(10):

        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X_, y_, test_size=0.2, random_state=random_states[i])

        clf = RandomForestClassifier(max_depth=20, random_state=0)
        y_pred = clf.fit(np.delete(X_train, 0, axis=1),
                         y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))

        ll_pred = []
        for y in y_pred:
            X_box = int(y % X_box_num)
            y_box = int(y / X_box_num) + 1
            if X_box == 0:
                X_box = X_box_num
                y_box -= 1
            lon = lb_Longitude + per_lon * X_box - 0.5 * per_lon
            lat = lb_Latitude + per_lat * y_box - 0.5 * per_lat

            ll_pred.append([lon, lat])
        ll_true = np.delete(y_test, 0, axis=1).tolist()
        errors = []
        for (true, pred) in zip(ll_true, ll_pred):
            error = utils.haversine(true[0], true[1], pred[0], pred[1])
            errors.append(error)
        errors.sort()
        errors_all.append(errors)

    print("RandomForest")
    print("Median error: {}".format(
        np.percentile(np.array(errors_all).mean(axis=0), 50)))
    print("Time: {}".format(datetime.datetime.now() - start))
    print("****************************")

    # Get the results of part (c)
    start = datetime.datetime.now()
    c_errors = main()
    print("Time: {}".format(datetime.datetime.now() - start))

    plt.figure('Comparison 2G DATA')
    plt.xlabel('Comparison 2G DATA - CDF figure')
    plt.ylabel('Error(meters)')

    # Plot the overall CDF curve of the part (c) results
    mean_errors = []
    for i in range(len(c_errors)):
        errors = np.array(c_errors[i][1])
        mean_error = errors.mean(axis=0)
        mean_errors.extend(mean_error)
    mean_errors.sort()
    plt.plot(
        [float(i) / float(len(mean_errors)) for i in range(len(mean_errors))],
        list(mean_errors),
        '--',
        linewidth=1,
        alpha=0.6,
        label="c-method median error(m): %.3f" %
        np.percentile(mean_errors, 50))

    # Plot the overall CDF curve of the part (a) results
    errors = np.array(errors_all)
    mean_errors = errors.mean(axis=0)
    # print(mean_errors)
    plt.plot(
        [float(i) / float(len(mean_errors)) for i in range(len(mean_errors))],
        list(mean_errors),
        '--',
        linewidth=1,
        alpha=0.6,
        label="a-method median error: %.3f" % np.percentile(mean_errors, 50))
    plt.legend()
    plt.show()
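
# The grid_num -> (lon, lat) decoding above relies on module-level constants
# (lb_Longitude, lb_Latitude, per_lon, per_lat, X_box_num) that are not shown.
# A small sketch of the same decoding as a standalone helper, with the grid
# parameters passed in explicitly; the numbering is assumed to be 1-based and
# row-major with X_box_num cells per row, exactly as in the loop above.
def grid_to_ll(grid_num, lb_lon, lb_lat, per_lon, per_lat, x_box_num):
    x_box = int(grid_num % x_box_num)
    y_box = int(grid_num / x_box_num) + 1
    if x_box == 0:                 # last cell of a row wraps back onto that row
        x_box = x_box_num
        y_box -= 1
    lon = lb_lon + per_lon * x_box - 0.5 * per_lon   # cell-center longitude
    lat = lb_lat + per_lat * y_box - 0.5 * per_lat   # cell-center latitude
    return lon, lat


# Worked example with made-up grid parameters: a 13-column grid whose cells are
# 0.01 degrees wide. Grid cell 14 is the first cell of the second row, so its
# center sits half a cell in from the lower-left corner of that cell:
# grid_to_ll(14, 121.0, 31.0, 0.01, 0.01, 13) -> approximately (121.005, 31.015)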
Example 5
def main():
    train_data = utils.gongcan_to_ll()
    # Drop the original RNCID/CellID columns; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)

    # Fill the missing signal strengths with 0
    train_data = train_data.fillna(0)
    rel_lon = []
    rel_lat = []
    # print(train_data)
    for index, row in train_data.iterrows():
        rel_lon.append(row['Longitude'] - row['Longitude_1'])
        rel_lat.append(row['Latitude'] - row['Latitude_1'])

    train_data['rel_Longitude'] = np.array(rel_lon)
    train_data['rel_Latitude'] = np.array(rel_lat)

    # Features and labels
    train_data.set_index(['Longitude_1', 'Latitude_1'],
                         inplace=True,
                         drop=False)
    train_data.sort_index(inplace=True)
    ids = list(set(train_data.index.tolist()))

    # Cluster the base stations by distance using KMeans
    y_pred = KMeans(n_init=1, random_state=0).fit_predict(ids)
    # print(y_pred)

    # Plot the clustering result
    plt.title("Kmeans Result")
    x = [id[0] for id in ids]
    y = [id[1] for id in ids]
    plt.scatter(x, y, c=y_pred)
    ax = plt.gca()
    ax.get_xaxis().get_major_formatter().set_useOffset(False)
    # plt.xlim([lb_Longitude, rt_Longitude])
    # plt.ylim([lb_Latitude, rt_Latitude])
    plt.show()

    ids = [(id, cluster) for (id, cluster) in zip(ids, y_pred)]
    # print(ids)
    errors_all = []
    median_errors = []
    for id in ids:
        MS_datas = train_data.loc[id[0]]
        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1,
            inplace=False).as_matrix()
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].as_matrix()

        # Fix the random seed of every split so that each classifier sees the same datasets
        random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

        # Random forest
        print("MS {}".format(id))
        errors = []
        for i in range(10):
            # Split into training and validation sets
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])

            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5],
                                                 axis=1)).predict(X_test)
            error = utils.pos_error(y_test, y_pred)
            errors.append(error)

            # overall_pre, top10_pre, top10_recall = utils.precision_recall(y_test[:, 0], y_pred)
            # errors.append(utils.pos_error(y_test, y_pred))
        median_error = np.percentile(np.array(errors).mean(axis=0), 50)
        print("Median error: {}".format(median_error))
        median_errors.append([id[0], median_error, id[1]])
        errors_all.append([id, errors])
        print("****************************")
    median_errors = DataFrame(median_errors,
                              columns=['id', 'median_error', 'cluster'])
    median_errors.set_index(['median_error'], inplace=True, drop=False)
    median_errors.sort_index(inplace=True)

    MS_number = median_errors.shape[0]
    topk_worst = median_errors.iloc[int(MS_number * 0.8):][['id', 'cluster'
                                                            ]].as_matrix()

    old_errors = []  # stores every error of the bottom-k (worst) stations before correction
    for error in errors_all:
        if error[0][0] in topk_worst[:, 0].tolist():
            old_errors.append([error[0], error[1]])

    print("\n")
    print("Start correction")
    print("\n")

    new_errors = []  # stores every error of the bottom-k (worst) stations after correction
    for worst in topk_worst:
        similars = median_errors[median_errors['cluster'] ==
                                 worst[1]].as_matrix().tolist()

        MS_datas = worst_data = train_data.loc[worst[0]]
        X_worst = worst_data.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1,
            inplace=False).as_matrix()
        y_worst = worst_data[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].as_matrix()
        for similar in similars:
            MS_datas = pd.concat([MS_datas, train_data.loc[similar[0]]])

        # Random sampling
        # MS_datas = MS_datas.sample(frac=0.8)

        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1,
            inplace=False).as_matrix()
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].as_matrix()

        # X = []
        # y = []
        #
        # # Filter: drop samples that are too far from the original dataset
        # for i, j in zip(X_, y_):
        #     error = utils.haversine(j[4], j[5], worst[0][0], worst[0][1])
        #     if error > 500:
        #         continue
        #     X.append(i)
        #     y.append(j)
        # X = np.array(X)
        # y = np.array(y)

        # Fix the random seed of every split so that each classifier and each run uses the same datasets, which makes the results more credible
        random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

        # Random forest
        print("MS {}".format(worst))
        errors = []
        for i in range(10):
            # Split into training and validation sets
            X_train, _, y_train, _ = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])
            _, X_test, _, y_test = train_test_split(
                X_worst, y_worst, test_size=0.2, random_state=random_states[i])
            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5],
                                                 axis=1)).predict(X_test)
            error = utils.pos_error(y_test, y_pred)
            errors.append(error)

        # Plot all training points after adding the new data, together with the original data points of this MS base station
        plt.title("Median error: %.3f" %
                  np.percentile(np.array(errors).mean(axis=0), 50))
        ax = plt.gca()
        ax.get_xaxis().get_major_formatter().set_useOffset(False)
        plt.scatter(y[:, 2], y[:, 3], label='new data')
        plt.scatter(y_worst[:, 2], y_worst[:, 3], label='old data')

        plt.xlim([lb_Longitude, rt_Longitude])
        plt.ylim([lb_Latitude, rt_Latitude])
        plt.legend()
        plt.show()

        median_error = np.percentile(np.array(errors).mean(axis=0), 50)
        print("Median error: {}".format(median_error))
        new_errors.append([worst, errors])
        print("****************************")

    utils.cdf_figure(old_errors, new_errors)
    utils.mean_figure(old_errors, new_errors)
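
# KMeans above is called without n_clusters, so scikit-learn's default of 8
# clusters is used for the base stations. An optional sketch (not in the
# original code) of choosing the cluster count explicitly with the silhouette
# score, assuming ids is the list of (Longitude_1, Latitude_1) pairs built
# above before clustering:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def choose_n_clusters(ids, candidates=range(2, 13)):
    """Return the candidate cluster count with the best silhouette score."""
    coords = np.array(ids)                 # (lon, lat) of each base station
    best_k, best_score = None, -1.0
    for k in candidates:
        labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(coords)
        score = silhouette_score(coords, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k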
Example 6
def main():
    ll_data_2g = utils.gongcan_to_ll()
    train_data = utils.ll_to_grid(ll_data_2g)

    # print(train_data)
    # Drop the original RNCID/CellID columns; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)
    # Fill the missing signal strengths with 0
    train_data = train_data.fillna(0)

    # Features and labels
    X = train_data.drop(
        ['MRTime', 'Longitude', 'Latitude', 'Num_connected', 'grid_num'],
        axis=1,
        inplace=False).as_matrix()
    y = train_data[['grid_num', 'Longitude', 'Latitude']].as_matrix()
    # Fix the random seed of every split so that each classifier sees the same datasets
    random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

    errors_all = []
    top10_pres_all = []
    top10_recalls_all = []
    top10_fs_all = []
    overall_pres_all = []

    # Gaussian naive Bayes classifier
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i in range(10):
        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_states[i])

        gnb = GaussianNB()
        y_pred = gnb.fit(np.delete(X_train, 0, axis=1),
                         y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    print("Gaussian")
    print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
    print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
    print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
    print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    errors_all.append(errors)
    top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
    top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
    overall_pres_all.append(np.mean(np.array(overall_pres)))
    top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
    print("****************************")

    # K-nearest-neighbors classifier
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i in range(10):
        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_states[i])

        neigh = KNeighborsClassifier()
        y_pred = neigh.fit(np.delete(X_train, 0, axis=1),
                           y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    print("KNeighbors")
    print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
    print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
    print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
    print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    errors_all.append(errors)
    top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
    top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
    overall_pres_all.append(np.mean(np.array(overall_pres)))
    top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
    print("****************************")

    # Decision tree classifier
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i in range(10):
        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_states[i])

        clf = DecisionTreeClassifier()
        y_pred = clf.fit(np.delete(X_train, 0, axis=1),
                         y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    print("DecisionTree")
    print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
    print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
    print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
    print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    errors_all.append(errors)
    top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
    top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
    overall_pres_all.append(np.mean(np.array(overall_pres)))
    top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
    print("****************************")

    # Random forest
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i in range(10):
        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_states[i])

        clf = RandomForestClassifier(max_depth=20, random_state=0)
        y_pred = clf.fit(np.delete(X_train, 0, axis=1),
                         y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    print("RandomForest")
    print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
    print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
    print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
    print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    errors_all.append(errors)
    top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
    top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
    overall_pres_all.append(np.mean(np.array(overall_pres)))
    top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
    print("****************************")

    # AdaBoost
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i in range(10):
        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_states[i])

        clf = AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(max_depth=20),
            learning_rate=0.01,
            n_estimators=30,
            algorithm='SAMME.R')
        y_pred = clf.fit(np.delete(X_train, 0, axis=1),
                         y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    print("AdaBoost")
    print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
    print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
    print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
    print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    errors_all.append(errors)
    top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
    top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
    overall_pres_all.append(np.mean(np.array(overall_pres)))
    top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
    print("****************************")

    # Bagging
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i in range(10):
        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_states[i])

        clf = BaggingClassifier(n_estimators=20)
        y_pred = clf.fit(np.delete(X_train, 0, axis=1),
                         y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    print("Bagging")
    print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
    print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
    print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
    print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    errors_all.append(errors)
    top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
    top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
    overall_pres_all.append(np.mean(np.array(overall_pres)))
    top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
    print("****************************")

    # GradientBoosting
    start = datetime.datetime.now()
    errors = []
    overall_pres = []
    top10_pres = []
    top10_recalls = []
    top10_fs = []
    for i in range(10):
        print(i)
        # Split into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_states[i])

        clf = GradientBoostingClassifier(n_estimators=60, learning_rate=0.01)
        y_pred = clf.fit(np.delete(X_train, 0, axis=1),
                         y_train[:, 0]).predict(np.delete(X_test, 0, axis=1))
        overall_pre, top10_pre, top10_recall, top10_f = utils.precision_recall(
            y_test[:, 0], y_pred)
        overall_pres.append(overall_pre)
        top10_pres.append(top10_pre)
        top10_recalls.append(top10_recall)
        top10_fs.append(top10_f)
        errors.append(utils.pos_error(y_test, y_pred))

    print("GradientBoosting")
    print("Overall precision: %.3f" % np.mean(np.array(overall_pres)))
    print("Top10 precision: %.3f" % np.array(top10_pres).mean(axis=0).mean())
    print("Top10 recall: %.3f" % np.array(top10_recalls).mean(axis=0).mean())
    print("Top10 f-measurement: %.3f" % np.array(top10_fs).mean(axis=0).mean())
    print("Median error: {}".format(
        np.percentile(np.array(errors).mean(axis=0), 50)))
    print("Time spend: {}".format(datetime.datetime.now() - start))
    errors_all.append(errors)
    top10_recalls_all.append(np.array(top10_recalls).mean(axis=0).mean())
    top10_pres_all.append(np.array(top10_pres).mean(axis=0).mean())
    overall_pres_all.append(np.mean(np.array(overall_pres)))
    top10_fs_all.append(np.array(top10_fs).mean(axis=0).mean())
    print("****************************")

    utils.cdf_figure(errors_all)
    utils.figure(overall_pres_all, top10_pres_all, top10_recalls_all,
                 top10_fs_all)
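
# utils.precision_recall is not shown. A plausible sketch of what it is assumed
# to return, based on how its outputs are used above: an overall precision
# (fraction of exactly matched grid cells) plus per-class precision, recall and
# F1 arrays for the ten most frequent grid cells of the test fold.
from collections import Counter
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def precision_recall_sketch(y_true, y_pred):
    # Overall precision: fraction of samples whose grid cell is predicted exactly.
    overall_pre = accuracy_score(y_true, y_pred)
    # Per-class metrics for the 10 most frequent grid cells in this test fold.
    top10_labels = [label for label, _ in Counter(y_true).most_common(10)]
    top10_pre, top10_recall, top10_f, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=top10_labels, average=None)
    return overall_pre, top10_pre, top10_recall, top10_f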
Example 7
def generator():
    ll_data_2g = utils.gongcan_to_ll()
    train_data = utils.ll_to_grid(ll_data_2g)

    # Fill the missing signal strengths with 0
    train_data = train_data.fillna(0)
    train_data.to_csv("X.csv")

    # The following preprocessing is for the CNN
    train_scaled = DataFrame()

    # Normalization
    labels = [
        'RNCID_', 'CellID_', 'AsuLevel_', 'SignalLevel_', 'RSSI_', 'Latitude_',
        'Longitude_'
    ]
    for label in labels:
        tmp = DataFrame()

        for i in range(1, 8):
            tmp = pd.concat([tmp, train_data[label + str(i)]], axis=1)

        tmp_index = tmp.columns.tolist()

        tmp = tmp.as_matrix()
        # tmp_scaled = scale(tmp)
        min_max_scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
        tmp_scaled = min_max_scaler.fit_transform(tmp)
        tmp_scaled = DataFrame(tmp_scaled, columns=tmp_index)
        train_scaled = pd.concat([train_scaled, tmp_scaled], axis=1)

    # train_scaled.to_csv('X_scaled.csv')

    train_scaled = pd.concat([
        train_scaled,
        train_data[['IMSI', 'MRTime', 'Longitude', 'Latitude', 'grid_num']]
    ],
                             axis=1)

    X_ = []
    y_ = []
    for index, row in train_scaled.iterrows():
        y_.append(row['grid_num'])
        x_ = []
        for i in range(1, 8):
            tmp = []
            for label in labels:
                tmp.append(row[label + str(i)])
            x_.append(tmp)

        X_.append(x_)
    # X is the assembled set of 7x7 arrays, used as the features
    # y_ is the assembled labels
    X = np.array(X_)
    y_ = np.array(y_)

    # One-hot encode the assembled labels
    y = np.zeros(shape=(y_.shape[0], 13 * 17))
    # print(y.shape)
    for i in range(y_.shape[0]):
        y[i][int(y_[i])] = 1

    return X, y
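
# A minimal usage sketch for generator(), assuming a Keras-style CNN that
# expects a trailing channel axis; the reshape is illustrative, not part of the
# original code.
X, y = generator()
print(X.shape, y.shape)      # expected: (n_samples, 7, 7) and (n_samples, 221)

# Add an explicit single channel axis for 'channels_last' convolution layers.
X_cnn = X.reshape(X.shape[0], 7, 7, 1).astype('float32')

# Note: the one-hot loop above is equivalent to indexing an identity matrix,
# e.g. y = np.eye(13 * 17)[y_.astype(int)].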
Example 8
def main():
    train_data = utils.gongcan_to_ll()
    # Drop the original RNCID/CellID columns; they are not used as training features
    for i in range(1, 8):
        train_data.drop(['RNCID_' + str(i)], axis=1, inplace=True)
        train_data.drop(['CellID_' + str(i)], axis=1, inplace=True)

    # Fill the missing signal strengths with 0
    train_data = train_data.fillna(0)
    rel_lon = []
    rel_lat = []
    # print(train_data)
    for index, row in train_data.iterrows():
        rel_lon.append(row['Longitude'] - row['Longitude_1'])
        rel_lat.append(row['Latitude'] - row['Latitude_1'])

    train_data['rel_Longitude'] = np.array(rel_lon)
    train_data['rel_Latitude'] = np.array(rel_lat)

    # Features and labels
    train_data.set_index(['Longitude_1', 'Latitude_1'],
                         inplace=True,
                         drop=False)
    train_data.sort_index(inplace=True)
    ids = list(set(train_data.index.tolist()))
    # print(ids)

    # Fix the random seed of every split so that each classifier and each run uses the same datasets, which makes the results more credible
    random_states = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

    errors_all = []
    median_errors = []
    for id in ids:
        MS_datas = train_data.loc[id]
        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1,
            inplace=False).as_matrix()
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].as_matrix()

        # Random forest
        print("MS {}".format(id))
        errors = []
        for i in range(10):

            # Split into training and validation sets
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])

            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5],
                                                 axis=1)).predict(X_test)
            error = utils.pos_error(y_test, y_pred)
            errors.append(error)

            # overall_pre, top10_pre, top10_recall = utils.precision_recall(y_test[:, 0], y_pred)
            # errors.append(utils.pos_error(y_test, y_pred))
        median_error = np.percentile(np.array(errors).mean(axis=0), 50)
        print("Median error: {}".format(median_error))
        median_errors.append([id, median_error])
        errors_all.append([id, errors])
        print("****************************")
    median_errors = DataFrame(median_errors, columns=['id', 'median_error'])
    median_errors.set_index(['median_error'], inplace=True, drop=False)
    median_errors.sort_index(inplace=True)
    # print(median_errors)

    MS_number = median_errors.shape[0]
    topk_best = median_errors.iloc[:int(MS_number *
                                        0.2)]['id'].as_matrix().tolist()
    topk_worst = median_errors.iloc[int(MS_number *
                                        0.8):]['id'].as_matrix().tolist()

    old_errors = []  # stores every error of the bottom-k (worst) stations before correction
    for error in errors_all:
        if error[0] in topk_worst:
            old_errors.append([error[0], error[1]])

    # Collect the data of the top-k (best) stations
    best_data = DataFrame()
    for best in topk_best:
        best_data = pd.concat([best_data, train_data.loc[best]], axis=0)

    # print(best_data)
    # best_data = best_data.sample(frac=0.7)
    # print(best_data)
    print("\n")
    print("Start correction")
    print("\n")
    new_errors = []  # stores every error of the bottom-k (worst) stations after correction
    for worst in topk_worst:
        MS_datas = pd.concat([train_data.loc[worst], best_data])
        # MS_datas = best_data
        X = MS_datas.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1,
            inplace=False).as_matrix()
        y = MS_datas[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].as_matrix()

        worst_data = train_data.loc[worst]
        X_worst = worst_data.drop(
            ['IMSI', 'MRTime', 'Longitude', 'Latitude', 'Num_connected'],
            axis=1,
            inplace=False).as_matrix()
        y_worst = worst_data[[
            'rel_Longitude', 'rel_Latitude', 'Longitude', 'Latitude',
            'Longitude_1', 'Latitude_1'
        ]].as_matrix()

        # Random forest
        print("MS {}".format(worst))
        errors = []
        for i in range(10):
            # Split into training and validation sets
            X_train, _, y_train, _ = train_test_split(
                X, y, test_size=0.2, random_state=random_states[i])
            _, X_test, _, y_test = train_test_split(
                X_worst, y_worst, test_size=0.2, random_state=random_states[i])
            regr = RandomForestRegressor(max_depth=20, random_state=0)
            y_pred = regr.fit(X_train, np.delete(y_train, [2, 3, 4, 5],
                                                 axis=1)).predict(X_test)
            error = utils.pos_error(y_test, y_pred)
            errors.append(error)

        # Plot all training points after adding the new data, together with the original data points of this MS base station
        plt.title("Median error: %.3f" %
                  np.percentile(np.array(errors).mean(axis=0), 50))
        ax = plt.gca()
        ax.get_xaxis().get_major_formatter().set_useOffset(False)
        plt.scatter(y[:, 2], y[:, 3], label='new data')
        plt.scatter(y_worst[:, 2], y_worst[:, 3], label='old data')

        plt.xlim([lb_Longitude, rt_Longitude])
        plt.ylim([lb_Latitude, rt_Latitude])
        plt.legend()
        plt.show()

        new_errors.append([worst, errors])
        median_error = np.percentile(np.array(errors).mean(axis=0), 50)
        print("Median error: {}".format(median_error))
        # median_errors.append([worst, median_error])
        # errors_all.append([id, errors])
        print("****************************")

    utils.cdf_figure(old_errors, new_errors)
    utils.mean_figure(old_errors, new_errors)
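
# utils.cdf_figure / utils.mean_figure are not shown. A plausible sketch of the
# two-argument CDF comparison they are assumed to draw here, given how
# old_errors and new_errors are built above (each entry is
# [station_id, list_of_10_sorted_error_arrays]); it follows the same plotting
# convention as compare() in Example 4 (cumulative fraction on x, error on y).
import numpy as np
import matplotlib.pyplot as plt


def cdf_figure_sketch(old_errors, new_errors):
    plt.figure('Correction CDF')
    plt.xlabel('Cumulative fraction of samples')
    plt.ylabel('Error (meters)')
    for name, errors_all in (('before correction', old_errors),
                             ('after correction', new_errors)):
        # Average the 10 runs of every station, then pool all stations.
        pooled = []
        for _, errors in errors_all:
            pooled.extend(np.array(errors).mean(axis=0).tolist())
        pooled.sort()
        fractions = [i / float(len(pooled)) for i in range(len(pooled))]
        plt.plot(fractions, pooled, '--', linewidth=1, alpha=0.6,
                 label='%s, median error: %.3f m' % (name, np.percentile(pooled, 50)))
    plt.legend()
    plt.show()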