# ---- Example #1 (0) ----
def grid_search(clf):
    """Run a parameter search per mall and log the best score/params.

    :param clf: a search estimator (e.g. GridSearchCV-like) exposing
        ``fit``, ``best_score_`` and ``best_params_``.
    """
    data = read_train_join_mall()
    data = data.sort_values(by='time_stamp')
    labels = preprocessing.LabelEncoder().fit_transform(data['shop_id'])

    vectorizers = [
        LocationToVec2(),
        WifiToVec(),
        WifiStrongToVec(),
        WifiKStrongToVec(),
        PriceToVec(),
    ]
    for mall_id in ['m_7374']:  # data['mall_id'].unique():
        X_train, y_train = DataVector.data_to_vec(mall_id, vectorizers,
                                                  data, labels)
        clf.fit(X_train, y_train)

        print('{} score: {}'.format(mall_id, clf.best_score_))
        for name, val in clf.best_params_.items():
            print("{}  {}".format(name, val))
        print('----------')
        # Append the same report to a persistent log file.
        with open('./console_output/grid_search_res.txt', 'a') as f:
            f.write('{} score: {}\n'.format(mall_id, clf.best_score_))
            for name, val in clf.best_params_.items():
                f.write("{}  {}\n".format(name, val))
            f.write('------\n\n\n')
            f.flush()
def recovery_probability_from_pkl():
    """Rebuild predicted_category_pro.csv from the saved OOF pickles.

    Writes one probability row per train sample followed by one row per
    test sample; columns are the encoded class indices.
    """
    train_df = read_train_join_mall()
    train_df = train_df.sort_values(by='time_stamp')
    raw_labels = train_df['category_id'].values
    test_df = read_test_data()

    encoder = preprocessing.LabelEncoder().fit(raw_labels)
    # print(encoder.classes_)
    encoder.transform(raw_labels)  # kept for parity with original flow

    n_rows, n_classes = train_df.shape[0], len(encoder.classes_)
    print(n_rows, n_classes)

    oof_train = joblib.load(
        './feature_save/predicted_category_pro.csv_oof_train2.pkl')
    oof_test = joblib.load(
        './feature_save/predicted_category_pro.csv_oof_test2.pkl')
    with open('./feature_save/predicted_category_pro.csv', 'w') as f:
        f.write('row_id,{}\n'.format(','.join(str(i) for i in range(n_classes))))
        for i, row_id in enumerate(train_df['row_id']):
            f.write('{},{}\n'.format(
                row_id, ','.join(str(x) for x in oof_train[i])))
        for i, row_id in enumerate(test_df['row_id']):
            f.write('{},{}\n'.format(
                row_id, ','.join(str(x) for x in oof_test[i])))
def shop_mall_visualization(mall_id='m_4572'):
    """Plot every shop and all user positions for the given mall.

    User check-ins are drawn as small triangles, shops as large circles
    (one color per shop), and the shops' computed center as a big square.

    :param mall_id: id of the mall to visualize.
    """
    train_data = read_train_join_mall()
    train_data = train_data[train_data['mall_id'] == mall_id]

    x = train_data['latitude']
    y = train_data['longitude']

    # Fixed: the comprehension variable was previously named `mall_id`,
    # shadowing the function parameter inside the expression.
    id2color = {
        shop_id: i
        for i, shop_id in enumerate(train_data['shop_id'].unique())
    }
    colors = [id2color[i] for i in train_data['shop_id']]
    plt.scatter(x, y, s=100, c=colors, alpha=0.5, marker='^')

    train_data = read_mall_data()
    train_data = train_data[train_data['mall_id'] == mall_id]
    x = train_data['latitude']
    y = train_data['longitude']

    colors = [id2color[i] for i in train_data['shop_id']]
    plt.scatter(x, y, s=600, c=colors, alpha=0.5)

    center = center_latitudes_and_longitudes(list(zip(x, y)))
    plt.scatter(center[0], center[1], s=1000, marker='s')

    show_plt()
def wifi_apperance_days(mall_id='m_1621'):
    """Report bssids seen on many distinct days of the month.

    For every bssid observed on more than 7 days, prints its 31-element
    vector of per-day distinct-shop counts, then prints how many such
    bssids there are and the total number of bssids.
    """
    import pandas as pd
    import numpy as np
    full_data = read_train_join_mall()
    mall_data = full_data.loc[full_data['mall_id'] == mall_id]
    mall_data = mall_data.assign(
        time_stamp=pd.to_datetime(mall_data['time_stamp']))
    mall_data['time_stamp'] = mall_data['time_stamp'].dt.day

    # day index (0-30) -> bssid -> set of shops that saw it that day
    per_day = [collections.defaultdict(set) for _ in range(31)]
    seen_bssids = set()
    rows = zip(mall_data['shop_id'], mall_data['time_stamp'],
               mall_data['wifi_infos'])
    for shop_id, day, wifi_infos in rows:
        for record in wifi_infos.split(';'):
            bssid, _, _ = record.split('|')
            seen_bssids.add(bssid)
            per_day[day - 1][bssid].add(shop_id)

    frequent = 0
    for bssid in sorted(seen_bssids):
        day_counts = np.array([len(per_day[d][bssid]) for d in range(31)])
        if np.count_nonzero(day_counts) > 7:
            print(day_counts)
            frequent += 1
    print(frequent, len(seen_bssids))
def check_low():
    """Re-check the malls whose accuracy was lowest in a past run.

    Historical reference scores (wifi_loc vs wifi only):
        m_7168 0.708214760008  0.686966420034
        m_7800 0.721053965037  0.690904484419
        m_1920 0.764782750735  0.7520418164
        m_4422 0.767413834659  0.730537478911
        m_2224 0.790900290416  0.773797999355
        m_4079 0.793646944714  0.777400581959
        m_6803 0.825242718447  0.79854368932
        m_1950 0.924817798236  0.909474491753
        m_5076 0.948070175439  0.938713450292
        m_4495 0.972508591065  0.968499427262
    """
    train_data = read_train_join_mall()
    low_list = {
        'm_7168': 0.708214760008,
        'm_7800': 0.721053965037,
        'm_1920': 0.764782750735,
        'm_4422': 0.767413834659,
        'm_2224': 0.790900290416,
        'm_4079': 0.793646944714,
        'm_6803': 0.825242718447,
        'm_1950': 0.924817798236,
        'm_5076': 0.948070175439,
        'm_4495': 0.972508591065
    }
    # Visit malls from worst score upward; the score itself is only used
    # for ordering (previously bound to an unused loop variable).
    for mall_id, _ in sorted(low_list.items(), key=lambda x: x[1]):
        check_mall(train_data, mall_id)
# ---- Example #6 (0) ----
    def train_and_on_test_data(self, vec_func, target_column='shop_id'):
        """Train per-mall models on the full train set, predict on the
        test set, and write the predictions to csv.

        :param vec_func: list of vector functions used for featurization.
        :param target_column: column of the training frame to predict.
        """
        training = read_train_join_mall()
        labels = training[target_column]
        testing = read_test_data()

        predictions = self._trained_by_mall_and_predict_location(
            vec_func, training, labels, testing)
        self.result_to_csv(predictions, testing)
def wifi_co_occurrence_analysis():
    """Collect co-occurring wifi bssids for every mall into one csv."""
    data = read_train_join_mall()
    pairs = []
    for mall in data['mall_id'].unique():
        pairs.extend(_wifi_co_occurrence(data, mall))
    with open('./feature_save/wifi_co_occurrence.csv', 'w') as f:
        f.write('mall_id,bssid\n')
        f.writelines('{},{}\n'.format(m, b) for m, b in pairs)
# ---- Example #8 (0) ----
def analysis():
    """Compare predicted prices against actual prices for mall m_1790.

    Prints each joined row with its error, then the total absolute
    error, the row count and the mean absolute error.
    """
    predicted = pd.read_csv('./feature_save/predicted_price.csv')
    frame = read_train_join_mall()
    frame = frame.loc[frame['mall_id'] == 'm_1790']
    joined = frame.join(predicted.set_index('row_id'),
                        on='row_id', rsuffix='_train')
    print(joined.shape)
    abs_errors = []
    cols = zip(joined['row_id'], joined['shop_id'],
               joined['price'], joined['p_price'])
    for row_id, shop_id, price, p_price in cols:
        print(row_id, shop_id, price, p_price, p_price - price)
        abs_errors.append(abs(p_price - price))
    print(sum(abs_errors), joined.shape[0],
          sum(abs_errors) / joined.shape[0])
    def train_test(self, vec_func, target_column='category_id', fold=5):
        """Per-mall out-of-fold prediction of *target_column*.

        For each mall, runs *fold*-fold cross-validation, filling an
        out-of-fold probability matrix for the train rows and averaging
        the per-fold predictions for the test rows, then saves both as
        pkl and as row_id-indexed csv.

        :param vec_func: list of vector function
        :param target_column: the target column you want to predict.
        :param fold: the fold of cross-validation.
        :return: None
        """
        # ------input data -----------
        _train_data = read_train_join_mall()
        _train_data = _train_data.sort_values(by='time_stamp')
        _test_data = read_test_data()

        for mall_id in _train_data['mall_id'].unique():
            train_data = _train_data.loc[_train_data['mall_id'] == mall_id]
            train_label = train_data[target_column].values
            test_data = _test_data.loc[_test_data['mall_id'] == mall_id]

            label_encoder = preprocessing.LabelEncoder()
            train_label = label_encoder.fit_transform(train_label)

            # Data is sorted by time_stamp, so the default shuffle=False
            # keeps folds time-contiguous.  random_state was previously
            # passed without shuffle=True, which sklearn >= 0.24 rejects
            # with a ValueError (it has no effect without shuffling), so
            # it is no longer forwarded; the splits are unchanged.
            kf = KFold(n_splits=fold)

            # (n_train_rows, n_classes) / (n_test_rows, n_classes)
            oof_train = np.zeros(
                (train_data.shape[0], len(label_encoder.classes_)))
            oof_test = np.zeros(
                (test_data.shape[0], len(label_encoder.classes_)))

            for i, (train_index,
                    test_index) in enumerate(kf.split(train_data)):
                self._trained_and_predict(vec_func, train_data, train_label,
                                          test_data, train_index, test_index,
                                          oof_train, oof_test, i, mall_id)
            # Each fold added a full test prediction; average them.
            oof_test /= fold

            cur_save_path = '{}/{}'.format(self.SAVE_MODEL_BASE_PATH, mall_id)

            safe_dump_model(oof_train, cur_save_path + '_train.pkl')
            safe_dump_model(oof_test, cur_save_path + '_test.pkl')

            row_ids = pd.DataFrame(train_data['row_id'].values,
                                   columns=['row_id'])
            oof_train = pd.DataFrame(oof_train, columns=label_encoder.classes_)
            safe_save_csv_result(
                pd.concat([row_ids, oof_train], axis=1).set_index('row_id'),
                cur_save_path + '_train.csv')

            row_ids = pd.DataFrame(test_data['row_id'].values,
                                   columns=['row_id'])
            oof_test = pd.DataFrame(oof_test, columns=label_encoder.classes_)
            safe_save_csv_result(
                pd.concat([row_ids, oof_test], axis=1).set_index('row_id'),
                cur_save_path + '_test.csv')
# ---- Example #10 (0) ----
    def train_test(self, vec_func, target_column='shop_id'):
        """Hold-out evaluation: split the train set, fit per mall, and
        predict on the held-out part.

        :param vec_func: list of vector function
        :param target_column: the target column you want to predict.
        :return: the held-out predictions (previously computed and
            discarded; returning them is backward compatible since the
            method formerly returned None).
        """
        # ------input data -----------
        train_data = read_train_join_mall()
        train_data = train_data.sort_values(by='time_stamp')
        train_label = train_data[target_column]
        # NOTE(review): if train_test_split is sklearn's, the ratio must
        # be passed as test_size=...; positionally it would be treated as
        # a third array.  Presumably this is a project helper with a
        # (data, label, ratio) signature — confirm against its definition.
        train_data, test_data, train_label, test_label = train_test_split(
            train_data, train_label, self._test_ratio)

        ans = self._trained_by_mall_and_predict_location(
            vec_func, train_data, train_label, test_data, test_label)
        return ans
# ---- Example #11 (0) ----
def recovery_price_from_pkl():
    """Rebuild predicted_price.csv from the saved OOF price pickles.

    Writes one (row_id, predicted price) line per train sample followed
    by one per test sample.
    """
    train_df = read_train_join_mall()
    train_df = train_df.sort_values(by='time_stamp')
    test_df = read_test_data()

    oof_train = joblib.load('./feature_save/predicted_price.csv_oof_train.pkl')
    oof_test = joblib.load('./feature_save/predicted_price.csv_oof_test.pkl')
    print(oof_train.shape, train_df.shape)
    print(oof_test.shape, test_df.shape)

    with open('./feature_save/predicted_price.csv', 'w') as f:
        f.write('row_id,p_price\n')
        for row_id, price in zip(train_df['row_id'], oof_train):
            f.write('{},{}\n'.format(row_id, price))
        for row_id, price in zip(test_df['row_id'], oof_test):
            f.write('{},{}\n'.format(row_id, price))
# ---- Example #12 (0) ----
    def train_test(self, vec_func, target_column='price', fold=10):
        """Out-of-fold regression of *target_column* over all malls.

        Accumulates OOF predictions for train rows and the per-fold mean
        for test rows, dumps both as pkl, prints the mean fold error and
        writes the combined csv.

        :param vec_func: list of vector function
        :param target_column: the target column you want to predict.
        :param fold: the fold of cross-validation.
        :return: None
        """
        # ------input data -----------
        _train_data = read_train_join_mall()
        _train_data = _train_data.sort_values(by='time_stamp')
        _train_label = _train_data[target_column].values
        _test_data = read_test_data()

        # Data is sorted by time_stamp; default shuffle=False keeps the
        # folds time-contiguous.  random_state was previously passed
        # without shuffle=True, which sklearn >= 0.24 rejects with a
        # ValueError, so it is no longer forwarded; splits are unchanged.
        kf = KFold(n_splits=fold)
        oof_train = np.zeros((_train_data.shape[0], ))
        oof_test = np.zeros((_test_data.shape[0], ))
        # One full test prediction per fold; averaged below.
        oof_test_skf = np.zeros((fold, _test_data.shape[0]))

        fold_error = 0
        for i, (train_index, test_index) in enumerate(kf.split(_train_data)):
            print(i)
            fold_error += self._trained_and_predict(vec_func, _train_data,
                                                    _train_label, _test_data,
                                                    oof_train, oof_test_skf,
                                                    train_index, test_index, i)

        oof_test[:] = oof_test_skf.mean(axis=0)

        joblib.dump(oof_train,
                    self.feature_save_path + '_oof_train.pkl',
                    compress=3)
        joblib.dump(oof_test,
                    self.feature_save_path + '_oof_test.pkl',
                    compress=3)

        print(fold_error / fold)

        with open(self.feature_save_path, 'w') as f:
            f.write('row_id,p_price\n')
            for i, row_id in enumerate(_train_data['row_id']):
                f.write('{},{}\n'.format(row_id, oof_train[i]))
            for i, row_id in enumerate(_test_data['row_id']):
                f.write('{},{}\n'.format(row_id, oof_test[i]))
        print('done')
def many_mall_has_same_bssid():
    """Find bssids that show up in more than one mall.

    Such bssids are likely mobile hotspots; the set is dumped to a pkl
    for later filtering.
    :return: None
    """
    data = read_train_join_mall()
    bssid_malls = collections.defaultdict(set)
    t0 = time.time()
    for mall_id, wifi_infos in zip(data['mall_id'], data['wifi_infos']):
        for record in wifi_infos.split(';'):
            bssid, _strength, _connected = record.split('|')
            bssid_malls[bssid].add(mall_id)
    print(time.time() - t0)
    shared = {bssid for bssid, malls in bssid_malls.items() if len(malls) > 1}
    joblib.dump(shared, './feature_save/many_mall_wifi_bssid.pkl')
    print('total: {} repeat in other mall: {}'.format(len(bssid_malls),
                                                      len(shared)))
def mall_category_time(mall_id='m_7168', _date='2017-08-04'):
    """Hourly per-category trade counts for one mall on one day.

    Writes a csv with one row per category_id and 24 hourly columns.
    """
    frame = read_train_join_mall()
    frame = frame[frame['mall_id'] == mall_id]
    frame = frame.sort_values(by='time_stamp')
    target_date = datetime.strptime(_date, "%Y-%m-%d").date()

    hourly = collections.defaultdict(lambda: [0] * 24)
    for stamp, category_id in zip(frame['time_stamp'], frame['category_id']):
        parsed = datetime.strptime(stamp, "%Y-%m-%d %H:%M")
        if parsed.date() == target_date:
            hourly[category_id][parsed.hour] += 1

    out_path = './analysis_data/mall_counter_{}.csv'.format(_date)
    with open(out_path, 'w') as f:
        f.write(',{}\n'.format(','.join([str(h) for h in range(24)])))
        for category_id, counts in sorted(hourly.items()):
            f.write('{},{}\n'.format(category_id,
                                     ','.join([str(c) for c in counts])))
def mall_shop_day_sales_volume(mall_id='m_1621'):
    """Plot the daily sales volume of selected shops of a mall (3D bars).

    Only shops with no sales on the last day of the month are drawn, to
    keep the chart readable.

    :param mall_id: id of the mall to plot.
    """
    _train_data = read_train_join_mall()
    train_data = _train_data.loc[_train_data['mall_id'] == mall_id]
    train_data = train_data.assign(
        time_stamp=pd.to_datetime(train_data['time_stamp']))
    train_data['time_stamp'] = train_data['time_stamp'].dt.day

    # day index (0-30) -> shop_id -> number of trades that day
    total_count = [collections.Counter() for _ in range(31)]
    for shop_id, day in zip(train_data['shop_id'], train_data['time_stamp']):
        total_count[day - 1][shop_id] += 1

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    z = 0
    shop_dis = 60

    for shop_id in [
            's_389866', 's_432426', 's_459836', 's_634174', 's_1215854',
            's_1287028', 's_2110248', 's_2670603', 's_2862961', 's_2922711',
            's_3418707', 's_3479448', 's_3558937', 's_3658245', 's_3711363',
            's_3716008', 's_3790469', 's_4001714', 's_4021610', 's_4050122'
    ]:
        # Only plot shops that sold nothing on the last day, to cut the count.
        if total_count[-1][shop_id] > 0: continue
        xs = list(range(31))
        ys = [total_count[i][shop_id] for i in xs]
        ax.bar(xs, ys, z, zdir='y', alpha=0.8)
        z += shop_dis

    ax.set_xlabel('days')
    ax.set_ylabel('shops')
    ax.set_zlabel('sales volume')

    show_plt()
    # Removed: an accidental duplicate of wifi_apperance_days()'s body
    # used to run here after show_plt(); it was unrelated to this
    # function's purpose (a copy-paste artifact) and printed wifi
    # statistics.  Call wifi_apperance_days(mall_id) instead if needed.


if __name__ == '__main__':
    # Run the one-off analysis jobs, in the original order.
    jobs = (
        many_mall_has_same_bssid,
        check_low,
        wifi_co_occurrence_analysis,
        lambda: _wifi_co_occurrence(read_train_join_mall()),
        wifi_empty_statics,
        wifi_apperance_days,
    )
    for job in jobs:
        job()