def grid_search(clf):
    """Run a hyper-parameter search per mall and log the best result.

    ``clf`` is a search-style estimator (e.g. GridSearchCV) exposing
    ``best_score_`` and ``best_params_`` after ``fit``.  Results are
    printed and appended to ./console_output/grid_search_res.txt.
    """
    data = read_train_join_mall()
    data = data.sort_values(by='time_stamp')
    labels = preprocessing.LabelEncoder().fit_transform(data['shop_id'])
    vectorizers = [
        LocationToVec2(), WifiToVec(), WifiStrongToVec(),
        WifiKStrongToVec(), PriceToVec()
    ]
    # Currently restricted to one mall; swap in data['mall_id'].unique()
    # to search every mall.
    for mall_id in ['m_7374']:
        features, targets = DataVector.data_to_vec(
            mall_id, vectorizers, data, labels)
        clf.fit(features, targets)
        print('{} score: {}'.format(mall_id, clf.best_score_))
        for name, val in clf.best_params_.items():
            print("{} {}".format(name, val))
        print('----------')
        with open('./console_output/grid_search_res.txt', 'a') as f:
            f.write('{} score: {}\n'.format(mall_id, clf.best_score_))
            for name, val in clf.best_params_.items():
                f.write("{} {}\n".format(name, val))
            f.write('------\n\n\n')
            f.flush()
def recovery_probability_from_pkl():
    """Rebuild predicted_category_pro.csv from the saved oof pickles.

    Writes one probability row per train record followed by one per
    test record, with class indices 0..n-1 as the header columns.
    """
    train_df = read_train_join_mall()
    train_df = train_df.sort_values(by='time_stamp')
    raw_labels = train_df['category_id'].values
    test_df = read_test_data()
    encoder = preprocessing.LabelEncoder().fit(raw_labels)
    # Transformed labels are not used below; kept for parity with the
    # original pipeline.
    raw_labels = encoder.transform(raw_labels)
    n_rows, n_classes = train_df.shape[0], len(encoder.classes_)
    print(n_rows, n_classes)
    oof_train = joblib.load(
        './feature_save/predicted_category_pro.csv_oof_train2.pkl')
    oof_test = joblib.load(
        './feature_save/predicted_category_pro.csv_oof_test2.pkl')
    with open('./feature_save/predicted_category_pro.csv', 'w') as f:
        f.write('row_id,{}\n'.format(','.join(str(i) for i in range(n_classes))))
        for row_id, probs in zip(train_df['row_id'], oof_train):
            f.write('{},{}\n'.format(row_id, ','.join(str(x) for x in probs)))
        for row_id, probs in zip(test_df['row_id'], oof_test):
            f.write('{},{}\n'.format(row_id, ','.join(str(x) for x in probs)))
def shop_mall_visualization(mall_id='m_4572'):
    """Plot every shop and user position of one mall.

    User records are small triangles, shop locations are large circles
    (one color per shop), and the shops' geographic center is a big
    square marker.
    """
    users = read_train_join_mall()
    users = users[users['mall_id'] == mall_id]
    lat, lon = users['latitude'], users['longitude']
    # Stable color index per shop, derived from the user records.
    id2color = {shop: idx
                for idx, shop in enumerate(users['shop_id'].unique())}
    plt.scatter(lat, lon, s=100,
                c=[id2color[s] for s in users['shop_id']],
                alpha=0.5, marker='^')
    shops = read_mall_data()
    shops = shops[shops['mall_id'] == mall_id]
    lat, lon = shops['latitude'], shops['longitude']
    plt.scatter(lat, lon, s=600,
                c=[id2color[s] for s in shops['shop_id']],
                alpha=0.5)
    center = center_latitudes_and_longitudes(list(zip(lat, lon)))
    plt.scatter(center[0], center[1], s=1000, marker='s')
    show_plt()
def wifi_apperance_days(mall_id='m_1621'):
    """For one mall, track which shops see each wifi bssid per day of
    the month, print bssids observed on more than 7 distinct days, and
    report how many such bssids exist.
    """
    import pandas as pd
    import numpy as np
    everything = read_train_join_mall()
    mall_rows = everything.loc[everything['mall_id'] == mall_id]
    mall_rows = mall_rows.assign(
        time_stamp=pd.to_datetime(mall_rows['time_stamp']))
    # Reduce each timestamp to its day-of-month (1..31).
    mall_rows['time_stamp'] = mall_rows['time_stamp'].dt.day
    shops_per_day = [collections.defaultdict(set) for _ in range(31)]
    seen_bssids = set()
    records = zip(mall_rows['shop_id'], mall_rows['time_stamp'],
                  mall_rows['wifi_infos'])
    for shop_id, day, wifi_infos in records:
        for wifi_info in wifi_infos.split(';'):
            bssid, _strength, _connected = wifi_info.split('|')
            seen_bssids.add(bssid)
            shops_per_day[day - 1][bssid].add(shop_id)
    frequent = 0
    for bssid in sorted(seen_bssids):
        per_day = np.array([len(shops_per_day[d][bssid]) for d in range(31)])
        if np.count_nonzero(per_day) > 7:
            print(per_day)
            frequent += 1
    print(frequent, len(seen_bssids))
def check_low():
    """Re-check the malls with the lowest previously-measured scores.

    ``low_list`` maps mall ids to earlier wifi_loc accuracies (the
    corresponding wifi-only accuracies were lower still); malls are
    re-checked from worst to best score.
    """
    train_data = read_train_join_mall()
    low_list = {
        'm_7168': 0.708214760008,
        'm_7800': 0.721053965037,
        'm_1920': 0.764782750735,
        'm_4422': 0.767413834659,
        'm_2224': 0.790900290416,
        'm_4079': 0.793646944714,
        'm_6803': 0.825242718447,
        'm_1950': 0.924817798236,
        'm_5076': 0.948070175439,
        'm_4495': 0.972508591065,
    }
    for mall_id in sorted(low_list, key=low_list.get):
        check_mall(train_data, mall_id)
def train_and_on_test_data(self, vec_func, target_column='shop_id'):
    """Train per-mall models on the whole training set, predict the
    test set, and persist the predictions via ``result_to_csv``.

    :param vec_func: list of vector functions used to build features.
    :param target_column: the column of the training data to predict.
    """
    training = read_train_join_mall()
    labels = training[target_column]
    testing = read_test_data()
    predictions = self._trained_by_mall_and_predict_location(
        vec_func, training, labels, testing)
    self.result_to_csv(predictions, testing)
def wifi_co_occurrence_analysis():
    """Collect co-occurring wifi bssids for every mall and write them
    to ./feature_save/wifi_co_occurrence.csv as (mall_id, bssid) rows."""
    train_data = read_train_join_mall()
    pairs = []
    for mall_id in train_data['mall_id'].unique():
        pairs += _wifi_co_occurrence(train_data, mall_id)
    with open('./feature_save/wifi_co_occurrence.csv', 'w') as f:
        f.write('mall_id,bssid\n')
        f.writelines('{},{}\n'.format(mall_id, bssid)
                     for mall_id, bssid in pairs)
def analysis():
    """Compare predicted prices against actual prices for mall m_1790.

    Prints each (row, shop, actual, predicted, error) tuple and finally
    the total absolute error, the row count, and the mean absolute error.
    """
    predicted = pd.read_csv('./feature_save/predicted_price.csv')
    data = read_train_join_mall()
    data = data.loc[data['mall_id'] == 'm_1790']
    joined = data.join(predicted.set_index('row_id'),
                       on='row_id', rsuffix='_train')
    print(joined.shape)
    abs_errors = []
    columns = zip(joined['row_id'], joined['shop_id'],
                  joined['price'], joined['p_price'])
    for row_id, shop_id, price, p_price in columns:
        print(row_id, shop_id, price, p_price, p_price - price)
        abs_errors.append(abs(p_price - price))
    print(sum(abs_errors), joined.shape[0], sum(abs_errors) / joined.shape[0])
def train_test(self, vec_func, target_column='category_id', fold=5): """ :param vec_func: list of vector function :param target_column: the target column you want to predict. :param fold: the fold of cross-validation. :return: None """ # ------input data ----------- _train_data = read_train_join_mall() _train_data = _train_data.sort_values(by='time_stamp') _test_data = read_test_data() for mall_id in _train_data['mall_id'].unique(): train_data = _train_data.loc[_train_data['mall_id'] == mall_id] train_label = train_data[target_column].values test_data = _test_data.loc[_test_data['mall_id'] == mall_id] label_encoder = preprocessing.LabelEncoder() train_label = label_encoder.fit_transform(train_label) kf = KFold(n_splits=fold, random_state=self._random_state) oof_train = np.zeros( (train_data.shape[0], len(label_encoder.classes_))) oof_test = np.zeros( (test_data.shape[0], len(label_encoder.classes_))) for i, (train_index, test_index) in enumerate(kf.split(train_data)): self._trained_and_predict(vec_func, train_data, train_label, test_data, train_index, test_index, oof_train, oof_test, i, mall_id) oof_test /= fold cur_save_path = '{}/{}'.format(self.SAVE_MODEL_BASE_PATH, mall_id) safe_dump_model(oof_train, cur_save_path + '_train.pkl') safe_dump_model(oof_test, cur_save_path + '_test.pkl') row_ids = pd.DataFrame(train_data['row_id'].values, columns=['row_id']) oof_train = pd.DataFrame(oof_train, columns=label_encoder.classes_) safe_save_csv_result( pd.concat([row_ids, oof_train], axis=1).set_index('row_id'), cur_save_path + '_train.csv') row_ids = pd.DataFrame(test_data['row_id'].values, columns=['row_id']) oof_test = pd.DataFrame(oof_test, columns=label_encoder.classes_) safe_save_csv_result( pd.concat([row_ids, oof_test], axis=1).set_index('row_id'), cur_save_path + '_test.csv')
def train_test(self, vec_func, target_column='shop_id'): """ :param vec_func: list of vector function :param target_column: the target column you want to predict. :return: """ # ------input data ----------- train_data = read_train_join_mall() train_data = train_data.sort_values(by='time_stamp') train_label = train_data[target_column] train_data, test_data, train_label, test_label = train_test_split( train_data, train_label, self._test_ratio) ans = self._trained_by_mall_and_predict_location( vec_func, train_data, train_label, test_data, test_label)
def recovery_price_from_pkl():
    """Rebuild predicted_price.csv from the saved out-of-fold pickles,
    writing train-row predictions first and test-row predictions after."""
    train_df = read_train_join_mall().sort_values(by='time_stamp')
    test_df = read_test_data()
    oof_train = joblib.load('./feature_save/predicted_price.csv_oof_train.pkl')
    oof_test = joblib.load('./feature_save/predicted_price.csv_oof_test.pkl')
    # Sanity check: prediction vectors should match the frame row counts.
    print(oof_train.shape, train_df.shape)
    print(oof_test.shape, test_df.shape)
    with open('./feature_save/predicted_price.csv', 'w') as f:
        f.write('row_id,p_price\n')
        for ids, prices in ((train_df['row_id'], oof_train),
                            (test_df['row_id'], oof_test)):
            for row_id, price in zip(ids, prices):
                f.write('{},{}\n'.format(row_id, price))
def train_test(self, vec_func, target_column='price', fold=10): """ :param vec_func: list of vector function :param target_column: the target column you want to predict. :param fold: the fold of cross-validation. :return: None """ # ------input data ----------- _train_data = read_train_join_mall() _train_data = _train_data.sort_values(by='time_stamp') _train_label = _train_data[target_column].values _test_data = read_test_data() kf = KFold(n_splits=fold, random_state=self._random_state) oof_train = np.zeros((_train_data.shape[0], )) oof_test = np.zeros((_test_data.shape[0], )) oof_test_skf = np.zeros((fold, _test_data.shape[0])) fold_error = 0 for i, (train_index, test_index) in enumerate(kf.split(_train_data)): print(i) fold_error += self._trained_and_predict(vec_func, _train_data, _train_label, _test_data, oof_train, oof_test_skf, train_index, test_index, i) oof_test[:] = oof_test_skf.mean(axis=0) joblib.dump(oof_train, self.feature_save_path + '_oof_train.pkl', compress=3) joblib.dump(oof_test, self.feature_save_path + '_oof_test.pkl', compress=3) print(fold_error / fold) with open(self.feature_save_path, 'w') as f: f.write('row_id,p_price\n') for i, row_id in enumerate(_train_data['row_id']): f.write('{},{}\n'.format(row_id, oof_train[i])) for i, row_id in enumerate(_test_data['row_id']): f.write('{},{}\n'.format(row_id, oof_test[i])) print('done')
def many_mall_has_same_bssid():
    """
    Find wifi bssids observed in more than one mall — likely mobile
    hotspots — and dump that set to a pickle.
    :return:
    """
    train_data = read_train_join_mall()
    malls_by_bssid = collections.defaultdict(set)
    start = time.time()
    for mall_id, wifi_infos in zip(train_data['mall_id'],
                                   train_data['wifi_infos']):
        for record in wifi_infos.split(';'):
            bssid, _strength, _connected = record.split('|')
            malls_by_bssid[bssid].add(mall_id)
    print(time.time() - start)
    shared = {bssid for bssid, malls in malls_by_bssid.items()
              if len(malls) > 1}
    joblib.dump(shared, './feature_save/many_mall_wifi_bssid.pkl')
    print('total: {} repeat in other mall: {}'.format(len(malls_by_bssid),
                                                      len(shared)))
def mall_category_time(mall_id='m_7168', _date='2017-08-04'):
    """Count, per category, how one mall's records are distributed over
    the 24 hours of a single day, and write the table to a csv."""
    data = read_train_join_mall()
    data = data[data['mall_id'] == mall_id].sort_values(by='time_stamp')
    target_day = datetime.strptime(_date, "%Y-%m-%d").date()
    hourly = collections.defaultdict(lambda: [0] * 24)
    for stamp, category_id in zip(data['time_stamp'], data['category_id']):
        moment = datetime.strptime(stamp, "%Y-%m-%d %H:%M")
        if moment.date() == target_day:
            hourly[category_id][moment.hour] += 1
    out_path = './analysis_data/mall_counter_{}.csv'.format(_date)
    with open(out_path, 'w') as f:
        f.write(',{}\n'.format(','.join([str(h) for h in range(24)])))
        for category_id, counts in sorted(hourly.items()):
            f.write('{},{}\n'.format(category_id,
                                     ','.join([str(c) for c in counts])))
def mall_shop_day_sales_volume(mall_id='m_1621'):
    """Draw a 3d bar chart of daily sales counts for a fixed list of
    shops in one mall, keeping only shops that sold nothing on the last
    day of the month (to limit the number of bars)."""
    raw = read_train_join_mall()
    mall_rows = raw.loc[raw['mall_id'] == mall_id]
    mall_rows = mall_rows.assign(
        time_stamp=pd.to_datetime(mall_rows['time_stamp']))
    # Reduce each timestamp to its day-of-month (1..31).
    mall_rows['time_stamp'] = mall_rows['time_stamp'].dt.day
    daily_sales = [collections.Counter() for _ in range(31)]
    for shop_id, day in zip(mall_rows['shop_id'], mall_rows['time_stamp']):
        daily_sales[day - 1][shop_id] += 1
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    depth = 0
    depth_step = 60
    shop_ids = [
        's_389866', 's_432426', 's_459836', 's_634174', 's_1215854',
        's_1287028', 's_2110248', 's_2670603', 's_2862961', 's_2922711',
        's_3418707', 's_3479448', 's_3558937', 's_3658245', 's_3711363',
        's_3716008', 's_3790469', 's_4001714', 's_4021610', 's_4050122'
    ]
    for shop_id in shop_ids:
        # Only plot shops that sold nothing on the last day.
        if daily_sales[-1][shop_id] > 0:
            continue
        days = list(range(31))
        volumes = [daily_sales[d][shop_id] for d in days]
        ax.bar(days, volumes, depth, zdir='y', alpha=0.8)
        depth += depth_step
    ax.set_xlabel('days')
    ax.set_ylabel('shops')
    ax.set_zlabel('sales volume')
    show_plt()
# NOTE(review): the statements below duplicate the body of
# wifi_apperance_days() but sit at module level, where `_train_data`,
# `mall_id`, `pd` and `np` are not necessarily defined — this looks like
# a stray paste that would raise NameError on import; confirm and remove.
train_data = _train_data.loc[_train_data['mall_id'] == mall_id]
train_data = train_data.assign(
    time_stamp=pd.to_datetime(train_data['time_stamp']))
train_data['time_stamp'] = train_data['time_stamp'].dt.day
total_count = [collections.defaultdict(set) for _ in range(31)]
bssids = set()
for shop_id, day, wifi_infos in zip(train_data['shop_id'],
                                    train_data['time_stamp'],
                                    train_data['wifi_infos']):
    for wifi_info in wifi_infos.split(';'):
        bssid, _, _ = wifi_info.split('|')
        bssids.add(bssid)
        total_count[day - 1][bssid].add(shop_id)
cnt = 0
for bssid in sorted(bssids):
    t = np.array([len(total_count[i][bssid]) for i in range(31)])
    if np.count_nonzero(t) > 7:
        print(t)
        cnt += 1
print(cnt, len(bssids))

if __name__ == '__main__':
    # Run the full suite of ad-hoc analyses.
    many_mall_has_same_bssid()
    check_low()
    wifi_co_occurrence_analysis()
    _wifi_co_occurrence(read_train_join_mall())
    wifi_empty_statics()
    wifi_apperance_days()