def get_genre_week_exp_feat(label, key, n_day, data_dict):
    """Exponentially weighted visitor mean per (genre, day-of-week).

    Rows of ``data_dict['data']`` whose ``visit_date`` lies strictly between
    ``key[0] - n_day`` days and ``key[0]`` are weighted by ``0.985 ** age``,
    where ``age = diff_of_days(key[0], visit_date)``.

    Args:
        label: Frame the features are joined onto through ``left_merge``.
        key: Sequence whose first element is the cut-off date.
        n_day: Look-back window length in days; also used as the suffix of
            the generated column names.
        data_dict: Mapping holding a ``'data'`` frame with ``visit_date``,
            ``visitors``, ``air_genre_name`` and ``dow`` columns.

    Returns:
        The result of ``left_merge(label, features, on=[genre, dow])`` with
        missing values replaced by 0.  ``features`` carries the columns
        ``genre_dow_exp_mean<n_day>`` and ``genre_dow_exp_weight_sum<n_day>``.
    """
    data = data_dict['data']
    cutoff = key[0]
    start_date = date_add_days(cutoff, -n_day)
    window = data[(data.visit_date < cutoff)
                  & (data.visit_date > start_date)].copy()

    group_keys = ['air_genre_name', 'dow']
    mean_col = 'genre_dow_exp_mean{}'.format(n_day)
    weight_col = 'genre_dow_exp_weight_sum{}'.format(n_day)

    age_in_days = window['visit_date'].apply(
        lambda day: diff_of_days(cutoff, day))
    window['weight'] = age_in_days.apply(lambda age: 0.985 ** age)
    window['weighted_visitors'] = window['visitors'] * window['weight']

    # One groupby yields both sums.  Dict-renaming ``agg({'name': 'sum'})``
    # on a single column was removed in pandas 1.0, so aggregate first and
    # rename afterwards; this form works on old and new pandas alike.
    sums = window.groupby(group_keys)[['weighted_visitors', 'weight']].sum()
    sums = sums.rename(columns={
        'weighted_visitors': mean_col,
        'weight': weight_col,
    })
    # Weighted mean = sum(weight * visitors) / sum(weight).
    sums[mean_col] = sums[mean_col] / sums[weight_col]

    return left_merge(label, sums.reset_index(), on=group_keys).fillna(0)
def get_store_holiday_feat(label, key, n_day, data_dict):
    """Visitor statistics per store, split by each of two holiday flags.

    Uses rows of ``data_dict['data']`` whose ``visit_date`` lies strictly
    between ``key[0] - n_day`` days and ``key[0]``.  For the pairs
    ``(store_id, holiday_flg)`` and ``(store_id, holiday_flg2)`` it computes
    min, mean, median, max, count, std and skew of ``visitors``.

    Args:
        label: Frame the features are joined onto through ``left_merge``.
        key: Sequence whose first element is the cut-off date.
        n_day: Look-back window length in days; also the column-name suffix.
        data_dict: Mapping holding the ``'data'`` frame.

    Returns:
        Column-wise concatenation of the two merged feature frames (each
        with missing values replaced by 0).  Columns are named
        ``store_holiday_<stat><n_day>`` and ``store_holiday2_<stat><n_day>``.
    """
    data = data_dict['data']
    cutoff = key[0]
    start_date = date_add_days(cutoff, -n_day)
    window = data[(data.visit_date < cutoff)
                  & (data.visit_date > start_date)].copy()

    stats = ['min', 'mean', 'median', 'max', 'count', 'std', 'skew']
    merged_parts = []
    for flag_col, prefix in (('holiday_flg', 'store_holiday'),
                             ('holiday_flg2', 'store_holiday2')):
        group_keys = ['store_id', flag_col]
        # Dict-renaming ``agg({'name': 'func'})`` was removed in pandas 1.0;
        # aggregate with a list of functions and rename the columns instead.
        renames = {
            stat: '{}_{}{}'.format(prefix, stat, n_day) for stat in stats
        }
        features = (window.groupby(group_keys)['visitors']
                    .agg(stats)
                    .rename(columns=renames)
                    .reset_index())
        merged_parts.append(
            left_merge(label, features, on=group_keys).fillna(0))

    return pd.concat(merged_parts, axis=1)
def get_first_last_time_feat(label, key, n_day, data_dict):
    """Days from each store's first and last visit in the window to the cut-off.

    Uses rows of ``data_dict['data']`` whose ``visit_date`` lies strictly
    between ``key[0] - n_day`` days and ``key[0]``.

    Args:
        label: Frame the features are joined onto through ``left_merge``.
        key: Sequence whose first element is the cut-off date.
        n_day: Look-back window length in days.
        data_dict: Mapping holding the ``'data'`` frame.

    Returns:
        The result of ``left_merge(label, features, on=['store_id'])`` with
        missing values replaced by 0.  ``features`` is indexed by
        ``store_id`` and has the columns ``first_time``
        (``diff_of_days(key[0], earliest visit_date)``) and ``last_time``
        (``diff_of_days(key[0], latest visit_date)``).
    """
    data = data_dict['data']
    cutoff = key[0]
    start_date = date_add_days(cutoff, -n_day)
    window = data[(data.visit_date < cutoff)
                  & (data.visit_date > start_date)].copy()

    # Dict-renaming ``agg({'name': func})`` was removed in pandas 1.0.  Take
    # the per-store min/max with built-in reducers, then convert each date to
    # a day difference.  No pre-sorting is needed: min/max ignore row order.
    span = window.groupby('store_id')['visit_date'].agg(['min', 'max'])
    span = span.rename(columns={'min': 'first_time', 'max': 'last_time'})
    for column in ('first_time', 'last_time'):
        span[column] = span[column].apply(
            lambda day: diff_of_days(cutoff, day))

    return left_merge(label, span, on=['store_id']).fillna(0)
def get_store_week_exp_feat(label, key, n_day, data_dict):
    """Exponentially weighted visitor means per (store, day-of-week).

    Uses rows of ``data_dict['data']`` whose ``visit_date`` lies strictly
    between ``key[0] - n_day`` days and ``key[0]``.  For every decay factor
    in a fixed list, each row gets the weight ``decay ** age`` with
    ``age = diff_of_days(key[0], visit_date)`` and three columns are
    produced per (store_id, dow):

    * ``store_dow_exp_mean<n_day>_<decay>``: weighted mean of ``visitors``.
    * ``store_dow_exp_weight_sum<n_day>_<decay>``: sum of the weights.
    * ``store_dow_exp_mean2<n_day>_<decay>``: like the first, but computed
      from a running column that is multiplied by the weight of *every*
      decay processed so far (see the note in the loop).

    Args:
        label: Frame the features are joined onto through ``left_merge``.
        key: Sequence whose first element is the cut-off date.
        n_day: Look-back window length in days; part of the column names.
        data_dict: Mapping holding the ``'data'`` frame.

    Returns:
        The result of ``left_merge(label, features, on=['store_id', 'dow'])``
        with missing values replaced by 0.
    """
    data = data_dict['data']
    cutoff = key[0]
    start_date = date_add_days(cutoff, -n_day)
    window = data[(data.visit_date < cutoff)
                  & (data.visit_date > start_date)].copy()

    group_keys = ['store_id', 'dow']
    age_in_days = window['visit_date'].apply(
        lambda day: diff_of_days(cutoff, day))
    window['visitors2'] = window['visitors']

    per_decay = []
    for decay in [0.9, 0.95, 0.97, 0.98, 0.985, 0.99, 0.999, 0.9999]:
        mean_col = 'store_dow_exp_mean{}_{}'.format(n_day, decay)
        mean2_col = 'store_dow_exp_mean2{}_{}'.format(n_day, decay)
        weight_col = 'store_dow_exp_weight_sum{}_{}'.format(n_day, decay)

        window['weight'] = age_in_days.apply(lambda age: decay ** age)
        window['visitors1'] = window['visitors'] * window['weight']
        # NOTE(review): 'visitors2' is never reset, so it accumulates the
        # product of the weights of all decays seen so far.  This mirrors the
        # original behaviour; confirm whether the compounding is intended.
        window['visitors2'] = window['visitors2'] * window['weight']

        # One groupby replaces three groupbys plus two merges.  It also
        # avoids dict-renaming ``agg({'name': 'sum'})``, removed in
        # pandas 1.0.
        sums = window.groupby(group_keys)[
            ['visitors1', 'weight', 'visitors2']].sum()
        sums = sums.rename(columns={
            'visitors1': mean_col,
            'weight': weight_col,
            'visitors2': mean2_col,
        })
        sums[mean_col] = sums[mean_col] / sums[weight_col]
        sums[mean2_col] = sums[mean2_col] / sums[weight_col]
        per_decay.append(sums)

    # Every frame comes from the same groupby keys on the same rows, so the
    # indexes are identical and a column-wise concat equals chained merges.
    result = pd.concat(per_decay, axis=1).reset_index()
    return left_merge(label, result, on=group_keys).fillna(0)
def get_store_week_diff_feat(label, key, n_day, data_dict):
    """Summary statistics of date-to-date visitor changes for each store.

    Rows of ``data_dict['data']`` with ``visit_date`` strictly between
    ``key[0] - n_day`` days and ``key[0]`` are reshaped into one row per
    ``store_id`` and one column per ``visit_date``.  Differences between
    neighbouring date columns are then summarised per store.

    Args:
        label: Frame the features are joined onto through ``left_merge``.
        key: Sequence whose first element is the cut-off date.
        n_day: Look-back window length in days (not part of the column
            names produced here).
        data_dict: Mapping holding the ``'data'`` frame.

    Returns:
        The result of ``left_merge(label, features, on=['store_id'])`` with
        missing values replaced by 0.  ``features`` has the columns
        ``store_diff_mean`` (mean of absolute differences),
        ``store_diff_std``, ``store_diff_max`` and ``store_diff_min``
        (the latter three on the signed differences).
    """
    visits = data_dict['data']
    window_start = date_add_days(key[0], -n_day)
    in_window = (visits.visit_date < key[0]) & (visits.visit_date > window_start)
    recent = visits[in_window].copy()

    # Rows: store_id, columns: visit_date, values: visitors.
    by_store_and_date = recent.set_index(
        ['store_id', 'visit_date'])['visitors'].unstack()

    # Difference along the date axis; the first column has no predecessor,
    # so it is dropped.
    changes = by_store_and_date.diff(axis=1).iloc[:, 1:]

    # Remember the date columns before the summary columns are appended so
    # the statistics never include one another.
    date_columns = changes.columns
    changes['store_diff_mean'] = np.abs(changes[date_columns]).mean(axis=1)
    changes['store_diff_std'] = changes[date_columns].std(axis=1)
    changes['store_diff_max'] = changes[date_columns].max(axis=1)
    changes['store_diff_min'] = changes[date_columns].min(axis=1)

    feature_columns = [
        'store_diff_mean',
        'store_diff_std',
        'store_diff_max',
        'store_diff_min',
    ]
    merged = left_merge(label, changes[feature_columns], on=['store_id'])
    return merged.fillna(0)
def get_store_visitor_feat(label, key, n_day, data_dict):
    """Visitor statistics per store over the look-back window.

    Uses rows of ``data_dict['data']`` whose ``visit_date`` lies strictly
    between ``key[0] - n_day`` days and ``key[0]`` and computes min, mean,
    median, max, count, std and skew of ``visitors`` per ``store_id``.

    Args:
        label: Frame the features are joined onto through ``left_merge``.
        key: Sequence whose first element is the cut-off date.
        n_day: Look-back window length in days; also the column-name suffix.
        data_dict: Mapping holding the ``'data'`` frame.

    Returns:
        The result of ``left_merge(label, features, on=['store_id'])`` with
        missing values replaced by 0.  Feature columns are named
        ``store_<stat><n_day>``.
    """
    data = data_dict['data']
    cutoff = key[0]
    start_date = date_add_days(cutoff, -n_day)
    window = data[(data.visit_date < cutoff)
                  & (data.visit_date > start_date)].copy()

    stats = ['min', 'mean', 'median', 'max', 'count', 'std', 'skew']
    renames = {stat: 'store_{}{}'.format(stat, n_day) for stat in stats}

    # Dict-renaming ``agg({'name': 'func'})`` was removed in pandas 1.0;
    # aggregate with a list of functions and rename the columns instead.
    features = (window.groupby(['store_id'])['visitors']
                .agg(stats)
                .rename(columns=renames)
                .reset_index())

    return left_merge(label, features, on=['store_id']).fillna(0)
def get_store_all_week_feat(label, key, n_day, data_dict):
    """Per-store visitor statistics for every day of the week, side by side.

    Uses rows of ``data_dict['data']`` whose ``visit_date`` lies strictly
    between ``key[0] - n_day`` days and ``key[0]``.  Statistics are computed
    per ``(store_id, dow)``; for each ``dow`` value 0..6 the matching rows
    are joined onto ``label`` by ``store_id`` with every column prefixed by
    that ``dow`` value (including the ``dow`` column itself).

    Args:
        label: Frame the features are joined onto through ``left_merge``.
        key: Sequence whose first element is the cut-off date.
        n_day: Look-back window length in days; also the column-name suffix.
        data_dict: Mapping holding the ``'data'`` frame.

    Returns:
        Column-wise concatenation of the seven merged frames, each with
        missing values replaced by 0.
    """
    data = data_dict['data']
    cutoff = key[0]
    start_date = date_add_days(cutoff, -n_day)
    window = data[(data.visit_date < cutoff)
                  & (data.visit_date > start_date)].copy()

    stats = ['mean', 'median', 'max', 'count']
    renames = {
        'mean': 'store_dow_mean{}'.format(n_day),
        'median': 'store_dow_median{}'.format(n_day),
        # NOTE(review): this column is named "sum" but holds the maximum.
        # Kept as-is to preserve existing feature values; confirm intent.
        'max': 'store_dow_sum{}'.format(n_day),
        'count': 'store_dow_count{}'.format(n_day),
    }

    # Dict-renaming ``agg({'name': 'func'})`` was removed in pandas 1.0;
    # aggregate with a list of functions and rename the columns instead.
    per_dow = (window.groupby(['store_id', 'dow'])['visitors']
               .agg(stats)
               .rename(columns=renames)
               .reset_index())

    # Collect the seven blocks and concatenate once, not by growing a frame
    # inside the loop.
    blocks = []
    for dow in range(7):
        one_day = (per_dow[per_dow['dow'] == dow]
                   .set_index('store_id')
                   .add_prefix(str(dow)))
        blocks.append(left_merge(label, one_day, on=['store_id']).fillna(0))

    return pd.concat(blocks, axis=1)
def get_reserve_feat(label, key, _, data_dict):
    """Reservation features for the label period, from both reservation sources.

    Only reservations that were already made before ``key[0]``
    (``reserve_date < key[0]``) for visits inside the label period
    (``key[0] <= visit_date < date_add_days(key[0], key[1])``) are used.
    The original comment gives ``key[0] = '2017-04-23'`` and a label end
    date of ``'2017-05-31'`` as an example.

    For each source (``air`` and ``hpg``) four feature groups are built:

    * ``<src>_reserve_visitors`` / ``<src>_reserve_count``: sum and count of
      ``reserve_visitors`` per (store_id, visit_date); store/date
      combinations absent from the data are filled with 0.
    * ``<src>_date_visitors`` / ``<src>_date_count``: the same per
      visit_date.
    * ``<src>_store_diff_time_mean``: mean of ``diff_time`` per
      (store_id, visit_date).
    * ``<src>_diff_time_mean``: mean of ``diff_time`` per visit_date.

    ``diff_time`` is the whole-day part of
    ``visit_datetime - reserve_datetime``.

    Args:
        label: Frame the features are joined onto through ``left_merge``.
        key: Sequence; ``key[0]`` is the first label date and ``key[1]`` is
            passed to ``date_add_days`` to obtain the exclusive end date.
        _: Unused (kept for signature parity with the other feature
            functions).
        data_dict: Mapping holding the ``'air_reserve'``, ``'hpg_reserve'``
            and ``'air_store'`` frames.

    Returns:
        Column-wise concatenation of the eight merged feature frames, each
        with missing values replaced by 0.
    """
    air_reserve = data_dict['air_reserve']
    hpg_reserve = data_dict['hpg_reserve']
    air_store = data_dict['air_store']

    first_day = key[0]
    label_end_date = date_add_days(first_day, key[1])
    store_date = ['store_id', 'visit_date']
    date_only = ['visit_date']

    def _known_reservations(reserve):
        # Reservations for the label period that already existed before it.
        mask = ((reserve.visit_date >= first_day)
                & (reserve.visit_date < label_end_date)
                & (reserve.reserve_date < first_day))
        frame = reserve[mask].copy()
        # Compute the lead time from the filtered rows themselves.  The
        # original derived it from the *unfiltered* air frame and assigned it
        # after a merge had reset the index, so values were aligned to the
        # wrong rows.
        frame['diff_time'] = (
            pd.to_datetime(frame['visit_datetime'])
            - pd.to_datetime(frame['reserve_datetime'])).dt.days
        return frame

    def _source_features(frame, source):
        # Dict-renaming ``agg({'name': 'func'})`` was removed in pandas 1.0;
        # aggregate with a list of functions and rename the columns instead.
        store_totals = (
            frame.groupby(store_date)['reserve_visitors']
            .agg(['sum', 'count'])
            .rename(columns={
                'sum': '{}_reserve_visitors'.format(source),
                'count': '{}_reserve_count'.format(source),
            }))
        # Fill store/date combinations that have no reservation with 0.
        store_totals = store_totals.unstack().fillna(0).stack()

        date_totals = (
            frame.groupby(date_only)['reserve_visitors']
            .agg(['sum', 'count'])
            .rename(columns={
                'sum': '{}_date_visitors'.format(source),
                'count': '{}_date_count'.format(source),
            }))

        store_lead = frame.groupby(store_date)['diff_time'].mean().to_frame(
            '{}_store_diff_time_mean'.format(source))
        date_lead = frame.groupby(date_only)['diff_time'].mean().to_frame(
            '{}_diff_time_mean'.format(source))

        return {
            'store_totals': left_merge(
                label, store_totals, on=store_date).fillna(0),
            'date_totals': left_merge(
                label, date_totals, on=date_only).fillna(0),
            'store_lead': left_merge(
                label, store_lead, on=store_date).fillna(0),
            'date_lead': left_merge(
                label, date_lead, on=date_only).fillna(0),
        }

    air_reserve_temp = _known_reservations(air_reserve)
    # NOTE(review): air_store is merged twice (left, then inner), exactly as
    # in the original.  The second merge drops reservations whose store is
    # missing from air_store; the extra suffixed columns are not used below.
    air_reserve_temp = air_reserve_temp.merge(
        air_store, on='store_id', how='left')
    air_reserve_temp = air_reserve_temp.merge(air_store, on='store_id')
    hpg_reserve_temp = _known_reservations(hpg_reserve)

    air = _source_features(air_reserve_temp, 'air')
    hpg = _source_features(hpg_reserve_temp, 'hpg')

    result = pd.concat([
        air['store_totals'], hpg['store_totals'],
        air['date_totals'], hpg['date_totals'],
        air['store_lead'], hpg['store_lead'],
        air['date_lead'], hpg['date_lead'],
    ], axis=1)
    return result