def make_feats(end_date):
    """Build the feature matrix for the sample set keyed by ``end_date``.

    The label frame returned by ``get_label`` is handed to every feature
    helper; the pieces are merged with the project's ``concat`` helper and
    post-processed by ``second_feat``.

    Args:
        end_date: Key forwarded to ``get_label`` and to every feature helper;
            it is also formatted into the cache file name.

    Returns:
        The object produced by ``second_feat`` (it must expose ``.shape``),
        or the cached frame when cache loading is enabled.
    """
    t0 = time.time()
    print('数据key为:{}'.format(end_date))
    result_path = cache_path + 'train_set_{0}.hdf'.format(end_date)

    # The original guard was `os.path.exists(result_path) & 0`, i.e. the cache
    # was never read. That switch is kept, but the branch now actually reads
    # the file: the former `pd.to_hdf(...)` call is not a pandas function and
    # was a write API where a read was intended.
    use_cache = False
    if use_cache and os.path.exists(result_path):
        result = pd.read_hdf(result_path, 'w')
    else:
        print('添加label')
        label = get_label(end_date)
        print('开始构造特征...')

        # Order matters: the pieces are concatenated in list order.
        # Daily values, window argument 30 (the original note said "previous week").
        parts = [get_lastdays_of_st(label, end_date, 30)]
        # Sums over the previous N days (windows 1, 3 and 98 were tried and disabled).
        for days in (7, 14, 21, 28, 42, 70, 140):
            parts.append(get_sum_of_store_item(label, end_date, days))
        # Promotion flags for the previous 7 days.
        parts.append(get_lastdays_of_prom(label, end_date, 7))
        # Promotion counts over the previous N days.
        for days in (14, 28, 140):
            parts.append(get_sum_of_prom(label, end_date, days))
        # Disabled: get_sum_of_week(label, end_date, 140) (weekly sums).
        # Start and end time of the last promotion.
        parts.append(get_promo_of_store_item(label, end_date))
        parts.append(label)

        print('开始合并特征...')
        result = concat(parts).reindex()
        result = second_feat(result)
        # Persisting the result is currently disabled:
        # result.to_hdf(result_path, 'w', complib='blosc', complevel=5)

    print('特征矩阵大小:{}'.format(result.shape))
    print('生成特征一共用时{}秒'.format(time.time() - t0))
    return result
def make_feats(end_date):
    """Build the feature matrix for the sample set keyed by ``end_date``.

    Combines the label frame with the base and date feature frames via the
    project's ``concat`` helper and post-processes the result with
    ``second_feat``.

    Args:
        end_date: Key forwarded to ``get_label``, ``get_base_feat`` and
            ``get_date_feat``; it is also formatted into the cache file name.

    Returns:
        The object produced by ``second_feat`` (it must expose ``.shape``),
        or the cached frame when cache loading is enabled.
    """
    t0 = time.time()
    print('数据key为:{}'.format(end_date))
    result_path = cache_path + 'train_set_{0}.hdf'.format(end_date)

    # The original guard was `os.path.exists(result_path) & 0`, i.e. the cache
    # was never read. That switch is kept, but the branch now actually reads
    # the file: the former `pd.to_hdf(...)` call is not a pandas function and
    # was a write API where a read was intended.
    use_cache = False
    if use_cache and os.path.exists(result_path):
        result = pd.read_hdf(result_path, 'w')
    else:
        print('添加label')
        label = get_label(end_date)
        base_feat = get_base_feat(label, end_date)  # base features
        date_feat = get_date_feat(label, end_date)  # date/time features

        print('开始合并特征...')
        result = second_feat(concat([label, base_feat, date_feat]))
        # Persisting the result is currently disabled:
        # result.to_hdf(result_path, 'w', complib='blosc', complevel=5)

    print('特征矩阵大小:{}'.format(result.shape))
    print('生成特征一共用时{}秒'.format(time.time() - t0))
    return result
def make_feats(end_date):
    """Build the feature matrix for the sample set keyed by ``end_date``.

    ``get_base_feat`` is derived from the label frame and then handed to every
    feature helper; the pieces (base features first, label last) are merged
    with the project's ``concat`` helper and post-processed by ``second_feat``.

    Args:
        end_date: Key forwarded to ``get_label``, ``get_base_feat`` and every
            feature helper; it is also formatted into the cache file name.

    Returns:
        The object produced by ``second_feat`` (it must expose ``.shape``),
        or the cached frame when cache loading is enabled.
    """
    t0 = time.time()
    print('数据key为:{}'.format(end_date))
    result_path = cache_path + 'train_set_{0}.hdf'.format(end_date)

    # The original guard was `os.path.exists(result_path) & 0`, i.e. the cache
    # was never read. That switch is kept, but the branch now actually reads
    # the file: the former `pd.to_hdf(...)` call is not a pandas function and
    # was a write API where a read was intended.
    use_cache = False
    if use_cache and os.path.exists(result_path):
        result = pd.read_hdf(result_path, 'w')
    else:
        print('添加label')
        label = get_label(end_date)
        base_feat = get_base_feat(label, end_date)  # base features
        print('开始构造特征...')

        # Order matters: the pieces are concatenated in list order.
        parts = [base_feat]

        # ---- store x item ----
        # Daily values, window argument 30 (the original note said "previous week").
        parts.append(get_lastdays_of_store_item(base_feat, end_date, 30))
        # Sums over the previous N days.
        for days in (7, 14, 21, 28, 42, 70, 140):
            parts.append(get_sum_of_store_item(base_feat, end_date, days))
        # Promotion flags for the previous 7 days.
        parts.append(get_lastdays_of_prom(base_feat, end_date, 7))
        # Promotion counts over the previous N days.
        for days in (14, 28, 140):
            parts.append(get_sum_of_prom(base_feat, end_date, days))
        # Weekly sums over the previous N days.
        for days in (28, 140, 490):
            parts.append(get_sum_of_week(base_feat, end_date, days))
        # Sales for the same period of the previous year.
        parts.append(get_lastyear_of_store_item(base_feat, end_date))
        # Time of the last purchase.
        parts.append(get_lastday_of_store_item(base_feat, end_date))
        # Promotion indicator.
        parts.append(get_promo_of_store_item(base_feat, end_date))

        # ---- item ----
        # Daily item values, window argument 30.
        parts.append(get_lastdays_of_item(base_feat, end_date, 30))
        # Item sums over the previous N days.
        for days in (7, 28, 70, 140):
            parts.append(get_sum_of_item(base_feat, end_date, days))
        # Disabled: get_sum_of_week_item(base_feat, end_date, 140).
        # Item sales for the same period of the previous year.
        parts.append(get_lastyear_of_item(base_feat, end_date))

        # ---- store ----
        # Disabled: get_lastdays_of_store(base_feat, end_date, 7),
        #           get_sum_of_week_store(base_feat, end_date, 140),
        #           get_lastyear_of_store(base_feat, end_date).
        # Store-level sales.
        parts.append(get_target_store(base_feat, end_date))
        # Disabled: get_sum_of_store_class(base_feat, end_date, 7)   (store x class),
        #           get_sum_of_store_family(base_feat, end_date, 7)  (store x family).

        parts.append(label)

        print('开始合并特征...')
        result = second_feat(concat(parts))
        # Persisting the result is currently disabled:
        # result.to_hdf(result_path, 'w', complib='blosc', complevel=5)

    print('特征矩阵大小:{}'.format(result.shape))
    print('生成特征一共用时{}秒'.format(time.time() - t0))
    return result
def write_hdf(file_name, key, data=None, **kwargs):
    """Write ``data`` to an HDF5 file under ``key``.

    The previous body called ``pd.to_hdf``; pandas exposes ``to_hdf`` only as
    a method of DataFrame/Series, so that call could not succeed and there was
    no parameter carrying the object to write. ``data`` is therefore accepted
    as an optional keyword and the write is delegated to ``data.to_hdf``.

    Args:
        file_name: Destination path, passed through to ``data.to_hdf``.
        key: Group identifier inside the store.
        data: Object providing a ``to_hdf`` method (e.g. a DataFrame).
        **kwargs: Extra options forwarded unchanged to ``data.to_hdf``
            (for example ``complib`` / ``complevel``).

    Raises:
        TypeError: If ``data`` is not supplied.
    """
    if data is None:
        raise TypeError("write_hdf() requires the object to store via 'data='")
    # `key` is passed by keyword because newer pandas versions make it keyword-only.
    data.to_hdf(file_name, key=key, **kwargs)
def make_feats(end_date, n_day):
    """Build the feature matrix for the sample set keyed by ``(end_date, n_day)``.

    The label frame from ``get_label`` is handed to every feature helper
    together with the ``(end_date, n_day)`` key; the pieces are merged with the
    project's ``concat`` helper and post-processed by ``second_feat``.

    Args:
        end_date: First component of the key; also formatted into the cache
            file name.
        n_day: Second component of the key, forwarded to ``get_label``.

    Returns:
        The object produced by ``second_feat`` (it must expose ``.shape``),
        or the cached frame when cache loading is enabled.
    """
    t0 = time.time()
    key = end_date, n_day
    print('数据key为:{}'.format(key))
    # NOTE(review): the cache file name depends on end_date only, so different
    # n_day values would share one file if caching is ever re-enabled.
    result_path = cache_path + 'train_set_{0}.hdf'.format(end_date)

    # The original guard was `os.path.exists(result_path) & 0`, i.e. the cache
    # was never read. That switch is kept, but the branch now actually reads
    # the file: the former `pd.to_hdf(...)` call is not a pandas function and
    # was a write API where a read was intended.
    use_cache = False
    if use_cache and os.path.exists(result_path):
        result = pd.read_hdf(result_path, 'w')
    else:
        print('添加label')
        label = get_label(end_date, n_day)
        print('开始构造特征...')

        # Order matters: the pieces are concatenated in list order.
        windows = (1000, 60, 30)
        parts = [label]

        # ---- store features ----
        for span in windows:
            parts.append(get_store_visitor_feat(label, key, span))
        parts.append(get_store_exp_visitor_feat(label, key, 1000))
        # store x weekday features
        for span in windows:
            parts.append(get_store_week_feat(label, key, span))
        # store x weekday exponential features
        parts.append(get_store_week_exp_feat(label, key, 1000))

        # ---- area features: disabled ----
        # get_area_visitor_feat (1000/60/30), get_area_exp_visitor_feat (1000),
        # get_area_week_feat (1000/60/30), get_area_week_exp_feat (1000)

        # ---- genre features ----
        for span in windows:
            parts.append(get_genre_visitor_feat(label, key, span))
        parts.append(get_genre_exp_visitor_feat(label, key, 1000))
        # genre x weekday features
        for span in windows:
            parts.append(get_genre_week_feat(label, key, span))
        # genre x weekday exponential features
        parts.append(get_genre_week_exp_feat(label, key, 1000))

        # ---- area x genre features: disabled ----
        # get_area_genre_visitor_feat (1000/60/30), get_area_genre_exp_visitor_feat (1000),
        # get_area_genre_week_feat (1000/60/30), get_area_genre_week_exp_feat (1000)

        # air_reserve same-day features
        parts.append(get_reserve_feat(label, key))
        # Disabled: get_reserve_history(label, key) (air_reserve statistics).
        # First and last time with sales.
        parts.append(get_first_last_time(label, key, 1000))
        # NOTE(review): label is already the first element of `parts`; it is
        # added a second time here exactly as the original did — confirm that
        # `concat` is meant to receive it twice.
        parts.append(label)

        print('开始合并特征...')
        result = second_feat(concat(parts))
        # Persisting the result is currently disabled:
        # result.to_hdf(result_path, 'w', complib='blosc', complevel=5)

    print('特征矩阵大小:{}'.format(result.shape))
    print('生成特征一共用时{}秒'.format(time.time() - t0))
    return result
def make_feats(end_date):
    """Build the feature matrix for the sample set keyed by ``end_date``.

    When the module-level ``load`` flag is truthy and a cache file exists, the
    cached frame is read back. Otherwise the label frame from ``get_label`` is
    handed to every feature helper, the pieces are merged with the project's
    ``concat`` helper and post-processed by ``second_feat``.

    Args:
        end_date: Key forwarded to ``get_label`` and to the feature helpers;
            it is also formatted into the cache file name.

    Returns:
        The cached frame, or the object produced by ``second_feat`` (it must
        expose ``.shape``).
    """
    t0 = time.time()
    print('time key:{}'.format(end_date))
    result_path = cache_path + 'train_set_{0}.hdf'.format(end_date)

    # Fix: the cache branch used to call `pd.to_hdf(...)`, which is not a
    # pandas function (and is a write API); reading needs `pd.read_hdf`.
    # The key 'w' matches the (currently disabled) writer further down.
    if load and os.path.exists(result_path):
        result = pd.read_hdf(result_path, 'w')
    else:
        print('add label')
        label = get_label(end_date)
        print('make feature...')

        # Order matters: the pieces are concatenated in list order.
        parts = []

        # 140-day store x item sums restricted by a day filter, each computed
        # once without weights (None) and once with i**6/1000 weights.
        day_filters = [
            (holiday_no, 'holiday_no'),
            (holiday_yes, 'holiday_yes'),
            (week0, 'week0'),
            (week1, 'week1'),
            (week2, 'week2'),
            (week3, 'week3'),
            (week4, 'week4'),
            (week5, 'week5'),
            (weekend_no, 'weekend_no'),
            (weekend_yes, 'weekend_yes'),
            (onpromotion_yes, 'onpromotion_yes'),
            (onpromotion_no, 'onpromotion_no'),
        ]
        for day_filter, tag in day_filters:
            parts.append(get_sum_of_store_item(label, end_date, 140, None, day_filter, tag))
            parts.append(get_sum_of_store_item(
                label, end_date, 140, [i ** 6 / 1000 for i in range(140)], day_filter, tag))

        # 140-day store x item sums under different weight vectors.
        weightings = [
            [i for i in range(140)],
            [i + 140 for i in range(140)],
            [i + 280 for i in range(140)],
            [i ** 2 for i in range(140)],
            [i ** 3 for i in range(140)],
            [i ** 4 for i in range(140)],
            [i ** 5 for i in range(140)],
            [i ** 6 / 1000 for i in range(140)],
        ]
        for weights in weightings:
            parts.append(get_sum_of_store_item(label, end_date, 140, weights))

        # Sales of the same item over the previous N days, grouped by
        # state / store type / cluster, then sales per store x class.
        for days in (7, 14, 28, 42, 70, 140):
            parts.append(get_sum_of_item_state(label, end_date, days))
        for days in (7, 14, 28, 42, 70, 140):
            parts.append(get_sum_of_item_type(label, end_date, days))
        for days in (7, 14, 28, 42, 70, 140):
            parts.append(get_sum_of_item_cluster(label, end_date, days))
        for days in (7, 14, 28, 42, 70, 140):
            parts.append(get_sum_of_store_class(label, end_date, days))

        # Transactions and oil price over the previous N days.
        for days in (7, 28, 42, 70, 140):
            parts.append(get_sum_of_trans(label, end_date, days))
        for days in (7, 28, 42, 70, 140):
            parts.append(get_sum_of_oil(label, end_date, days))

        parts.append(get_item_info(label))   # item metadata
        parts.append(get_store_info(label))  # store metadata

        # Sales per store x family, item x city, and item over the previous N days.
        for days in (7, 14, 28, 42, 70, 140):
            parts.append(get_sum_of_store_family(label, end_date, days))
        for days in (7, 14, 21, 28, 42, 70, 140):
            parts.append(get_sum_of_item_city(label, end_date, days))
        for days in (7, 14, 21, 28, 42, 70, 140):
            parts.append(get_sum_of_item(label, end_date, days))

        # Daily values for the previous month.
        parts.append(get_lastdays_of_st(label, end_date, 30))
        # Plain store x item sums over the previous N days (window 1 is disabled).
        for days in (3, 7, 14, 21, 28, 42, 70, 98, 140):
            parts.append(get_sum_of_store_item(label, end_date, days))

        # Promotion flags, window argument 16.
        parts.append(get_lastdays_of_prom(label, end_date, 16))
        # Promotion counts over the previous N days (original call order kept).
        for days in (14, 28, 35, 70, 280, 140):
            parts.append(get_sum_of_prom(label, end_date, days))
        # Disabled: get_sum_of_week(label, end_date, 140) (weekly sums).
        # Start and end time of the last promotion.
        parts.append(get_promo_of_store_item(label, end_date))
        parts.append(label)

        print('concat feature...')
        result = concat(parts).reindex()
        result = second_feat(result)
        # Persisting the result is currently disabled:
        # result.to_hdf(result_path, 'w', complib='blosc', complevel=5)

    print('shape:{}'.format(result.shape))
    print('used {} second'.format(time.time() - t0))
    return result