Exemple #1
0
def make_feats(end_date):
    """Build the feature matrix for the sample window keyed by ``end_date``.

    Every feature block is produced by a ``get_*`` helper defined outside
    this snippet; the per-feature descriptions below are taken from the
    original inline comments, not from the helpers' code. The blocks are
    merged with the project's ``concat`` and post-processed by
    ``second_feat``.

    Args:
        end_date: Key of the window; passed unchanged to every helper and
            used to build the cache file name.

    Returns:
        The object returned by ``second_feat`` (it must expose ``.shape``),
        or the cached frame when the cache is enabled and present.
    """
    t0 = time.time()
    print('数据key为:{}'.format(end_date))
    result_path = cache_path + 'train_set_{0}.hdf'.format(end_date)

    # The on-disk cache was deliberately switched off in the original
    # (guard was `os.path.exists(...) & 0`) and the matching save below is
    # commented out. Keep it off, but make the switch explicit.
    use_cache = False
    if use_cache and os.path.exists(result_path):
        # Fixed: the original called `pd.to_hdf(...)` here, which is not a
        # top-level pandas function and would write rather than read.
        # 'w' is the key used by the (disabled) save step.
        result = pd.read_hdf(result_path, 'w')
    else:
        print('添加label')
        label = get_label(end_date)

        print('开始构造特征...')
        # Daily values over the last 30 days.
        result = [get_lastdays_of_st(label, end_date, 30)]
        # Store x item sums over trailing windows
        # (windows 1, 3 and 98 were disabled in the original).
        for n_day in (7, 14, 21, 28, 42, 70, 140):
            result.append(get_sum_of_store_item(label, end_date, n_day))
        # Promotion flag for each of the previous 7 days.
        result.append(get_lastdays_of_prom(label, end_date, 7))
        # Promotion counts over trailing windows.
        for n_day in (14, 28, 140):
            result.append(get_sum_of_prom(label, end_date, n_day))
        # get_sum_of_week(label, end_date, 140) was disabled in the original.
        # Start and end time of the last promotion.
        result.append(get_promo_of_store_item(label, end_date))
        result.append(label)

        print('开始合并特征...')
        result = concat(result).reindex()
        result = second_feat(result)

        # Saving is disabled; to re-enable caching, also flip `use_cache`:
        # result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    print('特征矩阵大小:{}'.format(result.shape))
    print('生成特征一共用时{}秒'.format(time.time() - t0))
    return result
Exemple #2
0
def make_feats(end_date):
    """Build the feature matrix for the sample window keyed by ``end_date``.

    Combines the label with the "base" and "date" feature blocks (names per
    the original comments; ``get_label``, ``get_base_feat``,
    ``get_date_feat``, ``concat`` and ``second_feat`` are defined outside
    this snippet).

    Args:
        end_date: Key of the window; passed unchanged to every helper and
            used to build the cache file name.

    Returns:
        The object returned by ``second_feat`` (it must expose ``.shape``),
        or the cached frame when the cache is enabled and present.
    """
    t0 = time.time()
    print('数据key为:{}'.format(end_date))
    result_path = cache_path + 'train_set_{0}.hdf'.format(end_date)

    # The cache was deliberately disabled in the original (`exists & 0`)
    # and the save step below is commented out; keep it off explicitly.
    use_cache = False
    if use_cache and os.path.exists(result_path):
        # Fixed: the original called the non-existent `pd.to_hdf` to load.
        # 'w' is the key used by the (disabled) save step.
        result = pd.read_hdf(result_path, 'w')
    else:
        print('添加label')
        label = get_label(end_date)
        base_feat = get_base_feat(label, end_date)  # base features
        date_feat = get_date_feat(label, end_date)  # date/time features

        print('开始合并特征...')
        result = concat([label, base_feat, date_feat])
        result = second_feat(result)

        # Saving is disabled; to re-enable caching, also flip `use_cache`:
        # result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    print('特征矩阵大小:{}'.format(result.shape))
    print('生成特征一共用时{}秒'.format(time.time() - t0))
    return result
Exemple #3
0
def make_feats(end_date):
    """Build the feature matrix for the sample window keyed by ``end_date``.

    Feature blocks are computed from ``base_feat`` (itself derived from the
    label) by ``get_*`` helpers defined outside this snippet; descriptions
    below follow the original inline comments. Blocks are merged with the
    project's ``concat`` and post-processed by ``second_feat``.

    Args:
        end_date: Key of the window; passed unchanged to every helper and
            used to build the cache file name.

    Returns:
        The object returned by ``second_feat`` (it must expose ``.shape``),
        or the cached frame when the cache is enabled and present.
    """
    t0 = time.time()
    print('数据key为:{}'.format(end_date))
    result_path = cache_path + 'train_set_{0}.hdf'.format(end_date)

    # The cache was deliberately disabled in the original (`exists & 0`)
    # and the save step below is commented out; keep it off explicitly.
    use_cache = False
    if use_cache and os.path.exists(result_path):
        # Fixed: the original called the non-existent `pd.to_hdf` to load.
        # 'w' is the key used by the (disabled) save step.
        result = pd.read_hdf(result_path, 'w')
    else:
        print('添加label')
        label = get_label(end_date)
        base_feat = get_base_feat(label, end_date)  # base features

        print('开始构造特征...')
        result = [base_feat]

        # ---- store x item features ----
        # Daily values over the last 30 days.
        result.append(get_lastdays_of_store_item(base_feat, end_date, 30))
        # Sums over trailing windows.
        for n_day in (7, 14, 21, 28, 42, 70, 140):
            result.append(get_sum_of_store_item(base_feat, end_date, n_day))
        # Promotion flag for each of the previous 7 days.
        result.append(get_lastdays_of_prom(base_feat, end_date, 7))
        # Promotion counts over trailing windows.
        for n_day in (14, 28, 140):
            result.append(get_sum_of_prom(base_feat, end_date, n_day))
        # Per-weekday sums over trailing windows.
        for n_day in (28, 140, 490):
            result.append(get_sum_of_week(base_feat, end_date, n_day))
        # Sales for the same period last year.
        result.append(get_lastyear_of_store_item(base_feat, end_date))
        # Time of the last purchase.
        result.append(get_lastday_of_store_item(base_feat, end_date))
        # Promotion information.
        result.append(get_promo_of_store_item(base_feat, end_date))

        # ---- item-level features ----
        # Daily item values over the last 30 days.
        result.append(get_lastdays_of_item(base_feat, end_date, 30))
        # Item sums over trailing windows.
        for n_day in (7, 28, 70, 140):
            result.append(get_sum_of_item(base_feat, end_date, n_day))
        # get_sum_of_week_item(base_feat, end_date, 140) was disabled.
        # Item sales for the same period last year.
        result.append(get_lastyear_of_item(base_feat, end_date))

        # ---- store-level features ----
        # Disabled in the original: get_lastdays_of_store(..., 7),
        # get_sum_of_week_store(..., 140), get_lastyear_of_store(...),
        # get_sum_of_store_class(..., 7), get_sum_of_store_family(..., 7).
        # Store sales.
        result.append(get_target_store(base_feat, end_date))

        result.append(label)

        print('开始合并特征...')
        result = concat(result)
        result = second_feat(result)

        # Saving is disabled; to re-enable caching, also flip `use_cache`:
        # result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    print('特征矩阵大小:{}'.format(result.shape))
    print('生成特征一共用时{}秒'.format(time.time() - t0))
    return result
Exemple #4
0
def write_hdf(file_name, key, **kwargs):
    """Forward ``file_name``, ``key`` and any keyword arguments to ``pd.to_hdf``.

    NOTE(review): pandas documents ``DataFrame.to_hdf`` / ``Series.to_hdf``
    methods and a top-level ``pd.read_hdf``, but no top-level ``pd.to_hdf``,
    so this call presumably raises ``AttributeError`` -- confirm. The
    signature also carries no object to write, so the intended behaviour
    cannot be determined from this snippet alone.
    """
    pd.to_hdf(file_name, key, **kwargs)
Exemple #5
0
def write_hdf(file_name, key, **kwargs):
    """Forward ``file_name``, ``key`` and any keyword arguments to ``pd.to_hdf``.

    NOTE(review): pandas documents ``DataFrame.to_hdf`` / ``Series.to_hdf``
    methods and a top-level ``pd.read_hdf``, but no top-level ``pd.to_hdf``,
    so this call presumably raises ``AttributeError`` -- confirm. The
    signature also carries no object to write, so the intended behaviour
    cannot be determined from this snippet alone.
    """
    pd.to_hdf(file_name, key, **kwargs)
Exemple #6
0
def make_feats(end_date, n_day):
    """Build the feature matrix for the sample keyed by ``(end_date, n_day)``.

    Feature blocks are produced by ``get_*`` helpers defined outside this
    snippet; the group names below (store, store x weekday, genre, ...)
    follow the original inline comments. Blocks are merged with the
    project's ``concat`` and post-processed by ``second_feat``.

    Args:
        end_date: First component of the feature key; also passed to
            ``get_label`` and used in the cache file name.
        n_day: Second component of the feature key; also passed to
            ``get_label``.

    Returns:
        The object returned by ``second_feat`` (it must expose ``.shape``),
        or the cached frame when the cache is enabled and present.
    """
    t0 = time.time()
    key = end_date, n_day
    print('数据key为:{}'.format(key))
    # NOTE(review): the cache file name only encodes `end_date`, so calls
    # with different `n_day` would share one file if caching is re-enabled.
    result_path = cache_path + 'train_set_{0}.hdf'.format(end_date)

    # The cache was deliberately disabled in the original (`exists & 0`)
    # and the save step below is commented out; keep it off explicitly.
    use_cache = False
    if use_cache and os.path.exists(result_path):
        # Fixed: the original called the non-existent `pd.to_hdf` to load.
        # 'w' is the key used by the (disabled) save step.
        result = pd.read_hdf(result_path, 'w')
    else:
        print('添加label')
        label = get_label(end_date, n_day)

        print('开始构造特征...')
        result = [label]

        # ---- store features ----
        for window in (1000, 60, 30):
            result.append(get_store_visitor_feat(label, key, window))
        result.append(get_store_exp_visitor_feat(label, key, 1000))
        # ---- store x weekday features ----
        for window in (1000, 60, 30):
            result.append(get_store_week_feat(label, key, window))
        # Store x weekday exponential features.
        result.append(get_store_week_exp_feat(label, key, 1000))

        # The area-level counterparts (get_area_visitor_feat,
        # get_area_exp_visitor_feat, get_area_week_feat,
        # get_area_week_exp_feat) were disabled in the original.

        # ---- genre features ----
        for window in (1000, 60, 30):
            result.append(get_genre_visitor_feat(label, key, window))
        result.append(get_genre_exp_visitor_feat(label, key, 1000))
        # ---- genre x weekday features ----
        for window in (1000, 60, 30):
            result.append(get_genre_week_feat(label, key, window))
        # Genre x weekday exponential features.
        result.append(get_genre_week_exp_feat(label, key, 1000))

        # The area x genre counterparts (get_area_genre_*) were disabled
        # in the original.

        # Same-day air_reserve features
        # (get_reserve_history statistics were disabled in the original).
        result.append(get_reserve_feat(label, key))
        # Time of the first and last day with sales.
        result.append(get_first_last_time(label, key, 1000))

        # NOTE(review): `label` is already the first element of `result`;
        # the original appends it a second time here. Kept as-is because
        # the behaviour of `concat` / `second_feat` is not visible --
        # confirm whether the duplicate is intentional.
        result.append(label)

        print('开始合并特征...')
        result = concat(result)
        result = second_feat(result)

        # Saving is disabled; to re-enable caching, also flip `use_cache`:
        # result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    print('特征矩阵大小:{}'.format(result.shape))
    print('生成特征一共用时{}秒'.format(time.time() - t0))
    return result
Exemple #7
0
def make_feats(end_date):
    """Build the feature matrix for the sample window keyed by ``end_date``.

    Feature blocks are produced by ``get_*`` helpers defined outside this
    snippet; descriptions below follow the original inline comments. The
    blocks are appended in exactly the original order, merged with the
    project's ``concat`` and post-processed by ``second_feat``.

    Relies on names defined elsewhere in the module: ``cache_path``,
    ``load`` (flag enabling the on-disk cache) and the filter objects
    ``holiday_no``, ``holiday_yes``, ``week0``..``week5``, ``weekend_no``,
    ``weekend_yes``, ``onpromotion_yes``, ``onpromotion_no``.

    Args:
        end_date: Key of the window; passed unchanged to the helpers and
            used to build the cache file name.

    Returns:
        The object returned by ``second_feat`` (it must expose ``.shape``),
        or the cached frame when ``load`` is truthy and the file exists.
    """
    t0 = time.time()
    print('time key:{}'.format(end_date))
    result_path = cache_path + 'train_set_{0}.hdf'.format(end_date)
    # Logical `and` instead of the original bitwise `&`; assumes `load` is
    # a bool / 0-1 flag, for which both forms agree.
    if load and os.path.exists(result_path):
        # Fixed: the original called `pd.to_hdf(...)` here, which is not a
        # top-level pandas function (and would write, not read), so the
        # cache branch crashed whenever it was taken. 'w' is the key used
        # by the save step at the bottom of this function.
        result = pd.read_hdf(result_path, 'w')
    else:
        print('add label')
        label = get_label(end_date)
        print('make feature...')
        result = []

        # ---- store x item sums over 140 days, split by filter ----
        # Each filter contributes an unweighted block followed by a block
        # weighted by i**6 / 1000. The weight list is rebuilt for every
        # call, as in the original, in case the helper mutates it.
        filtered_groups = (
            (holiday_no, 'holiday_no'),
            (holiday_yes, 'holiday_yes'),
            (week0, 'week0'),
            (week1, 'week1'),
            (week2, 'week2'),
            (week3, 'week3'),
            (week4, 'week4'),
            (week5, 'week5'),
            (weekend_no, 'weekend_no'),
            (weekend_yes, 'weekend_yes'),
            (onpromotion_yes, 'onpromotion_yes'),
            (onpromotion_no, 'onpromotion_no'),
        )
        for mask, mask_name in filtered_groups:
            result.append(get_sum_of_store_item(
                label, end_date, 140, None, mask, mask_name))
            result.append(get_sum_of_store_item(
                label, end_date, 140,
                [i ** 6 / 1000 for i in range(140)], mask, mask_name))

        # ---- store x item sums over 140 days with different weights ----
        weight_vectors = (
            [i for i in range(140)],
            [i + 140 for i in range(140)],
            [i + 280 for i in range(140)],
            [i ** 2 for i in range(140)],
            [i ** 3 for i in range(140)],
            [i ** 4 for i in range(140)],
            [i ** 5 for i in range(140)],
            [i ** 6 / 1000 for i in range(140)],
        )
        for weights in weight_vectors:
            result.append(get_sum_of_store_item(label, end_date, 140, weights))

        # ---- item sales grouped by state / type / cluster, and store
        # sales grouped by class, over trailing windows ----
        grouped_sum_funcs = (
            get_sum_of_item_state,
            get_sum_of_item_type,
            get_sum_of_item_cluster,
            get_sum_of_store_class,
        )
        for feat_func in grouped_sum_funcs:
            for n_day in (7, 14, 28, 42, 70, 140):
                result.append(feat_func(label, end_date, n_day))

        # ---- transactions and oil price over trailing windows ----
        for feat_func in (get_sum_of_trans, get_sum_of_oil):
            for n_day in (7, 28, 42, 70, 140):
                result.append(feat_func(label, end_date, n_day))

        result.append(get_item_info(label))  # item information
        result.append(get_store_info(label))  # store information

        # ---- same store, same family ----
        for n_day in (7, 14, 28, 42, 70, 140):
            result.append(get_sum_of_store_family(label, end_date, n_day))

        # ---- same city / all stores, same item ----
        for feat_func in (get_sum_of_item_city, get_sum_of_item):
            for n_day in (7, 14, 21, 28, 42, 70, 140):
                result.append(feat_func(label, end_date, n_day))

        # Daily values over the last 30 days.
        result.append(get_lastdays_of_st(label, end_date, 30))
        # Plain store x item sums (window 1 was disabled in the original).
        for n_day in (3, 7, 14, 21, 28, 42, 70, 98, 140):
            result.append(get_sum_of_store_item(label, end_date, n_day))

        # Promotion flags (argument 16).
        result.append(get_lastdays_of_prom(label, end_date, 16))

        # Promotion counts; the order 280-before-140 is the original's.
        for n_day in (14, 28, 35, 70, 280, 140):
            result.append(get_sum_of_prom(label, end_date, n_day))
        # get_sum_of_week(label, end_date, 140) was disabled in the original.

        # Start and end time of the last promotion.
        result.append(get_promo_of_store_item(label, end_date))

        result.append(label)

        print('concat feature...')
        result = concat(result).reindex()
        result = second_feat(result)

        # Saving is disabled in the original:
        # result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    print('shape:{}'.format(result.shape))
    print('used {} second'.format(time.time() - t0))
    return result