def gen_level_aggs(col, updata=False):
    feat_path = os.path.join(feats_root, 'level_aggs_{}.pkl'.format(col))
    if os.path.exists(feat_path) and not updata:
        print('Found ' + feat_path)
    else:
        print('Generating ' + feat_path)
        dfal = get_nominal_dfal()[[col, 'da'] + level_cols]
        dmin = dfal.da.min()
        level_agg = None
        for da in sorted(dfal.da.unique())[1:]:
            da_agg = None
            # aggregate over 1-, 2- and 3-day windows ending the day before `da`
            for win_das in [1, 2, 3]:
                if da - win_das < dmin:
                    continue
                agg = gen_level_agg_features(dfal, da, win_das, col)
                print('Generated {} {} {}'.format(col, da, win_das))
                da_agg = agg if da_agg is None else da_agg.merge(agg, how='outer')
            level_agg = da_agg if level_agg is None else pd.concat([level_agg, da_agg], axis=0)
        level_agg.fillna(0, inplace=True)
        level_agg, _ = reduce_mem_usage(level_agg)
        print(level_agg.shape)
        dump_pickle(level_agg, feat_path)
def gen_buy_count(file_name='train'):
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    cols = ['user_id', 'item_id', 'item_brand_id', 'second_cate', 'shop_id']
    data_select = pd.DataFrame()
    if file_name == 'train':
        for col in cols:
            feature_str = col + '_buy_count'
            buy_all = None
            for day in data.day.unique():
                # conversions of this key on all strictly earlier days
                buy_filter = data.loc[data.day < day, [col, 'is_trade']]
                col_buy_count = buy_filter.groupby([col]).sum().iloc[:, 0]
                today_data = data.loc[data.day == day, [col]]
                today_data[feature_str] = today_data.apply(
                    lambda x: col_buy_count[x[col]] if x[col] in col_buy_count.index else -1,
                    axis=1)
                buy_all = pd.concat([buy_all, today_data], axis=0)
            data_select[feature_str] = buy_all[feature_str]
    else:
        # for the test split, count conversions over the whole training period
        train_data = load_pickle(path=raw_data_path + 'train' + '.pkl')
        for col in cols:
            feature_str = col + '_buy_count'
            buy_filter = train_data.loc[train_data.day <= 24, [col, 'is_trade']]
            col_buy_count = buy_filter.groupby([col]).sum().iloc[:, 0]
            data_select[feature_str] = data.apply(
                lambda x: col_buy_count[x[col]] if x[col] in col_buy_count.index else -1,
                axis=1)
    dump_pickle(data_select, feature_data_path + file_name + '_buy_count')
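# Hedged aside (not part of the original pipeline): for the 'train' branch above,
# the "conversions on strictly earlier days" feature can be built without the
# nested per-day loop. The helper name below is hypothetical; -1 marks keys with
# no prior history, which should match the loop's fallback.
def prior_buy_count_sketch(data, col):
    daily = (data.groupby([col, 'day'])['is_trade'].sum()
                 .groupby(level=0).cumsum()
                 .groupby(level=0).shift(1)          # exclude the current day
                 .rename(col + '_buy_count').reset_index())
    return data[[col, 'day']].merge(daily, on=[col, 'day'], how='left').fillna(-1)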
def gen_user_feature_click_hour(update=True):
    """Generate each user's click count in the current hour for every categorical attribute.
    file_name: user_(feature_id)_click_hour.pkl
    features: 'user_item_id_click_hour', 'user_item_brand_id_click_hour',
              'user_context_page_id_click_hour', 'user_shop_id_click_hour',
    """
    data = load_pickle(raw_data_path + 'all_data.pkl')
    feature_list = [
        'item_id', 'item_brand_id', 'item_city_id', 'category2_label',
        'item_price_level', 'item_sales_level', 'item_collected_level',
        'item_pv_level', 'context_page_id', 'shop_id',
        'shop_review_num_level', 'shop_star_level',
    ]
    for feature in tqdm(feature_list):
        feature_path = feature_data_path + 'user_' + feature + '_click_hour.pkl'
        if os.path.exists(feature_path) and not update:
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            user_feature_click_hour = data.groupby(
                ['user_id', 'day', 'hour', feature]).size().reset_index().rename(
                    columns={0: 'user_' + feature + '_click_hour'})
            dump_pickle(user_feature_click_hour, feature_path)
def gen_feature_click_stats(update=True):
    """Generate daily click-count statistics for each categorical attribute.
    file_name: (feature)_click_day_stats.pkl
    example:
        user_id_click_day_mean  -- average number of clicks per day for the user
        item_id_click_day_max   -- maximum single-day click count of the item
    features:
        'user_id_click_day_mean', 'user_id_click_day_max', 'user_id_click_day_min',
        'item_id_click_day_mean', 'item_id_click_day_max', 'item_id_click_day_min',
        'item_brand_id_click_day_mean', 'item_brand_id_click_day_max', 'item_brand_id_click_day_min',
        'shop_id_click_day_mean', 'shop_id_click_day_max', 'shop_id_click_day_min',
        'context_page_id_click_day_mean', 'context_page_id_click_day_max', 'context_page_id_click_day_min',
        'category2_label_click_day_mean', 'category2_label_click_day_max', 'category2_label_click_day_min'
    """
    data = load_pickle(raw_data_path + 'all_data.pkl')
    stats_feature = ['user_id', 'item_id', 'item_brand_id', 'shop_id']
    for feature in tqdm(stats_feature):
        feature_path = feature_data_path + feature + '_click_day_stats.pkl'
        if os.path.exists(feature_path) and not update:
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            feature_stats = gen_feature_click_day_stats(data, feature)
            print(feature_stats.columns)
            dump_pickle(feature_stats, feature_path)
def gen_item_stats_feature(updata=False):
    feat_path = os.path.join(feats_root, 'item_click_stats.pkl')
    if os.path.exists(feat_path) and not updata:
        print('Found ' + feat_path)
    else:
        dfal = get_nominal_dfal()
        dfal = add_item_total_da_click(dfal)
        dfal = add_item_da_feature_click(dfal)
        print('Generating ' + feat_path)
        columns_da = list(filter(lambda x: x.endswith('_click_da'), dfal.columns.values))
        tbar = tqdm(columns_da)
        for col in tbar:
            tbar.set_description('add_item_click_stats ' + col)
            dfal = gen_item_click_stats(dfal, col)
        print('add_item_click_stats completed.')
        feat_names = list(filter(lambda x: '_click_da_' in x, dfal.columns.values))
        dfal = dfal[feat_names + ['item_id']].drop_duplicates(['item_id'])
        print('gen_item_stats_feature shape:', dfal.shape)
        dump_pickle(dfal, feat_path)
    print('gen_item_stats_feature completed.')
def get_vae_vs_beta(epochs, model, model_name, data_dir, betas):
    for beta in betas:
        beta_str = f"{int(beta):04d}" if beta > 1 else f"{beta:.1f}"
        vae = model(dim_z=cf_latent_dim,
                    dim_x=(cf_img_size, cf_img_size, 3),
                    learning_rate=0.0001,
                    kl_weight=beta)
        # always start from the warmed-up "beta=1, 20 epochs" weights
        sv_path = os.path.join(data_dir, "0001")
        vae.load_model(sv_path, 20)
        vae.compile(optimizer=vae.optimizer, loss=vae.partial_vae_loss)
        train_history = vae.fit(train_dataset, epochs=epochs, verbose=0,
                                validation_data=test_dataset)
        history = train_history.history
        ut.dump_pickle(os.path.join(data_dir, f"history{model_name}_{beta_str}.pkl"),
                       (history, betas, epochs))
        sv_path = os.path.join(data_dir, f"{beta_str}")
        make_dir(sv_path)
        print('save model')
        vae.save_model(sv_path, epochs)
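# Hypothetical usage of the sweep above (model class, directory and beta grid are
# illustrative assumptions, not from the original code): every run warm-starts
# from the beta=1 checkpoint expected under data_dir/"0001".
# get_vae_vs_beta(epochs=40, model=kcv.K_PCVAE, model_name="K_PCVAE",
#                 data_dir="saved_models/pcvae", betas=[0.5, 1.0, 4, 10])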
def overtrain_vae(model, model_name, data_dir, params, epochs):
    make_dir(data_dir)
    vae = model(dim_z=params['z_dim'],
                dim_x=params['x_dim'],
                learning_rate=0.0001,
                kl_weight=params['kl_weight'])
    vae.compile(optimizer=vae.optimizer, loss=vae.partial_vae_loss)
    train_history = vae.fit(train_dataset, epochs=epochs, verbose=1,
                            validation_data=test_dataset)
    history = train_history.history
    filename = f"overtrain-{model_name}-kl_weight{params['kl_weight']:03d}.pkl"
    ut.dump_pickle(os.path.join(data_dir, filename), (history, params))
    sv_path = os.path.join(data_dir, f"kl_weight{params['kl_weight']:03d}")
    make_dir(sv_path)
    print('save model')
    vae.save_model(sv_path, epochs)
def denote_direction(input_path, output_dir):
    data_dsc = utils.load_pickle(input_path)
    data_dsc.loc[:, "fut_ret_direction"] = np.nan
    is_pos = data_dsc.loc[:, "fut_ret"] > 0
    data_dsc.loc[is_pos, "fut_ret_direction"] = 1
    data_dsc.loc[~is_pos, "fut_ret_direction"] = -1
    utils.dump_pickle(data_dsc, os.path.join(output_dir, "data_dsc.pkl"))
def load_data(start_day=23, end_day=26, load_test=False):
    """Load the data joined from the base tables; set load_test=True for the test table."""
    if load_test:
        trans_train_path = feature_data_path + 'trans_test_' + str(start_day) + '_' + str(end_day) + '.pkl'
        raw_train_path = raw_data_path + 'test.pkl'
    else:
        trans_train_path = feature_data_path + 'trans_train_' + str(start_day) + '_' + str(end_day) + '.pkl'
        raw_train_path = raw_data_path + 'train.pkl'
    if os.path.exists(trans_train_path):
        print('found ' + trans_train_path)
        train = load_pickle(trans_train_path)
    else:
        print('generating ' + trans_train_path)
        train = load_pickle(raw_train_path)
        train = addTime(train)
        train = train[(train.clickDay >= start_day) & (train.clickDay <= end_day)]
        train = addAd(train)
        train = addPosition(train)
        train = addAppCategories(train)
        train = addUserInfo(train)
        dump_pickle(train, trans_train_path)
    return train
def gen_user_search_count(file_name):
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    data = data.loc[:, ['user_id', 'item_id', 'shop_id', 'day', 'hour', 'second_cate']]
    data_select = pd.DataFrame()

    # aggregate search counts per (user, day)
    user_day_search = data.groupby(['user_id', 'day']).count().iloc[:, 0]
    # build the (user_id, day) key of every row to look up its aggregated count
    x = data.loc[:, ('user_id', 'day')].values
    k = user_day_search.loc[[tuple(i) for i in x]]
    data_select['user_day_search'] = k.values

    user_hour_search = data.groupby(['user_id', 'day', 'hour']).count().iloc[:, 0]
    x = data.loc[:, ('user_id', 'day', 'hour')].values
    k = user_hour_search.loc[[tuple(i) for i in x]]
    data_select['user_hour_search'] = k.values

    user_day_item_search = data.groupby(['user_id', 'day', 'item_id']).count().iloc[:, 0]
    x = data.loc[:, ('user_id', 'day', 'item_id')].values
    k = user_day_item_search.loc[[tuple(i) for i in x]]
    data_select['user_day_item_search'] = k.values

    user_hour_item_search = data.groupby(['user_id', 'day', 'hour', 'item_id']).count().iloc[:, 0]
    x = data.loc[:, ('user_id', 'day', 'hour', 'item_id')].values
    k = user_hour_item_search.loc[[tuple(i) for i in x]]
    data_select['user_hour_item_search'] = k.values

    user_day_shop_search = data.groupby(['user_id', 'day', 'shop_id']).count().iloc[:, 0]
    x = data.loc[:, ('user_id', 'day', 'shop_id')].values
    k = user_day_shop_search.loc[[tuple(i) for i in x]]
    data_select['user_day_shop_search'] = k.values

    user_hour_shop_search = data.groupby(['user_id', 'day', 'hour', 'shop_id']).count().iloc[:, 0]
    x = data.loc[:, ('user_id', 'day', 'hour', 'shop_id')].values
    k = user_hour_shop_search.loc[[tuple(i) for i in x]]
    data_select['user_hour_shop_search'] = k.values

    user_day_cate_search = data.groupby(['user_id', 'day', 'second_cate']).count().iloc[:, 0]
    x = data.loc[:, ('user_id', 'day', 'second_cate')].values
    k = user_day_cate_search.loc[[tuple(i) for i in x]]
    data_select['user_day_cate_search'] = k.values

    user_hour_cate_search = data.groupby(['user_id', 'day', 'hour', 'second_cate']).count().iloc[:, 0]
    x = data.loc[:, ('user_id', 'day', 'hour', 'second_cate')].values
    k = user_hour_cate_search.loc[[tuple(i) for i in x]]
    data_select['user_hour_cate_search'] = k.values

    dump_pickle(data_select, feature_data_path + file_name + '_user_search_count')
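# Hedged aside: the per-key counts above can also be obtained with
# groupby(...).transform('size'), which keeps row alignment automatically and
# avoids building tuple keys by hand. The function name is hypothetical and only
# two of the eight columns are shown as an illustration.
def gen_user_search_count_transform_sketch(data):
    out = pd.DataFrame(index=data.index)
    out['user_day_search'] = data.groupby(['user_id', 'day'])['user_id'].transform('size')
    out['user_hour_search'] = data.groupby(['user_id', 'day', 'hour'])['user_id'].transform('size')
    return out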
def main():
    parse = argparse.ArgumentParser()
    parse.add_argument('-A', '--A', type=float, default=0.55, help='A')
    parse.add_argument('-B', '--B', type=float, default=1.5, help='B')
    args = parse.parse_args()
    data_path = 'data/deeplearning_data/xml_data/'
    train_pid_label = load_pickle(data_path + 'train_label.pkl')
    index_label = load_pickle('data/baseline_data/xml_data/all_labels.pkl')
    baseline_inv_prop_file = 'data/baseline_data/xml_data/inv_prop.txt'
    train_label = list(train_pid_label.values())
    train_label = np.concatenate(train_label).tolist()
    label_frequency = dict(Counter(train_label))
    labels, fre = zip(*label_frequency.items())
    fre = np.array(fre)
    N = len(train_pid_label)
    C = (math.log(N) - 1) * (args.B + 1) ** args.A
    inv_prop = 1 + C * (fre + args.B) ** (-args.A)
    inv_prop_dict = dict(zip(labels, inv_prop.tolist()))
    dump_pickle(inv_prop_dict, data_path + 'inv_prop_dict.pkl')
    # write the baseline inverse propensities, one label per line
    with open(baseline_inv_prop_file, 'w') as df:
        for l_ in index_label[:-1]:
            df.write(str(inv_prop_dict[l_]))
            df.write('\n')
        df.write(str(inv_prop_dict[index_label[-1]]))
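# Worked example of the inverse-propensity formula in main() (illustrative numbers,
# not taken from the real label set): rarer labels receive larger weights.
def _inv_prop_example():
    import math
    N, A, B = 1000, 0.55, 1.5
    C = (math.log(N) - 1) * (B + 1) ** A   # ~9.78
    rare = 1 + C * (5 + B) ** (-A)         # label seen 5 times   -> ~4.5
    frequent = 1 + C * (500 + B) ** (-A)   # label seen 500 times -> ~1.3
    return rare, frequent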
def gen_category_predict_rank(update=True):
    '''Generate the rank of the actual category within the predicted categories.
    file_name: category_predict_rank.pkl
    features: category_predict_rank
    '''
    all_data = load_pickle(raw_data_path + 'all_data.pkl')
    feature_path = feature_data_path + 'category_predict_rank.pkl'
    if os.path.exists(feature_path) and not update:
        print('found ' + feature_path)
    else:
        print('generating ' + feature_path)
        all_data['category_predict_rank'] = all_data.apply(
            lambda row: get_category_predict_rank(
                row['item_category_list'], row['predict_category_property']),
            axis=1)
        all_data['category_3'] = all_data.apply(
            lambda row: get_category_3_predict_rank(
                row['item_category_list'], row['predict_category_property']),
            axis=1)
        all_data = all_data[['category_predict_rank', 'category_3']]
        dump_pickle(all_data, feature_path)
def gen_property_sim(update=True):
    '''Generate the similarity between the actual properties and the predicted properties.
    file_name: property_sim.pkl
    features: property_sim
    '''
    all_data = load_pickle(raw_data_path + 'all_data.pkl')
    feature_path = feature_data_path + 'property_sim.pkl'
    if os.path.exists(feature_path) and not update:
        print('found ' + feature_path)
    else:
        print('generating ' + feature_path)
        all_data['property_sim'] = all_data.apply(
            lambda row: get_property_sim(row['item_category_list'],
                                         row['item_property_list'],
                                         row['predict_category_property']),
            axis=1)
        all_data = all_data[['property_sim']]
        dump_pickle(all_data, feature_path)
def gen_hist_cvr_smooth(start_day, end_day, key, alpha=0.25):
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    gc.collect()
    data = addTime(data)
    data = addAd(data)
    data = addPosition(data)
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + key + '_histcvr_smooth_day_' + str(day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            dfCvr = data[data.clickDay < day]
            dfCvr = pd.get_dummies(dfCvr, columns=['label'], prefix='label')
            dfCvr = dfCvr.groupby([key], as_index=False).sum()
            # additively smoothed historical conversion rate
            dfCvr[key + '_cvr'] = (dfCvr['label_1'] + alpha) / (
                dfCvr['label_0'] + dfCvr['label_1'] + alpha * 2)
            sub_data = pd.merge(data.loc[data.clickDay == day, ['clickDay', key]],
                                dfCvr[[key, key + '_cvr']], 'left', on=[key])
            sub_data.drop_duplicates(['clickDay', key], inplace=True)
            sub_data.sort_values(['clickDay', key], inplace=True)
            dump_pickle(sub_data[['clickDay', key, key + '_cvr']], feature_path)
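# Worked example of the additive smoothing above (illustrative counts): with
# alpha = 0.25, a key with 1 conversion out of 3 prior clicks gets
#     (1 + 0.25) / (3 + 2 * 0.25) = 1.25 / 3.5 ≈ 0.357   (raw rate 0.333),
# a key with 0 conversions out of 2 clicks gets 0.25 / 2.5 = 0.1 instead of 0,
# and keys with no history at all are left NaN by the left merge.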
def gen_user_feature_click_hour():
    """Generate each user's click count in the current hour for every categorical attribute."""
    data = load_pickle(raw_data_path + 'all_data_4567.pkl')
    feature_list = [
        'category2_label', 'category3_label', 'shop_id', 'item_id',
        'item_brand_id', 'context_page_id', 'item_price_level_bin',
        'item_sales_level_bin', 'item_property_topic_k_15',
    ]
    for feature in tqdm(feature_list):
        feature_path = feature_data_path + '_2_1_' + 'user_' + feature + '_click_hour.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            user_feature_click_hour = data.groupby(
                ['user_id', 'day', 'hour', feature]).size().reset_index().rename(
                    columns={0: 'user_' + feature + '_click_hour'})
            dump_pickle(user_feature_click_hour, feature_path)
def comparing_with_ground_truth(tops, txt_infos, k):
    utils.dump_pickle("result.pkl", tops)
    gt = utils.get_pickle("datasets/qst1_w4/gt_corresps.pkl")
    hypo = utils.get_pickle("result.pkl")
    mapAtK = metrics.mapk(gt, hypo, k)
    print("\nMap@" + str(k) + " is " + str(mapAtK))

    bbs_gt = np.asarray(utils.get_groundtruth("datasets/qst1_w4/text_boxes.pkl")).squeeze()
    bbs_predicted = [[painting.boundingxy for painting in txt_info]
                     for txt_info in txt_infos]
    mean_iou = utils.get_mean_IoU(bbs_gt, bbs_predicted)
    print("Mean Intersection over Union: ", mean_iou)

    texts_gt = utils.get_gt_text("datasets/qst1_w4")
    texts_predicted = [[painting.text for painting in txt_info]
                       for txt_info in txt_infos]
    with open('results.txt', 'w') as f:
        for item in texts_predicted:
            f.write("%s\n" % item)
    mean_lev = utils.compute_lev(texts_gt, texts_predicted)
    print(texts_predicted)
    print("\n")
    print(texts_gt)
    print("Mean Levenshtein distance: ", mean_lev)
def gen_bow_features_for_pool(pools, pool_idx, image_dir, feature_file_dir):
    train_files = []
    val_files = []
    test_files = []
    pool = pools['data'][str(pool_idx)]
    for path in pool['train_files']:
        train_files.append(os.path.join(image_dir, path))
    for path in pool['val_files']:
        val_files.append(os.path.join(image_dir, path))
    for path in pool['test_files']:
        test_files.append(os.path.join(image_dir, path))
    train_bow_features, val_bow_features, test_bow_features = extract_BOW_features(
        train_files, val_files, test_files)
    features = {
        'train_features': train_bow_features,
        'val_features': val_bow_features,
        'test_features': test_bow_features,
        'pool_idx': pool_idx,
    }
    filepath = get_feature_file_path(pools, pool_idx, feature_file_dir)
    dump_pickle(features, filepath)
def gen_tricks(start_day, end_day):
    """Generate the trick, first_diff, last_diff and install2click features;
    join back on global_index.
    """
    train_data = load_pickle(raw_data_path + 'train.pkl')
    test_data = load_pickle(raw_data_path + 'test.pkl')
    actions = load_pickle(raw_data_path + 'user_app_actions.pkl')
    data = train_data.append(test_data)
    del train_data, test_data
    data = addTime(data)
    data = addAd(data)
    for day in tqdm(np.arange(start_day, end_day + 1)):
        feature_path = feature_data_path + 'tricks_day_' + str(day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            df = data.loc[data.clickDay == day]
            df = add_trick(df)
            df = add_diff(df)
            df = add_install2click(df, day, actions)
            dump_pickle(df[['global_index', 'trick', 'first_diff',
                            'last_diff', 'install2click']], feature_path)
def gen_feature_click_day_hour(update=True):
    '''Compute per-day, per-hour click counts for each categorical feature in the
    loop below (user_id, item_id, item_brand_id, category labels, context_page_id,
    shop_id, item_property_topic_k_15).
    File name: _2_7_[feature]_click_day_hour.pkl
    '''
    all_data = load_pickle(raw_data_path + 'all_data_4567.pkl')
    for feature in tqdm([
            'user_id', 'item_id', 'item_brand_id', 'category2_label',
            'category3_label', 'context_page_id', 'shop_id',
            'item_property_topic_k_15'
    ]):
        feature_path = feature_data_path + '_2_7_' + feature + '_click_day_hour.pkl'  # output path
        if os.path.exists(feature_path) and not update:
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            feature_click_day_hour = all_data.groupby(
                [feature, 'day', 'hour']).size().reset_index().rename(
                    columns={0: feature + '_click_hour'})
            dump_pickle(feature_click_day_hour, feature_path)
def linear_regression(train, test, cols_x, col_y, output_dir):
    reg = LinearRegression()
    X_train = train.loc[:, cols_x].copy()
    y_train = train.loc[:, col_y].copy()
    X_test = test.loc[:, cols_x].copy()
    y_test = test.loc[:, col_y].copy()
    X_train.replace(np.nan, 0.0, inplace=True)
    X_test.replace(np.nan, 0.0, inplace=True)
    X_train = X_train.abs()
    y_train = y_train.abs()
    X_test = X_test.abs()
    y_test = y_test.abs()
    reg.fit(X_train, y_train)
    print(reg.coef_)
    yhat_test = reg.predict(X_test)
    utils.dump_pickle(yhat_test, os.path.join(output_dir, "scale_yhat_test.pkl"))
    print("in-sample R2: ", reg.score(X_train, y_train))
    print("out-of-sample R2: ", reg.score(X_test, y_test))
def gen_user_start_installed_cateA():
    """Count the apps of each top-level category that a user had initially installed.
    Join key: ['userID']
    """
    user_install = load_pickle(raw_data_path + 'user_installedapps.pkl')
    app_cate = pd.read_csv(raw_data_path + 'app_categories.csv')
    app_cate['cate_a'] = app_cate.appCategory.apply(lambda x: x // 100 if x > 100 else x)
    user_install = user_install.merge(app_cate, 'left', 'appID')
    for cate_a in tqdm(app_cate.cate_a.unique()):
        feature_path = feature_data_path + 'user_start_installed_cate_' + str(cate_a) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            user_install_cate = user_install[user_install.cate_a == cate_a][['userID', 'cate_a']]
            user_install_cate.rename(
                columns={'cate_a': 'user_start_install_cate_' + str(cate_a)},
                inplace=True)
            # note: summing the constant cate_a code yields cate_a * count,
            # i.e. a value proportional to (not equal to) the install count
            user_install_cate = user_install_cate.groupby('userID', as_index=False).sum()
            dump_pickle(user_install_cate, feature_path)
def gen_ID_global_sum_count(
        last_day=27,
        stats_features=['positionID', 'creativeID', 'appID', 'adID', 'userID']):
    train = load_pickle(raw_data_path + 'train.pkl')
    test = load_pickle(raw_data_path + 'test.pkl')
    data = train.append(test)
    data = addTime(data)
    data = data[data.clickDay <= last_day]
    del train, test
    gc.collect()
    data = addAd(data)
    data = addPosition(data)
    data = addAppCategories(data)
    for feature in tqdm(stats_features):
        feature_path = feature_data_path + 'global_count_' + feature + '_lastday' + str(last_day) + '.pkl'
        if os.path.exists(feature_path):
            print('found ' + feature_path)
        else:
            print('generating ' + feature_path)
            feature_count_sum = pd.DataFrame(
                data.groupby(feature).size()).reset_index().rename(
                    columns={0: feature + '_sum_count'})
            dump_pickle(feature_count_sum, feature_path)
def run_graph_generate(args):
    print("graph_generate")
    generate_args_base = {
        "sizes": args["cluster_sizes"],
        "p_in": args["p"],
        "p_out": args["q"],
        "seed": args["seed"],
        "cull_disconnected": args["cull_disconnected"],
        "connect_disconnected": args["connect_disconnected"],
        "generator_type": args["generator_type"],
        "out_path": None,
        "visualize": False,
    }
    graphs = {}
    num_graphs = args["num_graphs"]
    for i in range(num_graphs):
        if i == 0 or (i + 1) % 100 == 0 or i == (num_graphs - 1):
            print(f"graph {i+1}/{num_graphs}")
        generate_args = generate_args_base.copy()
        if args.get('verbose', False):
            print(f"{i}: {generate_args}")
        graph = generate_appm.main(generate_args)
        graph_id = str(uuid.uuid4())
        graphs[graph_id] = {'args': generate_args, 'graph': graph}
    out_path = (f"{args['results_base']}"
                f"/{datetime.now().strftime(TIMESTAMP_FORMAT)}.pk")
    dump_pickle(graphs, out_path)
def gen_hist_cvr_smooth(start_da, end_da, key, alpha=0.25):
    dfal = get_nominal_dfal()
    dfal = dfal.loc[dfal.da <= end_da, [key, 'da', 'is_trade']]
    gc.collect()
    for da in tqdm(np.arange(start_da, end_da + 1)):
        feat_path = os.path.join(feats_root, key + '_hist_cvr_smooth_da_' + str(da) + '.pkl')
        if os.path.exists(feat_path):
            print('found ' + feat_path)
        else:
            print('generating ' + feat_path)
            dfcv = dfal.copy().loc[dfal.da < da]
            dfcv.is_trade = dfcv.is_trade.apply(int)
            dfcv = pd.get_dummies(dfcv, columns=['is_trade'], prefix='label')
            dfcv = dfcv.groupby([key], as_index=False).sum()
            dfcv[key + '_cvr'] = (dfcv['label_1'] + alpha) / (
                dfcv['label_0'] + dfcv['label_1'] + alpha * 2)
            result = pd.merge(dfal.loc[dfal.da == da, ['da', key]],
                              dfcv.loc[:, [key, key + '_cvr']], 'left', on=[key])
            result.drop_duplicates(['da', key], inplace=True)
            result.sort_values(['da', key], inplace=True)
            dump_pickle(result.loc[:, ['da', key, key + '_cvr']], feat_path)
def gen_dfal():
    dump_nominal_file = os.path.join(utils.cache_root, 'dfda_nominal.pkl')
    dump_textual_file = os.path.join(utils.cache_root, 'dfda_textual.pkl')
    if not os.path.exists(dump_nominal_file):
        tr = pd.read_csv('./input/round1_ijcai_18_train_20180301.txt',
                         sep=' ', dtype={'is_trade': np.uint8})
        tr.is_trade = tr.is_trade.astype(np.int8)
        te = pd.read_csv('./input/round1_ijcai_18_test_b_20180418.txt', sep=' ')
        da = pd.concat([tr, te], axis=0)
        da = utils.add_time_fields(da)
        for col in utils.nominal_cate_cols + utils.identity_cols:
            da[col] = LabelEncoder().fit_transform(da[col])
        for col in utils.ordinal_cate_cols:
            levels = sorted(da[col].unique())
            da[col] = da[col].apply(lambda x: levels.index(x)).astype(np.uint8)
        del da['context_id']
        del da['context_timestamp']
        del da['ts']
        da, _ = utils.reduce_mem_usage(da)
        utils.dump_pickle(da[utils.textual_cols], dump_textual_file)
        utils.dump_pickle(da.drop(utils.textual_cols, axis=1), dump_nominal_file)
    print('gen dfal ok.')
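# Hedged aside: the ordinal pass above calls levels.index(x) per row, which is
# O(len(levels)) each time; a precomputed mapping gives the same encoding
# (assuming the same sorted level order):
#     mapping = {v: i for i, v in enumerate(sorted(da[col].unique()))}
#     da[col] = da[col].map(mapping).astype(np.uint8)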
def gen_user_search_time(file_name):
    '''Within-day time-difference features for the current search:
    time since the user's first / last search of the same item,
    the same shop, the same brand, and the same second-level category.
    '''
    data_select = pd.DataFrame()
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    cols = ['item_id', 'shop_id', 'item_brand_id', 'second_cate']
    for col in cols:
        data_filter = data[['user_id', col, 'day', 'context_timestamp']].groupby(['user_id', col, 'day'])
        max_time = data_filter.agg('max')
        min_time = data_filter.agg('min')
        x = data.loc[:, ('user_id', col, 'day')].values
        m = max_time.loc[[tuple(i) for i in x]]
        n = min_time.loc[[tuple(i) for i in x]]
        data_select['sub_maxtime_' + col] = data['context_timestamp'].values - np.squeeze(m.values)
        data_select['sub_mintime_' + col] = data['context_timestamp'].values - np.squeeze(n.values)
        data_select['sub_maxtime_' + col] = data_select['sub_maxtime_' + col].apply(lambda x: x.total_seconds())
        data_select['sub_mintime_' + col] = data_select['sub_mintime_' + col].apply(lambda x: x.total_seconds())
    dump_pickle(data_select, feature_data_path + file_name + '_user_search_time')
def gen_user_basic_info(file_name='train', test_day=24):
    data_select = pd.DataFrame()
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    data_select['user_id'] = data['user_id']
    data_select['user_gender_id'] = data['user_gender_id']
    data_select['user_age_level'] = data['user_age_level']
    data_select['user_occupation_id'] = data['user_occupation_id']
    data_select['user_star_level'] = data['user_star_level']
    # bucket the search hour: morning / afternoon / evening / before dawn
    data_select['is_morning'] = (data['hour'].values >= 8) & (data['hour'].values <= 12)
    data_select['is_afternoon'] = (data['hour'].values > 12) & (data['hour'].values <= 17)
    data_select['is_evening'] = (data['hour'].values > 17) & (data['hour'].values <= 23)
    data_select['is_before_dawn'] = (data['hour'].values < 8)
    if file_name == 'train':
        # keep is_trade for the later sampling step; drop it again before training
        data_select['is_trade'] = data['is_trade']
    dump_pickle(data_select, feature_data_path + file_name + '_user_basic_info')
def add_context_cate(data):
    # sorted (category, property) combinations seen in predict_category_property
    context_cate_cols_path = raw_data_path + 'context_cate_cols.pkl'
    if os.path.exists(context_cate_cols_path):
        print("found " + context_cate_cols_path)
        # the pickle already stores the extracted column names
        cols = load_pickle(context_cate_cols_path)
    else:
        cols = gen_sorted_search_cate_property(data)
        cols = list(map(lambda x: x[0], cols))
        dump_pickle(cols, context_cate_cols_path)
    feature_path = feature_data_path + 'context_cate_property_feat.pkl'
    data['cate_cols'] = data.predict_category_property.apply(str_to_cate_cols)
    # whether the item's concatenated category+property string is among the top 300
    for col in tqdm(cols[:300]):
        data[col] = data.cate_cols.apply(lambda x: 1 if col in x else 0)
    dump_pickle(data[['instance_id'] + cols[:300]], feature_path)
    return data
def gen_shop_da_feature_click(updata=False):
    """Generate per-day click counts of every shop crossed with each related feature."""
    dfal = get_nominal_dfal()
    stats_feat = [
        'item_category_list', 'item_brand_id', 'item_city_id', 'user_gender_id',
        'user_occupation_id', 'item_price_level', 'item_sales_level',
        'item_collected_level', 'item_pv_level', 'user_age_level',
        'user_star_level', 'context_page_id', 'item_id', 'user_id'
    ]
    tbar = tqdm(stats_feat)
    for feat in tbar:
        feat_path = os.path.join(feats_root, 'shop_' + feat + '_click_da.pkl')
        if os.path.exists(feat_path) and not updata:
            tbar.set_description('Found {:>60}'.format(os.path.basename(feat_path)))
        else:
            tbar.set_description('Generating {:>60}'.format(os.path.basename(feat_path)))
            shop_feat_click_da = dfal.groupby(
                ['shop_id', 'da', feat]).size().reset_index().rename(
                    columns={0: 'agg_shop_%s_click_da' % feat})
            dump_pickle(shop_feat_click_da, feat_path)
    print('gen_shop_da_feature_click completed.')
def gen_category(file_name='train'):
    data = load_pickle(path=raw_data_path + file_name + '.pkl')
    item_cate_col = list(data.item_category_list)
    item_cate = list(map(lambda x: x.split(';'), item_cate_col))
    # the second entry of item_category_list is the second-level category
    data['second_cate'] = list(map(lambda x: x[1], item_cate))
    dump_pickle(data, path=raw_data_path + file_name + '.pkl')
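# Hypothetical driver sketch (assumed usage, not from the original repo): several
# of the '_select' feature builders above rely on the 'second_cate' column, so
# gen_category has to run first for each split.
if __name__ == '__main__':
    for split in ('train', 'test'):
        gen_category(split)
        gen_user_basic_info(split)
        gen_user_search_count(split)
        gen_user_search_time(split)
        gen_buy_count(split)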