set(train_df[train_df['user_id'].isin( qtime_user_df['user_id'])]['user_id'])))) train_x = train_df[train_df.columns.difference( ['user_id', 'item_id', 'label', 'truth_item_id'])].values train_y = train_df['label'].values valid_df = valid_df.sort_values('sim').reset_index(drop=True) valid_x = valid_df[valid_df.columns.difference( ['user_id', 'item_id', 'label', 'truth_item_id'])].values valid_y = valid_df['label'].values ''' 模型训练 ''' time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) print('------------------------ 模型训练 start time:{}'.format(time_str)) # model = rank_rf(train_x, train_y) model = rank_xgb(train_x, train_y) one_train_auc = roc_auc_score(train_y, model.predict_proba(train_x)[:, 1]) train_auc += one_train_auc print('train set: auc:{}'.format(one_train_auc)) with open('./cache/model.pickle', 'wb') as f: pickle.dump(model, f) ''' 模型验证 ''' time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) print('------------------------ 模型验证 start time:{}'.format(time_str)) pre_y = model.predict_proba(valid_x)[:, 1] one_valid_auc = roc_auc_score(valid_y, pre_y) valid_auc += one_valid_auc print('valid set: auc:{}'.format(one_valid_auc)) answer = make_answer(valid_df[valid_df['label'] == 1], hot_df, phase=1)
def _get_model(params):
    """Score one hyperparameter setting by repeated train/validate rounds.

    Runs ``conf.k`` rounds. Each round splits the feature frame, fits a model
    with ``rank.rank_xgb`` on the training part, and evaluates the validation
    part twice with ``eval.my_eval``: once ranked by the raw ``sim`` column and
    once ranked by the model's positive-class probability. The per-round
    scores and the validation AUC are averaged over the rounds.

    Args:
        params: Mapping holding the inputs and the hyperparameters.
            ``"feature"`` is the feature DataFrame (it must contain the
            ``user_id``, ``item_id``, ``label`` and ``sim`` columns used
            below) and ``"hot_df"`` is forwarded to ``eval.make_answer``.
            The optional keys ``eta``, ``min_child_weight``, ``max_depth``,
            ``gamma``, ``subsample``, ``colsample_bytree``, ``reg_lambda``,
            ``scale_pos_weight``, ``tree_method`` and ``n_estimators`` are
            forwarded to ``rank.rank_xgb``; a missing key is passed as
            ``None``. ``max_depth`` and ``n_estimators`` are cast to ``int``
            when present.

    Returns:
        The negated element ``[2]`` of (average model score - average ``sim``
        score), i.e. the value printed as "ndcg half gain" with its sign
        flipped -- presumably so that a minimising optimiser maximises the
        gain (confirm against the caller).
    """
    feature_df = params.get("feature")
    hot_df = params.get("hot_df")

    # Hyperparameters for rank.rank_xgb. dict.get yields None for absent keys,
    # which is what the callee receives when a value is not being tuned.
    xgb_kwargs = {
        'eta': params.get('eta'),
        'min_child_weight': params.get('min_child_weight'),
        # Cast only when supplied; the search space may hand over floats
        # (assumption -- the caller is not visible here).
        'max_depth': int(params['max_depth']) if 'max_depth' in params else None,
        'gamma': params.get('gamma'),
        'subsample': params.get('subsample'),
        'colsample_bytree': params.get('colsample_bytree'),
        'reg_lambda': params.get('reg_lambda'),
        'scale_pos_weight': params.get('scale_pos_weight'),
        'tree_method': params.get('tree_method'),
        'n_estimators': (int(params['n_estimators'])
                         if 'n_estimators' in params else None),
    }

    # Columns that identify a row or carry the target; everything else is
    # used as a model input.
    non_feature_cols = ['user_id', 'item_id', 'label']

    valid_auc = 0
    # Accumulators for the score vectors returned by eval.my_eval
    # (assumed to be 5 values, or broadcastable to 5 -- TODO confirm).
    pre_score_arr = np.zeros(5)
    rank_score_arr = np.zeros(5)

    for _ in range(conf.k):
        # Train / validation split.
        # NOTE(review): the seed is the constant 1 on every round, so if
        # featuring.train_test_split is deterministic for a given seed all
        # conf.k rounds see the same split -- confirm this is intended.
        train_df, valid_df = featuring.train_test_split(feature_df, seed=1)

        train_x = train_df[train_df.columns.difference(non_feature_cols)].values
        train_y = train_df['label'].values

        valid_df = valid_df.sort_values('sim').reset_index(drop=True)
        valid_x = valid_df[valid_df.columns.difference(non_feature_cols)].values
        valid_y = valid_df['label'].values

        # Model training.
        model = rank.rank_xgb(train_x, train_y, **xgb_kwargs)

        # Model validation: positive-class probability for each candidate row.
        pre_y = model.predict_proba(valid_x)[:, 1]
        valid_auc += roc_auc_score(valid_y, pre_y)

        # Ground truth is built from the positively labelled validation rows.
        answer = eval.make_answer(valid_df[valid_df['label'] == 1], hot_df,
                                  phase=1)
        # Baseline: rank candidates by the raw 'sim' value.
        pre_score_arr += eval.my_eval(list(valid_df['sim']), valid_df, answer,
                                      print_mark=False)
        # Candidate: rank candidates by the model's predicted probability.
        rank_score_arr += eval.my_eval(pre_y, valid_df, answer,
                                       print_mark=False)

    avg_valid_auc = valid_auc / conf.k
    avg_pre_ndcg = pre_score_arr / conf.k
    avg_rank_ndcg = rank_score_arr / conf.k
    # Element-wise improvement of the model ranking over the 'sim' baseline.
    diff = avg_rank_ndcg - avg_pre_ndcg
    print('avg valid auc:{}, ndcg full gain:{}, ndcg half gain:{}'.format(
        avg_valid_auc, diff[0], diff[2]))

    return -diff[2]