Example #1
    print('------------------------ Model training start time:{}'.format(time_str))
    # submit = train_model_lgb(feature_all, recall_rate=hit_rate, hot_list=hot_list, valid=0.2, topk=50, num_boost_round=1, early_stopping_rounds=1)
    # submit = train_model_rf(train_test, recall_rate=1, hot_list=hot_list, valid=0.2, topk=50)
    model = rank_rf(train_x, train_y)
    # model = rank_xgb(train_x, train_y)
    print('train set: auc:{}'.format(
        roc_auc_score(train_y,
                      model.predict_proba(train_x)[:, 1])))
    with open('./cache/rf.pickle', 'wb') as f:
        pickle.dump(model, f)
    ''' Model validation '''
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('------------------------ Model validation start time:{}'.format(time_str))
    pre_y = model.predict_proba(valid_x)[:, 1]
    print('valid set: auc:{}'.format(roc_auc_score(valid_y, pre_y)))
    answer = make_answer(valid_df[valid_df['label'] == 1], hot_df, phase=1)
    my_eval(pre_y, valid_df, answer)

    # ''' qtime submission-set features '''
    # for phase in range(0, conf.now_phase + 1):
    #     pass

    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('------------------------ underexpose_test_qtime prediction start time:{}'.
          format(time_str))
    submit_all = pd.DataFrame()
    for phase in range(0, conf.now_phase + 1):
        print('----------------------- phase:{} -------------------------'.
              format(phase))
        if conf.is_recall_cached:
            one_phase_recall_item_df = \
                pd.read_csv(conf.recall_cache_path.format(phase), dtype={'user_id': np.int64, 'item_id': np.int64})
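
The snippet above is cut off inside the per-phase loop; before that it persists the trained random-forest ranker to ./cache/rf.pickle. Below is a minimal sketch of reloading that pickle and scoring recall candidates, assuming the candidate frame (here called candidate_df, a hypothetical name) carries the same feature columns used at training time:

import pickle

with open('./cache/rf.pickle', 'rb') as f:
    model = pickle.load(f)

# Score each (user_id, item_id) candidate with the positive-class probability.
feature_cols = candidate_df.columns.difference(['user_id', 'item_id', 'label'])
candidate_df['score'] = model.predict_proba(candidate_df[feature_cols].values)[:, 1]

# Keep the 50 highest-scoring items per user, mirroring the topk=50 used above.
top50 = (candidate_df.sort_values('score', ascending=False)
         .groupby('user_id')
         .head(50))
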
Example #2
import numpy as np
from sklearn.metrics import roc_auc_score
# Project modules (conf, featuring, rank, eval) are assumed to be importable.


def _get_model(params):
    """Train rank.rank_xgb with the given hyper-parameters and return the
    negative NDCG half gain (suitable as a minimisation objective)."""
    feature_df = params.get("feature")
    hot_df = params.get("hot_df")

    # Hyper-parameters; keys missing from `params` stay None.
    eta = params.get('eta')
    min_child_weight = params.get('min_child_weight')
    max_depth = int(params['max_depth']) if 'max_depth' in params else None
    gamma = params.get('gamma')
    subsample = params.get('subsample')
    colsample_bytree = params.get('colsample_bytree')
    reg_lambda = params.get('reg_lambda')
    scale_pos_weight = params.get('scale_pos_weight')
    tree_method = params.get('tree_method')
    # Cast to int because a search library may supply this as a float.
    n_estimators = int(params['n_estimators']) if 'n_estimators' in params else None

    train_auc = valid_auc = 0
    pre_score_arr = np.zeros(5).reshape(-1, )
    rank_score_arr = np.zeros(5).reshape(-1, )
    for i in range(conf.k):
        ''' Train/validation split '''
        # NOTE: the seed is fixed, so every iteration re-uses the same split;
        # pass a varying seed (e.g. seed=i) for genuinely repeated hold-out.
        train_df, valid_df = featuring.train_test_split(feature_df, seed=1)

        train_x = train_df[train_df.columns.difference(
            ['user_id', 'item_id', 'label'])].values
        train_y = train_df['label'].values

        valid_df = valid_df.sort_values('sim').reset_index(drop=True)
        valid_x = valid_df[valid_df.columns.difference(
            ['user_id', 'item_id', 'label'])].values
        valid_y = valid_df['label'].values
        ''' Model training '''
        model = rank.rank_xgb(train_x,
                              train_y,
                              eta=eta,
                              min_child_weight=min_child_weight,
                              max_depth=max_depth,
                              gamma=gamma,
                              subsample=subsample,
                              colsample_bytree=colsample_bytree,
                              reg_lambda=reg_lambda,
                              scale_pos_weight=scale_pos_weight,
                              tree_method=tree_method,
                              n_estimators=n_estimators)
        one_train_auc = roc_auc_score(train_y,
                                      model.predict_proba(train_x)[:, 1])
        train_auc += one_train_auc
        ''' Model validation '''
        pre_y = model.predict_proba(valid_x)[:, 1]
        one_valid_auc = roc_auc_score(valid_y, pre_y)
        valid_auc += one_valid_auc
        answer = eval.make_answer(valid_df[valid_df['label'] == 1],
                                  hot_df,
                                  phase=1)

        pre_score_arr += eval.my_eval(list(valid_df['sim']),
                                      valid_df,
                                      answer,
                                      print_mark=False)
        rank_score_arr += eval.my_eval(pre_y,
                                       valid_df,
                                       answer,
                                       print_mark=False)

    avg_valid_auc = valid_auc / conf.k
    avg_pre_ndcg = pre_score_arr / conf.k
    avg_rank_ndcg = rank_score_arr / conf.k
    diff = avg_rank_ndcg - avg_pre_ndcg
    print('avg valid auc:{}, ndcg full gain:{}, ndcg half gain:{}'.format(
        avg_valid_auc, diff[0], diff[2]))

    return -diff[2]
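
Because _get_model takes a flat params dict and returns a single scalar to minimise (the negative NDCG half gain), it fits the objective-function shape expected by hyper-parameter tuners. Below is a minimal sketch using hyperopt, assuming feature_df and hot_df are already built; the search-space bounds are illustrative, not taken from the original project:

from hyperopt import Trials, fmin, hp, tpe

# Illustrative search space; hp.quniform yields floats, hence the int() casts in _get_model.
space = {
    'eta': hp.uniform('eta', 0.01, 0.3),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'n_estimators': hp.quniform('n_estimators', 100, 500, 50),
}

trials = Trials()
best = fmin(
    # Pass the fixed DataFrames alongside each sampled hyper-parameter set.
    fn=lambda p: _get_model({**p, 'feature': feature_df, 'hot_df': hot_df}),
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)
print('best hyper-parameters:', best)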