Example #1
def LR_offline(train_data, cv_data):
    train_Y = train_data['is_trade']
    cv_Y = cv_data['is_trade']
    
    drop_cols = ['is_trade']
    train_data.drop(drop_cols,axis=1,inplace=True)
    cv_data.drop(drop_cols,axis=1,inplace=True)
    
    fold = 5
    kf = KFold(len(train_data), n_folds = fold, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((cv_data.shape[0], fold))
    for i, (train_index, cv_index) in enumerate(kf):
        
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        
        clf = LogisticRegression(C=1.2, fit_intercept=True, max_iter=3000, class_weight={0: 0.5, 1: 0.5})
        clf.fit(X=train_feat.values, y=train_Y[train_index])
        
        predict_train = clf.predict_proba(train_feat.values)[:,1]
        predict_cv = clf.predict_proba(cv_feat.values)[:,1]
        predict_test = clf.predict_proba(cv_data.values)[:,1]
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:,i] = predict_test
        
        print('  train loss:', cal_log_loss(predict_train, train_Y[train_index]))
        print('  test loss:', cal_log_loss(predict_cv, train_Y[cv_index]))
    predict_test = np.median(test_preds,axis=1)
    print('validation loss:', cal_log_loss(predict_test, cv_Y))
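
Note: cal_log_loss is never defined in these snippets. Judging from the calls, it takes (predictions, labels) in that order and returns binary log loss; a minimal sketch under that assumption, not the original implementation:

import numpy as np

def cal_log_loss(preds, labels, eps=1e-15):
    # Binary log loss with clipping; the (preds, labels) argument order
    # matches the calls throughout these examples.
    p = np.clip(np.asarray(preds, dtype=np.float64), eps, 1 - eps)
    y = np.asarray(labels, dtype=np.float64)
    return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))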
Example #2
def FFM_model_first(day7_data, cv_data, test_data):

    day7_name = cache_pkl_path + 'FFM_day/train_7'
    cv_name = cache_pkl_path + 'FFM_day/cv'
    test_name = cache_pkl_path + 'FFM_day/test'

    name_list = [cache_pkl_path + 'FFM_day/train_' + str(i) for i in range(7)]
    data_in_process = [(train_name, day7_name, cv_name, test_name)
                       for train_name in name_list]
    len_name = len(name_list)

    predict_day7 = np.zeros((day7_data.shape[0], len_name))
    predict_cv = np.zeros((cv_data.shape[0], len_name))
    predict_test = np.zeros((test_data.shape[0], len_name))

    with multiprocessing.Pool(len_name) as p:
        k_val_list = p.map(_FFM_train, data_in_process)

    for i, val in zip(range(len_name), k_val_list):
        print('no %d train' % (i))
        day7_i, cv_i, test_i = val
        print('  test loss:', cal_log_loss(day7_i, day7_data['is_trade'].values))
        print('  validation loss:', cal_log_loss(cv_i, cv_data['is_trade'].values))
        predict_day7[:, i] = day7_i
        predict_cv[:, i] = cv_i
        predict_test[:, i] = test_i
    return (predict_day7, predict_cv, predict_test)
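
Note: _FFM_train is defined elsewhere in the project. From the Pool.map call above, it must accept one (train_name, day7_name, cv_name, test_name) tuple and return three 1-D prediction arrays; a hypothetical stub documenting that contract (the FFM training itself is omitted):

def _FFM_train(paths):
    # Hypothetical stub, not the real implementation: train one
    # field-aware factorization machine on the file at train_name, then
    # return (day7_preds, cv_preds, test_preds) for the three other sets.
    train_name, day7_name, cv_name, test_name = paths
    raise NotImplementedError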
Example #3
def xgb_online(train, cv, test):

    train_data = train.copy()
    cv_data = cv.copy()
    test_data = test.copy()

    train_data = pd.concat([train_data, cv_data], axis=0)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']

    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    test_data.drop(drop_cols, axis=1, inplace=True)

    folds = 5
    kf = KFold(len(train_data), n_folds=folds, shuffle=True, random_state=7)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((test_data.shape[0], folds))
    for i, (train_index, cv_index) in enumerate(kf):
        print('training round {}...'.format(i))
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]

        train_feat = xgb.DMatrix(train_feat.values, label=train_Y[train_index])
        cv_feat = xgb.DMatrix(cv_feat.values, label=train_Y[cv_index])
        test_feat = xgb.DMatrix(test_data.values)
        watchlist = [(train_feat, 'train'), (cv_feat, 'val')]


        clf = xgb.train(params=params, dtrain=train_feat,num_boost_round=n_round,\
            evals=watchlist,early_stopping_rounds=7,verbose_eval=False)

        predict_train = clf.predict(train_feat)
        predict_cv = clf.predict(cv_feat)
        predict_test = clf.predict(test_feat)

        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test
        # feature importance
        features = train_data.columns
        ceate_feature_map(features)
        importance = clf.get_fscore(fmap='xgb.fmap')
        importance = sorted(importance.items(), key=operator.itemgetter(1))
        feat_imp = pd.DataFrame(importance, columns=['feature', 'fscore'])
        feat_imp['fscore'] = feat_imp['fscore'] / feat_imp['fscore'].sum()
        print(clf.best_iteration)
        print(clf.best_score)
        print('   train loss:', cal_log_loss(predict_train,
                                             train_Y.loc[train_index]))
        print('   test loss:', cal_log_loss(predict_cv, train_Y.loc[cv_index]))
    predict_test = np.median(test_preds, axis=1)
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    print(params)
    print('train loss:', cal_log_loss(train_preds / (folds - 1), train_Y))
    print('test loss:', cal_log_loss(cv_preds, train_Y))
    print('test mean:', np.mean(predict_test))
    submmit_result(predict_test, 'XGB')
    return feat_imp, predict_test
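
Note: ceate_feature_map (spelling kept from the calls) is another external helper. Since get_fscore is called with fmap='xgb.fmap', it presumably writes XGBoost's feature-map file; a common implementation consistent with that usage:

def ceate_feature_map(features):
    # One line per feature in XGBoost's fmap format "<index>\t<name>\tq",
    # where q marks a quantitative feature.
    with open('xgb.fmap', 'w') as f:
        for i, feat in enumerate(features):
            f.write('{0}\t{1}\tq\n'.format(i, feat))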
Example #4
def xgb_offline(train_data, cv_data):

    train_data = build_train_dataset(train_data, rate)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade'].values
    cv_Y = cv_data['is_trade'].values

    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    cv_data.drop(drop_cols, axis=1, inplace=True)
    print('train shape:', train_data.shape)
    print('cv shape:', cv_data.shape)

    kf = KFold(len(train_data), n_folds=5, shuffle=True, random_state=520)

    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((cv_data.shape[0], 5))

    for i, (train_index, cv_index) in enumerate(kf):
        print('training round {}...'.format(i))
        train_feat = train_data.iloc[train_index]
        cv_feat = train_data.iloc[cv_index]

        train_feat = xgb.DMatrix(train_feat.values, label=train_Y[train_index])
        cv_feat = xgb.DMatrix(cv_feat.values, label=train_Y[cv_index])
        test_feat = xgb.DMatrix(cv_data.values)
        watchlist = [(train_feat, 'train'), (cv_feat, 'val')]


        clf = xgb.train(params=params, dtrain=train_feat,num_boost_round=n_round,\
            evals=watchlist,early_stopping_rounds=7,verbose_eval=False)

        predict_train = clf.predict(train_feat)
        predict_cv = clf.predict(cv_feat)
        predict_test = clf.predict(test_feat)

        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test

        print(clf.best_iteration)
        print(clf.best_score)
        print('  train loss:', cal_log_loss(predict_train, train_Y[train_index]))
        print('  test loss:', cal_log_loss(predict_cv, train_Y[cv_index]))
    # feature importance
    features = train_data.columns
    ceate_feature_map(features)
    importance = clf.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()

    predict_test = np.median(test_preds, axis=1)
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    print('train loss:', cal_log_loss(train_preds / 4, train_Y))
    print('test loss:', cal_log_loss(cv_preds, train_Y))
    print('validation loss:', cal_log_loss(predict_test, cv_Y))
    return df, clf
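
Note: the rescaling predict_test / (predict_test + (1 - predict_test) / rate) in Examples #3 and #4 is the standard correction for negative downsampling. If build_train_dataset keeps each negative with probability rate (my reading; rate itself is defined elsewhere), the model's odds are inflated by 1/rate, and multiplying the odds back by rate gives p*rate / (p*rate + 1 - p), which is the same expression. A quick numeric check:

rate = 0.25     # assumed: fraction of negatives kept when building the train set
p_model = 0.5   # prediction from the model trained on downsampled negatives
p_true = p_model / (p_model + (1 - p_model) / rate)
print(p_true)   # 0.2 -> odds 1:1 on the sample correspond to odds 1:4 overall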
Example #5
def lgb_online(train, cv, test):

    train_data = train.copy()
    cv_data = cv.copy()
    test_data = test.copy()

    train_data = pd.concat([train_data, cv_data], axis=0)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']

    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    test_data.drop(drop_cols, axis=1, inplace=True)

    folds = 5
    kf = KFold(len(train_data), n_folds=folds, shuffle=True, random_state=7)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((test_data.shape[0], folds))
    for i, (train_index, cv_index) in enumerate(kf):
        print('training round {}...'.format(i))
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        lgb_train = lgb.Dataset(train_feat.values, train_Y.loc[train_index])
        lgb_cv = lgb.Dataset(cv_feat.values, train_Y.loc[cv_index])
        gbm = lgb.train(params=params,
                        train_set=lgb_train,
                        num_boost_round=6000,
                        valid_sets=lgb_cv,
                        verbose_eval=False,
                        early_stopping_rounds=50)
        # evaluate feature importance
        feat_imp = pd.Series(
            gbm.feature_importance(),
            index=train_data.columns).sort_values(ascending=False)

        predict_train = gbm.predict(train_feat.values)
        predict_cv = gbm.predict(cv_feat.values)
        test_preds[:, i] = gbm.predict(test_data.values)

        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv

        print(gbm.best_iteration)
        print(gbm.best_score)
        print('   train loss:', cal_log_loss(predict_train,
                                             train_Y.loc[train_index]))
        print('   test loss:', cal_log_loss(predict_cv, train_Y.loc[cv_index]))
    predict_test = np.median(test_preds, axis=1)
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    print(params)
    print('train loss:', cal_log_loss(train_preds / (folds - 1), train_Y))
    print('test loss:', cal_log_loss(cv_preds, train_Y))
    print('test mean:', np.mean(predict_test))
    submmit_result(predict_test, 'LGB')
    return feat_imp, predict_test
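
Note: submmit_result (spelling kept from the calls) is not shown either. A plausible sketch that writes an 'instance_id predicted_score' submission file; the module-level test_id and the filename are assumptions:

import pandas as pd

def submmit_result(predict_test, model_name):
    # Plausible sketch only: test_id (the test-set instance ids) is assumed
    # to be a module-level global, as in the truncated Example #13 below.
    sub = pd.DataFrame({'instance_id': test_id,
                        'predicted_score': predict_test})
    sub.to_csv('result_%s.txt' % model_name, sep=' ', index=False)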
Example #6
def LR_offline(train, cv):
    print('off line')
    train_data = train.copy()
    cv_data = cv.copy()
    train_Y = train_data['is_trade']
    cv_Y = cv_data['is_trade']

    fold = 5
    kf = KFold(len(train_data), n_folds=fold, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((cv_data.shape[0], fold))

    data_in_process = [(train_data.loc[train_index], train_data.loc[cv_index],
                        cv_data)
                       for i, (train_index, cv_index) in enumerate(kf)]
    index_all = [(train_index, cv_index)
                 for i, (train_index, cv_index) in enumerate(kf)]
    with multiprocessing.Pool(fold) as p:
        k_val_list = p.map(_LR_train, data_in_process)
    for i, index, val in zip(range(fold), index_all, k_val_list):
        print('no %d train' % (i))
        train_index, cv_index = index
        predict_train, predict_cv, predict_test = val
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test
        print('  train loss:', cal_log_loss(predict_train, train_Y[train_index]))
        print('  test loss:', cal_log_loss(predict_cv, train_Y[cv_index]))
    '''
    
    for i, (train_index, cv_index) in enumerate(kf):
        print('no %d train:'%(i))
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        
        predict_train,predict_cv,predict_test = _LR_train((train_feat,cv_feat,cv_data))
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:,i] = predict_test
        
        print('  train loss:', cal_log_loss(predict_train, train_Y[train_index]))
        print('  test loss:', cal_log_loss(predict_cv, train_Y[cv_index]))
    '''
    predict_test = np.median(test_preds, axis=1)
    print('mean:', np.mean(predict_test))
    print('train loss:', cal_log_loss(train_preds / (fold - 1), train_Y))
    print('test loss:', cal_log_loss(cv_preds, train_Y))
    print('validation loss:', cal_log_loss(predict_test, cv_Y))
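
Note: every snippet here uses the pre-0.18 sklearn.cross_validation.KFold(n, n_folds=...) API, which was removed in scikit-learn 0.20. The modern equivalent constructs the splitter with n_splits and calls .split on the data:

import numpy as np
from sklearn.model_selection import KFold

train_data = np.arange(10).reshape(-1, 1)   # stand-in for the real frame
kf = KFold(n_splits=5, shuffle=True, random_state=520)
for i, (train_index, cv_index) in enumerate(kf.split(train_data)):
    pass   # yields the same (train_index, cv_index) pairs the loops above use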
Example #7
def _LGB_train(df_all):
    
    train_name,day7_data,cv_data = df_all
    train_data = _load_splited_df(path=train_name)
    train_Y = train_data['is_trade'].values
    day7_Y = day7_data['is_trade'].values
    
    drop_cols = ['is_trade']
    train_now = train_data.drop(drop_cols,axis=1)
    day7_now = day7_data.drop(drop_cols, axis=1)
    cv_now = cv_data.drop(drop_cols,axis=1)
    
    
    lgb_train = lgb.Dataset(train_now.values, train_Y)
    lgb_day7 = lgb.Dataset(day7_now.values, day7_Y)
    
    gbm = lgb.train(params=params,
                    train_set=lgb_train,
                    num_boost_round=300,
                    valid_sets=lgb_day7,
                    verbose_eval=False,
                    early_stopping_rounds=50)
    
    predict_train = gbm.predict(train_now.values)
    predict_day7 = gbm.predict(day7_now.values)
    predict_cv = gbm.predict(cv_now.values)
    feat_imp = pd.Series(gbm.feature_importance(), index=train_now.columns).sort_values(ascending=False)
    
    print(feat_imp)
    print('train:',cal_log_loss(predict_train, train_Y))
    predict_day7 = np.float16(predict_day7)
    predict_cv = np.float16(predict_cv)
    
    
    return (predict_day7,predict_cv)
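
Note: _load_splited_df is used throughout but never shown. The 'pkl' in cache_pkl_path suggests each pre-split frame is stored as a pickle, so a one-line sketch under that assumption:

import pandas as pd

def _load_splited_df(path):
    # Assumed helper: load one cached, pre-split DataFrame from a pickle.
    return pd.read_pickle(path)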
Example #8
def _LR_train(df_all):

    train_name, day7_data, cv_data, test_data = df_all
    train_data = _load_splited_df(path=train_name)
    train_Y = train_data['is_trade'].values

    drop_cols = ['is_trade']
    train_now = train_data.drop(drop_cols, axis=1)
    day7_now = day7_data.drop(drop_cols, axis=1)
    cv_now = cv_data.drop(drop_cols, axis=1)
    test_now = test_data.drop(drop_cols, axis=1)

    clf = LogisticRegression(C=1.2,
                             fit_intercept=True,
                             max_iter=300,
                             solver='sag',
                             verbose=1,
                             random_state=7)
    clf.fit(X=train_now.values, y=train_Y)

    predict_train = clf.predict_proba(train_now.values)[:, 1]
    predict_day7 = clf.predict_proba(day7_now.values)[:, 1]
    predict_cv = clf.predict_proba(cv_now.values)[:, 1]
    predict_test = clf.predict_proba(test_now.values)[:, 1]

    print('train:', cal_log_loss(predict_train, train_Y))
    #    predict_train = np.float16(predict_train)
    predict_day7 = np.float16(predict_day7)
    predict_cv = np.float16(predict_cv)
    predict_test = np.float16(predict_test)
    return (predict_day7, predict_cv, predict_test)
Example #9
def XGB_model_second(day7_data, cv_data):

    predict_day7, predict_cv = XGB_model_first(day7_data, cv_data)

    #model 2
    train_Y = day7_data['is_trade'].values
    cv_Y = cv_data['is_trade'].values

    clf = LinearRegression(fit_intercept=True, normalize=True, n_jobs=-1)
    clf.fit(X=predict_day7, y=train_Y)

    predict_train_2 = clf.predict(predict_day7)
    predict_cv_2 = clf.predict(predict_cv)

    print('train:', cal_log_loss(predict_train_2, train_Y))
    print('test:', cal_log_loss(predict_cv_2, cv_Y))
    print('train mean:', np.mean(predict_train_2))
    print('cv mean:', np.mean(predict_cv_2))

    return predict_cv_2
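
Note: one caveat the original does not handle: LinearRegression output is unbounded, so the blended predictions can fall outside (0, 1) before they reach cal_log_loss. A small guard (the helper name is mine):

import numpy as np

def clip_probs(p, eps=1e-6):
    # Clamp blended second-level outputs into (0, 1) before log loss,
    # e.g. predict_cv_2 = clip_probs(clf.predict(predict_cv)).
    return np.clip(p, eps, 1 - eps)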
Example #10
def XGB_model_first(day7, cv):

    day7_data = day7.copy()
    cv_data = cv.copy()

    name_list = [cache_pkl_path + 'Tree_day/train_' + str(i) for i in range(7)]
    data_in_process = [(train_name, day7_data, cv_data)
                       for train_name in name_list]
    len_name = len(name_list)

    predict_day7 = np.zeros((day7_data.shape[0], len_name))
    predict_cv = np.zeros((cv_data.shape[0], len_name))

    with multiprocessing.Pool(len_name) as p:
        k_val_list = p.map(_XGB_train, data_in_process)

    for i, val in zip(range(len_name), k_val_list):
        print('no %d train' % (i))
        day7_i, cv_i = val
        print('  test loss:', cal_log_loss(day7_i, day7_data['is_trade'].values))
        print('  validation loss:', cal_log_loss(cv_i, cv_data['is_trade'].values))
        predict_day7[:, i] = day7_i
        predict_cv[:, i] = cv_i
    return (predict_day7, predict_cv)
Example #11
def _XGB_train(df_all):

    train_name, day7_data, cv_data = df_all
    train_data = _load_splited_df(path=train_name)
    train_Y = train_data['is_trade'].values
    day7_Y = day7_data['is_trade'].values

    drop_cols = ['is_trade']
    train_now = train_data.drop(drop_cols, axis=1)
    day7_now = day7_data.drop(drop_cols, axis=1)
    cv_now = cv_data.drop(drop_cols, axis=1)

    train_feat = xgb.DMatrix(train_now.values, label=train_Y)
    day7_feat = xgb.DMatrix(day7_now.values, label=day7_Y)
    test_feat = xgb.DMatrix(cv_now.values)
    watchlist = [(train_feat, 'train'), (day7_feat, 'val')]


    clf = xgb.train(params=params, dtrain=train_feat,num_boost_round=n_round,\
        evals=watchlist,early_stopping_rounds=20,verbose_eval=False)

    predict_train = clf.predict(train_feat)
    predict_day7 = clf.predict(day7_feat)
    predict_cv = clf.predict(test_feat)
    print('train:', cal_log_loss(predict_train, train_Y))

    # feature importance
    features = train_data.columns
    ceate_feature_map(features)
    importance = clf.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    feat_imp = pd.DataFrame(importance, columns=['feature', 'fscore'])
    feat_imp['fscore'] = feat_imp['fscore'] / feat_imp['fscore'].sum()

    predict_day7 = np.float16(predict_day7)
    predict_cv = np.float16(predict_cv)

    return (predict_day7, predict_cv)
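
Note: params and n_round are module-level globals that none of these examples define. A configuration consistent with the probability outputs and logloss early stopping used here; all values are illustrative assumptions, not the project's tuning:

params = {
    'objective': 'binary:logistic',  # predict() then returns probabilities
    'eval_metric': 'logloss',        # what the watchlist early stopping monitors
    'eta': 0.05,
    'max_depth': 6,
}
n_round = 1000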
Example #12
def offline(train_data, cv_data):

    # set the history features aside, keeping the old users' history data
    history_cols = ['user_id_cvr_smooth', 'user_id_buy_count']
    old_user_data_train = train_data[history_cols]
    old_user_data_test = cv_data[history_cols]
    #    train_data.drop(history_cols, axis=1, inplace=True)
    #    cv_data.drop(history_cols, axis=1, inplace=True)

    # train on the data set
    train_Y = train_data['is_trade']
    cv_Y = cv_data['is_trade']

    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    cv_data.drop(drop_cols, axis=1, inplace=True)

    kf = KFold(len(train_data), n_folds=5, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((cv_data.shape[0], 5))
    for i, (train_index, cv_index) in enumerate(kf):

        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]

        print('training round {}...'.format(i))
        lgb_train = lgb.Dataset(train_feat.values, train_Y.loc[train_index])
        lgb_cv = lgb.Dataset(cv_feat.values, train_Y.loc[cv_index])
        gbm = lgb.train(params=params,
                        train_set=lgb_train,
                        num_boost_round=6000,
                        valid_sets=lgb_cv,
                        verbose_eval=False,
                        early_stopping_rounds=200)
        # evaluate feature importance
        feat_imp = pd.Series(
            gbm.feature_importance(),
            index=train_data.columns).sort_values(ascending=False)

        predict_train = gbm.predict(train_feat.values)
        predict_cv = gbm.predict(cv_feat.values)

        test_preds[:, i] = gbm.predict(cv_data.values)
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv

        print(gbm.best_iteration)
        print(gbm.best_score)
        print('   train loss:', cal_log_loss(predict_train,
                                             train_Y.loc[train_index]))
        print('   test loss:', cal_log_loss(predict_cv, train_Y.loc[cv_index]))
    test_preds = np.median(test_preds, axis=1)
    print(params)
    print('train loss:', cal_log_loss(train_preds / 4, train_Y))
    print('test loss:', cal_log_loss(cv_preds, train_Y))
    print('validation loss:', cal_log_loss(test_preds, cv_Y))
    # split the scores into new vs. old users and compute each group's loss
    train_old_user_index = old_user_data_train.loc[
        old_user_data_train.user_id_cvr_smooth != -1, :].index
    test_old_user_index = old_user_data_test.loc[
        old_user_data_test.user_id_cvr_smooth != -1, :].index
    train_new_user_index = old_user_data_train.loc[
        old_user_data_train.user_id_cvr_smooth == -1, :].index
    test_new_user_index = old_user_data_test.loc[
        old_user_data_test.user_id_cvr_smooth == -1, :].index

    train_old_score = cv_preds[train_old_user_index]
    test_old_score = test_preds[test_old_user_index]
    train_new_score = cv_preds[train_new_user_index]
    test_new_score = test_preds[test_new_user_index]

    # .copy() avoids pandas' SettingWithCopyWarning on the assignments below
    new_train_data = old_user_data_train.loc[
        old_user_data_train.user_id_cvr_smooth != -1, :].copy()
    new_test_data = old_user_data_test.loc[
        old_user_data_test.user_id_cvr_smooth != -1, :].copy()
    new_train_data['y'] = train_old_score
    new_test_data['y'] = test_old_score
    new_train_Y = train_Y[train_old_user_index]
    new_test_Y = cv_Y[test_old_user_index]
    # train a separate model for the old users
    clf = LogisticRegression(C=12,
                             fit_intercept=True,
                             max_iter=3000,
                             class_weight={
                                 0: 0.5,
                                 1: 0.5
                             })
    clf.fit(X=new_train_data.values, y=new_train_Y)

    train_LR_score = clf.predict_proba(new_train_data.values)[:, 1]
    test_LR_score = clf.predict_proba(new_test_data.values)[:, 1]

    cv_preds[train_old_user_index] = train_LR_score
    test_preds[test_old_user_index] = test_LR_score
    # record the loss on old users
    print('LR train:', cal_log_loss(train_LR_score, new_train_Y))
    print('LR test:', cal_log_loss(test_LR_score, new_test_Y))
    # stitch the results back together to check the overall loss
    print('All train:', cal_log_loss(cv_preds, train_Y))
    print('All test:', cal_log_loss(test_preds, cv_Y))
    return test_preds, feat_imp
Example #13
    print('test shape', test_data.shape)

    clf = LogisticRegression(C=0.5,
                             fit_intercept=True,
                             max_iter=1000,
                             class_weight={
                                 0: 0.5,
                                 1: 0.5
                             })
    clf.fit(X=train_data.values, y=np.squeeze(train_Y))

    predict_train_fir = clf.predict_proba(train_data.values)[:, 1]
    predict_cv_fir = clf.predict_proba(cv_data.values)[:, 1]
    predict_test_fir = clf.predict_proba(test_data.values)[:, 1]

    print('train loss:', cal_log_loss(predict_train_fir, train_Y))
    print('test loss:', cal_log_loss(predict_cv_fir, cv_Y))

    # retrain on the full data set
    train_data_all = pd.concat([train_data, cv_data], axis=0)
    train_Y_all = np.append(train_Y, cv_Y)
    clf.fit(X=train_data_all.values, y=np.squeeze(train_Y_all))
    print(
        'train loss:',
        cal_log_loss(
            clf.predict_proba(train_data_all.values)[:, 1], train_Y_all))
    predict_test_fir = clf.predict_proba(test_data.values)[:, 1]

    # save the result
    submission = pd.DataFrame({
        'instance_id': test_id,
Example #14
    cv_data.drop(drop_cols,axis=1,inplace=True)
    test_data.drop(drop_cols,axis=1,inplace=True)
    
#    train_data, _, train_Y, _ = train_test_split(train_data,
#                                                 train_Y,
#                                                 test_size=0.5)
    
    gbc = GradientBoostingClassifier(n_estimators=27, learning_rate=0.1, max_depth=6, max_leaf_nodes=35)
    gbc.fit(train_data.values, train_Y)
    predict_train = gbc.predict_proba(train_data.values)[:,1]
    predict_cv = gbc.predict_proba(cv_data.values)[:,1]
    predict_test = gbc.predict_proba(test_data.values)[:,1]
    
#    print(gbc.get_params)
    print('train loss:', cal_log_loss(predict_train, train_Y))
    print('test loss:', cal_log_loss(predict_cv, cv_Y))
    t1 = time.time()
    print('training time:', t1 - t0)
    
    new_train, new_cv, new_test = gen_gbdt_feature(gbc, train_data, cv_data, test_data)
    print('train shape:', new_train.shape)
    print('cv shape:', new_cv.shape)
    print('test shape', new_test.shape)
    
    # LR prediction
    clf = LogisticRegression(C=0.8, fit_intercept=True, max_iter=3000,class_weight={0:0.5, 1:0.5})
    clf.fit(X=new_train, y=np.squeeze(train_Y))
    
    predict_train = clf.predict_proba(new_train)[:,1]
    predict_cv = clf.predict_proba(new_cv)[:,1]
Example #15
    lgb_train = lgb.Dataset(train_data.values, train_Y)
    lgb_cv = lgb.Dataset(cv_data.values, cv_Y)
    gbm = lgb.train(
        params=params,            # parameters
        train_set=lgb_train,      # data to train on
        num_boost_round=6000,     # number of boosting rounds
        valid_sets=lgb_cv,        # validation sets evaluated during training
        verbose_eval=False,
        early_stopping_rounds=500)

    predict_train_fir = gbm.predict(train_data.values)
    predict_cv_fir = gbm.predict(cv_data.values)
    predict_test_fir = gbm.predict(test_data.values)

    print('train loss:', cal_log_loss(predict_train_fir, train_Y))
    print('test loss:', cal_log_loss(predict_cv_fir, cv_Y))

    # retrain on the full data set
    train_data_all = pd.concat([train_data, cv_data], axis=0)
    train_Y_all = np.append(train_Y, cv_Y)

    lgb_train = lgb.Dataset(train_data_all.values, train_Y_all)
    gbm = lgb.train(
        params=params,            # parameters
        train_set=lgb_train,      # data to train on
        num_boost_round=500,      # number of boosting rounds
        verbose_eval=False)
    print('train loss:', cal_log_loss(gbm.predict(train_data_all.values),
                                      train_Y_all))
    predict_test_fir = gbm.predict(test_data.values)
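
Note: as with the XGBoost examples, the LightGBM params dict is defined elsewhere. A configuration consistent with probability outputs and the early stopping above; values are illustrative only:

params = {
    'objective': 'binary',         # predict() then returns probabilities
    'metric': 'binary_logloss',    # what early stopping monitors
    'learning_rate': 0.05,
    'num_leaves': 63,
    'verbose': -1,
}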
Example #16
    return (predict_day7, predict_cv, predict_test)


if __name__ == '__main__':

    t0 = time.time()

    day7_data = _load_splited_df(path=cache_pkl_path + 'LR_day/train_7')
    cv_data = _load_splited_df(path=cache_pkl_path + 'LR_day/cv')
    test_data = _load_splited_df(path=cache_pkl_path + 'LR_day/test')

    predict_day7, predict_cv, predict_test = LR_model_first(
        day7_data, cv_data, test_data)

    #model 2
    train_Y = day7_data['is_trade'].values
    cv_Y = cv_data['is_trade'].values

    clf = LinearRegression(fit_intercept=True, normalize=True, n_jobs=-1)
    clf.fit(X=predict_day7, y=train_Y)

    predict_train_2 = clf.predict(predict_day7)
    predict_cv_2 = clf.predict(predict_cv)
    predict_test_2 = clf.predict(predict_test)

    print('train:', cal_log_loss(predict_train_2, train_Y))
    print('test:', cal_log_loss(predict_cv_2, cv_Y))
    print('train mean:', np.mean(predict_train_2))
    print('cv mean:', np.mean(predict_cv_2))
    print('test mean:', np.mean(predict_test_2))
    submmit_result(predict_test_2, 'LR_LR')