def LR_offline(train_data, cv_data):
    """K-fold LogisticRegression evaluation on a train/validation split.

    Trains one LR per fold, prints per-fold train/validation log loss,
    then scores `cv_data` with the median of the per-fold predictions.

    NOTE(review): a second `LR_offline` is defined later in this file and
    shadows this one at import time — confirm which definition is meant
    to survive.

    Side effect: drops the 'is_trade' column from BOTH arguments in place.
    """
    train_Y = train_data['is_trade']
    cv_Y = cv_data['is_trade']
    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    cv_data.drop(drop_cols, axis=1, inplace=True)
    fold = 5
    # Legacy sklearn cross_validation.KFold API: KFold(n, n_folds=...)
    # iterates (train_index, cv_index) pairs.
    kf = KFold(len(train_data), n_folds=fold, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    # One column of cv_data predictions per fold; median taken below.
    test_preds = np.zeros((cv_data.shape[0], fold))
    for i, (train_index, cv_index) in enumerate(kf):
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        clf = LogisticRegression(C=1.2, fit_intercept=True,
                                 max_iter=3000, class_weight={0: 0.5, 1: 0.5})
        clf.fit(X=train_feat.values, y=train_Y[train_index])
        # Positive-class probabilities only.
        predict_train = clf.predict_proba(train_feat.values)[:, 1]
        predict_cv = clf.predict_proba(cv_feat.values)[:, 1]
        predict_test = clf.predict_proba(cv_data.values)[:, 1]
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test
        print(' 训练损失:', cal_log_loss(predict_train, train_Y[train_index]))
        print(' 测试损失:', cal_log_loss(predict_cv, train_Y[cv_index]))
    # Final validation prediction = per-sample median across the folds.
    predict_test = np.median(test_preds, axis=1)
    print('验证损失:', cal_log_loss(predict_test, cv_Y))
def FFM_model_first(day7_data, cv_data, test_data):
    """First-level FFM stacking layer.

    Fits one FFM model per cached daily training split (7 in total), in
    parallel, and gathers each model's predictions for the day-7, cv and
    test frames as columns of three matrices.

    Returns (predict_day7, predict_cv, predict_test), each of shape
    (n_samples, 7).
    """
    day7_name = cache_pkl_path + 'FFM_day/train_7'
    cv_name = cache_pkl_path + 'FFM_day/cv'
    test_name = cache_pkl_path + 'FFM_day/test'
    # One cached training file per day 0..6.
    name_list = [cache_pkl_path + 'FFM_day/train_' + str(i) for i in range(7)]
    data_in_process = [(train_name, day7_name, cv_name, test_name)
                       for train_name in name_list]
    len_name = len(name_list)
    predict_day7 = np.zeros((day7_data.shape[0], len_name))
    predict_cv = np.zeros((cv_data.shape[0], len_name))
    predict_test = np.zeros((test_data.shape[0], len_name))
    # Train the daily models concurrently, one worker per model.
    with multiprocessing.Pool(len_name) as pool:
        fold_outputs = pool.map(_FFM_train, data_in_process)
    for i, (day7_i, cv_i, test_i) in enumerate(fold_outputs):
        print('no %d train' % (i))
        print(' 测试损失:', cal_log_loss(day7_i, day7_data['is_trade'].values))
        print(' 验证损失:', cal_log_loss(cv_i, cv_data['is_trade'].values))
        predict_day7[:, i] = day7_i
        predict_cv[:, i] = cv_i
        predict_test[:, i] = test_i
    return (predict_day7, predict_cv, predict_test)
def xgb_online(train, cv, test):
    """Online (submission) XGBoost run.

    Concatenates train+cv into one training pool, runs 5-fold CV over it,
    takes the per-sample median of the fold models' test predictions,
    rate-corrects them, and submits via `submmit_result`.

    Returns (feat_imp, predict_test) — the feature-importance frame from
    the last fold's booster and the corrected test predictions.
    """
    # Work on copies so the caller's frames are not mutated.
    train_data = train.copy()
    cv_data = cv.copy()
    test_data = test.copy()
    train_data = pd.concat([train_data, cv_data], axis=0)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']
    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    test_data.drop(drop_cols, axis=1, inplace=True)
    folds = 5
    # Legacy sklearn KFold API: yields (train_index, cv_index) pairs.
    kf = KFold(len(train_data), n_folds=folds, shuffle=True, random_state=7)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    # NOTE(review): literal 5 duplicates `folds` — keep them in sync.
    test_preds = np.zeros((test_data.shape[0], 5))
    for i, (train_index, cv_index) in enumerate(kf):
        print('第{}次训练...'.format(i))
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        # Rebind the frames to their DMatrix forms.
        train_feat = xgb.DMatrix(train_feat.values, label=train_Y[train_index])
        cv_feat = xgb.DMatrix(cv_feat.values, label=train_Y[cv_index])
        test_feat = xgb.DMatrix(test_data.values)
        watchlist = [(train_feat, 'train'), (cv_feat, 'val')]
        clf = xgb.train(params=params, dtrain=train_feat, num_boost_round=n_round,
                        evals=watchlist, early_stopping_rounds=7, verbose_eval=False)
        predict_train = clf.predict(train_feat)
        predict_cv = clf.predict(cv_feat)
        predict_test = clf.predict(test_feat)
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test
        # Feature importance (recomputed each fold; last fold's wins).
        features = train_data.columns
        ceate_feature_map(features)
        importance = clf.get_fscore(fmap='xgb.fmap')
        importance = sorted(importance.items(), key=operator.itemgetter(1))
        feat_imp = pd.DataFrame(importance, columns=['feature', 'fscore'])
        feat_imp['fscore'] = feat_imp['fscore'] / feat_imp['fscore'].sum()
        print(clf.best_iteration)
        print(clf.best_score)
        print(' 训练损失:', cal_log_loss(predict_train, train_Y.loc[train_index]))
        print(' 测试损失:', cal_log_loss(predict_cv, train_Y.loc[cv_index]))
    predict_test = np.median(test_preds, axis=1)
    # Undo the class-rebalancing sampling rate on the final predictions.
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    print(params)
    # Each sample appears in folds-1 training splits, hence the division.
    print('训练损失:', cal_log_loss(train_preds / (folds - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    print('test mean:', np.mean(predict_test))
    submmit_result(predict_test, 'XGB')
    return feat_imp, predict_test
def xgb_offline(train_data, cv_data):
    """Offline XGBoost evaluation with K-fold CV on the training set.

    Trains one booster per fold, reports per-fold train/validation log
    loss, then evaluates the per-sample median of the fold predictions
    on `cv_data` after rate-correcting for the downsampling performed by
    `build_train_dataset`.

    Side effect: drops the 'is_trade' column from `cv_data` in place
    (train_data is rebuilt/reindexed first).

    Returns (df, clf): feature-importance frame and the last fold's booster.
    """
    train_data = build_train_dataset(train_data, rate)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade'].values
    cv_Y = cv_data['is_trade'].values
    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    cv_data.drop(drop_cols, axis=1, inplace=True)
    print('train shap:', train_data.shape)
    print('cv shape', cv_data.shape)
    # Single source of truth for the fold count. Previously the code
    # hard-coded 5 (KFold, test_preds width) and 4 (train_preds divisor)
    # independently, which would silently break if the fold count changed.
    folds = 5
    kf = KFold(len(train_data), n_folds=folds, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((cv_data.shape[0], folds))
    for i, (train_index, cv_index) in enumerate(kf):
        print('第{}次训练...'.format(i))
        train_feat = train_data.iloc[train_index]
        cv_feat = train_data.iloc[cv_index]
        # Rebind to DMatrix form for xgboost.
        train_feat = xgb.DMatrix(train_feat.values, label=train_Y[train_index])
        cv_feat = xgb.DMatrix(cv_feat.values, label=train_Y[cv_index])
        test_feat = xgb.DMatrix(cv_data.values)
        watchlist = [(train_feat, 'train'), (cv_feat, 'val')]
        clf = xgb.train(params=params, dtrain=train_feat, num_boost_round=n_round,
                        evals=watchlist, early_stopping_rounds=7, verbose_eval=False)
        predict_train = clf.predict(train_feat)
        predict_cv = clf.predict(cv_feat)
        predict_test = clf.predict(test_feat)
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test
        print(clf.best_iteration)
        print(clf.best_score)
        print(' 训练损失:', cal_log_loss(predict_train, train_Y[train_index]))
        print(' 测试损失:', cal_log_loss(predict_cv, train_Y[cv_index]))
    # Feature importance from the last fold's booster, computed once
    # (per-fold results were overwritten each iteration anyway).
    features = train_data.columns
    ceate_feature_map(features)
    importance = clf.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    predict_test = np.median(test_preds, axis=1)
    # Undo the class-rebalancing sampling rate.
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    # Each sample sits in folds-1 training splits, hence the averaging.
    print('训练损失:', cal_log_loss(train_preds / (folds - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    print('验证损失:', cal_log_loss(predict_test, cv_Y))
    return df, clf
def lgb_online(train, cv, test):
    """Online (submission) LightGBM run.

    Concatenates train+cv into one training pool, runs 5-fold CV over it,
    takes the per-sample median of the fold models' test predictions,
    rate-corrects them, and submits via `submmit_result`.

    Returns (feat_imp, predict_test) — the feature-importance series from
    the last fold's booster and the corrected test predictions.

    Fixes vs. previous version (behavior-identical): the identical
    `feat_imp` Series was built twice per fold (the second assignment
    just overwrote the first) — now built once; the test_preds width
    uses `folds` instead of a duplicated literal 5.
    """
    # Work on copies so the caller's frames are not mutated.
    train_data = train.copy()
    cv_data = cv.copy()
    test_data = test.copy()
    train_data = pd.concat([train_data, cv_data], axis=0)
    train_data.reset_index(inplace=True, drop=True)
    train_Y = train_data['is_trade']
    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    test_data.drop(drop_cols, axis=1, inplace=True)
    folds = 5
    kf = KFold(len(train_data), n_folds=folds, shuffle=True, random_state=7)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((test_data.shape[0], folds))
    for i, (train_index, cv_index) in enumerate(kf):
        print('第{}次训练...'.format(i))
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        lgb_train = lgb.Dataset(train_feat.values, train_Y.loc[train_index])
        lgb_cv = lgb.Dataset(cv_feat.values, train_Y.loc[cv_index])
        gbm = lgb.train(params=params,
                        train_set=lgb_train,
                        num_boost_round=6000,
                        valid_sets=lgb_cv,
                        verbose_eval=False,
                        early_stopping_rounds=50)
        # Feature importance (recomputed each fold; last fold's wins).
        feat_imp = pd.Series(
            gbm.feature_importance(),
            index=train_data.columns).sort_values(ascending=False)
        predict_train = gbm.predict(train_feat.values)
        predict_cv = gbm.predict(cv_feat.values)
        test_preds[:, i] = gbm.predict(test_data.values)
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        print(gbm.best_iteration)
        print(gbm.best_score)
        print(' 训练损失:', cal_log_loss(predict_train, train_Y.loc[train_index]))
        print(' 测试损失:', cal_log_loss(predict_cv, train_Y.loc[cv_index]))
    predict_test = np.median(test_preds, axis=1)
    # Undo the class-rebalancing sampling rate on the final predictions.
    predict_test = predict_test / (predict_test + (1 - predict_test) / rate)
    print(params)
    # Each sample appears in folds-1 training splits, hence the division.
    print('训练损失:', cal_log_loss(train_preds / (folds - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    print('test mean:', np.mean(predict_test))
    submmit_result(predict_test, 'LGB')
    return feat_imp, predict_test
def LR_offline(train, cv):
    """K-fold LogisticRegression evaluation, parallelized over folds.

    NOTE(review): this redefines `LR_offline` (an earlier definition
    exists in this file) — confirm the duplication is intentional.
    NOTE(review): `_LR_train` as defined in this file unpacks a 4-tuple
    whose first element is a file name, while 3-tuples of DataFrames are
    passed here — verify which `_LR_train` variant this was written
    against (the commented-out serial loop below uses the same 3-tuple).
    """
    print('off line')
    # Copies: the caller's frames are not modified.
    train_data = train.copy()
    cv_data = cv.copy()
    train_Y = train_data['is_trade']
    cv_Y = cv_data['is_trade']
    fold = 5
    # Legacy sklearn KFold object; it can be iterated multiple times.
    kf = KFold(len(train_data), n_folds=fold, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((cv_data.shape[0], fold))
    # Materialize per-fold inputs and the matching index pairs.
    data_in_process = [(train_data.loc[train_index], train_data.loc[cv_index],
                        cv_data)
                       for i, (train_index, cv_index) in enumerate(kf)]
    index_all = [(train_index, cv_index)
                 for i, (train_index, cv_index) in enumerate(kf)]
    # One worker process per fold.
    with multiprocessing.Pool(fold) as p:
        k_val_list = p.map(_LR_train, data_in_process)
    for i, index, val in zip(range(fold), index_all, k_val_list):
        print('no %d train' % (i))
        train_index, cv_index = index
        predict_train, predict_cv, predict_test = val
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:, i] = predict_test
        print(' 训练损失:', cal_log_loss(predict_train, train_Y[train_index]))
        print(' 测试损失:', cal_log_loss(predict_cv, train_Y[cv_index]))
    # Dead serial implementation kept by the original author for reference.
    '''
    for i, (train_index, cv_index) in enumerate(kf):
        print('no %d train:'%(i))
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        predict_train,predict_cv,predict_test = _LR_train((train_feat,cv_feat,cv_data))
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        test_preds[:,i] = predict_test
        print(' 训练损失:',cal_log_loss(predict_train, train_Y[train_index]))
        print(' 测试损失:',cal_log_loss(predict_cv, train_Y[cv_index]))
    '''
    # Final validation prediction = per-sample median across folds.
    predict_test = np.median(test_preds, axis=1)
    print('mean:', np.mean(predict_test))
    # Each sample sits in fold-1 training splits, hence the averaging.
    print('训练损失:', cal_log_loss(train_preds / (fold - 1), train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    print('验证损失:', cal_log_loss(predict_test, cv_Y))
def _LGB_train(df_all):
    """Worker: train one LightGBM model on a cached daily split.

    `df_all` is a 3-tuple (train_name, day7_data, cv_data); the training
    frame is loaded from disk by name.  Returns float16 predictions for
    the day-7 frame and the cv frame.
    """
    train_name, day7_data, cv_data = df_all
    train_data = _load_splited_df(path=train_name)
    train_Y = train_data['is_trade'].values
    day7_Y = day7_data['is_trade'].values
    label_cols = ['is_trade']
    # Non-destructive drops: feature matrices without the label.
    train_X = train_data.drop(label_cols, axis=1)
    day7_X = day7_data.drop(label_cols, axis=1)
    cv_X = cv_data.drop(label_cols, axis=1)
    train_set = lgb.Dataset(train_X.values, train_Y)
    day7_set = lgb.Dataset(day7_X.values, day7_Y)
    booster = lgb.train(params=params,
                        train_set=train_set,
                        num_boost_round=300,
                        valid_sets=day7_set,
                        verbose_eval=False,
                        early_stopping_rounds=50)
    predict_train = booster.predict(train_X.values)
    predict_day7 = booster.predict(day7_X.values)
    predict_cv = booster.predict(cv_X.values)
    feat_imp = pd.Series(booster.feature_importance(),
                         index=train_X.columns).sort_values(ascending=False)
    print(feat_imp)
    print('train:', cal_log_loss(predict_train, train_Y))
    # Shrink to float16 before shipping across the process boundary.
    return (np.float16(predict_day7), np.float16(predict_cv))
def _LR_train(df_all):
    """Worker: fit one LogisticRegression on a cached daily split.

    `df_all` is (train_name, day7_data, cv_data, test_data); the
    training frame is loaded from disk by name.  Returns float16
    positive-class probabilities for the day-7, cv and test frames.
    """
    train_name, day7_data, cv_data, test_data = df_all
    train_data = _load_splited_df(path=train_name)
    train_Y = train_data['is_trade'].values
    label_cols = ['is_trade']
    # Non-destructive drops: feature matrices without the label.
    train_X = train_data.drop(label_cols, axis=1)
    day7_X = day7_data.drop(label_cols, axis=1)
    cv_X = cv_data.drop(label_cols, axis=1)
    test_X = test_data.drop(label_cols, axis=1)
    model = LogisticRegression(C=1.2,
                               fit_intercept=True,
                               max_iter=300,
                               solver='sag',
                               verbose=1,
                               random_state=7)
    model.fit(X=train_X.values, y=train_Y)
    predict_train = model.predict_proba(train_X.values)[:, 1]
    predict_day7 = model.predict_proba(day7_X.values)[:, 1]
    predict_cv = model.predict_proba(cv_X.values)[:, 1]
    predict_test = model.predict_proba(test_X.values)[:, 1]
    print('train:', cal_log_loss(predict_train, train_Y))
    # Shrink to float16 before shipping across the process boundary
    # (the train predictions are only used for the loss print above).
    return (np.float16(predict_day7),
            np.float16(predict_cv),
            np.float16(predict_test))
def XGB_model_second(day7_data, cv_data):
    """Second-level stacker: linearly blend the first-level XGB columns.

    Fits a plain LinearRegression on the (n_samples, 7) prediction
    matrix from `XGB_model_first` and returns the blended cv predictions.

    NOTE(review): LinearRegression output is not bounded to (0, 1) but is
    fed to `cal_log_loss` as-is — confirm that is intended.
    """
    predict_day7, predict_cv = XGB_model_first(day7_data, cv_data)
    # Level-2 model.
    train_Y = day7_data['is_trade'].values
    cv_Y = cv_data['is_trade'].values
    blender = LinearRegression(fit_intercept=True, normalize=True, n_jobs=-1)
    blender.fit(X=predict_day7, y=train_Y)
    predict_train_2 = blender.predict(predict_day7)
    predict_cv_2 = blender.predict(predict_cv)
    print('train:', cal_log_loss(predict_train_2, train_Y))
    print('test:', cal_log_loss(predict_cv_2, cv_Y))
    print('train mean:', np.mean(predict_train_2))
    print('cv mean:', np.mean(predict_cv_2))
    return predict_cv_2
def XGB_model_first(day7, cv):
    """First-level XGB stacking layer: one model per cached daily split.

    Trains the 7 daily models in parallel and collects each model's
    predictions for the day-7 and cv frames as matrix columns.

    Returns (predict_day7, predict_cv), each of shape (n_samples, 7).
    """
    day7_data = day7.copy()
    cv_data = cv.copy()
    # One cached training file per day 0..6.
    name_list = [cache_pkl_path + 'Tree_day/train_' + str(i) for i in range(7)]
    data_in_process = [(train_name, day7_data, cv_data)
                       for train_name in name_list]
    len_name = len(name_list)
    predict_day7 = np.zeros((day7_data.shape[0], len_name))
    predict_cv = np.zeros((cv_data.shape[0], len_name))
    # One worker process per daily model.
    with multiprocessing.Pool(len_name) as pool:
        fold_outputs = pool.map(_XGB_train, data_in_process)
    for i, (day7_i, cv_i) in enumerate(fold_outputs):
        print('no %d train' % (i))
        print(' 测试损失:', cal_log_loss(day7_i, day7_data['is_trade'].values))
        print(' 验证损失:', cal_log_loss(cv_i, cv_data['is_trade'].values))
        predict_day7[:, i] = day7_i
        predict_cv[:, i] = cv_i
    return (predict_day7, predict_cv)
def _XGB_train(df_all):
    """Worker: train one XGBoost model on a cached daily split.

    `df_all` is (train_name, day7_data, cv_data); the training frame is
    loaded from disk by name.  Returns float16 predictions for the
    day-7 and cv frames.
    """
    train_name, day7_data, cv_data = df_all
    train_data = _load_splited_df(path=train_name)
    train_Y = train_data['is_trade'].values
    day7_Y = day7_data['is_trade'].values
    label_cols = ['is_trade']
    # Non-destructive drops: feature matrices without the label.
    train_X = train_data.drop(label_cols, axis=1)
    day7_X = day7_data.drop(label_cols, axis=1)
    cv_X = cv_data.drop(label_cols, axis=1)
    dtrain = xgb.DMatrix(train_X.values, label=train_Y)
    dday7 = xgb.DMatrix(day7_X.values, label=day7_Y)
    dcv = xgb.DMatrix(cv_X.values)
    watchlist = [(dtrain, 'train'), (dday7, 'val')]
    booster = xgb.train(params=params,
                        dtrain=dtrain,
                        num_boost_round=n_round,
                        evals=watchlist,
                        early_stopping_rounds=20,
                        verbose_eval=False)
    predict_train = booster.predict(dtrain)
    predict_day7 = booster.predict(dday7)
    predict_cv = booster.predict(dcv)
    print('train:', cal_log_loss(predict_train, train_Y))
    # Feature importance via the fmap side file (side effect of
    # ceate_feature_map); the frame itself is not returned.
    features = train_data.columns
    ceate_feature_map(features)
    importance = sorted(booster.get_fscore(fmap='xgb.fmap').items(),
                        key=operator.itemgetter(1))
    feat_imp = pd.DataFrame(importance, columns=['feature', 'fscore'])
    feat_imp['fscore'] = feat_imp['fscore'] / feat_imp['fscore'].sum()
    # Shrink to float16 before shipping across the process boundary.
    return (np.float16(predict_day7), np.float16(predict_cv))
def offline(train_data, cv_data):
    """Offline LightGBM evaluation with a separate LR refit for old users.

    Stage 1: 5-fold LightGBM CV over train_data; cv_data is scored with
    the per-sample median of the fold models.  Stage 2: rows whose
    `user_id_cvr_smooth != -1` ("old" users with history) get their
    scores replaced by a LogisticRegression fitted on the history
    columns plus the stage-1 score.

    Side effect: drops the 'is_trade' column from BOTH arguments in place.

    Returns (test_preds, feat_imp): final cv_data scores and the last
    fold's feature-importance series.
    """
    # Keep the user-history columns aside; -1 marks "no history" (new user).
    history_cols = ['user_id_cvr_smooth', 'user_id_buy_count']
    old_user_data_train = train_data[history_cols]
    old_user_data_test = cv_data[history_cols]
    # train_data.drop(history_cols, axis=1, inplace=True)
    # cv_data.drop(history_cols, axis=1, inplace=True)
    # Train on the full feature set.
    train_Y = train_data['is_trade']
    cv_Y = cv_data['is_trade']
    drop_cols = ['is_trade']
    train_data.drop(drop_cols, axis=1, inplace=True)
    cv_data.drop(drop_cols, axis=1, inplace=True)
    # Legacy sklearn KFold API.
    kf = KFold(len(train_data), n_folds=5, shuffle=True, random_state=520)
    train_preds = np.zeros(train_data.shape[0])
    cv_preds = np.zeros(train_data.shape[0])
    test_preds = np.zeros((cv_data.shape[0], 5))
    for i, (train_index, cv_index) in enumerate(kf):
        train_feat = train_data.loc[train_index]
        cv_feat = train_data.loc[cv_index]
        print('第{}次训练...'.format(i))
        lgb_train = lgb.Dataset(train_feat.values, train_Y.loc[train_index])
        lgb_cv = lgb.Dataset(cv_feat.values, train_Y.loc[cv_index])
        gbm = lgb.train(params=params,
                        train_set=lgb_train,
                        num_boost_round=6000,
                        valid_sets=lgb_cv,
                        verbose_eval=False,
                        early_stopping_rounds=200)
        # Feature importance (recomputed below after predicting; the
        # second assignment overwrites this one — harmless duplication).
        feat_imp = pd.Series(
            gbm.feature_importance(),
            index=train_data.columns).sort_values(ascending=False)
        predict_train = gbm.predict(train_feat.values)
        predict_cv = gbm.predict(cv_feat.values)
        test_preds[:, i] = gbm.predict(cv_data.values)
        train_preds[train_index] += predict_train
        cv_preds[cv_index] += predict_cv
        feat_imp = pd.Series(
            gbm.feature_importance(),
            index=train_data.columns).sort_values(ascending=False)
        print(gbm.best_iteration)
        print(gbm.best_score)
        print(' 训练损失:', cal_log_loss(predict_train, train_Y.loc[train_index]))
        print(' 测试损失:', cal_log_loss(predict_cv, train_Y.loc[cv_index]))
    # Collapse the per-fold cv_data predictions to their median.
    test_preds = np.median(test_preds, axis=1)
    print(params)
    # Each sample sits in 4 of the 5 training splits, hence / 4.
    print('训练损失:', cal_log_loss(train_preds / 4, train_Y))
    print('测试损失:', cal_log_loss(cv_preds, train_Y))
    print('验证损失:', cal_log_loss(test_preds, cv_Y))
    # Split scores into old/new users and report their losses separately.
    train_old_user_index = old_user_data_train.loc[
        old_user_data_train.user_id_cvr_smooth != -1, :].index
    test_old_user_index = old_user_data_test.loc[
        old_user_data_test.user_id_cvr_smooth != -1, :].index
    train_new_user_index = old_user_data_train.loc[
        old_user_data_train.user_id_cvr_smooth == -1, :].index
    test_new_user_index = old_user_data_test.loc[
        old_user_data_test.user_id_cvr_smooth == -1, :].index
    train_old_score = cv_preds[train_old_user_index]
    test_old_score = test_preds[test_old_user_index]
    train_new_score = cv_preds[train_new_user_index]
    test_new_score = test_preds[test_new_user_index]
    # Old-user frames: history features + the stage-1 score as input 'y'.
    new_train_data = old_user_data_train.loc[
        old_user_data_train.user_id_cvr_smooth != -1, :]
    new_test_data = old_user_data_test.loc[
        old_user_data_test.user_id_cvr_smooth != -1, :]
    new_train_data['y'] = train_old_score
    new_test_data['y'] = test_old_score
    new_train_Y = train_Y[train_old_user_index]
    new_test_Y = cv_Y[test_old_user_index]
    # Train a separate model for old users only.
    clf = LogisticRegression(C=12,
                             fit_intercept=True,
                             max_iter=3000,
                             class_weight={
                                 0: 0.5,
                                 1: 0.5
                             })
    clf.fit(X=new_train_data.values, y=new_train_Y)
    train_LR_score = clf.predict_proba(new_train_data.values)[:, 1]
    test_LR_score = clf.predict_proba(new_test_data.values)[:, 1]
    # Overwrite the old-user slots with the refitted LR scores.
    cv_preds[train_old_user_index] = train_LR_score
    test_preds[test_old_user_index] = test_LR_score
    # Old-user losses after the refit.
    print('LR train:', cal_log_loss(train_LR_score, new_train_Y))
    print('LR test:', cal_log_loss(test_LR_score, new_test_Y))
    # Combined (old + new) losses.
    print('All train:', cal_log_loss(cv_preds, train_Y))
    print('ALL test:', cal_log_loss(test_preds, cv_Y))
    return test_preds, feat_imp
print('test shape', test_data.shape) clf = LogisticRegression(C=0.5, fit_intercept=True, max_iter=1000, class_weight={ 0: 0.5, 1: 0.5 }) clf.fit(X=train_data.values, y=np.squeeze(train_Y)) predict_train_fir = clf.predict_proba(train_data.values)[:, 1] predict_cv_fir = clf.predict_proba(cv_data.values)[:, 1] predict_test_fir = clf.predict_proba(test_data.values)[:, 1] print('训练损失:', cal_log_loss(predict_train_fir, train_Y)) print('测试损失:', cal_log_loss(predict_cv_fir, cv_Y)) #把全量数据拿过来训练 train_data_all = pd.concat([train_data, cv_data], axis=0) train_Y_all = np.append(train_Y, cv_Y) clf.fit(X=train_data_all.values, y=np.squeeze(train_Y_all)) print( '训练损失:', cal_log_loss( clf.predict_proba(train_data_all.values)[:, 1], train_Y_all)) predict_test_fir = clf.predict_proba(test_data.values)[:, 1] #保存结果 submission = pd.DataFrame({ 'instance_id': test_id,
cv_data.drop(drop_cols,axis=1,inplace=True) test_data.drop(drop_cols,axis=1,inplace=True) # train_data, _, train_Y, _ = train_test_split(train_data, # train_Y, # test_size=0.5) # gbc = GradientBoostingClassifier(n_estimators=27,learning_rate=0.1,max_depth=6,max_leaf_nodes=35) gbc = GradientBoostingClassifier(n_estimators=27,learning_rate=0.1,max_depth=6, max_leaf_nodes=35) gbc.fit(train_data.values, train_Y) predict_train = gbc.predict_proba(train_data.values)[:,1] predict_cv = gbc.predict_proba(cv_data.values)[:,1] predict_test = gbc.predict_proba(test_data.values)[:,1] # print(gbc.get_params) print('训练损失:',cal_log_loss(predict_train, train_Y)) print('测试损失:',cal_log_loss(predict_cv, cv_Y)) t1 = time.time() print('训练用时:',t1-t0) new_train, new_cv, new_test = gen_gbdt_feature(gbc, train_data, cv_data, test_data) print('train shap:',new_train.shape) print('cv shape', new_cv.shape) print('test shape', new_test.shape) # LR预测 clf = LogisticRegression(C=0.8, fit_intercept=True, max_iter=3000,class_weight={0:0.5, 1:0.5}) clf.fit(X=new_train, y=np.squeeze(train_Y)) predict_train = clf.predict_proba(new_train)[:,1] predict_cv = clf.predict_proba(new_cv)[:,1]
lgb_train = lgb.Dataset(train_data.values, train_Y) lgb_cv = lgb.Dataset(cv_data.values, cv_Y) gbm = lgb.train( params=params, #参数 train_set=lgb_train, #要训练的数据 num_boost_round=6000, #迭代次数 valid_sets=lgb_cv, #训练时需要评估的列表 verbose_eval=False, # early_stopping_rounds=500) predict_train_fir = gbm.predict(train_data.values) predict_cv_fir = gbm.predict(cv_data.values) predict_test_fir = gbm.predict(test_data.values) print('训练损失:', cal_log_loss(predict_train_fir, train_Y)) print('测试损失:', cal_log_loss(predict_cv_fir, cv_Y)) #把全量数据拿过来训练 train_data_all = pd.concat([train_data, cv_data], axis=0) train_Y_all = np.append(train_Y, cv_Y) lgb_train = lgb.Dataset(train_data_all.values, train_Y_all) gbm = lgb.train( params=params, #参数 train_set=lgb_train, #要训练的数据 num_boost_round=500, #迭代次数 verbose_eval=False) print('训练损失:', cal_log_loss(gbm.predict(train_data_all.values), train_Y_all)) predict_test_fir = gbm.predict(test_data.values)
    # Tail of a function whose `def` line lies outside this chunk
    # (its body returns the three first-level prediction matrices).
    return (predict_day7, predict_cv, predict_test)


if __name__ == '__main__':
    # Driver: LR first-level models per daily split, then a linear
    # second-level blend, then submission.
    t0 = time.time()
    day7_data = _load_splited_df(path=cache_pkl_path + 'LR_day/train_7')
    cv_data = _load_splited_df(path=cache_pkl_path + 'LR_day/cv')
    test_data = _load_splited_df(path=cache_pkl_path + 'LR_day/test')
    predict_day7, predict_cv, predict_test = LR_model_first(
        day7_data, cv_data, test_data)
    # Level-2 model: blend the 7 first-level prediction columns.
    # NOTE(review): LinearRegression output is not bounded to (0, 1) but
    # is fed to cal_log_loss / submmit_result as-is — confirm intended.
    train_Y = day7_data['is_trade'].values
    cv_Y = cv_data['is_trade'].values
    clf = LinearRegression(fit_intercept=True, normalize=True, n_jobs=-1)
    clf.fit(X=predict_day7, y=train_Y)
    predict_train_2 = clf.predict(predict_day7)
    predict_cv_2 = clf.predict(predict_cv)
    predict_test_2 = clf.predict(predict_test)
    print('train:', cal_log_loss(predict_train_2, train_Y))
    print('test:', cal_log_loss(predict_cv_2, cv_Y))
    print('train mean:', np.mean(predict_train_2))
    print('cv mean:', np.mean(predict_cv_2))
    print('test mean:', np.mean(predict_test_2))
    submmit_result(predict_test_2, 'LR_LR')