X_train, X_test, y_train, y_test = train_test_split(trn_term_doc, y,
                                                    test_size=0.01,
                                                    random_state=111)
# X_train = X_train.toarray()
# X_test = X_test.toarray()

# Create the dataset
dataset = Dataset(X_train, y_train, test_term_doc, use_cache=False)

# Create the base models (NB and LR)
class_use_cache = False
model_nb = Classifier(dataset=dataset, estimator=MultinomialNB,
                      name='nb', use_cache=class_use_cache)
model_lr = Classifier(dataset=dataset, estimator=LogisticRegression,
                      parameters={'C': 4, 'dual': True, 'n_jobs': -1},
                      name='lr', use_cache=class_use_cache)
model_lr2 = Classifier(dataset=dataset, estimator=LogisticRegression,
                       parameters={'C': 4,
                                   'multi_class': 'multinomial',
import random

import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression

from heamy.estimator import Classifier, Regressor

# TestDataset and TestEstimator are fixtures defined elsewhere in the test module.


def func_estimator(X_train, y_train, X_test, y_test):
    return np.zeros(X_test.shape[0])


def random_param():
    return random.randint(1, 100)


model_func = Regressor(estimator=func_estimator, dataset=TestDataset)
model_cls = TestEstimator(dataset=TestDataset())
model_param = Regressor(estimator=LinearRegression,
                        parameters={'random_param': random_param},
                        dataset=TestDataset)
model_param2 = Classifier(estimator=LogisticRegression,
                          parameters={'colsample_bylevel': 0.9},
                          dataset=TestDataset)


def test_hashing():
    assert str(model_func) == 'func_estimator(54743c7a5484d1bf2a64ac1d7b68f8cc)'
    assert str(model_cls) == 'TestEstimator(da29cb8766f96e6561a51e8e3c13f661)'
    assert str(model_param) == 'LinearRegression(2e789a766f6dc2457fb6a63452ad2859)'
    assert str(model_param2) == 'LogisticRegression(74efb248db47d168aed2fc37c0016e6f)'
    assert model_param2.hash == '74efb248db47d168aed2fc37c0016e6f'
    e_hash = TestEstimator(dataset=TestDataset()).hash
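    # Plausible continuation of the test (an assumption, not in the original
    # fragment): the hash should be deterministic, so an identically
    # configured estimator must produce the same value as e_hash.
    assert TestEstimator(dataset=TestDataset()).hash == e_hash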
lr_param = {
    'multi_class': 'multinomial',
    'solver': 'newton-cg',
    'random_state': 1
}
mlp_param = {
    'hidden_layer_sizes': (132,),
    'activation': 'logistic',
    'max_iter': 500
}
# ------------------------------------------------------------------------------
knn = Classifier(dataset=dataset, estimator=KNeighborsClassifier,
                 use_cache=CACHE, parameters=knn_param, name='knn')
rf = Classifier(dataset=dataset, estimator=RandomForestClassifier,
                use_cache=CACHE, parameters=rf_param, name='rf')
et = Classifier(dataset=dataset, use_cache=CACHE,
                estimator=ExtraTreesClassifier, parameters=et_param, name='et')
lgb = Classifier(dataset=dataset, estimator=LGBMClassifier, use_cache=CACHE,
                 parameters=lgb_param, name='lgb')
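# mlp_param is defined above but the matching model is missing from this
# fragment; a minimal sketch of the presumed MLP base learner (an assumption,
# not in the original):
from sklearn.neural_network import MLPClassifier

mlp = Classifier(dataset=dataset, estimator=MLPClassifier,
                 use_cache=CACHE, parameters=mlp_param, name='mlp')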
train_xf = train_x[filtered_cols_t]
# val_xr = val_x[filtered_cols]
test_xf = test_x[filtered_cols_t]

dataset_full = Dataset(train_x.astype(np.float64), train_yt,
                       test_x.astype(np.float64))
dataset_f = Dataset(train_xf.astype(np.float64), train_yt,
                    test_xf.astype(np.float64))

xgb_params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 500,
    # 'num_class': 5,
    'objective': 'multi:softprob',
    'subsample': 0.8}

model_xgb = Classifier(dataset=dataset_full, estimator=xgb.sklearn.XGBClassifier,
                       parameters=xgb_params, name='xgb')
model_xgb_f = Classifier(dataset=dataset_f, estimator=xgb.sklearn.XGBClassifier,
                         parameters=xgb_params, name='xgb_f')
model_rf = Classifier(dataset=dataset_f, estimator=RandomForestClassifier,
                      parameters={'n_estimators': 700}, name='rf')

pipeline = ModelsPipeline(
    # model_xgb,
    model_rf,
    model_xgb_f)
stack_ds = pipeline.stack(k=5, full_test=True, seed=111)
stacker = Classifier(stack_ds, LogisticRegression)
stacker.validate(k=5, scorer=log_loss)
# logging.info(val_results)
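# Assumed follow-up, not in the original fragment: after validation, the
# stacker can produce the actual test-set probabilities via heamy's predict().
test_probs = stacker.predict()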
xg_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.1,
    'objective': 'multi:softprob',
    'num_class': 7,
    'max_depth': 4,
    'min_child_weight': 1,
    'eval_metric': 'mlogloss',
    'nrounds': 200
}
# ------------------------------------------------------------------------------
knn = Classifier(dataset=dataset, estimator=KNeighborsClassifier,
                 use_cache=CACHE, parameters=knn_param, name='knn')
rf = Classifier(dataset=dataset, estimator=RandomForestClassifier,
                use_cache=CACHE, parameters=rf_param, name='rf')
et = Classifier(dataset=dataset, estimator=ExtraTreesClassifier,
                use_cache=CACHE, parameters=et_param, name='et')
lgb = Classifier(dataset=dataset, estimator=LGBMClassifier,
                 use_cache=CACHE, parameters=lgb_param, name='lgb')
lr = Classifier(dataset=dataset, estimator=LogisticRegression,
                use_cache=CACHE, parameters=lr_param, name='lr')
xgf = Classifier(dataset=dataset, estimator=XGBClassifier,
                 use_cache=CACHE, parameters=xg_params, name='xgf')
# ------------------------------------------------------------------------------
# Stack the models; returns a new dataset built from out-of-fold predictions
pipeline = ModelsPipeline(knn, rf, et, lgb, lr)
stack_ds = pipeline.stack(k=NFOLDS, seed=1)
print(stack_ds.X_train.shape, stack_ds.X_test.shape)
# ------------------------------------------------------------------------------
dtrain = xgb.DMatrix(stack_ds.X_train, label=stack_ds.y_train)
dtest = xgb.DMatrix(stack_ds.X_test)
xgb_params = {
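# The xgb_params dict above is cut off in this fragment. A minimal sketch of
# the usual second-level continuation, with assumed parameters (not from the
# original): cross-validate to pick the number of rounds, then train and
# predict on the stacked features.
second_level_params = {
    'objective': 'multi:softprob',
    'num_class': 7,
    'eval_metric': 'mlogloss',
    'max_depth': 3,
    'eta': 0.1,
    'seed': 0,
}
cv = xgb.cv(second_level_params, dtrain, num_boost_round=200,
            nfold=NFOLDS, early_stopping_rounds=20, seed=1)
bst = xgb.train(second_level_params, dtrain, num_boost_round=len(cv))
stack_pred = bst.predict(dtest)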
from heamy.dataset import Dataset
from heamy.estimator import Classifier
from heamy.pipeline import ModelsPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from xgboost import XGBClassifier

# Create the dataset; to predict on the unlabeled training set, pass X_n in
# place of X_test (as done here)
dataset = Dataset(X_train, y_train, X_n)

model_xgb = Classifier(dataset=dataset, estimator=XGBClassifier,
                       parameters={
                           'reg_alpha': 0.01,
                           'n_estimators': 100,
                           'objective': 'binary:logistic',
                           'seed': 32,
                           'gamma': 0.4,
                           'colsample_bytree': 0.75,
                           'subsample': 0.8,
                       },
                       name='xgb')
model_xgb2 = Classifier(dataset=dataset, estimator=XGBClassifier,
                        parameters={
                            'seed': 128,
                            'gamma': 0.4,
                            'reg_alpha': 0.01,
                            'n_estimators': 100,
                            'objective': 'binary:logistic',
                            'colsample_bytree': 0.75,
data_train = pd.read_csv('data_analysis/data_train.csv', encoding='gb2312')
targets = data_train['TARGET']
train_data = data_train.drop(labels=['TARGET'], axis=1)
data_test = pd.read_csv('data_analysis/data_test.csv', encoding='gb2312')
test_data = data_test.drop(labels=['FORTARGET', 'PROB'], axis=1)
# ------------------------------------------------------- Split the sample set -----------------------------------#
# train_x, test_x, train_y, test_y = train_test_split(train_data, targets, test_size=0.5, random_state=66)
# create dataset
dataset = Dataset(train_data, targets, test_data)
# xgb = XGBClassifier(n_estimators=1350, scale_pos_weight=4, nthread=-1, seed=6, max_depth=3, min_child_weight=6,
#                     learning_rate=0.05, gamma=0, subsample=0.9, colsample_bytree=0.9, reg_alpha=8)
# -------------------------------------------------------- stacking model ----------------------#
model_rf1 = Classifier(dataset=dataset, estimator=RandomForestClassifier,
                       parameters={'n_estimators': 1000, 'max_depth': 19, 'criterion': 'entropy',
                                   'min_samples_split': 15, 'n_jobs': -1},
                       name='rf1')
model_rf2 = Classifier(dataset=dataset, estimator=RandomForestClassifier,
                       parameters={'n_estimators': 1000, 'max_depth': 19, 'criterion': 'gini',
                                   'min_samples_split': 15, 'n_jobs': -1},
                       name='rf2')
model_gdbt1 = Classifier(dataset=dataset, estimator=GradientBoostingClassifier,
                         parameters={'n_estimators': 600, 'loss': 'exponential', 'max_depth': 4,
                                     'min_samples_split': 10, 'min_weight_fraction_leaf': 0.01,
                                     'learning_rate': 0.06, 'random_state': 1},
                         name='gdbt1')
model_gdbt2 = Classifier(dataset=dataset, estimator=GradientBoostingClassifier,
                         parameters={'n_estimators': 600, 'loss': 'exponential', 'max_depth': 4,
                                     'min_samples_split': 10, 'min_weight_fraction_leaf': 0.01,
                                     'learning_rate': 0.07, 'random_state': 2},
                         name='gdbt2')
model_gdbt3 = Classifier(dataset=dataset, estimator=GradientBoostingClassifier,
                         parameters={'n_estimators': 600, 'loss': 'deviance', 'max_depth': 4,
                                     'min_samples_split': 10, 'min_weight_fraction_leaf': 0.01,
                                     'learning_rate': 0.07, 'random_state': 3},
                         name='gdbt3')
# These are XGBoost parameters (nthread, min_child_weight, gamma, reg_alpha),
# so the estimator must be XGBClassifier, not GradientBoostingClassifier as in
# the original.
model_xgbt = Classifier(dataset=dataset, estimator=XGBClassifier,
                        parameters={'n_estimators': 1350, 'nthread': -1, 'max_depth': 3,
                                    'min_child_weight': 6, 'learning_rate': 0.05, 'gamma': 0,
                                    'subsample': 0.9, 'colsample_bytree': 0.9, 'reg_alpha': 8},
                        name='xgbt')
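# Presumed next step (an assumption; the original fragment stops at the model
# definitions): stack the base learners and validate a second-level model.
from sklearn.metrics import roc_auc_score

pipeline = ModelsPipeline(model_rf1, model_rf2, model_gdbt1,
                          model_gdbt2, model_gdbt3, model_xgbt)
stack_ds = pipeline.stack(k=5, seed=111, full_test=True)
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression)
stacker.validate(k=5, scorer=roc_auc_score)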
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression,
                     parameters={'normalize': True}, name='lr')
model_knn = Regressor(dataset=dataset, estimator=KNeighborsRegressor,
                      parameters={'n_neighbors': 15}, name='knn')
model_lgt = Regressor(dataset=dataset, estimator=LogisticRegression,
                      parameters={'penalty': 'l2'}, name='lgt')
xgbclf = Classifier(dataset=dataset, estimator=XGBClassifier)

# Blend, then stack the four models
# stack() returns a new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr, model_knn, xgbclf)
weights = pipeline.find_weights(mean_absolute_error)
result = pipeline.weight(weights)

stack_ds = pipeline.stack(k=10, seed=111)
# Then train LinearRegression on the stacked data (the stacker must be fit on
# stack_ds, not on the raw dataset as in the original)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
results = stacker.validate(k=10, scorer=mean_absolute_error)
                      smooth_idf=1, sublinear_tf=1)
trn_term_doc = vec.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(trn_term_doc, y,
                                                    test_size=0.1,
                                                    random_state=111)
# X_train = X_train.toarray()
# X_test = X_test.toarray()
print(type(X_train))

# Create the dataset
dataset = Dataset(X_train, y_train, X_test, use_cache=False)
class_use_cache = False

# Create the base models (NB, LR, and SVM)
model_nb = Classifier(dataset=dataset, estimator=MultinomialNB,
                      name='nb', use_cache=class_use_cache)
model_lr = Classifier(dataset=dataset, estimator=LogisticRegression,
                      parameters={'C': 4, 'dual': True, 'n_jobs': -1},
                      name='lr', use_cache=class_use_cache)
model_svm = Classifier(dataset=dataset, estimator=svm.SVC,
                       parameters={'probability': True},
                       name='svm', use_cache=class_use_cache)
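# Presumed continuation (an assumption; the fragment ends at the model
# definitions): stack the three base models and validate a logistic-regression
# second layer.
from sklearn.metrics import log_loss

pipeline = ModelsPipeline(model_nb, model_lr, model_svm)
stack_ds = pipeline.stack(k=5, seed=111, full_test=True)
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression,
                     use_cache=False)
stacker.validate(k=5, scorer=log_loss)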
from sklearn.model_selection import train_test_split

"""Dataset setup"""
X_train = df_data.loc[df_data['sample'] == 'train', :].drop(
    ['id', 'issueDate', 'isDefault', 'sample'], axis=1)
X_test = df_data.loc[df_data['sample'] == 'test', :].drop(
    ['id', 'issueDate', 'isDefault', 'sample'], axis=1)
y_train = df_data.loc[df_data['sample'] == 'train', 'isDefault']

# Train/validation split
# X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)

from heamy.dataset import Dataset
from heamy.estimator import Classifier

model_dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test)
model_xgb = Classifier(dataset=model_dataset, estimator=xgb_model,
                       name='xgb', use_cache=False)
model_lgb = Classifier(dataset=model_dataset, estimator=lgb_model,
                       name='lgb', use_cache=False)

from heamy.pipeline import ModelsPipeline

pipeline = ModelsPipeline(model_xgb, model_lgb)
pipeline

# Build the first-layer meta-features. k defaults to 5 (5-fold cross-validation);
# with full_test=True each base learner is retrained on the full training set
# and then used to predict the test set.
stack_ds = pipeline.stack(k=5, seed=111, full_test=True)

from sklearn.linear_model import LogisticRegression

# Second layer: stack with logistic regression
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression,
                     parameters={'solver': 'lbfgs'})
# Predictions on the test set
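# Presumed final step (an assumption; the fragment ends at the comment above):
# generate the test-set predictions, and optionally cross-validate the stacker.
from sklearn.metrics import roc_auc_score

test_pred = stacker.predict()
stacker.validate(k=5, scorer=roc_auc_score)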