from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline
from sklearn.datasets import load_boston  # load_boston requires scikit-learn < 1.2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

# create dataset
Data = Dataset(X_train, y_train, X_test)

# initialize RandomForest & LinearRegression
RfModel = Regressor(dataset=Data, estimator=RandomForestRegressor,
                    parameters={'n_estimators': 50}, name='rf')
LRModel = Regressor(dataset=Data, estimator=LinearRegression,
                    parameters={'normalize': True}, name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
Pipeline = ModelsPipeline(RfModel, LRModel)
StackModel = Pipeline.stack(k=10, seed=2)

# Train LinearRegression on stacked data (second stage)
Stacker = Regressor(dataset=StackModel, estimator=LinearRegression)
Results = Stacker.predict()

# Validate results using 10-fold cross-validation
Results = Stacker.validate(k=10, scorer=mean_absolute_error)
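# A minimal sketch of two sanity checks that often precede the stacked second
# stage: scoring each base model on its own and trying a simple weighted
# average of the pipeline's predictions. It reuses RfModel, LRModel and
# Pipeline from above and assumes heamy's mean()/find_weights()/weight()
# pipeline helpers behave as documented in the project README.
RfModel.validate(k=10, scorer=mean_absolute_error)
LRModel.validate(k=10, scorer=mean_absolute_error)

# mean of the two models' predictions as a baseline
Pipeline.mean().validate(k=10, scorer=mean_absolute_error)

# search for blending weights that minimise MAE and validate them
weights = Pipeline.find_weights(mean_absolute_error)
Pipeline.weight(weights).validate(k=10, scorer=mean_absolute_error)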
# Base models built on features prepared earlier in the script
# (xgb_dataset, lgb_dataset, xgb_feature3, lgb_feature, gbdt_model,
#  model_xgb and model_xgb2 are defined above)
model_xgb3 = Regressor(dataset=xgb_dataset, estimator=xgb_feature3, name='xgb3', use_cache=False)
model_lgb = Regressor(dataset=lgb_dataset, estimator=lgb_feature, name='lgb', use_cache=False)
model_gbdt = Regressor(dataset=xgb_dataset, estimator=gbdt_model, name='gbdt', use_cache=False)

# Stack all five base models; full_test=True refits each base model on the
# full training set when producing its test-set predictions
pipeline = ModelsPipeline(model_xgb, model_xgb2, model_xgb3, model_lgb, model_gbdt)
stack_ds = pipeline.stack(k=5, seed=111, add_diff=False, full_test=True)

# Second stage: linear regression without an intercept on the stacked features
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression, parameters={'fit_intercept': False})
predict_result = stacker.predict()

# Build the submission: min-max scale the raw scores into [0, 1]
ans = pd.read_csv('../AI_risk_test_V3.0/test_list.csv', parse_dates=['appl_sbm_tm'])
ans['PROB'] = predict_result
ans = ans.drop(['appl_sbm_tm'], axis=1)
minmin, maxmax = min(ans['PROB']), max(ans['PROB'])
ans['PROB'] = ans['PROB'].map(lambda x: (x - minmin) / (maxmax - minmin))
ans['PROB'] = ans['PROB'].map(lambda x: '%.4f' % x)
ans.to_csv('./ans_stacking.csv', index=None)
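# This snippet goes straight from stacking to the submission file; a minimal
# sketch of cross-validating the second-stage model on stack_ds first.
# roc_auc_score is only a placeholder scorer; the competition metric is not
# shown in the snippet.
from sklearn.metrics import roc_auc_score

stacker = Regressor(dataset=stack_ds, estimator=LinearRegression,
                    parameters={'fit_intercept': False})
stacker.validate(k=5, scorer=roc_auc_score)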
# tail of the MLPClassifier definition started above
                     estimator=MLPClassifier, name="mlp", use_cache=class_use_cache)
model_sgt = Classifier(dataset=dataset, estimator=SGDClassifier,
                       parameters={'penalty': 'l1'}, name="sgd", use_cache=class_use_cache)

# Stack the base models
# Returns new dataset with out-of-fold predictions (model_svm and model_per
# were other candidates tried here)
logging.info('stack_ds....')
# pipeline = ModelsPipeline(model_nb, model_lr, model_svc)
pipeline = ModelsPipeline(model_sgt)
# pipeline = ModelsPipeline(model_nb), model_nb, model_lr, model_lr2
stack_ds = pipeline.stack(k=8, seed=111)

# Second stage: LinearSVC on the stacked features
logging.info('second layer....')
stacker = Classifier(dataset=stack_ds, estimator=svm.LinearSVC,
                     use_cache=False, probability=False)
results = stacker.predict()

# Validate the second stage with 3-fold cross-validation
results10 = stacker.validate(k=3, scorer=accuracy_score)
logging.info(results10)

result_list = list(results + 1)   # shift predicted labels back to the original range
test_id = list(test["id"])        # single brackets: the id values, not the column name
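# The excerpt stops after collecting result_list and test_id; a minimal sketch
# of joining them into a submission file. The column names 'id' and 'label'
# and the output filename are assumptions, not taken from the snippet.
import pandas as pd

submission = pd.DataFrame({'id': test_id, 'label': result_list})
submission.to_csv('submission.csv', index=False)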
# tail of the ExtraTrees classifier definition (et) started above
               parameters=et_param, name='et')
lgb = Classifier(dataset=dataset, estimator=LGBMClassifier, use_cache=CACHE,
                 parameters=lgb_param, name='lgb')
lr = Classifier(dataset=dataset, estimator=LogisticRegression, use_cache=CACHE,
                parameters=lr_param, name='lr')

# ------------------------------------------------------------------------------
# Stack the models and return a new dataset with out-of-fold predictions
pipeline = ModelsPipeline(knn, rf, et, lgb, lr)
stack_ds = pipeline.stack(k=NFOLDS, seed=1)

# Train LogisticRegression on stacked data (second stage)
lr1 = LogisticRegression
lr1_params = {
    'C': 5,
    'random_state': 1,
    'solver': 'liblinear',
    'multi_class': 'ovr',
}
stacker = Classifier(dataset=stack_ds, estimator=lr1, use_cache=False, parameters=lr1_params)

# Validate results using k-fold cross-validation
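# The snippet is cut off right after the validation comment; a plausible
# continuation mirroring the other examples. log_loss is an assumed scorer,
# since the original does not show which metric was used.
from sklearn.metrics import log_loss

results = stacker.validate(k=NFOLDS, scorer=log_loss)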
# remainder of the xgb_params dict started above
              'n_estimators': 500,
              # 'num_class': 5,
              'objective': 'multi:softprob',
              'subsample': 0.8}

model_xgb = Classifier(dataset=dataset_full, estimator=xgb.sklearn.XGBClassifier,
                       parameters=xgb_params, name='xgb')
model_xgb_f = Classifier(dataset=dataset_f, estimator=xgb.sklearn.XGBClassifier,
                         parameters=xgb_params, name='xgb_f')
model_rf = Classifier(dataset=dataset_f, estimator=RandomForestClassifier,
                      parameters={'n_estimators': 700}, name='rf')

pipeline = ModelsPipeline(
    # model_xgb,
    model_rf,
    model_xgb_f)
stack_ds = pipeline.stack(k=5, full_test=True, seed=111)

stacker = Classifier(stack_ds, LogisticRegression)
stacker.validate(k=5, scorer=log_loss)
# logging.info(val_results)

# leftover exploratory code, commented out in the original:
# logging.info(train_x.head(10))
# print(test_x.columns.difference(train_x.columns))
# boosters = np.array([])
# predictions = []
# print(xgb.cv(xgb_params, xgb.DMatrix(train_x, train_yt), nfold=5, num_boost_round=100,
#              early_stopping_rounds=10, metrics=["mlogloss"], verbose_eval=False))
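# A minimal sketch of producing the stacked test-set predictions once the
# validation score looks reasonable. It assumes heamy's Classifier keeps its
# default probability=True, so predict() returns class probabilities, which
# matches the multi:softprob objective used above.
stacker = Classifier(stack_ds, LogisticRegression)
test_proba = stacker.predict()
print(test_proba.shape)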
# Stacking example (uses the same heamy/sklearn imports as the first snippet)
from sklearn.datasets import load_boston

data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)

# create dataset
dataset = Dataset(X_train, y_train, X_test)

# initialize RandomForest & LinearRegression
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression,
                     parameters={'normalize': True}, name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=111)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()

# Validate results using 10-fold cross-validation
results = stacker.validate(k=10, scorer=mean_absolute_error)

# blend
# load boston dataset from sklearn
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)
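# The blend section is cut off right after the data split; a plausible
# continuation mirroring the stacking example above, using pipeline.blend(),
# which holds out part of the training data for the second stage. The
# proportion=0.2 value is an assumption.
dataset = Dataset(X_train, y_train, X_test)

model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression,
                     parameters={'normalize': True}, name='lr')

# Blend two models: second-stage features come from a held-out split
pipeline = ModelsPipeline(model_rf, model_lr)
blend_ds = pipeline.blend(proportion=0.2, seed=111)

# Train LinearRegression on blended data (second stage) and validate
stacker = Regressor(dataset=blend_ds, estimator=LinearRegression)
results = stacker.predict()
results = stacker.validate(k=10, scorer=mean_absolute_error)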
fs = ['xgb1', 'xgb2', 'xgb3', 'et', 'svm', 'lr', 'lgb', 'gbdt']

import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
    # heat-map of the 8x8 model-vs-model matrix, labelled with the names in fs
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(8)
    plt.xticks(tick_marks, fs, rotation=45)
    plt.yticks(tick_marks, fs)
    plt.tight_layout()

plot_confusion_matrix(cm, title='mic')
plt.show()

# Base models (xgb_feature2, logistic_model, lgb_model, gbdt_model and
# model_svm are defined earlier in the script)
model_xgb2 = Regressor(dataset=dataset, estimator=xgb_feature2, name='xgb2', use_cache=False)
model_lr = Regressor(dataset=dataset, estimator=logistic_model, name='lr', use_cache=False)
model_lgb = Regressor(dataset=dataset, estimator=lgb_model, name='lgb', use_cache=False)
model_gbdt = Regressor(dataset=dataset, estimator=gbdt_model, name='gbdt', use_cache=False)

pipeline = ModelsPipeline(model_xgb2, model_lr, model_lgb, model_svm)
stack_data = pipeline.stack(k=5, seed=0, add_diff=False, full_test=True)

stacker = Regressor(dataset=stack_data, estimator=LinearRegression,
                    parameters={'fit_intercept': False})
predict_result = stacker.predict()

# Min-max scale the raw scores into [0, 1] and format to four decimals
val = pd.read_csv('val_list.csv')
val['PROB'] = predict_result
minmin, maxmax = min(val['PROB']), max(val['PROB'])
val['PROB'] = val['PROB'].map(lambda x: (x - minmin) / (maxmax - minmin))
val['PROB'] = val['PROB'].map(lambda x: '%.4f' % x)
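# The matrix cm plotted above is not defined in this excerpt; since the axes
# are labelled with the eight model names in fs, it is most likely a
# model-vs-model correlation matrix of their predictions. A minimal sketch,
# assuming a hypothetical dict preds mapping each name in fs to a 1-D array
# of that model's predictions:
import numpy as np

pred_matrix = np.vstack([preds[name] for name in fs])  # shape (8, n_samples)
cm = np.corrcoef(pred_matrix)                          # shape (8, 8)
plot_confusion_matrix(cm, title='mic')
plt.show()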
# tail of the models list started earlier (the LinearRegression entry continues here)
              parameters={'normalize': True}, name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
    Regressor(dataset=dataset, estimator=CatBoostRegressor,
              parameters={
                  'custom_metric': ['MAE'],
                  'random_seed': seed,
                  'logging_level': 'Silent'
              },
              name='cr')
]

# define the pipeline and build the 2nd-level dataset
pipeline = ModelsPipeline(*models)
stack_ds = pipeline.stack(k=10, seed=seed)

# build the second-stage model and validate it
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
y_trues, y_preds = stacker.validate(k=10)
# print the accuracy

# predict using X_test
y_pred = stacker.predict()
print(y_pred)

sum = 0
buy = 0
for i, yosoku in enumerate(x_test):
    # use the precomputed predictions; heamy's predict() takes no per-sample argument
    if y_pred[i] < 0.3:
        if t_test[i] == 0: