def test_prediction():
    """predict() yields a 1-D output aligned with X_test, fresh and cached."""
    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)
    # Second iteration retrieves the cached prediction object.
    for _ in range(2):
        preds = model.predict()
        assert len(preds.shape) == 1
        assert model.dataset.X_test.shape[0] == preds.shape[0]
def test_stacking():
    """stack() preserves train/test row counts; full_test=False leaves no NaNs."""

    def _check_rowcounts(ds, model):
        # Stacked dataset must line up row-for-row with the source dataset.
        assert ds.X_train.shape[0] == model.dataset.X_train.shape[0]
        assert ds.X_test.shape[0] == model.dataset.X_test.shape[0]
        assert ds.y_train.shape[0] == model.dataset.y_train.shape[0]

    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)
    _check_rowcounts(model.stack(10), model)

    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)
    stacked = model.stack(10, full_test=False)
    assert np.isnan(stacked.X_train).sum() == 0
    _check_rowcounts(stacked, model)

    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)
    model.dataset.load()
    # Same call again: this pass should be served from the cache.
    stacked = model.stack(10, full_test=False)
    assert np.isnan(stacked.X_train).sum() == 0
    _check_rowcounts(stacked, model)
def test_blending():
    """blend() rows match a proportion=0.2 split, including the cached call."""
    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)
    _, _, holdout_X, _ = model.dataset.split(test_size=0.2)
    # Second iteration exercises cache retrieval.
    for _ in range(2):
        blended = model.blend(proportion=0.2)
        assert blended.X_test.shape[0] == model.dataset.X_test.shape[0]
        assert blended.X_train.shape[0] == holdout_X.shape[0]
def test_validation():
    """validate() returns aligned y_true/y_pred for k=10 and k=1, cached or not."""
    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)
    for folds in (10, 1):
        model.validate(k=folds)
        # Second call with identical arguments retrieves the cached object.
        y_true, y_pred = model.validate(k=folds)
        assert len(y_true) == len(y_pred)
        if folds == 1:
            assert len(y_true) == 1
def test_custom_estimators():
    """Estimators that produce no predictions are rejected with ValueError."""

    def null_estimator():
        return

    class NullClassifier(Classifier):
        def estimator(self):
            return

    # A Classifier subclass whose estimator() returns nothing must fail.
    with pytest.raises(ValueError):
        NullClassifier(dataset=TestDataset)
    # Same for a plain function passed as estimator=.
    with pytest.raises(ValueError):
        Regressor(estimator=null_estimator, dataset=TestDataset)
# Base estimator classes; instantiated later via heamy's Regressor wrapper.
rf = RandomForestRegressor
rf2 = RandomForestRegressor
rfs = RandomForestRegressor
###
#params_adb = {'n_estimators':300,'loss':'square','learning_rate':0.8}
##adb = AdaBoostRegressor
params_ext = {'max_features':'log2','n_estimators':500,'max_depth':12,'oob_score': True, 'n_jobs':4,'bootstrap':True}
ext = ExtraTreesRegressor
params_gbrt = {'loss':'huber','n_estimators': 400,'max_depth':12,'learning_rate': 0.01, 'random_state': 3}
gbrt = GradientBoostingRegressor
###stacking
# First-stage models.
# NOTE(review): `dataset`, `params_rf`, `rcv`, `params_rcv`, `br`, `params_br`,
# `knn` and `params_knn` are not defined in this fragment — presumably defined
# earlier in the file; confirm before running.
model_rf1 = Regressor(dataset=dataset, estimator=rf, parameters=params_rf,name='rf1')
#model_ext = Regressor(dataset=dataset, estimator=ext, parameters=params_ext,name='ext')
#model_rf2 = Regressor(dataset=dataset, estimator=rf2, parameters=params_rf2,name='rf2')
model_rcv = Regressor(dataset=dataset, estimator=rcv, parameters=params_rcv,name='rcv')
#model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt')
#model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv')
model_br = Regressor(dataset=dataset, estimator=br, parameters=params_br,name='br')
model_knn = Regressor(dataset=dataset, estimator=knn, parameters=params_knn,name='knn')
#blending = pipeline.blend(proportion=0.3,seed=111)
# Second-stage parameter dicts (params_las / params_rcv2 / params_lascv are
# defined here but not used in this visible fragment).
params_las = {'alpha':1.7}
params_rcv2 = {'cv':5,'normalize':True,'gcv_mode':'auto','scoring':'neg_mean_squared_error'}
params_lascv = {'max_iter':500,'cv':8}
# Stack only rf1 + knn; the other models above are built but not stacked.
pipeline = ModelsPipeline(model_rf1,model_knn)
stack_ds = pipeline.stack(k=5,seed=111)
# Fill missing values, then one-hot encode the categorical columns listed in
# `dummy_fea` and replace the originals with the dummy columns.
train_test_data = train_test_data.fillna(0)
dummy_df = pd.get_dummies(train_test_data.loc[:, dummy_fea])
train_test_data = pd.concat([train_test_data, dummy_df], axis=1)
train_test_data = train_test_data.drop(dummy_fea, axis=1)
# Split the combined frame back into the original train/test partitions by
# row position (train rows come first).
train_train = train_test_data.iloc[:train_data.shape[0], :]
test_test = train_test_data.iloc[train_data.shape[0]:, :]
train_train_x = train_train.drop(['target'], axis=1)
test_test_x = test_test.drop(['target'], axis=1)
# heamy Dataset; caching disabled so each run recomputes predictions.
xgb_dataset = Dataset(X_train=train_train_x, y_train=train_train['target'], X_test=test_test_x, y_test=None, use_cache=False)
#heamy
# First-stage models wrapping custom estimator functions (xgb_feature* etc.
# are defined elsewhere in the file — not visible here).
model_xgb = Regressor(dataset=xgb_dataset, estimator=xgb_feature, name='xgb', use_cache=False)
model_xgb2 = Regressor(dataset=xgb_dataset, estimator=xgb_feature2, name='xgb2', use_cache=False)
model_xgb3 = Regressor(dataset=xgb_dataset, estimator=xgb_feature3, name='xgb3', use_cache=False)
# NOTE(review): lgb_dataset is not defined in this fragment — confirm it is
# created elsewhere before this line runs.
model_lgb = Regressor(dataset=lgb_dataset, estimator=lgb_feature, name='lgb', use_cache=False)
# NOTE(review): the statement below is truncated in this chunk — the rest of
# the Regressor(...) call continues beyond the visible source.
model_gbdt = Regressor(dataset=xgb_dataset, estimator=gbdt_model,
ext = ExtraTreesRegressor ### params_gbrt = { 'loss': 'huber', 'n_estimators': 300, 'max_depth': 12, 'learning_rate': 0.01, 'random_state': 3 } gbrt = GradientBoostingRegressor ### params_las = {'alpha': 1} ###stacking model_rf = Regressor(dataset=dataset, estimator=rf, parameters=params_rf, name='rf') #model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt') model_ext = Regressor(dataset=dataset, estimator=ext, parameters=params_ext, name='ext') model_rcv = Regressor(dataset=dataset, estimator=rcv, parameters=params_rcv, name='rcv') #model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv') pipeline = ModelsPipeline(model_rf, model_rcv, model_ext) stack_ds = pipeline.stack(k=5, seed=111) stacker = Regressor(dataset=stack_ds,
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111) # create dataset dataset = Dataset(X_train,y_train,X_test) # initialize RandomForest & LinearRegression model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50},name='rf') model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True},name='lr') # Stack two models # Returns new dataset with out-of-fold predictions pipeline = ModelsPipeline(model_rf,model_lr) stack_ds = pipeline.stack(k=10,seed=111) # Train LinearRegression on stacked data (second stage) stacker = Regressor(dataset=stack_ds, estimator=LinearRegression) results = stacker.predict() # Validate results using 10 fold cross-validation results = stacker.validate(k=10,scorer=mean_absolute_error) #blend # load boston dataset from sklearn from sklearn.datasets import load_boston data = load_boston() X, y = data['data'], data['target'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111) # create dataset dataset = Dataset(X_train,y_train,X_test)
# NOTE(review): stray closing brace — the dict it terminates starts before
# this visible chunk.
}
from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline
from catboost import CatBoostRegressor
#import adaboost
# Prepare the dataset.
# X_test is unused this time, but omitting it raises an error.
dataset = Dataset(x_test, t_test, x_train2)
# Define the models used in the ensemble.
# NOTE(review): the list below is truncated in this chunk — it continues
# beyond the visible source.
models = [
    Regressor(dataset=dataset,
              estimator=RandomForestRegressor,
              parameters={ 'n_estimators': 50, 'random_state': seed },
              name='rf'),
    Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
    Regressor(dataset=dataset,
              estimator=CatBoostRegressor,
              parameters={ 'custom_metric': ['MAE'], 'random_seed': seed, 'logging_level': 'Silent' },
# Base estimator classes; instantiated later via heamy's Regressor wrapper.
rf = RandomForestRegressor
rf2 = RandomForestRegressor
rfs = RandomForestRegressor
###
params_adb = {'n_estimators':500,'loss':'square','learning_rate':0.02}
adb = AdaBoostRegressor
params_ext = {'max_features':'log2','n_estimators':500,'max_depth':12,'oob_score': True, 'n_jobs':4,'bootstrap':True}
ext = ExtraTreesRegressor
params_gbrt = {'loss':'huber','n_estimators': 400,'max_depth':12,'learning_rate': 0.01, 'random_state': 3}
gbrt = GradientBoostingRegressor
###stacking
# First-stage models.
# NOTE(review): `dataset`, `params_rf`, `rcv`, `params_rcv`, `br`, `params_br`,
# `knn` and `params_knn` are not defined in this fragment — presumably earlier
# in the file; confirm before running.
model_rf1 = Regressor(dataset=dataset, estimator=rf, parameters=params_rf,name='rf1')
model_ext = Regressor(dataset=dataset, estimator=ext, parameters=params_ext,name='ext')
#model_rf2 = Regressor(dataset=dataset, estimator=rf2, parameters=params_rf2,name='rf2')
model_rcv = Regressor(dataset=dataset, estimator=rcv, parameters=params_rcv,name='rcv')
model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt')
#model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv')
model_br = Regressor(dataset=dataset, estimator=br, parameters=params_br,name='br')
model_knn = Regressor(dataset=dataset, estimator=knn, parameters=params_knn,name='knn')
model_adb = Regressor(dataset=dataset, estimator=adb, parameters=params_adb,name='adb')
# Only rf1 + knn + rcv go into the pipeline; the stacking/blending calls and
# the second-stage stacker are currently commented out.
pipeline = ModelsPipeline(model_rf1,model_knn,model_rcv)
#stack_ds = pipeline.stack(k=5,seed=111)
#blending = pipeline.blend(proportion=0.3,seed=111)
params_las = {'alpha':1.7}
params_rcv2 = {'cv':5,'normalize':True,'gcv_mode':'auto','scoring':'neg_mean_absolute_error'}
#stacker = Regressor(dataset=stack_ds,estimator=rcv, parameters=params_rcv2)
#y_pre = stacker.predict()
#stack # load boston dataset from sklearn from sklearn.datasets import load_boston data = load_boston() X, y = data['data'], data['target'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111) # create dataset dataset = Dataset(X_train, y_train, X_test) # initialize RandomForest & LinearRegression model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50}, name='rf') model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr') # Stack two models # Returns new dataset with out-of-fold predictions pipeline = ModelsPipeline(model_rf, model_lr) stack_ds = pipeline.stack(k=10, seed=111) # Train LinearRegression on stacked data (second stage) stacker = Regressor(dataset=stack_ds, estimator=LinearRegression) results = stacker.predict() # Validate results using 10 fold cross-validation
from heamy.cache import np_hash
from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.utils.main import generate_columns, group_models, report_score
from heamy.utils.optimizer import Optimizer


def boston_dataset():
    """Load Boston housing and return a 90/10 train/test split.

    Used as the `preprocessor` callable for the cached Dataset below.
    """
    data = load_boston()
    X, y = data['data'], data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)
    return X_train, y_train, X_test, y_test


# Shared fixtures for the tests in this module.
dataset = Dataset(preprocessor=boston_dataset, use_cache=True)
model = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True})
model_2 = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50})


def test_generate_columns():
    """generate_columns() yields one prefixed name per column (or one for 1-D)."""
    x = np.random.rand(100, 10)
    output = generate_columns(x, 'test')
    assert len(output) == 10
    assert all([True if col.find('test') == 0 else False for col in output])
    # 1-D input: the prefix itself is the single column name.
    output = generate_columns(np.random.rand(100), 'test')
    assert output[0] == 'test'


def test_optimizer():
    # NOTE(review): this test body may be truncated in the visible chunk —
    # it only constructs the Optimizer and asserts nothing.
    opt = Optimizer([model, model_2], scorer=mean_absolute_error)
class TestEstimator(Classifier):
    # Minimal Classifier subclass used as a hashing fixture.  The exact source
    # of this method matters: test_hashing below pins the hash of this
    # estimator, so do not edit the code.
    @staticmethod
    def estimator(X_train, y_train, X_test, y_test=None):
        return np.zeros((2, X_test.shape[0]))
        # return np.zeros(X_test.shape[0])


def func_estimator(X_train, y_train, X_test, y_test):
    # Plain-function estimator fixture; its source is hashed in test_hashing.
    return np.zeros(X_test.shape[0])


def random_param():
    # Callable parameter value — exercises hashing of non-literal parameters.
    return random.randint(1, 100)


# Module-level model fixtures whose string representations are asserted below.
model_func = Regressor(estimator=func_estimator, dataset=TestDataset)
model_cls = TestEstimator(dataset=TestDataset())
model_param = Regressor(estimator=LinearRegression, parameters={'random_param': random_param}, dataset=TestDataset)
model_param2 = Classifier(estimator=LogisticRegression, parameters={'colsample_bylevel': 0.9}, dataset=TestDataset)


def test_hashing():
    """str(model) embeds a stable hash of the estimator definition/parameters."""
    assert str(
        model_func) == 'func_estimator(54743c7a5484d1bf2a64ac1d7b68f8cc)'
    assert str(model_cls) == 'TestEstimator(da29cb8766f96e6561a51e8e3c13f661)'
    assert str(
        model_param) == 'LinearRegression(2e789a766f6dc2457fb6a63452ad2859)'
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# NOTE(review): `Dataset` is used below but not imported in this fragment —
# presumably `from heamy.dataset import Dataset` appears earlier in the file.
data = load_boston()
X, y = data['data'], data['target']
# Hold out 10% of the data for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)
# create dataset
Data = Dataset(X_train,y_train,X_test)
# initialize RandomForest & LinearRegression
RfModel = Regressor(dataset=Data, estimator=RandomForestRegressor, parameters={'n_estimators': 50},name='rf')
LRModel = Regressor(dataset=Data, estimator=LinearRegression, parameters={'normalize': True},name='lr')
# Stack two models
# Returns new dataset with out-of-fold predictions
Pipeline = ModelsPipeline(RfModel,LRModel)
StackModel = Pipeline.stack(k=10,seed=2)
# Train LinearRegression on stacked data (second stage)
Stacker = Regressor(dataset=StackModel, estimator=LinearRegression)
Results = Stacker.predict()
# Validate results using 10 fold cross-validation
Results = Stacker.validate(k=10,scorer=mean_absolute_error)
data_X = pd.DataFrame(data_X)
data_y = pd.DataFrame(data_y)
# Train/test split.
# train_X, test_X, train_y, test_y = train_test_split(data_X, data_y1, test_size=0.8, random_state=7)
# print(np.shape(train_X))
# print("################模型融合##############")
x_train, x_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.33, random_state=2018)
y_train = np.asarray(y_train).reshape(-1, 1)
# Create the dataset (heamy expects a 1-D target, hence ravel()).
dataset = Dataset(x_train, y_train.ravel(), x_test)
# Create the LR / RF / GBDT base models.
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr')
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50}, name='rf')
# NOTE(review): the statement below is truncated in this chunk — the rest of
# the Regressor(...) call continues beyond the visible source.
model_gbdt = Regressor(dataset=dataset,
                       estimator=GradientBoostingRegressor,
                       parameters={ 'n_estimators': 50, 'learning_rate': 0.05, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 15, 'min_samples_split': 10 },