def test_properties():
    d = Dataset(X_train, y_train, X_test, y_test)
    assert d.X_train is X_train
    assert d.y_train is y_train
    assert d.X_test is X_test
    assert d.y_test is y_test

    d = Dataset(X_train, y_train)
    assert d.X_test is None
    assert d.y_test is None
def test_merge():
    d1 = Dataset(X_train, y_train, X_test, y_test)
    d2 = Dataset(X_train, y_train, X_test, y_test)
    ds = d1.merge(d2)
    assert ds.X_train.shape[1] == 26
    assert ds.X_test.shape[1] == 26

    d1 = Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train), pd.DataFrame(X_test), pd.DataFrame(y_test))
    d2 = Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train), pd.DataFrame(X_test), pd.DataFrame(y_test))
    d1.merge(d2, inplace=True)
    assert d1.X_train.shape[1] == 26
    assert d1.X_test.shape[1] == 26
def test_initialization():
    Dataset(X_train, y_train, X_test, y_test)
    Dataset(X_train, y_train, X_test)
    Dataset(X_train, y_train)
    Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train), pd.DataFrame(X_test))

    # with pytest.raises(ValueError):
    #     Dataset()

    Dataset(preprocessor=preprocess)
    CustomDataset()
def test_hashing():
    assert Dataset(X_train, y_train, X_test, y_test).hash == '0cb7e710b7319bb71e7328e4b422b374'
    assert Dataset(X_train, y_train, X_test).hash == 'c9b316f827981b3d0b53f8ab139234ea'
    assert Dataset(
        pd.DataFrame(X_train), pd.DataFrame(y_train),
        pd.DataFrame(X_test)).hash == 'c9b316f827981b3d0b53f8ab139234ea'
    assert Dataset(
        np.asfortranarray(X_train), np.asfortranarray(y_train),
        np.asfortranarray(X_test)).hash == '8087697aa8460a25314edc85cc915ec8'

    d_hash = TestDataset().hash
    assert d_hash == TestDataset().hash
def test_hashing():
    assert Dataset(X_train, y_train, X_test, y_test).hash == '13fceb92d1485772af58252810646711'
    assert Dataset(X_train, y_train, X_test).hash == '116d39a012c2b54df573a8b8d0eae85c'
    assert Dataset(
        pd.DataFrame(X_train), pd.DataFrame(y_train),
        pd.DataFrame(X_test)).hash == '116d39a012c2b54df573a8b8d0eae85c'
    assert Dataset(
        np.asfortranarray(X_train), np.asfortranarray(y_train),
        np.asfortranarray(X_test)).hash == 'e2ebd8834b5b3bfb6b9ff3e4697ceb24'

    d_hash = CustomDataset().hash
    assert d_hash == CustomDataset().hash
def test_shapes():
    x_t = np.random.rand(100, 5)
    y_t = np.random.rand(5, 1)

    with pytest.raises(ValueError):
        assert Dataset(X_train, y_train, x_t, y_test)
    with pytest.raises(ValueError):
        assert Dataset(X_train, y_t, x_t, y_test)
    with pytest.raises(ValueError):
        assert Dataset(X_train, y_train, X_test, y_t)
    with pytest.raises(ValueError):
        assert Dataset(X_train, y_train, x_t)
def test_slicing():
    train_index = np.array(range(100))
    test_index = np.array(range(100, 250))

    d = Dataset(X_train, y_train, X_test, y_test)
    Xtrain, ytrain, Xtest, ytest = d.split(indices=(train_index, test_index))
    assert Xtrain.shape[0] == 100
    assert ytrain.shape[0] == Xtrain.shape[0]
    assert Xtest.shape[0] == 150
    assert Xtest.shape[0] == ytest.shape[0]

    d = Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train), pd.DataFrame(X_test), pd.DataFrame(y_test))
    Xtrain, ytrain, Xtest, ytest = d.split(indices=(train_index, test_index))
    assert Xtrain.shape[0] == 100
    assert ytrain.shape[0] == Xtrain.shape[0]
    assert Xtest.shape[0] == 150
    assert Xtest.shape[0] == ytest.shape[0]
def test_split():
    d = Dataset(X_train, y_train)
    d.split(inplace=True)
    assert d.X_test is not None
    assert d.y_test is not None

    d = Dataset(X_train, y_train)
    data = d.split(inplace=False)
    assert all(x is not None for x in data)

    d = Dataset(X_train, y_train)
    data = d.split(inplace=False)
    assert all([x is not None for x in data])

    d = Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train))
    train_ind = np.array(range(0, 70))
    test_ind = np.array(range(30, 100))
    data = d.split(indices=[train_ind, test_ind])
    assert isinstance(data[0], (pd.DataFrame, pd.Series))
    assert isinstance(data[2], (pd.DataFrame, pd.Series))
def stacking(self):
    from heamy.dataset import Dataset
    from heamy.estimator import Regressor
    from heamy.pipeline import ModelsPipeline

    # %%time  (Jupyter cell magic in the original notebook; kept as a comment since it is not valid inside a function)
    dataset = Dataset(X_train, y_train, X_test)
    models_dic = {
        'random forest': RandomForestRegressor(n_estimators=50, random_state=seed),
        'linear regression': LinearRegression(normalize=True),
        'knn': KNeighborsRegressor(),
        'catboost': CatBoostRegressor(custom_metric=['MAE'], random_seed=seed, logging_level='Silent')
    }
    for name, model in models_dic.items():
        kfold = KFold(n_splits=10, random_state=seed)
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="neg_mean_absolute_error")
        print(f'{name} : {-np.mean(cv_results):.2f}')
""" Created on Mon Oct 29 15:25:05 2018 @author: Administrator """ from heamy.dataset import Dataset from heamy.estimator import Classifier from heamy.pipeline import ModelsPipeline from sklearn.linear_model import LogisticRegression from sklearn.metrics import roc_auc_score import lightgbm as lgb from xgboost import XGBClassifier #创建数据集 dataset = Dataset(X_train, y_train, X_n) #对无标签训练集进行预测时将X_test替换为X_n model_xgb = Classifier(dataset=dataset, estimator=XGBClassifier, parameters={ 'reg_alpha': 0.01, 'n_estimators': 100, 'objective': 'binary:logistic', 'seed': 32, 'gamma': 0.4, 'colsample_bytree': 0.75, 'subsample': 0.8, }, name='xgb') model_xgb2 = Classifier(dataset=dataset,
"""对训练集数据进行划分,分成训练集和验证集,并进行相应的操作""" from sklearn.model_selection import train_test_split """数据集设置""" X_train = df_data.loc[df_data['sample']=='train', :].drop(['id','issueDate','isDefault', 'sample'], axis=1) X_test = df_data.loc[df_data['sample']=='test', :].drop(['id','issueDate','isDefault', 'sample'], axis=1) y_train = df_data.loc[df_data['sample']=='train', 'isDefault'] # 数据集划分 # X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2) from heamy.dataset import Dataset from heamy.estimator import Classifier model_dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test) model_xgb = Classifier(dataset=model_dataset, estimator=xgb_model, name='xgb', use_cache=False) model_lgb = Classifier(dataset=model_dataset, estimator=lgb_model, name='lgb', use_cache=False) from heamy.pipeline import ModelsPipeline pipeline = ModelsPipeline(model_xgb, model_lgb) pipeline # 构建第一层新特征,其中k默认是5,表示5折交叉验证,full_test=True,对全部训练集进行训练得到基学习器,然后用基学习器对测试集预测得到新特征 stack_ds = pipeline.stack(k=5, seed=111, full_test=True) from sklearn.linear_model import LogisticRegression # 第二层使用逻辑回归进行stack LogisticRegression(solver='lbfgs') stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={'solver': 'lbfgs'})
vec = TfidfVectorizer(ngram_range=(1, 1), min_df=3, max_df=0.8, use_idf=1, smooth_idf=1, sublinear_tf=1)
trn_term_doc = vec.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(trn_term_doc, y, test_size=0.1, random_state=111)
# X_train = X_train.toarray()
# X_test = X_test.toarray()
print(type(X_train))

# Create the dataset
dataset = Dataset(X_train, y_train, X_test, use_cache=False)
class_use_cache = False

# Create the NB and LR models
model_nb = Classifier(dataset=dataset, estimator=MultinomialNB, name='nb', use_cache=class_use_cache)
model_lr = Classifier(dataset=dataset,
                      estimator=LogisticRegression,
                      parameters={
                          'C': 4,
                          'dual': True,
                          'n_jobs': -1
                      },
                      name='lr',
                      use_cache=class_use_cache)
    LinearRegression(normalize=True),
    'knn': KNeighborsRegressor(),
    'catboost': CatBoostRegressor(custom_metric=['MAE'], random_seed=seed, logging_level='Silent')
}

from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline
from catboost import CatBoostRegressor
# import adaboost

# Prepare the dataset
dataset = Dataset(x_test, t_test, x_train2)  # X_test is not used here, but omitting it raises an error

# Define the models used in the ensemble
models = [
    Regressor(dataset=dataset, estimator=RandomForestRegressor,
              parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),
    Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
from heamy.dataset import Dataset
from sklearn.svm import SVC

# ---------------------------------------------------- Load the data ----------------------------------------------------
data_train = pd.read_csv('data_analysis/data_train.csv', encoding='gb2312')
targets = data_train['TARGET']
train_data = data_train.drop(labels=['TARGET'], axis=1)
data_test = pd.read_csv('data_analysis/data_test.csv', encoding='gb2312')
test_data = data_test.drop(labels=['FORTARGET', 'PROB'], axis=1)

# ---------------------------------------------------- Split the samples ----------------------------------------------------
# train_x, test_x, train_y, test_y = train_test_split(train_data, targets, test_size=0.5, random_state=66)

# create dataset
# dataset = Dataset(train_data, targets, test_data)
dataset = Dataset(train_data, targets, test_data)
# xgb = XGBClassifier(n_estimators=1350, scale_pos_weight=4, nthread=-1, seed=6, max_depth=3, min_child_weight=6,
#                     learning_rate=0.05, gamma=0, subsample=0.9, colsample_bytree=0.9, reg_alpha=8)

# ---------------------------------------------------- Stacking models ----------------------------------------------------
model_rf1 = Classifier(dataset=dataset, estimator=RandomForestClassifier,
                       parameters={'n_estimators': 1000, 'max_depth': 19, 'criterion': 'entropy',
                                   'min_samples_split': 15, 'n_jobs': -1}, name='rf1')
model_rf2 = Classifier(dataset=dataset, estimator=RandomForestClassifier,
                       parameters={'n_estimators': 1000, 'max_depth': 19, 'criterion': 'gini',
                                   'min_samples_split': 15, 'n_jobs': -1}, name='rf2')
model_gdbt1 = Classifier(dataset=dataset, estimator=GradientBoostingClassifier,
                         parameters={'n_estimators': 600, 'loss': 'exponential', 'max_depth': 4,
                                     'min_samples_split': 10, 'min_weight_fraction_leaf': 0.01,
                                     'learning_rate': 0.06, 'random_state': 1}, name='gdbt1')
model_gdbt2 = Classifier(dataset=dataset, estimator=GradientBoostingClassifier,
                         parameters={'n_estimators': 600, 'loss': 'exponential', 'max_depth': 4,
                                     'min_samples_split': 10, 'min_weight_fraction_leaf': 0.01,
                                     'learning_rate': 0.07, 'random_state': 2}, name='gdbt2')
model_gdbt3 = Classifier(dataset=dataset, estimator=GradientBoostingClassifier,
                         parameters={'n_estimators': 600, 'loss': 'deviance',
from heamy.cache import np_hash
from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.utils.main import generate_columns, group_models, report_score
from heamy.utils.optimizer import Optimizer


def boston_dataset():
    data = load_boston()
    X, y = data['data'], data['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)
    return X_train, y_train, X_test, y_test


dataset = Dataset(preprocessor=boston_dataset, use_cache=True)
model = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True})
model_2 = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50})


def test_generate_columns():
    x = np.random.rand(100, 10)
    output = generate_columns(x, 'test')
    assert len(output) == 10
    assert all([col.find('test') == 0 for col in output])

    output = generate_columns(np.random.rand(100), 'test')
    assert output[0] == 'test'


def test_optimizer():
stacking_pre = []
Y_test = []
err_shop_rf = []
err_shop_gbrt = []
while i < 500:
    # readfile
    X = []
    Y = readfile_oneshop_Y(fr2)
    start = len(Y)
    # print(start)
    X = readfile_oneshop_X(fr1, xday, xweekend, xweekday, xholiday)[-(start + 14):-14]
    x_train = X[:-7]
    y_train = Y[:-7]
    x_test = X[-7:]
    y_test = Y[-7:]
    dataset = Dataset(x_train, y_train, x_test)

    ###
    params_rcv = {'cv': 8, 'normalize': True, 'gcv_mode': 'auto', 'scoring': 'neg_mean_squared_error'}
    rcv = RidgeCV
    params_lascv = {'n_jobs': 4, 'cv': 8}
    lascv = LassoCV
    params_rf = {'n_estimators': 500, 'max_depth': 10, 'min_samples_split': 2, 'warm_start': True,
                 'n_jobs': 4, 'oob_score': True, 'max_features': 'log2'}
    params_rf2 = {'n_estimators': 400, 'max_depth': 10, 'min_samples_split': 2, 'warm_start': True,
                  'n_jobs': 4, 'oob_score': True, 'max_features': 'log2'}
    params_br = {'n_iter': 300}
    br = BayesianRidge
    params_knn = {'n_neighbors': 14, 'algorithm': 'auto'}
    knn = KNeighborsRegressor
    rf = RandomForestRegressor
    rf2 = RandomForestRegressor
def test_cache():
    d = Dataset(X_train, y_train, X_test, y_test)
    assert not d._cache()

    cache_dir = '.cache/heamy/'
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)

    d = CustomDataset()
    d._cache()

    d = CustomDataset(use_cache=True)
    d.load()
    assert d.loaded

    d = CustomDataset(use_cache=False)
    d.load()
    assert d.loaded

    d = CustomDataset3(use_cache=True)
    d.load()
    d._cache()

    d = CustomDataset3(use_cache=True)
    d.load()
    assert isinstance(d.X_train, (pd.DataFrame, pd.Series))
scaler = StandardScaler()
data_X = scaler.fit_transform(data_X)
data_y = scaler.fit_transform(data_y)
data_X = pd.DataFrame(data_X)
data_y = pd.DataFrame(data_y)

# Train/test split
# train_X, test_X, train_y, test_y = train_test_split(data_X, data_y1, test_size=0.8, random_state=7)
# print(np.shape(train_X))
# print("################ Model ensembling ##############")
x_train, x_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.33, random_state=2018)
y_train = np.asarray(y_train).reshape(-1, 1)

# Create the dataset
dataset = Dataset(x_train, y_train.ravel(), x_test)

# Create the LR, RF, and GBDT models
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr')
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50}, name='rf')
model_gbdt = Regressor(dataset=dataset, estimator=GradientBoostingRegressor,
                       parameters={
                           'n_estimators': 50,
                           'learning_rate': 0.05,
                           'max_depth': 4,
logging.info('read from .....')
with open('../input_data/trn_term_doc_13.pil', 'rb') as f:
    trn_term_doc = pickle.load(f)
with open('../input_data/test_term_doc_13.pil', 'rb') as f:
    test_term_doc = pickle.load(f)

X_train, X_test, y_train, y_test = train_test_split(trn_term_doc, y, test_size=0.01, random_state=111)
print('tttt')
# X_train = X_train.toarray()
# X_test = X_test.toarray()
print('to array')

# Create the dataset
dataset = Dataset(X_train, y_train, test_term_doc, use_cache=False)

# Create the NB and LR models
class_use_cache = False
model_nb = Classifier(dataset=dataset, estimator=MultinomialNB, name='nb', use_cache=class_use_cache)
model_lr = Classifier(dataset=dataset,
                      estimator=LogisticRegression,
                      parameters={
                          'C': 4,
                          'dual': True,
                          'n_jobs': -1
                      },
                      name='lr',
def test_repr():
    assert str(Dataset(X_train, y_train, X_test, y_test)) == 'Dataset(13fceb92d1485772af58252810646711)'
def test_cache():
    d = Dataset(X_train, y_train, X_test, y_test)
    assert not d._cache()

    cache_dir = '.cache/heamy/'
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)

    d = TestDataset()
    d._cache()

    d = TestDataset(use_cache=True)
    d.load()
    assert d.loaded

    d = TestDataset(use_cache=False)
    d.load()
    assert d.loaded

    d = TestDataset3(use_cache=True)
    d.load()
    d._cache()

    d = TestDataset3(use_cache=True)
    d.load()
    assert isinstance(d.X_train, (pd.DataFrame, pd.Series))
def test_repr():
    assert str(Dataset(X_train, y_train, X_test, y_test)) == 'Dataset(0cb7e710b7319bb71e7328e4b422b374)'
    'n_estimators': 500,
    'num_class': 5,
    'objective': 'multi:softprob',
    'subsample': 0.8}

bst = xgb.train(xgb_params, xgb.DMatrix(train_x, train_yt))
fscores = bst.get_fscore()
filtered_fscores_t = {k: v for k, v in fscores.items() if v > 0}
filtered_cols_t = list(filtered_fscores_t.keys())
train_xf = train_x[filtered_cols_t]
# val_xr = val_x[filtered_cols]
test_xf = test_x[filtered_cols_t]

dataset_full = Dataset(train_x.astype(np.float64), train_yt, test_x.astype(np.float64))
dataset_f = Dataset(train_xf.astype(np.float64), train_yt, test_xf.astype(np.float64))

xgb_params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 500,
    # 'num_class': 5,
    'objective': 'multi:softprob',
    'subsample': 0.8}

model_xgb = Classifier(dataset=dataset_full, estimator=xgb.sklearn.XGBClassifier, parameters=xgb_params, name='xgb')
model_xgb_f = Classifier(dataset=dataset_f, estimator=xgb.sklearn.XGBClassifier, parameters=xgb_params, name='xgb_f')
model_rf = Classifier(dataset=dataset_f, estimator=RandomForestClassifier, parameters={'n_estimators': 700}, name='rf')
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline

# stack
# load boston dataset from sklearn
from sklearn.datasets import load_boston

data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)

# create dataset
dataset = Dataset(X_train, y_train, X_test)

# initialize RandomForest & LinearRegression
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=111)
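# Hedged continuation based on heamy's documented stacking workflow (not part of the snippet above):
# train a second-stage LinearRegression on the out-of-fold predictions, validate it, then predict.
from sklearn.metrics import mean_absolute_error

stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.validate(k=10, scorer=mean_absolute_error)  # 10-fold CV on the stacked features
predictions = stacker.predict()                               # predictions for the original X_test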
    test_term_doc = pickle.load(f)
with open('../input_data/trn_term_doc_wc_13.pil', 'rb') as f:
    trn_term_doc_wc = pickle.load(f)
with open('../input_data/test_term_doc_wc_13.pil', 'rb') as f:
    test_term_doc_wc = pickle.load(f)

X_train, X_test, y_train, y_test = train_test_split(trn_term_doc, y, test_size=0.01, random_state=111)
X_train_wc, X_test_wc, y_train_wc, y_test_wc = train_test_split(trn_term_doc_wc, y, test_size=0.01, random_state=111)
print('tttt')
# X_train = X_train.toarray()
# X_test = X_test.toarray()
print('to array')

# Create the datasets
dataset = Dataset(X_train, y_train, test_term_doc, use_cache=False)
dataset_wc = Dataset(X_train_wc, y_train_wc, test_term_doc_wc, use_cache=False)

# Create the base models
class_use_cache = False
model_nb = Classifier(dataset=dataset_wc, estimator=MultinomialNB, name='nb', use_cache=class_use_cache)
model_lr = Classifier(dataset=dataset, estimator=LogisticRegression,
                      parameters={'C': 4, 'dual': True, 'n_jobs': -1},
                      name='lr', use_cache=class_use_cache)
model_lr2 = Classifier(dataset=dataset, estimator=LogisticRegression,
                       parameters={'C': 4, 'multi_class': 'multinomial', 'solver': 'sag', 'dual': False,
                                   'n_jobs': -1},
                       name='lr2', use_cache=class_use_cache)
model_svm = Classifier(dataset=dataset, estimator=svm.SVC, parameters={'probability': True},
                       name='svm', use_cache=class_use_cache)
model_svc = Classifier(dataset=dataset, estimator=svm.LinearSVC, name='LinearSVC', use_cache=class_use_cache)
model_knn = Classifier(dataset=dataset, estimator=KNeighborsClassifier, name='knn', use_cache=class_use_cache)

# Stack the models
# Returns a new dataset with out-of-fold predictions
logging.info('stack_ds....')
pipeline = ModelsPipeline(model_knn)
# pipeline = ModelsPipeline(model_nb, model_lr, model_lr2)
        test[c] *= weights[i]

    train = dtrn_.values
    test = dtst_.values
    y = y_train.ravel() - 1
    return {'X_train': train, 'X_test': test, 'y_train': y}  # dtrn_, dtst_, y_train

# ------------------------------------------------------------------------------
# create the dataset
dataset = Dataset(preprocessor=preprocessData, use_cache=True)
# ------------------------------------------------------------------------------

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
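# Minimal illustration (hypothetical names, mirroring the dict returned by preprocessData above and the
# tuple-returning boston_dataset preprocessor elsewhere in this file): a Dataset preprocessor is simply a
# callable that builds the train/test data, either as a dict with 'X_train'/'y_train'/'X_test' keys or as a
# (X_train, y_train, X_test, y_test) tuple.
import numpy as np

def tiny_preprocessor():
    rng = np.random.RandomState(0)
    X, y = rng.rand(100, 3), rng.rand(100)
    return {'X_train': X[:80], 'y_train': y[:80], 'X_test': X[80:]}  # dict form, as in preprocessData above

tiny_dataset = Dataset(preprocessor=tiny_preprocessor, use_cache=False)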
def test_convertion():
    d1 = Dataset(X_train, y_train, X_test, y_test)
    d1.to_csc()
    d1.to_dense()

    d1 = Dataset(X_train, y_train, X_test, y_test)
    d1.to_csr()
    d1.to_dense()

    d1 = Dataset(X_train, y_train, X_test, y_test)
    d1.to_dense()
dummy_fea = ['sex', 'merriage', 'income', 'qq_bound', 'degree', 'wechat_bound', 'account_grade', 'industry']
for _fea in dummy_fea:
    print(_fea)
    le = LabelEncoder()
    le.fit(train_data[_fea].tolist() + test_data[_fea].tolist())
    tmp = le.transform(train_data[_fea].tolist() + test_data[_fea].tolist())
    train_data[_fea] = tmp[:train_data.shape[0]]
    test_data[_fea] = tmp[train_data.shape[0]:]

train_x = train_data.drop(['target'], axis=1)
test_x = test_data.drop(['target'], axis=1)
lgb_dataset = Dataset(train_x, train_data['target'], test_x, use_cache=False)

##############################
train_data = pd.read_csv('8288train.csv', engine='python')
train_data = train_data.fillna(0)
test_data = pd.read_csv('8288test.csv', engine='python')
test_data = test_data.fillna(0)
dummy_fea = ['sex', 'merriage', 'income', 'qq_bound', 'degree', 'wechat_bound', 'account_grade', 'industry']
train_test_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
train_test_data = train_test_data.fillna(0)
dummy_df = pd.get_dummies(train_test_data.loc[:, dummy_fea])
# load boston dataset from sklearn
from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

# create dataset
Data = Dataset(X_train, y_train, X_test)

# initialize RandomForest & LinearRegression
RfModel = Regressor(dataset=Data, estimator=RandomForestRegressor, parameters={'n_estimators': 50}, name='rf')
LRModel = Regressor(dataset=Data, estimator=LinearRegression, parameters={'normalize': True}, name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
Pipeline = ModelsPipeline(RfModel, LRModel)
StackModel = Pipeline.stack(k=10, seed=2)

# Train LinearRegression on stacked data (second stage)
Stacker = Regressor(dataset=StackModel, estimator=LinearRegression)
Results = Stacker.predict()

# Validate results using 10 fold cross-validation