Ejemplo n.º 1
0
def test_prediction():
    """predict() must yield a 1-D vector with one entry per test row,
    both on the first (computed) and second (cached) invocation."""
    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)
    # First pass computes the predictions; second pass must come from the cache
    # and satisfy the exact same contract.
    for _ in range(2):
        preds = model.predict()
        assert len(preds.shape) == 1
        assert model.dataset.X_test.shape[0] == preds.shape[0]
Ejemplo n.º 2
0
def test_stacking():
    """The stacked dataset keeps the row counts of the underlying dataset,
    with and without full_test, and when served from the cache."""

    def _fresh_model():
        # Each scenario starts from an untouched model/dataset pair.
        return Regressor(estimator=LinearRegression,
                         parameters={},
                         dataset=RealDataset)

    def _check_rows(stacked, base):
        # Out-of-fold features must stay row-aligned with the source data.
        assert stacked.X_train.shape[0] == base.dataset.X_train.shape[0]
        assert stacked.X_test.shape[0] == base.dataset.X_test.shape[0]
        assert stacked.y_train.shape[0] == base.dataset.y_train.shape[0]

    model = _fresh_model()
    _check_rows(model.stack(10), model)

    model = _fresh_model()
    ds = model.stack(10, full_test=False)
    assert np.isnan(ds.X_train).sum() == 0
    _check_rows(ds, model)

    model = _fresh_model()
    model.dataset.load()
    ds = model.stack(10, full_test=False)
    # Check cache
    assert np.isnan(ds.X_train).sum() == 0
    _check_rows(ds, model)
Ejemplo n.º 3
0
def test_blending():
    """blend() splits off a holdout set; repeated calls hit the cache and
    must return a dataset with identical shapes."""
    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)
    _, _, X_holdout, _ = model.dataset.split(test_size=0.2)
    # First call computes the blend, second call must be the cached result.
    for _ in range(2):
        ds = model.blend(proportion=0.2)
        assert ds.X_test.shape[0] == model.dataset.X_test.shape[0]
        assert ds.X_train.shape[0] == X_holdout.shape[0]
Ejemplo n.º 4
0
def test_prediction():
    """Predictions are 1-D and row-aligned with X_test; the cached second
    call honours the same contract."""
    model = Regressor(estimator=LinearRegression,
                      parameters={},
                      dataset=RealDataset)

    first = model.predict()
    assert len(first.shape) == 1
    assert model.dataset.X_test.shape[0] == first.shape[0]

    # Retrieve cached object
    cached = model.predict()
    assert len(cached.shape) == 1
    assert model.dataset.X_test.shape[0] == cached.shape[0]
Ejemplo n.º 5
0
def test_blending():
    """Blended dataset shapes match the holdout split, computed and cached."""
    model = Regressor(estimator=LinearRegression,
                      parameters={},
                      dataset=RealDataset)
    _, _, holdout_X, _holdout_y = model.dataset.split(test_size=0.2)

    def _verify(ds):
        # X_test is untouched by blending; X_train shrinks to the holdout size.
        assert ds.X_test.shape[0] == model.dataset.X_test.shape[0]
        assert ds.X_train.shape[0] == holdout_X.shape[0]

    _verify(model.blend(proportion=0.2))
    # Check cache
    _verify(model.blend(proportion=0.2))
Ejemplo n.º 6
0
def test_validation():
    """validate() returns aligned (y_true, y_pred) pairs; results are cached
    per k, and a single fold (k=1) yields exactly one entry."""
    model = Regressor(estimator=LinearRegression,
                      parameters={},
                      dataset=RealDataset)

    # k=10: warm the cache, then read back from it.
    model.validate(k=10)
    truth, predicted = model.validate(k=10)
    assert len(truth) == len(predicted)

    # k=1: same cache round-trip; one fold -> one result.
    model.validate(k=1)
    truth, predicted = model.validate(k=1)
    assert len(truth) == len(predicted)
    assert len(truth) == 1
Ejemplo n.º 7
0
def test_custom_estimators():
    """Estimators with invalid signatures must be rejected with ValueError."""

    def no_arg_estimator():
        return

    class NoDataClassifier(Classifier):
        def estimator(self):
            return

    # A Classifier subclass whose estimator() accepts no data is invalid.
    with pytest.raises(ValueError):
        NoDataClassifier(dataset=TestDataset)

    # A plain function taking no arguments is likewise rejected.
    with pytest.raises(ValueError):
        Regressor(estimator=no_arg_estimator, dataset=TestDataset)
Ejemplo n.º 8
0
def test_validation():
    """Cross-validation returns equal-length truth/prediction pairs for both
    a fresh run and the cached retrieval; k=1 produces a single fold."""
    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)

    for folds in (10, 1):
        model.validate(k=folds)
        # Retrieve cached object
        y_true, y_pred = model.validate(k=folds)
        assert len(y_true) == len(y_pred)
    # The last iteration ran with k=1, so exactly one fold was produced.
    assert len(y_true) == 1
Ejemplo n.º 9
0
def test_stacking():
    """Row counts survive stacking in all modes (default, full_test=False,
    and the cached path after an explicit dataset load)."""

    def _assert_rows(stacked, base):
        # Stacked features must be row-aligned with the source dataset.
        assert stacked.X_train.shape[0] == base.X_train.shape[0]
        assert stacked.X_test.shape[0] == base.X_test.shape[0]
        assert stacked.y_train.shape[0] == base.y_train.shape[0]

    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)
    _assert_rows(model.stack(10), model.dataset)

    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)
    ds = model.stack(10, full_test=False)
    assert np.isnan(ds.X_train).sum() == 0
    _assert_rows(ds, model.dataset)

    model = Regressor(estimator=LinearRegression, parameters={}, dataset=RealDataset)
    model.dataset.load()
    ds = model.stack(10, full_test=False)
    # Check cache
    assert np.isnan(ds.X_train).sum() == 0
    _assert_rows(ds, model.dataset)
Ejemplo n.º 10
0
    rf = RandomForestRegressor
    rf2 = RandomForestRegressor
    rfs = RandomForestRegressor
    ###
    #params_adb = {'n_estimators':300,'loss':'square','learning_rate':0.8}
    ##adb = AdaBoostRegressor


    params_ext = {'max_features':'log2','n_estimators':500,'max_depth':12,'oob_score': True, 'n_jobs':4,'bootstrap':True}
    ext = ExtraTreesRegressor
   
    params_gbrt = {'loss':'huber','n_estimators': 400,'max_depth':12,'learning_rate': 0.01, 'random_state': 3}
    gbrt = GradientBoostingRegressor
    ###stacking
    model_rf1 = Regressor(dataset=dataset, estimator=rf, parameters=params_rf,name='rf1')
    #model_ext = Regressor(dataset=dataset, estimator=ext, parameters=params_ext,name='ext')
    #model_rf2 = Regressor(dataset=dataset, estimator=rf2, parameters=params_rf2,name='rf2')
    model_rcv = Regressor(dataset=dataset, estimator=rcv, parameters=params_rcv,name='rcv')
    #model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt')
    #model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv')
    model_br = Regressor(dataset=dataset, estimator=br, parameters=params_br,name='br')
    model_knn = Regressor(dataset=dataset, estimator=knn, parameters=params_knn,name='knn')
    
    #blending = pipeline.blend(proportion=0.3,seed=111)
    params_las = {'alpha':1.7}
    params_rcv2 = {'cv':5,'normalize':True,'gcv_mode':'auto','scoring':'neg_mean_squared_error'}
    params_lascv = {'max_iter':500,'cv':8}

    pipeline = ModelsPipeline(model_rf1,model_knn)
    stack_ds = pipeline.stack(k=5,seed=111)
 train_test_data = train_test_data.fillna(0)
 dummy_df = pd.get_dummies(train_test_data.loc[:, dummy_fea])
 train_test_data = pd.concat([train_test_data, dummy_df], axis=1)
 train_test_data = train_test_data.drop(dummy_fea, axis=1)
 train_train = train_test_data.iloc[:train_data.shape[0], :]
 test_test = train_test_data.iloc[train_data.shape[0]:, :]
 train_train_x = train_train.drop(['target'], axis=1)
 test_test_x = test_test.drop(['target'], axis=1)
 xgb_dataset = Dataset(X_train=train_train_x,
                       y_train=train_train['target'],
                       X_test=test_test_x,
                       y_test=None,
                       use_cache=False)
 #heamy
 model_xgb = Regressor(dataset=xgb_dataset,
                       estimator=xgb_feature,
                       name='xgb',
                       use_cache=False)
 model_xgb2 = Regressor(dataset=xgb_dataset,
                        estimator=xgb_feature2,
                        name='xgb2',
                        use_cache=False)
 model_xgb3 = Regressor(dataset=xgb_dataset,
                        estimator=xgb_feature3,
                        name='xgb3',
                        use_cache=False)
 model_lgb = Regressor(dataset=lgb_dataset,
                       estimator=lgb_feature,
                       name='lgb',
                       use_cache=False)
 model_gbdt = Regressor(dataset=xgb_dataset,
                        estimator=gbdt_model,
Ejemplo n.º 12
0
    ext = ExtraTreesRegressor
    ###

    params_gbrt = {
        'loss': 'huber',
        'n_estimators': 300,
        'max_depth': 12,
        'learning_rate': 0.01,
        'random_state': 3
    }
    gbrt = GradientBoostingRegressor
    ###
    params_las = {'alpha': 1}
    ###stacking
    model_rf = Regressor(dataset=dataset,
                         estimator=rf,
                         parameters=params_rf,
                         name='rf')
    #model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt')
    model_ext = Regressor(dataset=dataset,
                          estimator=ext,
                          parameters=params_ext,
                          name='ext')
    model_rcv = Regressor(dataset=dataset,
                          estimator=rcv,
                          parameters=params_rcv,
                          name='rcv')
    #model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv')
    pipeline = ModelsPipeline(model_rf, model_rcv, model_ext)
    stack_ds = pipeline.stack(k=5, seed=111)

    stacker = Regressor(dataset=stack_ds,
Ejemplo n.º 13
0
# Hold out 10% of the rows; fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)

# create dataset
dataset = Dataset(X_train,y_train,X_test)

# initialize RandomForest & LinearRegression
# NOTE(review): the 'normalize' parameter was removed from LinearRegression in
# scikit-learn 1.2 — confirm the pinned sklearn version still accepts it.
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50},name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True},name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf,model_lr)
stack_ds = pipeline.stack(k=10,seed=111)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
# Validate results using 10 fold cross-validation
# (this rebinds `results` from the raw predictions to the validation output)
results = stacker.validate(k=10,scorer=mean_absolute_error)



#blend
# load boston dataset from sklearn
# NOTE(review): load_boston was removed from scikit-learn in 1.2 — verify the
# environment's sklearn version.
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)

# create dataset
dataset = Dataset(X_train,y_train,X_test)
Ejemplo n.º 14
0
}

from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline
from catboost import CatBoostRegressor
#import adaboost
# datasetを準備
dataset = Dataset(x_test, t_test, x_train2)  # X_testは今回使わないが入れないとエラーになる

# アンサンブルに使うモデルを定義
models = [
    Regressor(dataset=dataset,
              estimator=RandomForestRegressor,
              parameters={
                  'n_estimators': 50,
                  'random_state': seed
              },
              name='rf'),
    Regressor(dataset=dataset,
              estimator=LinearRegression,
              parameters={'normalize': True},
              name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
    Regressor(dataset=dataset,
              estimator=CatBoostRegressor,
              parameters={
                  'custom_metric': ['MAE'],
                  'random_seed': seed,
                  'logging_level': 'Silent'
              },
Ejemplo n.º 15
0
    rf = RandomForestRegressor
    rf2 = RandomForestRegressor
    rfs = RandomForestRegressor
    ###
    params_adb = {'n_estimators':500,'loss':'square','learning_rate':0.02}
    adb = AdaBoostRegressor


    params_ext = {'max_features':'log2','n_estimators':500,'max_depth':12,'oob_score': True, 'n_jobs':4,'bootstrap':True}
    ext = ExtraTreesRegressor
   
    params_gbrt = {'loss':'huber','n_estimators': 400,'max_depth':12,'learning_rate': 0.01, 'random_state': 3}
    gbrt = GradientBoostingRegressor
    ###stacking
    model_rf1 = Regressor(dataset=dataset, estimator=rf, parameters=params_rf,name='rf1')
    model_ext = Regressor(dataset=dataset, estimator=ext, parameters=params_ext,name='ext')
    #model_rf2 = Regressor(dataset=dataset, estimator=rf2, parameters=params_rf2,name='rf2')
    model_rcv = Regressor(dataset=dataset, estimator=rcv, parameters=params_rcv,name='rcv')
    model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt')
    #model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv')
    model_br = Regressor(dataset=dataset, estimator=br, parameters=params_br,name='br')
    model_knn = Regressor(dataset=dataset, estimator=knn, parameters=params_knn,name='knn')
    model_adb = Regressor(dataset=dataset, estimator=adb, parameters=params_adb,name='adb')
    pipeline = ModelsPipeline(model_rf1,model_knn,model_rcv)
    #stack_ds = pipeline.stack(k=5,seed=111)
    #blending = pipeline.blend(proportion=0.3,seed=111)
    params_las = {'alpha':1.7}
    params_rcv2 = {'cv':5,'normalize':True,'gcv_mode':'auto','scoring':'neg_mean_absolute_error'}
    #stacker = Regressor(dataset=stack_ds,estimator=rcv, parameters=params_rcv2)
    #y_pre = stacker.predict()
Ejemplo n.º 16
0
#stack
# load boston dataset from sklearn
# NOTE(review): load_boston was removed from scikit-learn in 1.2 — verify the
# environment's sklearn version.
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
# Hold out 10% of the rows; fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=111)

# create dataset
dataset = Dataset(X_train, y_train, X_test)

# initialize RandomForest & LinearRegression
# NOTE(review): the 'normalize' parameter was removed from LinearRegression in
# scikit-learn 1.2 — confirm the pinned sklearn version still accepts it.
model_rf = Regressor(dataset=dataset,
                     estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50},
                     name='rf')
model_lr = Regressor(dataset=dataset,
                     estimator=LinearRegression,
                     parameters={'normalize': True},
                     name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=111)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
# Validate results using 10 fold cross-validation
# NOTE(review): the validate() call this comment announces is not visible —
# the snippet appears truncated here.
Ejemplo n.º 17
0
from heamy.cache import np_hash
from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.utils.main import generate_columns, group_models, report_score
from heamy.utils.optimizer import Optimizer


def boston_dataset():
    """Load the Boston housing data and return a 90/10 train/test split.

    Returns (X_train, y_train, X_test, y_test); the fixed random_state keeps
    the split — and any heamy caches keyed on it — reproducible.
    """
    bunch = load_boston()
    features, target = bunch['data'], bunch['target']
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, target, test_size=0.1, random_state=111)
    return X_tr, y_tr, X_te, y_te


# Shared fixtures: a cached Boston dataset and two first-level models used by
# the tests below.
# NOTE(review): 'normalize' was removed from LinearRegression in sklearn 1.2.
dataset = Dataset(preprocessor=boston_dataset, use_cache=True)
model = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True})
model_2 = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50})


def test_generate_columns():
    """generate_columns names one column per feature, prefixed with the tag;
    a 1-D input collapses to a single column named exactly like the prefix."""
    matrix = np.random.rand(100, 10)
    cols = generate_columns(matrix, 'test')
    assert len(cols) == 10
    # Every generated name must begin with the supplied prefix.
    assert all(col.startswith('test') for col in cols)

    cols = generate_columns(np.random.rand(100), 'test')
    assert cols[0] == 'test'


def test_optimizer():
    # NOTE(review): this snippet looks truncated — the Optimizer is built from
    # the module-level models but nothing is asserted in the visible lines.
    opt = Optimizer([model, model_2], scorer=mean_absolute_error)
Ejemplo n.º 18
0
# Classifier stub whose estimator ignores the training data and returns a
# fixed all-zeros array of shape (2, n_test_rows).
# NOTE(review): test_hashing below pins an exact hash for a model built from
# this class — presumably derived from its code — so these comments are kept
# outside the class body and the body itself is left byte-identical.
class TestEstimator(Classifier):
    @staticmethod
    def estimator(X_train, y_train, X_test, y_test=None):
        return np.zeros((2, X_test.shape[0]))
        # return np.zeros(X_test.shape[0])


# Function-style estimator stub: ignores the training data and predicts a
# zero for every test row.
# NOTE(review): test_hashing pins an exact hash for the model built from this
# function — presumably code-derived — so the body is left byte-identical.
def func_estimator(X_train, y_train, X_test, y_test):
    return np.zeros(X_test.shape[0])


# Supplies a pseudo-random integer in [1, 100] used as a model parameter.
# NOTE(review): test_hashing pins an exact hash for a model parameterized with
# this function, so its body is left byte-identical.
def random_param():
    return random.randint(1, 100)


# Module-level model fixtures exercised by the hashing tests below: one built
# from a plain function, one from a Classifier subclass, and two with explicit
# parameter dicts (including a callable-valued parameter).
model_func = Regressor(estimator=func_estimator, dataset=TestDataset)
model_cls = TestEstimator(dataset=TestDataset())
model_param = Regressor(estimator=LinearRegression,
                        parameters={'random_param': random_param},
                        dataset=TestDataset)
model_param2 = Classifier(estimator=LogisticRegression,
                          parameters={'colsample_bylevel': 0.9},
                          dataset=TestDataset)


def test_hashing():
    """Each model's string form embeds a stable, content-derived hash."""
    expected = [
        (model_func, 'func_estimator(54743c7a5484d1bf2a64ac1d7b68f8cc)'),
        (model_cls, 'TestEstimator(da29cb8766f96e6561a51e8e3c13f661)'),
        (model_param, 'LinearRegression(2e789a766f6dc2457fb6a63452ad2859)'),
    ]
    for fixture, representation in expected:
        assert str(fixture) == representation
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
# NOTE(review): `Dataset` is used below but never imported in this snippet —
# it needs `from heamy.dataset import Dataset` to run standalone. load_boston
# was also removed from scikit-learn in 1.2.
data = load_boston()
X, y = data['data'], data['target']
# Hold out 10% of the rows; fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

# create dataset
Data = Dataset(X_train,y_train,X_test)

# initialize RandomForest & LinearRegression
# NOTE(review): 'normalize' was removed from LinearRegression in sklearn 1.2.
RfModel = Regressor(dataset=Data, estimator=RandomForestRegressor, parameters={'n_estimators': 50},name='rf')
LRModel = Regressor(dataset=Data, estimator=LinearRegression, parameters={'normalize': True},name='lr')


# Stack two models
# Returns new dataset with out-of-fold predictions
Pipeline = ModelsPipeline(RfModel,LRModel)
StackModel = Pipeline.stack(k=10,seed=2)

# Train LinearRegression on stacked data (second stage)
Stacker = Regressor(dataset=StackModel, estimator=LinearRegression)
Results = Stacker.predict()
# Validate results using 10 fold cross-validation
# (this rebinds `Results` from the raw predictions to the validation output)
Results = Stacker.validate(k=10,scorer=mean_absolute_error)
Ejemplo n.º 20
0
data_X = pd.DataFrame(data_X)
data_y = pd.DataFrame(data_y)
# Train/test split
# train_X, test_X, train_y, test_y = train_test_split(data_X, data_y1, test_size=0.8, random_state=7)
# print(np.shape(train_X))
# print("################模型融合##############")
x_train, x_test, y_train, y_test = train_test_split(data_X,
                                                    data_y,
                                                    test_size=0.33,
                                                    random_state=2018)
# Reshape to a column vector, then flatten with ravel() when feeding heamy.
y_train = np.asarray(y_train).reshape(-1, 1)
# Build the heamy dataset
dataset = Dataset(x_train, y_train.ravel(), x_test)
# First-level models: linear regression, random forest (and GBDT below)
# NOTE(review): 'normalize' was removed from LinearRegression in sklearn 1.2.
model_lr = Regressor(dataset=dataset,
                     estimator=LinearRegression,
                     parameters={'normalize': True},
                     name='lr')
model_rf = Regressor(dataset=dataset,
                     estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50},
                     name='rf')
model_gbdt = Regressor(dataset=dataset,
                       estimator=GradientBoostingRegressor,
                       parameters={
                           'n_estimators': 50,
                           'learning_rate': 0.05,
                           'max_depth': 4,
                           'max_features': 'sqrt',
                           'min_samples_leaf': 15,
                           'min_samples_split': 10
                       },