from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline

# note: load_boston was removed in scikit-learn 1.2, so these examples
# require an older scikit-learn version
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

# create dataset
dataset = Dataset(X_train, y_train, X_test)

# initialize RandomForest & LinearRegression
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=2)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
# Validate results using 10-fold cross-validation
results = stacker.validate(k=10, scorer=mean_absolute_error)
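
# Besides stacking, heamy's pipeline supports simple ensembles. A sketch based
# on the library's README (find_weights searches for the weights that minimize
# the given scorer over out-of-fold predictions):
results_mean = pipeline.mean().execute()            # plain average of the models
weights = pipeline.find_weights(mean_absolute_error)
results_weighted = pipeline.weight(weights)         # weighted average
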
Example #2

# (model_xgb, model_xgb2, the xgb/lgb datasets and the estimator callables
# below are defined earlier in the original script; the snippet is truncated.)
model_xgb3 = Regressor(dataset=xgb_dataset,
                       estimator=xgb_feature3,
                       name='xgb3',
                       use_cache=False)
model_lgb = Regressor(dataset=lgb_dataset,
                      estimator=lgb_feature,
                      name='lgb',
                      use_cache=False)
model_gbdt = Regressor(dataset=xgb_dataset,
                       estimator=gbdt_model,
                       name='gbdt',
                       use_cache=False)
pipeline = ModelsPipeline(model_xgb, model_xgb2, model_xgb3, model_lgb,
                          model_gbdt)
# full_test=True: test-set predictions come from models retrained on the full
# training data rather than the mean of the per-fold models
stack_ds = pipeline.stack(k=5,
                          seed=111,
                          add_diff=False,
                          full_test=True)
stacker = Regressor(dataset=stack_ds,
                    estimator=LinearRegression,
                    parameters={'fit_intercept': False})
predict_result = stacker.predict()
ans = pd.read_csv('../AI_risk_test_V3.0/test_list.csv',
                  parse_dates=['appl_sbm_tm'])
ans['PROB'] = predict_result
ans = ans.drop(['appl_sbm_tm'], axis=1)
# min-max scale the scores to [0, 1] and format to four decimals
minmin, maxmax = ans['PROB'].min(), ans['PROB'].max()
ans['PROB'] = ans['PROB'].map(lambda x: (x - minmin) / (maxmax - minmin))
ans['PROB'] = ans['PROB'].map(lambda x: '%.4f' % x)
ans.to_csv('./ans_stacking.csv', index=None)
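
# The estimator arguments above (xgb_feature3, lgb_feature, gbdt_model) are
# plain functions rather than scikit-learn classes; heamy also accepts a
# callable that trains a model and returns test predictions. A minimal sketch
# (the hyperparameters are illustrative assumptions, not the original ones):
import xgboost as xgb

def xgb_feature_example(X_train, y_train, X_test, y_test=None):
    model = xgb.XGBRegressor(n_estimators=300, max_depth=4, random_state=0)
    model.fit(X_train, y_train)
    return model.predict(X_test)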
Example #3
# (the opening of this definition is truncated in the original; it presumably read:)
model_mlp = Classifier(dataset=dataset,
                       estimator=MLPClassifier,
                       name="mlp",
                       use_cache=class_use_cache)
model_sgd = Classifier(dataset=dataset,
                       estimator=SGDClassifier,
                       parameters={'penalty': 'l1'},
                       name="sgd",
                       use_cache=class_use_cache)

# Stack the models
# Returns a new dataset with out-of-fold predictions
logging.info('stack_ds....')
# alternative first-level pipelines tried in the original:
# pipeline = ModelsPipeline(model_nb, model_lr, model_svc)
pipeline = ModelsPipeline(model_sgd)
stack_ds = pipeline.stack(k=8, seed=111)
# Second stage: train a linear model on the stacked data
logging.info('second layer....')
stacker = Classifier(dataset=stack_ds,
                     estimator=svm.LinearSVC,
                     use_cache=False,
                     probability=False)
results = stacker.predict()

# Validate results using cross-validation (k=3 here)
results10 = stacker.validate(k=3, scorer=accuracy_score)
logging.info(results10)

result_list = list(results + 1)  # shift predictions (presumably back to 1-based labels)

test_id = test["id"].tolist()  # list(test[["id"]]) would only yield the column name
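
# Hypothetical continuation (the snippet stops here): assemble a submission
# frame from the ids and shifted predictions. The filename is an assumption.
import pandas as pd
submission = pd.DataFrame({'id': test_id, 'label': result_list})
submission.to_csv('submission.csv', index=False)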
Example #4
# (the opening of this definition is truncated in the original; it presumably read:)
et = Classifier(dataset=dataset,
                estimator=ExtraTreesClassifier,  # assumed from the 'et' name
                use_cache=CACHE,
                parameters=et_param,
                name='et')
lgb = Classifier(dataset=dataset,
                 estimator=LGBMClassifier,
                 use_cache=CACHE,
                 parameters=lgb_param,
                 name='lgb')
lr = Classifier(dataset=dataset,
                estimator=LogisticRegression,
                use_cache=CACHE,
                parameters=lr_param,
                name='lr')
#------------------------------------------------------------------------------
#Stack the models and returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(knn, rf, et, lgb, lr)
stack_ds = pipeline.stack(k=NFOLDS, seed=1)

# Train LogisticRegression on stacked data (second stage)
lr1 = LogisticRegression  # pass the class itself; heamy instantiates it with the parameters below
lr1_params = {
    'C': 5,
    'random_state': 1,
    'solver': 'liblinear',
    'multi_class': 'ovr',
}
stacker = Classifier(dataset=stack_ds,
                     estimator=lr1,
                     use_cache=False,
                     parameters=lr1_params)

# Validate results using k-fold cross-validation
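# (truncated in the original before the call itself; the scorer is an assumption)
from sklearn.metrics import log_loss
stacker.validate(k=NFOLDS, scorer=log_loss)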
Example #5
    # (the leading keys of this parameter dict are truncated in the original)
    xgb_params = {
        'n_estimators': 500,
        # 'num_class': 5,
        'objective': 'multi:softprob',
        'subsample': 0.8}

    model_xgb = Classifier(dataset=dataset_full, estimator=xgb.sklearn.XGBClassifier, parameters=xgb_params, name='xgb')
    model_xgb_f = Classifier(dataset=dataset_f, estimator=xgb.sklearn.XGBClassifier, parameters=xgb_params, name='xgb_f')
    model_rf = Classifier(dataset=dataset_f, estimator=RandomForestClassifier, parameters={'n_estimators': 700},
                          name='rf')

    pipeline = ModelsPipeline(
        # model_xgb,
        model_rf,
        model_xgb_f)

    stack_ds = pipeline.stack(k=5, full_test=True, seed=111)
    stacker = Classifier(stack_ds, LogisticRegression)
    stacker.validate(k=5, scorer=log_loss)

    # Leftover debugging lines from the original script:
    # logging.info(val_results)
    # logging.info(train_x.head(10))
    # print(test_x.columns.difference(train_x.columns))
    # boosters = np.array([])
    # predictions = []
    # print(xgb.cv(xgb_params, xgb.DMatrix(train_x, train_yt), nfold=5, num_boost_round=100, early_stopping_rounds=10, metrics=["mlogloss"], verbose_eval=False))
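
    # To produce final test-set predictions after validation (a sketch; heamy's
    # Classifier returns class probabilities when probability=True, its default):
    proba = stacker.predict()
    labels = proba.argmax(axis=1)  # hard labels, assuming a multiclass target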
Example #6
from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)

# create dataset
dataset = Dataset(X_train, y_train, X_test)

# initialize RandomForest & LinearRegression
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=111)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
# Validate results using 10-fold cross-validation
results = stacker.validate(k=10, scorer=mean_absolute_error)



# Blending variant (the original snippet is truncated after reloading the data)
# load boston dataset from sklearn
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)
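
# The blending variant is truncated above. Based on heamy's README, blend()
# mirrors stack() but holds out a fraction of the training data instead of
# using out-of-fold predictions (a sketch, reusing the models from above):
dataset = Dataset(X_train, y_train, X_test)
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr')
pipeline = ModelsPipeline(model_rf, model_lr)
blend_ds = pipeline.blend(proportion=0.2, seed=111)
blender = Regressor(dataset=blend_ds, estimator=LinearRegression)
blender.validate(k=10, scorer=mean_absolute_error)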
Example #7
fs = ['xgb1', 'xgb2', 'xgb3', 'et', 'svm', 'lr', 'lgb', 'gbdt']

import numpy as np
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, title, cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(fs))
    plt.xticks(tick_marks, fs, rotation=45)
    plt.yticks(tick_marks, fs)
    plt.tight_layout()

# `cm` is computed elsewhere in the original script (a model-similarity matrix)
plot_confusion_matrix(cm, title='mic')
plt.show()
# xgb_feature2, logistic_model, lgb_model, gbdt_model and model_svm are
# defined elsewhere in the original script
model_xgb2 = Regressor(dataset=dataset, estimator=xgb_feature2, name='xgb2', use_cache=False)
model_lr = Regressor(dataset=dataset, estimator=logistic_model, name='lr', use_cache=False)
model_lgb = Regressor(dataset=dataset, estimator=lgb_model, name='lgb', use_cache=False)
model_gbdt = Regressor(dataset=dataset, estimator=gbdt_model, name='gbdt', use_cache=False)
pipeline = ModelsPipeline(model_xgb2, model_lr, model_lgb, model_svm)
stack_data = pipeline.stack(k=5, seed=0, add_diff=False, full_test=True)
stacker = Regressor(dataset=stack_data, estimator=LinearRegression,
                    parameters={'fit_intercept': False})
predict_result = stacker.predict()
val = pd.read_csv('val_list.csv')
val['PROB'] = predict_result
# min-max scale the scores to [0, 1] and format to four decimals
minmin, maxmax = val['PROB'].min(), val['PROB'].max()
val['PROB'] = val['PROB'].map(lambda x: (x - minmin) / (maxmax - minmin))
val['PROB'] = val['PROB'].map(lambda x: '%.4f' % x)
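
# Hypothetical continuation: the snippet never writes `val` out; a plausible
# final step (the output path is an assumption):
val.to_csv('./val_stacking.csv', index=None)

# The `cm` matrix plotted earlier can be obtained by correlating the models'
# out-of-fold predictions, e.g. (a sketch; assumes one stacked column per
# model, in the same order as `fs`):
cm = np.corrcoef(np.asarray(stack_data.X_train).T)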

Example #8
# (the opening of this list is truncated in the original; it presumably began:)
models = [
    Regressor(dataset=dataset,
              estimator=LinearRegression,  # assumed from the 'lr' name and parameters
              parameters={'normalize': True},
              name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
    Regressor(dataset=dataset,
              estimator=CatBoostRegressor,
              parameters={
                  'custom_metric': ['MAE'],
                  'random_seed': seed,
                  'logging_level': 'Silent'
              },
              name='cr')
]

# define the pipeline and build the 2nd-level dataset
pipeline = ModelsPipeline(*models)
stack_ds = pipeline.stack(k=10, seed=seed)

# build the second-stage model and validate it
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
y_trues, y_preds = stacker.validate(k=10)
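# Aggregate an overall error from the per-fold arrays returned by validate()
# when no scorer is passed (a sketch; assumes lists of per-fold arrays):
import numpy as np
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(np.concatenate(y_trues), np.concatenate(y_preds)))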

# predict on X_test (the Dataset's test split)
y_pred = stacker.predict()
print(y_pred)

# (the original snippet is truncated below; x_test and t_test come from the
# surrounding script. Note that heamy's predict() scores the Dataset's test
# split, so calling it per-row as here relies on code not shown.)
sum = 0
buy = 0
for i, yosoku in enumerate(x_test):
    if stacker.predict(x_test[i]) < 0.3:
        if t_test[i] == 0: