Esempio n. 1
0
# create dataset
# Wrap train/test arrays in a heamy Dataset (X_train / y_train / X_test are
# defined outside this fragment).
dataset = Dataset(X_train, y_train, X_test)

# initialize RandomForest & LinearRegression
model_rf = Regressor(dataset=dataset,
                     estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50},
                     name='rf')
# NOTE(review): LinearRegression's 'normalize' parameter was removed in
# scikit-learn 1.2 — confirm the pinned sklearn version still supports it.
model_lr = Regressor(dataset=dataset,
                     estimator=LinearRegression,
                     parameters={'normalize': True},
                     name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=111)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
# Validate results using 10 fold cross-validation
# NOTE(review): this rebinds `results`, discarding the predictions above.
results = stacker.validate(k=10, scorer=mean_absolute_error)

#blend
# load boston dataset from sklearn
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
  params_ext = {'max_features':'log2','n_estimators':500,'max_depth':12,'oob_score': True, 'n_jobs':4,'bootstrap':True}
  ext = ExtraTreesRegressor
 
  params_gbrt = {'loss':'huber','n_estimators': 400,'max_depth':12,'learning_rate': 0.01, 'random_state': 3}
  gbrt = GradientBoostingRegressor
  ###stacking
  model_rf1 = Regressor(dataset=dataset, estimator=rf, parameters=params_rf,name='rf1')
  model_ext = Regressor(dataset=dataset, estimator=ext, parameters=params_ext,name='ext')
  #model_rf2 = Regressor(dataset=dataset, estimator=rf2, parameters=params_rf2,name='rf2')
  model_rcv = Regressor(dataset=dataset, estimator=rcv, parameters=params_rcv,name='rcv')
  model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt')
  #model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv')
  model_br = Regressor(dataset=dataset, estimator=br, parameters=params_br,name='br')
  model_knn = Regressor(dataset=dataset, estimator=knn, parameters=params_knn,name='knn')
  model_adb = Regressor(dataset=dataset, estimator=adb, parameters=params_adb,name='adb')
  pipeline = ModelsPipeline(model_rf1,model_knn,model_rcv)
  #stack_ds = pipeline.stack(k=5,seed=111)
  #blending = pipeline.blend(proportion=0.3,seed=111)
  params_las = {'alpha':1.7}
  params_rcv2 = {'cv':5,'normalize':True,'gcv_mode':'auto','scoring':'neg_mean_absolute_error'}
  #stacker = Regressor(dataset=stack_ds,estimator=rcv, parameters=params_rcv2)
  #y_pre = stacker.predict()
  #print(y_pre)
  #y_pre = pipeline.blend()
  #print(y_pre)
  ###
  #loss_stack = Evaluation([y_pre],[y_test])
  #stacking_pre.append(y_pre)
  weights = pipeline.find_weights(mean_squared_error)
  #print(weights)
  result = pipeline.weight(weights).execute()
    ###stacking
    model_rf = Regressor(dataset=dataset,
                         estimator=rf,
                         parameters=params_rf,
                         name='rf')
    #model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt')
    model_ext = Regressor(dataset=dataset,
                          estimator=ext,
                          parameters=params_ext,
                          name='ext')
    model_rcv = Regressor(dataset=dataset,
                          estimator=rcv,
                          parameters=params_rcv,
                          name='rcv')
    #model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv')
    pipeline = ModelsPipeline(model_rf, model_rcv, model_ext)
    stack_ds = pipeline.stack(k=5, seed=111)

    stacker = Regressor(dataset=stack_ds,
                        estimator=Lasso,
                        parameters=params_las)
    y_pre = stacker.predict()
    y_pre_last = np.append(y_pre, y_pre)
    y_pre_last[10] * 1.08
    ###
    #loss_gbrt = Evaluation([y_pre_gbrt],[y_test])
    output(fw, i + 1, y_pre_last)
    '''
    if loss_gbrt>0.015:
        output(fw_gbrt,i+1,y_pre_rf)
        fw_gbrt.write(str(i+1)+',gbrt,'+str(loss_gbrt)+'\n')
Esempio n. 4
0
    # Hyper-parameters shared by both XGBoost first-stage classifiers.
    xgb_params = {
        'learning_rate': 0.05,
        'max_depth': 6,
        'n_estimators': 500,
        # 'num_class': 5,
        'objective': 'multi:softprob',
        'subsample': 0.8}

    # First-stage models: two XGBoost classifiers on different feature sets
    # plus a RandomForest.  NOTE(review): dataset_full / dataset_f are defined
    # outside this fragment — their schemas cannot be confirmed from here.
    model_xgb = Classifier(dataset=dataset_full, estimator=xgb.sklearn.XGBClassifier, parameters=xgb_params, name='xgb')
    model_xgb_f = Classifier(dataset=dataset_f, estimator=xgb.sklearn.XGBClassifier, parameters=xgb_params, name='xgb_f')
    model_rf = Classifier(dataset=dataset_f, estimator=RandomForestClassifier, parameters={'n_estimators': 700},
                          name='rf')

    # Combine the first-stage models; model_xgb is intentionally commented out.
    pipeline = ModelsPipeline(
        # model_xgb,
        model_rf,
        model_xgb_f)

    # Build out-of-fold predictions (5 folds), then fit a LogisticRegression
    # stacker on top and score it with log-loss via 5-fold cross-validation.
    stack_ds = pipeline.stack(k=5, full_test=True, seed=111)
    stacker = Classifier(stack_ds, LogisticRegression)
    stacker.validate(k=5, scorer=log_loss)

    # logging.info(val_results)
    #
    # # logging.info(train_x.head(10))
    #
    # print(test_x.columns.difference(train_x.columns))

    #
    # boosters = np.array([])
    # predictions = []
Esempio n. 5
0
    ###stacking
    # First-stage regressors.  The estimators (rf, rcv, br, knn) and their
    # params_* dicts are defined outside this fragment.
    model_rf1 = Regressor(dataset=dataset, estimator=rf, parameters=params_rf,name='rf1')
    #model_ext = Regressor(dataset=dataset, estimator=ext, parameters=params_ext,name='ext')
    #model_rf2 = Regressor(dataset=dataset, estimator=rf2, parameters=params_rf2,name='rf2')
    model_rcv = Regressor(dataset=dataset, estimator=rcv, parameters=params_rcv,name='rcv')
    #model_gbrt = Regressor(dataset=dataset, estimator=gbrt, parameters=params_gbrt,name='gbrt')
    #model_lascv = Regressor(dataset=dataset, estimator=lascv, parameters=params_lascv,name='lascv')
    model_br = Regressor(dataset=dataset, estimator=br, parameters=params_br,name='br')
    model_knn = Regressor(dataset=dataset, estimator=knn, parameters=params_knn,name='knn')
    
    #blending = pipeline.blend(proportion=0.3,seed=111)
    # NOTE(review): params_las and params_rcv2 are defined but never used in
    # this fragment; model_rcv and model_br are likewise unused below.
    params_las = {'alpha':1.7}
    params_rcv2 = {'cv':5,'normalize':True,'gcv_mode':'auto','scoring':'neg_mean_squared_error'}
    params_lascv = {'max_iter':500,'cv':8}

    # Stacking: 5-fold out-of-fold predictions, second stage is LassoCV.
    pipeline = ModelsPipeline(model_rf1,model_knn)
    stack_ds = pipeline.stack(k=5,seed=111)
    stacker = Regressor(dataset=stack_ds,estimator=LassoCV, parameters=params_lascv)
    y_pre = stacker.predict()

    # Blending variant of the same two models, also finished with LassoCV.
    pipeline2 = ModelsPipeline(model_rf1,model_knn)
    stack_ds2 = pipeline2.blend(seed=111)
    blending =  Regressor(dataset=stack_ds2,estimator=LassoCV, parameters=params_lascv)
    y_pre2 = blending.predict()
    blending_pre.append(y_pre2)

   

    #print(y_pre)
    #y_pre = pipeline.blend()
    #print(y_pre)
                           'reg_alpha': 0.001,
                           'colsample_bytree': 0.5,
                           'min_child_samples': 24,
                       },
                       name='lgb')

# Second LightGBM first-stage classifier.
# BUG FIX: this model was also named 'lgb', colliding with the first
# LightGBM model's name (see model_lgb above); model names are used to
# label the stacked columns/cache entries, so give it a distinct name.
model_lgb2 = Classifier(dataset=dataset,
                        estimator=lgb.LGBMClassifier,
                        parameters={
                            'n_estimators': 70,
                            'boosting_type': 'gbdt',
                            'max_depth': 6,
                            'min_child_weight': 0.001,
                            'num_leaves': 30,
                            'seed': 128,
                            'reg_alpha': 0.001,
                            'reg_lambda': 0.002,
                            'colsample_bytree': 0.5,
                            'min_child_samples': 24
                        },
                        name='lgb2')

# Plain LogisticRegression first-stage model.
model_lg = Classifier(dataset=dataset, estimator=LogisticRegression, name='lg')

# Stack all first-stage models into 10-fold out-of-fold predictions.
pipeline = ModelsPipeline(model_lgb, model_lgb2, model_xgb, model_xgb2,
                          model_lg)
stack_ds = pipeline.stack(k=10, seed=111)
# Second stage: LogisticRegression on the stacked features, scored by AUC.
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression)
stacker.validate(k=10, scorer=roc_auc_score)
results = stacker.predict()
Esempio n. 7
0
# Split the feature matrix / target into a 90/10 train-test split
# (`data` is loaded outside this fragment).
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=111)

# create dataset
dataset = Dataset(X_train, y_train, X_test)

# initialize RandomForest & LinearRegression
model_rf = Regressor(dataset=dataset,
                     estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50},
                     name='rf')
# NOTE(review): LinearRegression's 'normalize' parameter was removed in
# scikit-learn 1.2 — confirm the pinned sklearn version still supports it.
model_lr = Regressor(dataset=dataset,
                     estimator=LinearRegression,
                     parameters={'normalize': True},
                     name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=111)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
# Validate results using 10 fold cross-validation
# NOTE(review): this rebinds `results`, discarding the predictions above.
results = stacker.validate(k=10, scorer=mean_absolute_error)

#print(results)
Esempio n. 8
0
                       use_cache=class_use_cache)
# Additional first-stage classifiers (dataset / class_use_cache are defined
# outside this fragment).
model_mlp = Classifier(dataset=dataset,
                       estimator=MLPClassifier,
                       name="mlp",
                       use_cache=class_use_cache)
model_sgt = Classifier(dataset=dataset,
                       estimator=SGDClassifier,
                       parameters={'penalty': 'l1'},
                       name="sgd",
                       use_cache=class_use_cache)

# Stack the models (mhg)
# Returns new dataset with out-of-fold prediction,model_svm,model_per
logging.info('stack_ds....')
# pipeline = ModelsPipeline(model_nb,model_lr,model_svc)
pipeline = ModelsPipeline(model_sgt)
# pipeline = ModelsPipeline(model_nb),model_nb,model_lr,model_lr2
stack_ds = pipeline.stack(k=8, seed=111)
# Second layer uses an "lr" model (stack2) — NOTE(review): the code below
# actually fits svm.LinearSVC, not LogisticRegression.
logging.info('second layer....')
stacker = Classifier(dataset=stack_ds,
                     estimator=svm.LinearSVC,
                     use_cache=False,
                     probability=False)
results = stacker.predict()

# Use 10-fold cross-validation results — NOTE(review): k=3 here, not 10.
results10 = stacker.validate(k=3, scorer=accuracy_score)
logging.info(results10)

# +1 on every prediction (presumably 0-based -> 1-based labels; confirm).
result_list = list(results + 1)
Esempio n. 9
0
              estimator=LinearRegression,
              parameters={'normalize': True},
              name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
    Regressor(dataset=dataset,
              estimator=CatBoostRegressor,
              parameters={
                  'custom_metric': ['MAE'],
                  'random_seed': seed,
                  'logging_level': 'Silent'
              },
              name='cr')
]

# Define the pipeline and build the 2nd-level (stacked) dataset.
pipeline = ModelsPipeline(*models)
stack_ds = pipeline.stack(k=10, seed=seed)

# Build the second-stage model and validate it (10-fold).
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
y_trues, y_preds = stacker.validate(k=10)

# Output accuracy
# Predict using X_test
y_pred = stacker.predict()
print(y_pred)

sum = 0
buy = 0
for i, yosoku in enumerate(x_test):
    if stacker.predict(x_test[i]) < 0.3:
Esempio n. 10
0
# Create the datasets (caching disabled); inputs are defined outside this fragment.
dataset = Dataset(X_train,y_train,test_term_doc,use_cache=False)
# Create the RF and LR models (dataset_wc is the word-count feature variant).
dataset_wc = Dataset(X_train_wc,y_train_wc,test_term_doc_wc,use_cache=False)

class_use_cache=False
# First-stage classifiers over the two feature sets.
model_nb = Classifier(dataset=dataset_wc, estimator=MultinomialNB,name='nb',use_cache=class_use_cache)
model_lr = Classifier(dataset=dataset, estimator=LogisticRegression, parameters={'C':4, 'dual':True,'n_jobs':-1},name='lr',use_cache=class_use_cache)
model_lr2 = Classifier(dataset=dataset, estimator=LogisticRegression, parameters={'C':4, 'multi_class':'multinomial','solver':'sag','dual':False,'n_jobs':-1},name='lr2',use_cache=class_use_cache)
model_svm = Classifier(dataset=dataset, estimator=svm.SVC, parameters={ 'probability':True},name='svm',use_cache=class_use_cache)
model_svc= Classifier(dataset=dataset, estimator=svm.LinearSVC,name='LinearSVC',use_cache=class_use_cache)
model_knn=Classifier(dataset=dataset, estimator=KNeighborsClassifier,name="knn",use_cache=class_use_cache)
# Stack the models (mhg)
# Returns new dataset with out-of-fold prediction,model_svm,model_per
logging.info('stack_ds....')
pipeline = ModelsPipeline(model_knn)
# pipeline = ModelsPipeline(model_nb),model_nb,model_lr,model_lr2
stack_ds = pipeline.stack(k=8,seed=111)
# Second layer uses an "lr" model (stack2) — NOTE(review): the code below
# actually fits svm.LinearSVC.
logging.info('second layer....')
stacker = Classifier(dataset=stack_ds, estimator=svm.LinearSVC,use_cache=False,probability=False)
results = stacker.predict()
# Use 10-fold cross-validation results
results10 = stacker.validate(k=10,scorer=accuracy_score)
logging.info(results10)
# +1 on every prediction (presumably 0-based -> 1-based labels; confirm).
result_list=list(results+1)
# NOTE(review): test_id is built from test[["id"]] and then immediately
# overwritten by the range() list below — the first assignment is dead code.
test_id=list(test[["id"]].copy())
test_id=[i  for i in  range(len(result_list))]
logging.info('len of ....')
logging.info(len(result_list))
logging.info(len(test_id))
Esempio n. 11
0
# First-stage models over the same dataset.
model_rf = Regressor(dataset=dataset,
                     estimator=RandomForestRegressor,
                     parameters={'n_estimators': 50},
                     name='rf')
model_lr = Regressor(dataset=dataset,
                     estimator=LinearRegression,
                     parameters={'normalize': True},
                     name='lr')
model_knn = Regressor(dataset=dataset,
                      estimator=KNeighborsRegressor,
                      parameters={'n_neighbors': 15},
                      name='knn')
# NOTE(review): LogisticRegression is a classifier wrapped in Regressor; it
# is defined here but never added to the pipeline below — confirm intent.
model_lgt = Regressor(dataset=dataset,
                      estimator=LogisticRegression,
                      parameters={'penalty': 'l2'},
                      name='lgt')
xgbclf = Classifier(dataset=dataset, estimator=XGBClassifier)

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr, model_knn, xgbclf)
weights = pipeline.find_weights(mean_absolute_error)
# BUG FIX: pipeline.weight(...) only builds the weighted combiner; call
# .execute() to actually compute the weighted prediction (matches the
# pattern used by the other examples in this file).
result = pipeline.weight(weights).execute()
stack_ds = pipeline.stack(k=10, seed=111)

# Then, train LinearRegression on stacked data
# BUG FIX: the second-stage model must be trained on the stacked
# out-of-fold dataset (stack_ds), not on the original dataset.
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()

results = stacker.validate(k=10, scorer=mean_absolute_error)
Esempio n. 12
0
                           'n_jobs': -1
                       },
                       name='Perceptron',
                       use_cache=class_use_cache)
# First-stage LinearSVC classifier.
model_svc = Classifier(dataset=dataset,
                       estimator=svm.LinearSVC,
                       name="LinearSVC",
                       use_cache=class_use_cache)
# NOTE(review): exact duplicate of the assignment above — model_svc is
# rebound to an identical Classifier; one of the two can be removed.
model_svc = Classifier(dataset=dataset,
                       estimator=svm.LinearSVC,
                       name="LinearSVC",
                       use_cache=class_use_cache)

# Stack the models (mhg)
# Returns new dataset with out-of-fold predictionmodel_svc,
pipeline = ModelsPipeline(model_nb)
stack_ds = pipeline.stack(k=3, seed=111)
print(stack_ds.X_train.shape)
# Second layer: LogisticRegression stacker on the stacked features.
stacker = Classifier(dataset=stack_ds,
                     estimator=LogisticRegression,
                     parameters={
                         'C': 4,
                         'dual': True,
                         'n_jobs': -1
                     },
                     use_cache=False,
                     probability=False)
results = stacker.predict()
result_list = list(results)
Esempio n. 13
0
# First-stage XGBoost classifier.
# BUG FIX: the parameter was misspelled 'colsample_btree'; XGBoost's
# column-subsampling parameter is 'colsample_bytree', so the typo left
# column subsampling at its default value.
model_xgb = Classifier(dataset=dataset,
                       estimator=XGBClassifier,
                       parameters={
                           'subsample': 0.6,
                           'colsample_bytree': 0.6,
                           'random_state': 27,
                           'n_jobs': 1
                       },
                       name="xgb",
                       use_cache=class_use_cache)

# Stack the models (mhg1)
# Returns new dataset with out-of-fold prediction,model_svm,model_per
logging.info('stack_ds....')
# pipeline = ModelsPipeline(model_nb,model_lr,model_svc)
pipeline = ModelsPipeline(model_svc, model_lr)
# pipeline = ModelsPipeline(model_nb),model_nb,model_lr,model_lr2
stack_ds = pipeline.stack(k=8, seed=111)
# Second layer: LinearSVC on the stacked features.
logging.info('second layer....')
stacker = Classifier(dataset=stack_ds,
                     estimator=svm.LinearSVC,
                     use_cache=False,
                     probability=False)
results = stacker.predict()

# Cross-validation results (8-fold).
results10 = stacker.validate(k=8, scorer=accuracy_score)
logging.info(results10)

# print(accuracy_score(y_test, results))
Esempio n. 14
0
                estimator=ExtraTreesClassifier,
                parameters=et_param,
                name='et')
# LightGBM first-stage classifier (lgb_param / CACHE are defined outside
# this fragment).
lgb = Classifier(dataset=dataset,
                 estimator=LGBMClassifier,
                 use_cache=CACHE,
                 parameters=lgb_param,
                 name='lgb')
# LogisticRegression first-stage classifier.
lr = Classifier(dataset=dataset,
                estimator=LogisticRegression,
                use_cache=CACHE,
                parameters=lr_param,
                name='lr')
#------------------------------------------------------------------------------
#Stack the models and returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(knn, rf, et, lgb, lr)
stack_ds = pipeline.stack(k=NFOLDS, seed=1)

# Train LogisticRegression on stacked data (second stage)
lr1 = LogisticRegression
lr1_params = {
    'C': 5,
    'random_state': 1,
    'solver': 'liblinear',
    'multi_class': 'ovr',
}
stacker = Classifier(dataset=stack_ds,
                     estimator=lr1,
                     use_cache=False,
                     parameters=lr1_params)
							name='gdbt2')
# Remaining first-stage classifiers (reformatted; behavior unchanged).
# NOTE(review): model_xgbt passes xgboost-style parameters (nthread,
# min_child_weight, gamma, colsample_bytree, reg_alpha) to sklearn's
# GradientBoostingClassifier, which does not accept them — confirm the
# intended estimator (likely XGBClassifier).
model_gdbt3 = Classifier(dataset=dataset,
                         estimator=GradientBoostingClassifier,
                         parameters={'n_estimators': 600,
                                     'loss': 'deviance',
                                     'max_depth': 4,
                                     'min_samples_split': 10,
                                     'min_weight_fraction_leaf': 0.01,
                                     'learning_rate': 0.07,
                                     'random_state': 3},
                         name='gdbt3')
model_xgbt = Classifier(dataset=dataset,
                        estimator=GradientBoostingClassifier,
                        parameters={'n_estimators': 1350,
                                    'nthread': -1,
                                    'max_depth': 3,
                                    'min_child_weight': 6,
                                    'learning_rate': 0.05,
                                    'gamma': 0,
                                    'subsample': 0.9,
                                    'colsample_bytree': 0.9,
                                    'reg_alpha': 8},
                        name='xgbt')
model_ext1 = Classifier(dataset=dataset,
                        estimator=ExtraTreesClassifier,
                        parameters={'n_estimators': 700,
                                    'max_depth': 39,
                                    'n_jobs': -1,
                                    'criterion': 'gini',
                                    'min_samples_split': 18},
                        name='ext1')
model_ext2 = Classifier(dataset=dataset,
                        estimator=ExtraTreesClassifier,
                        parameters={'n_estimators': 700,
                                    'max_depth': 39,
                                    'n_jobs': -1,
                                    'criterion': 'entropy',
                                    'min_samples_split': 18},
                        name='ext2')


# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf1,model_rf2,model_gdbt1,model_gdbt2,model_gdbt3,model_ext1,model_ext2)
stack_ds = pipeline.stack(k=5,seed=111)

# Train LinearRegression on stacked data (second stage)
# NOTE(review): despite the comment above, the stacker is a
# LogisticRegression classifier.
stacker1 = Classifier(dataset=stack_ds, estimator=LogisticRegression,parameters={'C': 10})
# stacker2 = Classifier(dataset=stack_ds, estimator=LogisticRegression,parameters={'C': 1,'penalty':'l1'})
# stacker3 = Classifier(dataset=stack_ds, estimator=SVC,parameters={'probability':True,'C':100})
# stacker4 = Classifier(dataset=stack_ds, estimator=SVC,parameters={'probability':True,'C':10})
pre_y1 = stacker1.predict()
# pre_y2 = stacker2.predict()
# pre_y3 = stacker3.predict()
# pre_y4 = stacker4.predict()

#print(pre_y)
# Compute AUC
# fpr, tpr, thresholds = metrics.roc_curve(test_y, pre_y1)
                        estimator=xgb_feature2,
                        name='xgb2',
                        use_cache=False)
 # Additional first-stage regressors (the custom estimator callables
 # xgb_feature3 / lgb_feature / gbdt_model are defined outside this fragment).
 model_xgb3 = Regressor(dataset=xgb_dataset,
                        estimator=xgb_feature3,
                        name='xgb3',
                        use_cache=False)
 model_lgb = Regressor(dataset=lgb_dataset,
                       estimator=lgb_feature,
                       name='lgb',
                       use_cache=False)
 model_gbdt = Regressor(dataset=xgb_dataset,
                        estimator=gbdt_model,
                        name='gbdt',
                        use_cache=False)
 # Stack all first-stage models into 5-fold out-of-fold predictions.
 pipeline = ModelsPipeline(model_xgb, model_xgb2, model_xgb3, model_lgb,
                           model_gbdt)
 stack_ds = pipeline.stack(k=5,
                           seed=111,
                           add_diff=False,
                           full_test=True)
 # Second stage: linear blend without an intercept.
 stacker = Regressor(dataset=stack_ds,
                     estimator=LinearRegression,
                     parameters={'fit_intercept': False})
 predict_result = stacker.predict()
 # Attach predictions to the submission frame and min-max scale PROB to [0, 1].
 ans = pd.read_csv('../AI_risk_test_V3.0/test_list.csv',
                   parse_dates=['appl_sbm_tm'])
 ans['PROB'] = predict_result
 ans = ans.drop(['appl_sbm_tm'], axis=1)
 minmin, maxmax = min(ans['PROB']), max(ans['PROB'])
 ans['PROB'] = ans['PROB'].map(lambda x: (x - minmin) /
                               (maxmax - minmin))
Esempio n. 17
0
                                                        test_size=0.1,
                                                        random_state=111)
    return X_train, y_train, X_test, y_test


# Dataset built lazily from the boston_dataset preprocessor, with caching.
dataset = Dataset(preprocessor=boston_dataset, use_cache=True)
# NOTE(review): LinearRegression's 'normalize' parameter was removed in
# scikit-learn 1.2 — confirm the pinned sklearn version still supports it.
model = Regressor(dataset=dataset,
                  estimator=LinearRegression,
                  parameters={'normalize': True},
                  name='lr')
model_2 = Regressor(dataset=dataset,
                    estimator=RandomForestRegressor,
                    parameters={'n_estimators': 50},
                    name='rf')

pipeline = ModelsPipeline(model, model_2)


def test_apply():
    """pipeline.apply with a mean combiner yields outputs of the expected sizes."""
    mean_fn = lambda preds: np.mean(preds, axis=0)

    # One averaged prediction per test row.
    predictions = pipeline.apply(mean_fn).execute()
    assert predictions.shape[0] == dataset.X_test.shape[0]

    # 10-fold validation returns one score per fold.
    scores = pipeline.apply(mean_fn).validate(scorer=mean_absolute_error, k=10)
    assert len(scores) == 10


def test_simple_functions():
    """max/mean/gmean combiners each return one row per test sample."""
    n_test_rows = dataset.X_test.shape[0]
    assert n_test_rows == pipeline.max().execute().shape[0]
    assert n_test_rows == pipeline.mean().execute().shape[0]
    assert n_test_rows == pipeline.gmean().execute().shape[0]
# BUG FIX: Dataset is used below but was never imported — this snippet
# raised NameError at the Dataset(...) call.
from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline

# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the
# pinned sklearn version still provides it.
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Load the Boston housing data and hold out 10% as a test set.
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

# create dataset
Data = Dataset(X_train,y_train,X_test)

# initialize RandomForest & LinearRegression first-stage models
RfModel = Regressor(dataset=Data, estimator=RandomForestRegressor, parameters={'n_estimators': 50},name='rf')
LRModel = Regressor(dataset=Data, estimator=LinearRegression, parameters={'normalize': True},name='lr')


# Stack two models
# Returns new dataset with out-of-fold predictions
Pipeline = ModelsPipeline(RfModel,LRModel)
StackModel = Pipeline.stack(k=10,seed=2)

# Train LinearRegression on stacked data (second stage)
Stacker = Regressor(dataset=StackModel, estimator=LinearRegression)
Results = Stacker.predict()
# Validate results using 10 fold cross-validation
Results = Stacker.validate(k=10,scorer=mean_absolute_error)
Esempio n. 19
0
                           'max_features': 'sqrt',
                           'min_samples_leaf': 15,
                           'min_samples_split': 10
                       },
                       name='gbdt')
# XGBoost first-stage regressor (dataset and the other model_* regressors
# are defined outside this fragment).
model_xgb = Regressor(dataset=dataset,
                      estimator=xgb.XGBRegressor,
                      parameters={
                          'n_estimators': 50,
                          'learning_rate': 0.05,
                          'max_depth': 3
                      },
                      name='xgb')

# Stack the models
pipeline = ModelsPipeline(model_lr, model_rf, model_gbdt, model_xgb)
stack_ds = pipeline.stack(k=10, seed=111)
# Second layer: XGBoost stacker
stacker = Regressor(dataset=stack_ds, estimator=xgb.XGBRegressor)
results = stacker.predict()
# Use 10-fold cross-validation results
results10 = stacker.validate(k=10, scorer=mean_squared_error)
print("r2_score: %f" % r2_score(y_test, results))

# Assemble an output table: features, target, and predictions side by side.
# NOTE(review): test_y is built here but never used in this fragment.
test_y = pd.DataFrame(y_test)
predictions = pd.DataFrame(results)
data = pd.concat([data_XX, data_drop], axis=1)
data = pd.concat([data, data_y], axis=1)
data = pd.concat([data, predictions], axis=1)
data = np.array(data)
with open('C:/20180402_pre_test.csv', 'w') as f: