Esempio n. 1
0
def main():
    """Train house-price regressors and write Kaggle submission files.

    Steps: load the train/test CSVs, drop known outliers, log1p-transform
    the target, run the project preprocessing pipeline over the combined
    train+test frame, grid-search seven regressors, write one submission
    per model, then a stacked model and an SLSQP-weighted average blend.
    """
    # ------------------------------------------------------------- load data
    train_set = pd.read_csv('../data/train.csv')
    test_set = pd.read_csv('../data/test.csv')

    # Without outlier remover, with basic nanRemover 0.12416413124809748
    # ------------------------------------------------------- remove outliers
    outliers = train_set[train_set['GrLivArea'] > 4500].index
    print(outliers)

    # NOTE(review): the index computed above is discarded in favour of this
    # hard-coded label list -- confirm it still matches the current CSV.
    outliers = [197, 523, 691, 854, 1182, 1298]

    train_set.drop(outliers, inplace=True)

    # With outlier remover 0.10970218665126451
    # ------------------------------------------------ fix SalePrice skewness
    # Train in log space; every prediction below is mapped back via np.expm1.
    train_set["SalePrice"] = np.log1p(train_set["SalePrice"])
    y_train_values = train_set["SalePrice"].values

    # ------------------------------------------------- prepare combined data
    train_set_id = train_set['Id']
    test_set_id = test_set['Id']

    train_set_rows = train_set.shape[0]

    train_set.drop('Id', axis=1, inplace=True)
    test_set.drop('Id', axis=1, inplace=True)
    train_set.drop('SalePrice', axis=1, inplace=True)

    # Transform train and test together so every step sees the same columns.
    combined_data = pd.concat((train_set, test_set))

    # ----------------------------------------------- data transform pipeline
    # All steps are project-local transformers.
    transform_pipeline = Pipeline(steps=[
        ('OutlierRemover', OutlierRemover()),
        ('NaNImputer', NaNImputer()),
        ('NaNRemover', NaNRemover()),
        ('AdditionalFeatureGenerator', AdditionalFeatureGenerator()),
        ('TypeTransformer', TypeTransformer()),
        ('ErrorImputer', ErrorImputer()),
        ('SkewFixer', SkewFixer()),
        ('Scaler', Scaler()),
        ('FeatureDropper', FeatureDropper()),
        ('Dummyfier', Dummyfier()),
    ])

    transformed_data = transform_pipeline.transform(combined_data)
    train_data = transformed_data[:train_set_rows]
    predict_data = transformed_data[train_set_rows:]

    # ------------------------------------------------- hyper-parameter grids
    rf_param = {
        # 'bootstrap': [True],
        'max_depth': [3, 4, 5],
        'min_samples_leaf': [3, 4, 5],
        'n_estimators': [5, 7, 10]
    }
    ls_param = {
        'alpha': [0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008],
        'max_iter': [10000],
        "normalize": [False]
    }

    elnet_param = {
        'alpha': [0.0003, 0.0004, 0.0005],
        'l1_ratio': [0.9, 0.95, 0.99, 1],
        'max_iter': [10000]
    }

    ridge_param = {'alpha': [10, 10.1, 10.2, 10.3, 10.4, 10.5]}

    svr_param = {
        'gamma': [1e-08, 1e-09],
        'C': [100000, 110000],
        'epsilon': [1, 0.1, 0.01]
    }
    gbm_param = {
        "n_estimators": [1000],
        'min_child_weight': [1, 5],
        'gamma': [0.1, 0.2],
        'subsample': [0.6],
        'colsample_bytree': [0.6],
        'max_depth': [3, 4],
        'eta': [0.01],
        'eval_metric': ['mae']
    }

    lgb_params = {
        'objective': ['regression'],
        'num_leaves': [255],
        'max_depth': [8],
        'bagging_seed': [3],
        'boosting_type': ['gbdt'],
        'min_sum_hessian_in_leaf': [100],
        'learning_rate': np.linspace(0.05, 0.1, 2),
        'bagging_fraction': np.linspace(0.7, 0.9, 2),
        'bagging_freq': np.linspace(30, 50, 3, dtype='int'),
        'max_bin': [15, 63],
    }

    # --------------------------------------- grid-search the base regressors
    rf = get_best_estimator(train_data,
                            y_train_values,
                            estimator=RandomForestRegressor(),
                            params=rf_param,
                            n_jobs=4)
    elnet = get_best_estimator(train_data,
                               y_train_values,
                               estimator=ElasticNet(),
                               params=elnet_param,
                               n_jobs=4)
    lso = get_best_estimator(train_data,
                             y_train_values,
                             estimator=Lasso(),
                             params=ls_param,
                             n_jobs=4)
    rdg = get_best_estimator(train_data,
                             y_train_values,
                             estimator=Ridge(),
                             params=ridge_param,
                             n_jobs=4)
    svr = get_best_estimator(train_data,
                             y_train_values,
                             estimator=SVR(),
                             params=svr_param,
                             n_jobs=4)
    gbm = get_best_estimator(train_data,
                             y_train_values,
                             estimator=xgb.XGBRegressor(),
                             params=gbm_param,
                             n_jobs=4)
    lbm = get_best_estimator(train_data,
                             y_train_values,
                             estimator=lgb.LGBMRegressor(),
                             params=lgb_params,
                             n_jobs=4)

    def cv_rmse(model):
        """Return the per-fold RMSE of `model` under 5-fold shuffled CV."""
        kfolds = KFold(n_splits=5, shuffle=True, random_state=42)
        return np.sqrt(-cross_val_score(model,
                                        train_data,
                                        y_train_values,
                                        scoring="neg_mean_squared_error",
                                        cv=kfolds))

    # print("Randomforest  model rmse : ", cv_rmse(rf).mean())
    # print("elastic model rmse : ", cv_rmse(elnet).mean())
    # print("lasso model rmse : ", cv_rmse(lso).mean())
    # print("ridge model rmse : ", cv_rmse(rdg).mean())
    # print("svr model rmse : ", cv_rmse(svr).mean())
    # print("xgboost model rmse : ", cv_rmse(gbm).mean())
    # print("lightgbm model rmse : ", cv_rmse(lbm).mean())

    def write_submission(estimator, tag):
        """Write `estimator`'s expm1-inverted test predictions to
        submission_<tag>.csv."""
        pd.DataFrame({
            "Id": test_set_id,
            "SalePrice": np.expm1(estimator.predict(predict_data))
        }).to_csv('submission_{}.csv'.format(tag), index=False)

    # One submission file per base model (same filenames as before).
    for tag, estimator in (('rf', rf), ('elnet', elnet), ('lso', lso),
                           ('rdg', rdg), ('svr', svr), ('gbm', gbm),
                           ('lbm', lbm)):
        write_submission(estimator, tag)

    # ------------------------------------------------------ stacked ensemble
    model = StackingRegressor(regressors=[rf, elnet, lso, rdg, svr],
                              meta_regressor=Lasso(alpha=0.0005))

    # Fit the model on our data
    model.fit(train_data, y_train_values)
    print("StackingRegressor model rmse : ", cv_rmse(model).mean())

    # y_pred = model.predict(train_data)
    # print(sqrt(mean_squared_error(y_train_values, y_pred)))

    # Predict test set and export the stacking submission.
    ensembled = np.expm1(model.predict(predict_data))
    submission = pd.DataFrame({"Id": test_set_id, "SalePrice": ensembled})
    submission.to_csv('submission_stacking.csv', index=False)

    # ------------------------------------------------------ ensemble weights
    # Find blend weights in [0, 1] minimising in-sample MSE via SLSQP.
    from scipy.optimize import minimize
    regressors = [rf, elnet, lso, rdg, svr, gbm, lbm]

    # In-sample predictions of every model (inputs to the weight search).
    predictions = [clf.predict(train_data) for clf in regressors]

    def mse_func(weights):
        """In-sample MSE of the blend (scipy passes weights as an ndarray)."""
        final_prediction = 0
        for weight, prediction in zip(weights, predictions):
            final_prediction += weight * prediction
        return mean_squared_error(y_train_values, final_prediction)

    starting_values = [0.5] * len(predictions)  # minimize needs a start point
    bounds = [(0, 1)] * len(predictions)  # weights are bound between 0 and 1
    res = minimize(mse_func, starting_values, bounds=bounds, method='SLSQP')
    print('Result Assessment: {message_algo}'.format(
        message_algo=res['message']))
    print('Ensemble Score: {best_score}'.format(best_score=res['fun']))
    print('Best Weights: {weights}'.format(weights=res['x']))

    # Weighted average of all seven models.
    # BUG FIX: the last term previously called `lgb.predict(...)` -- `lgb` is
    # the lightgbm *module*, not the fitted `lbm` estimator, so the original
    # line raised AttributeError.  Iterating `regressors` uses the fitted
    # models in the same order as the optimised weights.
    sale_price_ensemble = sum(
        np.expm1(estimator.predict(predict_data)) * res['x'][i]
        for i, estimator in enumerate(regressors))

    submission = pd.DataFrame({
        "Id": test_set_id,
        "SalePrice": sale_price_ensemble
    })
    submission.to_csv('submission_average.csv', index=False)
Esempio n. 2
0
# Exported notebook cells: the bare expressions below only render output in
# Jupyter; they are no-ops when run as a plain script.
nn_gs.best_estimator_

# In[48]:

# Top-ranked MLP configurations from the grid search, by CV rank.
pd.DataFrame(nn_gs.cv_results_).sort_values('rank_test_score').head()[[
    'param_activation', 'param_hidden_layer_sizes', 'param_solver',
    'rank_test_score'
]]

# In[47]:

nn_gs.cv_results_

# ### Stacking

# In[41]:

from mlxtend.regressor import StackingRegressor

# Stack the best estimator of each earlier grid search (tree, kNN, linear,
# SVR); the SVR winner is reused as the meta-regressor.
stregr = StackingRegressor(regressors=[
    tree_gs.best_estimator_, knn_gs.best_estimator_, lr_gs.best_estimator_,
    svr_gs.best_estimator_
],
                           meta_regressor=svr_gs.best_estimator_)

stregr.fit(X_train, y_train)

# Score the stack on the held-out set with the project's RMSLE metric.
y_pred = stregr.predict(X_test)
print(rmsle_metric(y_pred, y_test))
Esempio n. 3
0
from sklearn.model_selection import train_test_split

# Hold out a validation split from the training data (default 75/25).
train_X1, test_X, train_y1, test_y = train_test_split(train_X, train_y)

# Initialize models
lr = LinearRegression(n_jobs=-1)

rd = Ridge(alpha=4.84)

rf = RandomForestRegressor(n_estimators=12, max_depth=3, n_jobs=-1)

gb = GradientBoostingRegressor(n_estimators=40, max_depth=2)

nn = MLPRegressor(hidden_layer_sizes=(90, 90), alpha=2.75)

# Stack the four base regressors under a linear meta-regressor.
model = StackingRegressor(regressors=[rf, gb, nn, rd], meta_regressor=lr)
# Fit the model on our data
# NOTE(review): the model is fitted on the FULL train_X rather than the
# train_X1 split created above, so test_X below overlaps the training data
# and any score on it is optimistic -- confirm this is intentional.
model.fit(train_X, train_y)

# In-sample RMSE of the stacked model.
y_pred = model.predict(train_X)
print(sqrt(mean_squared_error(train_y, y_pred)))
Y_pred = model.predict(test_X)

#process data and imputes values into missing data
#from sklearn.preprocessing import Imputer
#le_imputer=Imputer()
#train_X=le_imputer.fit_transform(train_X)
#val_X=le_imputer.transform(train_X)
#print(train_X)
#print(val_X)
#forest_model=RandomForestRegressor()
Esempio n. 4
0
        def post(self):
            """Handle a POST: build, fit and persist the requested estimator(s).

            The JSON body maps an estimator name (or the special key
            'ensemble Learning') to its parameter dict.  The fitted model is
            dumped to 'interfaceMl/DataSet/newModel.pkl'.  Returns a JSON
            boolean: True when an unexpected error occurred.

            NOTE(review): validation messages accumulated in `resultatFinal`
            are never returned to the caller -- confirm this is intended.
            """

            def no_validation_error(message):
                # Validators signal success with an empty/whitespace string.
                return isinstance(message, str) and not (message
                                                         and message.strip())

            receive_json = request.get_json()
            error = False
            try:
                for nameClassifier, dictParams in receive_json.items():
                    if nameClassifier == 'ensemble Learning':
                        estimators = []
                        typeOfClassifier = ""
                        resultatFinal = ""
                        # Validate and instantiate every requested sub-estimator.
                        for index, classifier in dictParams.items():
                            for nameSubClass, dicoValueParams in \
                                    classifier.items():
                                typeOfClassifier = dicoValueParams.pop(
                                    'typeOf', None)
                                resultValidation = validationClassifier(
                                    dicoValueParams, nameSubClass,
                                    typeOfClassifier)
                                newValue = resultValidation[0]
                                if no_validation_error(resultValidation[1]):
                                    if typeOfClassifier == "classifier":
                                        clfChild = dictEstimator[nameSubClass]()
                                    elif typeOfClassifier == "regressor":
                                        clfChild = dictEstimatorRegr[
                                            nameSubClass]()
                                    # apply the parameters sent in the request
                                    clfChild.set_params(**newValue)

                                    estimators.append((nameSubClass, clfChild))
                                else:
                                    # accumulate validation error messages
                                    resultatFinal += "{0} in {1}.\n".format(
                                        resultValidation[1], nameSubClass)
                        # Proceed only when every sub-estimator validated.
                        if no_validation_error(resultatFinal):
                            try:
                                if typeOfClassifier == "classifier":
                                    clfEnsemble = VotingClassifier(estimators)
                                    clfEnsemble.fit(x_data_filtered,
                                                    y_data_filtered)
                                elif typeOfClassifier == "regressor":
                                    est = [b for a, b in estimators]
                                    svr_rbf = SVR(kernel='rbf')
                                    clfEnsemble = StackingRegressor(
                                        regressors=est, meta_regressor=svr_rbf)
                                    # BUG FIX: the regressor ensemble was
                                    # previously dumped WITHOUT being fitted
                                    # (unlike the classifier branch above and
                                    # the single-estimator branch below), so
                                    # the pickled model could never predict.
                                    clfEnsemble.fit(x_data_filtered,
                                                    y_data_filtered)
                                # persist the fitted ensemble
                                joblib.dump(
                                    clfEnsemble,
                                    'interfaceMl/DataSet/newModel.pkl')

                            except Exception:
                                resultatFinal += traceback.format_exc()

                    else:
                        # Single estimator: validate, build, fit, persist.
                        typeOfClassifier = dictParams.pop('typeOf', None)
                        resultValidation = validationClassifier(
                            dictParams, nameClassifier, typeOfClassifier)
                        newValue = resultValidation[0]
                        resultatFinal = resultValidation[1]
                        if no_validation_error(resultatFinal):
                            try:
                                if typeOfClassifier == "classifier":
                                    # create the classifier
                                    clf = dictEstimator[nameClassifier]()
                                    # apply the parameters sent in the request
                                    clf.set_params(**newValue)

                                elif typeOfClassifier == "regressor":
                                    # create the regressor
                                    clf = dictEstimatorRegr[nameClassifier]()
                                    # apply the parameters sent in the request
                                    clf.set_params(**newValue)

                                clf.fit(x_data_filtered, y_data_filtered)
                                joblib.dump(
                                    clf, 'interfaceMl/DataSet/newModel.pkl')

                            except Exception:
                                resultatFinal += traceback.format_exc()
            except Exception:
                # Malformed payloads only flip the returned flag.
                error = True
            return jsonify(error)
Esempio n. 5
0
# Gradient-boosted-trees baseline with mostly-default settings spelled out.
# NOTE(review): `presort` was removed in scikit-learn 0.24 and `loss='ls'`
# was renamed in later versions -- this call assumes an older sklearn.
gbrt = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=10, max_features=1.0,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=10,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=50, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)
gbrt.fit(X_train, y_train)

print("3")
# Predicting the Test set results
y_pred = gbrt.predict(X_valid)
# Root-mean-squared error on the validation set.
rms = sqrt(mean_squared_error(y_valid, y_pred))

print("gbrt rms= %f" % rms)
evaluate_strategy(y_pred, False)
print("4")
# Stack the boosted trees with the externally defined ridge model.
regressors = [gbrt, ridgeReg]

lr = LinearRegression()

print("5")
# refit=True clones and refits the base regressors inside the stack.
stregr = StackingRegressor(regressors=regressors, meta_regressor=lr, refit=True)
stregr.fit(X_train, y_train)
print("6")
outpred = stregr.predict(X_valid)

print(outpred)
print("7")
evaluate_strategy(outpred)
Esempio n. 6
0
        def post(self):
            """Handle a POST: validate, build and cross-validate estimators.

            The JSON body maps an estimator name (or the special key
            'ensemble Learning') to its parameter dict.  Each estimator is
            scored with 3-fold cross-validation and the score text is written
            back into the payload, which is returned as JSON.
            """
            receive_json = request.get_json()
            resultatFinal = ""

            for nameClassifier, dictParams in receive_json.items():
                if nameClassifier == 'ensemble Learning':
                    estimators = []
                    for index, classifier, in dictParams.items():
                        for nameSubClass, dicoValueParams, in classifier.items(
                        ):
                            # remove the 'typeOf' marker before validation
                            typeOfClassifier = dicoValueParams.pop(
                                'typeOf', None)
                            # validate the sub-estimator's parameters
                            resultValidation = validationClassifier(
                                dicoValueParams, nameSubClass,
                                typeOfClassifier)
                            # the validated parameter dictionary
                            newValue = resultValidation[0]

                            # empty/whitespace message means validation passed
                            if type(resultValidation[1]) == str and not (
                                    resultValidation[1]
                                    and resultValidation[1].strip()):
                                # create the estimator by its declared type
                                if typeOfClassifier == "classifier":
                                    clfChild = dictEstimator[nameSubClass]()
                                elif typeOfClassifier == "regressor":
                                    clfChild = dictEstimatorRegr[nameSubClass](
                                    )

                                # apply the parameters sent in the request
                                clfChild.set_params(**newValue)

                                # add the estimator to the ensemble list
                                estimators.append((nameSubClass, clfChild))
                            # otherwise accumulate the validation errors
                            else:
                                resultatFinal += "{0} in {1}.\n".format(
                                    resultValidation[1], nameSubClass)

                    # Proceed only when no sub-estimator failed validation.
                    # NOTE(review): `typeOfClassifier` here is whatever the
                    # LAST sub-estimator declared (and is unbound when
                    # dictParams is empty) -- confirm mixed classifier/
                    # regressor payloads cannot occur.
                    if type(resultatFinal) == str and not (
                            resultatFinal and resultatFinal.strip()):
                        try:
                            if typeOfClassifier == "classifier":
                                # ensemble of classifiers: majority voting
                                clfEnsemble = VotingClassifier(estimators)
                                # evaluate with 3-fold cross-validation
                                scores = cross_val_score(clfEnsemble,
                                                         x_data_filtered,
                                                         y_data_filtered,
                                                         cv=3)
                                # store the score text for the response
                                resultatFinal = (
                                    "Accuracy: %0.2f (+/- %0.2f)" %
                                    (scores.mean(), scores.std() * 2))

                            elif typeOfClassifier == "regressor":

                                # ensemble of regressors stacked under an SVR
                                svr_rbf = SVR(kernel='rbf')
                                clfEnsemble = StackingRegressor(
                                    regressors=[b for a, b in estimators],
                                    meta_regressor=svr_rbf)
                                # evaluate with 3-fold cross-validation
                                scores = cross_val_score(
                                    clfEnsemble,
                                    x_data_filtered,
                                    y_data_filtered,
                                    cv=3,
                                    scoring='neg_mean_squared_error')
                                # store the score text for the response
                                resultatFinal = (
                                    "negatif mean squared error: %0.2f" %
                                    (scores.mean()))

                        except Exception:
                            # report any scikit-learn error to the caller
                            resultatFinal += traceback.format_exc()

                else:
                    # extract the type of estimator
                    typeOfClassifier = dictParams.pop('typeOf', None)

                    # validate all parameters
                    resultValidation = validationClassifier(
                        dictParams, nameClassifier, typeOfClassifier)

                    # the validated parameter dictionary
                    newValue = resultValidation[0]

                    resultatFinal = resultValidation[1]

                    # empty/whitespace message means validation passed
                    if type(resultatFinal) == str and not (
                            resultatFinal and resultatFinal.strip()):
                        try:
                            if typeOfClassifier == "classifier":
                                # create the classifier
                                clf = dictEstimator[nameClassifier]()
                                # apply the parameters sent in the request
                                clf.set_params(**newValue)
                                # evaluate with 3-fold cross-validation
                                scores = cross_val_score(clf,
                                                         x_data_filtered,
                                                         y_data_filtered,
                                                         cv=3)

                                # store the score text for the response
                                resultatFinal = (
                                    "Accuracy: %0.2f (+/- %0.2f)" %
                                    (scores.mean(), scores.std() * 2))

                            elif typeOfClassifier == "regressor":
                                # create the Regressor
                                clf = dictEstimatorRegr[nameClassifier]()
                                # apply the parameters sent in the request
                                clf.set_params(**newValue)
                                # evaluate with 3-fold cross-validation
                                scores = cross_val_score(
                                    clf,
                                    x_data_filtered,
                                    y_data_filtered,
                                    scoring='neg_mean_squared_error',
                                    cv=3)

                                # store the score text for the response
                                resultatFinal = (
                                    "negatif mean squared error: %0.2f" %
                                    (scores.mean()))

                        except Exception:
                            # report any scikit-learn error to the caller
                            resultatFinal += traceback.format_exc()

            # NOTE(review): this statement is OUTSIDE the loop, so only the
            # LAST estimator's result is written back (and it raises
            # NameError on an empty payload) -- confirm single-estimator
            # payloads are guaranteed, or indent it into the loop.
            receive_json[nameClassifier]['resultat'] = resultatFinal

            # Return the received object enriched with the result
            return jsonify(receive_json)
Esempio n. 7
0
# Set the model parameters (adapted from another assignment): parameters are
# taken from the model params store; when a parameter is absent, a grid
# search is run instead.
model = ['mlp' for i in range(10)]  # ten mlps
model_stack = models.model_factory()
for k in model:
    model_stack.add_model(k)
model_stack.set_parameters(x, y)

# model fusion part, use stacking

mods = model_stack.get_models()
# The first MLP doubles as the meta-regressor; the meta level also sees the
# original features (use_features_in_secondary=True).
sclf = StackingRegressor(regressors=mods,
                         use_features_in_secondary=True,
                         meta_regressor=mods[0],
                         verbose=0)
sclf.fit(x, y)
result = sclf.predict(test)

# map the prediction back to the original target scale
if not log:
    # flip any negative predictions to their absolute value
    result = [-x if x < 0 else x for x in result]
    result = list(result)
else:
    # target was trained as log1p, so invert with exp(.) - 1
    result = [np.exp(result[j]) - 1 for j in range(len(result))]

# compare the MSE with the current best result to decide which to submit

output = result
# NOTE(review): this rebinds `xgb` from the xgboost *module* to an estimator
# instance, shadowing the module for the rest of the file.
xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=3, min_child_weight=1.7817, n_estimators=2200,reg_alpha=0.4640, reg_lambda=0.8571,subsample=0.5213, silent=1,nthread = -1)
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)
print('RMSE for XGBoost is {:.4f}'.format(sqrt(mean_squared_error(y_test, y_pred))))
xgb.score(X_test,y_test)

#######support vector########
from sklearn.svm import SVR
regressor_s  = SVR(kernel = 'linear') 
regressor_s.fit(X_train,y_train)

#####stacking

from mlxtend.regressor import StackingRegressor
from mlxtend.data import boston_housing_data
# Stack three externally defined base regressors under the tuned XGB model.
stregr = StackingRegressor(regressors=[regressor,GBoost ,regressor_r], 
                           meta_regressor=xgb)
stregr.fit(X_train,y_train)
y_pred = stregr.predict(X_test)
print('RMSE for Stacked Regression is {:.4f}'.format(sqrt(mean_squared_error(y_test, y_pred))))
stregr.score(X_test,y_test)

# Simple Keras baseline: 220 inputs -> 200 relu units -> 1 linear output.
model = Sequential()
model.add(Dense(200, input_dim=220, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.compile(loss='mean_squared_error', optimizer=keras.optimizers.Adadelta()) # Mean squared error
model.fit(X_train, y_train, validation_data=(X_test, y_test),epochs=66, batch_size=32, verbose=2)
y_pred = model.predict(X_test)
print('RMSE for Neural Network is {:.4f}'.format(sqrt(mean_squared_error(y_test, y_pred))))
# NOTE(review): keras Sequential has no .score method -- this line raises
# AttributeError; model.evaluate(X_test, y_test) is the Keras equivalent.
model.score(X_test,y_test)

plt.style.use('ggplot')
Esempio n. 9
0
# Model the target in log space; invert with np.expm1 when predicting.
y_train = np.log1p(all_y)
# NOTE(review): despite the name, `rf` is a Ridge model, not a random forest.
rf = Ridge(alpha=7.5, tol=0.00001)
lass = Lasso(alpha=0.0005, random_state=5, max_iter=9999)
#999999999
Elas = ElasticNet(alpha=0.0008, random_state=5, max_iter=99999)
gb = GradientBoostingRegressor(n_estimators=800,
                               learning_rate=0.05,
                               max_depth=4,
                               max_features='sqrt',
                               min_samples_leaf=15,
                               min_samples_split=10,
                               loss='huber',
                               random_state=5)
kn = KNeighborsRegressor(n_jobs=3, n_neighbors=5)
# NOTE(review): `str` shadows the builtin str from here on -- rename if any
# later code needs the builtin.
str = StackingRegressor(regressors=[lass, kn, Elas, gb],
                        verbose=1,
                        meta_regressor=rf)
print('Overall RMPSE')
# 10-fold CV; scores are negative MSE and are converted to RMSE below.
cv = cross_validate(str,
                    train_x,
                    y_train,
                    scoring=('neg_mean_squared_error'),
                    return_train_score=False,
                    cv=10)
print(np.sqrt(np.abs(np.mean(cv['test_score']))))

#Use when Submitting Below#
'''
test_x=temp.tail(1459)
str.fit(train_x,y_train)
preds=np.expm1(str.predict(test_x))
Esempio n. 10
0
# Median-impute missing feature values before training.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer.fit(features)
features = imputer.transform(features)

# ...........................Modal Training data.............................................

kfold = KFold(n_splits=3)

# One fitted stacking model is kept per fold.
models = []

for idx, (train_idx, val_idx) in enumerate(kfold.split(features)):

    train_features, train_target = features[train_idx], target[train_idx]
    val_features, val_target = features[val_idx], target[val_idx]

    # Linear + LGBM base learners with an LGBM meta-learner; the meta level
    # also sees the original features (use_features_in_secondary=True).
    model = StackingRegressor(regressors=(LinearRegression(), LGBMRegressor()),
        meta_regressor=LGBMRegressor(), use_features_in_secondary=True)

    model.fit(np.array(train_features), np.array(train_target))
    models.append(model)

    # Out-of-fold RMSE for this fold.
    print('RMSE: {:.4f} of fold: {}'.format(
        np.sqrt(mean_squared_error(val_target, model.predict(np.array(val_features)))), idx))

    # Free fold-local arrays to keep peak memory down.
    del train_features, train_target, val_features, val_target; gc.collect()

del features, target; gc.collect()



#.......................Test data import and processing...........................