Exemple #1
0
def stack_model(X_train, y_train, X_val, y_val):
    """Fit a linear-regression stacking model, report it, and pickle it.

    The model is trained on ``(X_train, y_train)``, scored on
    ``(X_val, y_val)`` with the externally defined ``rmsle`` helper, and
    written to ``../result/model/stack_model_lin.pkl`` via ``joblib``.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    from sklearn.linear_model import LinearRegression

    stacker = LinearRegression()
    stacker.fit(X_train, y_train)

    # Hold-out score of the stacker on the validation split.
    val_loss = rmsle(y_val, stacker.predict(X_val))
    print('Stacking loss is:%f' % val_loss)
    # For a linear model the "parameters" shown are the learned coefficients.
    print('Best parameter:%s' % stacker.coef_)

    joblib.dump(stacker, '../result/model/stack_model_lin.pkl')
    return stacker
Exemple #2
0
def stack_knn(X_train, y_train, X_val, y_val):
    """Grid-search a k-nearest-neighbours stacking regressor and pickle it.

    ``n_neighbors`` is searched over a fixed list with ``cv=4`` using the
    externally defined ``scoring_rmsle`` scorer. The fitted search object is
    scored on ``(X_val, y_val)`` with ``rmsle`` and dumped to
    ``../result/model/stack_model_knn.pkl``.

    Returns:
        The fitted ``GridSearchCV`` object (not just its best estimator).
    """
    from sklearn.neighbors import KNeighborsRegressor

    neighbor_grid = {'n_neighbors': [20, 25, 30, 35, 50]}
    base_model = KNeighborsRegressor()
    search = GridSearchCV(
        base_model,
        param_grid=neighbor_grid,
        scoring=scoring_rmsle,
        cv=4,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    # Predicting through the search object, as the original does.
    val_loss = rmsle(y_val, search.predict(X_val))
    print('Stacking loss is:%f' % val_loss)
    print('Best parameter:%s' % search.best_estimator_)

    joblib.dump(search, '../result/model/stack_model_knn.pkl')
    return search
Exemple #3
0
def stack_lasso(X_train, y_train, X_val, y_val):
    """Grid-search a Lasso stacking regressor and pickle the search object.

    ``alpha`` is searched over a fixed list with ``cv=4`` using the
    externally defined ``scoring_rmsle`` scorer. The fitted search is scored
    on ``(X_val, y_val)`` with ``rmsle`` and dumped to
    ``../result/model/stack_model_lasso.pkl``.

    Returns:
        The fitted ``GridSearchCV`` object (not just its best estimator).
    """
    from sklearn.linear_model import Lasso

    alpha_grid = {'alpha': [0.005, 0.1, 0.3, 0.5, 0.8, 1]}
    search = GridSearchCV(
        Lasso(),
        param_grid=alpha_grid,
        scoring=scoring_rmsle,
        cv=4,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    val_predictions = search.predict(X_val)
    val_loss = rmsle(y_val, val_predictions)
    print('Stacking loss is:%f' % val_loss)
    print('Best parameter:%s' % search.best_estimator_)

    joblib.dump(search, '../result/model/stack_model_lasso.pkl')
    return search
Exemple #4
0
def stack_svr(X_train, y_train, X_val, y_val):
    """Grid-search an SVR stacking regressor and pickle the search object.

    ``C`` and ``gamma`` are each searched over the same ten candidate values
    with ``cv=4`` using the externally defined ``scoring_rmsle`` scorer. The
    fitted search is scored on ``(X_val, y_val)`` with ``rmsle`` and dumped
    to ``../result/model/stack_model_svr.pkl``.

    Returns:
        The fitted ``GridSearchCV`` object (not just its best estimator).
    """
    from sklearn.svm import SVR

    # Both hyper-parameters sweep the same candidate values.
    candidates = [0.005, 0.1, 0.3, 0.5, 0.8, 1, 5, 10, 50, 100]
    svr_grid = {'C': list(candidates), 'gamma': list(candidates)}

    search = GridSearchCV(
        SVR(),
        param_grid=svr_grid,
        scoring=scoring_rmsle,
        cv=4,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    val_loss = rmsle(y_val, search.predict(X_val))
    print('Stacking loss is:%f' % val_loss)
    print('Best parameter:%s' % search.best_estimator_)

    joblib.dump(search, '../result/model/stack_model_svr.pkl')
    return search
def apply_rf(X_train, y_train, X_valid, y_valid):
    """Tune a random-forest regressor and save its model and predictions.

    ``max_depth`` and ``n_estimators`` are grid-searched with ``cv=4`` using
    the externally defined ``scoring_rmsle`` scorer. The best estimator is
    evaluated on ``(X_valid, y_valid)`` with ``rmsle``, pickled to
    ``root_model + 'model_rf.pkl'``, and its validation predictions are
    written to ``root_val_pred + 'y_val_pred_rf.csv'``.

    Returns:
        Tuple ``(best_estimator, val_loss, y_val_pred)``.
    """
    from sklearn.ensemble import RandomForestRegressor

    forest_grid = {
        'max_depth': [2, 12],
        'n_estimators': [40, 45, 50, 55, 60],
    }
    search = GridSearchCV(
        RandomForestRegressor(),
        param_grid=forest_grid,
        scoring=scoring_rmsle,
        cv=4,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    # Only the refitted winner is used from here on.
    best_model = search.best_estimator_
    y_val_pred = best_model.predict(X_valid)
    val_loss = rmsle(y_valid, y_val_pred)
    print('RMSLE for validation: %f' % val_loss)
    print('Best parameter:%s' % search.best_params_)

    joblib.dump(best_model, root_model + 'model_rf.pkl')
    np.savetxt(root_val_pred + 'y_val_pred_rf.csv', y_val_pred,
               fmt='%f', delimiter=',')
    return best_model, val_loss, y_val_pred
def apply_knn(X_train, y_train, X_valid, y_valid):
    """Tune a k-nearest-neighbours regressor and save model and predictions.

    ``n_neighbors`` is grid-searched with ``cv=4`` using the externally
    defined ``scoring_rmsle`` scorer. The best estimator is evaluated on
    ``(X_valid, y_valid)`` with ``rmsle``, pickled to
    ``root_model + 'model_knn.pkl'``, and its validation predictions are
    written to ``root_val_pred + 'y_val_pred_knn.csv'``.

    Returns:
        Tuple ``(best_estimator, val_loss, y_val_pred)``.
    """
    from sklearn.neighbors import KNeighborsRegressor

    neighbor_grid = {'n_neighbors': [1, 3, 5, 8, 10, 15, 18, 20, 22]}
    search = GridSearchCV(
        KNeighborsRegressor(),
        param_grid=neighbor_grid,
        scoring=scoring_rmsle,
        cv=4,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    # Only the refitted winner is used from here on.
    best_model = search.best_estimator_
    y_val_pred = best_model.predict(X_valid)
    val_loss = rmsle(y_valid, y_val_pred)
    print('RMSLE for validation: %f' % val_loss)
    print('Best parameter:%s' % search.best_params_)

    joblib.dump(best_model, root_model + 'model_knn.pkl')
    np.savetxt(root_val_pred + 'y_val_pred_knn.csv', y_val_pred,
               fmt='%f', delimiter=',')
    return best_model, val_loss, y_val_pred
def apply_svr(X_train, y_train, X_valid, y_valid):
    """Tune an SVR regressor and save the model and its predictions.

    ``C`` and ``gamma`` are grid-searched with ``cv=4`` using the externally
    defined ``scoring_rmsle`` scorer. The best estimator is evaluated on
    ``(X_valid, y_valid)`` with ``rmsle``, pickled to
    ``root_model + 'model_svr.pkl'``, and its validation predictions are
    written to ``root_val_pred + 'y_val_pred_svr.csv'``.

    Returns:
        Tuple ``(best_estimator, val_loss, y_val_pred)``.
    """
    from sklearn.svm import SVR

    svr_grid = {
        'C': [13, 14, 15, 16, 18, 20, 25, 30, 50, 100],
        'gamma': [0.02, 0.03, 0.04, 0.05, 0.1, 0.5, 1, 10],
    }
    search = GridSearchCV(
        SVR(),
        param_grid=svr_grid,
        scoring=scoring_rmsle,
        cv=4,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    # Only the refitted winner is used from here on.
    best_model = search.best_estimator_
    y_val_pred = best_model.predict(X_valid)
    val_loss = rmsle(y_valid, y_val_pred)
    print('RMSLE for validation: %f' % val_loss)
    print('Best parameter:%s' % search.best_params_)

    joblib.dump(best_model, root_model + 'model_svr.pkl')
    np.savetxt(root_val_pred + 'y_val_pred_svr.csv', y_val_pred,
               fmt='%f', delimiter=',')
    return best_model, val_loss, y_val_pred
def apply_gbrt(X_train, y_train, X_valid, y_valid):
    """Tune a gradient-boosting regressor and save model and predictions.

    ``max_depth`` and ``n_estimators`` are grid-searched with ``cv=4`` using
    the externally defined ``scoring_rmsle`` scorer. The best estimator is
    evaluated on ``(X_valid, y_valid)`` with ``rmsle``, pickled to
    ``root_model + 'model_gbrt.pkl'``, and its validation predictions are
    written to ``root_val_pred + 'y_val_pred_gbrt.csv'``.

    Returns:
        Tuple ``(best_estimator, val_loss, y_val_pred)``.
    """
    from sklearn.ensemble import GradientBoostingRegressor

    boosting_grid = {
        'max_depth': [2, 3, 4, 5, 6, 7, 8, 10],
        'n_estimators': [40, 45, 50, 55],
    }
    search = GridSearchCV(
        GradientBoostingRegressor(),
        param_grid=boosting_grid,
        scoring=scoring_rmsle,
        cv=4,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    # Only the refitted winner is used from here on.
    best_model = search.best_estimator_
    y_val_pred = best_model.predict(X_valid)
    val_loss = rmsle(y_valid, y_val_pred)
    print('RMSLE for validation: %f' % val_loss)
    print('Best parameter:%s' % search.best_params_)

    joblib.dump(best_model, root_model + 'model_gbrt.pkl')
    np.savetxt(root_val_pred + 'y_val_pred_gbrt.csv', y_val_pred,
               fmt='%f', delimiter=',')
    return best_model, val_loss, y_val_pred
def apply_xgb(X_train, y_train, X_valid, y_valid):
    """Tune an XGBoost regressor and save the model and its predictions.

    ``n_estimators`` and ``max_depth`` are grid-searched with ``cv=4`` using
    the externally defined ``scoring_rmsle`` scorer. The best estimator is
    evaluated on ``(X_valid, y_valid)`` with ``rmsle``, pickled to
    ``root_model + 'model_xgb.pkl'``, and its validation predictions are
    written to ``root_val_pred + 'y_val_pred_xgb.csv'``.

    Returns:
        Tuple ``(best_estimator, val_loss, y_val_pred)``.
    """
    from xgboost import XGBRegressor

    xgb_grid = {
        'n_estimators': [16, 17, 18, 20, 25, 30, 35, 50, 100],
        'max_depth': [2, 3, 4, 5, 6, 7, 8, 10],
    }
    search = GridSearchCV(
        XGBRegressor(),
        param_grid=xgb_grid,
        scoring=scoring_rmsle,
        cv=4,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    # Only the refitted winner is used from here on.
    best_model = search.best_estimator_
    y_val_pred = best_model.predict(X_valid)
    val_loss = rmsle(y_valid, y_val_pred)
    print('RMSLE for validation: %f' % val_loss)
    print('Best parameter:%s' % search.best_params_)

    joblib.dump(best_model, root_model + 'model_xgb.pkl')
    np.savetxt(root_val_pred + 'y_val_pred_xgb.csv', y_val_pred,
               fmt='%f', delimiter=',')
    return best_model, val_loss, y_val_pred