Code Example #1
def test_gradient_boosting_estimator_with_smooth_quantile_loss():
    np.random.seed(0)
    m = 15000
    n = 10
    p = .8
    X = np.random.normal(size=(m,n))
    beta = np.random.normal(size=n)
    mu = np.dot(X, beta)
    y = np.random.lognormal(mu)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33333333333333)
    loss_function = SmoothQuantileLossFunction(1, p, .0001)
    q_loss = QuantileLossFunction(1, p)
    model = Booster(BaggingRegressor(Earth(max_degree=2, verbose=False,
                                           use_fast=True, max_terms=10)),
                    loss_function, n_estimators=150,
                    stopper=stop_after_n_iterations_without_percent_improvement_over_threshold(3, .01),
                    verbose=True)
    assert_raises(NotFittedError, lambda: model.predict(X_train))
    
    model.fit(X_train, y_train)
    
    prediction = model.predict(X_test)
    model2 = GradientBoostingRegressor(loss='quantile', alpha=p)
    model2.fit(X_train, y_train)
    prediction2 = model2.predict(X_test)
    assert_less(q_loss(y_test, prediction), q_loss(y_test, prediction2))
    assert_greater(r2_score(y_test, prediction), r2_score(y_test, prediction2))
    q = np.mean(y_test <= prediction)
    assert_less(np.abs(q-p), .05)
    assert_greater(model.score_, 0.)
    assert_approx_equal(model.score(X_train, y_train), model.score_)
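
The heart of this test is the coverage assertion near the end: with p = 0.8, roughly 80% of test targets should fall at or below the predicted quantile, which is why np.mean(y_test <= prediction) is compared against p. A minimal, self-contained sketch of that check using only scikit-learn, with GradientBoostingRegressor(loss='quantile') standing in for the custom Booster:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = rng.normal(size=(2000, 10))
y = np.exp(X.dot(rng.normal(size=10)) + rng.normal(size=2000))  # lognormal-style target, as in the test

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, random_state=0)

p = 0.8
model = GradientBoostingRegressor(loss='quantile', alpha=p).fit(X_tr, y_tr)
coverage = np.mean(y_te <= model.predict(X_te))
print(coverage)  # empirical coverage should land near p = 0.8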
Code Example #2
File: agents.py  Project: JosephMontoya-TRI/CAMD-bkp
    def get_hypotheses(self, candidate_data, seed_data=None):
        X_cand, X_seed, y_seed = self.update_data(candidate_data, seed_data)

        steps = [('scaler', StandardScaler()), ('GP', self.GP)]
        pipeline = Pipeline(steps)

        bag_reg = BaggingRegressor(base_estimator=pipeline,
                                   n_estimators=self.n_estimators,
                                   max_samples=self.max_samples,
                                   bootstrap=self.bootstrap,
                                   verbose=True,
                                   n_jobs=self.n_jobs)
        self.cv_score = np.mean(
            -1.0 * cross_val_score(pipeline,
                                   X_seed,
                                   y_seed,
                                   cv=KFold(3, shuffle=True),
                                   scoring='neg_mean_absolute_error'))
        bag_reg.fit(X_seed, y_seed)

        # TODO: make this a static method
        def _get_unc(bagging_regressor, X_test):
            stds = []
            pres = []
            for est in bagging_regressor.estimators_:
                _p, _s = est.predict(X_test, return_std=True)
                stds.append(_s)
                pres.append(_p)
            return np.mean(np.array(pres), axis=0), np.min(np.array(stds), axis=0)

        # GP makes predictions for Hf and uncertainty*alpha on candidate data
        preds, stds = _get_unc(bag_reg, X_cand)
        expected = preds - stds * self.alpha

        # Update candidate data dataframe with predictions
        self.update_candidate_stabilities(expected, sort=True, floor=-6.0)

        # Find the most stable ones up to n_query within hull_distance
        stability_filter = self.candidate_data[
            'pred_stability'] < self.hull_distance
        within_hull = self.candidate_data[stability_filter]

        self.indices_to_compute = within_hull.head(self.n_query).index.tolist()
        return self.indices_to_compute
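
The TODO above asks for _get_unc to become a static method; a hedged sketch of that refactor (that it lands on the enclosing agent class is an assumption):

@staticmethod
def _get_unc(bagging_regressor, X_test):
    # Mean prediction across the bagged GP pipelines, paired with the
    # minimum per-point predictive std (an optimistic uncertainty estimate).
    preds, stds = [], []
    for est in bagging_regressor.estimators_:
        _p, _s = est.predict(X_test, return_std=True)
        preds.append(_p)
        stds.append(_s)
    return np.mean(np.array(preds), axis=0), np.min(np.array(stds), axis=0)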
Code Example #3
def evalOne(parameters):
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(location, "location", data, all_features, "target")
        normalizer_X = StandardScaler()
        trainX = normalizer_X.fit_transform(trainX)
        testX = normalizer_X.transform(testX)
        # StandardScaler expects 2D input, so reshape the 1D targets before scaling.
        normalizer_Y = StandardScaler()
        trainY = normalizer_Y.fit_transform(np.array(trainY).reshape(-1, 1)).ravel()
        testY = normalizer_Y.transform(np.array(testY).reshape(-1, 1)).ravel()
        model = BaggingRegressor(base_estimator=SVR(kernel='rbf', C=parameters["C"], cache_size=5000),
                                 max_samples=parameters["max_samples"],
                                 n_estimators=parameters["n_estimators"],
                                 verbose=0, n_jobs=-1)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        prediction = normalizer_Y.inverse_transform(prediction.reshape(-1, 1)).ravel()
        testY = normalizer_Y.inverse_transform(testY.reshape(-1, 1)).ravel()
        all_obs.extend(testY)
        all_pred.extend(prediction)
        
    return rmseEval(all_obs, all_pred)[1]
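
A hypothetical call; the parameter values mirror the commented-out defaults in Code Example #4 and are illustrative only:

params = {"C": 1e4, "max_samples": 4000, "n_estimators": 10}
rmse = evalOne(params)
print("cross-location RMSE:", rmse)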
Code Example #4
def trainSVM(data, columns, targetColumn, parameters):
    
    modelColumns = []
    for column in columns:
        if column != targetColumn:
            modelColumns.append(column)
            
    modelData = []
    
    for i in range(0, len(data[targetColumn])):
        record = []
        for column in modelColumns:
            record.append(data[column][i])

        modelData.append(record)
    
    #model = BaggingRegressor(base_estimator=SVR(kernel='rbf', C=1e4,cache_size=5000), max_samples=4000,n_estimators=10, verbose=0, n_jobs=-1)
    model = BaggingRegressor(base_estimator=SVR(kernel='rbf', C=parameters["C"], cache_size=5000),
                             max_samples=parameters["max_samples"],
                             n_estimators=parameters["n_estimators"],
                             verbose=0, n_jobs=-1)
    
    model.fit(modelData, data[targetColumn])
    
    return SVMModel(model, modelColumns)
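
SVMModel is defined elsewhere in this project; a minimal sketch of such a wrapper, assuming it only needs to remember the fitted ensemble and the column order used at training time:

class SVMModel:
    def __init__(self, model, columns):
        self.model = model
        self.columns = columns

    def predict(self, data):
        # Rebuild feature rows in the original column order before predicting.
        n = len(data[self.columns[0]])
        rows = [[data[c][i] for c in self.columns] for i in range(n)]
        return self.model.predict(rows)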
Code Example #5
output.write("location,observation,prediction\n")

for location in locations:
    print(str(location))
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, all_features, "target")
    normalizer_X = StandardScaler()
    trainX = normalizer_X.fit_transform(trainX)
    testX = normalizer_X.transform(testX)
    # StandardScaler expects 2D input, so reshape the 1D targets before scaling.
    normalizer_Y = StandardScaler()
    trainY = normalizer_Y.fit_transform(np.array(trainY).reshape(-1, 1)).ravel()
    testY = normalizer_Y.transform(np.array(testY).reshape(-1, 1)).ravel()
    model = BaggingRegressor(base_estimator=SVR(kernel='rbf',
                                                C=40,
                                                cache_size=5000),
                             max_samples=4200,
                             n_estimators=10,
                             verbose=0,
                             n_jobs=-1)
    model.fit(trainX, trainY)
    prediction = model.predict(testX)
    prediction = normalizer_Y.inverse_transform(prediction.reshape(-1, 1)).ravel()
    testY = normalizer_Y.inverse_transform(testY.reshape(-1, 1)).ravel()

    for i in range(0, len(testY)):
        output.write(str(location))
        output.write(",")
        output.write(str(testY[i]))
        output.write(",")
        output.write(str(prediction[i]))
        output.write("\n")
Code Example #6
File: main.py  Project: sethrem/jpmml-sklearn
print(auto_X.dtype, auto_y.dtype)


def build_auto(regressor, name):
    regressor = regressor.fit(auto_X, auto_y)
    store_pkl(regressor, name + ".pkl")
    mpg = DataFrame(regressor.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")


build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=5),
           "DecisionTreeAuto")
build_auto(
    BaggingRegressor(DecisionTreeRegressor(random_state=13,
                                           min_samples_leaf=5),
                     random_state=13,
                     n_estimators=3,
                     max_features=0.5), "DecisionTreeEnsembleAuto")
build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5),
           "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state=13, init=None),
           "GradientBoostingAuto")
build_auto(LassoCV(random_state=13), "LassoAuto")
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(
    BaggingRegressor(LinearRegression(), random_state=13, max_features=0.5),
    "LinearRegressionEnsembleAuto")
build_auto(RandomForestRegressor(random_state=13, min_samples_leaf=5),
           "RandomForestAuto")
build_auto(RidgeCV(), "RidgeAuto")
Code Example #7
    (RidgeCV(), ['predict'], create_regression_problem_1()),
    (SGDRegressor(), ['predict'], create_regression_problem_1()),
    (Lasso(), ['predict'], create_regression_problem_1()),
    (Pipeline([('earth', Earth()), ('logistic', LogisticRegression())]),
     ['predict', 'predict_proba'], create_weird_classification_problem_1()),
    (FeatureUnion([('earth', Earth()), ('earth2', Earth(max_degree=2))],
                  transformer_weights={
                      'earth': 1,
                      'earth2': 2
                  }), ['transform'], create_weird_classification_problem_1()),
    (RandomForestRegressor(), ['predict'], create_regression_problem_1()),
    (CalibratedClassifierCV(LogisticRegression(),
                            'isotonic'), ['predict_proba'],
     create_weird_classification_problem_1()),
    (AdaBoostRegressor(), ['predict'], create_regression_problem_1()),
    (BaggingRegressor(), ['predict'], create_regression_problem_1()),
    (BaggingClassifier(), ['predict_proba'],
     create_weird_classification_problem_1()),
    (GradientBoostingRegressor(verbose=True), ['predict'],
     create_regression_problem_1(m=100000, n=200)),
    (XGBRegressor(), ['predict'], create_regression_problem_for_xgb_1())
]


# Create tests for numpy_flat language
def create_case_numpy_flat(estimator, methods, fit_data, predict_data,
                           export_predict_data):
    def test_case(self):
        model = clone(estimator)
        model.fit(**fit_data)
        for method in methods:
Code Example #8
from sklearn.linear_model.theil_sen import TheilSenRegressor
from sklearn.mixture.dpgmm import VBGMM
from sklearn.feature_selection.variance_threshold import VarianceThreshold

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


clf_dict = {'ARDRegression':ARDRegression(),
			'AdaBoostClassifier':AdaBoostClassifier(),
			'AdaBoostRegressor':AdaBoostRegressor(),
			'AdditiveChi2Sampler':AdditiveChi2Sampler(),
			'AffinityPropagation':AffinityPropagation(),
			'AgglomerativeClustering':AgglomerativeClustering(),
			'BaggingClassifier':BaggingClassifier(),
			'BaggingRegressor':BaggingRegressor(),
			'BayesianGaussianMixture':BayesianGaussianMixture(),
			'BayesianRidge':BayesianRidge(),
			'BernoulliNB':BernoulliNB(),
			'BernoulliRBM':BernoulliRBM(),
			'Binarizer':Binarizer(),
			'Birch':Birch(),
			'CCA':CCA(),
			'CalibratedClassifierCV':CalibratedClassifierCV(),
			'DBSCAN':DBSCAN(),
			'DPGMM':DPGMM(),
			'DecisionTreeClassifier':DecisionTreeClassifier(),
			'DecisionTreeRegressor':DecisionTreeRegressor(),
			'DictionaryLearning':DictionaryLearning(),
			'ElasticNet':ElasticNet(),
			'ElasticNetCV':ElasticNetCV(),
Code Example #9
model_catb = CatBoostRegressor(random_state=2020,
                               loss_function='MAPE',
                               task_type='GPU')

# Bagging
base_estimator = [model_lgbm, model_catb, model_rf]

bagging_params = {
    'base_estimator': base_estimator,
    'n_estimators': n_estimators,
    'max_samples': max_samples,
    'max_features': max_features
}

model_bagging = BaggingRegressor()


def random_search(model, params, X_train, y_train, X_val, y_val, i, name=''):
    print('-' * 100)
    start_time = datetime.datetime.now()
    print('Start Time : {}'.format(start_time))

    rnd_search = RandomizedSearchCV(model,
                                    param_distributions=params,
                                    n_iter=100,
                                    cv=2,
                                    scoring='neg_mean_absolute_error',
                                    verbose=2,
                                    n_jobs=2,
                                    random_state=2020)
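
The snippet is cut off before random_search is invoked; a hypothetical call, assuming the truncated body goes on to fit and return the search, and that the train/validation splits and the n_estimators/max_samples/max_features distributions are defined earlier:

search = random_search(model_bagging, bagging_params,
                       X_train, y_train, X_val, y_val,
                       i=0, name='bagging')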
Code Example #10
def validate(params):
    transf_type = params['transf_type']

    if transf_type == 'drop':
        transf = FunctionTransformer(drop_transform, validate=False)
    elif transf_type == 'dr+inp+sc+pca':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
            StandardScaler(),
            PCA(n_components=params['n_pca_components']),
        )
    elif transf_type == 'dr+inp':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
        )
    elif transf_type == 'dr+inp+sc':
        transf = make_pipeline(drop_transform, SimpleImputer(),
                               StandardScaler())
    elif transf_type == 'union':
        transf = create_union_transf(params)
    elif transf_type == 'poly_kbest':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
            StandardScaler(),
            PolynomialFeatures(degree=2, interaction_only=True),
            SelectKBest(f_regression, params['best_features']),
        )
    else:
        raise AttributeError(f'unknown transformer type: {transf_type}')

    est_type = params['est_type']

    if est_type == 'xgboost':
        est = create_xgb_est(params)
    elif est_type == 'gblinear':
        est = create_gblinear_est(params)
    elif est_type == 'exttree':
        est = ExtraTreesRegressor(n_estimators=params['n_estimators'],
                                  n_jobs=-1)
    elif est_type == 'gp':
        est = GaussianProcessRegressor()
    elif est_type == 'ridge':
        est = Ridge(alpha=params['alpha'])
    else:
        raise AttributeError(f'unknown estimator type: {est_type}')

    if params['bagging']:
        # Assign the wrapper back to est; the original constructed the
        # BaggingRegressor and then silently discarded it.
        est = BaggingRegressor(est,
                               n_estimators=params['n_bag_estimators'],
                               max_features=1.,
                               max_samples=1.)

    pl = make_pipeline(transf, est)

    if params['per_group_regr']:
        pl = PerGroupRegressor(estimator=pl,
                               split_condition=['os', 'cpuFreq', 'memSize_MB'],
                               n_jobs=1,
                               verbose=1)

    return cv_test(pl, n_folds=params['n_folds'])
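
A hypothetical parameter dictionary exercising one branch of each switch above; the keys follow the lookups in the function, and the values are illustrative only:

params = {
    'transf_type': 'dr+inp+sc',
    'est_type': 'ridge',
    'alpha': 1.0,
    'bagging': True,
    'n_bag_estimators': 10,
    'per_group_regr': False,
    'n_folds': 5,
}
score = validate(params)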
Code Example #11
def moudle_select(X, test_A, y, moudelselect, threshold=False, Rate=False):
    '''
    Fit one of several regression models over five random splits and blend
    their predictions on test_A.
    X : training data
    test_A : data to predict on
    y : target labels
    moudelselect : which model to use (see the list below)
    threshold : if True, keep a split's predictions only when its MSE <= 0.03
    Rate : unused (reserved)

    moudelselect values:
    1  XGBRegressor
    2  ensemble.RandomForestRegressor
    3  linear_model.Lasso
    4  LinearRegression
    5  linear_model.BayesianRidge
    6  DecisionTreeRegressor
    7  ensemble.RandomForestRegressor (1000 trees)
    8  ensemble.GradientBoostingRegressor
    9  ensemble.AdaBoostRegressor
    10 BaggingRegressor
    11 ExtraTreeRegressor
    12 SVR
    13 MLPRegressor
    other: MLPRegressor
    '''

    mse = []
    sum_mse = 0.0
    predict_A = pd.DataFrame(np.zeros((100, 10)))

    for index in range(5):
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        if (moudelselect == 1):
            # The original nested one XGBRegressor inside another via a bogus
            # `model=` keyword; a single constructor is intended. `eta` is
            # dropped because it is just an alias of learning_rate, and the
            # `min_child_weigh` typo is corrected.
            model = xgb.XGBRegressor(max_depth=17,
                                     min_child_weight=5,
                                     gamma=0.06,
                                     subsample=1,
                                     learning_rate=0.1,
                                     n_estimators=100,
                                     silent=0,
                                     n_jobs=-1,
                                     objective='reg:linear')

        elif (moudelselect == 2):
            model = ensemble.RandomForestRegressor(
                n_estimators=25,
                criterion='mse',
                max_depth=14,
                min_samples_split=0.1,
                min_samples_leaf=2,
                min_weight_fraction_leaf=0.0,
                max_features=0.95,
                max_leaf_nodes=None,
                min_impurity_split=1e-07,
                bootstrap=True,
                oob_score=False,
                n_jobs=-1,
                random_state=None,
                verbose=0,
                warm_start=False)
        elif (moudelselect == 3):
            model = linear_model.Lasso(alpha=0.1,
                                       max_iter=1000,
                                       normalize=False)

        elif (moudelselect == 4):
            model = LinearRegression(fit_intercept=False,
                                     n_jobs=1,
                                     normalize=False)

        elif (moudelselect == 5):
            model = linear_model.BayesianRidge(alpha_1=1e-06,
                                               alpha_2=1e-06,
                                               compute_score=False,
                                               copy_X=True,
                                               fit_intercept=True,
                                               lambda_1=1e-06,
                                               lambda_2=1e-06,
                                               n_iter=500,
                                               normalize=False,
                                               tol=10,
                                               verbose=False)

        elif (moudelselect == 6):
            model = DecisionTreeRegressor(criterion='mse',
                                          splitter='best',
                                          max_depth=3,
                                          min_samples_split=0.1,
                                          min_samples_leaf=0.1,
                                          min_weight_fraction_leaf=0.1,
                                          max_features=None,
                                          random_state=None,
                                          max_leaf_nodes=None,
                                          presort=False)

        elif (moudelselect == 7):
            model = ensemble.RandomForestRegressor(
                n_estimators=1000,
                criterion='mse',
                max_depth=14,
                min_samples_split=0.1,
                min_samples_leaf=2,
                min_weight_fraction_leaf=0.0,
                max_features='auto',
                max_leaf_nodes=None,
                min_impurity_split=1e-07,
                bootstrap=True,
                oob_score=False,
                n_jobs=-1,
                random_state=None,
                verbose=0,
                warm_start=False)
        elif (moudelselect == 8):
            model = ensemble.GradientBoostingRegressor(n_estimators=800,
                                                       learning_rate=0.1,
                                                       max_depth=4,
                                                       random_state=0,
                                                       loss='ls')

        elif (moudelselect == 9):
            model = ensemble.AdaBoostRegressor(base_estimator=None,
                                               n_estimators=120,
                                               learning_rate=1,
                                               loss='linear',
                                               random_state=None)

        elif (moudelselect == 10):
            model = BaggingRegressor(base_estimator=None,
                                     n_estimators=500,
                                     max_samples=1.0,
                                     max_features=1.0,
                                     bootstrap=True)
        elif (moudelselect == 11):
            model = ExtraTreeRegressor(criterion='mse',
                                       splitter='random',
                                       max_depth=3,
                                       min_samples_split=0.1,
                                       min_samples_leaf=1,
                                       min_weight_fraction_leaf=0.01,
                                       max_features='auto',
                                       random_state=None,
                                       max_leaf_nodes=None,
                                       min_impurity_split=1e-07)

        elif (moudelselect == 12):
            model = SVR(kernel='rbf',
                        degree=3,
                        gamma='auto',
                        coef0=0.1,
                        tol=0.001,
                        C=1,
                        epsilon=0.1,
                        shrinking=True,
                        cache_size=200,
                        verbose=False,
                        max_iter=-1)

        elif (moudelselect == 13):
            model = MLPRegressor(hidden_layer_sizes=(100, ),
                                 activation='relu',
                                 solver='adam',
                                 alpha=0.0001,
                                 batch_size='auto',
                                 learning_rate='constant',
                                 learning_rate_init=0.001,
                                 power_t=0.5,
                                 max_iter=200,
                                 shuffle=True,
                                 random_state=None,
                                 tol=0.0001,
                                 verbose=False,
                                 warm_start=False,
                                 momentum=0.9,
                                 nesterovs_momentum=True,
                                 early_stopping=False,
                                 validation_fraction=0.1,
                                 beta_1=0.9,
                                 beta_2=0.999,
                                 epsilon=1e-08)
        else:
            model = MLPRegressor(activation='relu',
                                 alpha=0.001,
                                 solver='lbfgs',
                                 max_iter=90,
                                 hidden_layer_sizes=(11, 11, 11),
                                 random_state=1)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print("index: ", index, mean_squared_error(y_test, y_pred))
        sum_mse += mean_squared_error(y_test, y_pred)
        #
        #
        if not threshold:
            y_predict = model.predict(test_A)
            predict_A.iloc[:, index] = y_predict  # .ix was removed in pandas 1.0
            mse.append(mean_squared_error(y_test, y_pred))
        else:
            if (mean_squared_error(y_test, y_pred) <= 0.03000):
                y_predict = model.predict(test_A)
                predict_A.iloc[:, index] = y_predict
                mse.append(mean_squared_error(y_test, y_pred))


#        if(Rate==False):
#            mse_rate = mse / np.sum(mse)
#            #predict_A = predict_A.ix[:,~(data==0).all()]
#            for index in range(len(mse_rate)):
#                y+=predict_A.ix[:,index]*mse_rate[index]
#
    y = 0.0
    mse = mse / np.sum(mse)
    mse = pd.Series(mse)
    mse_rate_asc = mse.sort_values(ascending=False)
    mse_rate_asc = mse_rate_asc.reset_index(drop=True)
    mse_rate_desc = mse.sort_values(ascending=True)
    indexs = list(mse_rate_desc.index)
    for index in range(len(mse)):
        # The largest normalized-error weight is paired with the lowest-error
        # model, so better models receive larger blending weights.
        y += mse_rate_asc.iloc[index] * predict_A.iloc[:, indexs[index]]

    print("y_predict_mean: ", y.mean())
    print("y_predict_var: ", y.var())
    y = pd.DataFrame(y)
    y.to_excel("H:/java/python/src/machinelearning/test/predict.xlsx",
               index=False)
    predict_A.to_excel(
        "H:/java/python/src/machinelearning/test/predict_testA.xlsx",
        index=False)
    print("Averge mse:", sum_mse / len(mse))