Example #1
def test_bootstrap_samples():
    # Test that bootstrapping samples generate non-perfect base estimators.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    base_estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=False,
                                random_state=rng).fit(X_train, y_train)

    assert_equal(base_estimator.score(X_train, y_train),
                 ensemble.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=True,
                                random_state=rng).fit(X_train, y_train)

    assert_greater(base_estimator.score(X_train, y_train),
                   ensemble.score(X_train, y_train))
Example #2
def test_bootstrap_samples():
    """Test that bootstraping samples generate non-perfect base estimators."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    base_estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=False,
                                random_state=rng).fit(X_train, y_train)

    assert_equal(base_estimator.score(X_train, y_train),
                 ensemble.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=True,
                                random_state=rng).fit(X_train, y_train)

    assert_greater(base_estimator.score(X_train, y_train),
                   ensemble.score(X_train, y_train))
Example #3
def bagging(x_train, y_train):
    model = BaggingRegressor(base_estimator=SVR(),
                             n_estimators=10,
                             random_state=0)
    model.fit(x_train, y_train)
    score = model.score(x_train, y_train)
    return score
Example #4
def test_oob_score_regression():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
                                                        diabetes.target,
                                                        random_state=rng)

    clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                           n_estimators=50,
                           bootstrap=True,
                           oob_score=True,
                           random_state=rng).fit(X_train, y_train)

    test_score = clf.score(X_test, y_test)

    assert abs(test_score - clf.oob_score_) < 0.1

    # Test with few estimators
    warn_msg = (
        "Some inputs do not have OOB scores. This probably means too few "
        "estimators were used to compute any reliable oob estimates.")
    with pytest.warns(UserWarning, match=warn_msg):
        regr = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                n_estimators=1,
                                bootstrap=True,
                                oob_score=True,
                                random_state=rng)
        regr.fit(X_train, y_train)
Example #5
class _BaggingRegressorImpl:
    def __init__(
        self,
        base_estimator=None,
        n_estimators=10,
        *,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        oob_score=False,
        warm_start=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
    ):
        estimator_impl = base_estimator

        self._hyperparams = {
            "base_estimator": estimator_impl,
            "n_estimators": n_estimators,
            "max_samples": max_samples,
            "max_features": max_features,
            "bootstrap": bootstrap,
            "bootstrap_features": bootstrap_features,
            "oob_score": oob_score,
            "warm_start": warm_start,
            "n_jobs": n_jobs,
            "random_state": random_state,
            "verbose": verbose,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y, sample_weight=None):
        if isinstance(X, pd.DataFrame):
            feature_transformer = FunctionTransformer(
                func=lambda X_prime: pd.DataFrame(X_prime, columns=X.columns),
                inverse_func=None,
                check_inverse=False,
            )
            self._hyperparams["base_estimator"] = (
                feature_transformer >> self._hyperparams["base_estimator"]
            )
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.fit(X, y, sample_weight)

        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def score(self, X, y, sample_weight=None):
        return self._wrapped_model.score(X, y, sample_weight)
Example #6
def test_oob_score_regression():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                           n_estimators=50,
                           bootstrap=True,
                           oob_score=True,
                           random_state=rng).fit(X_train, y_train)

    test_score = clf.score(X_test, y_test)

    assert_less(abs(test_score - clf.oob_score_), 0.1)

    # Test with few estimators
    assert_warns(UserWarning,
                 BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                  n_estimators=1,
                                  bootstrap=True,
                                  oob_score=True,
                                  random_state=rng).fit,
                 X_train,
                 y_train)
Example #7
    def bagging_regressor(self, data):
        train, validacion = data
        x_tr, y_tr = train
        x_val, y_val = validacion
        #print("El set de train tiene {} filas y {} columnas".format(x_tr.shape[0],x_tr.shape[1]))
        #print("El set de validacion tiene {} filas y {} columnas".format(x_val.shape[0],x_val.shape[1]))

        print('Start training BaggingRegressor...')
        start_time = self.timer()

        bg = BaggingRegressor(oob_score=True, verbose=1)
        bg.fit(x_tr, y_tr)
        print("The R2 is: {}".format(bg.score(x_tr, y_tr)))
        #		print("The alpha choose by CV is:{}".format(krrl.alpha_))
        self.timer(start_time)

        print("Making prediction on validation data")
        y_val = np.expm1(y_val)
        y_val_pred = np.expm1(bg.predict(x_val))
        mae = mean_absolute_error(y_val, y_val_pred)
        print("El mean absolute error de es {}".format(mae))

        print('Saving model into a pickle')
        try:
            os.mkdir('pickles')
        except FileExistsError:
            pass

        with open('pickles/bg.pkl', 'wb') as f:
            pickle.dump(bg, f)

        print('Making prediction and saving into a csv')
        y_test = bg.predict(self.x_test)

        return y_test
Example #8
def models(xtrain, ytrain):

    #Logistic Regression; used when Y has only 2 values
    from sklearn.linear_model import LogisticRegression
    lin = LogisticRegression(random_state=0)
    print(sum(cross_val_score(lin, xtrain, ytrain, cv=5)) / 5)
    lin.fit(xtrain, ytrain)

    #Decision Tree;
    from sklearn.tree import DecisionTreeClassifier
    tree = DecisionTreeClassifier(criterion="entropy", random_state=0)
    print(sum(cross_val_score(tree, xtrain, ytrain, cv=5)) / 5)
    tree.fit(xtrain, ytrain)

    #random forest classifier
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators=10,
                                    criterion='entropy',
                                    random_state=0)
    print(sum(cross_val_score(forest, xtrain, ytrain, cv=5)) / 5)
    forest.fit(xtrain, ytrain)

    from sklearn.ensemble import BaggingRegressor
    bag = BaggingRegressor()
    print(sum(cross_val_score(bag, xtrain, ytrain, cv=5)) / 5)
    bag.fit(xtrain, ytrain)

    #Accuracy
    print('Logistic Regression accuracy:', lin.score(xtrain, ytrain))
    print('Decision Tree Classifier accuracy:', tree.score(xtrain, ytrain))
    print('Random Forest Classifier accuracy:', forest.score(xtrain, ytrain))
    print('Bagging Regressor score (R^2):', bag.score(xtrain, ytrain))
    return lin, tree, forest, bag
Example #9
def KNeighborsBagging(neigh):
    kn1 = neighbors.KNeighborsRegressor(n_neighbors=neigh, weights='uniform')
    kn2 = neighbors.KNeighborsRegressor(n_neighbors=neigh, weights='distance')
    bgg = BaggingRegressor(kn1,
                           n_estimators=10,
                           max_samples=0.7,
                           max_features=0.9,
                           verbose=0)  #, max_features=0.5

    bgg.fit(X_train, y_train)

    print(bgg.score(X_train, y_train))

    y_pred = bgg.predict(X_test)

    # Generate ROC curve values: fpr, tpr, thresholds
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)

    # Plot ROC curve
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()
Example #10
def test_oob_score_regression():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                           n_estimators=50,
                           bootstrap=True,
                           oob_score=True,
                           random_state=rng).fit(X_train, y_train)

    test_score = clf.score(X_test, y_test)

    assert_less(abs(test_score - clf.oob_score_), 0.1)

    # Test with few estimators
    assert_warns(
        UserWarning,
        BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                         n_estimators=1,
                         bootstrap=True,
                         oob_score=True,
                         random_state=rng).fit, X_train, y_train)
Example #11
def run():
    print("Bagged Decision Tree Regression started...")

    #Preparing Training data
    dir_path = ""
    train_file_path = dir_path + "train.csv"
    train_file = read_csv(train_file_path, skiprows=1, header=None)

    train_file = train_file.drop(train_file.columns[0], axis=1)
    train_file = train_file.values

    #Combining previous 5 time step data into one row
    train_X_temp = train_file[5:50000, :-1]
    train_Y = train_file[6:50001, -1]
    train_X = np.zeros((train_X_temp.shape[0], 8 * 5))
    for i in range(train_X_temp.shape[0]):
        for j in range(5):
            for k in range(8):
                train_X[i][j * 8 + k] = train_X_temp[i - j][k]

    #Preparing testing data
    test_file_name = dir_path + "test2.csv"
    test_file = read_csv(test_file_name, skiprows=1, header=None)
    test_file = test_file.values
    test_X = np.array(test_file[:, :-1])
    test_y = test_file[:, -1]

    # print "\nSimple Decison Tree:"
    # dec_tree = DecisionTreeRegressor(max_depth = 5)
    # dec_tree.fit(train_X, train_Y)
    # prediction = dec_tree.predict(test_X)
    # print "Predictions: \n",prediction
    # print "Score: ",dec_tree.score(test_X,test_y)

    # print "\nADABoost Decision Tree:"
    # ada_boost = AdaBoostRegressor(DecisionTreeRegressor(max_depth = 5),n_estimators = 10)
    # ada_boost.fit(train_X, train_Y)
    # prediction = ada_boost.predict(test_X)
    # print "Predictions: \n",prediction
    # print "Score: ",ada_boost.score(test_X,test_y)

    #Model training and prediction
    print "\nBagged Decision Tree:"
    start = time.time()
    bag_reg = BaggingRegressor(DecisionTreeRegressor(),
                               n_jobs=2,
                               random_state=0).fit(train_X, train_Y)

    #bag_reg.set_params(n_jobs=1)
    #Calculating and printing Results
    prediction = bag_reg.predict(test_X)
    mse = np.mean((prediction - test_y)**2)
    print "MSE: ", mse
    # print "Predictions: \n",prediction
    print "Score: ", bag_reg.score(test_X, test_y)

    print "Time: ", (time.time() - start)
    print "Decision Tree Regressor done...\n"
Example #12
def Bagging(x_train, y_train, x_test, y_test):
    estimator = BaggingRegressor(n_estimators=1000, random_state=0, n_jobs=-1)
    estimator.fit(x_train, y_train)
    t = estimator.score(x_train, y_train)
    y_pred = estimator.predict(x_test)
    mse_score = mse(y_test, y_pred)
    print("mse_score: " + str(mse_score))
    r2_score = r2(y_test, y_pred)
    print("r2_score: " + str(r2_score))
    print(t)
Example #13
def makeBaggingBoostDefaultDecisionTreePrediction(n_est):
    global y_t_pred, result
    print("Prediction #estimators = %s and Decision Trees" % (n_est))
    prefix = "%s_BaggingBoost_n_est%s_DefaultTree" % (name, n_est)
    model = BaggingRegressor(n_estimators=n_est)
    x1 = x[:, :]  # use all data
    x_t1 = x_t[:, :]  # use all data
    y_t_pred = model.fit(x1, y).predict(x_t1)
    r = model.score(x1, y)
    print("score r = %s" % r)
    return prefix, model
Example #14
def test_bootstrap_samples():
    # Test that bootstrapping samples generate non-perfect base estimators.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
                                                        diabetes.target,
                                                        random_state=rng)

    base_estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = BaggingRegressor(
        base_estimator=DecisionTreeRegressor(),
        max_samples=1.0,
        bootstrap=False,
        random_state=rng,
    ).fit(X_train, y_train)

    assert base_estimator.score(X_train,
                                y_train) == ensemble.score(X_train, y_train)

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BaggingRegressor(
        base_estimator=DecisionTreeRegressor(),
        max_samples=1.0,
        bootstrap=True,
        random_state=rng,
    ).fit(X_train, y_train)

    assert base_estimator.score(X_train, y_train) > ensemble.score(
        X_train, y_train)

    # check that each sampling corresponds to a complete bootstrap resample.
    # the size of each bootstrap is the same as the input data, but the
    # samples should differ (checked using the hash of the data).
    ensemble = BaggingRegressor(base_estimator=DummySizeEstimator(),
                                bootstrap=True).fit(X_train, y_train)
    training_hash = []
    for estimator in ensemble.estimators_:
        assert estimator.training_size_ == X_train.shape[0]
        training_hash.append(estimator.training_hash_)
    assert len(set(training_hash)) == len(training_hash)
Example #15
def KNeighborsBaggingResul(neigh):
    kn1 = neighbors.KNeighborsRegressor(n_neighbors=neigh, weights='uniform')
    kn2 = neighbors.KNeighborsRegressor(n_neighbors=neigh, weights='distance')
    bgg = BaggingRegressor(kn1,
                           n_estimators=10,
                           max_samples=0.7,
                           max_features=0.9,
                           verbose=0)  #, max_features=0.5

    bgg.fit(X_train, y_train)

    resul.append(bgg.score(X_train, y_train))
Example #16
def makeBaggingBoostGaussianProcessPrediction(n_est, maxfs):
    global y_t_pred, result
    print("Prediction #estimators = %s and Gaussian Process" % (n_est))
    prefix = "%s_BaggingBoost_max_features%s_GP" % (name, maxfs)
    #GaussianProcessRegressor(kernel=RationalQuadratic(),n_restarts_optimizer=9)
    model = BaggingRegressor(n_estimators=n_est)
    #,max_features=maxfs,n_estimators=n_est)
    x1 = x[:, :]  # use all data
    x_t1 = x_t[:, :]  # use all data
    y_t_pred = model.fit(x1, y).predict(x_t1)
    r = model.score(x1, y)
    print("score r = %s" % r)
    return prefix, model
Example #17
def test_bootstrap_samples():
    # Test that bootstrapping samples generate non-perfect base estimators.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    base_estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=False,
                                random_state=rng).fit(X_train, y_train)

    assert_equal(base_estimator.score(X_train, y_train),
                 ensemble.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=True,
                                random_state=rng).fit(X_train, y_train)

    assert_greater(base_estimator.score(X_train, y_train),
                   ensemble.score(X_train, y_train))

    # check that each sampling corresponds to a complete bootstrap resample.
    # the size of each bootstrap is the same as the input data, but the
    # samples should differ (checked using the hash of the data).
    ensemble = BaggingRegressor(base_estimator=DummySizeEstimator(),
                                bootstrap=True).fit(X_train, y_train)
    training_hash = []
    for estimator in ensemble.estimators_:
        assert estimator.training_size_ == X_train.shape[0]
        training_hash.append(estimator.training_hash_)
    assert len(set(training_hash)) == len(training_hash)
Example #18
def run():
	print("Decision Tree Regression started...")

	#Preparing Training data
	dir_path = ""
	train_file_path = dir_path + "train.csv"
	train_file = read_csv(train_file_path,skiprows=1,header=None)

	train_file = train_file.drop(train_file.columns[0],axis=1)
	train_file = train_file.values

	train_X_temp = train_file[5:50000,:-1]
	train_Y = train_file[6:50001,-1]

	#Combining previous 5 time step data into one row
	train_X = np.zeros((train_X_temp.shape[0],8*5))
	for i in range(train_X_temp.shape[0]):
		for j in range(5):
			for k in range(8):
				train_X[i][j*8+k] = train_X_temp[i-j][k]

	#Preparing testing data
	test_file_name = dir_path + "test2.csv"
	test_file = read_csv(test_file_name,skiprows=1,header=None)
	test_file = test_file.values
	test_X = np.array(test_file[:,:-1])
	test_y = test_file[:,-1]

	#Model training and prediction for different no of trees
	estimators = np.arange(10, 100, 10)
	print "\nBagged Decision Tree:"
	bag_reg = BaggingRegressor(DecisionTreeRegressor(),n_jobs=2,random_state=0).fit(train_X, train_Y)
	scores = []
	prediction = []
	for n in estimators:
	    bag_reg.set_params(n_estimators=n)
	    bag_reg.fit(train_X, train_Y)
	    score = bag_reg.score(test_X, test_y)
	    print(score)
	    scores.append(score)
	    #prediction.append(bag_reg.predict(test_X))
	
	#plotting the effect of increasing no of trees on accuracy score
	plt.title("Effect of n_estimators")
	plt.xlabel("n_estimator")
	plt.ylabel("score")
	plt.plot(estimators, scores)
	plt.show()
Example #19
def Bagging(Xtrain, Ytrain, Xtest, Ytest):
    """
	Apply the extra trees regressor
	"""
    from sklearn.ensemble import BaggingRegressor
    print('\nBagging regressor:')

    clf = BaggingRegressor(n_estimators=100, n_jobs=-1).fit(Xtrain, Ytrain)
    print('Training score (R^2): {0}'.format(clf.score(Xtrain, Ytrain)))

    #find the training error
    prediction = clf.predict(Xtrain)
    Etrain = error(prediction, Ytrain)
    print('Training error: {0}'.format(Etrain))

    #find the test error
    prediction = clf.predict(Xtest)
    Etest = error(prediction, Ytest)
    print('Test error: {0}'.format(Etest))
Example #20
def run_tree_models(X, y):
    '''
    Get an overview of the performance of different tree models.
    Tree models: decision tree, AdaBoost, bagged tree
    INPUT: dataframe with features (X) and target variable (y)
    OUTPUT: scores of each tree model
    '''

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    
    dt = DecisionTreeRegressor()
    dt.fit(X_train, y_train)
    print('Decision Tree Score: ' + str(dt.score(X_test, y_test)))

    ada = AdaBoostRegressor(LinearRegression())
    ada.fit(X_train, y_train)
    print('AdaBoost Regressor Score: ' + str(ada.score(X_test, y_test)))

    # Train and Score Bagged Tree Regressor (ensemble learner)
    bagged_tree = BaggingRegressor(DecisionTreeRegressor())
    bagged_tree.fit(X_train, y_train)
    print('Bagged Tree Score: ' + str(bagged_tree.score(X_test, y_test)))
Example #21
def bag_svr(X_train=pd.DataFrame(),
            yinzi=[],
            y_train=pd.DataFrame(),
            n_iter=10,
            n_jobs=2):
    from sklearn.svm import SVR
    from sklearn.ensemble import BaggingRegressor

    bag_rg = BaggingRegressor(base_estimator=SVR(),
                              n_estimators=n_iter,
                              max_samples=0.5,
                              max_features=0.5,
                              bootstrap=True,
                              bootstrap_features=True,
                              random_state=0,
                              n_jobs=n_jobs)
    bag_rg = bag_rg.fit(X_train[yinzi], y_train)
    score0 = bag_rg.score(X_train[yinzi], y_train)
    print(f"bag_rg,得分:{score0}")
    pre = bag_rg.predict(X_train[yinzi])
    X_train = pd.DataFrame(X_train)
    X_train.loc[:, '预测值'] = pd.Series(pre, index=X_train.index)
    return bag_rg, X_train, score0
Example #22
def bagging(X, y, k_cv):
    kfold = KFold(n_splits=k_cv, shuffle=True, random_state=0)
    regr = BaggingRegressor(base_estimator=BayesianRidge(n_iter=1000),
                            n_estimators=20,
                            random_state=0,
                            max_samples=1.0,
                            max_features=0.7,
                            n_jobs=15)
    # regr = BaggingRegressor(base_estimator=SVR(C=40,gamma=0.01),
    #                         n_estimators=100, random_state=0,
    #                         max_samples=0.8,max_features=0.8,n_jobs=15)
    valid_split = kfold.split(y)
    for i in range(k_cv):
        split_index = next(valid_split)
        test_index = split_index[1]
        y_test = y[test_index]
        trainval_index = split_index[0]
        X_trainval = X[trainval_index, :]
        X_test = X[test_index, :]
        y_trainval = y[trainval_index]
        regr.fit(X_trainval, y_trainval)
        print((regr.score(X_trainval, y_trainval))**0.5)
        test_pre = regr.predict(X_test)
        print("accuracy: ", (r_2(y_test, test_pre))**0.5)
        print('******************************************')
        
Example #23
        if name=='Boston': # Regression problem
        
            rfr = RandomForestRegressor(**params)
            rfr.fit(X, y)
            scores_rfr = cross_val_score(rfr, X, y ,cv=5)

            br = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=max_depth), n_estimators=n_estimators)
            br.fit(X, y)
            scores_br = cross_val_score(br, X, y, cv=5)
            
            boston[i,1] = rfr.score(X, y)
            boston[i,2] = np.mean(scores_rfr)
            boston[i,3] = np.std(scores_rfr)
            boston[i,4] = br.score(X, y)
            boston[i,5] = np.mean(scores_br)
            boston[i,6] = np.std(scores_br)

            print('Score RandomForestRegressor = %s' % (boston[i,1]))
            print('Cross Val : mean = %s' % (boston[i,2]))
            print('Cross Val : std = %s' % (boston[i,3]))
            print('Score BaggingRegressor = %s' % (boston[i,4]))
            print('Cross Val : mean = %s' % (boston[i,5]))
            print('Cross Val : std = %s' % (boston[i,6]))
            
        if name=='Diabetes': # Regression problem
        
            rfr = RandomForestRegressor(**params)
            rfr.fit(X, y)
            scores_rfr = cross_val_score(rfr, X, y ,cv=5)
Example #24
                           encoding="GBK")
    data_cat_df = dataXFCA[[
        'area', 'province', 'city', 'year', 'month', 'day', 'industry'
    ]].astype(str)
    y_data = dataXFCA['fcA']
    data_num_df = dataXFCA[['gcA']]
    train, y_data = preprocess(data_cat_df, data_num_df, y_data)
    trainXF, testXF, trainyF, testyF = train_test_split(train,
                                                        y_data,
                                                        test_size=0.1,
                                                        random_state=1)

    #BaggingRegression
    ridge = Ridge(15)
    clf = BaggingRegressor(n_estimators=15, base_estimator=ridge)
    clf.fit(trainXF, trainyF)
    y_FCA = clf.predict(testXF)
    # print y_FCA
    #deal with scaling
    FCA_pred = scaling(y_FCA)
    print(FCA_pred)
    scores = clf.score(testXF, testyF)
    scores_c = cross_val_score(clf_Ada, train_Salary, train_Salary_y)
    scores_cv = np.mean(scores_c)
    print('Bagging regression:', scores)
    print('BaggingRegression_CV:', scores_cv)
    print('finished with the FCA (tradable share capital prediction)')

    y = merge(sal_pred, FCA_pred)
    print('Industry benchmark:', y)
    print('ok')
Example #25
    print('******************************************')
    print(name)
    print('******************************************')

    if name=='Boston' or name=='Diabetes': # Regression problem

        rfr = RandomForestRegressor(**params)
        rfr.fit(X, y)
        print('Score RandomForestRegressor = %s' % (rfr.score(X, y)))
        scores_rfr = cross_val_score(rfr, X, y, cv=5)
        print('Cross Val Score RandomForestRegressor = %s' % (np.mean(scores_rfr)))

        br = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=max_depth), n_estimators=n_estimators)
        br.fit(X, y)
        print('Score BaggingRegressor = %s' % (br.score(X, y)))
        scores_br = cross_val_score(br, X, y, cv=5)
        print('Cross Val Scores of BR = %s' % (np.mean(scores_br)))

    if name=='Iris' or name=='Digits': # Classification problem

        rfc = RandomForestClassifier(**params)
        rfc.fit(X, y)
        print('Score RandomForestClassifier = %s' % (rfc.score(X, y)))
        scores_rfc = cross_val_score(rfc, X, y, cv=5)
        print('Cross Val Scores of RandomForestClassifier = %s' % (np.mean(scores_rfc)))

        bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=max_depth), n_estimators=n_estimators)
        bc.fit(X, y)
        print('Score BaggingClassifier = %s' % (bc.score(X, y)))
        scores_bc = cross_val_score(bc, X, y, cv=5)
Example #26
# In[197]:


from sklearn.ensemble import BaggingRegressor

bgcl = BaggingRegressor(n_estimators=10,random_state=1)

bgcl = bgcl.fit(X_train, y_train)


# In[198]:



y_predict = bgcl.predict(X_test)
acc_BG_train=bgcl.score(X_train , y_train)
print("Bagging - Train Accuracy:",acc_BG_train)
acc_BG = bgcl.score(X_test , y_test)
print("Bagging - Test Accuracy:",acc_BG)

results = cross_val_score(bgcl, X, y, cv=kfold, scoring='r2')
print(results)
kf_res_mean=results.mean()*100.0
kf_res_std=results.std()*100.0


# In[200]:


#Store the accuracy results for each model in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Model':['Bagging'],'Training_Score': acc_BG_train, 
Example #27
def bagging(df1, features, pred_var, df2):
    lr = BaggingRegressor()
    lr.fit(df1[features], df1[pred_var])
    print('BaggingRegressor Score: ', lr.score(df2[features], df2[pred_var]))
Example #28
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn import preprocessing

data = pd.read_csv("clean_data.csv",
                   sep=",",
                   index_col=None,
                   prefix=None,
                   skip_blank_lines=True,
                   header=0)

X = data.loc[:, [
    "Quartier", "Commune", "Etage", "Superficie", "Piece", "Electricite",
    "Gaz", "Eau", "Acte notarie", "Jardin", "Livret foncier", "Meuble",
    "Garage"
]].values
Y = data.loc[:, "Prix"].values

X = pd.DataFrame(X)

le = preprocessing.LabelEncoder()
X = X.apply(le.fit_transform)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

regressor = BaggingRegressor(DecisionTreeRegressor(max_depth=4))
regressor.fit(X_train, Y_train)
score = regressor.score(X_test, Y_test)
print(score)
Example #29
import numpy as np
import pandas as pd
from sklearn import preprocessing,cross_validation
from sklearn.ensemble import BaggingRegressor
import pickle
temp=0

# ID,ATM Name,Transaction Date,No Of Withdrawals,No Of CUB Card Withdrawals,No Of Other Card Withdrawals,Total amount Withdrawn,
# Amount withdrawn CUB Card,Amount withdrawn Other Card,averageWithdrawals,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,
# WorkingDay,H,N,C,M,NH,HWH,HHW,WWH,WHH,HWW,WWW,WHW,HHH,Rounded Amount Withdrawn,class,AvgAmountPerWithdrawal

df=pd.read_csv('ClassificationData.csv')
df=df[df['ATM Name']=='Big Street ATM']
df.drop(['ID','ATM Name','Transaction Date','No Of Withdrawals','No Of CUB Card Withdrawals','No Of Other Card Withdrawals',
		  'class','Amount withdrawn CUB Card','Amount withdrawn Other Card','Rounded Amount Withdrawn'],1,inplace=True)
X=np.array(df.drop('Total amount Withdrawn',1))
X=preprocessing.scale(X)
y=np.array(df['Total amount Withdrawn'])

X_train,X_test,y_train,y_test=cross_validation.train_test_split(X,y,test_size=0.2)
clf=BaggingRegressor(n_estimators=200)
clf.fit(X_train,y_train)
accuracy=clf.score(X_test,y_test)
print('Accuracy: ',accuracy)
# with open('gradientBoosting.pickle','wb') as f:
# 	pickle.dump(clf,f)

# pickle_in=open('gradientBoosting.pickle','rb')
# clf=pickle.load(pickle_in)

# temp+=accuracy
# print("Average Accuracy is: ",temp)
Example #30
            tmpSCR = adaBoostC.score(testX, yTest)
        else:
            adaBoostR.fit(trainX, yTrain)
            tmpSCR = adaBoostR.score(testX, yTest)
        scores['adaBoost'][label].append(tmpSCR)
        tTOT = time.time() - t0
        times['adaBoost'][label].append(tTOT)

        t0 = time.time()
        print("start bagging withOUT out-of-bag")
        if cnt < 2:
            bagCoobN.fit(trainX, yTrain)
            tmpSCR = bagCoobN.score(testX, yTest)
        else:
            bagRoobN.fit(trainX, yTrain)
            tmpSCR = bagRoobN.score(testX, yTest)
        scores['bagging (NO out of bag)'][label].append(tmpSCR)
        tTOT = time.time() - t0
        times['bagging (NO out of bag)'][label].append(tTOT)

        t0 = time.time()
        print("start bagging WITH out-of-bag")
        if cnt < 2:
            bagCoobY.fit(trainX, yTrain)
            tmpSCR = bagCoobY.score(testX, yTest)
        else:
            bagRoobY.fit(trainX, yTrain)
            tmpSCR = bagRoobY.score(testX, yTest)
        scores['bagging (YES out of bag)'][label].append(tmpSCR)
        tTOT = time.time() - t0
        times['bagging (YES out of bag)'][label].append(tTOT)
Example #31
reg8.fit(X_train, y_train)
reg1.fit(X_train, y_train)
reg2.fit(X_train, y_train)
reg3.fit(X_train, y_train)
ereg.fit(X_train, y_train)
reg4.fit(X_train, y_train)
reg5.fit(X_train, y_train)
reg6.fit(X_train, y_train)
# reg7.fit(X_train, y_train)
print("GradientBoostingRegressor:", reg1.score(X_test, y_test))
print("RandomForestRegressor:", reg2.score(X_test, y_test))
print("LinearRegression:", reg3.score(X_test, y_test))
print("VotingRegressor:", ereg.score(X_test, y_test))
print("AdaBoostRegressor:", reg4.score(X_test, y_test))
print("BaggingRegressor:", reg5.score(X_test, y_test))
print("ExtraTreesRegressor:", reg6.score(X_test, y_test))
# print("StackingRegressor:", reg7.score(X_test, y_test))
print("XGBRegressor:", reg8.score(X_test, y_test))

XGBpredictions = reg8.predict(X_test)
MAE = mean_absolute_error(y_test, XGBpredictions)
print('XGBoost validation MAE = ', MAE)
xx = []
# try:
#     file = open('regression.csv', 'w', newline='')
#     file_w = csv.writer(file)
# except Exception:
#     print('regression.csv open failed')
#     exit()
# names = ['test', 'prediction']
Example #32
def timeseries(company_name):
	df = pd.read_csv('data_files/WIKI-'+company_name+'.csv')
	print(len(df))
	df['Date'] = pd.to_datetime(df['Date'])
	df.set_index('Date', inplace = True)

	df = df[['Adj. Open',  'Adj. High',  'Adj. Low', 'Adj. Volume', 'Adj. Close']]
	df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Low'] * 100.0
	df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0
	
	df_timeseries_open = df[df.columns[0]]
	df_timeseries_high = df[df.columns[1]]
	df_timeseries_low= df[df.columns[2]]
	df_timeseries_vol = df[df.columns[3]]
	df_timeseries_close = df[df.columns[4]]
	df_timeseries_HL_PCT = df[df.columns[5]]
	df_timeseries_PCT_change = df[df.columns[6]]
	
	x1,train_size = timer(df_timeseries_open)
	print("done 1 ",train_size," ",len(x1))
	x2,train_size = timer(df_timeseries_high)
	print("done 2 ",train_size)
	x3,train_size = timer(df_timeseries_low)
	print("done 3 ",train_size)
	x4,train_size = timer(df_timeseries_vol)
	print("done 4 ",train_size)
	x6,train_size = timer(df_timeseries_HL_PCT)
	print("done 6 ",train_size)
	x7,train_size = timer(df_timeseries_PCT_change)
	print("done 7 ",train_size)
	"""
	np.savetxt('open.txt', x1, fmt='%d')
	np.savetxt('high.txt', x2, fmt='%d')
	np.savetxt('low.txt', x3, fmt='%d')
	np.savetxt('volume.txt', x4, fmt='%d')
	np.savetxt('Close.txt', x5, fmt='%d')
	np.savetxt('HL_PCT.txt', x6, fmt='%d')
	np.savetxt('PCT_change.txt', x7, fmt='%d')
	"""
	x1 = np.loadtxt('open.txt', dtype=int)
	x2 = np.loadtxt('high.txt', dtype=int)
	x3 = np.loadtxt('low.txt', dtype=int)
	x4 = np.loadtxt('volume.txt', dtype=int)
	x6 = np.loadtxt('HL_PCT.txt', dtype=int)
	x7 = np.loadtxt('PCT_change.txt', dtype=int)
	
	dfform = {'Adj. Open': df['Adj. Open'], 
        'Adj. High':df['Adj. High'], 
        'Adj. Low': df['Adj. Low'],
        'Adj. Volume': df['Adj. Volume'],
        'Adj. Close':df['Adj. Close'],
        'HL_PCT':df['HL_PCT'],
        'PCT_change':df['PCT_change']
    }
	
	df1 = pd.DataFrame(dfform) 
	
	data = {'Adj. Open': x1, 
        'Adj. High': x2, 
        'Adj. Low': x3,
        'Adj. Volume': x4,
        'HL_PCT': x6,
        'PCT_change': x7
    } 
    
	df_test = pd.DataFrame(data)

	y = np.array(df1['Adj. Close'])
	y_train = y[0:train_size]
	y_test = y[train_size:]

	x_test = np.array(df_test)
	del df1['Adj. Close']
	x_train = np.array(df1[0:train_size])

	from sklearn.linear_model import Ridge 
	clf1=Ridge(alpha=1.0)
	print("fitting ridge")
	clf1.fit(x_train, y_train)
	predicted=clf1.predict(x_test)
	print("score")
	confidence1 = clf1.score(x_test, y_test)
	print("Ridge : %.3f%%" % (confidence1*100.0))
	print(" E   N   D")

	clf = LinearRegression()
	print("fitting LR")
	clf.fit(x_train, y_train)
	print("score")
	confidence2 = clf.score(x_test, y_test)
	print("LinearRegressor : %.3f%%" % (confidence2*100.0))
	print(" E   N   D")

	from sklearn.ensemble import BaggingRegressor
	clfy=BaggingRegressor(base_estimator=None,n_estimators=10)
	print("fitting bagging")
	clfy.fit(x_train, y_train)
	predictedy=clfy.predict(x_test)
	print("score")
	confidencey = clfy.score(x_test, y_test)
	print("BAGGING : %.3f%%" % (confidencey*100.0))

	from sklearn.ensemble import GradientBoostingRegressor
	clfz=GradientBoostingRegressor()
	print("GradientBoostingRegressor")
	clfz.fit(x_train, y_train)
	predictedz=clfz.predict(x_test)
	print("score")
	confidencez = clfz.score(x_test, y_test)
	print("BOOSTING : %.3f%%" % (confidencez*100.0))

	import matplotlib.pyplot as plt 
	plt.plot(predictedz,label='predicted')
	plt.plot(y_test,label='Actual')
	plt.legend()
	plt.xlabel('Time')
	plt.ylabel('Price')
	plt.savefig('fea/'+str(company_name)+'.png',dpi=200,bbox_inches='tight')
Example #33
from sklearn.model_selection import cross_val_score
clf = RandomForestRegressor()
scores = cross_val_score(clf, X_test, y_test, cv=5)
scores.mean()

#mean absolute error in $
mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is: $", mae)
#checking r^2
from sklearn.metrics import r2_score

print("r_Score:", r2_score(y_test, y_pred))

bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
bg.score(X_train, y_train)
bg.score(X_test, y_test)

#Adaboosting
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

#Decision
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

#gradientBoost
from sklearn.ensemble import GradientBoostingRegressor
Example #34
# Appendix - Datasets description section at the end of this MOOC.
# ```

# %% [markdown]
# Create a `BaggingRegressor` and provide a `DecisionTreeRegressor`
# to its parameter `base_estimator`. Train the regressor and evaluate its
# statistical performance on the testing set.

# %%
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor

tree = DecisionTreeRegressor()
bagging = BaggingRegressor(base_estimator=tree, n_jobs=-1)
bagging.fit(data_train, target_train)
test_score = bagging.score(data_test, target_test)
print(f"Basic R2 score of the bagging regressor:\n" f"{test_score:.2f}")

# %% [markdown]
# Now, create a `RandomizedSearchCV` instance using the previous model and
# tune the important parameters of the bagging regressor. Find the best
# parameters  and check if you are able to find a set of parameters that
# improve the default regressor.

# ```{tip}
# You can list the bagging regressor's parameters using the `get_params`
# method.
# ```

# %%
for param in bagging.get_params().keys():
    print(param)
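# %% [markdown]
# A possible tuning sketch follows. This is an illustrative assumption, not
# the course's reference solution: the distributions below are arbitrary
# choices, and the `base_estimator__max_depth` key assumes a scikit-learn
# release that still exposes the inner model under the `base_estimator`
# prefix.

# %%
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    "n_estimators": randint(10, 30),
    "max_samples": [0.5, 0.8, 1.0],
    "max_features": [0.5, 0.8, 1.0],
    "base_estimator__max_depth": randint(3, 10),
}
search = RandomizedSearchCV(
    bagging,
    param_distributions=param_distributions,
    n_iter=20,
    scoring="r2",
    n_jobs=2,
    random_state=0,
)
_ = search.fit(data_train, target_train)
print(f"Best parameters: {search.best_params_}")
print(f"Test R2 after tuning: {search.score(data_test, target_test):.2f}")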
Example #35
def impute_missing_values(df,
                          var_deviation_tolerance=0.97,
                          actual_or_gaussian_residuals='actual',
                          col_floor_ceiling_dict=None,
                          scores=False):
    '''Impute missing values while minimizing distortion of variable distribution
    by creating a bagged model using other variables and adding residuals to output values
    
    Parameters:
    df: dataframe with missing values
    var_deviation_tolerance: target percent deviation from original variable distributions
    actual_or_gaussian_residuals: apply residuals to model outputs from the actual residual distribution or from
        a gaussian distribution based on the residuals' mean and variance
    col_floor_ceiling_dict: a dictionary with the variable name and a tuple of the min and max for variables 
        with a finite range. Use float(inf) or float(-inf) for variables that are limited in only one direction
    scores: return accuracy score of models per variable
    
    Returns:
    df: df with imputed values
    problems: columns that failed to impute
    column_scores: accuracy scores of imputation model on non-missing values
    '''
    df = df.copy()
    columns = df.columns
    type_dict = df.dtypes.to_dict()
    missing_columns = list(
        df.isna().sum()[df.isna().sum() > 0].sort_values().index)
    have_columns = [i for i in columns if i not in missing_columns]
    column_scores = {}
    problems = []
    for col in tqdm.tqdm(missing_columns):
        try:
            percent_missing = df[col].isna().sum() / df.shape[0]
            m = math.ceil(percent_missing / ((1 / var_deviation_tolerance) - 1))
            other_columns = [i for i in columns if i != col]
            na_index = df[df[col].isna() == 1].index
            have_index = [i for i in df.index if i not in na_index]
            na_have_cols = set(df.loc[na_index,
                                      other_columns].dropna(axis=1).columns)
            have_have_cols = set(df.loc[have_index,
                                        other_columns].dropna(axis=1).columns)
            both_cols = na_have_cols.intersection(have_have_cols)
            int_df = pd.get_dummies(df.loc[:, both_cols], drop_first=True)
            X_have = int_df.loc[have_index, :]
            y_have = df[col][have_index]
            X_na = int_df.loc[na_index, :]
            if type_dict[col] == 'object':
                le = LabelEncoder()
                y_have = le.fit_transform(y_have)
                df[col][have_index] = y_have
                rf = RandomForestClassifier()
                bagc = BaggingClassifier(base_estimator=rf, n_estimators=m)
                bagc.fit(X_have, y_have)
                column_scores[col] = bagc.score(X_have, y_have)
                resid_preds = bagc.predict(X_have)
                residuals = y_have - resid_preds
                preds = bagc.predict(X_na)
            else:
                bagr = BaggingRegressor(n_estimators=m)
                bagr.fit(X_have, y_have)
                column_scores[col] = bagr.score(X_have, y_have)
                resid_preds = bagr.predict(X_have)
                residuals = y_have - resid_preds
                preds = bagr.predict(X_na)
            if actual_or_gaussian_residuals == 'actual':
                rand_resids = np.random.choice(residuals,
                                               len(X_na),
                                               replace=True)
            else:
                rand_resids = np.random.normal(residuals.mean(),
                                               residuals.std(), len(X_na))
            preds = preds + rand_resids
            if type_dict[col] == 'object':
                preds = preds.round()
            if col_floor_ceiling_dict is not None:
                if col in col_floor_ceiling_dict.keys():
                    preds = np.clip(preds, col_floor_ceiling_dict[col][0],
                                    col_floor_ceiling_dict[col][1])
            df[col][na_index] = preds
            have_columns.append(col)
        except Exception:
            problems.append(col)
    if scores == False:
        return df, problems
    else:
        return df, problems, column_scores
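A hypothetical usage sketch for the function above; the dataframe and its
column names are invented for illustration, and it assumes the function was
defined alongside its own imports (pandas, numpy, math, tqdm, scikit-learn):

# Hypothetical usage; `raw_df` and its columns are illustrative only.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
raw_df = pd.DataFrame({
    "age": rng.normal(40, 10, 500),
    "income": rng.normal(50000, 12000, 500),
    "segment": rng.choice(["a", "b", "c"], 500),
})
# Knock out 10% of `income` to imitate missingness.
raw_df.loc[raw_df.sample(frac=0.1, random_state=0).index, "income"] = np.nan

imputed_df, problems = impute_missing_values(
    raw_df, col_floor_ceiling_dict={"income": (0.0, float("inf"))})
print(problems)                           # columns that failed to impute
print(imputed_df["income"].isna().sum())  # expect 0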
Example #36
# Bagging Methods
###########################################################

from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5)

# usual knn
knn.fit(xtrain,ytrain)
print(knn.score(xtrain,ytrain),knn.score(xtest,ytest))

# full bagging; n_estimators: number of models, max_samples: fraction of samples to draw (0.5 draws half, with replacement),
# max_features: fraction of features drawn per estimator; bootstrap=True samples rows with replacement, bootstrap_features=True draws features with replacement
bf = BaggingRegressor(knn,n_estimators=100,max_samples=1.0,max_features=1.0,random_state=0)
bf.fit(xtrain,ytrain)
print(bf.score(xtrain,ytrain),bf.score(xtest,ytest))

# bagging with subsampling and feature randomization
bf = BaggingRegressor(knn,n_estimators=500,max_samples=0.5,max_features=0.5)
bf.fit(xtrain,ytrain)
print(bf.score(xtrain,ytrain),bf.score(xtest,ytest))

# effect of estimators
np.random.seed(0)
n_list = [1,5,10,20,30,50,100,200,500,1000]
s = np.zeros((len(n_list),2))
for i in range(len(n_list)):
    bf = BaggingRegressor(knn,n_estimators=n_list[i],max_samples=0.5,max_features=0.5)
    bf.fit(xtrain,ytrain)
    s[i,0] = bf.score(xtrain,ytrain)
    s[i,1] = bf.score(xtest,ytest)
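A possible follow-up (assumed, not part of the original snippet) to
visualize the effect of the ensemble size using the scores collected in `s`:

# Hypothetical visualization of the train/test scores gathered above.
import matplotlib.pyplot as plt

plt.plot(n_list, s[:, 0], marker='o', label='train')
plt.plot(n_list, s[:, 1], marker='o', label='test')
plt.xscale('log')
plt.xlabel('n_estimators')
plt.ylabel('R^2 score')
plt.legend()
plt.show()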