Example 1
def avmPredict(params):
	town = getPlace(params['lat'], params['long'])[0]

	x, y, z = getXYZ(params['lat'], params['long'])

	r = 1.0

	data = []
	target = []
	header = []

	with open('../../../data/working22.csv') as f:
	
		f = csv.reader(f)
		header = next(f)

		for row in f:
			t = (map(float, row[:3] + row[4:]), float(row[3]))

			if weightF([x, y, z], t[0][0:3], r):
				data.append(t[0])
				target.append(t[1])

	ensemble = BaggingRegressor()
	ensemble.fit(data, target)

	test = createTest(params)
	return ensemble.predict(test)
Example 2
def test_oob_score_regression():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                           n_estimators=50,
                           bootstrap=True,
                           oob_score=True,
                           random_state=rng).fit(X_train, y_train)

    test_score = clf.score(X_test, y_test)

    assert_less(abs(test_score - clf.oob_score_), 0.1)

    # Test with few estimators
    assert_warns(UserWarning,
                 BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                  n_estimators=1,
                                  bootstrap=True,
                                  oob_score=True,
                                  random_state=rng).fit,
                 X_train,
                 y_train)
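For readers who want to reproduce the OOB check outside the test suite, here is a minimal sketch; it uses make_regression as a stand-in dataset (an assumption on my part, since load_boston is no longer shipped with recent scikit-learn releases):

from sklearn.datasets import make_regression
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# Synthetic data stands in for the Boston housing set used in the test above.
X, y = make_regression(n_samples=500, n_features=10, noise=10.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# oob_score=True requires bootstrap=True; with enough estimators the OOB
# estimate should track the held-out R^2 fairly closely.
reg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=50,
                       bootstrap=True, oob_score=True,
                       random_state=0).fit(X_train, y_train)
print("OOB R^2: ", reg.oob_score_)
print("Test R^2:", reg.score(X_test, y_test))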
Example 3
def train_model(train, test, labels):
    rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=10)
    #rf = RandomForestRegressor(n_estimators=45, max_depth=9, random_state=10)
    clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.2, random_state=25)
    clf.fit(train, labels)
    #clf = SVR(C=1.0, epsilon=0.2)
    #clf.fit(train, labels)
    #clf = GaussianNB()
    #clf.fit(train, labels)
    print "Good!"
    predictions = clf.predict(test)
    print predictions.shape
    predictions = pd.DataFrame(predictions, columns = ['relevance'])
    print "Good again!"
    print "Predictions head -------"
    print predictions.head()
    print predictions.shape
    print "TEST head -------"
    print test.head()
    print test.shape
    #test['id'].to_csv("TEST_TEST.csv",index=False)
    #predictions.to_csv("PREDICTIONS.csv",index=False)
    #test = test.reset_index()
    #predictions = predictions.reset_index()
    #test = test.groupby(level=0).first()
    #predictions = predictions.groupby(level=0).first()
    predictions = pd.concat([test['id'],predictions], axis=1, verify_integrity=False)
    print predictions
    return predictions
Example 4
def train_bagging_xgboost(X, Y):
    adaboost = BaggingRegressor(xgb.XGBRegressor(max_depth=6, learning_rate=0.02, n_estimators=300, silent=True,
                                                 objective='reg:linear', subsample=0.7, reg_alpha=0.8,
                                                 reg_lambda=0.8, booster="gblinear")
                                , max_features=0.7, n_estimators=30)
    adaboost.fit(X, Y)
    return adaboost
Example 5
def model_fit_rf_bagging():

	def in_limits(x):
		if x<1: return 1
		if x>3: return 3
		return x

	print "STARTING MODEL"
	X = full_data[['count_words','count_digits','match_d_title','match_d_description','match_w_title','match_w_description','match_d_attribute','match_w_attribute']].values
	y = full_data['relevance'].values
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
	
	rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
	clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
	clf.fit(X_train, y_train)
	y_pred = clf.predict(X_test)

	in_limits = np.vectorize(in_limits,otypes=[np.float])
	y_pred = in_limits(y_pred)
	RMSE = mean_squared_error(y_test, y_pred)**0.5
	print "RMSE: ",RMSE

	# for the submission
	real_X_test = real_full_test[['count_words','count_digits','match_d_title','match_d_description','match_w_title','match_w_description','match_d_attribute','match_w_attribute']].values
	test_pred = clf.predict(real_X_test)
	test_pred = in_limits(test_pred)

	return test_pred
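As an aside, the in_limits clamping used above can also be expressed with np.clip, which avoids the vectorized helper; a minimal self-contained sketch:

import numpy as np

# Clamp every prediction into the [1, 3] relevance range, as in_limits does.
predictions = np.array([0.4, 2.2, 3.7])
print(np.clip(predictions, 1, 3))  # [1.  2.2 3. ]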
Example 6
def test_bootstrap_samples():
    """Test that bootstraping samples generate non-perfect base estimators."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    base_estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=False,
                                random_state=rng).fit(X_train, y_train)

    assert_equal(base_estimator.score(X_train, y_train),
                 ensemble.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=True,
                                random_state=rng).fit(X_train, y_train)

    assert_greater(base_estimator.score(X_train, y_train),
                   ensemble.score(X_train, y_train))
Example 7
    def fit(self):
        """Scale data and train the model with the indicated algorithm.

        Do not forget to tune the hyperparameters.

        Parameters
        ----------
        algorithm : String,
            "KernelRidge", "SVM", "LinearRegression", "Lasso", "ElasticNet", "NeuralNet", "BaggingNeuralNet", default = "SVM"

        """
        self.X_scaler.fit(self.X_train)
        self.Y_scaler.fit(self.y_train)

        # scale the data in all cases; the scaled copies may go unused later if scaling is disabled
        self.X_train_sc = self.X_scaler.transform(self.X_train)
        self.y_train_sc = self.Y_scaler.transform(self.y_train)

        self.X_test_sc = self.X_scaler.transform(self.X_test)
        self.y_test_sc = self.Y_scaler.transform(self.y_test)

        if self.algorithm == "KernelRidge":
            clf_kr = KernelRidge(kernel=self.user_kernel)
            self.model = sklearn.model_selection.GridSearchCV(clf_kr, cv=5, param_grid=self.param_kr)

        elif self.algorithm == "SVM":
            clf_svm = SVR(kernel=self.user_kernel)
            self.model = sklearn.model_selection.GridSearchCV(clf_svm, cv=5, param_grid=self.param_svm)

        elif self.algorithm == "Lasso":
            clf_lasso = sklearn.linear_model.Lasso(alpha=0.1,random_state=self.rand_state)
            self.model = sklearn.model_selection.GridSearchCV(clf_lasso, cv=5,
                                                              param_grid=dict(alpha=np.logspace(-5,5,30)))

        elif self.algorithm == "ElasticNet":
            clf_ElasticNet = sklearn.linear_model.ElasticNet(alpha=0.1, l1_ratio=0.5,random_state=self.rand_state)
            self.model = sklearn.model_selection.GridSearchCV(clf_ElasticNet,cv=5,
                                                              param_grid=dict(alpha=np.logspace(-5,5,30)))

        elif self.algorithm == "LinearRegression":
            self.model = sklearn.linear_model.LinearRegression()

        elif self.algorithm == "NeuralNet":
            self.model = MLPRegressor(**self.param_neurons)
        elif self.algorithm == "BaggingNeuralNet":
            nn_m = MLPRegressor(**self.param_neurons)

            self.model = BaggingRegressor(base_estimator = nn_m, **self.param_bag)

        if self.scaling == True:
            self.model.fit(self.X_train_sc, self.y_train_sc.reshape(-1,))
            predict_train_sc = self.model.predict(self.X_train_sc)
            self.prediction_train = self.Y_scaler.inverse_transform(predict_train_sc.reshape(-1,1))
            predict_test_sc = self.model.predict(self.X_test_sc)
            self.prediction_test = self.Y_scaler.inverse_transform(predict_test_sc.reshape(-1,1))
        else:
            self.model.fit(self.X_train, self.y_train.reshape(-1,))
            self.prediction_train = self.model.predict(self.X_train)
            self.prediction_test = self.model.predict(self.X_test)
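To see the "BaggingNeuralNet" branch in isolation, a minimal standalone sketch follows; the two parameter dictionaries are illustrative stand-ins for self.param_neurons and self.param_bag, not the class's actual defaults:

from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor

# Illustrative hyperparameters; real values come from the class attributes above.
param_neurons = {"hidden_layer_sizes": (50,), "max_iter": 1000}
param_bag = {"n_estimators": 10, "max_samples": 0.8}

nn_m = MLPRegressor(**param_neurons)
# Note: recent scikit-learn releases rename base_estimator to estimator.
model = BaggingRegressor(base_estimator=nn_m, **param_bag)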
Example 8
def random_forest(X,Y,Xt):
    print('learn')    
    rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
    clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
    clf.fit(X, Y)
    print('predict')
    Yp_clamped = clf.predict(Xt)
    return Yp_clamped
Example 9
def test_sparse_regression():
    # Check regression for various parameter settings on sparse input.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)

    class CustomSVR(SVR):
        """SVC variant that records the nature of the training set"""

        def fit(self, X, y):
            super().fit(X, y)
            self.data_type_ = type(X)
            return self

    parameter_sets = [
        {"max_samples": 0.5,
         "max_features": 2,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_samples": 1.0,
         "max_features": 4,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_features": 2,
         "bootstrap": False,
         "bootstrap_features": True},
        {"max_samples": 0.5,
         "bootstrap": True,
         "bootstrap_features": False},
    ]

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:

            # Trained on sparse format
            sparse_classifier = BaggingRegressor(
                base_estimator=CustomSVR(),
                random_state=1,
                **params
            ).fit(X_train_sparse, y_train)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_results = BaggingRegressor(
                base_estimator=CustomSVR(),
                random_state=1,
                **params
            ).fit(X_train, y_train).predict(X_test)

            sparse_type = type(X_train_sparse)
            types = [i.data_type_ for i in sparse_classifier.estimators_]

            assert_array_almost_equal(sparse_results, dense_results)
            assert all([t == sparse_type for t in types])
            assert_array_almost_equal(sparse_results, dense_results)
Example 10
def procedureA(goldenFlag = False):
	# Trains and generates a prediction file
	# Uses hard heuristic for buy_or_not

	popFlag = True
	X, Y = getDataXY(currYearFlag = False, popFlag = popFlag)
	X, Y = shuffle(X, Y, random_state = 0)

	if popFlag:
		encoder = oneHot(X[:, 2:])
		Xt = encoder.transform(X[:, 2:])
		Xt = np.hstack((X[:,:2], Xt))
	else:
		encoder = oneHot(X)
		Xt = encoder.transform(X)

	buySet = set()
	for i in range(X.shape[0]):
		tmpTup = (X[i][0], X[i][2])
		buySet.add(tmpTup)
	# Y_buy = [1] * Xt.shape[0]

	min_max_scaler = preprocessing.MinMaxScaler()

	# Xt = min_max_scaler.fit_transform(Xt)

	if goldenFlag:
		print Xt.shape
		Xt = getGoldenX(Xt, 2, 2 + encoder.feature_indices_[1], 2 + encoder.feature_indices_[0], 2 + min(9, encoder.feature_indices_[1]))


	split = 0.9
	X_train, X_test = Xt[:(int(Xt.shape[0]*split)),:], Xt[int(Xt.shape[0]*split):, :]
	Y_train, Y_test = Y[:(int(Y.shape[0]*split)),:], Y[int(Y.shape[0]*split):, :]
	Y_train = Y_train.ravel()
	Y_test = Y_test.ravel()

	print X_train.shape
	print X_test.shape

	# clf = Ridge(alpha = 100)
	# clf = SVR(C = 10.0, kernel = 'poly', degree = 2)
	# clf = LinearSVR(C = 1.0)
	clf = BaggingRegressor(DecisionTreeRegressor(), n_estimators = 125, n_jobs = 4, random_state = 0)
	# clf = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators = 100)
	# clf = DecisionTreeRegressor()
	# clf = RandomForestRegressor(random_state = 0, n_estimators = 200, n_jobs = 4)
	clf.fit(X_train, Y_train.ravel())

	Y_pred = clf.predict(X_test)
	evaluatePred(Y_pred, Y_test)

	return clf, encoder, min_max_scaler
Example 11
def test_single_estimator():
    # Check singleton ensembles.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    clf1 = BaggingRegressor(base_estimator=KNeighborsRegressor(),
                            n_estimators=1,
                            bootstrap=False,
                            bootstrap_features=False,
                            random_state=rng).fit(X_train, y_train)

    clf2 = KNeighborsRegressor().fit(X_train, y_train)

    assert_array_almost_equal(clf1.predict(X_test), clf2.predict(X_test))
Example 12
    def __init__(self):
#         self.clf = GradientBoostingRegressor(n_estimators=200, max_features="sqrt", max_depth=5)
#         self.clf = LinearRegression() 
         self.clf = BaggingRegressor(LinearRegression())
#         self.clf = GaussianProcess(theta0=4)
#         self.sp = RandomizedLasso()       
         self.sp = SparseRandomProjection(n_components=5)
Example 13
def train_model(training, testing, window=5, n=5):
	X_train, y_train = prepare_data(training)
	X_test, y_test = prepare_data(testing)
	rf = RandomForestRegressor()
	rf.fit(X_train, y_train)
	predrf = rf.predict(X_test)
	print "mse for random forest regressor: ", mean_squared_error(predrf, y_test)

	gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.025)
	gb.fit(X_train, y_train)
	predgb = gb.predict(X_test)
	print "mse for gradient boosting regressor: ", mean_squared_error(predgb, y_test)
	## plot feature importance using GBR results
	fx_imp = pd.Series(gb.feature_importances_, index=['bb', 'momentum', 'sma', 'volatility'])
	fx_imp /= fx_imp.max()  # normalize
	fx_imp.sort()
	ax = fx_imp.plot(kind='barh')
	fig = ax.get_figure()
	fig.savefig("output/feature_importance.png")

	adb = AdaBoostRegressor(DecisionTreeRegressor())
	adb.fit(X_train, y_train)
	predadb = adb.predict(X_test)
	print "mse for adaboosting decision tree regressor: ", mean_squared_error(predadb, y_test)

	scale = StandardScaler()
	scale.fit(X_train)
	X_trainscale = scale.transform(X_train)
	X_testscale = scale.transform(X_test)

	knn = BaggingRegressor(KNeighborsRegressor(n_neighbors=10), max_samples=0.5, max_features=0.5)
	knn.fit(X_trainscale, y_train)
	predknn = knn.predict(X_testscale)
	print "mse for bagging knn regressor: ", mean_squared_error(predknn, y_test)

	pred_test = 0.1*predrf+0.2*predgb+0.1*predadb+0.6*predknn
	print "mse for ensemble all the regressors: ", mean_squared_error(pred_test, y_test)
	result = testing.copy()
	result.ix[5:-5, 'trend'] = pred_test
	result.ix[10:, 'pred'] = pred_test * result.ix[5:-5, 'IBM'].values
	result.ix[:-5, 'pred_date'] = result.index[5:]

	return result
Example 14
def procc_modelfusion(df_test, data_test):
    from sklearn.ensemble import BaggingRegressor
    from sklearn import linear_model
    train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
    train_np = train_df.as_matrix()

    # y is the Survived target
    y = train_np[:, 0]

    # X holds the feature values
    X = train_np[:, 1:]

    # fit a BaggingRegressor
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    bagging_clf = BaggingRegressor(clf, n_estimators=10, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1)
    bagging_clf.fit(X, y)

    test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
    predictions = bagging_clf.predict(test)
    result = pd.DataFrame({'PassengerId' : data_test['PassengerId'].as_matrix(), 'Survived':predictions.astype(np.int32)})
    result.to_csv("logistic_regression_predictions3.csv", index=False)
Example 15
class Regressor(BaseEstimator):
    def __init__(self):
#         self.clf = GradientBoostingRegressor(n_estimators=200, max_features="sqrt", max_depth=5)
#         self.clf = LinearRegression() 
         self.clf = BaggingRegressor(LinearRegression())
#         self.clf = GaussianProcess(theta0=4)
#         self.sp = RandomizedLasso()       
         self.sp = SparseRandomProjection(n_components=5)
#         self.sp = TruncatedSVD()
 #        self.sp = KernelPCA(n_components=3, tol=0.0001, kernel="poly")
    # self.clf = ExtraTreesRegressor(n_estimators=200, max_features="sqrt", max_depth=5)

    def fit(self, X, y):
#        print(self.sp)

#        Xr = self.sp.fit_transform(X, y)
        self.clf.fit(X, y.ravel())
 
    def predict(self, X):
#        Xr = self.sp.transform(X)
        return self.clf.predict(X)
Example 16
def test_parallel_regression():
    # Check parallel regression.
    rng = check_random_state(0)

    X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=rng)

    ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit(X_train, y_train)

    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=1, random_state=0).fit(X_train, y_train)

    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3)
Example 17
def runTests():

    # Generate the training samples, extract training features and target
    trainSamples = GenSamples(numSamples)
    trainFeatures = extractFeatures(trainSamples)
    trainPred = extractPred(trainSamples)

    # Generate the test samples, extract test features and target
    testSamples = GenSamples(numTestSamples)
    testFeatures = extractFeatures(testSamples)
    testPred = extractPred(testSamples)

    R2List = OrderedDict()
    R2List['TrainROI'] = []
    R2List['TestROI'] = []
    print 'Running Tests: '
    for i in range(numTests):
        # Bootstrap is True by default i.e., sampling with replacement
        # Bootstrap features is False by default i.e., all features used
        classifier = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                      n_estimators=numTrees,
                                      max_samples=int(0.5*numSamples),
                                      max_features=int(1))

        classifier.fit(trainFeatures, trainPred)
        predictROI = {}
        predictROI['Training'] = classifier.predict(trainFeatures)
        predictROI['Test'] = classifier.predict(testFeatures)

        R2 = {}
        R2['Train'] = r2_score(trainPred, predictROI['Training'])
        R2['Test'] = r2_score(testPred, predictROI['Test'])

        R2List['TrainROI'].append(R2['Train'])
        R2List['TestROI'].append(R2['Test'])

    print 'Best Train ROI: ', max(R2List['TrainROI'])
    print 'Best Test ROI: ', max(R2List['TestROI'])
Example 18
def test_bagging_regressor_with_missing_inputs():
    # Check that BaggingRegressor can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, np.NINF, 6],
    ])
    y_values = [
        np.array([2, 3, 3, 3, 3]),
        np.array([
            [2, 1, 9],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
        ])
    ]
    for y in y_values:
        regressor = DecisionTreeRegressor()
        pipeline = make_pipeline(
            Imputer(),
            Imputer(missing_values=np.inf),
            Imputer(missing_values=np.NINF),
            regressor
        )
        pipeline.fit(X, y).predict(X)
        bagging_regressor = BaggingRegressor(pipeline)
        y_hat = bagging_regressor.fit(X, y).predict(X)
        assert_equal(y.shape, y_hat.shape)

        # Verify that exceptions can be raised by wrapper regressor
        regressor = DecisionTreeRegressor()
        pipeline = make_pipeline(regressor)
        assert_raises(ValueError, pipeline.fit, X, y)
        bagging_regressor = BaggingRegressor(pipeline)
        assert_raises(ValueError, bagging_regressor.fit, X, y)
Example 19
def test_bootstrap_samples():
    # Test that bootstrapping samples generate non-perfect base estimators.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    base_estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=False,
                                random_state=rng).fit(X_train, y_train)

    assert_equal(base_estimator.score(X_train, y_train),
                 ensemble.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=True,
                                random_state=rng).fit(X_train, y_train)

    assert_greater(base_estimator.score(X_train, y_train),
                   ensemble.score(X_train, y_train))

    # check that each sampling corresponds to a complete bootstrap resample.
    # the size of each bootstrap should be the same as the input data but
    # the data should be different (checked using the hash of the data).
    ensemble = BaggingRegressor(base_estimator=DummySizeEstimator(),
                                bootstrap=True).fit(X_train, y_train)
    training_hash = []
    for estimator in ensemble.estimators_:
        assert estimator.training_size_ == X_train.shape[0]
        training_hash.append(estimator.training_hash_)
    assert len(set(training_hash)) == len(training_hash)
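DummySizeEstimator is not defined in the snippet above; a plausible minimal implementation consistent with the assertions (it records the size and a content hash of whatever it is fit on) might look like:

import numpy as np
import joblib
from sklearn.base import BaseEstimator, RegressorMixin

class DummySizeEstimator(BaseEstimator, RegressorMixin):
    # Record the training-set size and a content hash so the test can verify
    # that each bootstrap sample is full-sized but contains different rows.
    def fit(self, X, y):
        self.training_size_ = X.shape[0]
        self.training_hash_ = joblib.hash(X)
        return self

    def predict(self, X):
        return np.ones(X.shape[0])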
Example 20
File: ensemble.py Project: smly/ume
class BaggingRegressor(BaseEstimator):
    """
    Usage:

    ```
    "model": {
        "class": "ume.ensemble.BaggingRegressor",
        "params": {
            "base_estimator": {
                "class": "sklearn.svm.SVR",
                "params": {
                    "kernel": "rbf",
                    "degree": 1,
                    "C": 1000000.0,
                    "epsilon": 0.01,
                },
            },
            "bag_kwargs": {
                "n_estimators": 100,
                "n_jobs": 5,
                "max_samples": 0.9,
            },
        }
    }
    ```
    """
    def __init__(self, base_estimator=None, bag_kwargs=None):
        klass = dynamic_load(base_estimator['class'])
        svr_reg = klass(**base_estimator['params'])
        self.__clf = SK_BaggingRegressor(base_estimator=svr_reg, **bag_kwargs)

    def fit(self, X, y):
        return self.__clf.fit(X, y)

    def predict(self, X):
        return self.__clf.predict(X)
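A minimal sketch of how the JSON config from the docstring would be consumed; the dictionary literal simply mirrors that config, and dynamic_load is assumed to resolve the dotted class path:

# Mirrors the "model"/"params" block shown in the docstring above.
params = {
    "base_estimator": {
        "class": "sklearn.svm.SVR",
        "params": {"kernel": "rbf", "degree": 1, "C": 1000000.0, "epsilon": 0.01},
    },
    "bag_kwargs": {"n_estimators": 100, "n_jobs": 5, "max_samples": 0.9},
}
model = BaggingRegressor(**params)  # the ume.ensemble wrapper defined above
# model.fit(X, y); predictions = model.predict(X_new)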
Example 21
def get_bagging_prediction(X_train, y_train, X_test, X_valid=None, GS=False):
    if not GS:
        rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
        clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        if X_valid is None:
            return y_pred
        else:
            return y_pred, clf.predict(X_valid)
    else:
        rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
        clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
        param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
        model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, cv=2, verbose=VERBOSE, scoring=RMSE)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        if X_valid is None:
            return y_pred
        else:
            return y_pred, model.predict(X_valid)
Example 22
# Getting Testing Data out of the DF
test_data_frame = data_frame_regression.iloc[num_train:]

# Getting IDs for Testing Data
id_test = test_data_frame['id']

relevance_train = train_data_frame['relevance'].values

# All the Independent Variables in the Regressor
# These are Words in Title, Description, Values
X_train = train_data_frame.drop(['id', 'relevance'], axis=1).values

# Same for Test Data
X_test = test_data_frame.drop(['id', 'relevance'], axis=1).values

# Using RandomForest Regressor
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)

# Using Bagging Regressor
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)

# Fit the Training Data to a Model
clf.fit(X_train, relevance_train)

# Predicting the relevance for Testing Data
relevance_pred = clf.predict(X_test)

# Writing the Relevance Values to Submission.csv
pandas.DataFrame({"id": id_test, "relevance": relevance_pred}).to_csv('submission.csv', index=False)
Example 23
import pandas
df_dir = '../data/'
K_fold = 2
num_train = 74000

def fmean_squared_error(ground_truth, predictions):
    fmean_squared_error_ = mean_squared_error(ground_truth, predictions)**0.5
    return fmean_squared_error_

RMSE  = make_scorer(fmean_squared_error, greater_is_better=False)


df = pandas.read_csv(df_dir+'my_df_all.csv')
df = df[:num_train].drop(['Unnamed: 0'], axis = 1)
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
clf2 = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
clf2 = rf
clf = pipeline.Pipeline([('rfr', rf)])
param_grid = {'rfr__n_estimators': [350],  # 300 top
              'rfr__max_depth': [8],  # list(range(7,8,1))
              }
# param_grid = {'rfr__n_estimators':list(range(34,50,1)),
#               'rfr__max_depth':list(range(13,15,1))}
model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, cv=5, verbose=0, scoring=RMSE)
errors = []
X_train = df.drop(['product_uid', 'id', 'relevance'], axis=1).values
y_train = df['relevance'].values
model.fit(X_train, y_train)

print("Best parameters found by grid search:")
print(model.best_params_)
Example 24
def bagging(df1, features, pred_var, df2):
    lr = BaggingRegressor()
    lr.fit(df1[features], df1[pred_var])
    print 'BaggingRegressor Score: ', lr.score(df2[features], df2[pred_var])
Example 25
def run_stack(SEED):

    model = "Lasso"

    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../models/' + model + '_train.csv')
    trainBaseWeight = trainBase['var11']
    test = pd.read_csv('../models/' + model + '_test.csv')

    #trainBase = shuffle(trainBase, random_state = SEED)

    print(trainBase.columns)
    trainBaseID = trainBase['id']
    testID = test['id']

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    test = np.nan_to_num(np.array(test))

    avg = 0
    NumFolds = 5

    #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=30, random_state=166, min_samples_leaf=1),
    #Ridge()
    clfs = [
        #KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski')
        #SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False)
        #BayesianRidge(n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06, compute_score=False, fit_intercept=True, normalize=False, copy_X=True, verbose=False)
        #ElasticNet(alpha=0.00069956421567126271, l1_ratio=1/10, fit_intercept=True, normalize=False, precompute='auto', max_iter=10000, copy_X=True, tol=1/10000, warm_start=False, positive=False)
        #LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
        BaggingRegressor(n_estimators=50,
                         max_samples=1.0,
                         max_features=1.0,
                         bootstrap=True,
                         bootstrap_features=False,
                         oob_score=False,
                         n_jobs=1,
                         random_state=None,
                         verbose=0)
        #AdaBoostRegressor( n_estimators=10, learning_rate=1.0, loss='linear', random_state=None)
        #Lasso(alpha=0.0000329034456231),
        #Ridge(),
        #RandomForestRegressor(n_estimators=3000, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, min_density=None, compute_importances=None),

        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=15, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=10, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=5, n_estimators=15, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=2, n_estimators=15, random_state=166, min_samples_leaf=1),

        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=300, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=1000, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=3000, random_state=166, min_samples_leaf=1),
    ]

    print("Data size: " + str(len(trainBase)) + " " + str(len(test)))
    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    print("Begin Training")

    lenTrainBase = len(trainBase)
    lenTest = len(test)

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0

        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        Folds = cross_validation.KFold(lenTrainBase,
                                       n_folds=NumFolds,
                                       indices=True)

        for train_index, test_index in Folds:

            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]
            weight = [trainBaseWeight[i] for i in train_index]

            targetTest = [targetBase[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]
            weightTest = [trainBaseWeight[i] for i in test_index]

            #print "LEN: ", len(train), len(target)

            target = np.array(np.reshape(target, (-1, 1)))
            #train = np.array(np.reshape(train, (-1, 1))  )
            weight = np.array(np.reshape(weight, (-1, 1)))

            targetTest = np.array(np.reshape(targetTest, (-1, 1)))
            #trainTest = np.array(np.reshape(trainTest, (-1, 1)) )
            weightTest = np.array(np.reshape(weightTest, (-1, 1)))

            #clf.fit(train, target, sample_weight = weight
            clf.fit(train, target)
            predicted = clf.predict(trainTest)
            #print(predicted[:,0])
            print(predicted)
            dataset_blend_train[
                test_index,
                ExecutionIndex] = predicted  #[:,0] #needed for Ridge

            #print(targetTest.shape)
            #print(predicted.shape)
            #print(weightTest.shape)

            print(
                str(
                    score.normalized_weighted_gini(targetTest.ravel(),
                                                   predicted.ravel(),
                                                   weightTest.ravel())))
            avg += score.normalized_weighted_gini(
                targetTest.ravel(), predicted.ravel(),
                weightTest.ravel()) / NumFolds
            #print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted[:,0], weightTest.ravel())))
            #avg += score.normalized_weighted_gini(targetTest.ravel(), predicted[:,0], weightTest.ravel())/NumFolds

            predicted = clf.predict(test)
            dataset_blend_test_set[:, foldCount] = predicted  #[:,0]

            foldCount = foldCount + 1

            #break

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        #print dataset_blend_test_set.mean(1)
        #csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))

        submission = pd.DataFrame(np.zeros((len(testID), 2)),
                                  columns=['id', 'target'])
        submission['target'] = dataset_blend_test[:, ExecutionIndex]
        submission['id'] = testID
        submission.to_csv("../predictions/Stack_" +
                          now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" +
                          str(clf)[:12] + ".csv",
                          index=False)

        #csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:,ExecutionIndex] )

        submission = pd.DataFrame(np.zeros((len(trainBaseID), 2)),
                                  columns=['id', 'target'])
        submission['target'] = dataset_blend_train[:, ExecutionIndex]
        submission['id'] = trainBaseID
        submission.to_csv("../predictions/Target_Stack_" +
                          now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" +
                          str(clf)[:12] + ".csv",
                          index=False)

        csv_io.write_delimited_file("../log/RunLog.csv", [
            now.strftime("%Y %m %d %H %M %S"), "AVG.",
            str(avg),
            str(clf), "Folds:",
            str(NumFolds), "Model", "", "", ""
        ],
                                    filemode="a",
                                    delimiter=",")

        print("------------------------Average: " + str(avg))

        #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)

    return dataset_blend_train, dataset_blend_test
Example 26
 def baggingRidged(self):
     return BaggingRegressor(self.ridged, n_estimators=100, max_samples=0.2)
Example 27
 def __init__(self, **args):
     """Init model."""
     self.model_lf = BaggingRegressor(**copy.deepcopy(args))
     self.model_hf = BaggingRegressor(**copy.deepcopy(args))
Example 28
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

#Step 1:Loading data
X, y = load_boston(return_X_y=True)

#Step 2:Split data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=40)

#step3:Training--BaggingRegressor
regression = BaggingRegressor(random_state=40)
param_grid = {
    'base_estimator':
    [DecisionTreeRegressor(criterion='mse', splitter='best')],
    'n_estimators': [x for x in np.arange(10, 101, 30)],
    'max_samples': [0.3, 0.7, 1.0],
    'max_features': [3, 6, 9, 13],
    'bootstrap_features': [True, False]
}
search = GridSearchCV(estimator=regression,
                      param_grid=param_grid,
                      cv=5,
                      refit=True,
                      verbose=1,
                      n_jobs=-1)
search.fit(X_train, y_train)
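Once the search has finished, the usual GridSearchCV attributes expose the winning configuration; a short follow-up sketch using the names defined above:

# Inspect the best hyperparameters and score the refit model on the held-out split.
print(search.best_params_)
print("Test R^2:", metrics.r2_score(y_test, search.best_estimator_.predict(X_test)))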
Example 29
def test_parallel_regression():
    # Check parallel regression.
    rng = check_random_state(0)

    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                n_jobs=3,
                                random_state=0).fit(X_train, y_train)

    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                n_jobs=1,
                                random_state=0).fit(X_train, y_train)

    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3)
Example 30
def test_sparse_regression():
    # Check regression for various parameter settings on sparse input.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)

    class CustomSVR(SVR):
        """SVC variant that records the nature of the training set"""
        def fit(self, X, y):
            super(CustomSVR, self).fit(X, y)
            self.data_type_ = type(X)
            return self

    parameter_sets = [
        {
            "max_samples": 0.5,
            "max_features": 2,
            "bootstrap": True,
            "bootstrap_features": True
        },
        {
            "max_samples": 1.0,
            "max_features": 4,
            "bootstrap": True,
            "bootstrap_features": True
        },
        {
            "max_features": 2,
            "bootstrap": False,
            "bootstrap_features": True
        },
        {
            "max_samples": 0.5,
            "bootstrap": True,
            "bootstrap_features": False
        },
    ]

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:

            # Trained on sparse format
            sparse_classifier = BaggingRegressor(base_estimator=CustomSVR(),
                                                 random_state=1,
                                                 **params).fit(
                                                     X_train_sparse, y_train)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_results = BaggingRegressor(base_estimator=CustomSVR(),
                                             random_state=1,
                                             **params).fit(
                                                 X_train,
                                                 y_train).predict(X_test)

            sparse_type = type(X_train_sparse)
            types = [i.data_type_ for i in sparse_classifier.estimators_]

            assert_array_equal(sparse_results, dense_results)
            assert all([t == sparse_type for t in types])
            assert_array_equal(sparse_results, dense_results)
Example 31
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import operator
import copy

x, y = datahelper.get_xy('data/', num_hours=3, error_minutes=15)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

scores = {}
models = []

for n in range(2, 20):
    estimator = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=4), max_samples=0.5, n_estimators=n)
    estimator.fit(x_train, y_train)
    scores[n] = estimator.score(x_test,y_test)
    models.append(copy.copy(estimator))

sorted_by_scores = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
print('Results of 5 best # of estimators:\n')

for i in range(0, 5):
    n, score = sorted_by_scores[i]
    print("№ estimators = ", n)

    y_predicted = models[n-2].predict(x_test)

    print('R^2 = ' + str(r2_score(y_test, y_predicted)))
    print('RMSE = ' + str(np.sqrt(mean_squared_error(y_test, y_predicted))))
Example 32
y_pred = random_forest.predict(X_test)

from sklearn.model_selection import cross_val_score
clf = RandomForestRegressor()
scores = cross_val_score(clf, X_test, y_test, cv=5)
scores.mean()

#mse in $
mse = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is:$", mse)
#chceking r^2
from sklearn.metrics import r2_score

print("r_Score:", r2_score(y_test, y_pred))

bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
bg.score(X_train, y_train)
bg.score(X_test, y_test)

#Adaboosting
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

#Decision
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)
Example 33
    hour_list.append(date.hour)

rental_data["month"] = np.array(month_list)
rental_data["day"] = np.array(day_list)
rental_data["hour"] = np.array(hour_list)

del rental_data["datetime"]
rental_data = rental_data.iloc[np.random.permutation(len(rental_data))]
rental_counts = rental_data["count"].values

train_data,test_data,train_counts,test_counts = cross_validation.train_test_split(rental_data.values,rental_counts,test_size=0.2)

rf = RandomForestRegressor(n_estimators=101)
ada = AdaBoostRegressor(n_estimators=101)
grad = GradientBoostingRegressor(n_estimators=101)
bagging = BaggingRegressor(n_estimators=101)
svr = SVR()

regressors = [rf,ada,grad,bagging,svr]
regressor_names = ["Random Forests","Adaboost Regressor","Gradient Boost Regressor","Bagging Regressor","Support Vector Regressor"]

for regressor,regressor_name in zip(regressors,regressor_names):
    
    regressor.fit(train_data,train_counts)
    predicted_counts = regressor.predict(test_data)
    
    print "-----------------------------------------\n"
    print "Mean Absolute Error for ",regressor_name," : ",metrics.mean_absolute_error(test_counts,predicted_counts)
    print "Median Absolute Error for ",regressor_name," : ",metrics.median_absolute_error(test_counts,predicted_counts)
    print "Mean Squared Error for ",regressor_name," : ",metrics.mean_squared_error(test_counts,predicted_counts)
    print "R2 Score for ",regressor_name, " : ",metrics.r2_score(test_counts,predicted_counts)
Example 34
    def fit(self,
            label_correctness_file,
            point_labels_file,
            users_file,
            user_quality_features_file='all_users.csv'):
        users = pd.read_csv(users_file)
        users = users.set_index('user_id')
        y_train = users['accuracy']

        users_for_training = users[users['labels_validated'] > 25].index
        self.label_correctness = extract_label_features(
            point_labels_file, label_correctness_file)
        #  Splits the users into training & testing groups
        user_quality_features = pd.read_csv(
            user_quality_features_file).set_index('user_id')
        half = int(len(users_for_training) / 2)
        users_labels_train = users_for_training[:half]
        users_labels_test = users_for_training[half:]

        #         mask = np.random.permutation(np.arange(len(users_for_training)))
        #         users_labels_train = users_for_training[mask[:int(proportion_labels * len(mask))]]
        #         users_labels_test = users_for_training[mask[int(proportion_labels * len(mask)):]]

        train_labels = self.label_correctness.copy()
        train_labels = train_labels[~pd.isna(train_labels['correct'])]
        train_labels = train_labels[~(pd.isna(train_labels[features]).any(
            axis=1))]

        #         en = OrdinalEncoder()
        #         en.fit(pd.concat((train_labels[['CLASS_DESC']], test_labels[['CLASS_DESC']])))
        #         train_labels[['CLASS_DESC']] = en.transform(train_labels[['CLASS_DESC']])

        self.rfe_labels = RFECV(
            estimator=RandomForestClassifier(n_estimators=10),
            step=1,
            cv=StratifiedKFold(5),
            scoring='precision')
        self.clf_labels = RandomForestClassifier(random_state=0,
                                                 n_jobs=-1,
                                                 n_estimators=30)
        self.clf_accuracy = BaggingRegressor(random_state=0,
                                             n_jobs=-1,
                                             n_estimators=30)
        self.rfe_accuracy = RFECV(
            estimator=RandomForestClassifier(n_estimators=10),
            step=1,
            cv=StratifiedKFold(5),
            scoring='f1')

        print('Training label classifier...')
        self.rfe_labels.fit(
            train_labels[train_labels['user_id'].isin(users_labels_train)]
            [features].values, train_labels[train_labels['user_id'].isin(
                users_labels_train)]['correct'].astype(int))

        self.clf_labels.fit(
            train_labels[train_labels['user_id'].isin(users_labels_train)]
            [features].values[:, self.rfe_labels.support_],
            train_labels[train_labels['user_id'].isin(
                users_labels_train)]['correct'].astype(int))

        train_labels = train_labels.join(pd.Series(
            data=self.clf_labels.predict_proba(
                train_labels[train_labels['user_id'].isin(users_labels_test)]
                [features].values[:, self.rfe_labels.support_])[:, 1],
            index=train_labels[train_labels['user_id'].isin(
                users_labels_test)].index).rename('prob'),
                                         how='outer')

        prob_hist_predictions = pd.DataFrame(train_labels[train_labels['user_id'].isin(users_labels_test)]
            .groupby('user_id').apply(lambda x:\
            prob_hist(x['prob'].values)).rename('prob'))

        prob_hist_predictions = prob_hist_predictions.join(
            user_quality_features)

        print('Training accuracy classifier...')
        self.rfe_accuracy.fit(
            np.concatenate((dearray(prob_hist_predictions['prob']),
                            prob_hist_predictions.drop(columns='prob').values),
                           axis=1),
            y_train.loc[prob_hist_predictions.index] > 65)

        self.clf_accuracy.fit(
            np.concatenate((dearray(prob_hist_predictions['prob']),
                            prob_hist_predictions.drop(columns='prob').values),
                           axis=1)[:, self.rfe_accuracy.support_],
            y_train.loc[prob_hist_predictions.index])
Example 35
def bagging():
    data = pd.read_csv('MR_meanencoding.csv', index_col=0)
    data = data.drop("PNO", axis=1)
    print(1)
    data = data[data['AGE'] <= 120]
    data = data[data['AGE'] >= 1]
    print(2)
    data = data[data['total'] < 90 * 60]
    data = data[data['total'] > 60]
    print(3)
    # normalization (min-max scaling, kept commented out)
    # min = data['AGE'].min()
    # max = data['AGE'].max()
    # data['AGE'] = (data['AGE'] - min) / (max - min)

    # mlist = pd.unique(data['PLACE_n'])
    # print(mlist)
    # machine 5 has no data
    for m in range(1):
        pno = {405108: 50, 405568: 23, 405984: 11, 406750: 83}
        # pno = {405108: 54, 405568: 23, 405984: 12, 406750: 89}
        temp = data.copy()
        print(temp.columns)
        X_train, X_test, y_train, y_test = train_test_split(temp.drop('total',
                                                                      axis=1),
                                                            temp['total'],
                                                            test_size=0.3,
                                                            random_state=42)

        def find_alpha(X_train, y_train):
            reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
            reg.fit(X_train, y_train)
            print(m, ':', reg.alpha_)

        # find_alpha(X_train, y_train)
        alpha = {405108: 100, 405568: 10, 405984: 100, 406750: 10}
        bagging = BaggingRegressor(
            base_estimator=linear_model.LinearRegression(),
            max_samples=.1,
            max_features=1)
        bagging.fit(X_train, y_train)

        # print(m)
        print('bagging_linear:', mape(y_test, bagging.predict(X_test)))
        # print('bagging_linear:', mape(y_test, bagging.predict(X_test)), r2_score(y_test, bagging.predict(X_test)))
        # print(shoot(y_test, bagging.predict(X_test)))
        bagging = BaggingRegressor(base_estimator=linear_model.Ridge(alpha=.5),
                                   max_samples=.1,
                                   max_features=1)
        bagging.fit(X_train, y_train)
        print('bagging_Ridge:', mape(y_test, bagging.predict(X_test)))
        # print('bagging_Ridge:', mape(y_test, bagging.predict(X_test)), r2_score(y_test, bagging.predict(X_test)))
        # print(shoot(y_test, bagging.predict(X_test)))

        bagging = BaggingRegressor(
            base_estimator=linear_model.Lasso(alpha=0.5),
            max_samples=.1,
            max_features=1)
        bagging.fit(X_train, y_train)
        print('bagging_Lasso:', mape(y_test, bagging.predict(X_test)))
        # print('bagging_Lasso:', mape(y_test, bagging.predict(X_test)), r2_score(y_test, bagging.predict(X_test)))
        # print(shoot(y_test, bagging.predict(X_test)))

        bagging = BaggingRegressor(max_samples=.1, max_features=1)
        bagging.fit(X_train, y_train)
        print('bagging_decision:', mape(y_test, bagging.predict(X_test)))
        # print('bagging_decision:', mape(y_test, bagging.predict(X_test)), r2_score(y_test, bagging.predict(X_test)))
        # print(shoot(y_test, bagging.predict(X_test)))

        reg = linear_model.LinearRegression()
        reg.fit(X_train, y_train)
        print('linear', mape(y_test, reg.predict(X_test)))
        # print('linear', mape(y_test, reg.predict(X_test)), r2_score(y_test, bagging.predict(X_test)))
        # print(shoot(y_test, reg.predict(X_test)))

        reg = linear_model.Ridge(alpha=.5)
        reg.fit(X_train, y_train)
        print('Ridge', mape(y_test, reg.predict(X_test)))
        # print('Ridge', mape(y_test, reg.predict(X_test)), r2_score(y_test, bagging.predict(X_test)))
        # print(shoot(y_test, reg.predict(X_test)))

        reg = linear_model.Lasso(alpha=0.5)
        reg.fit(X_train, y_train)
        print('Lasso', mape(y_test, reg.predict(X_test)))
        # print('Lasso', mape(y_test, reg.predict(X_test)), r2_score(y_test, bagging.predict(X_test)))
        # print(shoot(y_test, reg.predict(X_test)))

        # break  # only process a single machine

    def result():
        pass
        # bagging default
        """
Example 36
    def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X. Has to follow the convention 0 for
            normal data, 1 for anomalies.

        sample_weight : array-like, shape (n_samples,) optional
            Array of weights that are assigned to individual samples, typically
            the amount in case of transactions data. Used to grow regression
            trees producing further rules to be tested.
            If not provided, then each sample is given unit weight.

        Returns
        -------
        self : object
            Returns self.
        """

        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.n_features_ = X.shape[1]

        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        if n_classes < 2:
            raise ValueError("This method needs samples of at least 2 classes"
                             " in the data, but the data contains only one"
                             " class: %r" % self.classes_[0])

        if not isinstance(self.max_depth_duplication, int) \
                and self.max_depth_duplication is not None:
            raise ValueError("max_depth_duplication should be an integer"
                             )
        if not set(self.classes_) == set([0, 1]):
            warn("Found labels %s. This method assumes target class to be"
                 " labeled as 1 and normal data to be labeled as 0. Any label"
                 " different from 0 will be considered as being from the"
                 " target class."
                 % set(self.classes_))
            y = (y > 0)

        # ensure that max_samples is in [1, n_samples]:
        n_samples = X.shape[0]

        if isinstance(self.max_samples, six.string_types):
            raise ValueError('max_samples (%s) is not supported.'
                             'Valid choices are: "auto", int or'
                             'float' % self.max_samples)

        elif isinstance(self.max_samples, INTEGER_TYPES):
            if self.max_samples > n_samples:
                warn("max_samples (%s) is greater than the "
                     "total number of samples (%s). max_samples "
                     "will be set to n_samples for estimation."
                     % (self.max_samples, n_samples))
                max_samples = n_samples
            else:
                max_samples = self.max_samples
        else:  # float
            if not (0. < self.max_samples <= 1.):
                raise ValueError("max_samples must be in (0, 1], got %r"
                                 % self.max_samples)
            max_samples = int(self.max_samples * X.shape[0])

        self.max_samples_ = max_samples

        self.rules_ = {}
        self.estimators_ = []
        self.estimators_samples_ = []
        self.estimators_features_ = []

        # default columns names :
        feature_names_ = [BASE_FEATURE_NAME + x for x in
                          np.arange(X.shape[1]).astype(str)]
        if self.feature_names is not None:
            self.feature_dict_ = {BASE_FEATURE_NAME + str(i): feat
                                  for i, feat in enumerate(self.feature_names)}
        else:
            self.feature_dict_ = {BASE_FEATURE_NAME + str(i): feat
                                  for i, feat in enumerate(feature_names_)}
        self.feature_names_ = feature_names_

        clfs = []
        regs = []

        self._max_depths = self.max_depth \
            if isinstance(self.max_depth, Iterable) else [self.max_depth]

        for max_depth in self._max_depths:
            bagging_clf = BaggingClassifier(
                base_estimator=DecisionTreeClassifier(
                    max_depth=max_depth,
                    max_features=self.max_features,
                    min_samples_split=self.min_samples_split),
                n_estimators=self.n_estimators,
                max_samples=self.max_samples_,
                max_features=self.max_samples_features,
                bootstrap=self.bootstrap,
                bootstrap_features=self.bootstrap_features,
                # oob_score=... XXX may be added
                # if selection on tree perf needed.
                # warm_start=... XXX may be added to increase computation perf.
                n_jobs=self.n_jobs,
                random_state=self.random_state,
                verbose=self.verbose)

            bagging_reg = BaggingRegressor(
                base_estimator=DecisionTreeRegressor(
                    max_depth=max_depth,
                    max_features=self.max_features,
                    min_samples_split=self.min_samples_split),
                n_estimators=self.n_estimators,
                max_samples=self.max_samples_,
                max_features=self.max_samples_features,
                bootstrap=self.bootstrap,
                bootstrap_features=self.bootstrap_features,
                # oob_score=... XXX may be added
                # if selection on tree perf needed.
                # warm_start=... XXX may be added to increase computation perf.
                n_jobs=self.n_jobs,
                random_state=self.random_state,
                verbose=self.verbose)

            clfs.append(bagging_clf)
            regs.append(bagging_reg)

        # define regression target:
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            weights = sample_weight - sample_weight.min()
            contamination = float(sum(y)) / len(y)
            y_reg = (
                    pow(weights, 0.5) * 0.5 / contamination * (y > 0) -
                    pow((weights).mean(), 0.5) * (y == 0))
            y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
        else:
            y_reg = y  # same as any other classification bagging

        for clf in clfs:
            clf.fit(X, y)
            self.estimators_ += clf.estimators_
            self.estimators_samples_ += clf.estimators_samples_
            self.estimators_features_ += clf.estimators_features_

        for reg in regs:
            reg.fit(X, y_reg)
            self.estimators_ += reg.estimators_
            self.estimators_samples_ += reg.estimators_samples_
            self.estimators_features_ += reg.estimators_features_

        rules_ = []
        for estimator, samples, features in zip(self.estimators_,
                                                self.estimators_samples_,
                                                self.estimators_features_):

            # Create mask for OOB samples
            mask = ~samples
            if sum(mask) == 0:
                warn("OOB evaluation not possible: doing it in-bag."
                     " Performance evaluation is likely to be wrong"
                     " (overfitting) and selected rules are likely to"
                     " not perform well! Please use max_samples < 1.")
                mask = samples
            rules_from_tree = self._tree_to_rules(
                estimator, np.array(self.feature_names_)[features])

            # XXX todo: idem without dataframe
            X_oob = pandas.DataFrame((X[mask, :])[:, features],
                                     columns=np.array(
                                         self.feature_names_)[features])

            if X_oob.shape[1] > 1:  # otherwise pandas bug (cf. issue #16363)
                y_oob = y[mask]
                y_oob = np.array((y_oob != 0))

                # Add OOB performances to rules:
                rules_from_tree = [(r, self._eval_rule_perf(r, X_oob, y_oob))
                                   for r in set(rules_from_tree)]
                rules_ += rules_from_tree

        # Factorize rules before semantic tree filtering
        rules_ = [
            tuple(rule)
            for rule in
            [Rule(r, args=args) for r, args in rules_]]

        # keep only rules verifying precision_min and recall_min:
        for rule, score in rules_:
            if score[0] >= self.precision_min and score[1] >= self.recall_min:
                if rule in self.rules_:
                    # update the score to the new mean
                    c = self.rules_[rule][2] + 1
                    b = self.rules_[rule][1] + 1. / c * (
                            score[1] - self.rules_[rule][1])
                    a = self.rules_[rule][0] + 1. / c * (
                            score[0] - self.rules_[rule][0])

                    self.rules_[rule] = (a, b, c)
                else:
                    self.rules_[rule] = (score[0], score[1], 1)

        self.rules_ = sorted(self.rules_.items(),
                             key=lambda x: (x[1][0], x[1][1]), reverse=True)

        # Deduplicate the rules using the semantic tree
        if self.max_depth_duplication is not None:
            self.rules_ = self.deduplicate(self.rules_)

        self.rules_ = sorted(self.rules_, key=lambda x: - self.f1_score(x))
        self.rules_without_feature_names_ = self.rules_

        # Replace generic feature names by real feature names
        self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                       for rule, perf in self.rules_]

        return self
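The OOB rule evaluation above depends on knowing which samples each bagged tree saw. A minimal stand-alone sketch, assuming a recent scikit-learn where estimators_samples_ exposes the drawn sample indices (older releases returned boolean masks, which is what the code above expects):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
bag = BaggingRegressor(DecisionTreeRegressor(max_depth=3),
                       n_estimators=10, max_samples=0.7,
                       random_state=0).fit(X, y)

for est, sample_idx in zip(bag.estimators_, bag.estimators_samples_):
    oob_mask = np.ones(X.shape[0], dtype=bool)
    oob_mask[sample_idx] = False       # True where this estimator never saw the sample
    if not oob_mask.any():             # nothing left out: fall back to in-bag, as above
        oob_mask[:] = True
    oob_r2 = est.score(X[oob_mask], y[oob_mask])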
class UserQualityRegressor(BaseEstimator):
    def __init__(self):
        pass
#   Trains the ML model using the interaction & label data of all users who have at least 25 labels validated

    def fit(self,
            label_correctness_file,
            point_labels_file,
            users_file,
            user_quality_features_file='all_users.csv'):
        users = pd.read_csv(users_file)
        users = users.set_index('user_id')
        y_train = users['accuracy']

        users_for_training = users[users['labels_validated'] > 25].index
        self.label_correctness = extract_label_features(
            point_labels_file, label_correctness_file)
        #  Splits the users into training & testing groups
        user_quality_features = pd.read_csv(
            user_quality_features_file).set_index('user_id')
        half = int(len(users_for_training) / 2)
        users_labels_train = users_for_training[:half]
        users_labels_test = users_for_training[half:]

        #         mask = np.random.permutation(np.arange(len(users_for_training)))
        #         users_labels_train = users_for_training[mask[:int(proportion_labels * len(mask))]]
        #         users_labels_test = users_for_training[mask[int(proportion_labels * len(mask)):]]

        train_labels = self.label_correctness.copy()
        train_labels = train_labels[~pd.isna(train_labels['correct'])]
        train_labels = train_labels[~(pd.isna(train_labels[features]).any(
            axis=1))]

        #         en = OrdinalEncoder()
        #         en.fit(pd.concat((train_labels[['CLASS_DESC']], test_labels[['CLASS_DESC']])))
        #         train_labels[['CLASS_DESC']] = en.transform(train_labels[['CLASS_DESC']])

        self.rfe_labels = RFECV(
            estimator=RandomForestClassifier(n_estimators=10),
            step=1,
            cv=StratifiedKFold(5),
            scoring='precision')
        self.clf_labels = RandomForestClassifier(random_state=0,
                                                 n_jobs=-1,
                                                 n_estimators=30)
        self.clf_accuracy = BaggingRegressor(random_state=0,
                                             n_jobs=-1,
                                             n_estimators=30)
        self.rfe_accuracy = RFECV(
            estimator=RandomForestClassifier(n_estimators=10),
            step=1,
            cv=StratifiedKFold(5),
            scoring='f1')

        print('Training label classifier...')
        self.rfe_labels.fit(
            train_labels[train_labels['user_id'].isin(users_labels_train)]
            [features].values, train_labels[train_labels['user_id'].isin(
                users_labels_train)]['correct'].astype(int))

        self.clf_labels.fit(
            train_labels[train_labels['user_id'].isin(users_labels_train)]
            [features].values[:, self.rfe_labels.support_],
            train_labels[train_labels['user_id'].isin(
                users_labels_train)]['correct'].astype(int))

        train_labels = train_labels.join(pd.Series(
            data=self.clf_labels.predict_proba(
                train_labels[train_labels['user_id'].isin(users_labels_test)]
                [features].values[:, self.rfe_labels.support_])[:, 1],
            index=train_labels[train_labels['user_id'].isin(
                users_labels_test)].index).rename('prob'),
                                         how='outer')

        prob_hist_predictions = pd.DataFrame(train_labels[train_labels['user_id'].isin(users_labels_test)]
            .groupby('user_id').apply(lambda x:\
            prob_hist(x['prob'].values)).rename('prob'))

        prob_hist_predictions = prob_hist_predictions.join(
            user_quality_features)

        print('Training accuracy classifier...')
        self.rfe_accuracy.fit(
            np.concatenate((dearray(prob_hist_predictions['prob']),
                            prob_hist_predictions.drop(columns='prob').values),
                           axis=1),
            y_train.loc[prob_hist_predictions.index] > 65)

        self.clf_accuracy.fit(
            np.concatenate((dearray(prob_hist_predictions['prob']),
                            prob_hist_predictions.drop(columns='prob').values),
                           axis=1)[:, self.rfe_accuracy.support_],
            y_train.loc[prob_hist_predictions.index])

# Creates the prediction of the given user's accuracy based on their passed-in label & interaction data

    def __predict_accuracy(self, probs, user_features):
        return self.clf_accuracy.predict([
            np.concatenate(
                (prob_hist(probs), user_features))[self.rfe_accuracy.support_]
        ])[0]
# Takes in all the names of the files and creates a prediction of the given user's accuracy

    def predict_one_user(self, filename, label_correctness_file,
                         point_labels_file, panos_file, user_name):
        user_features, user_features_header, user_id = extract_user_features(
            filename, panos_file, user_name)
        label_correctness = extract_label_features(point_labels_file,
                                                   label_correctness_file)
        label_correctness = label_correctness[~(
            pd.isna(label_correctness[features]).any(axis=1))]
        label_correctness = label_correctness[label_correctness['user_id'] ==
                                              user_id]
        probs = self.clf_labels.predict(
            label_correctness[features].values[:, self.rfe_labels.support_])
        return self.__predict_accuracy(probs, user_features)

    def save(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.__dict__, f)

    @classmethod
    def load(cls, filename):
        model = cls()
        with open(filename, 'rb') as f:
            model.__dict__ = pickle.load(f)
        return model
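A possible usage sketch for the persistence helpers above (file names and the user name are placeholders, not taken from the original code):

model = UserQualityRegressor()
model.fit('label_correctness.csv', 'point_labels.csv', 'users.csv')   # placeholder paths
model.save('user_quality_model.pkl')

restored = UserQualityRegressor.load('user_quality_model.pkl')
print(restored.predict_one_user('interactions.csv', 'label_correctness.csv',
                                'point_labels.csv', 'panos.csv', 'some_user'))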
Esempio n. 38
0
if __name__ == "__main__":
    np.random.seed(0)
    N = 200
    x = np.random.rand(N) * 10 - 5  # [-5,5)
    x = np.sort(x)
    y = f(x) + 0.05 * np.random.randn(N)
    x.shape = -1, 1

    degree = 6
    n_estimators = 50
    max_samples = 0.5
    ridge = RidgeCV(alphas=np.logspace(-3, 2, 20), fit_intercept=False)
    ridged = Pipeline([('poly', PolynomialFeatures(degree=degree)),
                       ('Ridge', ridge)])
    bagging_ridged = BaggingRegressor(ridged,
                                      n_estimators=n_estimators,
                                      max_samples=max_samples)
    dtr = DecisionTreeRegressor(max_depth=9)
    regs = [('DecisionTree', dtr), ('Ridge(%d Degree)' % degree, ridged),
            ('Bagging Ridge(%d Degree)' % degree, bagging_ridged),
            ('Bagging DecisionTree',
             BaggingRegressor(dtr,
                              n_estimators=n_estimators,
                              max_samples=max_samples))]
    x_test = np.linspace(1.1 * x.min(), 1.1 * x.max(), 1000)
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(8, 6), facecolor='w')
    plt.plot(x, y, 'ro', mec='k', label='training data')
    plt.plot(x_test, f(x_test), color='k', lw=3, ls='-', label='ground truth')
    clrs = '#FF2020', 'm', 'y', 'g'
Esempio n. 39
0
 def baggingRegressor(self):
     if self.dtr is None:
         self.decisionTreeRegressor()
     return BaggingRegressor(self.dtr, n_estimators=100, max_samples=0.2)
    return(out)

log = LogisticRegression(solver = 'sag')
lm = LinearRegression()

write_function(test['y'], '/tmp/truths.txt')

print('optimizing samples')
for n_samp in [0.1, 0.25, 0.33, 0.5, 0.75, 1.0]:
    for n_feat in [0.1, 0.25, 0.33, 0.5, 0.75, 1.0]:    
        
        lm_bagged = BaggingRegressor(
          base_estimator = lm, 
          n_estimators = 75, 
          max_samples = n_samp, 
          max_features = n_feat,
          bootstrap = True, 
          oob_score = False, 
          warm_start = False, 
          n_jobs = -1
        )
        
        log_bagged = BaggingClassifier(
          base_estimator = log, 
          n_estimators = 75, 
          max_samples = n_samp, 
          max_features = n_feat,
          bootstrap = True, 
          oob_score = False, 
          warm_start = False, 
          n_jobs = -1
        )
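The excerpt stops before either bagged model is fitted; a rough sketch of how the loop body might continue, assuming the train/test frames above keep the target in a 'y' column and numpy is imported as np:

        feature_cols = [c for c in train.columns if c != 'y']   # assumed feature layout
        lm_bagged.fit(train[feature_cols], train['y'])
        preds = lm_bagged.predict(test[feature_cols])
        rmse = float(np.sqrt(np.mean((test['y'].values - preds) ** 2)))
        print('max_samples=%.2f max_features=%.2f RMSE=%.3f' % (n_samp, n_feat, rmse))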
Esempio n. 41
0
    # Split the data into training : cv = 7 : 3
    split_train, split_cv = model_selection.train_test_split(df,
                                                             test_size=0.3,
                                                             random_state=0)
    train_df = split_train.filter(
        regex=
        'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
    )

    # Fit into a BaggingRegressor
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    clf = BaggingRegressor(clf,
                           n_estimators=20,
                           max_samples=0.8,
                           max_features=1.0,
                           bootstrap=True,
                           bootstrap_features=False,
                           n_jobs=-1)
    clf.fit(X, y)

    cv_df = split_cv.filter(
        regex=
        'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
    )
    predictions = clf.predict(cv_df.as_matrix()[:, 1:])

    # bad_cases = data_train.loc[data_train['PassengerId'].isin(split_cv[predictions != cv_df.as_matrix()[:,0]]['PassengerId'].values)]

    data_test = pd.read_csv("test.csv")
    # Mean
Esempio n. 42
0
(2) Multiple models (model ensembling): use a Bagging strategy to ensemble LR
from sklearn.ensemble import BaggingRegressor

train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
train_np = train_df.as_matrix()

# y is the Survived outcome
y = train_np[:, 0]

# X holds the feature attribute values
X = train_np[:, 1:]

# Fit into a BaggingRegressor
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_clf = BaggingRegressor(clf, n_estimators=20, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y)

test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
predictions = bagging_clf.predict(test)
result = pd.DataFrame({'PassengerId':data_test['PassengerId'].as_matrix(), 'Survived':predictions.astype(np.int32)})
result.to_csv("/Users/HanXiaoyang/Titanic_data/logistic_regression_bagging_predictions.csv", index=False)


------------------------------------------------------------------------------

-------------------------------------------------------------------------------
"""七、工具"""

"""训练xgb模型对特征进行重要性排序,特征选择"""
df_y = df_Master['target'] 
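The snippet is cut off right after extracting the target; a minimal sketch of the importance ranking it describes, assuming df_Master holds the remaining feature columns (the column handling and hyper-parameters here are illustrative, not the original code):

import pandas as pd
import xgboost as xgb

df_X = df_Master.drop(columns=['target'])               # assumed: everything else is a feature
xgb_model = xgb.XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1)
xgb_model.fit(df_X, df_y)

importance = pd.Series(xgb_model.feature_importances_, index=df_X.columns)
print(importance.sort_values(ascending=False).head(30))  # top features for selection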
Esempio n. 43
0
        if target == 'lat':
            y = y[:, 0]
        elif target == 'lon':
            y = y[:, 1]
            # print(y)
        if phase == 'train':
            model.fit(X, y)

        y_pred = model.predict(X)
        total_loss = calc_loss(y, y_pred)

        print("{} {} loss is: {:.5f}".format(phase, target, total_loss))

for target in ['lat', 'lon']:
    # print(target)
    model = BaggingRegressor(n_estimators=1, max_features=0.2)
    for phase in ['train', 'val']:
        running_loss = 0.0
        dl = dataloader[phase]
        X = dl[0]
        y = dl[1]
        if target == 'lat':
            y = y[:, 0]
        elif target == 'lon':
            y = y[:, 1]
            # print(y)
        if phase == 'train':
            model.fit(X, y)

        y_pred = model.predict(X)
        total_loss = calc_loss(y, y_pred)
Esempio n. 44
0
def test_parallel():
    """Check parallel computations."""
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        # predict_proba
        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict_proba(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y3)

        # decision_function
        ensemble = BaggingClassifier(SVC(), n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        decisions1 = ensemble.decision_function(X_test)
        ensemble.set_params(n_jobs=2)
        decisions2 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions2)

        ensemble = BaggingClassifier(SVC(), n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        decisions3 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions3)

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=n_jobs,
                                    random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=1,
                                    random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y3)
Esempio n. 45
0
        ["xgboost_Dummies", ""],
        ["xgboost_Label", ""],
        ["xgboost_Vect", ""],
    ]
    full_predictions = []
    for alg, predictors in algorithms:
        if alg == "xgboost_Label":
            full_predictions.append(xgboost_Label(train, test, labels))
        elif alg == "xgboost_Vect":
            full_predictions.append(xgboost_Vect(train, test, labels))
        elif alg == "xgboost_Dummies":
            full_predictions.append(xgboost_Dummies(train, test, labels))
        else:
            if predictors == "dummies":
                print ("Train ", alg.__class__.__name__, " dummies Model ")
                alg = BaggingRegressor(alg)
                alg.fit(train_du, labels)
                print "Prediction :", alg.__class__.__name__, " dummies Model "
                prediction = alg.predict(test_du)
                full_predictions.append(prediction)
            else:
                print ("Train ", alg.__class__.__name__, " Label Model ")
                alg = BaggingRegressor(alg)
                alg.fit(train_rf, labels)
                print "Prediction :", alg.__class__.__name__, " Label Model "
                prediction = alg.predict(test_rf)
                full_predictions.append(prediction)

                # Ensemble models
    RF_label_pred = full_predictions[0]
    RF_dummies_pred = full_predictions[1]
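The blending step itself is cut off here; one simple way to finish it, with illustrative equal weights (not the original author's weighting):

    final_pred = 0.5 * RF_label_pred + 0.5 * RF_dummies_pred
    # or average everything that was collected:
    # final_pred = sum(full_predictions) / float(len(full_predictions))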
Esempio n. 46
0
mlp = MLPRegressor()
mlpFit = mlp.fit(x_train, y_train)

regr = AdaBoostRegressor(random_state=0, n_estimators=100)
regrFit = regr.fit(x_train, y_train)

clfRidge = Ridge(alpha=1.0)
clfRidgeFit = clfRidge.fit(x_train, y_train)

clfBayesian = linear_model.BayesianRidge()
clfBayesianFit = clfBayesian.fit(x_train, y_train)

reg = linear_model.LassoLars(alpha=0.01)
regFit = reg.fit(x_train, y_train)

bag = BaggingRegressor()
bagFit = bag.fit(x_train, y_train)

DT_MAD = mean_absolute_error(y_test, DT_regressionFit.predict(x_test))
SVR_MAD = mean_absolute_error(y_test, svr_regressionFit.predict(x_test))
KNN_MAD = mean_absolute_error(y_test, neighFit.predict(x_test))
MLP_MAD = mean_absolute_error(y_test, mlpFit.predict(x_test))
regr_MAD = mean_absolute_error(y_test, regrFit.predict(x_test))
clfRidge_MAD = mean_absolute_error(y_test, clfRidgeFit.predict(x_test))
clfBayesion_MAD = mean_absolute_error(y_test, clfBayesianFit.predict(x_test))
reg_MAD = mean_absolute_error(y_test, regFit.predict(x_test))
bag_MAD = mean_absolute_error(y_test, bagFit.predict(x_test))

print('Regression Tree MAD: ' + str(DT_MAD))
print('Support Vector Regression MAD ' + str(SVR_MAD))
print('KNN MAD ' + str(KNN_MAD))
Esempio n. 47
0
import numpy as np
import pickle
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt
from sklearn import linear_model
#from sklearn import svm
from sklearn.decomposition import PCA

model = BaggingRegressor(DecisionTreeRegressor(max_depth=20, min_samples_split=20, min_samples_leaf=1), n_estimators = 50)

(X, y) = pickle.load(open('train.pickle'))

scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

index = [15, 17, 18, 7, 0, 1, 2, 22, 23, 24, 9, 10, 4, 8, 5, 13, 14, 19, 20, 21, 16, 11, 12, 3, 25, 26, 6]

X = X[:, index[:5]]
#X = np.random.rand(X.shape[0], 1)
#scaler = preprocessing.StandardScaler().fit(X)
#X = scaler.transform(X)


#X = X[:, range(12) + range(13, X.shape[1])]

#pca = PCA(n_components=0.99)
#pca.fit(X)
    def process(i):
        current_dat = header.iloc[i]
        current_dat_name = current_dat['bench.id']
        # Define Datasets
        # print(str(i)+': '+current_dat_name)
        filename = '/Users/apple/Documents/AD_Datasets/' + dataname + '/benchmarks/' + current_dat_name + '.csv'
        with open(filename, 'r') as csvfile:
            reader = csv.reader(csvfile)
            data = list(reader)
        data = np.array(data)

        X_train = data[1:, 6:].astype('double')
        anomaly_type = data[1:, 5]
        y_label = np.zeros(len(anomaly_type))
        # normal_ind = np.where(anomaly_type == 'nominal')[0]
        anomaly_ind = np.where(anomaly_type == 'anomaly')[0]
        y_label[anomaly_ind] = 1
        # X_normal = X_train[normal_ind,:]
        # X_outlier = X_train[anomaly_ind,:]
        # contamination = len(anomaly_ind)/len(y_label)

        rng = np.random.RandomState(42)

        # BaggedDTM
        #     #################################################################################################
        #     # max_samples = min(2048,X_train.shape[0])
        #     # y = np.random.uniform(size=X_train.shape[0])
        #     # bag_neigh = max(10, int(np.floor(0.03 * max_samples)))
        #     # clf_bagDTM = BaggingRegressor(base_estimator=DTM(n_neighbors=bag_neigh,contamination=0.1),
        #     #                               n_estimators=100, max_samples=max_samples, bootstrap=False, random_state=rng)
        #     # y_score_BDTM = clf_bagDTM.fit(X_train, y).predict(X_train)
        #     # # fpr_DTM, tpr_DTM, thresholds_DTM = roc_curve(y_label, -DTM_score)
        #     # auc_BDTM_score = roc_auc_score(y_label, -y_score_BDTM)
        #     # ap_BDTM_score = average_precision_score(y_label, -y_score_BDTM)

        # sp
        #################################################################################################
        max_samples = min(20, X_train.shape[0])
        y = np.random.uniform(size=X_train.shape[0])
        bag_neigh = 1
        clf_spDTM = BaggingRegressor(base_estimator=DTM(n_neighbors=bag_neigh,
                                                        contamination=0.1),
                                     n_estimators=1,
                                     max_samples=max_samples,
                                     bootstrap=False,
                                     random_state=rng)
        y_score_spDTM = clf_spDTM.fit(X_train, y).predict(X_train)
        auc_spDTM_score = roc_auc_score(y_label, -y_score_spDTM)
        ap_spDTM_score = average_precision_score(y_label, -y_score_spDTM)

        # aNNE
        #################################################################################################
        clf_aNNE = BaggingRegressor(base_estimator=DTM(n_neighbors=bag_neigh,
                                                       contamination=0.1),
                                    n_estimators=100,
                                    max_samples=max_samples,
                                    bootstrap=False,
                                    random_state=rng)
        y_score_aNNE = clf_aNNE.fit(X_train, y).predict(X_train)
        auc_aNNE_score = roc_auc_score(y_label, -y_score_aNNE)
        ap_aNNE_score = average_precision_score(y_label, -y_score_aNNE)

        return [auc_spDTM_score,
                auc_aNNE_score], [ap_spDTM_score, ap_aNNE_score]
Esempio n. 49
0
#     midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
#     diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
#     return midpoint, diff
#
# plot_learning_curve(model, u"learning  curve", X, Y)


# 6................... Model ensembling .....................#


train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.as_matrix()

# y is the Survived outcome
y = train_np[:, 0]

x = train_np[:, 1:]


# Fit into a BaggingRegressor
model = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_model = BaggingRegressor(model,n_estimators=20,max_samples=0.8,max_features=1.0,bootstrap=True,bootstrap_features=False,n_jobs=-1)
bagging_model.fit(x,y)

test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = bagging_model.predict(test)
result = pd.DataFrame({'PassengerId':data_test['PassengerId'].as_matrix(),'Survived':predictions.astype(np.int32 )})
result.to_csv('./result.csv',index=False)


from sklearn.ensemble import RandomForestRegressor

from sklearn.multioutput import MultiOutputRegressor

# to set number of jobs to the number of cores, use n_jobs=-1
model = MultiOutputRegressor(GradientBoostingRegressor(), n_jobs=1).fit(train_scaled[0:-1,:], train_scaled[1:,:])
models.append(model)
modeldims.append(2)
modelnames.append('GradientBoostingRegressor')

model = MultiOutputRegressor(AdaBoostRegressor(), n_jobs=1).fit(train_scaled[0:-1,:], train_scaled[1:,:])
models.append(model)
modeldims.append(2)
modelnames.append('AdaBoostRegressor')

model = MultiOutputRegressor(BaggingRegressor(), n_jobs=1).fit(train_scaled[0:-1,:], train_scaled[1:,:])
models.append(model)
modeldims.append(2)
modelnames.append('BaggingRegressor')

model = MultiOutputRegressor(ExtraTreesRegressor(), n_jobs=1).fit(train_scaled[0:-1,:], train_scaled[1:,:])
models.append(model)
modeldims.append(2)
modelnames.append('ExtraTreesRegressor')

model = MultiOutputRegressor(RandomForestRegressor(), n_jobs=1).fit(train_scaled[0:-1,:], train_scaled[1:,:])
models.append(model)
modeldims.append(2)
modelnames.append('RandomForestRegressor')

model = MultiOutputRegressor(SVR(), n_jobs=1).fit(train_scaled[0:-1,:], train_scaled[1:,:])
'r-',tz=None, xdate=True, ydate=False)
ax2.set_title('Error between actual and predicted loads')
ax2.set_ylabel("Error, MW")

featImportances=gradBoost.feature_importances_
pos = np.arange(len(features))
pairs = zip(features, featImportances)
sorted_pairs = sorted(pairs, key = lambda pair: pair[1])
features_sorted, featImportances_sorted = zip(*sorted_pairs)
fig, ax = plt.subplots()
plt.barh(pos, featImportances_sorted, 1, color = "blue")
plt.yticks(pos,features_sorted)
ax.set_title('Gradient Boosting: Relative Feature Importance')

#Tree Bagging
TreeBagger=BaggingRegressor()
TreeBagger.fit(Xtrain, Ytrain)
fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1)
ax1.plot_date(dates, modeldata.Load[45000:50000], 'r-',tz=None, xdate=True,
          ydate=False, label='Actual Load')
ax1.set_title('Tree Bagging: Actual and Predicted Loads')          
plt.plot(dates, TreeBagger.predict(Xtest), 'g-',label='Predicted Load')
ax1.legend()
ax2 = fig.add_subplot(2, 1, 2)
ax2.plot_date(dates, modeldata.Load[45000:50000]-TreeBagger.predict(Xtest), 'r-',tz=None, xdate=True,
          ydate=False)
ax2.set_title('Error between actual and predicted loads, MW')

MSEs_Bagging=[mean_squared_error(Ytest, TreeBagger.predict(Xtest)), mean_squared_error(Ytrain, TreeBagger.predict(Xtrain))]
Esempio n. 52
0
    def linear_regression_algo(self):

        X = []
        Y = []

        with open('../Data/full_table.csv', 'r') as file:
            for line in csv.reader(file, delimiter=','):
                if len(line) == 13:
                    try:
                        zhvi = float(line[5])
                        property_type = line[6]
                        room_type = line[7]
                        accommodates = int(line[8])
                        bathrooms = float(line[9])
                        beds = int(line[10])
                        bed_type = line[11]
                        price = float(line[12])

                        x = {
                            'zhvi': zhvi,
                            'property_type': property_type,
                            'room_type': room_type,
                            'accommodates': accommodates,
                            'bathrooms': bathrooms,
                            'beds': beds,
                            'bed_type': bed_type
                        }

                        y = price

                        X.append(x)
                        Y.append(y)

                    except:
                        pass

        # The DictVectorizer converts data from a dictionary to an array
        vec = DictVectorizer()

        # Convert X to Array
        X = vec.fit_transform(X).toarray()

        # Split X and Y into training and testing sets
        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.33,
                                                            random_state=43)

        # Linear Regression
        model = linear_model.LinearRegression()
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)
        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        print('Linear Regression')
        print('Mean Squared Error: {0}'.format(mse))
        print('Mean Absolute Error: {0}'.format(mae))
        print('R2 Score: {0}'.format(r2))

        # With Boosting
        model_boost = AdaBoostRegressor(linear_model.LinearRegression())
        model_boost.fit(X_train, Y_train)
        Y_pred = model_boost.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)
        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        print('Linear Regression (with AdaBoost)')
        print('Mean Squared Error: {0}'.format(mse))
        print('Mean Absolute Error: {0}'.format(mae))
        print('R2 Score: {0}'.format(r2))

        # With Bagging
        model_bag = BaggingRegressor(linear_model.LinearRegression())
        model_bag.fit(X_train, Y_train)
        Y_pred = model_bag.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)
        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        print('Linear Regression (with Bagging)')
        print('Mean Squared Error: {0}'.format(mse))
        print('Mean Absolute Error: {0}'.format(mae))
        print('R2 Score: {0}'.format(r2))
            figure_width,
            figure_height,
            correct_orientation,
            cmap=cmap)


names = [
    "KNeighborsRegressor", "Bagging KNN Essembler", "XGBoost",
    "GradientBoostingRegressor", "AdaBoost + KNN", "AdaBoost", "Random Forest",
    "SVC + RandomForest Pipeline", "ExtraTreesRegressor"
]

classifiers = [
    #KNeighborsRegressor(13),
    KNeighborsRegressor(26),
    BaggingRegressor(KNeighborsRegressor(n_neighbors=30)),
    xgb.XGBRegressor(
        max_depth=6,
        n_estimators=55,  #55
        learning_rate=0.05,
        min_child_weight=60,
        nthread=8,
        subsample=0.95,  #95
        colsample_bytree=0.95,  # 95
        # subsample=1.00,
        # colsample_bytree=1.00,
        seed=482),
    # xgb.XGBRegressor(n_estimators=300, max_depth=7, min_child_weight=2,
    #                            learning_rate=0.01, subsample=0.80, colsample_bytree=0.70,
    #                            seed=818, reg_alpha=0.1),
    GradientBoostingRegressor(n_estimators=250,
Esempio n. 54
0
#### 3.4 KNN regression ####
from sklearn import neighbors

model_KNeighborsRegressor = neighbors.KNeighborsRegressor()
#### 3.5 Random forest regression ####
from sklearn import ensemble

model_RandomForestRegressor = ensemble.RandomForestRegressor(
    n_estimators=20)  # 20 decision trees are used here
#### 3.6 AdaBoost regression ####
from sklearn import ensemble

model_AdaBoostRegressor = ensemble.AdaBoostRegressor(
    n_estimators=50)  # 50 decision trees are used here
#### 3.7 GBRT regression ####
from sklearn import ensemble

model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(
    learning_rate=0.2, n_estimators=200)  # 200 decision trees are used here
#### 3.8 Bagging regression ####
from sklearn.ensemble import BaggingRegressor

model_BaggingRegressor = BaggingRegressor()
#### 3.9 ExtraTree (extremely randomized tree) regression ####
from sklearn.tree import ExtraTreeRegressor

model_ExtraTreeRegressor = ExtraTreeRegressor()

########### 4. Calling the methods ##########
try_different_method(model_LinearRegression)
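try_different_method itself is not shown in this excerpt; a plausible definition, assuming x_train/x_test/y_train/y_test arrays, numpy as np, and matplotlib.pyplot as plt are already available (as elsewhere in this tutorial):

def try_different_method(model):
    # Fit one regressor, report its R^2 score and plot predictions against true values
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    y_pred = model.predict(x_test)
    plt.figure()
    plt.plot(np.arange(len(y_test)), y_test, 'go-', label='true value')
    plt.plot(np.arange(len(y_test)), y_pred, 'ro-', label='predicted value')
    plt.title('score: %f' % score)
    plt.legend()
    plt.show()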
    plt.plot(t, x, 'r-', lw=1, label=u'original data')
    plt.plot(abnormal,
             x[abnormal],
             'go',
             markeredgecolor='g',
             ms=3,
             label=u'anomalies')
    plt.legend(loc='upper right')
    plt.title(u'Anomaly detection', fontsize=18)
    plt.grid(b=True)

    # Prediction
    plt.subplot(133)
    select = np.ones(N, dtype=np.bool)
    select[abnormal] = False
    t = np.arange(N)
    dtr = DecisionTreeRegressor(criterion='mse', max_depth=10)
    br = BaggingRegressor(dtr, n_estimators=10, max_samples=0.3)
    br.fit(t[select].reshape(-1, 1), x[select])
    y = br.predict(np.arange(N).reshape(-1, 1))
    y[select] = x[select]
    plt.plot(x, 'g--', lw=1, label=u'original values')   # original values
    plt.plot(y, 'r-', lw=1, label=u'corrected values')   # corrected values
    plt.legend(loc='upper right')
    plt.title(u'Outlier correction', fontsize=18)
    plt.grid(b=True)

    plt.tight_layout(1.5, rect=(0, 0, 1, 0.95))
    plt.suptitle(u'Outlier detection and correction for sewage data', fontsize=22)
    plt.show()
Esempio n. 56
0
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

# Settings
n_repeat = 50  # Number of iterations for computing expectations
n_train = 50  # Size of the training set
n_test = 1000  # Size of the test set
noise = 0.1  # Standard deviation of the noise
np.random.seed(0)

# Change this for exploring the bias-variance decomposition of other
# estimators. This should work well for estimators with high variance (e.g.,
# decision trees or KNN), but poorly for estimators with low variance (e.g.,
# linear models).
estimators = [("Tree", DecisionTreeRegressor()),
              ("Bagging(Tree)", BaggingRegressor(DecisionTreeRegressor()))]

n_estimators = len(estimators)


# Generate data
def f(x):
    x = x.ravel()

    return np.exp(-x**2) + 1.5 * np.exp(-(x - 2)**2)


def generate(n_samples, noise, n_repeat=1):
    X = np.random.random(n_samples) * 10 - 5
    X = np.sort(X)
Esempio n. 57
0
    # Get rid of Nan values
    X[np.isnan(X)] = 0.

    print '******************************************'
    print name
    print '******************************************'
    
    if name=='Boston' or name=='Diabetes': # Regression problem
    
        rfr = RandomForestRegressor(**params)
        rfr.fit(X, y)
        print 'Score RandomForestRegressor = %s' % (rfr.score(X, y))
        scores_rfr = cross_val_score(rfr, X, y ,cv=5)
        print 'Cross Val Score RandomForestRegressor = %s' % (np.mean(scores_rfr))
        
        br = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=max_depth), n_estimators=n_estimators)
        br.fit(X, y)
        print 'Score BaggingRegressor = %s' % (br.score(X, y))
        scores_br = cross_val_score(br, X, y, cv=5)
        print 'Cross Val Scores of BR = %s' %(np.mean(scores_br))
        
    if name=='Iris' or name=='Digits': # Classification problem
    
        rfc = RandomForestClassifier(**params)
        rfc.fit(X, y)
        print 'Score RandomForestClassifier = %s' % (rfc.score(X, y))
        scores_rfc = cross_val_score(rfc, X, y ,cv=5)
        print 'Cross Val Scores of RandomForestClassifier = %s' %(np.mean(scores_rfc))

        bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=max_depth), n_estimators=n_estimators)
        bc.fit(X, y)        
X_train, y_train = generate(n_samples=n_train, noise=noise)
X_test, y_test = generate(n_samples=n_test, noise=noise)

# One decision tree regressor
dtree = DecisionTreeRegressor().fit(X_train, y_train)
d_predict = dtree.predict(X_test)

plt.figure(figsize=(10, 8))
plt.plot(X_test, f(X_test), 'b')
plt.scatter(X_train, y_train, c='b', s=20)
plt.plot(X_test, d_predict, 'g', lw=2)
plt.xlim([-5, 5])
plt.title("Decision tree, MSE = %.2f" % np.sum((y_test - d_predict)**2))

# Bagging decision tree regressor
bdt = BaggingRegressor(DecisionTreeRegressor()).fit(X_train, y_train)
bdt_predict = bdt.predict(X_test)

plt.figure(figsize=(10, 8))
plt.plot(X_test, f(X_test), 'b')
plt.scatter(X_train, y_train, c='b', s=20)
plt.plot(X_test, bdt_predict, 'y', lw=2)
plt.xlim([-5, 5])
plt.title("Bagging decision tree, MSE = %.2f" % np.sum(
    (y_test - bdt_predict)**2))

# Random forest
rf = RandomForestRegressor(n_estimators=10).fit(X_train, y_train)
rf_predict = rf.predict(X_test)

plt.figure(figsize=(10, 8))
df_all['letter_in_description'] = df_all['product_info'].map(
        lambda x: str_common_letter(x.split('\t')[0], x.split('\t')[2]))

print("Drop columns that were changed...")
df_all = df_all.drop(['search_term', 'product_title', 'product_description', 'product_info'], axis=1)

# Set up training and test sets
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]

id_test = df_test['id']
y_train = df_train['relevance'].values

# Drop 'id' and 'relevance' columns from the training and test sets
X_train = df_train.drop(['id', 'relevance'], axis=1).values
X_test = df_test.drop(['id', 'relevance'], axis=1).values

# Setup RandomForest and Bagging Regressors
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)

# Fit the training data into the regression model using the output values
clf.fit(X_train, y_train)

# Run the prediction
y_pred = clf.predict(X_test)

# Set up our Data Frame
datafr = pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('../dataset/submission.csv', index=False)
print(datafr)
Esempio n. 60
0
del globals()['profiles']
del globals()['profilesLSo']
del globals()['profilesLS']
del globals()['row']
del globals()['tmpLS']
del globals()['tmpAGE']
del globals()['profsTOlikes']
del globals()['i']
del globals()['tmpIND']

seed = 7
myRand = np.random.seed(seed)
X_train, X_test, y_train, y_test = train_test_split(likesMAT,
                                                    consARR,
                                                    test_size=1500)

nJOBS = int(sys.argv[1])
nEST = int(sys.argv[2])
bagOUT = BaggingRegressor(n_jobs=nJOBS, n_estimators=nEST, oob_score=True)

#bagOUT.fit(likesMAT, consARR)
bagOUT.fit(X_train, y_train)

y_pred = bagOUT.predict(X_test)
import math
myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("cons, bagOUT:  ", str(nEST), " ", myRMSE)

# joblib.dump(bagOUT, "/Users/jamster/bagOUT-A-cons.xz", compress=9)

# impbagOUT = joblib.load("/Users/jamster/bagOUT-A-cons.xz")