Example #1
def train_model(train, test, labels):
    rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=10)
    #rf = RandomForestRegressor(n_estimators=45, max_depth=9, random_state=10)
    clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.2, random_state=25)
    clf.fit(train, labels)
    #clf = SVR(C=1.0, epsilon=0.2)
    #clf.fit(train, labels)
    #clf = GaussianNB()
    #clf.fit(train, labels)
    print("Good!")
    predictions = clf.predict(test)
    print(predictions.shape)
    predictions = pd.DataFrame(predictions, columns=['relevance'])
    print("Good again!")
    print("Predictions head -------")
    print(predictions.head())
    print(predictions.shape)
    print("TEST head -------")
    print(test.head())
    print(test.shape)
    #test['id'].to_csv("TEST_TEST.csv",index=False)
    #predictions.to_csv("PREDICTIONS.csv",index=False)
    #test = test.reset_index()
    #predictions = predictions.reset_index()
    #test = test.groupby(level=0).first()
    #predictions = predictions.groupby(level=0).first()
    predictions = pd.concat([test['id'],predictions], axis=1, verify_integrity=False)
    print(predictions)
    return predictions
def model_fit_rf_bagging():

	def in_limits(x):
		if x<1: return 1
		if x>3: return 3
		return x

	print("STARTING MODEL")
	X = full_data[['count_words','count_digits','match_d_title','match_d_description','match_w_title','match_w_description','match_d_attribute','match_w_attribute']].values
	y = full_data['relevance'].values
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
	
	rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
	clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
	clf.fit(X_train, y_train)
	y_pred = clf.predict(X_test)

	in_limits = np.vectorize(in_limits, otypes=[float])
	y_pred = in_limits(y_pred)
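	# note: equivalent to y_pred = np.clip(y_pred, 1, 3)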
	RMSE = mean_squared_error(y_test, y_pred)**0.5
	print("RMSE: ", RMSE)

	# for the submission
	real_X_test = real_full_test[['count_words','count_digits','match_d_title','match_d_description','match_w_title','match_w_description','match_d_attribute','match_w_attribute']].values
	test_pred = clf.predict(real_X_test)
	test_pred = in_limits(test_pred)

	return test_pred
Example #3
def avmPredict(params):
	town = getPlace(params['lat'], params['long'])[0]

	x, y, z = getXYZ(params['lat'], params['long'])

	r = 1.0

	data = []
	target = []
	header = []

	with open('../../../data/working22.csv') as f:
	
		f = csv.reader(f)
		header = next(f)

		for row in f:
			t = (list(map(float, row[:3] + row[4:])), float(row[3]))

			if weightF([x, y, z], t[0][0:3], r):
				data.append(t[0])
				target.append(t[1])

	ensemble = BaggingRegressor()
	ensemble.fit(data, target)

	test = createTest(params)
	return ensemble.predict(test)
def train_bagging_xgboost(X, Y):
    bagging = BaggingRegressor(xgb.XGBRegressor(max_depth=6, learning_rate=0.02, n_estimators=300, silent=True,
                                                objective='reg:linear', subsample=0.7, reg_alpha=0.8,
                                                reg_lambda=0.8, booster="gblinear"),
                               max_features=0.7, n_estimators=30)
    bagging.fit(X, Y)
    return bagging
Example #5
def random_forest(X,Y,Xt):
    print('learn')    
    rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
    clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
    clf.fit(X, Y)
    print('predict')
    Yp_clamped = clf.predict(Xt)
    return Yp_clamped
Example #6
def procedureA(goldenFlag = False):
	# Trains and generates a prediction file
	# Uses hard heuristic for buy_or_not

	popFlag = True
	X, Y = getDataXY(currYearFlag = False, popFlag = popFlag)
	X, Y = shuffle(X, Y, random_state = 0)

	if popFlag:
		encoder = oneHot(X[:, 2:])
		Xt = encoder.transform(X[:, 2:])
		Xt = np.hstack((X[:,:2], Xt))
	else:
		encoder = oneHot(X)
		Xt = encoder.transform(X)

	buySet = set()
	for i in range(X.shape[0]):
		tmpTup = (X[i][0], X[i][2])
		buySet.add(tmpTup)
	# Y_buy = [1] * Xt.shape[0]

	min_max_scaler = preprocessing.MinMaxScaler()

	# Xt = min_max_scaler.fit_transform(Xt)

	if goldenFlag:
		print(Xt.shape)
		Xt = getGoldenX(Xt, 2, 2 + encoder.feature_indices_[1], 2 + encoder.feature_indices_[0], 2 + min(9, encoder.feature_indices_[1]))


	split = 0.9
	X_train, X_test = Xt[:(int(Xt.shape[0]*split)),:], Xt[int(Xt.shape[0]*split):, :]
	Y_train, Y_test = Y[:(int(Y.shape[0]*split)),:], Y[int(Y.shape[0]*split):, :]
	Y_train = Y_train.ravel()
	Y_test = Y_test.ravel()

	print(X_train.shape)
	print(X_test.shape)

	# clf = Ridge(alpha = 100)
	# clf = SVR(C = 10.0, kernel = 'poly', degree = 2)
	# clf = LinearSVR(C = 1.0)
	clf = BaggingRegressor(DecisionTreeRegressor(), n_estimators = 125, n_jobs = 4, random_state = 0)
	# clf = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators = 100)
	# clf = DecisionTreeRegressor()
	# clf = RandomForestRegressor(random_state = 0, n_estimators = 200, n_jobs = 4)
	clf.fit(X_train, Y_train.ravel())

	Y_pred = clf.predict(X_test)
	evaluatePred(Y_pred, Y_test)

	return clf, encoder, min_max_scaler
Example #7
def train_model(training, testing, window=5, n=5):
	X_train, y_train = prepare_data(training)
	X_test, y_test = prepare_data(testing)
	rf = RandomForestRegressor()
	rf.fit(X_train, y_train)
	predrf = rf.predict(X_test)
	print("mse for random forest regressor: ", mean_squared_error(predrf, y_test))

	gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.025)
	gb.fit(X_train, y_train)
	predgb = gb.predict(X_test)
	print("mse for gradient boosting regressor: ", mean_squared_error(predgb, y_test))
	## plot feature importance using GBR results
	fx_imp = pd.Series(gb.feature_importances_, index=['bb', 'momentum', 'sma', 'volatility'])
	fx_imp /= fx_imp.max()  # normalize
	fx_imp = fx_imp.sort_values()
	ax = fx_imp.plot(kind='barh')
	fig = ax.get_figure()
	fig.savefig("output/feature_importance.png")

	adb = AdaBoostRegressor(DecisionTreeRegressor())
	adb.fit(X_train, y_train)
	predadb = adb.predict(X_test)
	print("mse for adaboosting decision tree regressor: ", mean_squared_error(predadb, y_test))

	scale = StandardScaler()
	scale.fit(X_train)
	X_trainscale = scale.transform(X_train)
	X_testscale = scale.transform(X_test)

	knn = BaggingRegressor(KNeighborsRegressor(n_neighbors=10), max_samples=0.5, max_features=0.5)
	knn.fit(X_trainscale, y_train)
	predknn = knn.predict(X_testscale)
	print("mse for bagging knn regressor: ", mean_squared_error(predknn, y_test))

	pred_test = 0.1*predrf + 0.2*predgb + 0.1*predadb + 0.6*predknn
	print("mse for ensemble of all the regressors: ", mean_squared_error(pred_test, y_test))
	result = testing.copy()
	result.loc[result.index[5:-5], 'trend'] = pred_test
	result.loc[result.index[10:], 'pred'] = pred_test * result['IBM'].values[5:-5]
	result.loc[result.index[:-5], 'pred_date'] = result.index[5:]

	return result
Example #8
def procc_modelfusion(df_test, data_test):
    from sklearn.ensemble import BaggingRegressor
    from sklearn import linear_model
    train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
    train_np = train_df.values

    # y is the Survived column
    y = train_np[:, 0]

    # X holds the feature values
    X = train_np[:, 1:]

    # Fit inside a BaggingRegressor
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    bagging_clf = BaggingRegressor(clf, n_estimators=10, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1)
    bagging_clf.fit(X, y)

    test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
    predictions = bagging_clf.predict(test)
    result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values, 'Survived': predictions.astype(np.int32)})
    result.to_csv("logistic_regression_predictions3.csv", index=False)
Example #9
class Regressor(BaseEstimator):
    def __init__(self):
        # self.clf = GradientBoostingRegressor(n_estimators=200, max_features="sqrt", max_depth=5)
        # self.clf = LinearRegression()
        self.clf = BaggingRegressor(LinearRegression())
        # self.clf = GaussianProcess(theta0=4)
        # self.sp = RandomizedLasso()
        self.sp = SparseRandomProjection(n_components=5)
        # self.sp = TruncatedSVD()
        # self.sp = KernelPCA(n_components=3, tol=0.0001, kernel="poly")
        # self.clf = ExtraTreesRegressor(n_estimators=200, max_features="sqrt", max_depth=5)

    def fit(self, X, y):
        # print(self.sp)
        # Xr = self.sp.fit_transform(X, y)
        self.clf.fit(X, y.ravel())

    def predict(self, X):
        # Xr = self.sp.transform(X)
        return self.clf.predict(X)
def get_bagging_prediction(X_train, y_train, X_test, X_valid=None, GS=False):
    if not GS:
        rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
        clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        if X_valid is None:
            return y_pred
        else:
            return y_pred, clf.predict(X_valid)
    else:
        rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
        clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
        param_grid = {'base_estimator__max_features': [10], 'base_estimator__max_depth': [20]}  # inner RF params are addressed via the base_estimator__ prefix
        model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, cv=2, verbose=VERBOSE, scoring=RMSE)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        if X_valid is None:
            return y_pred
        else:
            return y_pred, model.predict(X_valid)
Example #11
def runTests():

    # Generate the training samples, extract training features and target
    trainSamples = GenSamples(numSamples)
    trainFeatures = extractFeatures(trainSamples)
    trainPred = extractPred(trainSamples)

    # Generate the test samples, extract test features and target
    testSamples = GenSamples(numTestSamples)
    testFeatures = extractFeatures(testSamples)
    testPred = extractPred(testSamples)

    R2List = OrderedDict()
    R2List['TrainROI'] = []
    R2List['TestROI'] = []
    print('Running Tests: ')
    for i in range(numTests):
        # Bootstrap is True by default i.e., sampling with replacement
        # Bootstrap features is False by default i.e., all features used
        classifier = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                      n_estimators=numTrees,
                                      max_samples=int(0.5*numSamples),
                                      max_features=1)

        classifier.fit(trainFeatures, trainPred)
        predictROI = {}
        predictROI['Training'] = classifier.predict(trainFeatures)
        predictROI['Test'] = classifier.predict(testFeatures)

        R2 = {}
        R2['Train'] = r2_score(trainPred, predictROI['Training'])
        R2['Test'] = r2_score(testPred, predictROI['Test'])

        R2List['TrainROI'].append(R2['Train'])
        R2List['TestROI'].append(R2['Test'])

    print('Best Train ROI: ', max(R2List['TrainROI']))
    print('Best Test ROI: ', max(R2List['TestROI']))
Example #12
def test_bagging_regressor_with_missing_inputs():
    # Check that BaggingRegressor can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, -np.inf, 6],
    ])
    y_values = [
        np.array([2, 3, 3, 3, 3]),
        np.array([
            [2, 1, 9],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
        ])
    ]
    for y in y_values:
        regressor = DecisionTreeRegressor()
        pipeline = make_pipeline(
            Imputer(),
            Imputer(missing_values=np.inf),
            Imputer(missing_values=-np.inf),
            regressor
        )
        pipeline.fit(X, y).predict(X)
        bagging_regressor = BaggingRegressor(pipeline)
        y_hat = bagging_regressor.fit(X, y).predict(X)
        assert_equal(y.shape, y_hat.shape)

        # Verify that exceptions can be raised by wrapper regressor
        regressor = DecisionTreeRegressor()
        pipeline = make_pipeline(regressor)
        assert_raises(ValueError, pipeline.fit, X, y)
        bagging_regressor = BaggingRegressor(pipeline)
        assert_raises(ValueError, bagging_regressor.fit, X, y)
Example #13
File: ensemble.py Project: smly/ume
class BaggingRegressor(BaseEstimator):
    """
    Usage:

    ```
    "model": {
        "class": "ume.ensemble.BaggingRegressor",
        "params": {
            "base_estimator": {
                "class": "sklearn.svm.SVR",
                "params": {
                    "kernel": "rbf",
                    "degree": 1,
                    "C": 1000000.0,
                    "epsilon": 0.01,
                },
            },
            "bag_kwargs": {
                "n_estimators": 100,
                "n_jobs": 5,
                "max_samples": 0.9,
            },
        }
    }
    ```
    """
    def __init__(self, base_estimator=None, bag_kwargs=None):
        klass = dynamic_load(base_estimator['class'])
        svr_reg = klass(**base_estimator['params'])
        self.__clf = SK_BaggingRegressor(base_estimator=svr_reg, **bag_kwargs)

    def fit(self, X, y):
        return self.__clf.fit(X, y)

    def predict(self, X):
        return self.__clf.predict(X)
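
# A minimal usage sketch (not part of the ume source), assuming dynamic_load
# resolves dotted class paths such as "sklearn.svm.SVR"; it mirrors the config
# shown in the docstring above:
model = BaggingRegressor(
    base_estimator={
        "class": "sklearn.svm.SVR",
        "params": {"kernel": "rbf", "degree": 1, "C": 1000000.0, "epsilon": 0.01},
    },
    bag_kwargs={"n_estimators": 100, "n_jobs": 5, "max_samples": 0.9},
)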
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
data = inspection_data.select_dtypes(include=numerics)
data = pd.DataFrame(data)
y = data['SCORE']
data.drop('SCORE', axis = 1, inplace= True )


# create training and testing vars
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3)
#print(X_train.shape, y_train.shape)
#print(X_test.shape, y_test.shape)

dt_clf = DecisionTreeRegressor(splitter="random", max_leaf_nodes=16, random_state=0)
bag_clf = BaggingRegressor(dt_clf, n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=0)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
#print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

#iris = load_iris()
##no early stoping defined, so it goes the full length
rnd_clf = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=42)
rnd_clf.fit(X_train, y_train)

for name, score in zip(X_train, rnd_clf.feature_importances_):
    print(name, score)


inspection_data['risk factor'] = inspection_data['risk factor'].astype('int64')
inspection_data['year'] = pd.DatetimeIndex(inspection_data['ACTIVITY DATE']).year
inspection_data['month'] = pd.DatetimeIndex(inspection_data['ACTIVITY DATE']).month
Example #15
X = df.iloc[:, [1, 2, 3, 4]]
y = df.iloc[:, 5]

X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=1)

kVals = range(1, 21, 1)
accuracies = []

for k in kVals:
    basemodel = KNeighborsRegressor(n_neighbors=k)
    model = BaggingRegressor(base_estimator=basemodel,
                             n_estimators=30,
                             bootstrap_features=True,
                             max_features=2,
                             random_state=1)
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    score = model.score(X_test, y_test)
    print("k=%d, R^2 score=%.2f%%" % (k, score * 100))
    print("k=%d, mae = %.2f" % (k, mean_absolute_error(y_test, y_predicted)))
    print("k=%d, mse =%.2f%%" %
          (k, mean_squared_error(y_test, y_predicted) * 100))
    pred = np.array(y_predicted)
    org = np.array(y_test)
    dif = abs(pred - org)
    dif = 1 - dif / org                # 1 - relative error
    div = pred / org
    ix = np.where(div > 1)
    div[ix] = 1 / div[ix]
    div = 1 - div                      # symmetric relative error (unused below)
    print("k=%d, mean(1 - relative error) =%.4f%%" % (k, dif.mean() * 100))
Example #16
# Sample 3.8: Diabetes Ensemble Regression
from sklearn import datasets
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

bagreg = BaggingRegressor()
bagreg.fit(X_train, y_train)
bagreg_predict = bagreg.predict(X_test)
print("Bagging Mean squared error: %.2f" %
      mean_squared_error(y_test, bagreg_predict))

rfreg = RandomForestRegressor()
rfreg.fit(X_train, y_train)
rfreg_predict = rfreg.predict(X_test)
print("Random Forest Mean squared error: %.2f" %
      mean_squared_error(y_test, rfreg_predict))
Example #17
                                        min_samples_split=2,
                                        n_jobs=-1,
                                        max_features='log2',
                                        n_estimators=1900)
extra_trees_reg.fit(X_train, y_train.ravel())
print("Feature Importances: {}".format(extra_trees_reg.feature_importances_))

# In[38]:

bag_reg = BaggingRegressor(DecisionTreeRegressor(random_state=42),
                           n_estimators=1000,
                           max_samples=300,
                           bootstrap=True,
                           n_jobs=-1,
                           random_state=42)
bag_reg.fit(X_train, y_train.ravel())

# Comparing regressors with tuned parameters

# In[39]:

SET_FIT_INTERCEPT = True

names = [
    "LinearRegression", "Ridge", "ExtraTreesRegressor",
    "RandomForestRegressor", "BaggingRegressor"
]

regressors = [
    LinearRegression(fit_intercept=SET_FIT_INTERCEPT),
    Ridge(alpha=5,
Example #18
train_np = train_d.values

# y is the Survived column
y = train_np[:, 0]

# X holds the feature values
X = train_np[:, 1:]

# Fit inside a BaggingRegressor
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_clf = BaggingRegressor(clf,
                               n_estimators=20,
                               max_samples=0.8,
                               max_features=1.0,
                               bootstrap=True,
                               bootstrap_features=False,
                               n_jobs=-1)
bagging_clf.fit(X, y)

testf = test_f.filter(
    regex=
    'Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title'
)
predictions = bagging_clf.predict(testf)

result = pd.DataFrame({
    'PassengerId': test['PassengerId'].values,
    'Survived': predictions.astype(np.int32)
})
result
Example #19
for train, test in kf:
	TR.append(train)
	TS.append(test)

A = []
B = []

for k in range(kfcv):
	print(k)
	X_train = X[TR[k], :]
	y_train = y[TR[k]]
	X_test = X[TS[k], :]
	y_test = y[TS[k]]

	model.fit(X_train, y_train)
	y_predict = model.predict(X_test)

#	plt.subplot(2, 10, k + 1)
#	plt.scatter(y_predict, y_test)
#	plt.xlabel('y_predict')	
#	plt.ylabel('y_true')
#	plt.title('Fold = %d' % (k + 1))

	A.extend(list(y_predict))
	B.extend(list(y_test))

#	mse = mean_squared_error(y_predict, y_test)
#	print 'mse = %f' % mse

mse = mean_squared_error(A, B)
# In[17]:


from sklearn.metrics import r2_score
XGBBoost_RF_score = r2_score(test_y, xbg_y_pred)  # r2_score expects (y_true, y_pred)
XGBBoost_RF_score


# In[18]:


# BaggingRegressor
from sklearn.ensemble import BaggingRegressor
BR = BaggingRegressor(base_estimator=RF, n_estimators=50)
BR.fit(train_x, train_y)
BR_y_pred = BR.predict(test_x)
BR_score = r2_score(test_y, BR_y_pred)  # r2_score expects (y_true, y_pred)
BR_score


# In[20]:


# Neural network: MLPRegressor
from sklearn.neural_network import MLPRegressor
mlpR = MLPRegressor(hidden_layer_sizes=(100,))
mlpR.fit(train_x,train_y)
mlpr_y_pred=mlpR.predict(test_x)

Example #21
class Regressor(skl.base.BaseEstimator, skl.base.TransformerMixin):
    """docstring"""
    def __init__(self,
                 base_estimator='AdaBoostedLinearRegression',
                 n_estimators=50,
                 learning_rate=1.0,
                 loss='linear',
                 random_state=None,
                 save_path=None):
        super(Regressor, self).__init__()
        self.base_estimator = str(base_estimator)
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.loss = loss
        self.random_state = random_state
        self.save_path = save_path
        self.regressor = None

    def fit(self, X, y, sample_weight=None):
        X, y = check_X_y(X, y)
        if (self.base_estimator == 'BayesianRidge'):
            self.regressor = BayesianRidge()
        elif (self.base_estimator == 'LASSO'):
            self.regressor = Lasso()
        elif (self.base_estimator == 'ElasticNet'):
            self.regressor = ElasticNet()
        elif (self.base_estimator == 'MLPRegressor'):
            self.regressor = MLPRegressor()
        elif (self.base_estimator == 'KernelRidge'):
            self.regressor = KernelRidge(kernel='polynomial')
        elif (self.base_estimator == 'LinearRegression'):
            self.regressor = LinearRegression()
        elif (self.base_estimator == 'BaggingRegressorLinear'):
            base_estimator = LinearRegression()
            self.regressor = BaggingRegressor(base_estimator)
        elif (self.base_estimator == 'BaggingRegressorKernelRidge'):
            base_estimator = KernelRidge(kernel='polynomial')
            self.regressor = BaggingRegressor(base_estimator)
        elif (self.base_estimator == 'BaggingRegressorLasso'):
            base_estimator = Lasso()
            self.regressor = BaggingRegressor(base_estimator)
        else:
            raise Exception('Unsupported base_estimator: ' +
                            self.base_estimator)
        self.regressor.fit(X, y)
        return self

    def predict(self, X):
        check_is_fitted(self, ["regressor"])
        X = check_array(X)

        return self.regressor.predict(X)

    def score(self, X, y, sample_weight=None):
        scores = -(self.predict(X) - y)**2 / len(y)
        score = np.sum(scores)

        print(score)
        sys.stdout.flush()

        return score

    def set_save_path(self, save_path):
        self.save_path = save_path
Example #22
class ShapeletForestRegressor(BaseEstimator, RegressorMixin):
    def __init__(self,
                 n_estimators=100,
                 max_depth=None,
                 min_samples_split=2,
                 n_shapelets=10,
                 min_shapelet_size=0,
                 max_shapelet_size=1,
                 metric='euclidean',
                 metric_params=None,
                 bootstrap=True,
                 n_jobs=None,
                 random_state=None):
        """A shapelet forest regressor
        """
        self.n_estimators = n_estimators
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_shapelets = n_shapelets
        self.min_shapelet_size = min_shapelet_size
        self.max_shapelet_size = max_shapelet_size
        self.metric = metric
        self.metric_params = metric_params
        self.random_state = random_state

    def predict(self, X, check_input=True):
        if X.ndim < 2 or X.ndim > 3:
            raise ValueError("illegal input dimensions X.ndim ({})".format(
                X.ndim))

        if self.n_dims_ > 1 and X.ndim != 3:
            raise ValueError("illegal input dimensions X.ndim != 3")

        if X.shape[-1] != self.n_timestep_:
            raise ValueError("illegal input shape ({} != {})".format(
                X.shape[-1], self.n_timestep_))

        if X.ndim > 2 and X.shape[1] != self.n_dims_:
            raise ValueError("illegal input shape ({} != {})".format(
                X.shape[1], self.n_dims_))

        if check_input:
            X = check_array(X, dtype=np.float64, allow_nd=True, order="C")

        if X.dtype != np.float64 or not X.flags.contiguous:
            X = np.ascontiguousarray(X, dtype=np.float64)

        X = X.reshape(X.shape[0], self.n_dims_ * self.n_timestep_)
        return self.bagging_regressor_.predict(X)

    def fit(self, X, y, sample_weight=None, check_input=True):
        """Fit a random shapelet forest regressor
        """
        random_state = check_random_state(self.random_state)
        if check_input:
            X = check_array(X, dtype=np.float64, allow_nd=True, order="C")
            y = check_array(y, dtype=np.float64, ensure_2d=False, order="C")

        if X.ndim < 2 or X.ndim > 3:
            raise ValueError("illegal input dimension")

        n_samples = X.shape[0]
        self.n_timestep_ = X.shape[-1]
        if X.ndim > 2:
            n_dims = X.shape[1]
        else:
            n_dims = 1

        self.n_dims_ = n_dims

        if len(y) != n_samples:
            raise ValueError("Number of labels={} does not match "
                             "number of samples={}".format(len(y), n_samples))

        if X.dtype != np.float64 or not X.flags.contiguous:
            X = np.ascontiguousarray(X, dtype=np.float64)

        if y.dtype != np.float64 or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=np.float64)

        shapelet_tree_regressor = ShapeletTreeRegressor(
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            n_shapelets=self.n_shapelets,
            min_shapelet_size=self.min_shapelet_size,
            max_shapelet_size=self.max_shapelet_size,
            metric=self.metric,
            metric_params=self.metric_params,
            random_state=random_state,
        )

        if n_dims > 1:
            shapelet_tree_regressor.force_dim = n_dims

        self.bagging_regressor_ = BaggingRegressor(
            base_estimator=shapelet_tree_regressor,
            bootstrap=self.bootstrap,
            n_jobs=self.n_jobs,
            n_estimators=self.n_estimators,
            random_state=self.random_state,
        )
        X = X.reshape(n_samples, n_dims * self.n_timestep_)
        self.bagging_regressor_.fit(X, y, sample_weight=sample_weight)
        return self
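
# Hypothetical usage sketch (synthetic data invented for illustration),
# exercising the regressor defined above:
import numpy as np
X_demo = np.random.randn(100, 50)   # 100 univariate series, 50 timesteps each
y_demo = np.random.randn(100)
sfr = ShapeletForestRegressor(n_estimators=10, random_state=0)
sfr.fit(X_demo, y_demo)
print(sfr.predict(X_demo[:5]))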
# Linear regression model
lr = LinearRegression()
lr.fit(train_X, train_y)
print('lr_error:', error(cv_y, lr.predict(cv_X)))

# Ridge regression
ridge = RidgeCV(alphas=[151], cv=10)
ridge.fit(train_X, train_y)
print('ridge_error:', error(cv_y, ridge.predict(cv_X)))

lasso = LassoCV(alphas=[0.003], max_iter=10000, cv=10)
lasso.fit(train_X, train_y)
print('lasso_error:', error(cv_y, np.exp(lasso.predict(cv_X))))

br_lr = BaggingRegressor(base_estimator=lr, n_estimators=3)
br_lr.fit(train_X, train_y)
print('br_lr_error:', error(cv_y, br_lr.predict(cv_X)))

br_ridge = BaggingRegressor(base_estimator=ridge, n_estimators=3)
br_ridge.fit(train_X, train_y)
print('br_ridge_error:', error(cv_y, br_ridge.predict(cv_X)))

br_lasso = BaggingRegressor(base_estimator=lasso, n_estimators=7)
br_lasso.fit(train_X, train_y)
print('br_lasso_error:', error(cv_y, br_lasso.predict(cv_X)))

# Test-set preprocessing
data_test = pd.read_csv("C:/Tool/Pycharm/TianChi/d_test_A_20180102.csv",
                        encoding='gb2312')
test_columns = [
    '年龄', '*天门冬氨酸氨基转换酶', '*丙氨酸氨基转换酶', '*碱性磷酸酶', '*r-谷氨酰基转换酶', '*总蛋白', '白蛋白',
Example #24
class BaggingClass:
    """
    Name      : BaggingRegressor
    Attribute : None
    Method    : predict, predict_by_cv, save_model
    """

    def __init__(self):
        # Algorithm name
        self._name = 'bagging'

        # Base path
        self._f_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))

        # Suppress warning messages
        warnings.filterwarnings('ignore')

        # Load the raw data
        data = pd.read_csv(self._f_path + "/regression/resource/regression_sample.csv", sep=",", encoding="utf-8")

        # Masks separating the training and test years
        self._x = (data["year"] <= 2017)
        self._y = (data["year"] >= 2018)

        # Training split
        self._x_train, self._y_train = self.preprocessing(data[self._x])
        # Test split
        self._x_test, self._y_test = self.preprocessing(data[self._y])

        # Declare the model
        self._model = BaggingRegressor()

        # Train the model
        self._model.fit(self._x_train, self._y_train)

        # Grid-search model
        self._g_model = None

    # Data preprocessing
    def preprocessing(self, data):
        # Feature windows
        x = []
        # Labels
        y = []
        # Window size (7 days)
        base_interval = 7
        # Temperature series
        temps = list(data["temperature"])

        for i in range(len(temps)):
            if i < base_interval:
                continue
            y.append(temps[i])

            xa = []

            for p in range(base_interval):
                d = i + p - base_interval
                xa.append(temps[d])
            x.append(xa)
        return x, y

    # Plain prediction
    def predict(self, save_img=False, show_chart=False):
        # Predict
        y_pred = self._model.predict(self._x_test)

        # Score
        score = r2_score(self._y_test, y_pred)

        # Report coefficients if the model exposes them
        if hasattr(self._model, 'coef_') and hasattr(self._model, 'intercept_'):
            print(f'Coef = {self._model.coef_}')
            print(f'intercept = {self._model.intercept_}')

        print(f'Score = {score}')

        # Optionally save the chart image
        if save_img:
            self.save_chart_image(y_pred, show_chart)

        # Predictions & score
        return [list(y_pred), score]

    # Cross-validation prediction
    def predict_by_cv(self):
        # Implement cross-validation here as appropriate for the actual project
        return False

    # GridSearchCV prediction
    def predict_by_gs(self):
        # Grid-search params
        param_grid = {
            # Number of estimators
            'n_estimators': [5, 10, 15],
            # Whether samples are drawn with replacement
            'bootstrap': [True, False],
            # Whether features are drawn with replacement
            'bootstrap_features': [True, False],
            # Fraction of samples drawn for each estimator
            'max_samples': [0.6, 0.8, 1.0]
        }

        # Initialize the grid search
        self._g_model = GridSearchCV(BaggingRegressor(), param_grid=param_grid)

        # Fit the grid search
        self._g_model.fit(self._x_train, self._y_train)

        # Print all parameters
        print(self._g_model.param_grid)
        # Best score
        print(self._g_model.best_score_)
        # Best parameters
        print(self._g_model.best_params_)
        # Full CV results
        print(self._g_model.cv_results_)

        return dict(gs_all_params=self._g_model.param_grid, gs_best_score=self._g_model.best_score_,
                    gs_best_param=self._g_model.best_params_)

    # Save or refresh the model
    def save_model(self, renew=False):
        # Save the model
        if not renew:
            # First save
            joblib.dump(self._model, self._f_path + f'/model/{self._name}_rg.pkl')
        else:
            # Archive the existing model, then replace it
            if os.path.isfile(self._f_path + f'/model/{self._name}_rg.pkl'):
                os.rename(self._f_path + f'/model/{self._name}_rg.pkl',
                          self._f_path + f'/model/{str(self._name) + str(time.time())}_rg.pkl')
            joblib.dump(self._model, self._f_path + f'/model/{self._name}_rg.pkl')

    # Save the regression chart
    def save_chart_image(self, data, show_chart):
        # Figure size
        plt.figure(figsize=(15, 10), dpi=100)

        # Ground truth
        plt.plot(self._y_test, c='r')

        # Predictions
        plt.plot(data, c='b')

        # Save as an image
        plt.savefig('./chart_images/tenki-kion-lr.png')

        # Show the chart (optional)
        if show_chart:
            plt.show()

    def __del__(self):
        del self._x_train, self._x_test, self._y_train, self._y_test, self._x, self._y, self._model
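
# Hypothetical usage sketch (assumes the sample CSV bundled with the project):
bc = BaggingClass()
y_pred, score = bc.predict(save_img=False, show_chart=False)
bc.save_model()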
Example #25
    plt.legend(loc='upper right')
    plt.grid(True)

    plt.subplot(132)
    t = np.arange(N)
    plt.plot(t, x, 'r-', lw=1, label=u'original data')
    plt.plot(abnormal, x[abnormal], 'go', markeredgecolor='g', ms=3, label=u'outliers')
    plt.legend(loc='upper right')
    plt.title(u'Outlier detection', fontsize=18)
    plt.grid(True)

    # Prediction
    plt.subplot(133)
    select = np.ones(N, dtype=bool)
    select[abnormal] = False
    t = np.arange(N)
    dtr = DecisionTreeRegressor(criterion='squared_error', max_depth=10)
    br = BaggingRegressor(dtr, n_estimators=10, max_samples=0.3)
    br.fit(t[select].reshape(-1, 1), x[select])
    y = br.predict(np.arange(N).reshape(-1, 1))
    y[select] = x[select]
    plt.plot(x, 'g--', lw=1, label=u'original values')
    plt.plot(y, 'r-', lw=1, label=u'corrected values')
    plt.legend(loc='upper right')
    plt.title(u'Outlier correction', fontsize=18)
    plt.grid(True)

    plt.tight_layout(pad=1.5, rect=(0, 0, 1, 0.95))
    plt.suptitle(u'Outlier detection and correction for pollutant-discharge data', fontsize=22)
    plt.show()
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

def f(x):
    return 0.5 * np.exp(-(x+3)**2) + np.exp(-x**2) + 0.5 * np.exp(-(x-3)**2)

N = 200 # 200 samples

x_train = np.linspace(-5.5, 5.5, N)
X_train = pd.DataFrame({"x": x_train})
y_train = f(x_train) + (np.random.rand(N) - 0.5) * (2 * 0.05)

dtr = DecisionTreeRegressor(max_depth=5)
br = BaggingRegressor(dtr, n_estimators=200, max_samples=0.2)
br.fit(X_train, y_train)

x_test = np.linspace(x_train.min() * 1.1, x_train.max() * 1.1, 1000)
X_test = pd.DataFrame({"x": x_test})
y_test = f(x_test)
y_predict = br.predict(X_test)

plt.scatter(x_train, y_train)
plt.scatter(x_test, y_test)
plt.scatter(x_test, y_predict)
plt.show()
    Alpha, Test_score, y_pred = model_Ridge(dummy_train_df, train_y, dummy_test_df)

    # Bagging with ridge regression as the base learner; the number of
    # learners is chosen by cross-validation
    ridge = Ridge(alpha=Alpha)
    # params = [1, 10, 15, 20, 25, 30, 40]
    params = np.arange(1, 50)
    test_scores = []
    min_score = float('inf')
    for param in params:
        clf = BaggingRegressor(base_estimator=ridge, n_estimators=param)
        test_score = np.sqrt(-cross_val_score(clf, dummy_train_df, train_y, cv=10, scoring='neg_mean_squared_error'))
        temp = np.mean(test_score)
        if temp < min_score:
            min_score = temp
            optimal_params = param
        test_scores.append(temp)
    print(optimal_params)

    # plt.plot(params, test_scores)
    # plt.title("n_estimators vs CV_error")
    # plt.show()

    # Predict with the tuned parameter
    br = BaggingRegressor(base_estimator=ridge, n_estimators=optimal_params)
    br.fit(dummy_train_df, train_y)
    y_final = np.expm1(br.predict(dummy_test_df))
    # print(y_final)
    summsion_csv(y_final, test)


Example #28
# GradientBoostingRegressor:
gradient_boosting_regressor = GradientBoostingRegressor()
gradient_boosting_regressor.fit(x_train, y_train)
gb_score = gradient_boosting_regressor.score(x_test, y_test)
print('GradientBoostingRegressor score: ' + str(gb_score)) # -> 0.8978408816999488

# ExtraTreesRegressor:
extra_trees_regressor = ExtraTreesRegressor()
extra_trees_regressor.fit(x_train, y_train)
et_score = extra_trees_regressor.score(x_test, y_test)
print('ExtraTreesRegressor score: ' + str(et_score)) # -> 0.9071302394368891

# BaggingRegressor:
bagging_regressor = BaggingRegressor()
bagging_regressor.fit(x_train, y_train)
b_score = bagging_regressor.score(x_test, y_test)
print('BaggingRegressor score: ' + str(b_score)) # -> 0.9154010467830169

# RandomForestRegressor:
random_forest_regressor = RandomForestRegressor()
random_forest_regressor.fit(x_train, y_train)
rf_score = random_forest_regressor.score(x_test, y_test)
print('RandomForestRegressor score: ' + str(rf_score)) # -> 0.920122663462127

# RandomForestRegressor(max_depth=5, random_state=0, n_estimators=100):
random_forest_regressor = RandomForestRegressor(max_depth=5, random_state=0, n_estimators=100)
random_forest_regressor.fit(x_train, y_train)
rf_score_1 = random_forest_regressor.score(x_test, y_test)
print('RandomForestRegressor(max_depth=5, random_state=0, n_estimators=100) score: ' + str(rf_score_1)) # -> 0.8134829521876554
              }

        
model_gbr_allfeatures = grid_search.GridSearchCV(estimator =gbr, param_grid = parameters, n_jobs = -1, cv = 2, verbose = 20, scoring='mean_squared_error')
model_gbr_allfeatures.fit(X_train, Y_train)
print(model_gbr_allfeatures.best_params_) #'max_depth': 6, 'n_estimators': 500, 'learning_rate': 0.1, 'max_features': 'auto'


predictions_gbr_allfeatures = model_gbr_allfeatures.predict(X_test)
mean_squared_error(Y_test, predictions_gbr_allfeatures) #7.071566



# Ensemble the random forest model using bagging
bag = BaggingRegressor(rfr, n_estimators=500, max_samples=0.1, random_state=25)
bag.fit(X_train, Y_train)
predictions_rfr_bagging = bag.predict(X_test)
mean_squared_error(Y_test, predictions_rfr_bagging)

# Recursive feature selection for the random forest
from sklearn.feature_selection import RFECV
rfecv = RFECV(estimator=rfr, step=1, cv=3,
              scoring='mean_squared_error')
rfecv.fit(X_train, Y_train)
print("Optimal number of features : %d" % rfecv.n_features_) 




                     
# Think of the quiz shows where a contestant polls the studio audience and goes
#     with the most popular answer: every person makes a judgment, and we trust
#     the majority.
# A more everyday example: you always "borrow" the homework of the class math
#     genius, so whenever he is right, you are right too. But one day the genius
#     slips up and writes a wrong digit, and... well, you are stuck with his mistake.
# Now picture being friends with five math geniuses and comparing all of their
#     answers before "doing" your own. If one of them has an off day, the other
#     four are still correct, so you would trust the majority answer.
# That is model fusion at its simplest. For a classification problem, take a set
#     of classifiers trained on the same data (logistic regression, SVM, KNN,
#     random forest, a neural network), let each one predict, and take the
#     majority vote as the final result.

# Model fusion helps mitigate the overfitting that arises during training, so it
#     can noticeably improve accuracy.
# Back to our problem: so far we have only covered logistic regression, so how do
#     we apply the fusion idea to improve our result?
# With only one model to choose from, we vary the data instead. If a model
#     overfits, it must have over-fit the training set. So instead of using the
#     full training set, we train on a different subset each time; the same
#     learning algorithm then yields different models. Since no model ever sees
#     all of the data, any overfitting is confined to its own subset rather than
#     the whole set, and fusing the models can help the final result.
#     This is the widely used Bagging.
# scikit-learn's Bagging makes this very easy to implement. The code:
from sklearn.ensemble import BaggingRegressor
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
train_np = train_df.values
y = train_np[:, 0]  # y is the Survived column
X = train_np[:, 1:]  # X holds the feature values
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)  # to be fitted inside a BaggingRegressor
bagging_clf = BaggingRegressor(clf, n_estimators=20, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y)
test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
predictions = bagging_clf.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values, 'Survived': predictions.astype(np.int32)})
result.to_csv("./tmp_dataset/Kaggle-Titanic/result.csv", index=False)
# 0.75598; even lower this time, perhaps bad luck in BaggingRegressor's random sampling
# The previous result matched the blog author's exactly, yet the second run differs
Example #31
        ["xgboost_Label", ""],
        ["xgboost_Vect", ""],
    ]
    full_predictions = []
    for alg, predictors in algorithms:
        if alg == "xgboost_Label":
            full_predictions.append(xgboost_Label(train, test, labels))
        elif alg == "xgboost_Vect":
            full_predictions.append(xgboost_Vect(train, test, labels))
        elif alg == "xgboost_Dummies":
            full_predictions.append(xgboost_Dummies(train, test, labels))
        else:
            if predictors == "dummies":
                print ("Train ", alg.__class__.__name__, " dummies Model ")
                alg = BaggingRegressor(alg)
                alg.fit(train_du, labels)
                print("Prediction :", alg.__class__.__name__, " dummies Model ")
                prediction = alg.predict(test_du)
                full_predictions.append(prediction)
            else:
                print ("Train ", alg.__class__.__name__, " Label Model ")
                alg = BaggingRegressor(alg)
                alg.fit(train_rf, labels)
                print("Prediction :", alg.__class__.__name__, " Label Model ")
                prediction = alg.predict(test_rf)
                full_predictions.append(prediction)

                # Ensemble models
    RF_label_pred = full_predictions[0]
    RF_dummies_pred = full_predictions[1]
    pred_xgb_dummies = full_predictions[2]
accu = pd.DataFrame(index=['MSLE', 'Root MSLE', 'R2 Score', 'Accuracy(%)'])
"""  Bagging Regressor METHODE @-@  """
""" model implementation """

baggReg = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=20),
                           n_estimators=50,
                           random_state=1,
                           max_samples=1.0,
                           max_features=1.0,
                           bootstrap=False,
                           bootstrap_features=False,
                           oob_score=False,
                           warm_start=False,
                           n_jobs=-1,
                           verbose=0)
baggReg.fit(X_train, y_train)
y_pred = baggReg.predict(X_test)
""" model evaluation """

r6_br = result(y_test, y_pred)

print("MSLE : {}".format(r6_br[0]))
print("Root MSLE : {}".format(r6_br[1]))
print("R2 Score : {} or {}%".format(r6_br[2], r6_br[3]))
""" Visualization of true value and predicted """

df_check = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_check = df_check.sample(50)
df_check.plot(kind='bar', figsize=(10, 5))
plt.grid(which='major', linestyle='-', linewidth='0.1', color='Green')
plt.title('Performance Bagging Regressor')
Example #33
#     midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
#     diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
#     return midpoint, diff
#
# plot_learning_curve(model, u"learning  curve", X, Y)


# 6...................Model fusion.....................#


train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.values

# y is the Survived column
y = train_np[:, 0]

x = train_np[:, 1:]


# Fit inside a BaggingRegressor
model = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_model = BaggingRegressor(model, n_estimators=20, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_model.fit(x, y)

test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = bagging_model.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values, 'Survived': predictions.astype(np.int32)})
result.to_csv('./result.csv',index=False)


Example #34
# ## Bagged decision trees in scikit-learn (with B=500)

# define the training and testing sets
X_train = train.iloc[:, 1:]
y_train = train.iloc[:, 0]
X_test = test.iloc[:, 1:]
y_test = test.iloc[:, 0]


# instruct BaggingRegressor to use DecisionTreeRegressor as the "base estimator"
from sklearn.ensemble import BaggingRegressor
bagreg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=500, bootstrap=True, oob_score=True, random_state=1)


# fit and predict
bagreg.fit(X_train, y_train)
y_pred = bagreg.predict(X_test)
y_pred


# calculate RMSE
np.sqrt(metrics.mean_squared_error(y_test, y_pred))


# ## Estimating out-of-sample error
# 
# For bagged models, out-of-sample error can be estimated without using **train/test split** or **cross-validation**!
# 
# On average, each bagged tree uses about **two-thirds** of the observations. For each tree, the **remaining observations** are called "out-of-bag" observations.
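
# Because bagreg above was fit with oob_score=True, the out-of-bag estimate is
# available directly after fitting; a minimal sketch (not from the original notebook):
print(bagreg.oob_score_)  # R-squared of the out-of-bag predictions
# oob_prediction_ holds the averaged out-of-bag prediction for each training row
print(np.sqrt(metrics.mean_squared_error(y_train, bagreg.oob_prediction_)))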

# show the first bootstrap sample
Example #35
    def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features. XXX sparse matrix?

        y : array-like, shape (n_samples,)
            Target vector relative to X. Has to follow the convention 0 for
            normal data, 1 for frauds.
            XXX maybe make such y ourselves from input?

        sample_weight : array-like, shape (n_samples,) optional
            Array of weights that are assigned to individual samples, typically
            the amount in case of transactions data. Used to grow regression
            trees producing further rules to be tested.
            If not provided, then each sample is given unit weight.

        Returns
        -------
        self : object
            Returns self.
        """

        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.n_features_ = X.shape[1]

        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        if n_classes < 2:
            raise ValueError("This method needs samples of at least 2 classes"
                             " in the data, but the data contains only one"
                             " class: %r" % self.classes_[0])

        if not set(self.classes_) == set([0, 1]):
            warn("Found labels %s. This method assumes fraud to be labeled as"
                 " 1 and normal data to be labeled as 0. Any label"
                 " different from 0 will be considered as fraud." %
                 set(self.classes_))
            y = (y > 0)

        # ensure that max_samples is in [1, n_samples]:
        n_samples = X.shape[0]

        if isinstance(self.max_samples, six.string_types):
            raise ValueError('max_samples (%s) is not supported.'
                             'Valid choices are: "auto", int or'
                             'float' % self.max_samples)

        elif isinstance(self.max_samples, INTEGER_TYPES):
            if self.max_samples > n_samples:
                warn("max_samples (%s) is greater than the "
                     "total number of samples (%s). max_samples "
                     "will be set to n_samples for estimation." %
                     (self.max_samples, n_samples))
                max_samples = n_samples
            else:
                max_samples = self.max_samples
        else:  # float
            if not (0. < self.max_samples <= 1.):
                raise ValueError("max_samples must be in (0, 1], got %r" %
                                 self.max_samples)
            max_samples = int(self.max_samples * X.shape[0])

        self.max_samples_ = max_samples

        self.rules_ = {}
        self.estimators_ = []
        self.estimators_samples_ = []
        self.estimators_features_ = []

        # default columns names of the form ['c0', 'c1', ...]:
        feature_names_ = (self.feature_names
                          if self.feature_names is not None else
                          ['c' + x for x in np.arange(X.shape[1]).astype(str)])
        self.feature_names_ = feature_names_

        bagging_clf = BaggingClassifier(
            base_estimator=DecisionTreeClassifier(
                max_depth=self.max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split),
            n_estimators=self.n_estimators,
            max_samples=self.max_samples_,
            max_features=self.max_samples_features,
            bootstrap=self.bootstrap,
            bootstrap_features=self.bootstrap_features,
            # oob_score=... XXX may be added if selection on tree perf needed.
            # warm_start=... XXX may be added to increase computation perf.
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose)

        bagging_reg = BaggingRegressor(
            base_estimator=DecisionTreeRegressor(
                max_depth=self.max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split),
            n_estimators=self.n_estimators,
            max_samples=self.max_samples_,
            max_features=self.max_samples_features,
            bootstrap=self.bootstrap,
            bootstrap_features=self.bootstrap_features,
            # oob_score=... XXX may be added if selection on tree perf needed.
            # warm_start=... XXX may be added to increase computation perf.
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose)

        bagging_clf.fit(X, y)

        # define regression target:
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            weights = sample_weight - sample_weight.min()
            contamination = float(sum(y)) / len(y)
            y_reg = (pow(weights, 0.5) * 0.5 / contamination * (y > 0) - pow(
                (weights).mean(), 0.5) * (y == 0))
            y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
        else:
            y_reg = y  # same as an other classification bagging

        bagging_reg.fit(X, y_reg)

        self.estimators_ += bagging_clf.estimators_
        self.estimators_ += bagging_reg.estimators_

        self.estimators_samples_ += bagging_clf.estimators_samples_
        self.estimators_samples_ += bagging_reg.estimators_samples_

        self.estimators_features_ += bagging_clf.estimators_features_
        self.estimators_features_ += bagging_reg.estimators_features_

        rules_ = []
        for estimator, samples, features in zip(self.estimators_,
                                                self.estimators_samples_,
                                                self.estimators_features_):

            # Create mask for OOB samples
            mask = ~samples
            if sum(mask) == 0:
                warn("OOB evaluation not possible: doing it in-bag."
                     " Performance evaluation is likely to be wrong"
                     " (overfitting) and selected rules are likely to"
                     " not perform well! Please use max_samples < 1.")
                mask = samples
            rules_from_tree = self._tree_to_rules(
                estimator,
                np.array(self.feature_names_)[features])

            # XXX todo: idem without dataframe
            X_oob = pandas.DataFrame(
                (X[mask, :])[:, features],
                columns=np.array(self.feature_names_)[features])
            y_oob = y[mask]
            y_oob = np.array((y_oob != 0))
            # Add OOB performances to rules:

            rules_from_tree = [(r, self._eval_rule_perf(r, X_oob, y_oob))
                               for r in set(rules_from_tree)]
            rules_ += rules_from_tree

        # keep only rules verifying precision_min and recall_min:
        for rule, score in rules_:
            if (score[0] > self.precision_min and score[1] > self.recall_min):
                if rule in self.rules_:
                    # update the score to the new mean
                    c = self.rules_[rule][2] + 1
                    b = self.rules_[rule][1] + 1. / c * (score[1] -
                                                         self.rules_[rule][1])
                    a = self.rules_[rule][0] + 1. / c * (score[0] -
                                                         self.rules_[rule][0])

                    self.rules_[rule] = (a, b, c)
                else:
                    self.rules_[rule] = (score[0], score[1], 1)

        self.rules_ = sorted(self.rules_.items(),
                             key=lambda x: (x[1][0], x[1][1]),
                             reverse=True)
        return self
Example #36
    start = time.time()
    model = svm.SVR(kernel='poly', degree=2, C=0.7)
    "boostedModel = AdaBoostRegressor(base_estimator= model, n_estimators = 10, loss = 'square')"
    baggedModel = BaggingRegressor(base_estimator=model,
                                   n_estimators=50,
                                   max_features=0.6,
                                   max_samples=0.5,
                                   bootstrap=False)

    X_train, X_test, y_train, y_test = train_test_split(X_ls,
                                                        y_ls,
                                                        test_size=0.30)

    with measure_time('Training'):
        print('Training...')
        baggedModel.fit(X_train, y_train)

    y_pred = baggedModel.predict(X_test)
    print('MSE on 0.3 of LS: ',
          mean_squared_error(y_test, y_pred, multioutput='uniform_average'))

    with measure_time('Training'):
        print('Training...')
        baggedModel.fit(X_ls, y_ls)

    # ------------------------------ Prediction ------------------------------ #
    # Load test data
    test_user_movie_pairs = load_from_csv(os.path.join(prefix,
                                                       'data_test.csv'))

    # Build the prediction matrix
Example #37
alg_test_id = alg_test['id']

Y_train = alg_train['relevance'].values

X_train = alg_train.drop(['id', 'relevance'], axis=1).values
X_test = alg_test.drop(['id', 'relevance'], axis=1).values

forest = RandomForestRegressor(n_estimators=550,
                               criterion="mse",
                               max_features=10,
                               max_depth=15,
                               n_jobs=-1,
                               verbose=0)
bg = BaggingRegressor(forest,
                      n_estimators=150,
                      max_samples=0.1,
                      random_state=29)
bg.fit(X_train, Y_train)
Y_output = bg.predict(X_test)

# score = forest.score(X_train, Y_train)
#
# print score

filename = 'submission_' + date + '.csv'

pd.DataFrame({
    "id": alg_test_id,
    "relevance": Y_output
}).to_csv(filename, index=False)
Example #38
passenger_id = full.loc[sourceRow:,'PassengerId']
# DataFrame: passenger id and predicted survival
predDf = pd.DataFrame(
    {'PassengerId': passenger_id,
    'Survived':pred_Y})
predDf.shape
predDf.head()
# Save the result
predDf.to_csv('titanic_pred_new.csv', index = False)
'''

# Fit inside a BaggingRegressor
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)

bagging_clf = BaggingRegressor(clf, n_estimators=20, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False)
bagging_clf.fit(train_X, train_y)
print(bagging_clf.score(test_X, test_y))
predictions = bagging_clf.predict(pred_X)

result = pd.DataFrame({'PassengerId':test['PassengerId'].values, 'Survived':predictions.astype(np.int32)})
result.to_csv("logistic_regression_bagging_predictions1.csv", index=False)

'''
xgb_clf = XGBClassifier(learning_rate=0.1, max_depth=2, silent=True, objective='binary:logistic')
# xgb_clf = XGBClassifier()
xgb_clf.fit(train_X.values, train_y.values)
print(xgb_clf.score(test_X.values, test_y.values))

predictions = xgb_clf.predict(pred_X.values)

result = pd.DataFrame({'PassengerId':test['PassengerId'].values, 'Survived':predictions.astype(np.int32)})
del globals()['profiles']
del globals()['profilesLSo']
del globals()['profilesLS']
del globals()['row']
del globals()['tmpLS']
del globals()['tmpAGE']
del globals()['profsTOlikes']
del globals()['i']
del globals()['tmpIND']

seed = 7
np.random.seed(seed)
X_train, X_test, y_train, y_test = train_test_split(likesMAT,
                                                    extsARR,
                                                    test_size=1500)

nJOBS = int(sys.argv[1])
nEST = int(sys.argv[2])
bagIN = BaggingRegressor(n_jobs=nJOBS, n_estimators=nEST)

#bagIN.fit(likesMAT, extsARR)
bagIN.fit(X_train, y_train)

y_pred = bagIN.predict(X_test)
import math
myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("exts, bagIN:  ", str(nEST), " ", myRMSE)

# joblib.dump(bagIN, "/Users/jamster/bagIN-A-exts.xz", compress=9)

# impbagIN = joblib.load("/Users/jamster/bagIN-A-exts.xz")
Example #40
    def gradient_descent_algo(self):

        X = []
        Y = []

        with open('../Data/full_table.csv', 'r') as file:
            for line in csv.reader(file, delimiter=','):
                if len(line) == 13:
                    try:
                        zhvi = float(line[5])
                        property_type = line[6]
                        room_type = line[7]
                        accommodates = int(line[8])
                        bathrooms = float(line[9])
                        beds = int(line[10])
                        bed_type = line[11]
                        price = float(line[12])

                        x = {
                            'zhvi': zhvi,
                            'property_type': property_type,
                            'room_type': room_type,
                            'accommodates': accommodates,
                            'bathrooms': bathrooms,
                            'beds': beds,
                            'bed_type': bed_type
                        }

                        y = price

                        X.append(x)
                        Y.append(y)

                    except (ValueError, IndexError):
                        pass

        # The DictVectorizer converts data from a dictionary to an array
        vec = DictVectorizer()

        # Convert X to Array
        X = vec.fit_transform(X).toarray()

        # Normalize Data
        X = preprocessing.normalize(X)

        # Split X and Y into training and testing sets
        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.33)

        # Gradient Descent
        model = linear_model.SGDRegressor()
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)
        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        print('Gradient Descent')
        print('Mean Squared Error: {0}'.format(mse))
        print('Mean Absolute Error: {0}'.format(mae))
        print('R2 Score: {0}'.format(r2))

        # With Boosting
        model_boost = AdaBoostRegressor(linear_model.SGDRegressor())
        model_boost.fit(X_train, Y_train)
        Y_pred = model_boost.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)
        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        print('Gradient Descent (with AdaBoost)')
        print('Mean Squared Error: {0}'.format(mse))
        print('Mean Absolute Error: {0}'.format(mae))
        print('R2 Score: {0}'.format(r2))

        # With Bagging
        model_bag = BaggingRegressor(linear_model.SGDRegressor())
        model_bag.fit(X_train, Y_train)
        Y_pred = model_bag.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)
        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        print('Gradient Descent (with Bagging)')
        print('Mean Squared Error: {0}'.format(mse))
        print('Mean Absolute Error: {0}'.format(mae))
        print('R2 Score: {0}'.format(r2))
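        # Editor's sketch: the three evaluate-and-print passes above are identical,
        # so a small helper (hypothetical, not part of the original) would remove
        # the repetition; it assumes the same sklearn metric imports as above.
        #def report(name, model, X_test, Y_test):
        #    Y_pred = model.predict(X_test)
        #    print(name)
        #    print('Mean Squared Error: {0}'.format(mean_squared_error(Y_test, Y_pred)))
        #    print('Mean Absolute Error: {0}'.format(mean_absolute_error(Y_test, Y_pred)))
        #    print('R2 Score: {0}'.format(r2_score(Y_test, Y_pred)))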
Example #41
0
def QuickML_Stacking(X_train,
                     y_train,
                     X_test='',
                     modeltype='Regression',
                     Boosting_Flag=False,
                     scoring='',
                     verbose=0):
    """
    Quickly build Stacks of multiple model results 
    Input must be a clean data set (only numeric variables, no categorical or string variables).
    """
    start_time = time.time()
    seed = 99
    if len(X_train) <= 100000 or X_train.shape[1] < 50:
        NUMS = 100
        FOLDS = 5
    else:
        NUMS = 200
        FOLDS = 10
    ## create Stacking models
    estimators = []
    ### This keeps track of the number of predict_proba columns generated by each model ####
    estimator_length = []
    if isinstance(X_test, str):
        no_fit = True
    else:
        no_fit = False
    if no_fit:
        #### This is where you don't fit the model but just do cross_val_predict ####
        if modeltype == 'Regression':
            if scoring == '':
                scoring = 'neg_mean_squared_error'
            scv = KFold(n_splits=FOLDS, random_state=seed, shuffle=True)
            if Boosting_Flag:
                ######    Bagging models if Bagging is chosen ####
                model4 = BaggingRegressor(
                    DecisionTreeRegressor(random_state=seed),
                    n_estimators=NUMS,
                    random_state=seed)
                results = cross_val_predict(model4,
                                            X_train,
                                            y_train,
                                            cv=scv,
                                            n_jobs=-1)
                estimators.append(('Bagging1', model4))
                estimator_length.append(1)
            elif Boosting_Flag is None:
                ####   Tree models if Linear chosen #####
                model5 = DecisionTreeRegressor(random_state=seed,
                                               min_samples_leaf=2)
                results = cross_val_predict(model5,
                                            X_train,
                                            y_train,
                                            cv=scv,
                                            n_jobs=-1)
                estimators.append(('Decision Trees', model5))
                estimator_length.append(1)
            else:
                ####   Linear Models if Boosting is chosen #####
                model6 = LassoCV(alphas=np.logspace(-10, -1, 50),
                                 cv=scv,
                                 random_state=seed)
                results = cross_val_predict(model6,
                                            X_train,
                                            y_train,
                                            cv=scv,
                                            n_jobs=-1)
                estimators.append(('LassoCV Regularization', model6))
                estimator_length.append(1)
        else:
            n_classes = len(Counter(y_train))
            if scoring == '':
                scoring = 'accuracy'
            scv = StratifiedKFold(n_splits=FOLDS,
                                  random_state=seed,
                                  shuffle=True)
            if Boosting_Flag:
                ####   Linear Models if Boosting is chosen #####
                model4 = LinearDiscriminantAnalysis()
                results = cross_val_predict(model4,
                                            X_train,
                                            y_train,
                                            cv=scv,
                                            n_jobs=-1,
                                            method='predict_proba')
                estimators.append(('Linear Discriminant', model4))
                estimator_length.append(results.shape[1])
            elif Boosting_Flag is None:
                ####   Tree models if Linear chosen #####
                model6 = DecisionTreeClassifier(min_samples_leaf=2)
                results = cross_val_predict(model6,
                                            X_train,
                                            y_train,
                                            cv=scv,
                                            n_jobs=-1,
                                            method='predict_proba')
                estimators.append(('Decision Tree', model6))
                estimator_length.append(results.shape[1])
            else:
                ######    Naive Bayes models if Bagging is chosen ####
                if n_classes <= 2:
                    try:
                        model7 = GaussianNB()
                    except:
                        model7 = DecisionTreeClassifier(min_samples_leaf=2)
                else:
                    try:
                        model7 = MultinomialNB()
                    except:
                        model7 = DecisionTreeClassifier(min_samples_leaf=2)
                results = cross_val_predict(model7,
                                            X_train,
                                            y_train,
                                            cv=scv,
                                            n_jobs=-1,
                                            method='predict_proba')
                estimators.append(('Naive Bayes', model7))
                estimator_length.append(results.shape[1])
    else:
        #### This is where you fit the model and then predict ########
        if modeltype == 'Regression':
            if scoring == '':
                scoring = 'neg_mean_squared_error'
            scv = KFold(n_splits=FOLDS, random_state=seed, shuffle=True)
            if Boosting_Flag:
                ######    Bagging models if Bagging is chosen ####
                model4 = BaggingRegressor(
                    DecisionTreeRegressor(random_state=seed),
                    n_estimators=NUMS,
                    random_state=seed)
                results = model4.fit(X_train, y_train).predict(X_test)
                estimators.append(('Bagging1', model4))
                estimator_length.append(1)
            elif Boosting_Flag is None:
                ####   Tree models if Linear chosen #####
                model5 = DecisionTreeRegressor(random_state=seed,
                                               min_samples_leaf=2)
                results = model5.fit(X_train, y_train).predict(X_test)
                estimators.append(('Decision Trees', model5))
                estimator_length.append(1)
            else:
                ####   Linear Models if Boosting is chosen #####
                model6 = LassoCV(alphas=np.logspace(-10, -1, 50),
                                 cv=scv,
                                 random_state=seed)
                results = model6.fit(X_train, y_train).predict(X_test)
                estimators.append(('LassoCV Regularization', model6))
                estimator_length.append(1)
        else:
            n_classes = len(Counter(y_train))
            if scoring == '':
                scoring = 'accuracy'
            scv = StratifiedKFold(n_splits=FOLDS,
                                  random_state=seed,
                                  shuffle=True)
            if Boosting_Flag:
                ####   Linear Models if Boosting is chosen #####
                model4 = LinearDiscriminantAnalysis()
                results = model4.fit(X_train, y_train).predict_proba(X_test)
                estimators.append(('Linear Discriminant', model4))
                estimator_length.append(results.shape[1])
            elif Boosting_Flag is None:
                ####   Tree models if Linear chosen #####
                model6 = DecisionTreeClassifier(min_samples_leaf=2)
                results = model6.fit(X_train, y_train).predict_proba(X_test)
                estimators.append(('Decision Tree', model6))
                estimator_length.append(results.shape[1])
            else:
                ######    Naive Bayes models if Bagging is chosen ####
                if n_classes <= 2:
                    try:
                        model7 = GaussianNB()
                    except:
                        model7 = DecisionTreeClassifier(min_samples_leaf=2)
                else:
                    try:
                        model7 = MultinomialNB()
                    except:
                        model7 = DecisionTreeClassifier(min_samples_leaf=2)
                results = model7.fit(X_train, y_train).predict_proba(X_test)
                estimators.append(('Naive Bayes', model7))
                estimator_length.append(results.shape[1])
    #stacks = np.c_[results1,results2,results3]
    estimators_list = [(tuples[0], tuples[1]) for tuples in estimators]
    estimator_names = [tuples[0] for tuples in estimators]
    #### Here is where we consolidate the estimator names and their results into one common list ###
    ls = []
    for x, y in dict(zip(estimator_names, estimator_length)).items():
        els = [x + str(eachy) for eachy in range(y)]
        ls += els
    if verbose == 1:
        print('    Time taken for Stacking: %0.1f seconds' %
              (time.time() - start_time))
    return ls, results
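# Editor's sketch of a call (hypothetical array names), based on the signature
# above: the function returns the generated stack-column names and the stacked
# predictions for the chosen base model.
#stack_cols, stack_preds = QuickML_Stacking(X_train, y_train, X_test,
#                                           modeltype='Regression',
#                                           Boosting_Flag=True, verbose=1)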


#########################################################
ax2.set_title('Error between actual and predicted loads')
ax2.set_ylabel("Error, MW")

featImportances=gradBoost.feature_importances_
pos = np.arange(len(features))
pairs = zip(features, featImportances)
sorted_pairs = sorted(pairs, key = lambda pair: pair[1])
features_sorted, featImportances_sorted = zip(*sorted_pairs)
fig, ax = plt.subplots()
plt.barh(pos, featImportances_sorted, 1, color = "blue")
plt.yticks(pos,features_sorted)
ax.set_title('Gradient Boosting: Relative Feature Importance')

#Tree Bagging
TreeBagger=BaggingRegressor()
TreeBagger.fit(Xtrain, Ytrain)
fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1)
ax1.plot_date(dates, modeldata.Load[45000:50000], 'r-',tz=None, xdate=True,
          ydate=False, label='Actual Load')
ax1.set_title('Tree Bagging: Actual and Predicted Loads')          
plt.plot(dates, TreeBagger.predict(Xtest), 'g-',label='Predicted Load')
ax1.legend()
ax2 = fig.add_subplot(2, 1, 2)
ax2.plot_date(dates, modeldata.Load[45000:50000]-TreeBagger.predict(Xtest), 'r-',tz=None, xdate=True,
          ydate=False)
ax2.set_title('Error between actual and predicted loads, MW')

MSEs_Bagging=[mean_squared_error(Ytest, TreeBagger.predict(Xtest)), mean_squared_error(Ytrain, TreeBagger.predict(Xtrain))]

#Model Comparison: Bar charts
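#Editor's sketch for the announced bar chart, assuming MSE pairs for the other
#models (e.g. MSEs_GradBoost) were collected the same way as MSEs_Bagging above:
#fig, ax = plt.subplots()
#ax.bar(range(len(MSEs_Bagging)), MSEs_Bagging, color='blue')
#ax.set_xticks(range(len(MSEs_Bagging)))
#ax.set_xticklabels(['Bagging test', 'Bagging train'])
#ax.set_title('Model Comparison: Test vs. Train MSE, MW^2')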
Example #43
0
###########################################################
# Bagging Methods
###########################################################

from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5)

# usual knn
knn.fit(xtrain,ytrain)
knn.score(xtrain,ytrain)
knn.score(xtest,ytest)

# full bagging
bf = BaggingRegressor(knn,n_estimators=100,max_samples=1.0,max_features=1.0,random_state=0)
bf.fit(xtrain,ytrain)
bf.score(xtrain,ytrain)
bf.score(xtest,ytest)

# bagging with subsampling and feature randomization
bf = BaggingRegressor(knn,n_estimators=500,max_samples=0.5,max_features=0.5)
bf.fit(xtrain,ytrain)
bf.score(xtrain,ytrain)
bf.score(xtest,ytest)

# effect of estimators
np.random.seed(0)
n_list = [1,5,10,20,30,50,100,200,500,1000]
s = np.zeros((len(n_list),2))
for i in range(len(n_list)):
    bf = BaggingRegressor(knn,n_estimators=n_list[i],max_samples=0.5,max_features=0.5)
    bf.fit(xtrain,ytrain)
    s[i,0] = bf.score(xtrain,ytrain)
    s[i,1] = bf.score(xtest,ytest)
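# Editor's sketch: visualize the effect of n_estimators using the scores
# collected in s above (train R^2 in column 0, test R^2 in column 1).
import matplotlib.pyplot as plt
plt.plot(n_list, s[:, 0], 'o-', label='train R^2')
plt.plot(n_list, s[:, 1], 's-', label='test R^2')
plt.xscale('log')
plt.xlabel('n_estimators')
plt.ylabel('R^2 score')
plt.legend()
plt.show()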
        
        lm_bagged = BaggingRegressor(
          base_estimator = lm, 
          n_estimators = 75, 
          max_samples = n_samp, 
          max_features = n_feat,
          bootstrap = True, 
          oob_score = False, 
          warm_start = False, 
          n_jobs = -1
        )
        
        log_bagged = BaggingClassifier(
          base_estimator = log, 
          n_estimators = 75, 
          max_samples = n_samp, 
          max_features = n_feat,
          bootstrap = True, 
          oob_score = False, 
          warm_start = False, 
          n_jobs = -1
        )
        
        lm_bagged.fit(X = train[features], y = train['y'])
        log_bagged.fit(X = train[features], y = train['y'])        
        lm_bagged_preds = lm_bagged.predict(X = test[features])
        log_bagged_preds = log_bagged.predict_proba(X = test[features])
        
        write_function(lm_bagged_preds, '/tmp/lm_bagged_preds_nsamp-%s_nfeat-%s.txt' % (n_samp, n_feat))
        write_function(second_pos_clip(log_bagged_preds), '/tmp/log_bagged_preds_nsamp-%s_nfeat-%s.txt' % (n_samp, n_feat))
Example #45
0
from sklearn.model_selection import cross_val_score
clf = RandomForestRegressor()
scores = cross_val_score(clf, X_test, y_test, cv=5)
scores.mean()

#mean absolute error in $
mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is: $", mae)
#checking r^2
from sklearn.metrics import r2_score

print("r2 score:", r2_score(y_test, y_pred))

bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
bg.score(X_train, y_train)
bg.score(X_test, y_test)

#Adaboosting
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

#Decision
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

#gradientBoost
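#Editor's sketch for the announced gradient-boosting step (the original code is
#cut off here); assumes the same X_train/X_test split used by the models above:
#from sklearn.ensemble import GradientBoostingRegressor
#gbr = GradientBoostingRegressor()
#gbr.fit(X_train, y_train)
#gbr.score(X_test, y_test)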
    plt.legend(loc='upper right')
    plt.grid(True)

    plt.subplot(132)
    t = np.arange(N)
    plt.plot(t, x, 'r-', lw=1, label=u'raw data')
    plt.plot(abnormal, x[abnormal], 'go', markeredgecolor='g', ms=3, label=u'outliers')
    plt.legend(loc='upper right')
    plt.title(u'Outlier detection', fontsize=18)
    plt.grid(True)

    # Prediction
    plt.subplot(133)
    select = np.ones(N, dtype=bool)
    select[abnormal] = False
    t = np.arange(N)
    dtr = DecisionTreeRegressor(criterion='squared_error', max_depth=10)  # 'mse' was renamed to 'squared_error' in newer sklearn
    br = BaggingRegressor(dtr, n_estimators=10, max_samples=0.3)
    br.fit(t[select].reshape(-1, 1), x[select])
    y = br.predict(np.arange(N).reshape(-1, 1))
    y[select] = x[select]
    plt.plot(x, 'g--', lw=1, label=u'original values')    # original values
    plt.plot(y, 'r-', lw=1, label=u'corrected values')     # corrected values
    plt.legend(loc='upper right')
    plt.title(u'Outlier correction', fontsize=18)
    plt.grid(True)

    plt.tight_layout(pad=1.5, rect=(0, 0, 1, 0.95))
    plt.suptitle(u'Outlier detection and correction for sewage discharge data', fontsize=22)
    plt.show()
            np.transpose(X), err=0.1, pct=50)
        end = time.time()
        time_all[j, 3] = end - start

        print('\n****** DTM *******\n')
        rng = np.random.RandomState(42)
        max_samples = min(20, X.shape[0])
        bag_neigh = 1
        clf_spDTM = BaggingRegressor(base_estimator=DTM(n_neighbors=bag_neigh,
                                                        contamination=0.1),
                                     n_estimators=1,
                                     max_samples=max_samples,
                                     bootstrap=False,
                                     random_state=rng)
        start = time.time()
        y_score_spDTM = clf_spDTM.fit(X, y).predict(X)

        end = time.time()
        time_all[j, 4] = end - start

        precision_iso, recall_iso, thresholds_iso = metrics.precision_recall_curve(
            y, -iso_scores, pos_label=1)
        precision_lof, recall_lof, thresholds_lof = metrics.precision_recall_curve(
            y, -lof_scores, pos_label=1)
        precision_osvm, recall_osvm, thresholds_osvm = metrics.precision_recall_curve(
            y, -osvm_scores, pos_label=1)
        precision_our, recall_our, thresholds_our = metrics.precision_recall_curve(
            y, -our_scores, pos_label=1)
        precision_dtm, recall_dtm, thresholds_dtm = metrics.precision_recall_curve(
            y, -y_score_spDTM, pos_label=1)
Example #48
0
def bootstrapped_ci(base_estimator,
                    x,
                    y,
                    n_resamples=100,
                    is_regression=True,
                    n_jobs=-1,
                    verbose=0):
    """
    Use bootstrapped models to get distributions of params and estimate credible intervals.
    Those credible intervals are then used to estimate the distance of the mean from 0.

    >>> from sklearn.linear_model import LinearRegression
    >>> from sklearn.datasets import load_breast_cancer
    >>> x, y = load_breast_cancer(return_X_y=True)
    >>> _ = bootstrapped_ci(LinearRegression(), x, y)
    These are not p-values!
    >>> print('Done')
    Done

    :param n_jobs:
    :param verbose:
    :param base_estimator: SKLearn style linear model that has a coef_ attribute after fitting.
    :param x:
    :param y:
    :param n_resamples: How many Bootstrap draws to make
    :param is_regression:
    :return:
    """
    import scipy.stats as stats
    from rosey.helpers import np_min
    from sklearn.ensemble import BaggingRegressor, BaggingClassifier

    print('These are not p-values!')
    if is_regression:
        # Fit Models
        bootstrapped_models = BaggingRegressor(base_estimator,
                                               n_estimators=n_resamples,
                                               bootstrap=True,
                                               bootstrap_features=False,
                                               n_jobs=n_jobs,
                                               verbose=verbose)
        bootstrapped_models.fit(x, y)

        # Get params
        model_coefs = np.vstack(
            [est.coef_ for est in bootstrapped_models.estimators_])
        coefs_mu = model_coefs.mean(axis=0)
        coefs_sd = model_coefs.std(axis=0)
        vec = coefs_mu.shape

        # Compute p values
        p_values = np_min(
            stats.norm.pdf(np.zeros(vec), loc=coefs_mu, scale=coefs_sd),
            stats.norm.cdf(np.zeros(vec), loc=coefs_mu, scale=coefs_sd))
        if any(p_values > 1):
            warnings.warn('Bad p-values detected')

        return pd.DataFrame(np.vstack([coefs_mu, coefs_sd, p_values]).T,
                            columns=['mu', 'sd', 'p'])
    else:
        raise NotImplementedError
Example #49
0
rf = RandomForestRegressor()
br = BaggingRegressor(rf)

pipe = pipeline.Pipeline([('rf', rf), ('br', br)])

parameters = dict(rf__n_estimators=[5, 10, 15, 20], rf__max_depth=[2, 4, 6, 8, 10], rf__random_state=[0, 5, 10, 15],
	br__n_estimators=[5, 15, 25, 35, 45, 55], br__max_samples=[0.1, 0.2, 0.3], br__random_state=[0, 5, 10, 15, 20, 25, 30])
model = grid_search.GridSearchCV(pipe, parameters)
model.fit(features_train, labels_train)

print("Best parameters:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

#Best parameters:
#{'br__max_samples': 0.1, 'br__n_estimators': 45, 'rf__max_depth': 6, 'br__random_state': 25, 'rf__random_state': 0, 'rf__n_estimators': 5}
#Best CV score: 0.13390585367

pred = model.predict(features_test)
"""

# Use the best parameters from gridsearch
rf = RandomForestRegressor(n_estimators=5, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

# Write predicted numbers to submission.csv file
pd.DataFrame({"id": id_test, "relevance": pred}).to_csv('submission.csv',index=False)
Example #50
0
del globals()['profilesLSo']
del globals()['profilesLS']
del globals()['row']
del globals()['tmpLS']
del globals()['tmpAGE']
del globals()['profsTOlikes']
del globals()['i']
del globals()['tmpIND']

seed = 7
myRand = np.random.seed(seed)
X_train, X_test, y_train, y_test = train_test_split(likesMAT,
                                                    neusARR,
                                                    test_size=1500)

nJOBS = int(sys.argv[1])
nEST = int(sys.argv[2])
bagOUT = BaggingRegressor(n_jobs=nJOBS, n_estimators=nEST, oob_score=True)

#bagOUT.fit(likesMAT, neusARR)
bagOUT.fit(X_train, y_train)

y_pred = bagOUT.predict(X_test)
import math

myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("neus, bagOUT:  ", str(nEST), " ", myRMSE)

# joblib.dump(bagOUT, "/Users/jamster/bagOUT-A-neus.xz", compress=9)

# impbagOUT = joblib.load("/Users/jamster/bagOUT-A-neus.xz")
Example #51
0
model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, cv=5, verbose=0, scoring=RMSE)
errors = []
X_train = df.drop(['product_uid', 'id', 'relevance'], axis=1).values
y_train = df['relevance'].values
model.fit(X_train, y_train)

print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

del X_train, y_train



kf = KFold(df.shape[0], n_folds=K_fold)
for train_index, test_index in kf:
    train_set = df.iloc[train_index]
    test_set = df.iloc[test_index]

    y_train = train_set['relevance'].values
    X_train = train_set.drop(['product_uid', 'id', 'relevance'], axis=1).values
    y_test = test_set['relevance'].values
    X_test = test_set.drop(['product_uid', 'id', 'relevance'], axis=1).values

    clf2.fit(X_train,y_train)

    result = clf2.predict(X_test)
    error = np.sqrt(mean_squared_error(result,y_test))
    errors.extend([error])
print(np.mean(errors))
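# Editor's note: the KFold call above uses the old (pre-0.18) sklearn API; the
# equivalent with the modern sklearn.model_selection API would be:
#from sklearn.model_selection import KFold
#kf = KFold(n_splits=K_fold)
#for train_index, test_index in kf.split(df):
#    ...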
# Model
#---------------------------------------------------------------------------------
if not hyperparam_opt:
    rf = RandomForestRegressor(n_estimators=20,
                               bootstrap=True,
                               min_samples_leaf=15,
                               min_samples_split=15,
                               max_features=3,
                               max_depth=10)
    clf = BaggingRegressor(rf, n_estimators=5, max_samples=0.1, random_state=0)
    bf = GradientBoostingRegressor(n_estimators=5, max_depth=6, random_state=0)

    rf.fit(X_train, y_train)
    print("random forest fitted...")

    clf.fit(X_train, y_train)
    print("bagging fitted...")

    bf.fit(X_train, y_train)
    print("boosting fitted...")
else:
    rf = RandomForestRegressor(n_estimators=20)
    clf = BaggingRegressor(rf, n_estimators=20)
    bf = GradientBoostingRegressor(
        n_estimators=20)  #min_samples_split=2,learning_rate=0.01, loss='ls')

    all_models = [{'rf': rf}, {'clf': clf}, {'bf': bf}]
    all_results = []
    for model in all_models:
        all_results.append(rand_search(model, True))
    rf, clf, bf = all_results
Example #53
0
# Getting Testing Data out of the DF
test_data_frame = data_frame_regression.iloc[num_train:]

# Getting IDs for Testing Data
id_test = test_data_frame['id']

relevance_train = train_data_frame['relevance'].values

# All the Independent Variables in the Regressor
# These are Words in Title, Description, Values
X_train = train_data_frame.drop(['id', 'relevance'], axis=1).values

# Same for Test Data
X_test = test_data_frame.drop(['id', 'relevance'], axis=1).values

# Using RandomForest Regressor
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)

# Using Bagging Regressor
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)

# Fit the Training Data to a Model
clf.fit(X_train, relevance_train)

# Predicting the relevance for Testing Data
relevance_pred = clf.predict(X_test)

# Writing the Relevance Values to Submission.csv
pandas.DataFrame({"id": id_test, "relevance": relevance_pred}).to_csv('submission.csv', index=False)
Example #54
0
y = tran_np[:, 7]

#-----------input------
x = tran_np[:, :7]

clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')  # liblinear supports the l1 penalty

#-----------use bagging------
bagging_clf = BaggingRegressor(clf,
                               n_estimators=20,
                               max_features=1.0,
                               max_samples=0.8,
                               bootstrap=True,
                               bootstrap_features=False,
                               n_jobs=-1)
bagging_clf.fit(x, y)

print(clf)

#-----------testdata------
test = pd.read_csv('/home/perfecum/下载/test.csv', header=0)
test_df = test.filter(
    regex='Dealership|Showroom|ComputerSearch|M5|3Series|Z4|Financing')
print(test_df)
test_np = test_df.to_numpy()  # as_matrix() was removed in newer pandas

predictions = bagging_clf.predict(test_np)  # predict on the numpy matrix prepared above

print(predictions)

#-----------something test------
Example #55
0

out_dir = "STS-en-{}-{}".format(GROUP, APPROACH)
if not os.path.exists(out_dir): os.mkdir(out_dir)

filenames = []

for sts12_train_id, sts12_test_id, sts13_test_id, sts14_test_id in id_pairs:
    # combine 2012, 2013 training and test data
    X_sts12_train, y_sts12_train = ntnu_sts12.read_train_data(sts12_train_id, feats)
    X_sts12_test, y_sts12_test = ntnu_sts12.read_test_data(sts12_test_id, feats)
    X_sts13_test, y_sts13_test = sts13.read_test_data(sts13_test_id, feats)
    X_train = np.vstack([X_sts12_train, X_sts12_test, X_sts13_test])
    y_train = np.hstack([y_sts12_train, y_sts12_test, y_sts13_test])

    regressor.fit(X_train, y_train)

    X_test = read_blind_test_data(sts14_test_id, feats)
    y_test = regressor.predict(X_test)

    test_input = read_system_input(test_input_fnames[sts14_test_id])
    postprocess(test_input,  y_test)

    fname =  "{}/STS-en.output.{}.txt".format(out_dir, sts14_test_id)
    write_scores(fname, y_test)
    filenames.append(fname)

descr_fname = "{}/STS-en-{}-{}.description.txt".format(out_dir, GROUP, APPROACH)
open(descr_fname, "w").write(DESCRIPTION)
filenames.append(descr_fname)
Example #56
0
def train_bagging_cart(X, Y):
    bagging = BaggingRegressor(DecisionTreeRegressor(max_depth=5), max_features=0.7, n_estimators=30)
    bagging.fit(X, Y)
    return bagging
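#Editor's sketch of a call (hypothetical arrays), mirroring the other train_* helpers:
#model = train_bagging_cart(X_train, Y_train)
#Y_pred = model.predict(X_test)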
Example #57
0
    X[np.isnan(X)] = 0.

    print('******************************************')
    print(name)
    print('******************************************')
    
    if name=='Boston' or name=='Diabetes': # Regression problem
    
        rfr = RandomForestRegressor(**params)
        rfr.fit(X, y)
        print('Score RandomForestRegressor = %s' % (rfr.score(X, y)))
        scores_rfr = cross_val_score(rfr, X, y, cv=5)
        print('Cross Val Score RandomForestRegressor = %s' % (np.mean(scores_rfr)))
        
        br = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=max_depth), n_estimators=n_estimators)
        br.fit(X, y)
        print('Score BaggingRegressor = %s' % (br.score(X, y)))
        scores_br = cross_val_score(br, X, y, cv=5)
        print('Cross Val Scores of BR = %s' % (np.mean(scores_br)))
        
    if name=='Iris' or name=='Digits': # Classification problem
    
        rfc = RandomForestClassifier(**params)
        rfc.fit(X, y)
        print('Score RandomForestClassifier = %s' % (rfc.score(X, y)))
        scores_rfc = cross_val_score(rfc, X, y, cv=5)
        print('Cross Val Scores of RandomForestClassifier = %s' % (np.mean(scores_rfc)))

        bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=max_depth), n_estimators=n_estimators)
        bc.fit(X, y)        
        print('Score BaggingClassifier = %s' % (bc.score(X, y)))
Example #58
0
print('Final Score %f' % score)
print('Final Out-of-Fold Score %f' % oof_score)
print('=====================')

ens0_pred = prediction1

submission         = pd.read_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/input/sample_submission.csv')
submission.y       = ens0_pred
submission.id      = id
submission.columns = ['ID', 'y']
submission.to_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/output/layer2_adaboostregressor_cv%f.csv' % oof_score, index=False)

print("Ensemble Model 1: BaggingRegressor")
ens1  = BaggingRegressor(DecisionTreeRegressor(max_depth=4), random_state=1337) #, learning_rate=.05, n_estimators=300,

ens1.fit(df, train.y)

# In Sample R2
ens1_insample_pred = ens1.predict(df)
r2_score(train.y, ens1_insample_pred ) # 0.62753671854582205 0.6998279121628439

# Predict
ens1_pred = ens1.predict(df_test) # LB: -0.77554

submission.y       = ens1_pred
submission.id      = id
submission.columns = ['ID', 'y']
submission.to_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/output/layer2_baggingreg.csv', index=False)

print("Ensemble Model 2: ExtraTreesRegressor")
ens2  = ExtraTreesRegressor(n_estimators=10, criterion='mse', max_depth=None, min_samples_split=2,
df_all['letter_in_description'] = df_all['product_info'].map(
        lambda x: str_common_letter(x.split('\t')[0], x.split('\t')[2]))

print("Drop columns that were changed...")
df_all = df_all.drop(['search_term', 'product_title', 'product_description', 'product_info'], axis=1)

# Set up training and test sets
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]

id_test = df_test['id']
y_train = df_train['relevance'].values

# Drop 'id' and 'relevance' columns from the training and test sets
X_train = df_train.drop(['id', 'relevance'], axis=1).values
X_test = df_test.drop(['id', 'relevance'], axis=1).values

# Setup RandomForest and Bagging Regressors
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)

# Fit the training data into the regression model using the output values
clf.fit(X_train, y_train)

# Run the prediction
y_pred = clf.predict(X_test)

# Set up our Data Frame
datafr = pd.DataFrame({"id": id_test, "relevance": y_pred})
datafr.to_csv('../dataset/submission.csv', index=False)
print(datafr)
Example #60
0
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingRegressor

movies = pd.read_csv('avaliacoes_usuario.csv')
caracteristicas = movies[movies.columns[1:16]]
gostos = movies[movies.columns[16:]]

treino, teste, treino_marcacoes, teste_marcacoes = train_test_split(
    caracteristicas, gostos)

treino = np.array(treino).reshape(len(treino), 15)
teste = np.array(teste).reshape(len(teste), 15)
treino_marcacoes = np.array(treino_marcacoes).reshape(len(treino_marcacoes), 1)
teste_marcacoes = np.array(teste_marcacoes).reshape(len(teste_marcacoes), 1)

#Thinking in terms of regression - taking the average
modelo = BaggingRegressor()
modelo.fit(treino, treino_marcacoes.ravel())
modelo.score(treino, treino_marcacoes)
modelo.score(teste, teste_marcacoes)
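# Editor's note: in a plain script the two bare score() expressions above display
# nothing; printing them makes the train/test comparison visible:
print('train R^2:', modelo.score(treino, treino_marcacoes))
print('test R^2:', modelo.score(teste, teste_marcacoes))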