def train_model(train, test, labels):
    rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=10)
    # rf = RandomForestRegressor(n_estimators=45, max_depth=9, random_state=10)
    clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.2, random_state=25)
    clf.fit(train, labels)
    # clf = SVR(C=1.0, epsilon=0.2)
    # clf.fit(train, labels)
    # clf = GaussianNB()
    # clf.fit(train, labels)
    print("Good!")
    predictions = clf.predict(test)
    print(predictions.shape)
    predictions = pd.DataFrame(predictions, columns=['relevance'])
    print("Good again!")
    print("Predictions head -------")
    print(predictions.head())
    print(predictions.shape)
    print("TEST head -------")
    print(test.head())
    print(test.shape)
    # test['id'].to_csv("TEST_TEST.csv", index=False)
    # predictions.to_csv("PREDICTIONS.csv", index=False)
    # test = test.reset_index()
    # predictions = predictions.reset_index()
    # test = test.groupby(level=0).first()
    # predictions = predictions.groupby(level=0).first()
    predictions = pd.concat([test['id'], predictions], axis=1, verify_integrity=False)
    print(predictions)
    return predictions
def model_fit_rf_bagging():
    def in_limits(x):
        # Clamp predictions into the valid relevance range [1, 3].
        if x < 1:
            return 1
        if x > 3:
            return 3
        return x

    print("STARTING MODEL")
    X = full_data[['count_words', 'count_digits', 'match_d_title', 'match_d_description',
                   'match_w_title', 'match_w_description', 'match_d_attribute',
                   'match_w_attribute']].values
    y = full_data['relevance'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
    clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    in_limits = np.vectorize(in_limits, otypes=[float])  # np.float is deprecated; use float
    y_pred = in_limits(y_pred)
    RMSE = mean_squared_error(y_test, y_pred) ** 0.5
    print("RMSE: ", RMSE)

    # for the submission
    real_X_test = real_full_test[['count_words', 'count_digits', 'match_d_title',
                                  'match_d_description', 'match_w_title',
                                  'match_w_description', 'match_d_attribute',
                                  'match_w_attribute']].values
    test_pred = clf.predict(real_X_test)
    test_pred = in_limits(test_pred)
    return test_pred
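# The vectorized in_limits above can be expressed as a single NumPy call; this
# is a minimal equivalent sketch (same [1, 3] clamp, array in, array out):
import numpy as np

def clip_predictions(y_pred):
    """Clamp relevance predictions into [1, 3] with np.clip."""
    return np.clip(y_pred, 1, 3)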
def avmPredict(params):
    town = getPlace(params['lat'], params['long'])[0]
    x, y, z = getXYZ(params['lat'], params['long'])
    r = 1.0

    data = []
    target = []
    header = []
    with open('../../../data/working22.csv') as f:
        f = csv.reader(f)
        header = next(f)
        for row in f:
            # list(...) so the mapped floats are a concrete sequence under Python 3
            t = (list(map(float, row[:3] + row[4:])), float(row[3]))
            if weightF([x, y, z], t[0][0:3], r):
                data.append(t[0])
                target.append(t[1])

    ensemble = BaggingRegressor()
    ensemble.fit(data, target)

    test = createTest(params)
    return ensemble.predict(test)
def train_bagging_xgboost(X, Y):
    # Despite the original variable name ("adaboost"), this is a bagging ensemble.
    bagging = BaggingRegressor(
        xgb.XGBRegressor(max_depth=6, learning_rate=0.02, n_estimators=300,
                         silent=True, objective='reg:linear', subsample=0.7,
                         reg_alpha=0.8, reg_lambda=0.8, booster="gblinear"),
        max_features=0.7, n_estimators=30)
    bagging.fit(X, Y)
    return bagging
def random_forest(X, Y, Xt):
    print('learn')
    rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
    clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
    clf.fit(X, Y)
    print('predict')
    Yp_clamped = clf.predict(Xt)
    return Yp_clamped
def procedureA(goldenFlag=False):
    # Trains and generates a prediction file
    # Uses hard heuristic for buy_or_not
    popFlag = True
    X, Y = getDataXY(currYearFlag=False, popFlag=popFlag)
    X, Y = shuffle(X, Y, random_state=0)

    if popFlag:
        encoder = oneHot(X[:, 2:])
        Xt = encoder.transform(X[:, 2:])
        Xt = np.hstack((X[:, :2], Xt))
    else:
        encoder = oneHot(X)
        Xt = encoder.transform(X)

    buySet = set()
    for i in range(X.shape[0]):
        tmpTup = (X[i][0], X[i][2])
        buySet.add(tmpTup)

    # Y_buy = [1] * Xt.shape[0]
    min_max_scaler = preprocessing.MinMaxScaler()
    # Xt = min_max_scaler.fit_transform(Xt)

    if goldenFlag:
        print(Xt.shape)
        Xt = getGoldenX(Xt, 2, 2 + encoder.feature_indices_[1],
                        2 + encoder.feature_indices_[0],
                        2 + min(9, encoder.feature_indices_[1]))

    split = 0.9
    X_train, X_test = Xt[:int(Xt.shape[0] * split), :], Xt[int(Xt.shape[0] * split):, :]
    Y_train, Y_test = Y[:int(Y.shape[0] * split), :], Y[int(Y.shape[0] * split):, :]
    Y_train = Y_train.ravel()
    Y_test = Y_test.ravel()

    print(X_train.shape)
    print(X_test.shape)

    # clf = Ridge(alpha=100)
    # clf = SVR(C=10.0, kernel='poly', degree=2)
    # clf = LinearSVR(C=1.0)
    clf = BaggingRegressor(DecisionTreeRegressor(), n_estimators=125, n_jobs=4, random_state=0)
    # clf = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=100)
    # clf = DecisionTreeRegressor()
    # clf = RandomForestRegressor(random_state=0, n_estimators=200, n_jobs=4)
    clf.fit(X_train, Y_train.ravel())
    Y_pred = clf.predict(X_test)

    evaluatePred(Y_pred, Y_test)
    return clf, encoder, min_max_scaler
def train_model(training, testing, window=5, n=5):
    X_train, y_train = prepare_data(training)
    X_test, y_test = prepare_data(testing)

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    predrf = rf.predict(X_test)
    print("mse for random forest regressor: ", mean_squared_error(predrf, y_test))

    gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.025)
    gb.fit(X_train, y_train)
    predgb = gb.predict(X_test)
    print("mse for gradient boosting regressor: ", mean_squared_error(predgb, y_test))

    ## plot feature importance using GBR results
    fx_imp = pd.Series(gb.feature_importances_, index=['bb', 'momentum', 'sma', 'volatility'])
    fx_imp /= fx_imp.max()  # normalize
    fx_imp.sort()  # in-place sort (old pandas API)
    ax = fx_imp.plot(kind='barh')
    fig = ax.get_figure()
    fig.savefig("output/feature_importance.png")

    adb = AdaBoostRegressor(DecisionTreeRegressor())
    adb.fit(X_train, y_train)
    predadb = adb.predict(X_test)
    print("mse for adaboosting decision tree regressor: ", mean_squared_error(predadb, y_test))

    scale = StandardScaler()
    scale.fit(X_train)
    X_trainscale = scale.transform(X_train)
    X_testscale = scale.transform(X_test)

    knn = BaggingRegressor(KNeighborsRegressor(n_neighbors=10), max_samples=0.5, max_features=0.5)
    knn.fit(X_trainscale, y_train)
    predknn = knn.predict(X_testscale)
    print("mse for bagging knn regressor: ", mean_squared_error(predknn, y_test))

    pred_test = 0.1 * predrf + 0.2 * predgb + 0.1 * predadb + 0.6 * predknn
    print("mse for ensemble of all the regressors: ", mean_squared_error(pred_test, y_test))

    result = testing.copy()
    result.ix[5:-5, 'trend'] = pred_test
    result.ix[10:, 'pred'] = pred_test * result.ix[5:-5, 'IBM'].values
    result.ix[:-5, 'pred_date'] = result.index[5:]

    return result
def procc_modelfusion(df_test, data_test):
    from sklearn.ensemble import BaggingRegressor
    from sklearn import linear_model

    train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
    train_np = train_df.as_matrix()

    # y is the Survived outcome
    y = train_np[:, 0]
    # X is the feature matrix
    X = train_np[:, 1:]

    # wrap the logistic regression in a BaggingRegressor
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    bagging_clf = BaggingRegressor(clf, n_estimators=10, max_samples=0.8, max_features=1.0,
                                   bootstrap=True, bootstrap_features=False, n_jobs=-1)
    bagging_clf.fit(X, y)

    test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
    predictions = bagging_clf.predict(test)
    result = pd.DataFrame({'PassengerId': data_test['PassengerId'].as_matrix(),
                           'Survived': predictions.astype(np.int32)})
    result.to_csv("logistic_regression_predictions3.csv", index=False)
class Regressor(BaseEstimator):
    def __init__(self):
        # self.clf = GradientBoostingRegressor(n_estimators=200, max_features="sqrt", max_depth=5)
        # self.clf = LinearRegression()
        self.clf = BaggingRegressor(LinearRegression())
        # self.clf = GaussianProcess(theta0=4)
        # self.sp = RandomizedLasso()
        self.sp = SparseRandomProjection(n_components=5)
        # self.sp = TruncatedSVD()
        # self.sp = KernelPCA(n_components=3, tol=0.0001, kernel="poly")
        # self.clf = ExtraTreesRegressor(n_estimators=200, max_features="sqrt", max_depth=5)

    def fit(self, X, y):
        # print(self.sp)
        # Xr = self.sp.fit_transform(X, y)
        self.clf.fit(X, y.ravel())

    def predict(self, X):
        # Xr = self.sp.transform(X)
        return self.clf.predict(X)
def get_bagging_prediction(X_train, y_train, X_test, X_valid=None, GS=False):
    if not GS:
        rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
        clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        if X_valid is None:
            return y_pred
        else:
            return y_pred, clf.predict(X_valid)
    else:
        rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
        clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
        # Parameters of the wrapped forest must be addressed through the bagging
        # estimator as 'base_estimator__<param>'; the original 'rfr__*' keys
        # would not match any parameter of clf.
        param_grid = {'base_estimator__max_features': [10],
                      'base_estimator__max_depth': [20]}
        model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1,
                                         cv=2, verbose=VERBOSE, scoring=RMSE)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        if X_valid is None:
            return y_pred
        else:
            return y_pred, model.predict(X_valid)
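# grid_search.GridSearchCV above is the pre-0.18 scikit-learn location. A
# minimal modern sketch of the same search (assuming scikit-learn >= 0.18; the
# built-in 'neg_mean_squared_error' scorer stands in for the custom RMSE object):
from sklearn.model_selection import GridSearchCV

def tune_bagged_forest(clf, X_train, y_train):
    # Address the wrapped forest's parameters through the bagging wrapper.
    param_grid = {'base_estimator__max_depth': [10, 20]}
    search = GridSearchCV(clf, param_grid=param_grid, cv=2,
                          scoring='neg_mean_squared_error', n_jobs=-1)
    search.fit(X_train, y_train)
    return search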
def runTests():
    # Generate the training samples, extract training features and target
    trainSamples = GenSamples(numSamples)
    trainFeatures = extractFeatures(trainSamples)
    trainPred = extractPred(trainSamples)

    # Generate the test samples, extract test features and target
    testSamples = GenSamples(numTestSamples)
    testFeatures = extractFeatures(testSamples)
    testPred = extractPred(testSamples)

    R2List = OrderedDict()
    R2List['TrainROI'] = []
    R2List['TestROI'] = []
    print('Running Tests: ')
    for i in range(numTests):
        # bootstrap is True by default, i.e. sampling with replacement
        # bootstrap_features is False by default, i.e. all features used
        classifier = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                      n_estimators=numTrees,
                                      max_samples=int(0.5 * numSamples),
                                      max_features=1)
        classifier.fit(trainFeatures, trainPred)
        predictROI = {}
        predictROI['Training'] = classifier.predict(trainFeatures)
        predictROI['Test'] = classifier.predict(testFeatures)
        R2 = {}
        R2['Train'] = r2_score(trainPred, predictROI['Training'])
        R2['Test'] = r2_score(testPred, predictROI['Test'])
        R2List['TrainROI'].append(R2['Train'])
        R2List['TestROI'].append(R2['Test'])
    print('Best Train ROI: ', max(R2List['TrainROI']))
    print('Best Test ROI: ', max(R2List['TestROI']))
def test_bagging_regressor_with_missing_inputs():
    # Check that BaggingRegressor can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, np.NINF, 6],
    ])
    y_values = [
        np.array([2, 3, 3, 3, 3]),
        np.array([
            [2, 1, 9],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
        ])
    ]
    for y in y_values:
        regressor = DecisionTreeRegressor()
        pipeline = make_pipeline(
            Imputer(),
            Imputer(missing_values=np.inf),
            Imputer(missing_values=np.NINF),
            regressor
        )
        pipeline.fit(X, y).predict(X)
        bagging_regressor = BaggingRegressor(pipeline)
        y_hat = bagging_regressor.fit(X, y).predict(X)
        assert_equal(y.shape, y_hat.shape)

    # Verify that exceptions can be raised by wrapper regressor
    regressor = DecisionTreeRegressor()
    pipeline = make_pipeline(regressor)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_regressor = BaggingRegressor(pipeline)
    assert_raises(ValueError, bagging_regressor.fit, X, y)
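# Imputer was removed in scikit-learn 0.22; a minimal sketch of the same
# mean-imputation step with its replacement, SimpleImputer:
import numpy as np
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_filled = imputer.fit_transform([[1.0, 3.0], [2.0, np.nan]])  # nan -> column mean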
class BaggingRegressor(BaseEstimator):
    """
    Usage:

    ```
    "model": {
        "class": "ume.ensemble.BaggingRegressor",
        "params": {
            "base_estimator": {
                "class": "sklearn.svm.SVR",
                "params": {
                    "kernel": "rbf",
                    "degree": 1,
                    "C": 1000000.0,
                    "epsilon": 0.01,
                },
            },
            "bag_kwargs": {
                "n_estimators": 100,
                "n_jobs": 5,
                "max_samples": 0.9,
            },
        }
    }
    ```
    """
    def __init__(self, base_estimator=None, bag_kwargs=None):
        klass = dynamic_load(base_estimator['class'])
        svr_reg = klass(**base_estimator['params'])
        self.__clf = SK_BaggingRegressor(base_estimator=svr_reg, **bag_kwargs)

    def fit(self, X, y):
        return self.__clf.fit(X, y)

    def predict(self, X):
        return self.__clf.predict(X)
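# A minimal sketch of instantiating the wrapper above directly, mirroring the
# config block in its docstring (this assumes ume's dynamic_load resolves the
# dotted class path, as the __init__ code implies):
reg = BaggingRegressor(
    base_estimator={"class": "sklearn.svm.SVR",
                    "params": {"kernel": "rbf", "degree": 1, "C": 1000000.0, "epsilon": 0.01}},
    bag_kwargs={"n_estimators": 100, "n_jobs": 5, "max_samples": 0.9})
reg.fit(X, y)              # delegates to the wrapped sklearn BaggingRegressor
predictions = reg.predict(X)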
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
data = inspection_data.select_dtypes(include=numerics)
data = pd.DataFrame(data)
y = data['SCORE']
data.drop('SCORE', axis=1, inplace=True)

# create training and testing vars
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3)
# print(X_train.shape, y_train.shape)
# print(X_test.shape, y_test.shape)

dt_clf = DecisionTreeRegressor(splitter="random", max_leaf_nodes=16, random_state=0)
bag_clf = BaggingRegressor(dt_clf, n_estimators=500, max_samples=1.0,
                           bootstrap=True, n_jobs=-1, random_state=0)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
# print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# iris = load_iris()
## no early stopping defined, so it goes the full length
rnd_clf = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=42)
rnd_clf.fit(X_train, y_train)
for name, score in zip(X_train, rnd_clf.feature_importances_):
    print(name, score)

inspection_data['risk factor'] = inspection_data['risk factor'].astype('int64')
inspection_data['year'] = pd.DatetimeIndex(inspection_data['ACTIVITY DATE']).year
inspection_data['month'] = pd.DatetimeIndex(inspection_data['ACTIVITY DATE']).month
X = df.iloc[:, [1, 2, 3, 4]]
y = df.iloc[:, 5]
X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=1)

kVals = range(1, 21, 1)
accuracies = []
for k in kVals:
    basemodel = KNeighborsRegressor(n_neighbors=k)
    model = BaggingRegressor(base_estimator=basemodel, n_estimators=30,
                             bootstrap_features=True, max_features=2, random_state=1)
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    score = model.score(X_test, y_test)
    print("k=%d, R^2 score=%.2f%%" % (k, score * 100))
    print("k=%d, mae = %.2f" % (k, mean_absolute_error(y_test, y_predicted)))
    print("k=%d, mse =%.2f%%" % (k, mean_squared_error(y_test, y_predicted) * 100))

    pred = np.array(y_predicted)
    org = np.array(y_test)
    dif = abs(pred - org)
    dif = 1 - dif / org
    div = pred / org
    ix = np.where(div > 1)
    div[ix] = 1 / div[ix]
    div = 1 - div
    print("k=%d, mape =%.4f%%" % (k, dif.mean() * 100))
# Sample 3.8: Diabetes Ensemble Regression
from sklearn import datasets
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

bagreg = BaggingRegressor()
bagreg.fit(X_train, y_train)
bagreg_predict = bagreg.predict(X_test)
print("Bagging Mean squared error: %.2f" % mean_squared_error(y_test, bagreg_predict))

rfreg = RandomForestRegressor()
rfreg.fit(X_train, y_train)
rfreg_predict = rfreg.predict(X_test)
print("Random Forest Mean squared error: %.2f" % mean_squared_error(y_test, rfreg_predict))
                                     min_samples_split=2, n_jobs=-1,
                                     max_features='log2', n_estimators=1900)
extra_trees_reg.fit(X_train, y_train.ravel())
print("Feature Importances: {}".format(extra_trees_reg.feature_importances_))

# In[38]:

bag_reg = BaggingRegressor(DecisionTreeRegressor(random_state=42), n_estimators=1000,
                           max_samples=300, bootstrap=True, n_jobs=-1, random_state=42)
bag_reg.fit(X_train, y_train.ravel())

# Comparing regressors with tuned parameters

# In[39]:

SET_FIT_INTERCEPT = True
names = [
    "LinearRegression",
    "Ridge",
    "ExtraTreesRegressor",
    "RandomForestRegressor",
    "BaggingRegressor"
]
regressors = [
    LinearRegression(fit_intercept=SET_FIT_INTERCEPT),
    Ridge(alpha=5,
train_np = train_d.as_matrix()

# y is the Survived outcome
y = train_np[:, 0]
# X is the feature matrix
X = train_np[:, 1:]

# wrap the logistic regression in a BaggingRegressor
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_clf = BaggingRegressor(clf, n_estimators=20, max_samples=0.8, max_features=1.0,
                               bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y)

testf = test_f.filter(
    regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title'
)
predictions = bagging_clf.predict(testf)
result = pd.DataFrame({
    'PassengerId': test['PassengerId'].as_matrix(),
    'Survived': predictions.astype(np.int32)
})
result
for train, test in kf:
    TR.append(train)
    TS.append(test)

A = []
B = []
for k in range(kfcv):
    print(k)
    X_train = X[TR[k], :]
    y_train = y[TR[k]]
    X_test = X[TS[k], :]
    y_test = y[TS[k]]
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    # plt.subplot(2, 10, k + 1)
    # plt.scatter(y_predict, y_test)
    # plt.xlabel('y_predict')
    # plt.ylabel('y_true')
    # plt.title('Fold = %d' % (k + 1))
    A.extend(list(y_predict))
    B.extend(list(y_test))
    # mse = mean_squared_error(y_predict, y_test)
    # print('mse = %f' % mse)

mse = mean_squared_error(A, B)
# In[17]:

from sklearn.metrics import r2_score

# r2_score expects (y_true, y_pred); the original calls reversed the arguments.
XGBBoost_RF_score = r2_score(test_y, xbg_y_pred)
XGBBoost_RF_score

# In[18]:

# BaggingRegressor
from sklearn.ensemble import BaggingRegressor

BR = BaggingRegressor(base_estimator=RF, n_estimators=50)
BR.fit(train_x, train_y)
BR_y_pred = BR.predict(test_x)
BR_score = r2_score(test_y, BR_y_pred)
BR_score

# In[20]:

# Neural Networks MLPRegressor
from sklearn.neural_network import MLPRegressor

mlpR = MLPRegressor(hidden_layer_sizes=(100,))
mlpR.fit(train_x, train_y)
mlpr_y_pred = mlpR.predict(test_x)
class Regressor(skl.base.BaseEstimator, skl.base.TransformerMixin):
    """docstring"""

    def __init__(self, base_estimator='AdaBoostedLinearRegression', n_estimators=50,
                 learning_rate=1.0, loss='linear', random_state=None, save_path=None):
        super(Regressor, self).__init__()
        self.base_estimator = str(base_estimator)
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.loss = loss
        self.random_state = random_state
        self.save_path = save_path
        self.regressor = None

    def fit(self, X, y, sample_weight=None):
        X, y = check_X_y(X, y)
        if self.base_estimator == 'BayesianRidge':
            self.regressor = BayesianRidge()
        elif self.base_estimator == 'LASSO':
            self.regressor = Lasso()
        elif self.base_estimator == 'ElasticNet':
            self.regressor = ElasticNet()
        elif self.base_estimator == 'MLPRegressor':
            self.regressor = MLPRegressor()
        elif self.base_estimator == 'KernelRidge':
            self.regressor = KernelRidge(kernel='polynomial')
        elif self.base_estimator == 'LinearRegression':
            self.regressor = LinearRegression()
        elif self.base_estimator == 'BaggingRegressorLinear':
            base_estimator = LinearRegression()
            self.regressor = BaggingRegressor(base_estimator)
        elif self.base_estimator == 'BaggingRegressorKernelRidge':
            base_estimator = KernelRidge(kernel='polynomial')
            self.regressor = BaggingRegressor(base_estimator)
        elif self.base_estimator == 'BaggingRegressorLasso':
            base_estimator = Lasso()
            self.regressor = BaggingRegressor(base_estimator)
        else:
            raise Exception('Unsupported base_estimator: ' + self.base_estimator)
        self.regressor.fit(X, y)
        return self

    def predict(self, X):
        check_is_fitted(self, ["regressor"])
        X = check_array(X)
        return self.regressor.predict(X)

    def score(self, X, y, sample_weight=None):
        scores = -(self.predict(X) - y) ** 2 / len(y)
        score = np.sum(scores)
        print(score)
        sys.stdout.flush()
        return score

    def set_save_path(self, save_path):
        self.save_path = save_path
class ShapeletForestRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2,
                 n_shapelets=10, min_shapelet_size=0, max_shapelet_size=1,
                 metric='euclidean', metric_params=None, bootstrap=True,
                 n_jobs=None, random_state=None):
        """A shapelet forest regressor"""
        self.n_estimators = n_estimators
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_shapelets = n_shapelets
        self.min_shapelet_size = min_shapelet_size
        self.max_shapelet_size = max_shapelet_size
        self.metric = metric
        self.metric_params = metric_params
        self.random_state = random_state

    def predict(self, X, check_input=True):
        if X.ndim < 2 or X.ndim > 3:
            raise ValueError("illegal input dimensions X.ndim ({})".format(X.ndim))
        if self.n_dims_ > 1 and X.ndim != 3:
            raise ValueError("illegal input dimensions X.ndim != 3")
        if X.shape[-1] != self.n_timestep_:
            raise ValueError("illegal input shape ({} != {})".format(
                X.shape[-1], self.n_timestep_))
        if X.ndim > 2 and X.shape[1] != self.n_dims_:
            raise ValueError("illegal input shape ({} != {})".format(
                X.shape[1], self.n_dims_))
        if check_input:
            X = check_array(X, dtype=np.float64, allow_nd=True, order="C")
        if X.dtype != np.float64 or not X.flags.contiguous:
            X = np.ascontiguousarray(X, dtype=np.float64)
        X = X.reshape(X.shape[0], self.n_dims_ * self.n_timestep_)
        return self.bagging_regressor_.predict(X)

    def fit(self, X, y, sample_weight=None, check_input=True):
        """Fit a random shapelet forest regressor"""
        random_state = check_random_state(self.random_state)
        if check_input:
            X = check_array(X, dtype=np.float64, allow_nd=True, order="C")
            y = check_array(y, dtype=np.float64, ensure_2d=False, order="C")
        if X.ndim < 2 or X.ndim > 3:
            raise ValueError("illegal input dimension")
        n_samples = X.shape[0]
        self.n_timestep_ = X.shape[-1]
        if X.ndim > 2:
            n_dims = X.shape[1]
        else:
            n_dims = 1
        self.n_dims_ = n_dims
        if len(y) != n_samples:
            raise ValueError("Number of labels={} does not match "
                             "number of samples={}".format(len(y), n_samples))
        if X.dtype != np.float64 or not X.flags.contiguous:
            X = np.ascontiguousarray(X, dtype=np.float64)
        if y.dtype != np.float64 or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=np.float64)

        shapelet_tree_regressor = ShapeletTreeRegressor(
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            n_shapelets=self.n_shapelets,
            min_shapelet_size=self.min_shapelet_size,
            max_shapelet_size=self.max_shapelet_size,
            metric=self.metric,
            metric_params=self.metric_params,
            random_state=random_state,
        )
        if n_dims > 1:
            shapelet_tree_regressor.force_dim = n_dims

        self.bagging_regressor_ = BaggingRegressor(
            base_estimator=shapelet_tree_regressor,
            bootstrap=self.bootstrap,
            n_jobs=self.n_jobs,
            n_estimators=self.n_estimators,
            random_state=self.random_state,
        )
        X = X.reshape(n_samples, n_dims * self.n_timestep_)
        self.bagging_regressor_.fit(X, y, sample_weight=sample_weight)
        return self
# Linear regression model
lr = LinearRegression()
lr.fit(train_X, train_y)
print('lr_error:', error(cv_y, lr.predict(cv_X)))

# Ridge regression
ridge = RidgeCV(alphas=[151], cv=10)
ridge.fit(train_X, train_y)
print('ridge_error:', error(cv_y, ridge.predict(cv_X)))

lasso = LassoCV(alphas=[0.003], max_iter=10000, cv=10)
lasso.fit(train_X, train_y)
print('lasso_error:', error(cv_y, np.exp(lasso.predict(cv_X))))

br_lr = BaggingRegressor(base_estimator=lr, n_estimators=3)
br_lr.fit(train_X, train_y)
print('br_lr_error:', error(cv_y, br_lr.predict(cv_X)))

br_ridge = BaggingRegressor(base_estimator=ridge, n_estimators=3)
br_ridge.fit(train_X, train_y)
print('br_ridge_error:', error(cv_y, br_ridge.predict(cv_X)))

br_lasso = BaggingRegressor(base_estimator=lasso, n_estimators=7)
br_lasso.fit(train_X, train_y)
print('br_lasso_error:', error(cv_y, br_lasso.predict(cv_X)))

# Preprocess the test set
data_test = pd.read_csv("C:/Tool/Pycharm/TianChi/d_test_A_20180102.csv", encoding='gb2312')
test_columns = [
    '年龄', '*天门冬氨酸氨基转换酶', '*丙氨酸氨基转换酶', '*碱性磷酸酶', '*r-谷氨酰基转换酶', '*总蛋白', '白蛋白',
class BaggingClass:
    """
    Name : BaggingRegressor
    Attribute : None
    Method : predict, predict_by_cv, save_model
    """

    def __init__(self):
        # Algorithm name
        self._name = 'bagging'
        # Base path
        self._f_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))
        # Suppress warning messages
        warnings.filterwarnings('ignore')
        # Load the raw data
        data = pd.read_csv(self._f_path + "/regression/resource/regression_sample.csv",
                           sep=",", encoding="utf-8")
        # Masks splitting training and test data
        self._x = (data["year"] <= 2017)
        self._y = (data["year"] >= 2018)
        # Training data
        self._x_train, self._y_train = self.preprocessing(data[self._x])
        # Test data
        self._x_test, self._y_test = self.preprocessing(data[self._y])
        # Declare the model
        self._model = BaggingRegressor()
        # Train the model
        self._model.fit(self._x_train, self._y_train)
        # Grid-search model
        self._g_model = None

    # Data preprocessing
    def preprocessing(self, data):
        # Features
        x = []
        # Labels
        y = []
        # Window size (7 days)
        base_interval = 7
        # Temperatures
        temps = list(data["temperature"])
        for i in range(len(temps)):
            if i < base_interval:
                continue
            y.append(temps[i])
            xa = []
            for p in range(base_interval):
                d = i + p - base_interval
                xa.append(temps[d])
            x.append(xa)
        return x, y

    # Plain prediction
    def predict(self, save_img=False, show_chart=False):
        # Predict
        y_pred = self._model.predict(self._x_test)
        # Score
        score = r2_score(self._y_test, y_pred)
        # Report (coef_/intercept_ only exist for linear base models)
        if hasattr(self._model, 'coef_') and hasattr(self._model, 'intercept_'):
            print(f'Coef = {self._model.coef_}')
            print(f'intercept = {self._model.intercept_}')
        print(f'Score = {score}')
        # Optionally save the chart image
        if save_img:
            self.save_chart_image(y_pred, show_chart)
        # Predictions & score
        return [list(y_pred), score]

    # Cross-validation prediction
    def predict_by_cv(self):
        # For regression, implement cross-validation to suit the actual project
        return False

    # GridSearchCV prediction
    def predict_by_gs(self):
        # Grid-search parameters
        param_grid = {
            # number of estimators
            'n_estimators': [5, 10, 15],
            # whether samples are drawn with replacement
            'bootstrap': [True, False],
            # whether features are drawn with replacement
            'bootstrap_features': [True, False],
            # fraction of samples drawn per estimator
            'max_samples': [0.6, 0.8, 1.0]
        }
        # Initialize the grid search
        self._g_model = GridSearchCV(BaggingRegressor(), param_grid=param_grid)
        # Fit the grid search
        self._g_model.fit(self._x_train, self._y_train)
        # Print all parameters
        print(self._g_model.param_grid)
        # Best score
        print(self._g_model.best_score_)
        # Best parameters
        print(self._g_model.best_params_)
        # Full results
        print(self._g_model.cv_results_)
        return dict(gs_all_params=self._g_model.param_grid,
                    gs_best_score=self._g_model.best_score_,
                    gs_best_param=self._g_model.best_params_)

    # Save or refresh the model
    def save_model(self, renew=False):
        if not renew:
            # First save
            joblib.dump(self._model, self._f_path + f'/model/{self._name}_rg.pkl')
        else:
            # Replace the existing model
            if os.path.isfile(self._f_path + f'/model/{self._name}_rg.pkl'):
                os.rename(self._f_path + f'/model/{self._name}_rg.pkl',
                          self._f_path + f'/model/{str(self._name) + str(time.time())}_rg.pkl')
            joblib.dump(self._model, self._f_path + f'/model/{self._name}_rg.pkl')

    # Save the regression chart
    def save_chart_image(self, data, show_chart):
        # Figure size
        plt.figure(figsize=(15, 10), dpi=100)
        # Ground truth
        plt.plot(self._y_test, c='r')
        # Predictions
        plt.plot(data, c='b')
        # Save as an image
        plt.savefig('./chart_images/tenki-kion-lr.png')
        # Show the chart (optional)
        if show_chart:
            plt.show()

    def __del__(self):
        del self._x_train, self._x_test, self._y_train, self._y_test, self._x, self._y, self._model
plt.legend(loc='upper right')
plt.grid(b=True)

plt.subplot(132)
t = np.arange(N)
plt.plot(t, x, 'r-', lw=1, label=u'raw data')
plt.plot(abnormal, x[abnormal], 'go', markeredgecolor='g', ms=3, label=u'outliers')
plt.legend(loc='upper right')
plt.title(u'Outlier detection', fontsize=18)
plt.grid(b=True)

# Prediction
plt.subplot(133)
select = np.ones(N, dtype=bool)  # np.bool is deprecated; use the builtin bool
select[abnormal] = False
t = np.arange(N)
dtr = DecisionTreeRegressor(criterion='mse', max_depth=10)
br = BaggingRegressor(dtr, n_estimators=10, max_samples=0.3)
br.fit(t[select].reshape(-1, 1), x[select])
y = br.predict(np.arange(N).reshape(-1, 1))
y[select] = x[select]
plt.plot(x, 'g--', lw=1, label=u'original values')
plt.plot(y, 'r-', lw=1, label=u'corrected values')
plt.legend(loc='upper right')
plt.title(u'Outlier correction', fontsize=18)
plt.grid(b=True)

plt.tight_layout(1.5, rect=(0, 0, 1, 0.95))
plt.suptitle(u'Outlier detection and correction for pollution-discharge data', fontsize=22)
plt.show()
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt


def f(x):
    return 0.5 * np.exp(-(x + 3) ** 2) + np.exp(-x ** 2) + 0.5 * np.exp(-(x - 3) ** 2)


N = 200  # 200 samples
x_train = np.linspace(-5.5, 5.5, N)
X_train = pd.DataFrame({"x": x_train})
y_train = f(x_train) + (np.random.rand(N) - 0.5) * (2 * 0.05)

dtr = DecisionTreeRegressor(max_depth=5)
br = BaggingRegressor(dtr, n_estimators=200, max_samples=0.2)
br.fit(X_train, y_train)

x_test = np.linspace(x_train.min() * 1.1, x_train.max() * 1.1, 1000)
X_test = pd.DataFrame({"x": x_test})
y_test = f(x_test)
y_predict = br.predict(X_test)

plt.scatter(x_train, y_train)
plt.scatter(x_test, y_test)
plt.scatter(x_test, y_predict)
plt.show()
Alpha, Test_score, y_pred = model_Ridge(dummy_train_df, train_y, dummy_test_df)

# Bagging with ridge regression as the base learner; the number of learners is
# chosen by cross-validation.
ridge = Ridge(alpha=Alpha)
# params = [1, 10, 15, 20, 25, 30, 40]
params = np.arange(1, 50)
test_scores = []
min_score = 1
for param in params:
    clf = BaggingRegressor(base_estimator=ridge, n_estimators=param)
    test_score = np.sqrt(-cross_val_score(clf, dummy_train_df, train_y, cv=10,
                                          scoring='neg_mean_squared_error'))
    temp = np.mean(test_score)
    if temp < min_score:
        min_score = temp
        optimal_params = param
    test_scores.append(np.mean(test_score))
print(optimal_params)
# plt.plot(params, test_scores)
# plt.title("n_estimators vs CV_error")
# plt.show()

# Predict with the tuned parameters
br = BaggingRegressor(base_estimator=ridge, n_estimators=optimal_params)
br.fit(dummy_train_df, train_y)
y_final = np.expm1(br.predict(dummy_test_df))
# print(y_final)
summsion_csv(y_final, test)
# GradientBoostingRegressor:
gradient_boosting_regressor = GradientBoostingRegressor()
gradient_boosting_regressor.fit(x_train, y_train)
gb_score = gradient_boosting_regressor.score(x_test, y_test)
print('GradientBoostingRegressor score: ' + str(gb_score))
# -> 0.8978408816999488

# ExtraTreesRegressor:
extra_trees_regressor = ExtraTreesRegressor()
extra_trees_regressor.fit(x_train, y_train)
et_score = extra_trees_regressor.score(x_test, y_test)
print('ExtraTreesRegressor score: ' + str(et_score))
# -> 0.9071302394368891

# BaggingRegressor:
bagging_regressor = BaggingRegressor()
bagging_regressor.fit(x_train, y_train)
b_score = bagging_regressor.score(x_test, y_test)
print('BaggingRegressor score: ' + str(b_score))
# -> 0.9154010467830169

# RandomForestRegressor:
random_forest_regressor = RandomForestRegressor()
random_forest_regressor.fit(x_train, y_train)
rf_score = random_forest_regressor.score(x_test, y_test)
print('RandomForestRegressor score: ' + str(rf_score))
# -> 0.920122663462127

# RandomForestRegressor(max_depth=5, random_state=0, n_estimators=100):
random_forest_regressor = RandomForestRegressor(max_depth=5, random_state=0, n_estimators=100)
random_forest_regressor.fit(x_train, y_train)
rf_score_1 = random_forest_regressor.score(x_test, y_test)
print('RandomForestRegressor(max_depth=5, random_state=0, n_estimators=100) score: ' + str(rf_score_1))
# -> 0.8134829521876554
}
model_gbr_allfeatures = grid_search.GridSearchCV(estimator=gbr, param_grid=parameters,
                                                 n_jobs=-1, cv=2, verbose=20,
                                                 scoring='mean_squared_error')
model_gbr_allfeatures.fit(X_train, Y_train)
print(model_gbr_allfeatures.best_params_)
# 'max_depth': 6, 'n_estimators': 500, 'learning_rate': 0.1, 'max_features': 'auto'
predictions_gbr_allfeatures = model_gbr_allfeatures.predict(X_test)
mean_squared_error(Y_test, predictions_gbr_allfeatures)
# 7.071566

# ensembling the random forest model using bagging
bag = BaggingRegressor(rfr, n_estimators=500, max_samples=0.1, random_state=25)
bag.fit(X_train, Y_train)
predictions_rfr_bagging = bag.predict(X_test)
mean_squared_error(Y_test, predictions_rfr_bagging)

# recursive feature selection for the random forest
from sklearn.feature_selection import RFECV

rfecv = RFECV(estimator=rfr, step=1, cv=3, scoring='mean_squared_error')
rfecv.fit(X_train, Y_train)
print("Optimal number of features : %d" % rfecv.n_features_)
# Everyone has seen quiz shows where a contestant polls the studio audience and
# takes the most-voted answer as their own: each person gives a judgment, and we
# trust that the answer lies with the majority.
# More colloquially: you are close to the math whiz in your class and "model"
# every assignment on his, so whenever he is right, you are right too. But the
# day he slips up and miswrites a number, you can only follow him into the error.
# Now imagine you are close to five math whizzes, collect all their assignments,
# compare them, and then "do your own". If one of them has an off day and gets
# something wrong while the other four get it right, you will surely trust the
# answer of the other four.
# That, roughly, is model fusion at its simplest. For a classification problem,
# given a set of classifiers trained on the same data set (say logistic
# regression, SVM, KNN, random forest, a neural network), we let each one make
# its judgment, take a vote, and keep the answer with the most votes.
# Model fusion helps mitigate the overfitting produced during training, and so
# can improve the accuracy of the final result.
# Back to our problem: so far we have only covered logistic regression. If we
# want to use this fusion idea to improve our result, what do we do? Since there
# is no choice of model, we work on the data instead. If a model overfits, it
# must be overfitting our training set. So rather than use the full training
# set, we train on a different subset each time; even with the same learning
# algorithm we obtain different models. And since no subset is complete, any
# overfitting happens on a sub-training set rather than on the full data, so
# fusing the models can help the final result. This is the common technique
# called Bagging.
# We use scikit-learn's Bagging to implement the idea above; the process is very
# simple. The code follows:
from sklearn.ensemble import BaggingRegressor

train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
train_np = train_df.as_matrix()

y = train_np[:, 0]   # y is the Survived outcome
X = train_np[:, 1:]  # X is the feature matrix

clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
# wrap it in a BaggingRegressor
bagging_clf = BaggingRegressor(clf, n_estimators=20, max_samples=0.8, max_features=1.0,
                               bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y)

test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
predictions = bagging_clf.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].as_matrix(),
                       'Survived': predictions.astype(np.int32)})
result.to_csv("./tmp_dataset/Kaggle-Titanic/result.csv", index=False)
# 0.75598; actually lower -- perhaps bad luck in BaggingRegressor's random subsampling
# The previous result matched the blog author's exactly; the second run differs
["xgboost_Label", ""], ["xgboost_Vect", ""], ] full_predictions = [] for alg, predictors in algorithms: if alg == "xgboost_Label": full_predictions.append(xgboost_Label(train, test, labels)) elif alg == "xgboost_Vect": full_predictions.append(xgboost_Vect(train, test, labels)) elif alg == "xgboost_Dummies": full_predictions.append(xgboost_Dummies(train, test, labels)) else: if predictors == "dummies": print ("Train ", alg.__class__.__name__, " dummies Model ") alg = BaggingRegressor(alg) alg.fit(train_du, labels) print "Prediction :", alg.__class__.__name__, " dummies Model " prediction = alg.predict(test_du) full_predictions.append(prediction) else: print ("Train ", alg.__class__.__name__, " Label Model ") alg = BaggingRegressor(alg) alg.fit(train_rf, labels) print "Prediction :", alg.__class__.__name__, " Label Model " prediction = alg.predict(test_rf) full_predictions.append(prediction) # Ensemble models RF_label_pred = full_predictions[0] RF_dummies_pred = full_predictions[1] pred_xgb_dummies = full_predictions[2]
accu = pd.DataFrame(index=['MSLE', 'Root MSLE', 'R2 Score', 'Accuracy(%)'])

""" Bagging Regressor method """

""" model implementation """
baggReg = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=20),
                           n_estimators=50, random_state=1, max_samples=1.0,
                           max_features=1.0, bootstrap=False,
                           bootstrap_features=False, oob_score=False,
                           warm_start=False, n_jobs=-1, verbose=0)
baggReg.fit(X_train, y_train)
y_pred = baggReg.predict(X_test)

""" model evaluation """
r6_br = result(y_test, y_pred)
print("MSLE : {}".format(r6_br[0]))
print("Root MSLE : {}".format(r6_br[1]))
print("R2 Score : {} or {}%".format(r6_br[2], r6_br[3]))

""" Visualization of true and predicted values """
df_check = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_check = df_check.sample(50)
df_check.plot(kind='bar', figsize=(10, 5))
plt.grid(which='major', linestyle='-', linewidth='0.1', color='Green')
plt.title('Performance of Bagging Regressor')
# midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
# diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
# return midpoint, diff
#
# plot_learning_curve(model, u"learning curve", X, Y)

# 6 ................... model fusion ..................... #
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.as_matrix()

# y is the Survived outcome
y = train_np[:, 0]
x = train_np[:, 1:]

# wrap the logistic regression in a BaggingRegressor
model = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_model = BaggingRegressor(model, n_estimators=20, max_samples=0.8, max_features=1.0,
                                 bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_model.fit(x, y)

test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = bagging_model.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].as_matrix(),
                       'Survived': predictions.astype(np.int32)})
result.to_csv('./result.csv', index=False)
# ## Bagged decision trees in scikit-learn (with B=500)

# define the training and testing sets
X_train = train.iloc[:, 1:]
y_train = train.iloc[:, 0]
X_test = test.iloc[:, 1:]
y_test = test.iloc[:, 0]

# instruct BaggingRegressor to use DecisionTreeRegressor as the "base estimator"
from sklearn.ensemble import BaggingRegressor
bagreg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=500,
                          bootstrap=True, oob_score=True, random_state=1)

# fit and predict
bagreg.fit(X_train, y_train)
y_pred = bagreg.predict(X_test)
y_pred

# calculate RMSE
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# ## Estimating out-of-sample error
#
# For bagged models, out-of-sample error can be estimated without using **train/test split** or **cross-validation**!
#
# On average, each bagged tree uses about **two-thirds** of the observations. For each tree, the **remaining observations** are called "out-of-bag" observations.

# show the first bootstrap sample
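# Because bagreg above was fit with oob_score=True, the out-of-bag estimate is
# available without a separate validation set; a minimal sketch of reading it
# (oob_score_ is the R^2 of OOB predictions on the training data):
print("OOB R^2 estimate:", bagreg.oob_score_)
# oob_prediction_ holds each training sample's OOB prediction, so an OOB RMSE is:
print("OOB RMSE:", np.sqrt(metrics.mean_squared_error(y_train, bagreg.oob_prediction_)))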
def fit(self, X, y, sample_weight=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
        XXX sparse matrix?

    y : array-like, shape (n_samples,)
        Target vector relative to X. Has to follow the convention 0 for
        normal data, 1 for frauds.
        XXX maybe make such y ourselves from input?

    sample_weight : array-like, shape (n_samples,), optional
        Array of weights that are assigned to individual samples, typically
        the amount in case of transactions data. Used to grow regression
        trees producing further rules to be tested. If not provided, then
        each sample is given unit weight.

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_X_y(X, y)
    check_classification_targets(y)
    self.n_features_ = X.shape[1]

    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)
    if n_classes < 2:
        raise ValueError("This method needs samples of at least 2 classes"
                         " in the data, but the data contains only one"
                         " class: %r" % self.classes_[0])
    if not set(self.classes_) == set([0, 1]):
        warn("Found labels %s. This method assumes fraud to be labeled as"
             " 1 and normal data to be labeled as 0. Any label"
             " different from 0 will be considered as fraud." % set(self.classes_))
        y = (y > 0)

    # ensure that max_samples is in [1, n_samples]:
    n_samples = X.shape[0]
    if isinstance(self.max_samples, six.string_types):
        raise ValueError('max_samples (%s) is not supported.'
                         'Valid choices are: "auto", int or'
                         'float' % self.max_samples)
    elif isinstance(self.max_samples, INTEGER_TYPES):
        if self.max_samples > n_samples:
            warn("max_samples (%s) is greater than the "
                 "total number of samples (%s). max_samples "
                 "will be set to n_samples for estimation."
                 % (self.max_samples, n_samples))
            max_samples = n_samples
        else:
            max_samples = self.max_samples
    else:  # float
        if not (0. < self.max_samples <= 1.):
            raise ValueError("max_samples must be in (0, 1], got %r"
                             % self.max_samples)
        max_samples = int(self.max_samples * X.shape[0])
    self.max_samples_ = max_samples

    self.rules_ = {}
    self.estimators_ = []
    self.estimators_samples_ = []
    self.estimators_features_ = []

    # default column names of the form ['c0', 'c1', ...]:
    feature_names_ = (self.feature_names if self.feature_names is not None
                      else ['c' + x for x in np.arange(X.shape[1]).astype(str)])
    self.feature_names_ = feature_names_

    bagging_clf = BaggingClassifier(
        base_estimator=DecisionTreeClassifier(
            max_depth=self.max_depth,
            max_features=self.max_features,
            min_samples_split=self.min_samples_split),
        n_estimators=self.n_estimators,
        max_samples=self.max_samples_,
        max_features=self.max_samples_features,
        bootstrap=self.bootstrap,
        bootstrap_features=self.bootstrap_features,
        # oob_score=... XXX may be added if selection on tree perf needed.
        # warm_start=... XXX may be added to increase computation perf.
        n_jobs=self.n_jobs,
        random_state=self.random_state,
        verbose=self.verbose)

    bagging_reg = BaggingRegressor(
        base_estimator=DecisionTreeRegressor(
            max_depth=self.max_depth,
            max_features=self.max_features,
            min_samples_split=self.min_samples_split),
        n_estimators=self.n_estimators,
        max_samples=self.max_samples_,
        max_features=self.max_samples_features,
        bootstrap=self.bootstrap,
        bootstrap_features=self.bootstrap_features,
        # oob_score=... XXX may be added if selection on tree perf needed.
        # warm_start=... XXX may be added to increase computation perf.
        n_jobs=self.n_jobs,
        random_state=self.random_state,
        verbose=self.verbose)

    bagging_clf.fit(X, y)

    # define regression target (the duplicated sample_weight guard is collapsed
    # into a single check):
    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)
        weights = sample_weight - sample_weight.min()
        contamination = float(sum(y)) / len(y)
        y_reg = (pow(weights, 0.5) * 0.5 / contamination * (y > 0)
                 - pow((weights).mean(), 0.5) * (y == 0))
        y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
    else:
        y_reg = y  # same as any other classification bagging

    bagging_reg.fit(X, y_reg)

    self.estimators_ += bagging_clf.estimators_
    self.estimators_ += bagging_reg.estimators_

    self.estimators_samples_ += bagging_clf.estimators_samples_
    self.estimators_samples_ += bagging_reg.estimators_samples_

    self.estimators_features_ += bagging_clf.estimators_features_
    self.estimators_features_ += bagging_reg.estimators_features_

    rules_ = []
    for estimator, samples, features in zip(self.estimators_,
                                            self.estimators_samples_,
                                            self.estimators_features_):
        # Create mask for OOB samples
        mask = ~samples
        if sum(mask) == 0:
            warn("OOB evaluation not possible: doing it in-bag."
                 " Performance evaluation is likely to be wrong"
                 " (overfitting) and selected rules are likely to"
                 " not perform well! Please use max_samples < 1.")
            mask = samples
        rules_from_tree = self._tree_to_rules(
            estimator, np.array(self.feature_names_)[features])

        # XXX todo: idem without dataframe
        X_oob = pandas.DataFrame(
            (X[mask, :])[:, features],
            columns=np.array(self.feature_names_)[features])
        y_oob = y[mask]
        y_oob = np.array((y_oob != 0))

        # Add OOB performances to rules:
        rules_from_tree = [(r, self._eval_rule_perf(r, X_oob, y_oob))
                           for r in set(rules_from_tree)]
        rules_ += rules_from_tree

    # keep only rules verifying precision_min and recall_min:
    for rule, score in rules_:
        if score[0] > self.precision_min and score[1] > self.recall_min:
            if rule in self.rules_:
                # update the score to the new mean
                c = self.rules_[rule][2] + 1
                b = self.rules_[rule][1] + 1. / c * (score[1] - self.rules_[rule][1])
                a = self.rules_[rule][0] + 1. / c * (score[0] - self.rules_[rule][0])
                self.rules_[rule] = (a, b, c)
            else:
                self.rules_[rule] = (score[0], score[1], 1)

    self.rules_ = sorted(self.rules_.items(),
                         key=lambda x: (x[1][0], x[1][1]), reverse=True)
    return self
start = time.time()

model = svm.SVR(kernel='poly', degree=2, C=0.7)
# boostedModel = AdaBoostRegressor(base_estimator=model, n_estimators=10, loss='square')
baggedModel = BaggingRegressor(base_estimator=model, n_estimators=50,
                               max_features=0.6, max_samples=0.5, bootstrap=False)

X_train, X_test, y_train, y_test = train_test_split(X_ls, y_ls, test_size=0.30)

with measure_time('Training'):
    print('Training...')
    baggedModel.fit(X_train, y_train)

y_pred = baggedModel.predict(X_test)
# y_test (not the undefined y_true) is the ground truth here
print('MSE on 0.3 of LS: ', mean_squared_error(y_test, y_pred, multioutput='uniform_average'))

with measure_time('Training'):
    print('Training...')
    baggedModel.fit(X_ls, y_ls)

# ------------------------------ Prediction ------------------------------ #
# Load test data
test_user_movie_pairs = load_from_csv(os.path.join(prefix, 'data_test.csv'))

# Build the prediction matrix
alg_test_id = alg_test['id']
Y_train = alg_train['relevance'].values
X_train = alg_train.drop(['id', 'relevance'], axis=1).values
X_test = alg_test.drop(['id', 'relevance'], axis=1).values

forest = RandomForestRegressor(n_estimators=550, criterion="mse", max_features=10,
                               max_depth=15, n_jobs=-1, verbose=0)
bg = BaggingRegressor(forest, n_estimators=150, max_samples=0.1, random_state=29)
bg.fit(X_train, Y_train)
Y_output = bg.predict(X_test)

# score = forest.score(X_train, Y_train)
# print(score)

filename = 'submission_' + date + '.csv'
pd.DataFrame({
    "id": alg_test_id,
    "relevance": Y_output
}).to_csv(filename, index=False)
passenger_id = full.loc[sourceRow:, 'PassengerId']
# data frame: passenger id and predicted survival
predDf = pd.DataFrame(
    {'PassengerId': passenger_id,
     'Survived': pred_Y})
predDf.shape
predDf.head()
# save the result
predDf.to_csv('titanic_pred_new.csv', index=False)

'''
# wrap the logistic regression in a BaggingRegressor
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_clf = BaggingRegressor(clf, n_estimators=20, max_samples=0.8, max_features=1.0,
                               bootstrap=True, bootstrap_features=False)
bagging_clf.fit(train_X, train_y)
print(bagging_clf.score(test_X, test_y))

predictions = bagging_clf.predict(pred_X)
result = pd.DataFrame({'PassengerId': test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv("logistic_regression_bagging_predictions1.csv", index=False)
'''

xgb_clf = XGBClassifier(learning_rate=0.1, max_depth=2, silent=True, objective='binary:logistic')
# xgb_clf = XGBClassifier()
xgb_clf.fit(train_X.values, train_y.values)
print(xgb_clf.score(test_X.values, test_y.values))

predictions = xgb_clf.predict(pred_X.values)
result = pd.DataFrame({'PassengerId': test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
del globals()['profiles']
del globals()['profilesLSo']
del globals()['profilesLS']
del globals()['row']
del globals()['tmpLS']
del globals()['tmpAGE']
del globals()['profsTOlikes']
del globals()['i']
del globals()['tmpIND']

seed = 7
myRand = np.random.seed(seed)

X_train, X_test, y_train, y_test = train_test_split(likesMAT, extsARR, test_size=1500)

nJOBS = int(sys.argv[1])
nEST = int(sys.argv[2])
bagIN = BaggingRegressor(n_jobs=nJOBS, n_estimators=nEST)
# bagIN.fit(likesMAT, extsARR)
bagIN.fit(X_train, y_train)
y_pred = bagIN.predict(X_test)

import math
myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("exts, bagIN: ", str(nEST), " ", myRMSE)

# joblib.dump(bagIN, "/Users/jamster/bagIN-A-exts.xz", compress=9)
# impbagIN = joblib.load("/Users/jamster/bagIN-A-exts.xz")
def gradient_descent_algo(self):
    X = []
    Y = []
    with open('../Data/full_table.csv', 'r') as file:
        for line in csv.reader(file, delimiter=','):
            if len(line) == 13:
                try:
                    zhvi = float(line[5])
                    property_type = line[6]
                    room_type = line[7]
                    accommodates = int(line[8])
                    bathrooms = float(line[9])
                    beds = int(line[10])
                    bed_type = line[11]
                    price = float(line[12])
                    x = {
                        'zhvi': zhvi,
                        'property_type': property_type,
                        'room_type': room_type,
                        'accommodates': accommodates,
                        'bathrooms': bathrooms,
                        'beds': beds,
                        'bed_type': bed_type
                    }
                    y = price
                    X.append(x)
                    Y.append(y)
                except:
                    pass

    # The DictVectorizer converts data from a dictionary to an array
    vec = DictVectorizer()
    # Convert X to Array
    X = vec.fit_transform(X).toarray()
    # Normalize Data
    X = preprocessing.normalize(X)
    # Split X and Y into training and testing sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

    # Gradient Descent
    model = linear_model.SGDRegressor()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    mse = mean_squared_error(Y_test, Y_pred)
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    print('Gradient Descent')
    print('Mean Squared Error: {0}'.format(mse))
    print('Mean Absolute Error: {0}'.format(mae))
    print('R2 Score: {0}'.format(r2))

    # With Boosting
    model_boost = AdaBoostRegressor(linear_model.SGDRegressor())
    model_boost.fit(X_train, Y_train)
    Y_pred = model_boost.predict(X_test)
    mse = mean_squared_error(Y_test, Y_pred)
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    print('Gradient Descent (with AdaBoost)')
    print('Mean Squared Error: {0}'.format(mse))
    print('Mean Absolute Error: {0}'.format(mae))
    print('R2 Score: {0}'.format(r2))

    # With Bagging
    model_bag = BaggingRegressor(linear_model.SGDRegressor())
    model_bag.fit(X_train, Y_train)
    Y_pred = model_bag.predict(X_test)
    mse = mean_squared_error(Y_test, Y_pred)
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    print('Gradient Descent (with Bagging)')
    print('Mean Squared Error: {0}'.format(mse))
    print('Mean Absolute Error: {0}'.format(mae))
    print('R2 Score: {0}'.format(r2))
def QuickML_Stacking(X_train, y_train, X_test='', modeltype='Regression',
                     Boosting_Flag=False, scoring='', verbose=0):
    """
    Quickly build Stacks of multiple model results.
    Input must be a clean data set (only numeric variables, no categorical
    or string variables).
    """
    start_time = time.time()
    seed = 99
    if len(X_train) <= 100000 or X_train.shape[1] < 50:
        NUMS = 100
        FOLDS = 5
    else:
        NUMS = 200
        FOLDS = 10

    ## create Stacking models
    estimators = []
    ### This keeps track of the number of predict_proba columns generated by each model ####
    estimator_length = []

    if isinstance(X_test, str):
        no_fit = True
    else:
        no_fit = False

    if no_fit:
        #### This is where you don't fit the model but just do cross_val_predict ####
        if modeltype == 'Regression':
            if scoring == '':
                scoring = 'neg_mean_squared_error'
            scv = KFold(n_splits=FOLDS, random_state=seed, shuffle=True)
            if Boosting_Flag:
                ###### Bagging models if Bagging is chosen ####
                model4 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                          n_estimators=NUMS, random_state=seed)
                results = cross_val_predict(model4, X_train, y_train, cv=scv, n_jobs=-1)
                estimators.append(('Bagging1', model4))
                estimator_length.append(1)
            elif Boosting_Flag is None:
                #### Tree models if Linear chosen #####
                model5 = DecisionTreeRegressor(random_state=seed, min_samples_leaf=2)
                results = cross_val_predict(model5, X_train, y_train, cv=scv, n_jobs=-1)
                estimators.append(('Decision Trees', model5))
                estimator_length.append(1)
            else:
                #### Linear Models if Boosting is chosen #####
                model6 = LassoCV(alphas=np.logspace(-10, -1, 50), cv=scv, random_state=seed)
                results = cross_val_predict(model6, X_train, y_train, cv=scv, n_jobs=-1)
                estimators.append(('LassoCV Regularization', model6))
                estimator_length.append(1)
        else:
            n_classes = len(Counter(y_train))
            if scoring == '':
                scoring = 'accuracy'
            scv = StratifiedKFold(n_splits=FOLDS, random_state=seed, shuffle=True)
            if Boosting_Flag:
                #### Linear Models if Boosting is chosen #####
                model4 = LinearDiscriminantAnalysis()
                results = cross_val_predict(model4, X_train, y_train, cv=scv,
                                            n_jobs=-1, method='predict_proba')
                estimators.append(('Linear Discriminant', model4))
                estimator_length.append(results.shape[1])
            elif Boosting_Flag is None:
                #### Tree models if Linear chosen #####
                model6 = DecisionTreeClassifier(min_samples_leaf=2)
                results = cross_val_predict(model6, X_train, y_train, cv=scv,
                                            n_jobs=-1, method='predict_proba')
                estimators.append(('Decision Tree', model6))
                estimator_length.append(results.shape[1])
            else:
                ###### Naive Bayes models if Bagging is chosen ####
                if n_classes <= 2:
                    try:
                        model7 = GaussianNB()
                    except:
                        model7 = DecisionTreeClassifier(min_samples_leaf=2)
                else:
                    try:
                        model7 = MultinomialNB()
                    except:
                        model7 = DecisionTreeClassifier(min_samples_leaf=2)
                results = cross_val_predict(model7, X_train, y_train, cv=scv,
                                            n_jobs=-1, method='predict_proba')
                estimators.append(('Naive Bayes', model7))
                estimator_length.append(results.shape[1])
    else:
        #### This is where you fit the model and then predict ########
        if modeltype == 'Regression':
            if scoring == '':
                scoring = 'neg_mean_squared_error'
            scv = KFold(n_splits=FOLDS, random_state=seed, shuffle=True)
            if Boosting_Flag:
                ###### Bagging models if Bagging is chosen ####
                model4 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                          n_estimators=NUMS, random_state=seed)
                results = model4.fit(X_train, y_train).predict(X_test)
                estimators.append(('Bagging1', model4))
                estimator_length.append(1)
            elif Boosting_Flag is None:
                #### Tree models if Linear chosen #####
                model5 = DecisionTreeRegressor(random_state=seed, min_samples_leaf=2)
                results = model5.fit(X_train, y_train).predict(X_test)
                estimators.append(('Decision Trees', model5))
                estimator_length.append(1)
            else:
                #### Linear Models if Boosting is chosen #####
                model6 = LassoCV(alphas=np.logspace(-10, -1, 50), cv=scv, random_state=seed)
                results = model6.fit(X_train, y_train).predict(X_test)
                estimators.append(('LassoCV Regularization', model6))
                estimator_length.append(1)
        else:
            n_classes = len(Counter(y_train))
            if scoring == '':
                scoring = 'accuracy'
            scv = StratifiedKFold(n_splits=FOLDS, random_state=seed, shuffle=True)
            if Boosting_Flag:
                #### Linear Models if Boosting is chosen #####
                model4 = LinearDiscriminantAnalysis()
                results = model4.fit(X_train, y_train).predict_proba(X_test)
                estimators.append(('Linear Discriminant', model4))
                estimator_length.append(results.shape[1])
            elif Boosting_Flag is None:
                #### Tree models if Linear chosen #####
                model6 = DecisionTreeClassifier(min_samples_leaf=2)
                results = model6.fit(X_train, y_train).predict_proba(X_test)
                estimators.append(('Decision Tree', model6))
                estimator_length.append(results.shape[1])
            else:
                ###### Naive Bayes models if Bagging is chosen ####
                if n_classes <= 2:
                    try:
                        model7 = GaussianNB()
                    except:
                        model7 = DecisionTreeClassifier(min_samples_leaf=2)
                else:
                    try:
                        model7 = MultinomialNB()
                    except:
                        model7 = DecisionTreeClassifier(min_samples_leaf=2)
                results = model7.fit(X_train, y_train).predict_proba(X_test)
                estimators.append(('Naive Bayes', model7))
                estimator_length.append(results.shape[1])

    # stacks = np.c_[results1, results2, results3]
    estimators_list = [(tuples[0], tuples[1]) for tuples in estimators]
    estimator_names = [tuples[0] for tuples in estimators]

    #### Here is where we consolidate the estimator names and their results into one common list ###
    ls = []
    for x, y in dict(zip(estimator_names, estimator_length)).items():
        els = [x + str(eachy) for eachy in range(y)]
        ls += els

    if verbose == 1:
        print('    Time taken for Stacking: %0.1f seconds' % (time.time() - start_time))
    return ls, results
#########################################################
ax2.set_title('Error between actual and predicted loads')
ax2.set_ylabel("Error, MW")

featImportances = gradBoost.feature_importances_
pos = np.arange(len(features))
pairs = zip(features, featImportances)
sorted_pairs = sorted(pairs, key=lambda pair: pair[1])
features_sorted, featImportances_sorted = zip(*sorted_pairs)
fig, ax = plt.subplots()
plt.barh(pos, featImportances_sorted, 1, color="blue")
plt.yticks(pos, features_sorted)
ax.set_title('Gradient Boosting: Relative Feature Importance')

# Tree Bagging
TreeBagger = BaggingRegressor()
TreeBagger.fit(Xtrain, Ytrain)

fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1)
ax1.plot_date(dates, modeldata.Load[45000:50000], 'r-', tz=None, xdate=True,
              ydate=False, label='Actual Load')
ax1.set_title('Tree Bagging: Actual and Predicted Loads')
plt.plot(dates, TreeBagger.predict(Xtest), 'g-', label='Predicted Load')
ax1.legend()
ax2 = fig.add_subplot(2, 1, 2)
ax2.plot_date(dates, modeldata.Load[45000:50000] - TreeBagger.predict(Xtest),
              'r-', tz=None, xdate=True, ydate=False)
ax2.set_title('Error between actual and predicted loads, MW')

MSEs_Bagging = [mean_squared_error(Ytest, TreeBagger.predict(Xtest)),
                mean_squared_error(Ytrain, TreeBagger.predict(Xtrain))]

# Model Comparison: Bar charts
###########################################################
# Bagging Methods
###########################################################
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor

# usual knn
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(xtrain, ytrain)
knn.score(xtrain, ytrain)
knn.score(xtest, ytest)

# full bagging
bf = BaggingRegressor(knn, n_estimators=100, max_samples=1.0, max_features=1.0, random_state=0)
bf.fit(xtrain, ytrain)
bf.score(xtrain, ytrain)
bf.score(xtest, ytest)

# bagging with subsampling and feature randomization
bf = BaggingRegressor(knn, n_estimators=500, max_samples=0.5, max_features=0.5)
bf.fit(xtrain, ytrain)
bf.score(xtrain, ytrain)
bf.score(xtest, ytest)

# effect of estimators
np.random.seed(0)
n_list = [1, 5, 10, 20, 30, 50, 100, 200, 500, 1000]
s = np.zeros((len(n_list), 2))
for i in range(len(n_list)):
    bf = BaggingRegressor(knn, n_estimators=n_list[i], max_samples=0.5, max_features=0.5)
lm_bagged = BaggingRegressor(
    base_estimator=lm,
    n_estimators=75,
    max_samples=n_samp,
    max_features=n_feat,
    bootstrap=True,
    oob_score=False,
    warm_start=False,
    n_jobs=-1
)

log_bagged = BaggingClassifier(
    base_estimator=log,
    n_estimators=75,
    max_samples=n_samp,
    max_features=n_feat,
    bootstrap=True,
    oob_score=False,
    warm_start=False,
    n_jobs=-1
)

lm_bagged.fit(X=train[features], y=train['y'])
log_bagged.fit(X=train[features], y=train['y'])

lm_bagged_preds = lm_bagged.predict(X=test[features])
log_bagged_preds = log_bagged.predict_proba(X=test[features])

write_function(lm_bagged_preds, '/tmp/lm_bagged_preds_nsamp-%s_nfeat-%s.txt' % (n_samp, n_feat))
write_function(second_pos_clip(log_bagged_preds), '/tmp/log_bagged_preds_nsamp-%s_nfeat-%s.txt' % (n_samp, n_feat))
from sklearn.model_selection import cross_val_score

clf = RandomForestRegressor()
scores = cross_val_score(clf, X_test, y_test, cv=5)
scores.mean()

# MAE in $ (the original named this variable mse although it computes MAE)
mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is: $", mae)

# checking r^2
from sklearn.metrics import r2_score
print("r_Score:", r2_score(y_test, y_pred))

bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
bg.score(X_train, y_train)
bg.score(X_test, y_test)

# AdaBoost
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

# Decision tree
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

# GradientBoost
np.transpose(X), err=0.1, pct=50)
end = time.time()
time_all[j, 3] = end - start

print('\n****** DTM *******\n')
rng = np.random.RandomState(42)
max_samples = min(20, X.shape[0])
bag_neigh = 1
clf_spDTM = BaggingRegressor(base_estimator=DTM(n_neighbors=bag_neigh, contamination=0.1),
                             n_estimators=1, max_samples=max_samples,
                             bootstrap=False, random_state=rng)
start = time.time()
y_score_spDTM = clf_spDTM.fit(X, y).predict(X)
end = time.time()
time_all[j, 4] = end - start

precision_iso, recall_iso, thresholds_iso = metrics.precision_recall_curve(
    y, -iso_scores, pos_label=1)
precision_lof, recall_lof, thresholds_lof = metrics.precision_recall_curve(
    y, -lof_scores, pos_label=1)
precision_osvm, recall_osvm, thresholds_osvm = metrics.precision_recall_curve(
    y, -osvm_scores, pos_label=1)
precision_our, recall_our, thresholds_our = metrics.precision_recall_curve(
    y, -our_scores, pos_label=1)
precision_dtm, recall_dtm, thresholds_dtm = metrics.precision_recall_curve(
    y, -y_score_spDTM, pos_label=1)
def bootstrapped_ci(base_estimator, x, y, n_resamples=100, is_regression=True, n_jobs=-1, verbose=0):
    """
    Use Bootstrapped models to get distributions of params to estimate credible distribution. Those credible
    intervals are then used to estimate the distance of the mean from 0.

    >>> from sklearn.linear_model import LinearRegression
    >>> from sklearn.datasets import load_breast_cancer
    >>> x, y = load_breast_cancer(return_X_y=True)
    >>> _ = bootstrapped_ci(LinearRegression(), x, y)
    These are not p-values!
    >>> print('Done')
    Done

    :param n_jobs:
    :param verbose:
    :param base_estimator: SKLearn style linear model that has a coef_ attribute after fitting.
    :param x:
    :param y:
    :param n_resamples: How many Bootstrap draws to make
    :param is_regression:
    :return:
    """
    import scipy.stats as stats
    from rosey.helpers import np_min
    from sklearn.ensemble import BaggingRegressor, BaggingClassifier

    print('These are not p-values!')
    if is_regression:
        # Fit models on bootstrap resamples
        bootstrapped_models = BaggingRegressor(base_estimator, n_estimators=n_resamples, bootstrap=True,
                                               bootstrap_features=False, n_jobs=n_jobs, verbose=verbose)
        bootstrapped_models.fit(x, y)

        # Get params
        model_coefs = np.vstack([est.coef_ for est in bootstrapped_models.estimators_])
        coefs_mu = model_coefs.mean(axis=0)
        coefs_sd = model_coefs.std(axis=0)
        vec = coefs_mu.shape

        # Tail mass on each side of zero (sf/cdf, not pdf: a density is not a probability)
        p_values = np_min(
            stats.norm.sf(np.zeros(vec), loc=coefs_mu, scale=coefs_sd),
            stats.norm.cdf(np.zeros(vec), loc=coefs_mu, scale=coefs_sd))
        if any(p_values > 1):
            warnings.warn('Bad p-values detected')

        return pd.DataFrame(np.vstack([coefs_mu, coefs_sd, p_values]).T, columns=['mu', 'sd', 'p'])
    else:
        raise NotImplementedError
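# A short usage sketch for bootstrapped_ci, reusing the names from the doctest
# above; the p < 0.05 cutoff is illustrative only.
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_breast_cancer
x, y = load_breast_cancer(return_X_y=True)
ci_df = bootstrapped_ci(LinearRegression(), x, y, n_resamples=200)
print(ci_df[ci_df['p'] < 0.05])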
rf = RandomForestRegressor()
br = BaggingRegressor(rf)
pipe = pipeline.Pipeline([('rf', rf), ('br', br)])
parameters = dict(rf__n_estimators=[5, 10, 15, 20],
                  rf__max_depth=[2, 4, 6, 8, 10],
                  rf__random_state=[0, 5, 10, 15],
                  br__n_estimators=[5, 15, 25, 35, 45, 55],
                  br__max_samples=[0.1, 0.2, 0.3],
                  br__random_state=[0, 5, 10, 15, 20, 25, 30])
model = grid_search.GridSearchCV(pipe, parameters)
model.fit(features_train, labels_train)

print("Best parameters:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)
#Best parameters:
#{'br__max_samples': 0.1, 'br__n_estimators': 45, 'rf__max_depth': 6, 'br__random_state': 25, 'rf__random_state': 0, 'rf__n_estimators': 5}
#Best CV score: 0.13390585367

pred = model.predict(features_test)
"""

# Use the best parameters from gridsearch
rf = RandomForestRegressor(n_estimators=5, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

# Write predicted numbers to submission.csv file
pd.DataFrame({"id": id_test, "relevance": pred}).to_csv('submission.csv', index=False)
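# A hedged alternative sketch: grid-search the BaggingRegressor directly and reach
# the inner forest through the nested base_estimator__ parameter path (renamed
# estimator__ in recent scikit-learn), instead of routing both through a Pipeline.
from sklearn.model_selection import GridSearchCV
br = BaggingRegressor(RandomForestRegressor(random_state=0), random_state=25)
bag_params = {'n_estimators': [5, 25, 45],
              'max_samples': [0.1, 0.2, 0.3],
              'base_estimator__n_estimators': [5, 10, 15],
              'base_estimator__max_depth': [2, 6, 10]}
bag_search = GridSearchCV(br, bag_params)
bag_search.fit(features_train, labels_train)
print(bag_search.best_params_)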
del globals()['profilesLSo']
del globals()['profilesLS']
del globals()['row']
del globals()['tmpLS']
del globals()['tmpAGE']
del globals()['profsTOlikes']
del globals()['i']
del globals()['tmpIND']

import math

seed = 7
np.random.seed(seed)  # np.random.seed returns None, so there is nothing to assign

X_train, X_test, y_train, y_test = train_test_split(likesMAT, neusARR, test_size=1500)

nJOBS = int(sys.argv[1])
nEST = int(sys.argv[2])
bagOUT = BaggingRegressor(n_jobs=nJOBS, n_estimators=nEST, oob_score=True)
#bagOUT.fit(likesMAT, neusARR)
bagOUT.fit(X_train, y_train)
y_pred = bagOUT.predict(X_test)

myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("neus, bagOUT: ", str(nEST), " ", myRMSE)

# joblib.dump(bagOUT, "/Users/jamster/bagOUT-A-neus.xz", compress=9)
# impbagOUT = joblib.load("/Users/jamster/bagOUT-A-neus.xz")
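# With oob_score=True above, the out-of-bag R^2 is available after fitting;
# a quick sanity check alongside the held-out RMSE (sketch, same names as above).
print("neus, bagOUT OOB R^2: ", bagOUT.oob_score_)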
model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, cv=5, verbose=0, scoring=RMSE)

errors = []
X_train = df.drop(['product_uid', 'id', 'relevance'], axis=1).values
y_train = df['relevance'].values
model.fit(X_train, y_train)

print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

del X_train, y_train

kf = KFold(df.shape[0], n_folds=K_fold)
for train_index, test_index in kf:
    train_set = df.iloc[train_index]
    test_set = df.iloc[test_index]
    y_train = train_set['relevance'].values
    X_train = train_set.drop(['product_uid', 'id', 'relevance'], axis=1).values
    y_test = test_set['relevance'].values  # ground truth must come from the held-out fold
    X_test = test_set.drop(['product_uid', 'id', 'relevance'], axis=1).values
    clf2.fit(X_train, y_train)
    result = clf2.predict(X_test)
    error = np.sqrt(mean_squared_error(y_test, result))
    errors.extend([error])

print(np.mean(errors))
# Model
#---------------------------------------------------------------------------------
if not hyperparam_opt:
    rf = RandomForestRegressor(n_estimators=20, bootstrap=True, min_samples_leaf=15,
                               min_samples_split=15, max_features=3, max_depth=10)
    clf = BaggingRegressor(rf, n_estimators=5, max_samples=0.1, random_state=0)
    bf = GradientBoostingRegressor(n_estimators=5, max_depth=6, random_state=0)
    rf.fit(X_train, y_train)
    print("random forest fitted...")
    clf.fit(X_train, y_train)
    print("bagging fitted...")
    bf.fit(X_train, y_train)
    print("boosting fitted...")
else:
    rf = RandomForestRegressor(n_estimators=20)
    clf = BaggingRegressor(rf, n_estimators=20)
    bf = GradientBoostingRegressor(n_estimators=20)  # min_samples_split=2, learning_rate=0.01, loss='ls'
    all_models = [{'rf': rf}, {'clf': clf}, {'bf': bf}]
    all_results = []
    for model in all_models:
        all_results.append(rand_search(model, True))
    rf, clf, bf = all_results
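# A minimal sketch comparing the three fitted models on held-out data, assuming
# X_test and y_test exist alongside the X_train/y_train used above.
from sklearn.metrics import mean_squared_error
for label, model in [('random forest', rf), ('bagging', clf), ('boosting', bf)]:
    rmse = mean_squared_error(y_test, model.predict(X_test)) ** 0.5
    print("%s RMSE: %.4f" % (label, rmse))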
# Getting Testing Data out of the DF
test_data_frame = data_frame_regression.iloc[num_train:]
# Getting IDs for Testing Data
id_test = test_data_frame['id']
relevance_train = train_data_frame['relevance'].values
# All the Independent Variables in the Regressor
# These are Words in Title, Description, Values
X_train = train_data_frame.drop(['id', 'relevance'], axis=1).values
# Same for Test Data
X_test = test_data_frame.drop(['id', 'relevance'], axis=1).values
# Using RandomForest Regressor
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
# Using Bagging Regressor
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
# Fit the Training Data to a Model
clf.fit(X_train, relevance_train)
# Predicting the relevance for Testing Data
relevance_pred = clf.predict(X_test)
# Writing the Relevance Values to submission.csv
pandas.DataFrame({"id": id_test, "relevance": relevance_pred}).to_csv('submission.csv', index=False)
y = tran_np[:, 7]
#-----------input------
x = tran_np[:, :7]
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)

#-----------use bagging------
bagging_clf = BaggingRegressor(clf, n_estimators=20, max_features=1.0, max_samples=0.8,
                               bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(x, y)
print bagging_clf  # show the fitted ensemble, not the bare base estimator

#-----------test data------
test = pd.read_csv('/home/perfecum/下载/test.csv', header=0)  # '下载' is the Downloads folder
test_df = test.filter(regex='Dealership|Showroom|ComputerSearch|M5|3Series|Z4|Financing')
print test_df
test_np = test_df.as_matrix()
predictions = bagging_clf.predict(test_np)
print predictions
#-----------something test------
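#-----------note on bagging a classifier------
# BaggingRegressor averages the 0/1 labels that LogisticRegression predicts.
# A hedged alternative sketch with BaggingClassifier keeps proper class votes
# and exposes predict_proba (same data, illustrative only):
from sklearn.ensemble import BaggingClassifier
bagging_clf2 = BaggingClassifier(clf, n_estimators=20, max_features=1.0, max_samples=0.8,
                                 bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf2.fit(x, y)
proba = bagging_clf2.predict_proba(test_np)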
out_dir = "STS-en-{}-{}".format(GROUP, APPROACH)
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

filenames = []

for sts12_train_id, sts12_test_id, sts13_test_id, sts14_test_id in id_pairs:
    # combine 2012, 2013 training and test data
    X_sts12_train, y_sts12_train = ntnu_sts12.read_train_data(sts12_train_id, feats)
    X_sts12_test, y_sts12_test = ntnu_sts12.read_test_data(sts12_test_id, feats)
    X_sts13_test, y_sts13_test = sts13.read_test_data(sts13_test_id, feats)
    X_train = np.vstack([X_sts12_train, X_sts12_test, X_sts13_test])
    y_train = np.hstack([y_sts12_train, y_sts12_test, y_sts13_test])

    regressor.fit(X_train, y_train)

    X_test = read_blind_test_data(sts14_test_id, feats)
    y_test = regressor.predict(X_test)

    test_input = read_system_input(test_input_fnames[sts14_test_id])
    postprocess(test_input, y_test)

    fname = "{}/STS-en.output.{}.txt".format(out_dir, sts14_test_id)
    write_scores(fname, y_test)
    filenames.append(fname)

descr_fname = "{}/STS-en-{}-{}.description.txt".format(out_dir, GROUP, APPROACH)
with open(descr_fname, "w") as f:
    f.write(DESCRIPTION)
filenames.append(descr_fname)
def train_bagging_cart(X, Y):
    # bagging ensemble of depth-5 CARTs
    bagging = BaggingRegressor(DecisionTreeRegressor(max_depth=5),
                               max_features=0.7, n_estimators=30)
    bagging.fit(X, Y)
    return bagging
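# A short usage sketch for the helper above, assuming X and Y are shaped like the
# inputs to the other train_* helpers in this file.
cart_bagging = train_bagging_cart(X, Y)
Y_pred = cart_bagging.predict(X)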
X[np.isnan(X)] = 0.

print '******************************************'
print name
print '******************************************'

if name=='Boston' or name=='Diabetes':
    # Regression problem
    rfr = RandomForestRegressor(**params)
    rfr.fit(X, y)
    print 'Score RandomForestRegressor = %s' % (rfr.score(X, y))
    scores_rfr = cross_val_score(rfr, X, y, cv=5)
    print 'Cross Val Score RandomForestRegressor = %s' % (np.mean(scores_rfr))
    br = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=max_depth), n_estimators=n_estimators)
    br.fit(X, y)
    print 'Score BaggingRegressor = %s' % (br.score(X, y))
    scores_br = cross_val_score(br, X, y, cv=5)
    print 'Cross Val Scores of BR = %s' % (np.mean(scores_br))

if name=='Iris' or name=='Digits':
    # Classification problem
    rfc = RandomForestClassifier(**params)
    rfc.fit(X, y)
    print 'Score RandomForestClassifier = %s' % (rfc.score(X, y))
    scores_rfc = cross_val_score(rfc, X, y, cv=5)
    print 'Cross Val Scores of RandomForestClassifier = %s' % (np.mean(scores_rfc))
    bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=max_depth), n_estimators=n_estimators)
    bc.fit(X, y)
    print 'Score BaggingClassifier = %s' % (bc.score(X, y))
print('Final Score %f' % score)
print('Final Out-of-Fold Score %f' % oof_score)
print('=====================')

ens0_pred = prediction1
submission = pd.read_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/input/sample_submission.csv')
submission.y = ens0_pred
submission.id = id
submission.columns = ['ID', 'y']
submission.to_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/output/layer2_adaboostregressor_cv%f.csv' % oof_score, index=False)

print("Ensemble Model 1: BaggingRegressor")
ens1 = BaggingRegressor(DecisionTreeRegressor(max_depth=4), random_state=1337)
ens1.fit(df, train.y)

# In Sample R2
ens1_insample_pred = ens1.predict(df)
r2_score(train.y, ens1_insample_pred)  # 0.62753671854582205 0.6998279121628439

# Predict
ens1_pred = ens1.predict(df_test)  # LB: -0.77554
submission.y = ens1_pred
submission.id = id
submission.columns = ['ID', 'y']
submission.to_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/output/layer2_baggingreg.csv', index=False)

print("Ensemble Model 2: ExtraTreesRegressor")
ens2 = ExtraTreesRegressor(n_estimators=10, criterion='mse', max_depth=None, min_samples_split=2,
df_all['letter_in_description'] = df_all['product_info'].map(
    lambda x: str_common_letter(x.split('\t')[0], x.split('\t')[2]))

print("Drop columns that were changed...")
df_all = df_all.drop(['search_term', 'product_title', 'product_description', 'product_info'], axis=1)

# Set up training and test sets
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
id_test = df_test['id']
y_train = df_train['relevance'].values

# Drop 'id' and 'relevance' columns from the training and test sets
X_train = df_train.drop(['id', 'relevance'], axis=1).values
X_test = df_test.drop(['id', 'relevance'], axis=1).values

# Set up RandomForest and Bagging regressors
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)

# Fit the training data into the regression model using the output values
clf.fit(X_train, y_train)

# Run the prediction
y_pred = clf.predict(X_test)

# Set up our data frame and write the submission (to_csv returns None, so keep the frame separate)
datafr = pd.DataFrame({"id": id_test, "relevance": y_pred})
datafr.to_csv('../dataset/submission.csv', index=False)
print(datafr)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor

movies = pd.read_csv('avaliacoes_usuario.csv')
caracteristicas = movies[movies.columns[1:16]]
gostos = movies[movies.columns[16:]]

treino, teste, treino_marcacoes, teste_marcacoes = train_test_split(caracteristicas, gostos)

treino = np.array(treino).reshape(len(treino), 15)
teste = np.array(teste).reshape(len(teste), 15)
treino_marcacoes = np.array(treino_marcacoes).reshape(len(treino_marcacoes), 1)
teste_marcacoes = np.array(teste_marcacoes).reshape(len(teste_marcacoes), 1)

# Thinking in terms of regression - taking the mean
modelo = BaggingRegressor()
modelo.fit(treino, treino_marcacoes.ravel())
print(modelo.score(treino, treino_marcacoes))
print(modelo.score(teste, teste_marcacoes))
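# Since this is regression, an error metric reads more naturally than score();
# a minimal RMSE sketch on the held-out split above.
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(teste_marcacoes, modelo.predict(teste)))
print('RMSE:', rmse)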