def avmPredict(params):
    town = getPlace(params['lat'], params['long'])[0]
    x, y, z = getXYZ(params['lat'], params['long'])
    r = 1.0

    data = []
    target = []
    header = []
    with open('../../../data/working22.csv') as f:
        f = csv.reader(f)
        header = next(f)
        for row in f:
            # list() keeps the slice below working on Python 3, where map() is lazy
            t = (list(map(float, row[:3] + row[4:])), float(row[3]))
            if weightF([x, y, z], t[0][0:3], r):
                data.append(t[0])
                target.append(t[1])

    ensemble = BaggingRegressor()
    ensemble.fit(data, target)

    test = createTest(params)
    return ensemble.predict(test)
def test_oob_score_regression():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                           n_estimators=50,
                           bootstrap=True,
                           oob_score=True,
                           random_state=rng).fit(X_train, y_train)

    test_score = clf.score(X_test, y_test)
    assert_less(abs(test_score - clf.oob_score_), 0.1)

    # Test with few estimators
    assert_warns(UserWarning,
                 BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                  n_estimators=1,
                                  bootstrap=True,
                                  oob_score=True,
                                  random_state=rng).fit,
                 X_train, y_train)
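# The test above relies on the old `boston` fixture, which has been removed
# from current scikit-learn. Below is a minimal, self-contained sketch of the
# same OOB-vs-held-out comparison on synthetic data; the dataset and sizes are
# assumptions for illustration, not part of the original test suite.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=600, n_features=8, noise=10.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

reg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=50,
                       bootstrap=True, oob_score=True,
                       random_state=0).fit(X_train, y_train)

# oob_score_ is the R^2 estimated on out-of-bag samples; it should track
# the held-out R^2 reasonably closely
print(reg.oob_score_, reg.score(X_test, y_test))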
def train_model(train, test, labels):
    rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=10)
    #rf = RandomForestRegressor(n_estimators=45, max_depth=9, random_state=10)
    clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.2, random_state=25)
    clf.fit(train, labels)
    #clf = SVR(C=1.0, epsilon=0.2)
    #clf.fit(train, labels)
    #clf = GaussianNB()
    #clf.fit(train, labels)
    print "Good!"

    predictions = clf.predict(test)
    print predictions.shape
    predictions = pd.DataFrame(predictions, columns=['relevance'])
    print "Good again!"
    print "Predictions head -------"
    print predictions.head()
    print predictions.shape
    print "TEST head -------"
    print test.head()
    print test.shape

    #test['id'].to_csv("TEST_TEST.csv", index=False)
    #predictions.to_csv("PREDICTIONS.csv", index=False)
    #test = test.reset_index()
    #predictions = predictions.reset_index()
    #test = test.groupby(level=0).first()
    #predictions = predictions.groupby(level=0).first()
    predictions = pd.concat([test['id'], predictions], axis=1,
                            verify_integrity=False)
    print predictions
    return predictions
def train_bagging_xgboost(X, Y):
    # note: the original named this variable "adaboost", but it is a bagging ensemble
    bagging = BaggingRegressor(
        xgb.XGBRegressor(max_depth=6, learning_rate=0.02, n_estimators=300,
                         silent=True, objective='reg:linear', subsample=0.7,
                         reg_alpha=0.8, reg_lambda=0.8, booster="gblinear"),
        max_features=0.7,
        n_estimators=30)
    bagging.fit(X, Y)
    return bagging
def model_fit_rf_bagging():
    def in_limits(x):
        if x < 1:
            return 1
        if x > 3:
            return 3
        return x

    print "STARTING MODEL"
    X = full_data[['count_words', 'count_digits', 'match_d_title',
                   'match_d_description', 'match_w_title',
                   'match_w_description', 'match_d_attribute',
                   'match_w_attribute']].values
    y = full_data['relevance'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        random_state=42)

    rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
    clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    in_limits = np.vectorize(in_limits, otypes=[np.float])
    y_pred = in_limits(y_pred)
    RMSE = mean_squared_error(y_test, y_pred) ** 0.5
    print "RMSE: ", RMSE

    # for the submission
    real_X_test = real_full_test[['count_words', 'count_digits',
                                  'match_d_title', 'match_d_description',
                                  'match_w_title', 'match_w_description',
                                  'match_d_attribute', 'match_w_attribute']].values
    test_pred = clf.predict(real_X_test)
    test_pred = in_limits(test_pred)
    return test_pred
def test_bootstrap_samples():
    """Test that bootstrapping samples generates non-perfect base estimators."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    base_estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=False,
                                random_state=rng).fit(X_train, y_train)

    assert_equal(base_estimator.score(X_train, y_train),
                 ensemble.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=True,
                                random_state=rng).fit(X_train, y_train)

    assert_greater(base_estimator.score(X_train, y_train),
                   ensemble.score(X_train, y_train))
def fit(self):
    """Scale data and train the model with the indicated algorithm.

    Do not forget to tune the hyperparameters.

    Parameters
    ----------
    algorithm : String,
        "KernelRidge", "SVM", "LinearRegression", "Lasso", "ElasticNet",
        "NeuralNet", "BaggingNeuralNet", default = "SVM"
    """
    self.X_scaler.fit(self.X_train)
    self.Y_scaler.fit(self.y_train)

    # scaling the data in all cases, it may not be used during the fit later
    self.X_train_sc = self.X_scaler.transform(self.X_train)
    self.y_train_sc = self.Y_scaler.transform(self.y_train)
    self.X_test_sc = self.X_scaler.transform(self.X_test)
    self.y_test_sc = self.Y_scaler.transform(self.y_test)

    if self.algorithm == "KernelRidge":
        clf_kr = KernelRidge(kernel=self.user_kernel)
        self.model = sklearn.model_selection.GridSearchCV(
            clf_kr, cv=5, param_grid=self.param_kr)
    elif self.algorithm == "SVM":
        clf_svm = SVR(kernel=self.user_kernel)
        self.model = sklearn.model_selection.GridSearchCV(
            clf_svm, cv=5, param_grid=self.param_svm)
    elif self.algorithm == "Lasso":
        clf_lasso = sklearn.linear_model.Lasso(alpha=0.1,
                                               random_state=self.rand_state)
        self.model = sklearn.model_selection.GridSearchCV(
            clf_lasso, cv=5, param_grid=dict(alpha=np.logspace(-5, 5, 30)))
    elif self.algorithm == "ElasticNet":
        clf_ElasticNet = sklearn.linear_model.ElasticNet(
            alpha=0.1, l1_ratio=0.5, random_state=self.rand_state)
        self.model = sklearn.model_selection.GridSearchCV(
            clf_ElasticNet, cv=5, param_grid=dict(alpha=np.logspace(-5, 5, 30)))
    elif self.algorithm == "LinearRegression":
        self.model = sklearn.linear_model.LinearRegression()
    elif self.algorithm == "NeuralNet":
        self.model = MLPRegressor(**self.param_neurons)
    elif self.algorithm == "BaggingNeuralNet":
        nn_m = MLPRegressor(**self.param_neurons)
        self.model = BaggingRegressor(base_estimator=nn_m, **self.param_bag)

    if self.scaling == True:
        self.model.fit(self.X_train_sc, self.y_train_sc.reshape(-1,))
        predict_train_sc = self.model.predict(self.X_train_sc)
        self.prediction_train = self.Y_scaler.inverse_transform(
            predict_train_sc.reshape(-1, 1))
        predict_test_sc = self.model.predict(self.X_test_sc)
        self.prediction_test = self.Y_scaler.inverse_transform(
            predict_test_sc.reshape(-1, 1))
    else:
        self.model.fit(self.X_train, self.y_train.reshape(-1,))
        self.prediction_train = self.model.predict(self.X_train)
        self.prediction_test = self.model.predict(self.X_test)
def random_forest(X, Y, Xt):
    print('learn')
    rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
    clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
    clf.fit(X, Y)
    print('predict')
    Yp_clamped = clf.predict(Xt)
    return Yp_clamped
def test_sparse_regression():
    # Check regression for various parameter settings on sparse input.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set"""

        def fit(self, X, y):
            super().fit(X, y)
            self.data_type_ = type(X)
            return self

    parameter_sets = [
        {"max_samples": 0.5, "max_features": 2,
         "bootstrap": True, "bootstrap_features": True},
        {"max_samples": 1.0, "max_features": 4,
         "bootstrap": True, "bootstrap_features": True},
        {"max_features": 2, "bootstrap": False, "bootstrap_features": True},
        {"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False},
    ]

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:
            # Trained on sparse format
            sparse_classifier = BaggingRegressor(
                base_estimator=CustomSVR(),
                random_state=1,
                **params
            ).fit(X_train_sparse, y_train)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_results = BaggingRegressor(
                base_estimator=CustomSVR(),
                random_state=1,
                **params
            ).fit(X_train, y_train).predict(X_test)

            sparse_type = type(X_train_sparse)
            types = [i.data_type_ for i in sparse_classifier.estimators_]

            assert_array_almost_equal(sparse_results, dense_results)
            assert all([t == sparse_type for t in types])
def procedureA(goldenFlag=False):
    # Trains and generates a prediction file
    # Uses hard heuristic for buy_or_not
    popFlag = True
    X, Y = getDataXY(currYearFlag=False, popFlag=popFlag)
    X, Y = shuffle(X, Y, random_state=0)

    if popFlag:
        encoder = oneHot(X[:, 2:])
        Xt = encoder.transform(X[:, 2:])
        Xt = np.hstack((X[:, :2], Xt))
    else:
        encoder = oneHot(X)
        Xt = encoder.transform(X)

    buySet = set()
    for i in range(X.shape[0]):
        tmpTup = (X[i][0], X[i][2])
        buySet.add(tmpTup)

    # Y_buy = [1] * Xt.shape[0]
    min_max_scaler = preprocessing.MinMaxScaler()
    # Xt = min_max_scaler.fit_transform(Xt)

    if goldenFlag:
        print Xt.shape
        Xt = getGoldenX(Xt, 2, 2 + encoder.feature_indices_[1],
                        2 + encoder.feature_indices_[0],
                        2 + min(9, encoder.feature_indices_[1]))

    split = 0.9
    X_train, X_test = Xt[:int(Xt.shape[0] * split), :], Xt[int(Xt.shape[0] * split):, :]
    Y_train, Y_test = Y[:int(Y.shape[0] * split), :], Y[int(Y.shape[0] * split):, :]
    Y_train = Y_train.ravel()
    Y_test = Y_test.ravel()

    print X_train.shape
    print X_test.shape

    # clf = Ridge(alpha = 100)
    # clf = SVR(C = 10.0, kernel = 'poly', degree = 2)
    # clf = LinearSVR(C = 1.0)
    clf = BaggingRegressor(DecisionTreeRegressor(), n_estimators=125,
                           n_jobs=4, random_state=0)
    # clf = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators = 100)
    # clf = DecisionTreeRegressor()
    # clf = RandomForestRegressor(random_state = 0, n_estimators = 200, n_jobs = 4)
    clf.fit(X_train, Y_train.ravel())

    Y_pred = clf.predict(X_test)
    evaluatePred(Y_pred, Y_test)

    return clf, encoder, min_max_scaler
def test_single_estimator():
    # Check singleton ensembles.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    clf1 = BaggingRegressor(base_estimator=KNeighborsRegressor(),
                            n_estimators=1,
                            bootstrap=False,
                            bootstrap_features=False,
                            random_state=rng).fit(X_train, y_train)

    clf2 = KNeighborsRegressor().fit(X_train, y_train)

    assert_array_almost_equal(clf1.predict(X_test), clf2.predict(X_test))
def train_model(training, testing, window=5, n=5):
    X_train, y_train = prepare_data(training)
    X_test, y_test = prepare_data(testing)

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    predrf = rf.predict(X_test)
    print "mse for random forest regressor: ", mean_squared_error(predrf, y_test)

    gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.025)
    gb.fit(X_train, y_train)
    predgb = gb.predict(X_test)
    print "mse for gradient boosting regressor: ", mean_squared_error(predgb, y_test)

    ## plot feature importance using GBR results
    fx_imp = pd.Series(gb.feature_importances_,
                       index=['bb', 'momentum', 'sma', 'volatility'])
    fx_imp /= fx_imp.max()  # normalize
    fx_imp.sort()
    ax = fx_imp.plot(kind='barh')
    fig = ax.get_figure()
    fig.savefig("output/feature_importance.png")

    adb = AdaBoostRegressor(DecisionTreeRegressor())
    adb.fit(X_train, y_train)
    predadb = adb.predict(X_test)
    print "mse for adaboosting decision tree regressor: ", mean_squared_error(predadb, y_test)

    scale = StandardScaler()
    scale.fit(X_train)
    X_trainscale = scale.transform(X_train)
    X_testscale = scale.transform(X_test)

    knn = BaggingRegressor(KNeighborsRegressor(n_neighbors=10),
                           max_samples=0.5, max_features=0.5)
    knn.fit(X_trainscale, y_train)
    predknn = knn.predict(X_testscale)
    print "mse for bagging knn regressor: ", mean_squared_error(predknn, y_test)

    pred_test = 0.1 * predrf + 0.2 * predgb + 0.1 * predadb + 0.6 * predknn
    print "mse for ensemble all the regressors: ", mean_squared_error(pred_test, y_test)

    result = testing.copy()
    result.ix[5:-5, 'trend'] = pred_test
    result.ix[10:, 'pred'] = pred_test * result.ix[5:-5, 'IBM'].values
    result.ix[:-5, 'pred_date'] = result.index[5:]

    return result
def procc_modelfusion(df_test, data_test):
    from sklearn.ensemble import BaggingRegressor
    from sklearn import linear_model

    train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
    train_np = train_df.as_matrix()

    # y is the Survived outcome
    y = train_np[:, 0]

    # X is the feature matrix
    X = train_np[:, 1:]

    # fit with a BaggingRegressor
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    bagging_clf = BaggingRegressor(clf, n_estimators=10, max_samples=0.8,
                                   max_features=1.0, bootstrap=True,
                                   bootstrap_features=False, n_jobs=-1)
    bagging_clf.fit(X, y)

    test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
    predictions = bagging_clf.predict(test)
    result = pd.DataFrame({'PassengerId': data_test['PassengerId'].as_matrix(),
                           'Survived': predictions.astype(np.int32)})
    result.to_csv("logistic_regression_predictions3.csv", index=False)
class Regressor(BaseEstimator):
    def __init__(self):
        # self.clf = GradientBoostingRegressor(n_estimators=200, max_features="sqrt", max_depth=5)
        # self.clf = LinearRegression()
        self.clf = BaggingRegressor(LinearRegression())
        # self.clf = GaussianProcess(theta0=4)
        # self.sp = RandomizedLasso()
        self.sp = SparseRandomProjection(n_components=5)
        # self.sp = TruncatedSVD()
        # self.sp = KernelPCA(n_components=3, tol=0.0001, kernel="poly")
        # self.clf = ExtraTreesRegressor(n_estimators=200, max_features="sqrt", max_depth=5)

    def fit(self, X, y):
        # print(self.sp)
        # Xr = self.sp.fit_transform(X, y)
        self.clf.fit(X, y.ravel())

    def predict(self, X):
        # Xr = self.sp.transform(X)
        return self.clf.predict(X)
def test_parallel_regression():
    # Check parallel regression.
    rng = check_random_state(0)

    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                n_jobs=3,
                                random_state=0).fit(X_train, y_train)

    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                n_jobs=1,
                                random_state=0).fit(X_train, y_train)

    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3)
def runTests():
    # Generate the training samples, extract training features and target
    trainSamples = GenSamples(numSamples)
    trainFeatures = extractFeatures(trainSamples)
    trainPred = extractPred(trainSamples)

    # Generate the test samples, extract test features and target
    testSamples = GenSamples(numTestSamples)
    testFeatures = extractFeatures(testSamples)
    testPred = extractPred(testSamples)

    R2List = OrderedDict()
    R2List['TrainROI'] = []
    R2List['TestROI'] = []

    print 'Running Tests: '
    for i in range(numTests):
        # Bootstrap is True by default i.e., sampling with replacement
        # Bootstrap features is False by default i.e., all features used
        classifier = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                      n_estimators=numTrees,
                                      max_samples=int(0.5 * numSamples),
                                      max_features=int(1))
        classifier.fit(trainFeatures, trainPred)

        predictROI = {}
        predictROI['Training'] = classifier.predict(trainFeatures)
        predictROI['Test'] = classifier.predict(testFeatures)

        R2 = {}
        R2['Train'] = r2_score(trainPred, predictROI['Training'])
        R2['Test'] = r2_score(testPred, predictROI['Test'])
        R2List['TrainROI'].append(R2['Train'])
        R2List['TestROI'].append(R2['Test'])

    print 'Best Train ROI: ', max(R2List['TrainROI'])
    print 'Best Test ROI: ', max(R2List['TestROI'])
def test_bagging_regressor_with_missing_inputs():
    # Check that BaggingRegressor can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, np.NINF, 6],
    ])
    y_values = [
        np.array([2, 3, 3, 3, 3]),
        np.array([
            [2, 1, 9],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
        ])
    ]
    for y in y_values:
        regressor = DecisionTreeRegressor()
        pipeline = make_pipeline(
            Imputer(),
            Imputer(missing_values=np.inf),
            Imputer(missing_values=np.NINF),
            regressor
        )
        pipeline.fit(X, y).predict(X)
        bagging_regressor = BaggingRegressor(pipeline)
        y_hat = bagging_regressor.fit(X, y).predict(X)
        assert_equal(y.shape, y_hat.shape)

    # Verify that exceptions can be raised by wrapper regressor
    regressor = DecisionTreeRegressor()
    pipeline = make_pipeline(regressor)
    assert_raises(ValueError, pipeline.fit, X, y)
    bagging_regressor = BaggingRegressor(pipeline)
    assert_raises(ValueError, bagging_regressor.fit, X, y)
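# `Imputer` above is the pre-0.22 preprocessing API. A rough modern equivalent
# (an assumption: scikit-learn >= 0.22, where SimpleImputer replaced Imputer;
# this sketch handles only NaN, not the inf cases chained above) follows:
import numpy as np
from sklearn.ensemble import BaggingRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor

X = np.array([[1, 3, 5], [2, np.nan, 6], [2, np.nan, 6],
              [3, 4, 7], [4, 5, 8]], dtype=float)
y = np.array([2.0, 3.0, 3.0, 4.0, 5.0])

# bagging over a pipeline lets each base estimator impute its own bootstrap
pipeline = make_pipeline(SimpleImputer(missing_values=np.nan),
                         DecisionTreeRegressor())
print(BaggingRegressor(pipeline).fit(X, y).predict(X))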
def test_bootstrap_samples():
    # Test that bootstrapping samples generates non-perfect base estimators.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    base_estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=False,
                                random_state=rng).fit(X_train, y_train)

    assert_equal(base_estimator.score(X_train, y_train),
                 ensemble.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=True,
                                random_state=rng).fit(X_train, y_train)

    assert_greater(base_estimator.score(X_train, y_train),
                   ensemble.score(X_train, y_train))

    # Check that each sampling corresponds to a complete bootstrap resample.
    # The size of each bootstrap should be the same as the input data, but
    # the data should be different (checked via the hash of the data).
    ensemble = BaggingRegressor(base_estimator=DummySizeEstimator(),
                                bootstrap=True).fit(X_train, y_train)
    training_hash = []
    for estimator in ensemble.estimators_:
        assert estimator.training_size_ == X_train.shape[0]
        training_hash.append(estimator.training_hash_)
    assert len(set(training_hash)) == len(training_hash)
class BaggingRegressor(BaseEstimator):
    """
    Usage:

    ```
    "model": {
        "class": "ume.ensemble.BaggingRegressor",
        "params": {
            "base_estimator": {
                "class": "sklearn.svm.SVR",
                "params": {
                    "kernel": "rbf",
                    "degree": 1,
                    "C": 1000000.0,
                    "epsilon": 0.01,
                },
            },
            "bag_kwargs": {
                "n_estimators": 100,
                "n_jobs": 5,
                "max_samples": 0.9,
            },
        }
    }
    ```
    """
    def __init__(self, base_estimator=None, bag_kwargs=None):
        klass = dynamic_load(base_estimator['class'])
        svr_reg = klass(**base_estimator['params'])
        self.__clf = SK_BaggingRegressor(base_estimator=svr_reg, **bag_kwargs)

    def fit(self, X, y):
        return self.__clf.fit(X, y)

    def predict(self, X):
        return self.__clf.predict(X)
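# A hypothetical usage sketch for the wrapper above. It assumes this class and
# ume's dynamic_load helper are importable from the surrounding project; the
# config shape mirrors the docstring rather than a tested API, and the data is
# synthetic.
import numpy as np

config = {
    "base_estimator": {
        "class": "sklearn.svm.SVR",
        "params": {"kernel": "rbf", "C": 1000000.0, "epsilon": 0.01},
    },
    "bag_kwargs": {"n_estimators": 10, "n_jobs": 1, "max_samples": 0.9},
}

rng = np.random.RandomState(0)
X, y = rng.rand(100, 3), rng.rand(100)

model = BaggingRegressor(base_estimator=config["base_estimator"],
                         bag_kwargs=config["bag_kwargs"])
model.fit(X, y)
print(model.predict(X[:5]))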
def get_bagging_prediction(X_train, y_train, X_test, X_valid=None, GS=False):
    if not GS:
        rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
        clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        if X_valid is None:
            return y_pred
        else:
            return y_pred, clf.predict(X_valid)
    else:
        rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
        clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
        # the grid keys must address the wrapped forest through the bagging
        # estimator; the original 'rfr__'-prefixed keys match a pipeline step
        # that does not exist here and would make GridSearchCV raise
        param_grid = {'base_estimator__max_features': [10],
                      'base_estimator__max_depth': [20]}
        model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid,
                                         n_jobs=-1, cv=2, verbose=VERBOSE,
                                         scoring=RMSE)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        if X_valid is None:
            return y_pred
        else:
            return y_pred, model.predict(X_valid)
# Getting Testing Data out of the DF
test_data_frame = data_frame_regression.iloc[num_train:]

# Getting IDs for Testing Data
id_test = test_data_frame['id']

relevance_train = train_data_frame['relevance'].values

# All the Independent Variables in the Regressor
# These are Words in Title, Description, Values
X_train = train_data_frame.drop(['id', 'relevance'], axis=1).values

# Same for Test Data
X_test = test_data_frame.drop(['id', 'relevance'], axis=1).values

# Using RandomForest Regressor
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)

# Using Bagging Regressor
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)

# Fit the Training Data to a Model
clf.fit(X_train, relevance_train)

# Predicting the relevance for Testing Data
relevance_pred = clf.predict(X_test)

# Writing the Relevance Values to submission.csv
pandas.DataFrame({"id": id_test, "relevance": relevance_pred}).to_csv('submission.csv', index=False)
import pandas

df_dir = '../data/'
K_fold = 2
num_train = 74000


def fmean_squared_error(ground_truth, predictions):
    fmean_squared_error_ = mean_squared_error(ground_truth, predictions) ** 0.5
    return fmean_squared_error_


RMSE = make_scorer(fmean_squared_error, greater_is_better=False)

df = pandas.read_csv(df_dir + 'my_df_all.csv')
df = df[:num_train].drop(['Unnamed: 0'], axis=1)

rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
clf2 = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
clf2 = rf
clf = pipeline.Pipeline([('rfr', rf)])

param_grid = {'rfr__n_estimators': [350],  # 300 top
              'rfr__max_depth': [8],  # list(range(7,8,1))
              }
# param_grid = {'rfr__n_estimators': list(range(34, 50, 1)),
#               'rfr__max_depth': list(range(13, 15, 1))}
model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid,
                                 n_jobs=-1, cv=5, verbose=0, scoring=RMSE)

errors = []
X_train = df.drop(['product_uid', 'id', 'relevance'], axis=1).values
y_train = df['relevance'].values

model.fit(X_train, y_train)

print("Best parameters found by grid search:")
print(model.best_params_)
def bagging(df1, features, pred_var, df2):
    lr = BaggingRegressor()
    lr.fit(df1[features], df1[pred_var])
    # R^2 on the held-out frame (the original label said "BaggingClassifier",
    # but this is a BaggingRegressor)
    print 'BaggingRegressor Score: ', lr.score(df2[features], df2[pred_var])
def run_stack(SEED):
    model = "Lasso"

    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../models/' + model + '_train.csv')
    trainBaseWeight = trainBase['var11']
    test = pd.read_csv('../models/' + model + '_test.csv')
    #trainBase = shuffle(trainBase, random_state = SEED)

    print(trainBase.columns)
    trainBaseID = trainBase['id']
    testID = test['id']

    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    test = np.nan_to_num(np.array(test))

    avg = 0
    NumFolds = 5

    #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=30, random_state=166, min_samples_leaf=1),
    #Ridge()
    clfs = [
        #KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski')
        #SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False)
        #BayesianRidge(n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06, compute_score=False, fit_intercept=True, normalize=False, copy_X=True, verbose=False)
        #ElasticNet(alpha=0.00069956421567126271, l1_ratio=1/10, fit_intercept=True, normalize=False, precompute='auto', max_iter=10000, copy_X=True, tol=1/10000, warm_start=False, positive=False)
        #LinearRegression(fit_intercept=True, normalize=False, copy_X=True)
        BaggingRegressor(n_estimators=50, max_samples=1.0, max_features=1.0,
                         bootstrap=True, bootstrap_features=False,
                         oob_score=False, n_jobs=1, random_state=None,
                         verbose=0)
        #AdaBoostRegressor(n_estimators=10, learning_rate=1.0, loss='linear', random_state=None)
        #Lasso(alpha=0.0000329034456231),
        #Ridge(),
        #RandomForestRegressor(n_estimators=3000, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, min_density=None, compute_importances=None),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=15, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=10, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=5, n_estimators=15, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=2, n_estimators=15, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=300, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=1000, random_state=166, min_samples_leaf=1),
        #GradientBoostingRegressor(loss='ls', learning_rate=0.05, subsample=0.5, max_depth=10, n_estimators=3000, random_state=166, min_samples_leaf=1),
    ]

    print("Data size: " + str(len(trainBase)) + " " + str(len(test)))

    dataset_blend_train = np.zeros((len(trainBase), len(clfs)))
    dataset_blend_test = np.zeros((len(test), len(clfs)))

    print("Begin Training")

    lenTrainBase = len(trainBase)
    lenTest = len(test)

    gc.collect()

    for ExecutionIndex, clf in enumerate(clfs):
        print(clf)
        avg = 0
        dataset_blend_test_set = np.zeros((lenTest, NumFolds))

        foldCount = 0

        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)
        for train_index, test_index in Folds:
            print()
            print("Iteration: " + str(foldCount))

            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))

            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]
            weight = [trainBaseWeight[i] for i in train_index]

            targetTest = [targetBase[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]
            weightTest = [trainBaseWeight[i] for i in test_index]

            #print "LEN: ", len(train), len(target)

            target = np.array(np.reshape(target, (-1, 1)))
            #train = np.array(np.reshape(train, (-1, 1)))
            weight = np.array(np.reshape(weight, (-1, 1)))

            targetTest = np.array(np.reshape(targetTest, (-1, 1)))
            #trainTest = np.array(np.reshape(trainTest, (-1, 1)))
            weightTest = np.array(np.reshape(weightTest, (-1, 1)))

            #clf.fit(train, target, sample_weight=weight)
            clf.fit(train, target)
            predicted = clf.predict(trainTest)
            #print(predicted[:, 0])
            print(predicted)
            dataset_blend_train[test_index, ExecutionIndex] = predicted  #[:,0] needed for Ridge

            #print(targetTest.shape)
            #print(predicted.shape)
            #print(weightTest.shape)

            print(str(score.normalized_weighted_gini(targetTest.ravel(),
                                                     predicted.ravel(),
                                                     weightTest.ravel())))
            avg += score.normalized_weighted_gini(targetTest.ravel(),
                                                  predicted.ravel(),
                                                  weightTest.ravel()) / NumFolds
            #print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted[:,0], weightTest.ravel())))
            #avg += score.normalized_weighted_gini(targetTest.ravel(), predicted[:,0], weightTest.ravel())/NumFolds

            predicted = clf.predict(test)
            dataset_blend_test_set[:, foldCount] = predicted  #[:,0]

            foldCount = foldCount + 1
            #break

        dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1)

        now = datetime.datetime.now()
        #print dataset_blend_test_set.mean(1)
        #csv_io.write_delimited_file_single("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1))
        submission = pd.DataFrame(np.zeros((len(testID), 2)), columns=['id', 'target'])
        submission['target'] = dataset_blend_test[:, ExecutionIndex]
        submission['id'] = testID
        submission.to_csv("../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") +
                          "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
                          index=False)

        #csv_io.write_delimited_file_single("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:, ExecutionIndex])
        submission = pd.DataFrame(np.zeros((len(trainBaseID), 2)), columns=['id', 'target'])
        submission['target'] = dataset_blend_train[:, ExecutionIndex]
        submission['id'] = trainBaseID
        submission.to_csv("../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") +
                          "_" + str(avg) + "_" + str(clf)[:12] + ".csv",
                          index=False)

        csv_io.write_delimited_file("../log/RunLog.csv",
                                    [now.strftime("%Y %m %d %H %M %S"), "AVG.",
                                     str(avg), str(clf), "Folds:",
                                     str(NumFolds), "Model", "", "", ""],
                                    filemode="a", delimiter=",")

        print("------------------------Average: " + str(avg))

    #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train)
    return dataset_blend_train, dataset_blend_test
def baggingRidged(self):
    return BaggingRegressor(self.ridged, n_estimators=100, max_samples=0.2)
def __init__(self, **args):
    """Init model."""
    self.model_lf = BaggingRegressor(**copy.deepcopy(args))
    self.model_hf = BaggingRegressor(**copy.deepcopy(args))
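# Only the constructor is shown above. One common way such a low-/high-fidelity
# pair is used is an additive correction: the high-fidelity model learns the
# residual of the low-fidelity one. This is an assumption about the intended
# protocol, sketched as a self-contained illustration, not the original
# class's documented behavior.
import copy
import numpy as np
from sklearn.ensemble import BaggingRegressor

class TwoFidelityBagging:
    """Sketch: additive low-/high-fidelity correction (assumed protocol)."""

    def __init__(self, **args):
        self.model_lf = BaggingRegressor(**copy.deepcopy(args))
        self.model_hf = BaggingRegressor(**copy.deepcopy(args))

    def fit(self, X_lf, y_lf, X_hf, y_hf):
        # the low-fidelity model learns the cheap data; the high-fidelity
        # model learns the discrepancy between expensive data and LF predictions
        self.model_lf.fit(X_lf, y_lf)
        self.model_hf.fit(X_hf, y_hf - self.model_lf.predict(X_hf))
        return self

    def predict(self, X):
        return self.model_lf.predict(X) + self.model_hf.predict(X)

rng = np.random.RandomState(0)
X_lf, X_hf = rng.rand(200, 2), rng.rand(40, 2)
model = TwoFidelityBagging(n_estimators=20).fit(
    X_lf, X_lf.sum(axis=1), X_hf, X_hf.sum(axis=1) + 0.1)
print(model.predict(rng.rand(3, 2)))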
import numpy as np
from sklearn.datasets import load_boston
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn import metrics

# Step 1: Loading data
X, y = load_boston(return_X_y=True)

# Step 2: Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=40)

# Step 3: Training -- BaggingRegressor
regression = BaggingRegressor(random_state=40)
# note: the original had a stray trailing comma after this dict, which turned
# param_grid into a one-element tuple
param_grid = {
    'base_estimator': [DecisionTreeRegressor(criterion='mse', splitter='best')],
    'n_estimators': [x for x in np.arange(10, 101, 30)],
    'max_samples': [0.3, 0.7, 1.0],
    'max_features': [3, 6, 9, 13],
    'bootstrap_features': [True, False]
}
search = GridSearchCV(estimator=regression, param_grid=param_grid, cv=5,
                      refit=True, verbose=1, n_jobs=-1)
search.fit(X_train, y_train)
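# With refit=True the search object itself predicts with the best ensemble
# found; a natural follow-up (an assumption, not part of the original snippet)
# is to inspect the result:
print(search.best_params_)
print(search.best_score_)
print(metrics.mean_squared_error(y_test, search.predict(X_test)))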
def test_sparse_regression():
    # Check regression for various parameter settings on sparse input.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set"""

        def fit(self, X, y):
            super(CustomSVR, self).fit(X, y)
            self.data_type_ = type(X)
            return self

    parameter_sets = [
        {"max_samples": 0.5,
         "max_features": 2,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_samples": 1.0,
         "max_features": 4,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_features": 2,
         "bootstrap": False,
         "bootstrap_features": True},
        {"max_samples": 0.5,
         "bootstrap": True,
         "bootstrap_features": False},
    ]

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:
            # Trained on sparse format
            sparse_classifier = BaggingRegressor(base_estimator=CustomSVR(),
                                                 random_state=1,
                                                 **params).fit(X_train_sparse,
                                                               y_train)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_results = BaggingRegressor(base_estimator=CustomSVR(),
                                             random_state=1,
                                             **params).fit(X_train,
                                                           y_train).predict(X_test)

            sparse_type = type(X_train_sparse)
            types = [i.data_type_ for i in sparse_classifier.estimators_]

            assert_array_equal(sparse_results, dense_results)
            assert all([t == sparse_type for t in types])
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import operator
import copy

import datahelper  # local helper module providing get_xy, used below

x, y = datahelper.get_xy('data/', num_hours=3, error_minutes=15)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

scores = {}
models = []
for n in range(2, 20):
    estimator = BaggingRegressor(
        base_estimator=DecisionTreeRegressor(max_depth=4),
        max_samples=0.5,
        n_estimators=n)
    estimator.fit(x_train, y_train)
    scores[n] = estimator.score(x_test, y_test)
    models.append(copy.copy(estimator))

sorted_by_scores = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)

print('Results of 5 best # of estimators:\n')
for i in range(0, 5):
    n, score = sorted_by_scores[i]
    print("№ estimators = ", n)
    y_predicted = models[n - 2].predict(x_test)
    print('R^2 = ' + str(r2_score(y_test, y_predicted)))
    # the square root of the MSE is the RMSE (the original labeled it "MSE")
    print('RMSE = ' + str(np.sqrt(mean_squared_error(y_test, y_predicted))))
y_pred = random_forest.predict(X_test)

from sklearn.model_selection import cross_val_score
clf = RandomForestRegressor()
scores = cross_val_score(clf, X_test, y_test, cv=5)
scores.mean()

# mean absolute error in $ (the original named this variable "mse")
mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is:$", mae)

# checking r^2
from sklearn.metrics import r2_score
print("r_Score:", r2_score(y_test, y_pred))

bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
bg.score(X_train, y_train)
bg.score(X_test, y_test)

# Adaboosting
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

# Decision tree
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)
    hour_list.append(date.hour)

rental_data["month"] = np.array(month_list)
rental_data["day"] = np.array(day_list)
rental_data["hour"] = np.array(hour_list)

del rental_data["datetime"]

rental_data = rental_data.iloc[np.random.permutation(len(rental_data))]
rental_counts = rental_data["count"].values

train_data, test_data, train_counts, test_counts = cross_validation.train_test_split(
    rental_data.values, rental_counts, test_size=0.2)

rf = RandomForestRegressor(n_estimators=101)
ada = AdaBoostRegressor(n_estimators=101)
grad = GradientBoostingRegressor(n_estimators=101)
bagging = BaggingRegressor(n_estimators=101)
svr = SVR()

regressors = [rf, ada, grad, bagging, svr]
regressor_names = ["Random Forests", "Adaboost Regressor",
                   "Gradient Boost Regressor", "Bagging Regressor",
                   "Support Vector Regressor"]

for regressor, regressor_name in zip(regressors, regressor_names):
    regressor.fit(train_data, train_counts)
    predicted_counts = regressor.predict(test_data)

    print "-----------------------------------------\n"
    print "Mean Absolute Error for ", regressor_name, " : ", metrics.mean_absolute_error(test_counts, predicted_counts)
    print "Median Absolute Error for ", regressor_name, " : ", metrics.median_absolute_error(test_counts, predicted_counts)
    print "Mean Squared Error for ", regressor_name, " : ", metrics.mean_squared_error(test_counts, predicted_counts)
    print "R2 Score for ", regressor_name, " : ", metrics.r2_score(test_counts, predicted_counts)
def bagging():
    data = pd.read_csv('MR_meanencoding.csv', index_col=0)
    data = data.drop("PNO", axis=1)
    print(1)
    data = data[data['AGE'] <= 120]
    data = data[data['AGE'] >= 1]
    print(2)
    data = data[data['total'] < 90 * 60]
    data = data[data['total'] > 60]
    print(3)

    # standardization
    # min = data['AGE'].min()
    # max = data['AGE'].max()
    # data['AGE'] = (data['AGE'] - min) / (max - min)

    # mlist = pd.unique(data['PLACE_n'])
    # print(mlist)
    # machine 5 has no data
    for m in range(1):
        pno = {405108: 50, 405568: 23, 405984: 11, 406750: 83}
        # pno = {405108: 54, 405568: 23, 405984: 12, 406750: 89}
        temp = data.copy()
        print(temp.columns)
        X_train, X_test, y_train, y_test = train_test_split(
            temp.drop('total', axis=1), temp['total'],
            test_size=0.3, random_state=42)

        def find_alpha(X_train, y_train):
            reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
            reg.fit(X_train, y_train)
            print(m, ':', reg.alpha_)

        # find_alpha(X_train, y_train)
        alpha = {405108: 100, 405568: 10, 405984: 100, 406750: 10}

        bagging = BaggingRegressor(
            base_estimator=linear_model.LinearRegression(),
            max_samples=.1, max_features=1)
        bagging.fit(X_train, y_train)
        # print(m)
        print('bagging_linear:', mape(y_test, bagging.predict(X_test)))
        # print('bagging_linear:', mape(y_test, bagging.predict(X_test)), r2_score(y_test, bagging.predict(X_test)))
        # print(shoot(y_test, bagging.predict(X_test)))

        bagging = BaggingRegressor(base_estimator=linear_model.Ridge(alpha=.5),
                                   max_samples=.1, max_features=1)
        bagging.fit(X_train, y_train)
        print('bagging_Ridge:', mape(y_test, bagging.predict(X_test)))
        # print('bagging_Ridge:', mape(y_test, bagging.predict(X_test)), r2_score(y_test, bagging.predict(X_test)))
        # print(shoot(y_test, bagging.predict(X_test)))

        bagging = BaggingRegressor(base_estimator=linear_model.Lasso(alpha=0.5),
                                   max_samples=.1, max_features=1)
        bagging.fit(X_train, y_train)
        print('bagging_Lasso:', mape(y_test, bagging.predict(X_test)))
        # print('bagging_Lasso:', mape(y_test, bagging.predict(X_test)), r2_score(y_test, bagging.predict(X_test)))
        # print(shoot(y_test, bagging.predict(X_test)))

        bagging = BaggingRegressor(max_samples=.1, max_features=1)
        bagging.fit(X_train, y_train)
        print('bagging_decision:', mape(y_test, bagging.predict(X_test)))
        # print('bagging_decision:', mape(y_test, bagging.predict(X_test)), r2_score(y_test, bagging.predict(X_test)))
        # print(shoot(y_test, bagging.predict(X_test)))

        reg = linear_model.LinearRegression()
        reg.fit(X_train, y_train)
        print('linear', mape(y_test, reg.predict(X_test)))
        # print('linear', mape(y_test, reg.predict(X_test)), r2_score(y_test, bagging.predict(X_test)))
        # print(shoot(y_test, reg.predict(X_test)))

        reg = linear_model.Ridge(alpha=.5)
        reg.fit(X_train, y_train)
        print('Ridge', mape(y_test, reg.predict(X_test)))
        # print('Ridge', mape(y_test, reg.predict(X_test)), r2_score(y_test, bagging.predict(X_test)))
        # print(shoot(y_test, reg.predict(X_test)))

        reg = linear_model.Lasso(alpha=0.5)
        reg.fit(X_train, y_train)
        print('Lasso', mape(y_test, reg.predict(X_test)))
        # print('Lasso', mape(y_test, reg.predict(X_test)), r2_score(y_test, bagging.predict(X_test)))
        # print(shoot(y_test, reg.predict(X_test)))

        # break  # only run one machine


def result():
    pass  # bagging default
def fit(self, X, y, sample_weight=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples,)
        Target vector relative to X. Has to follow the convention 0 for
        normal data, 1 for anomalies.

    sample_weight : array-like, shape (n_samples,) optional
        Array of weights that are assigned to individual samples, typically
        the amount in case of transactions data. Used to grow regression
        trees producing further rules to be tested. If not provided, then
        each sample is given unit weight.

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_X_y(X, y)
    check_classification_targets(y)
    self.n_features_ = X.shape[1]

    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)

    if n_classes < 2:
        raise ValueError("This method needs samples of at least 2 classes"
                         " in the data, but the data contains only one"
                         " class: %r" % self.classes_[0])

    if not isinstance(self.max_depth_duplication, int) \
            and self.max_depth_duplication is not None:
        raise ValueError("max_depth_duplication should be an integer")

    if not set(self.classes_) == set([0, 1]):
        warn("Found labels %s. This method assumes target class to be"
             " labeled as 1 and normal data to be labeled as 0. Any label"
             " different from 0 will be considered as being from the"
             " target class." % set(self.classes_))
        y = (y > 0)

    # ensure that max_samples is in [1, n_samples]:
    n_samples = X.shape[0]

    if isinstance(self.max_samples, six.string_types):
        raise ValueError('max_samples (%s) is not supported.'
                         'Valid choices are: "auto", int or'
                         'float' % self.max_samples)
    elif isinstance(self.max_samples, INTEGER_TYPES):
        if self.max_samples > n_samples:
            warn("max_samples (%s) is greater than the "
                 "total number of samples (%s). max_samples "
                 "will be set to n_samples for estimation."
                 % (self.max_samples, n_samples))
            max_samples = n_samples
        else:
            max_samples = self.max_samples
    else:  # float
        if not (0. < self.max_samples <= 1.):
            raise ValueError("max_samples must be in (0, 1], got %r"
                             % self.max_samples)
        max_samples = int(self.max_samples * X.shape[0])

    self.max_samples_ = max_samples

    self.rules_ = {}
    self.estimators_ = []
    self.estimators_samples_ = []
    self.estimators_features_ = []

    # default columns names:
    feature_names_ = [BASE_FEATURE_NAME + x for x in
                      np.arange(X.shape[1]).astype(str)]
    if self.feature_names is not None:
        self.feature_dict_ = {BASE_FEATURE_NAME + str(i): feat
                              for i, feat in enumerate(self.feature_names)}
    else:
        self.feature_dict_ = {BASE_FEATURE_NAME + str(i): feat
                              for i, feat in enumerate(feature_names_)}
    self.feature_names_ = feature_names_

    clfs = []
    regs = []

    self._max_depths = self.max_depth \
        if isinstance(self.max_depth, Iterable) else [self.max_depth]

    for max_depth in self._max_depths:
        bagging_clf = BaggingClassifier(
            base_estimator=DecisionTreeClassifier(
                max_depth=max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split),
            n_estimators=self.n_estimators,
            max_samples=self.max_samples_,
            max_features=self.max_samples_features,
            bootstrap=self.bootstrap,
            bootstrap_features=self.bootstrap_features,
            # oob_score=... XXX may be added
            # if selection on tree perf needed.
            # warm_start=... XXX may be added to increase computation perf.
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose)

        bagging_reg = BaggingRegressor(
            base_estimator=DecisionTreeRegressor(
                max_depth=max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split),
            n_estimators=self.n_estimators,
            max_samples=self.max_samples_,
            max_features=self.max_samples_features,
            bootstrap=self.bootstrap,
            bootstrap_features=self.bootstrap_features,
            # oob_score=... XXX may be added
            # if selection on tree perf needed.
            # warm_start=... XXX may be added to increase computation perf.
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose)

        clfs.append(bagging_clf)
        regs.append(bagging_reg)

    # define regression target:
    if sample_weight is not None:
        # (the original nested this identical check twice)
        sample_weight = check_array(sample_weight, ensure_2d=False)
        weights = sample_weight - sample_weight.min()
        contamination = float(sum(y)) / len(y)
        y_reg = (
            pow(weights, 0.5) * 0.5 / contamination * (y > 0) -
            pow((weights).mean(), 0.5) * (y == 0))
        y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
    else:
        y_reg = y  # same as any other classification bagging

    for clf in clfs:
        clf.fit(X, y)
        self.estimators_ += clf.estimators_
        self.estimators_samples_ += clf.estimators_samples_
        self.estimators_features_ += clf.estimators_features_

    for reg in regs:
        reg.fit(X, y_reg)
        self.estimators_ += reg.estimators_
        self.estimators_samples_ += reg.estimators_samples_
        self.estimators_features_ += reg.estimators_features_

    rules_ = []
    for estimator, samples, features in zip(self.estimators_,
                                            self.estimators_samples_,
                                            self.estimators_features_):
        # Create mask for OOB samples
        mask = ~samples
        if sum(mask) == 0:
            warn("OOB evaluation not possible: doing it in-bag."
                 " Performance evaluation is likely to be wrong"
                 " (overfitting) and selected rules are likely to"
                 " not perform well! Please use max_samples < 1.")
            mask = samples

        rules_from_tree = self._tree_to_rules(
            estimator, np.array(self.feature_names_)[features])

        # XXX todo: idem without dataframe
        X_oob = pandas.DataFrame(
            (X[mask, :])[:, features],
            columns=np.array(self.feature_names_)[features])

        if X_oob.shape[1] > 1:  # otherwise pandas bug (cf. issue #16363)
            y_oob = y[mask]
            y_oob = np.array((y_oob != 0))

            # Add OOB performances to rules:
            rules_from_tree = [(r, self._eval_rule_perf(r, X_oob, y_oob))
                               for r in set(rules_from_tree)]
            rules_ += rules_from_tree

    # Factorize rules before semantic tree filtering
    rules_ = [tuple(rule)
              for rule in [Rule(r, args=args) for r, args in rules_]]

    # keep only rules verifying precision_min and recall_min:
    for rule, score in rules_:
        if score[0] >= self.precision_min and score[1] >= self.recall_min:
            if rule in self.rules_:
                # update the score to the new mean
                c = self.rules_[rule][2] + 1
                b = self.rules_[rule][1] + 1. / c * (score[1] - self.rules_[rule][1])
                a = self.rules_[rule][0] + 1. / c * (score[0] - self.rules_[rule][0])
                self.rules_[rule] = (a, b, c)
            else:
                self.rules_[rule] = (score[0], score[1], 1)

    self.rules_ = sorted(self.rules_.items(),
                         key=lambda x: (x[1][0], x[1][1]), reverse=True)

    # Deduplicate the rules using the semantic tree
    if self.max_depth_duplication is not None:
        self.rules_ = self.deduplicate(self.rules_)

    self.rules_ = sorted(self.rules_, key=lambda x: -self.f1_score(x))
    self.rules_without_feature_names_ = self.rules_

    # Replace generic feature names by real feature names
    self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                   for rule, perf in self.rules_]

    return self
class UserQualityRegressor(BaseEstimator):
    def __init__(self):
        pass

    # Trains the ML model using the interaction & label data of all users
    # who have at least 25 labels validated
    def fit(self, label_correctness_file, point_labels_file, users_file,
            user_quality_features_file='all_users.csv'):
        users = pd.read_csv(users_file)
        users = users.set_index('user_id')
        y_train = users['accuracy']
        users_for_training = users[users['labels_validated'] > 25].index
        self.label_correctness = extract_label_features(
            point_labels_file, label_correctness_file)

        # Splits the users into training & testing groups
        user_quality_features = pd.read_csv(
            user_quality_features_file).set_index('user_id')
        half = int(len(users_for_training) / 2)
        users_labels_train = users_for_training[:half]
        users_labels_test = users_for_training[half:]
        # mask = np.random.permutation(np.arange(len(users_for_training)))
        # users_labels_train = users_for_training[mask[:int(proportion_labels * len(mask))]]
        # users_labels_test = users_for_training[mask[int(proportion_labels * len(mask)):]]

        train_labels = self.label_correctness.copy()
        train_labels = train_labels[~pd.isna(train_labels['correct'])]
        train_labels = train_labels[~(pd.isna(train_labels[features]).any(axis=1))]

        # en = OrdinalEncoder()
        # en.fit(pd.concat((train_labels[['CLASS_DESC']], test_labels[['CLASS_DESC']])))
        # train_labels[['CLASS_DESC']] = en.transform(train_labels[['CLASS_DESC']])

        self.rfe_labels = RFECV(
            estimator=RandomForestClassifier(n_estimators=10),
            step=1,
            cv=StratifiedKFold(5),
            scoring='precision')
        self.clf_labels = RandomForestClassifier(random_state=0, n_jobs=-1,
                                                 n_estimators=30)
        self.clf_accuracy = BaggingRegressor(random_state=0, n_jobs=-1,
                                             n_estimators=30)
        self.rfe_accuracy = RFECV(
            estimator=RandomForestClassifier(n_estimators=10),
            step=1,
            cv=StratifiedKFold(5),
            scoring='f1')

        print('Training label classifier...')
        self.rfe_labels.fit(
            train_labels[train_labels['user_id'].isin(users_labels_train)]
            [features].values,
            train_labels[train_labels['user_id'].isin(
                users_labels_train)]['correct'].astype(int))
        self.clf_labels.fit(
            train_labels[train_labels['user_id'].isin(users_labels_train)]
            [features].values[:, self.rfe_labels.support_],
            train_labels[train_labels['user_id'].isin(
                users_labels_train)]['correct'].astype(int))

        train_labels = train_labels.join(pd.Series(
            data=self.clf_labels.predict_proba(
                train_labels[train_labels['user_id'].isin(users_labels_test)]
                [features].values[:, self.rfe_labels.support_])[:, 1],
            index=train_labels[train_labels['user_id'].isin(
                users_labels_test)].index).rename('prob'),
            how='outer')

        prob_hist_predictions = pd.DataFrame(
            train_labels[train_labels['user_id'].isin(users_labels_test)]
            .groupby('user_id').apply(
                lambda x: prob_hist(x['prob'].values)).rename('prob'))
        prob_hist_predictions = prob_hist_predictions.join(user_quality_features)

        print('Training accuracy classifier...')
        self.rfe_accuracy.fit(
            np.concatenate((dearray(prob_hist_predictions['prob']),
                            prob_hist_predictions.drop(columns='prob').values),
                           axis=1),
            y_train.loc[prob_hist_predictions.index] > 65)
        self.clf_accuracy.fit(
            np.concatenate((dearray(prob_hist_predictions['prob']),
                            prob_hist_predictions.drop(columns='prob').values),
                           axis=1)[:, self.rfe_accuracy.support_],
            y_train.loc[prob_hist_predictions.index])

    # Creates the prediction of the given user's accuracy based off of their
    # passed-in label & interaction data
    def __predict_accuracy(self, probs, user_features):
        return self.clf_accuracy.predict([
            np.concatenate(
                (prob_hist(probs), user_features))[self.rfe_accuracy.support_]
        ])[0]

    # Takes in all the names of the files and creates a prediction of the
    # given user's accuracy
    def predict_one_user(self, filename, label_correctness_file,
                         point_labels_file, panos_file, user_name):
        user_features, user_features_header, user_id = extract_user_features(
            filename, panos_file, user_name)
        label_correctness = extract_label_features(point_labels_file,
                                                   label_correctness_file)
        label_correctness = label_correctness[~(
            pd.isna(label_correctness[features]).any(axis=1))]
        label_correctness = label_correctness[label_correctness['user_id'] == user_id]
        probs = self.clf_labels.predict(
            label_correctness[features].values[:, self.rfe_labels.support_])
        return self.__predict_accuracy(probs, user_features)

    def save(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.__dict__, f)

    @classmethod
    def load(cls, filename):
        # the original signature took only `filename`, which under
        # @classmethod actually received the class object; open the file
        # before unpickling
        with open(filename, 'rb') as f:
            return pickle.load(f)
if __name__ == "__main__":
    np.random.seed(0)
    N = 200
    x = np.random.rand(N) * 10 - 5  # [-5, 5)
    x = np.sort(x)
    y = f(x) + 0.05 * np.random.randn(N)
    x.shape = -1, 1

    degree = 6
    n_estimators = 50
    max_samples = 0.5
    ridge = RidgeCV(alphas=np.logspace(-3, 2, 20), fit_intercept=False)
    ridged = Pipeline([('poly', PolynomialFeatures(degree=degree)),
                       ('Ridge', ridge)])
    bagging_ridged = BaggingRegressor(ridged, n_estimators=n_estimators,
                                      max_samples=max_samples)
    dtr = DecisionTreeRegressor(max_depth=9)
    regs = [('DecisionTree', dtr),
            ('Ridge(%d Degree)' % degree, ridged),
            ('Bagging Ridge(%d Degree)' % degree, bagging_ridged),
            ('Bagging DecisionTree', BaggingRegressor(
                dtr, n_estimators=n_estimators, max_samples=max_samples))]
    x_test = np.linspace(1.1 * x.min(), 1.1 * x.max(), 1000)

    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(8, 6), facecolor='w')
    plt.plot(x, y, 'ro', mec='k', label='training data')
    plt.plot(x_test, f(x_test), color='k', lw=3, ls='-', label='ground truth')
    clrs = '#FF2020', 'm', 'y', 'g'
def baggingRegressor(self):
    if self.dtr is None:
        self.decisionTreeRegressor()
    return BaggingRegressor(self.dtr, n_estimators=100, max_samples=0.2)
    return(out)


log = LogisticRegression(solver='sag')
lm = LinearRegression()

write_function(test['y'], '/tmp/truths.txt')

print('optimizing samples')
for n_samp in [0.1, 0.25, 0.33, 0.5, 0.75, 1.0]:
    for n_feat in [0.1, 0.25, 0.33, 0.5, 0.75, 1.0]:
        lm_bagged = BaggingRegressor(
            base_estimator=lm,
            n_estimators=75,
            max_samples=n_samp,
            max_features=n_feat,
            bootstrap=True,
            oob_score=False,
            warm_start=False,
            n_jobs=-1
        )
        log_bagged = BaggingClassifier(
            base_estimator=log,
            n_estimators=75,
            max_samples=n_samp,
            max_features=n_feat,
            bootstrap=True,
            oob_score=False,
            warm_start=False,
            n_jobs=-1
        )
# Split the data into training : cross-validation = 7 : 3
split_train, split_cv = model_selection.train_test_split(df, test_size=0.3,
                                                         random_state=0)
train_df = split_train.filter(
    regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
)

# fit with a BaggingRegressor
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
clf = BaggingRegressor(clf,
                       n_estimators=20,
                       max_samples=0.8,
                       max_features=1.0,
                       bootstrap=True,
                       bootstrap_features=False,
                       n_jobs=-1)
clf.fit(X, y)

cv_df = split_cv.filter(
    regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
)
predictions = clf.predict(cv_df.as_matrix()[:, 1:])

# bad_cases = data_train.loc[data_train['PassengerId'].isin(split_cv[predictions != cv_df.as_matrix()[:,0]]['PassengerId'].values)]

data_test = pd.read_csv("test.csv")

# mean
# (2) Multi-model (model fusion): fuse logistic regression with a Bagging strategy
from sklearn.ensemble import BaggingRegressor

train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
train_np = train_df.as_matrix()

# y is the Survived outcome
y = train_np[:, 0]

# X is the feature matrix
X = train_np[:, 1:]

# fit with a BaggingRegressor
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_clf = BaggingRegressor(clf, n_estimators=20, max_samples=0.8,
                               max_features=1.0, bootstrap=True,
                               bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y)

test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
predictions = bagging_clf.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].as_matrix(),
                       'Survived': predictions.astype(np.int32)})
result.to_csv("/Users/HanXiaoyang/Titanic_data/logistic_regression_bagging_predictions.csv", index=False)

# ------------------------------------------------------------------------------

"""7. Utilities"""
"""Train an xgb model to rank features by importance for feature selection"""
df_y = df_Master['target']
        if target == 'lat':
            y = y[:, 0]
        elif target == 'lon':
            y = y[:, 1]
        # print(y)
        if phase == 'train':
            model.fit(X, y)
        y_pred = model.predict(X)
        total_loss = calc_loss(y, y_pred)
        print("{} {} loss is: {:.5f}".format(phase, target, total_loss))

for target in ['lat', 'lon']:
    # print(target)
    model = BaggingRegressor(n_estimators=1, max_features=0.2)
    for phase in ['train', 'val']:
        running_loss = 0.0
        dl = dataloader[phase]
        X = dl[0]
        y = dl[1]
        if target == 'lat':
            y = y[:, 0]
        elif target == 'lon':
            y = y[:, 1]
        # print(y)
        if phase == 'train':
            model.fit(X, y)
        y_pred = model.predict(X)
        total_loss = calc_loss(y, y_pred)
def test_parallel():
    """Check parallel computations."""
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        # predict_proba
        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict_proba(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y3)

        # decision_function
        ensemble = BaggingClassifier(SVC(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        decisions1 = ensemble.decision_function(X_test)
        ensemble.set_params(n_jobs=2)
        decisions2 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions2)

        ensemble = BaggingClassifier(SVC(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        decisions3 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions3)

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        # use the loop's n_jobs here (the original hardcoded n_jobs=3)
        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=n_jobs,
                                    random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=1,
                                    random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y3)
["xgboost_Dummies", ""], ["xgboost_Label", ""], ["xgboost_Vect", ""], ] full_predictions = [] for alg, predictors in algorithms: if alg == "xgboost_Label": full_predictions.append(xgboost_Label(train, test, labels)) elif alg == "xgboost_Vect": full_predictions.append(xgboost_Vect(train, test, labels)) elif alg == "xgboost_Dummies": full_predictions.append(xgboost_Dummies(train, test, labels)) else: if predictors == "dummies": print ("Train ", alg.__class__.__name__, " dummies Model ") alg = BaggingRegressor(alg) alg.fit(train_du, labels) print "Prediction :", alg.__class__.__name__, " dummies Model " prediction = alg.predict(test_du) full_predictions.append(prediction) else: print ("Train ", alg.__class__.__name__, " Label Model ") alg = BaggingRegressor(alg) alg.fit(train_rf, labels) print "Prediction :", alg.__class__.__name__, " Label Model " prediction = alg.predict(test_rf) full_predictions.append(prediction) # Ensemble models RF_label_pred = full_predictions[0] RF_dummies_pred = full_predictions[1]
mlp = MLPRegressor()
mlpFit = mlp.fit(x_train, y_train)

regr = AdaBoostRegressor(random_state=0, n_estimators=100)
regrFit = regr.fit(x_train, y_train)

clfRidge = Ridge(alpha=1.0)
clfRidgeFit = clfRidge.fit(x_train, y_train)

clfBayesian = linear_model.BayesianRidge()
clfBayesianFit = clfBayesian.fit(x_train, y_train)

reg = linear_model.LassoLars(alpha=0.01)
regFit = reg.fit(x_train, y_train)

bag = BaggingRegressor()
bagFit = bag.fit(x_train, y_train)

DT_MAD = mean_absolute_error(y_test, DT_regressionFit.predict(x_test))
SVR_MAD = mean_absolute_error(y_test, svr_regressionFit.predict(x_test))
KNN_MAD = mean_absolute_error(y_test, neighFit.predict(x_test))
MLP_MAD = mean_absolute_error(y_test, mlpFit.predict(x_test))
regr_MAD = mean_absolute_error(y_test, regrFit.predict(x_test))  # score the AdaBoost fit, not the MLP twice
clfRidge_MAD = mean_absolute_error(y_test, clfRidgeFit.predict(x_test))
clfBayesian_MAD = mean_absolute_error(y_test, clfBayesianFit.predict(x_test))
reg_MAD = mean_absolute_error(y_test, regFit.predict(x_test))
bag_MAD = mean_absolute_error(y_test, bagFit.predict(x_test))

print('Regression Tree MAD: ' + str(DT_MAD))
print('Support Vector Regression MAD ' + str(SVR_MAD))
print('KNN MAD ' + str(KNN_MAD))
import numpy as np
import pickle
from sklearn import preprocessing
from sklearn.model_selection import KFold  # sklearn.cross_validation is deprecated
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt
from sklearn import linear_model
# from sklearn import svm
from sklearn.decomposition import PCA

model = BaggingRegressor(DecisionTreeRegressor(max_depth=20,
                                               min_samples_split=20,
                                               min_samples_leaf=1),
                         n_estimators=50)

(X, y) = pickle.load(open('train.pickle', 'rb'))  # pickles must be opened in binary mode
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

# Keep only the top-5 features of a precomputed importance ordering
index = [15, 17, 18, 7, 0, 1, 2, 22, 23, 24, 9, 10, 4, 8, 5, 13,
         14, 19, 20, 21, 16, 11, 12, 3, 25, 26, 6]
X = X[:, index[:5]]

# X = np.random.rand(X.shape[0], 1)
# scaler = preprocessing.StandardScaler().fit(X)
# X = scaler.transform(X)
# X = X[:, range(12) + range(13, X.shape[1])]
# pca = PCA(n_components=0.99)
# pca.fit(X)
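# This excerpt ends before any training happens; a minimal sketch of how the
# imported KFold and mean_squared_error could evaluate the bagged trees on
# the selected features (an assumption, not code from the source):
kf = KFold(n_splits=5, shuffle=True, random_state=0)
rmses = []
for train_idx, test_idx in kf.split(X):
    model.fit(X[train_idx], y[train_idx])
    pred = model.predict(X[test_idx])
    rmses.append(mean_squared_error(y[test_idx], pred) ** 0.5)
print('mean 5-fold RMSE:', np.mean(rmses))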
def process(i):
    current_dat = header.iloc[i]
    current_dat_name = current_dat['bench.id']

    # Define datasets
    # print(str(i) + ': ' + current_dat_name)
    filename = ('/Users/apple/Documents/AD_Datasets/' + dataname +
                '/benchmarks/' + current_dat_name + '.csv')
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile)
        data = list(reader)
        data = np.array(data)

    X_train = data[1:, 6:].astype('double')
    anomaly_type = data[1:, 5]
    y_label = np.zeros(len(anomaly_type))
    # normal_ind = np.where(anomaly_type == 'nominal')[0]
    anomaly_ind = np.where(anomaly_type == 'anomaly')[0]
    y_label[anomaly_ind] = 1
    # X_normal = X_train[normal_ind, :]
    # X_outlier = X_train[anomaly_ind, :]
    # contamination = len(anomaly_ind) / len(y_label)

    rng = np.random.RandomState(42)

    # BaggedDTM
    ###########################################################################
    # max_samples = min(2048, X_train.shape[0])
    # y = np.random.uniform(size=X_train.shape[0])
    # bag_neigh = max(10, int(np.floor(0.03 * max_samples)))
    # clf_bagDTM = BaggingRegressor(base_estimator=DTM(n_neighbors=bag_neigh, contamination=0.1),
    #                               n_estimators=100, max_samples=max_samples,
    #                               bootstrap=False, random_state=rng)
    # y_score_BDTM = clf_bagDTM.fit(X_train, y).predict(X_train)
    # # fpr_DTM, tpr_DTM, thresholds_DTM = roc_curve(y_label, -DTM_score)
    # auc_BDTM_score = roc_auc_score(y_label, -y_score_BDTM)
    # ap_BDTM_score = average_precision_score(y_label, -y_score_BDTM)

    # sp
    ###########################################################################
    max_samples = min(20, X_train.shape[0])
    y = np.random.uniform(size=X_train.shape[0])  # dummy targets: only the bagged subsampling of DTM scores matters
    bag_neigh = 1
    clf_spDTM = BaggingRegressor(base_estimator=DTM(n_neighbors=bag_neigh, contamination=0.1),
                                 n_estimators=1, max_samples=max_samples,
                                 bootstrap=False, random_state=rng)
    y_score_spDTM = clf_spDTM.fit(X_train, y).predict(X_train)
    auc_spDTM_score = roc_auc_score(y_label, -y_score_spDTM)
    ap_spDTM_score = average_precision_score(y_label, -y_score_spDTM)

    # aNNE
    ###########################################################################
    clf_aNNE = BaggingRegressor(base_estimator=DTM(n_neighbors=bag_neigh, contamination=0.1),
                                n_estimators=100, max_samples=max_samples,
                                bootstrap=False, random_state=rng)
    y_score_aNNE = clf_aNNE.fit(X_train, y).predict(X_train)
    auc_aNNE_score = roc_auc_score(y_label, -y_score_aNNE)
    ap_aNNE_score = average_precision_score(y_label, -y_score_aNNE)

    return [auc_spDTM_score, auc_aNNE_score], [ap_spDTM_score, ap_aNNE_score]
# midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) +
#             (test_scores_mean[-1] - test_scores_std[-1])) / 2
# diff = ((train_scores_mean[-1] + train_scores_std[-1]) -
#         (test_scores_mean[-1] - test_scores_std[-1]))
# return midpoint, diff
#
# plot_learning_curve(model, u"learning curve", X, Y)

# 6 ................... Model ensembling ................... #
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.values  # .as_matrix() is deprecated; use .values

# y is the Survived outcome
y = train_np[:, 0]
x = train_np[:, 1:]

# Fit a BaggingRegressor around logistic regression
model = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_model = BaggingRegressor(model, n_estimators=20, max_samples=0.8, max_features=1.0,
                                 bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_model.fit(x, y)

test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = bagging_model.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv('./result.csv', index=False)
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# To set the number of jobs to the number of cores, use n_jobs=-1
model = MultiOutputRegressor(GradientBoostingRegressor(), n_jobs=1).fit(train_scaled[0:-1, :], train_scaled[1:, :])
models.append(model)
modeldims.append(2)
modelnames.append('GradientBoostingRegressor')

model = MultiOutputRegressor(AdaBoostRegressor(), n_jobs=1).fit(train_scaled[0:-1, :], train_scaled[1:, :])
models.append(model)
modeldims.append(2)
modelnames.append('AdaBoostRegressor')

model = MultiOutputRegressor(BaggingRegressor(), n_jobs=1).fit(train_scaled[0:-1, :], train_scaled[1:, :])
models.append(model)
modeldims.append(2)
modelnames.append('BaggingRegressor')

model = MultiOutputRegressor(ExtraTreesRegressor(), n_jobs=1).fit(train_scaled[0:-1, :], train_scaled[1:, :])
models.append(model)
modeldims.append(2)
modelnames.append('ExtraTreesRegressor')

model = MultiOutputRegressor(RandomForestRegressor(), n_jobs=1).fit(train_scaled[0:-1, :], train_scaled[1:, :])
models.append(model)
modeldims.append(2)
modelnames.append('RandomForestRegressor')

model = MultiOutputRegressor(SVR(), n_jobs=1).fit(train_scaled[0:-1, :], train_scaled[1:, :])
              'r-', tz=None, xdate=True, ydate=False)
ax2.set_title('Error between actual and predicted loads')
ax2.set_ylabel("Error, MW")

featImportances = gradBoost.feature_importances_
pos = np.arange(len(features))
pairs = zip(features, featImportances)
sorted_pairs = sorted(pairs, key=lambda pair: pair[1])
features_sorted, featImportances_sorted = zip(*sorted_pairs)

fig, ax = plt.subplots()
plt.barh(pos, featImportances_sorted, 1, color="blue")
plt.yticks(pos, features_sorted)
ax.set_title('Gradient Boosting: Relative Feature Importance')

# Tree bagging
TreeBagger = BaggingRegressor()
TreeBagger.fit(Xtrain, Ytrain)

fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1)
ax1.plot_date(dates, modeldata.Load[45000:50000],
              'r-', tz=None, xdate=True, ydate=False, label='Actual Load')
ax1.set_title('Tree Bagging: Actual and Predicted Loads')
plt.plot(dates, TreeBagger.predict(Xtest), 'g-', label='Predicted Load')
ax1.legend()

ax2 = fig.add_subplot(2, 1, 2)
ax2.plot_date(dates, modeldata.Load[45000:50000] - TreeBagger.predict(Xtest),
              'r-', tz=None, xdate=True, ydate=False)
ax2.set_title('Error between actual and predicted loads, MW')

MSEs_Bagging = [mean_squared_error(Ytest, TreeBagger.predict(Xtest)),
                mean_squared_error(Ytrain, TreeBagger.predict(Xtrain))]
def linear_regression_algo(self):
    X = []
    Y = []
    with open('../Data/full_table.csv', 'r') as file:
        for line in csv.reader(file, delimiter=','):
            if len(line) == 13:
                try:
                    zhvi = float(line[5])
                    property_type = line[6]
                    room_type = line[7]
                    accommodates = int(line[8])
                    bathrooms = float(line[9])
                    beds = int(line[10])
                    bed_type = line[11]
                    price = float(line[12])
                    x = {
                        'zhvi': zhvi,
                        'property_type': property_type,
                        'room_type': room_type,
                        'accommodates': accommodates,
                        'bathrooms': bathrooms,
                        'beds': beds,
                        'bed_type': bed_type
                    }
                    y = price
                    X.append(x)
                    Y.append(y)
                except ValueError:
                    # Skip rows with malformed numeric fields
                    pass

    # The DictVectorizer converts data from a dictionary to an array
    vec = DictVectorizer()
    X = vec.fit_transform(X).toarray()

    # Split X and Y into training and testing sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=43)

    # Linear regression
    model = linear_model.LinearRegression()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    mse = mean_squared_error(Y_test, Y_pred)
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    print('Linear Regression')
    print('Mean Squared Error: {0}'.format(mse))
    print('Mean Absolute Error: {0}'.format(mae))
    print('R2 Score: {0}'.format(r2))

    # With boosting
    model_boost = AdaBoostRegressor(linear_model.LinearRegression())
    model_boost.fit(X_train, Y_train)
    Y_pred = model_boost.predict(X_test)
    mse = mean_squared_error(Y_test, Y_pred)
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    print('Linear Regression (with AdaBoost)')
    print('Mean Squared Error: {0}'.format(mse))
    print('Mean Absolute Error: {0}'.format(mae))
    print('R2 Score: {0}'.format(r2))

    # With bagging
    model_bag = BaggingRegressor(linear_model.LinearRegression())
    model_bag.fit(X_train, Y_train)
    Y_pred = model_bag.predict(X_test)
    mse = mean_squared_error(Y_test, Y_pred)
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    print('Linear Regression (with Bagging)')
    print('Mean Squared Error: {0}'.format(mse))
    print('Mean Absolute Error: {0}'.format(mae))
    print('R2 Score: {0}'.format(r2))
                 figure_width, figure_height, correct_orientation, cmap=cmap)

names = [
    "KNeighborsRegressor",
    "Bagging KNN Ensembler",
    "XGBoost",
    "GradientBoostingRegressor",
    "AdaBoost + KNN",
    "AdaBoost",
    "Random Forest",
    "SVC + RandomForest Pipeline",
    "ExtraTreesRegressor"
]

classifiers = [
    # KNeighborsRegressor(13),
    KNeighborsRegressor(26),
    BaggingRegressor(KNeighborsRegressor(n_neighbors=30)),
    xgb.XGBRegressor(
        max_depth=6,
        n_estimators=55,        # 55
        learning_rate=0.05,
        min_child_weight=60,
        nthread=8,
        subsample=0.95,         # 95
        colsample_bytree=0.95,  # 95
        # subsample=1.00,
        # colsample_bytree=1.00,
        seed=482),
    # xgb.XGBRegressor(n_estimators=300, max_depth=7, min_child_weight=2,
    #                  learning_rate=0.01, subsample=0.80, colsample_bytree=0.70,
    #                  seed=818, reg_alpha=0.1),
    GradientBoostingRegressor(n_estimators=250,
#### 3.4 KNN regression ####
from sklearn import neighbors
model_KNeighborsRegressor = neighbors.KNeighborsRegressor()

#### 3.5 Random forest regression ####
from sklearn import ensemble
model_RandomForestRegressor = ensemble.RandomForestRegressor(
    n_estimators=20)  # use 20 decision trees

#### 3.6 AdaBoost regression ####
from sklearn import ensemble
model_AdaBoostRegressor = ensemble.AdaBoostRegressor(
    n_estimators=50)  # use 50 weak learners

#### 3.7 GBRT regression ####
from sklearn import ensemble
model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(
    learning_rate=0.2, n_estimators=200)  # use 200 decision trees

#### 3.8 Bagging regression ####
from sklearn.ensemble import BaggingRegressor
model_BaggingRegressor = BaggingRegressor()

#### 3.9 ExtraTree (extremely randomized tree) regression ####
from sklearn.tree import ExtraTreeRegressor
model_ExtraTreeRegressor = ExtraTreeRegressor()

########## 4. Invoke the methods above ##########
try_different_method(model_LinearRegression)
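# try_different_method is invoked above but not defined in this excerpt; a
# minimal sketch, assuming train/test splits named x_train, x_test, y_train,
# y_test already exist in scope (those names are assumptions):
def try_different_method(model):
    # Fit on the training split and report the R^2 score on the held-out split
    model.fit(x_train, y_train)
    print('R^2 score: {:.4f}'.format(model.score(x_test, y_test)))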
plt.plot(t, x, 'r-', lw=1, label=u'original data')
plt.plot(abnormal, x[abnormal], 'go', markeredgecolor='g', ms=3, label=u'outliers')
plt.legend(loc='upper right')
plt.title(u'Outlier detection', fontsize=18)
plt.grid(b=True)

# Prediction
plt.subplot(133)
select = np.ones(N, dtype=bool)  # np.bool is deprecated; use the builtin
select[abnormal] = False
t = np.arange(N)
dtr = DecisionTreeRegressor(criterion='mse', max_depth=10)
br = BaggingRegressor(dtr, n_estimators=10, max_samples=0.3)
br.fit(t[select].reshape(-1, 1), x[select])  # train only on the non-outlier samples
y = br.predict(np.arange(N).reshape(-1, 1))
y[select] = x[select]  # keep the original values where no correction is needed
plt.plot(x, 'g--', lw=1, label=u'original values')
plt.plot(y, 'r-', lw=1, label=u'corrected values')
plt.legend(loc='upper right')
plt.title(u'Outlier correction', fontsize=18)
plt.grid(b=True)

plt.tight_layout(1.5, rect=(0, 0, 1, 0.95))
plt.suptitle(u'Outlier detection and correction for pollution-discharge data', fontsize=22)
plt.show()
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

# Settings
n_repeat = 50  # Number of iterations for computing expectations
n_train = 50   # Size of the training set
n_test = 1000  # Size of the test set
noise = 0.1    # Standard deviation of the noise
np.random.seed(0)

# Change this for exploring the bias-variance decomposition of other
# estimators. This should work well for estimators with high variance (e.g.,
# decision trees or KNN), but poorly for estimators with low variance (e.g.,
# linear models).
estimators = [("Tree", DecisionTreeRegressor()),
              ("Bagging(Tree)", BaggingRegressor(DecisionTreeRegressor()))]
n_estimators = len(estimators)


# Generate data
def f(x):
    x = x.ravel()
    return np.exp(-x ** 2) + 1.5 * np.exp(-(x - 2) ** 2)


def generate(n_samples, noise, n_repeat=1):
    X = np.random.random(n_samples) * 10 - 5
    X = np.sort(X)
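    # The excerpt breaks off here; the lines below are a hedged completion
    # (an assumption): draw y = f(X) plus Gaussian noise, once or n_repeat
    # times, and reshape X into a column vector for the sklearn estimators.
    if n_repeat == 1:
        y = f(X) + np.random.normal(0.0, noise, n_samples)
    else:
        y = np.zeros((n_samples, n_repeat))
        for i in range(n_repeat):
            y[:, i] = f(X) + np.random.normal(0.0, noise, n_samples)
    X = X.reshape((n_samples, 1))
    return X, y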
# Get rid of NaN values
X[np.isnan(X)] = 0.

print '******************************************'
print name
print '******************************************'

if name == 'Boston' or name == 'Diabetes':
    # Regression problem
    rfr = RandomForestRegressor(**params)
    rfr.fit(X, y)
    print 'Score RandomForestRegressor = %s' % (rfr.score(X, y))
    scores_rfr = cross_val_score(rfr, X, y, cv=5)
    print 'Cross Val Score RandomForestRegressor = %s' % (np.mean(scores_rfr))

    br = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=max_depth),
                          n_estimators=n_estimators)
    br.fit(X, y)
    print 'Score BaggingRegressor = %s' % (br.score(X, y))
    scores_br = cross_val_score(br, X, y, cv=5)
    print 'Cross Val Scores of BR = %s' % (np.mean(scores_br))

if name == 'Iris' or name == 'Digits':
    # Classification problem
    rfc = RandomForestClassifier(**params)
    rfc.fit(X, y)
    print 'Score RandomForestClassifier = %s' % (rfc.score(X, y))
    scores_rfc = cross_val_score(rfc, X, y, cv=5)
    print 'Cross Val Scores of RandomForestClassifier = %s' % (np.mean(scores_rfc))

    bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=max_depth),
                           n_estimators=n_estimators)
    bc.fit(X, y)
X_train, y_train = generate(n_samples=n_train, noise=noise)
X_test, y_test = generate(n_samples=n_test, noise=noise)

# One decision tree regressor
dtree = DecisionTreeRegressor().fit(X_train, y_train)
d_predict = dtree.predict(X_test)

plt.figure(figsize=(10, 8))
plt.plot(X_test, f(X_test), 'b')
plt.scatter(X_train, y_train, c='b', s=20)
plt.plot(X_test, d_predict, 'g', lw=2)
plt.xlim([-5, 5])
plt.title("Decision tree, MSE = %.2f" % np.mean((y_test - d_predict) ** 2))  # mean, to match the MSE label

# Bagging decision tree regressor
bdt = BaggingRegressor(DecisionTreeRegressor()).fit(X_train, y_train)
bdt_predict = bdt.predict(X_test)

plt.figure(figsize=(10, 8))
plt.plot(X_test, f(X_test), 'b')
plt.scatter(X_train, y_train, c='b', s=20)
plt.plot(X_test, bdt_predict, 'y', lw=2)
plt.xlim([-5, 5])
plt.title("Bagging decision tree, MSE = %.2f" % np.mean((y_test - bdt_predict) ** 2))

# Random forest
rf = RandomForestRegressor(n_estimators=10).fit(X_train, y_train)
rf_predict = rf.predict(X_test)

plt.figure(figsize=(10, 8))
df_all['letter_in_description'] = df_all['product_info'].map(
    lambda x: str_common_letter(x.split('\t')[0], x.split('\t')[2]))

print("Drop columns that were changed...")
df_all = df_all.drop(['search_term', 'product_title', 'product_description', 'product_info'], axis=1)

# Set up training and test sets
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
id_test = df_test['id']
y_train = df_train['relevance'].values

# Drop 'id' and 'relevance' columns from the training and test sets
X_train = df_train.drop(['id', 'relevance'], axis=1).values
X_test = df_test.drop(['id', 'relevance'], axis=1).values

# Set up RandomForest and Bagging regressors
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)

# Fit the training data into the regression model using the output values
clf.fit(X_train, y_train)

# Run the prediction
y_pred = clf.predict(X_test)

# Set up our data frame and write the submission file
# (to_csv returns None, so keep the frame in its own variable before printing)
datafr = pd.DataFrame({"id": id_test, "relevance": y_pred})
datafr.to_csv('../dataset/submission.csv', index=False)
print(datafr)
# Clean up module-level temporaries
del globals()['profiles']
del globals()['profilesLSo']
del globals()['profilesLS']
del globals()['row']
del globals()['tmpLS']
del globals()['tmpAGE']
del globals()['profsTOlikes']
del globals()['i']
del globals()['tmpIND']

seed = 7
myRand = np.random.seed(seed)

X_train, X_test, y_train, y_test = train_test_split(likesMAT, consARR, test_size=1500)

nJOBS = int(sys.argv[1])
nEST = int(sys.argv[2])

bagOUT = BaggingRegressor(n_jobs=nJOBS, n_estimators=nEST, oob_score=True)
# bagOUT.fit(likesMAT, consARR)
bagOUT.fit(X_train, y_train)
y_pred = bagOUT.predict(X_test)

import math
myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("cons, bagOUT: ", str(nEST), " ", myRMSE)

# joblib.dump(bagOUT, "/Users/jamster/bagOUT-A-cons.xz", compress=9)
# impbagOUT = joblib.load("/Users/jamster/bagOUT-A-cons.xz")
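# oob_score=True is requested above but the estimate is never read; after
# fitting, sklearn exposes it as the oob_score_ attribute (R^2 of the
# out-of-bag predictions), which gives a validation-free generalization check:
print("cons, bagOUT OOB R^2: ", bagOUT.oob_score_)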