def fillAgeNa(data):
    # Predict the missing ages with a random forest and fill them in
    columns = data.columns
    if 'Survived' in columns:
        tr = data[data['Age'].notna()]
        te = data[data['Age'].isna()]
        tr_x = tr[['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']]
        tr_y = tr[['Age']]
        tr_y = tr_y.values.ravel()  # flatten tr_y into a 1-D vector for fitting
        te_x = te[['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']]
        rfModel = rfr(n_estimators=100)  # random forest with otherwise default parameters
        rfModel.fit(tr_x, tr_y)
        Age_predict = rfModel.predict(te_x)
    else:
        tr = data[data['Age'].notna()]
        te = data[data['Age'].isna()]
        tr_x = tr[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']]
        tr_y = tr[['Age']]
        tr_y = tr_y.values.ravel()  # flatten tr_y into a 1-D vector for fitting
        te_x = te[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']]
        rfModel = rfr(n_estimators=100)  # random forest with otherwise default parameters
        rfModel.fit(tr_x, tr_y)
        Age_predict = rfModel.predict(te_x)
    data.loc[data['Age'].isna(), 'Age'] = Age_predict
    data['Age'] = data['Age'].map(lambda x: int(x))  # truncate ages to integers
    return data
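# --- Usage sketch for fillAgeNa (not from the original source) ---
# Assumes rfr is sklearn's RandomForestRegressor and that the categorical
# columns have already been numerically encoded; 'train.csv' stands in for a
# hypothetical Titanic-style dataset.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor as rfr  # assumed alias

df = pd.read_csv('train.csv')
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
df = fillAgeNa(df)
assert df['Age'].isna().sum() == 0  # every missing age is now filled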
def best_model(xt, xv, yt, yv):
    models = []

    name_dt = "DecisionTreeRegressor"
    model_dt = dtr(random_state=1)  # decision tree
    model_dt.fit(xt, yt)
    models.append({'name': name_dt, 'model': model_dt, 'mae': get_mae(model_dt, xv, yv)})

    name_rf = "RandomForestRegressor"
    model_rf = rfr(random_state=1)  # random forest
    model_rf.fit(xt, yt)
    models.append({'name': name_rf, 'model': model_rf, 'mae': get_mae(model_rf, xv, yv)})

    name_xgb = "XGBRegressor"
    model_xgb = xgb(random_state=1, n_estimators=10000, learning_rate=0.01)  # xgboost
    model_xgb.fit(xt, yt, early_stopping_rounds=10, eval_set=[(xv, yv)], verbose=False)
    models.append({'name': name_xgb, 'model': model_xgb, 'mae': get_mae(model_xgb, xv, yv)})

    print("\n")
    for m in models:
        print("Model {} has MAE {}".format(m.get('name'), m.get('mae')))
    min_mae = min(i['mae'] for i in models)
    best_model = [m for m in models if m.get('mae') == min_mae]
    print("\nBest model pick: ", best_model[0].get('name'))
    print("\n")
    return best_model[0].get('model')
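# --- Usage sketch for best_model (not from the original source) ---
# The snippet relies on module-level aliases dtr/rfr/xgb for
# DecisionTreeRegressor/RandomForestRegressor/XGBRegressor and on a get_mae
# helper it never shows; a plausible get_mae and synthetic data are assumed
# here. Note that model_xgb.fit(..., early_stopping_rounds=...) above is the
# pre-2.0 xgboost fit API.
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

def get_mae(model, xv, yv):
    # assumed definition: MAE of the fitted model on the validation split
    return mean_absolute_error(yv, model.predict(xv))

rng = np.random.RandomState(0)
X = rng.rand(200, 5)
y = X @ rng.rand(5) + 0.1 * rng.randn(200)
xt, xv, yt, yv = train_test_split(X, y, random_state=1)
best = best_model(xt, xv, yt, yv)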
def fill_missing_rf(X, y, to_fill):
    """
    Fill the missing values of one feature with a random forest.

    Parameters:
    X: feature matrix containing the column to fill
    y: complete label column with no missing values
    to_fill: string, name of the column to fill
    """
    # Build the new feature matrix and the new label
    df = X.copy()
    fill = df.loc[:, to_fill]
    df = pd.concat([df.loc[:, df.columns != to_fill], pd.DataFrame(y)], axis=1)

    # Split into training and test sets
    Ytrain = fill[fill.notnull()]
    Ytest = fill[fill.isnull()]
    Xtrain = df.iloc[Ytrain.index, :]
    Xtest = df.iloc[Ytest.index, :]

    # Fill the missing values with random forest regression
    from sklearn.ensemble import RandomForestRegressor as rfr
    rfr = rfr(n_estimators=100)  # random_state=0, n_estimators=200, max_depth=3, n_jobs=-1
    rfr = rfr.fit(Xtrain, Ytrain)
    Ypredict = rfr.predict(Xtest)
    return Ypredict
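# --- Usage sketch for fill_missing_rf (not from the original source) ---
# X is a feature frame with NaNs in a single column and a default 0..n-1
# index (the function relies on .iloc); y is a complete target. The names
# below are synthetic demo data.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_demo = pd.DataFrame(rng.rand(100, 3), columns=['a', 'b', 'c'])
y_demo = X_demo['a'] + X_demo['b']
X_demo.loc[X_demo.sample(frac=0.2, random_state=0).index, 'c'] = np.nan
X_demo.loc[X_demo['c'].isnull(), 'c'] = fill_missing_rf(X_demo, y_demo, 'c')
assert X_demo['c'].isnull().sum() == 0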
def create_model(self, filepath):
    train_data = pd.read_csv(filepath)
    self.rf_model = rfr(random_state=1)
    X = train_data[self.features]
    y = train_data.SalePrice
    self.rf_model.fit(X, y)
def forward_multiRF_solver(X, y):
    xtr, xte, ytr, yte = train_test_split(X, y.ravel(), shuffle=True, random_state=0)
    rf = rfr(n_estimators=20)
    rf.fit(xtr, ytr)

    plt.figure()
    p = np.random.choice(pi_line)
    u = cb.init_u(p) + cb.init_u(np.random.choice(pi_line))
    _, abs_work = LW_solver(u, "u_test", write=True)
    fetch_real_u = lambda it: np.load(osp.join(abs_work, "u_test_it%d.npy" % (it)))

    u_nNext = []
    for it in range(1, cb.itmax):
        if it > 1:
            u = u_nNext
        u_nNext = []
        xs = np.array(u).reshape(1, -1)
        u_nNext.append(rf.predict(xs))
        u_nNext.insert(0, u[-2])
        u_nNext.insert(len(u), u[1])
        u_nNext = np.array(u_nNext).ravel()

        plt.clf()
        plt.plot(cb.line_x[1:cb.Nx - 1], fetch_real_u(it + 1)[1:cb.Nx - 1],
                 label="True it = %d" % (it + 1), c='k')
        plt.plot(cb.line_x[1:cb.Nx - 1], u_nNext[1:cb.Nx - 1],
                 label="RF Predicted at it = %d" % (it), marker='o',
                 fillstyle='none', linestyle='none', c="green")
        plt.legend()
        plt.pause(2)
def get_regression_model(algo, poly_Order=2, **kwargs):
    # for key in kwargs: print(key); sys.exit()
    print_mod_info = False
    ### Regression models
    ### https://stackoverflow.com/questions/12860841/python-import-in-if
    if algo == 'XGR':
        mod = xgr(**kwargs)
    elif algo == 'RFR':
        mod = rfr(**kwargs)
    elif algo == 'ABR':
        mod = abr(**kwargs)
    elif algo == 'P1R':
        mod = LinearRegression(**kwargs)
    elif algo == 'P2R':
        mod = make_pipeline(PolynomialFeatures(poly_Order), Ridge(**kwargs))
    elif algo == 'ANN':
        mod = MLPRegressor(**kwargs)
    elif algo == 'ELN':
        mod = ElasticNet(**kwargs)  # add parameters later
    elif algo == 'E2R':
        mod = make_pipeline(PolynomialFeatures(poly_Order), ElasticNet(**kwargs))
    elif algo == 'PLS':
        mod = PLSRegression(**kwargs)
    else:
        print('Algorithm has not yet been added to the menu.')
        sys.exit()
    return mod
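# --- Usage sketch for get_regression_model (not from the original source) ---
# Keyword arguments pass straight through **kwargs to whichever estimator the
# algo code selects; X_train and y_train are assumed to exist.
mod = get_regression_model('RFR', n_estimators=200, max_depth=8)
mod.fit(X_train, y_train)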
def getModel(X_train, y_train, sorted_scores, numFeatures, estimators):
    np.random.seed(42)
    # ordered list of important features
    included_features = np.array(sorted_scores)[:, 0][:numFeatures]
    X = X_train[included_features]
    mean_rfrs = []
    std_rfrs_upper = []
    std_rfrs_lower = []
    # yt = [i for i in Y["SalePrice"]]
    # for each number of estimators, fit the model and collect the results of
    # 10-fold cross-validation
    for i in estimators:
        model = rfr(n_estimators=i, max_depth=None)
        scores_rfr = cross_val_score(model, X, y_train, cv=10, scoring="explained_variance")
        print("estimators:", i)
        # print('explained variance scores for k=10 fold validation:', scores_rfr)
        print("Est. explained variance: %0.2f (+/- %0.2f)" % (scores_rfr.mean(), scores_rfr.std() * 2))
        print("")
        mean_rfrs.append(scores_rfr.mean())
        std_rfrs_upper.append(scores_rfr.mean() + scores_rfr.std() * 2)  # for error plotting
        std_rfrs_lower.append(scores_rfr.mean() - scores_rfr.std() * 2)  # for error plotting
    return mean_rfrs, std_rfrs_upper, std_rfrs_lower
def rfrModel(train_X, test_X, train_y, test_y):
    """Using a random forest regressor."""
    from sklearn.ensemble import RandomForestRegressor as rfr
    model = rfr(random_state=1)     # defining the model
    model.fit(train_X, train_y)     # fitting the model on the training sets
    y_pred = model.predict(test_X)  # predicting on test values
    # Evaluating the model using mean absolute error
    return mae(test_y, y_pred)
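# --- Usage sketch for rfrModel (not from the original source) ---
# mae is assumed to be sklearn's mean_absolute_error, since the snippet does
# not show its imports; the dataset is an arbitrary example.
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split

data = fetch_california_housing()
train_X, test_X, train_y, test_y = train_test_split(data.data, data.target, random_state=1)
print(rfrModel(train_X, test_X, train_y, test_y))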
def randomRegression(x, y, testX, testY):
    model = rfr()
    model.fit(x, y)
    print("Fitting Complete. Displaying Results... / 모델 피팅 성공. 결과 출력...")
    print("R^2 Score:", model.score(testX, testY))
def createPeakTestModel(train, test):
    forest = rfr()
    forest.fit(train[:, :-1], train[:, -1])
    scores = cross_val_score(forest, test[:, :-1], test[:, -1])
    print(test.shape)
    print(train.shape)
    print(scores.mean())
    for x in range(test.shape[0]):
        print("%i vs %i given %s" % (test[x, -1],
                                     forest.predict(test[x, :-1].reshape(1, -1)),
                                     test[x, :-1]))
def model_randomforest_regressor(X_train, X_test, y_train, y_test):
    model_name = f'model_{count}_randomforest_regressor'
    model = rfr()
    model.fit(X_train, y_train)
    model.independentcols = independentcols
    score = model.score(X_test, y_test)
    print(f'{model_name} accuracy: {score}')
    joblib.dump(model, f'model/{model_name}.joblib')
def regression(self, metric="neg_root_mean_squared_error", folds=10, alphas=[], graph=False):
    # the default metric uses sklearn's scorer name; the sign is flipped back below
    size = 1.3 * self.report_width // 10
    models = {}
    models["Linear regressor"] = lr()
    models["Lasso regressor"] = lassor()
    models["Lasso CV regressor"] = lassocvr()
    models["Ridge regressor"] = rr(alpha=0, normalize=True)
    models["Ridge CV regressor"] = rcvr(alphas=alphas)
    models["K nearest neighbors regressor K2u"] = knnr(n_neighbors=2, weights='uniform')
    models["K nearest neighbors regressor K2d"] = knnr(n_neighbors=2, weights='distance')
    models["K nearest neighbors regressor K5"] = knnr(n_neighbors=5)
    models["K nearest neighbors regressor K10"] = knnr(n_neighbors=10)
    models["SGD regressor"] = sgdr(max_iter=10000, warm_start=True)
    models["Decision tree regressor"] = dtr()
    models["Decision tree regressor D3"] = dtr(max_depth=3)
    models["Random forest regressor"] = rfr()
    models["Ada boost regressor"] = abr()
    models["Gradient boost regressor"] = gbr()
    models["Support vector regressor"] = svr()
    self.models = models
    print('\n')
    print(self.report_width * '*', '\n*')
    print('* REGRESSION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
    # kf = StratifiedKFold(n_splits=folds, shuffle=True)
    kf = KFold(n_splits=folds)
    results = []
    names = []
    for model_name in models:
        cv_scores = -1 * cross_val_score(models[model_name], self.Xt_train,
                                         self.yt_train.values.ravel(), cv=kf, scoring=metric)
        results.append(cv_scores)
        names.append(model_name)
    print(self.report_width * '*', '')
    report = pd.DataFrame({'Regressor': names, 'Score': results})
    report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
    report['Score (std)'] = report.Score.apply(lambda x: x.std())
    report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
    report.sort_values(by='Score (avg)', inplace=True)
    report.drop('Score', axis=1, inplace=True)
    display(report)
    print('\n')
    if graph:
        fig, ax = plt.subplots(figsize=(size, 0.5 * size))
        plt.title('Regressor Comparison')
        # ax = fig.add_subplot(111)
        plt.boxplot(results)
        ax.set_xticklabels(names)
        plt.xticks(rotation=45)
        plt.subplots_adjust(hspace=0.0)
        plt.show()
    return None
def linear_regression_model(train, validation, alpha, depth=None):
    # fit_intercept, normalize, max_iter, tol and random_state are read from
    # module scope in the original source
    X_train = train['X'].values[:, 1:]
    y_train = np.ravel(train['y'].values)
    X_validation = validation['X'].values[:, 1:]
    y_validation = np.ravel(validation['y'].values)
    models = {
        'type': ['ridge', 'decision tree', 'random forest'],
        'model': [
            Ridge(alpha=0.1, fit_intercept=fit_intercept, normalize=normalize,
                  max_iter=max_iter, tol=tol, random_state=random_state),
            dt(criterion='mse', splitter='best', max_depth=depth,
               min_samples_split=2, min_samples_leaf=1, random_state=random_state),
            rfr(n_estimators=100, criterion='mse', max_depth=depth,
                min_samples_split=2, min_samples_leaf=1, random_state=random_state)
        ],
        'score_train': [],
        'score_valid': [],
        'mse_train': [],
        'mse_valid': []
    }
    y_train_predict = []
    y_valid_predict = []
    for i in np.arange(0, len(models['type']), 1):
        m = models['model'][i]
        m.alpha = alpha
        m.fit(X_train, y_train)
        models['score_train'].append(m.score(X_train, y_train))
        models['score_valid'].append(m.score(X_validation, y_validation))
        y_train_predict.append(m.predict(X_train))
        y_valid_predict.append(m.predict(X_validation))
        models['mse_train'].append(mse(y_train, y_train_predict[i]))
        models['mse_valid'].append(mse(y_validation, y_valid_predict[i]))
    print('models: ', models['type'])
    print('R2 training:', models['score_train'])
    print('R2 validation:', models['score_valid'])
    print('MSE training:', models['mse_train'])
    print('MSE validation:', models['mse_valid'])
    return models
def regressor(file, X, Y, x, y):
    param = []
    acc = []
    criterion = ['mse', 'mae']
    for i in it.product(n_estimators, criterion, max_depth, min_samples_split,
                        min_samples_leaf, min_weight_fraction_leaf, max_features,
                        max_leaf_nodes, min_impurity_decrease, min_impurity_split,
                        bootstrap, oob_score, n_jobs, random_state, verbose, warm_start):
        # print(*i)
        forest = rfr(*i)
        forest.fit(X, Y)
        # print('Accuracy: ' + str(forest.score(x, y)) + '\n')
        acc.append(forest.score(x, y))
        param.append([*i])
    _results(file, acc, param)
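# --- Context sketch for regressor (not from the original source) ---
# The function reads most of its hyperparameter grids from module scope;
# plausible short lists are assumed below (the itertools.product grid grows
# multiplicatively, so keep each list small), and _results is also not shown
# in the excerpt. Note that rfr(*i) passes the values positionally, which
# matches the RandomForestRegressor signature of older scikit-learn releases;
# recent versions make these parameters keyword-only.
import itertools as it

n_estimators = [50, 100]
max_depth = [None, 10]
min_samples_split = [2]
min_samples_leaf = [1]
min_weight_fraction_leaf = [0.0]
max_features = ['sqrt']
max_leaf_nodes = [None]
min_impurity_decrease = [0.0]
min_impurity_split = [None]
bootstrap = [True]
oob_score = [False]
n_jobs = [-1]
random_state = [0]
verbose = [0]
warm_start = [False]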
def set_missing_ages(df):
    # Take the existing numeric features
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    # Split passengers into known-age and unknown-age groups
    # (.values replaces .as_matrix(), which is removed in modern pandas)
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    # Training set
    y = known_age[:, 0]
    X = known_age[:, 1:]
    regressor = rfr(random_state=0, n_estimators=2000, n_jobs=-1)
    regressor.fit(X, y)
    return regressor
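# --- Usage sketch for set_missing_ages (not from the original source) ---
# The function returns only the fitted regressor; applying it to the
# unknown-age rows is left to the caller, roughly as below. df is assumed to
# be a Titanic-style frame whose Fare/Parch/SibSp/Pclass columns are complete.
regressor = set_missing_ages(df)
age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
unknown = age_df[age_df.Age.isnull()].values
df.loc[df.Age.isnull(), 'Age'] = regressor.predict(unknown[:, 1:])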
def process(self):
    data = self.parameters.get('in')
    target_col = self.parameters.get('target_col', None)
    features_col = self.parameters.get('features_col', None)
    kwargs = self.parameters.get('parameters') or {}  # guard against a missing 'parameters' entry
    if features_col is None:
        features_col = list(set(data.columns) - set([target_col]))
    targets = np.array(data[target_col])
    features = np.array(data[features_col])
    rf_model = rfr(**kwargs)
    rf_model.fit(features, targets)
    return rf_model
def runRF(train_1_x, test_x, hold_out, sub_test, fobj):
    ntrees = 100
    njobs = 2  # defined but unused; the model below runs with n_jobs=-1
    exp = 30
    c_train_1_x = train_1_x[(train_1_x[:, -1] <= exp)]
    c_train_y = c_train_1_x[:, -1]
    c_train_x = c_train_1_x[:, :-1]
    rf_model = rfr(n_estimators=ntrees, n_jobs=-1)
    est = rf_model.fit(c_train_x, c_train_y)
    train_y_pred = est.predict(c_train_x)
    error = mt.mean_absolute_error(c_train_y, train_y_pred)
    print('RF Train-1 Error: %r\n' % (error))
    fobj.write('RF Train-1 Error: %r\n' % (error))
    valid_y_pred = est.predict(test_x)
    hold_y = est.predict(hold_out)
    sub_y = est.predict(sub_test)
    return est, valid_y_pred, hold_y, sub_y
def get_regression_model(algo, settings, print_mod_info=False):
    ### Regression models
    ### https://stackoverflow.com/questions/12860841/python-import-in-if
    if algo == 'XGR':
        mod = xgr(n_estimators=settings[0], max_depth=settings[1])
        if print_mod_info: print('XGBoost:', mod)
    elif algo == 'RFR':
        mod = rfr(n_estimators=settings[0])
        if print_mod_info: print('Random Forest:', mod)
    elif algo == 'ABR':
        mod = abr(n_estimators=settings[0])
        if print_mod_info: print('AdaBoost:', mod)
    elif algo == 'P1R':
        mod = LinearRegression()
        if print_mod_info: print('Linear:', mod)
    elif algo == 'P2R':
        mod = make_pipeline(PolynomialFeatures(settings[0]), Ridge())
        if print_mod_info: print('Poly 2:', mod)
    elif algo == 'ANN':
        mod = MLPRegressor(solver='lbfgs',
                           hidden_layer_sizes=(settings[0], settings[1]),  # (137, 73)
                           tol=settings[2])
        if print_mod_info: print('Neural Net Regression:', mod)
    elif algo == 'ELN':
        mod = ElasticNet(alpha=settings[0], l1_ratio=settings[1])  # add parameters later
        if print_mod_info: print('Elastic Net Regression:', mod)
    elif algo == 'E2R':
        mod = make_pipeline(PolynomialFeatures(settings[0]),
                            ElasticNet(alpha=settings[1], l1_ratio=settings[2]))
        if print_mod_info: print('Poly 2 Elastic Net:', mod)
    elif algo == 'PLS':
        mod = PLSRegression(n_components=settings[0])
        if print_mod_info: print('Partial Least Squares Regression:', mod)
    else:
        print('Algorithm not setup yet.')
        sys.exit()
    return mod
def fit(self, losses, configs=None):
    if configs is None:
        configs = [[]] * len(losses)  # the original used len(times); times is undefined here

    # convert learning curves into X and y data
    X = []
    y = []
    for l, c in zip(losses, configs):
        l = self.apply_differencing(l)
        for i in range(self.order, len(l)):
            X.append(np.hstack([l[i - self.order:i], c]))
            y.append(l[i])
    self.X = np.array(X)
    self.y = np.array(y)
    self.rfr = rfr().fit(self.X, self.y)
def main():
    # read train data
    filenames = ["ct_rac_S000_JT00_2013.csv", "ny_rac_S000_JT00_2013.csv",
                 "ca_rac_S000_JT00_2013.csv", "mt_rac_S000_JT00_2013.csv"]
    print("loading ct...")
    X, y, ct_d = load(filenames[0])
    return  # early exit: everything below is currently disabled or unreachable
    # print("loading ny...")
    # ny_X, ny_y, ny_d = load(filenames[1])
    # print("loading ca...")
    # ca_X, ca_y, ca_d = load(filenames[2])
    # print("loading mt...")
    # mt_X, mt_y, mt_d = load(filenames[3])

    # lr = LinearRegression()
    # lr.fit(X, y)
    # predict_y = lr.predict(X)
    # print("ct: ", getmse(y, predict_y))
    # predict_y = lr.predict(ny_X)
    # print("ny: ", getmse(ny_y, predict_y))
    # predict_y = lr.predict(ca_X)
    # print("ca: ", getmse(ca_y, predict_y))
    # predict_y = lr.predict(mt_X)
    # print("mt: ", getmse(mt_y, predict_y))

    print("training data...")
    rtree = rfr(n_estimators=79, max_features=10, oob_score=True)
    rtree.fit(X, y)
    print(rtree.feature_importances_)
def regression(X, y):
    print(X.shape, y.shape)
    score = 0
    folds = 4
    forest = rfr(n_estimators=5)
    # K-fold split (modern scikit-learn API; the original used the pre-0.18
    # cross_validation.KFold signature; note plain KFold does not stratify)
    skf = KFold(n_splits=folds, shuffle=True)
    for train_index, test_index in skf.split(X):
        Xtrain, Xtest = X[train_index], X[test_index]
        ytrain, ytest = y[train_index], y[test_index]
        Xtrain = np.array(Xtrain, dtype='float64')
        Xtest = np.array(Xtest, dtype='float64')
        # Xtrain[np.isinf(Xtrain)] = 0
        forest.fit(Xtrain, ytrain)
        error = 0
        errorList = []
        predictions = []
        for i in range(0, Xtest.shape[0]):
            a = np.transpose(Xtest[i, :].reshape(Xtest[i, :].shape[0], 1))
            pr = forest.predict(a)
            temp_err = np.absolute(pr - ytest[i]) * 60
            errorList.append(temp_err)
            predictions.append(pr)
            error += temp_err
        visualize(ytest, errorList, predictions)
        print('Average error in minutes: {0}'.format(error / Xtest.shape[0]))
        print('Max/min/median error: {0} , {1} , {2}'.format(
            max(errorList), min(errorList), np.median(errorList)))
        del errorList[:]
        del predictions[:]
def select_model(model):
    pca = PCA(n_components=5)
    svr = SVR()
    ss = StandardScaler()
    rf = rfr()
    if model == 'svr':
        pip_svm = Pipeline([
            ('pca', pca),
            ('ss', ss),
            # ('rf', rf),
            ('svm', svr),
        ])
        params_svr = {
            "pca__n_components": [i for i in range(1, 6)],
            "svm__kernel": ['rbf'],
            "svm__gamma": [10**i for i in range(-4, 0)],
            "svm__C": [10**i for i in range(1, 4)],
        }
        return pip_svm, params_svr
    else:
        pip_rf = Pipeline([
            ('pca', pca),
            ('ss', ss),
            ('rf', rf),
            # ('svm', svr),
        ])
        params_rf = {
            # "pca__n_components": [i for i in range(1, len(x.columns))],
            'rf__n_estimators': [50, 100, 300],
            'rf__max_features': [i for i in range(1, 6)],
            'rf__random_state': [0],
            'rf__n_jobs': [-1],
            # 'rf__min_samples_split': [3, 5, 10, 15, 20, 25, 30, 40, 50, 100],
            'rf__max_depth': [30, 40, 50, 100],
        }
        return pip_rf, params_rf
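# --- Usage sketch for select_model (not from the original source) ---
# The returned (pipeline, grid) pair plugs straight into GridSearchCV;
# x and y are assumed to be the feature matrix and target used elsewhere.
from sklearn.model_selection import GridSearchCV

pipe, params = select_model('rf')
search = GridSearchCV(pipe, params, cv=5)
search.fit(x, y)
print(search.best_params_, search.best_score_)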
def fillMissingValue(df, fy, WaytoFillNan=Env_var.get('WaytoFillNan')):
    train_data_temp = df[df.iloc[:, fy].notnull()]
    test_data_temp = df[df.iloc[:, fy].isnull()]
    train_y = train_data_temp.iloc[:, fy]
    train_X = train_data_temp.copy()
    train_X = train_X.drop(train_X.columns[fy], axis=1)
    test_X = test_data_temp.copy()
    test_X = test_X.drop(test_X.columns[fy], axis=1)
    # Imputer comes from the pre-0.22 sklearn.preprocessing module
    mixed_X = Imputer().fit_transform(train_X.append(test_X, ignore_index=True))
    length_train = len(train_X)
    train_X = mixed_X[:length_train, :]
    test_X = mixed_X[length_train:, :]
    if WaytoFillNan == 'rfr':
        print("Try to fill-up value with rfr")
        rfr_regressor = rfr(n_estimators=100, verbose=5, n_jobs=-1)
        rfr_regressor.fit(train_X, train_y)
        y_pred = rfr_regressor.predict(test_X)
        print(y_pred)
        # index the predictions by the missing rows so that fillna aligns correctly
        df.iloc[:, fy] = df.iloc[:, fy].fillna(
            value=pd.Series(data=y_pred, index=test_data_temp.index))
    elif WaytoFillNan == 'mean':
        print("Try to fill-up value with mean")
        df.iloc[:, fy] = df.iloc[:, fy].fillna(value=np.mean(df.iloc[:, fy]))
    elif WaytoFillNan == 'ffill':
        df.iloc[:, fy] = df.iloc[:, fy].fillna(method='ffill')
    elif WaytoFillNan == 'bfill':
        df.iloc[:, fy] = df.iloc[:, fy].fillna(method='bfill')
    elif WaytoFillNan == 'knn':
        impute = KnnIm()
        df.iloc[:, fy] = (impute.knn(X=df, column=fy, k=10))[:, fy]
    return df
def fill_missing_rf(X, y, to_fill):
    """
    Fill the missing values of one feature with a random forest.

    Parameters:
    X: feature matrix containing the column to fill
    y: complete label column with no missing values
    to_fill: string, name of the column to fill
    """
    # Build the new feature matrix and the new label
    df = X
    # The feature with missing values, i.e. the column to fill
    fill = df.loc[:, to_fill]
    # Concatenate every feature other than the column to fill with the
    # dependent variable y to form a temporary feature set
    df = pd.concat([df.loc[:, df.columns != to_fill], pd.DataFrame(y)], axis=1)

    # Identify the training and test sets:
    # non-null entries of the column to fill become the training target...
    Ytrain = fill[fill.notnull()]
    # ...and null entries become the test target
    Ytest = fill[fill.isnull()]
    # Note: the dataset's index has been reset, which is why DataFrame.iloc[index]
    # works; with a scrambled index, only DataFrame.loc[index] would be usable
    # Rows where the column is non-null form the training features
    Xtrain = df.iloc[Ytrain.index, :]
    # Rows where the column is null form the test features
    Xtest = df.iloc[Ytest.index, :]

    # Fill the missing values with random forest regression
    from sklearn.ensemble import RandomForestRegressor as rfr
    rfr = rfr(n_estimators=100)  # random_state=0, n_estimators=200, max_depth=3, n_jobs=-1
    rfr = rfr.fit(Xtrain, Ytrain)
    Ypredict = rfr.predict(Xtest)  # <class 'numpy.ndarray'>, re-indexed automatically from 0: [0:29140]
    return Ypredict
X_train = data[0:1460, :]

'''from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X_train = sc_X.fit_transform(X_train)
y = [y_train]
y = np.transpose(y)
y_train = sc_y.fit_transform(y)

from sklearn.svm import SVR
svr = SVR(kernel='rbf', C=40)
svr.fit(X_train, y_train)'''

from sklearn.ensemble import RandomForestRegressor as rfr
regressor = rfr(n_estimators=285, random_state=42)  # 0.14751
regressor.fit(X_train, y_train)
regressor.score(X_train, y_train)

X_test = data[1460:, :]

# Valid scoring strings include: 'accuracy', 'adjusted_mutual_info_score',
# 'adjusted_rand_score', 'average_precision', 'completeness_score',
# 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples',
# 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score',
# 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error',
# 'neg_mean_squared_error', 'neg_mean_squared_log_error',
# 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision',
# 'precision_macro', 'precision_micro', 'precision_samples',
# 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro',
# 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score'
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=regressor, X=X_train, y=y_train,
                             scoring='neg_mean_squared_log_error', cv=5)
accuracies.mean()
accuracies.std()

from sklearn.model_selection import GridSearchCV
# training_weights = training_data['weight']
training_data, training_location_data, training_response, training_weights = preprocess_dataframe(training_data)
print("Shape: ", training_data.shape)

### CREATE MODEL ###
bl = Blender(verbose=True, training_fraction=0.95)
bl.add_model(lclR.LocalRegression(k=500, regressor=SmartSVR, params={'gamma': 0.0001}), "SmrtSVR")
bl.add_model(lclR.LocalRegression(k=500, regressor=SmartSVR, params={'gamma': 0.001, "C": 50}), "LooseSmrtSVR")
bl.add_model(lclR.LocalRegression(k=500, regressor=sklm.ElasticNet, params={'alpha': 0.0001, "normalize": True}), "ElNet500")
bl.add_model(lclR.LocalRegression(k=500, regressor=rfr, params={"n_jobs": 5, "n_estimators": 10}), "rfr50est")
bl.add_model(lclR.LocalRegression(k=500, regressor=sklm.Lasso, params={"alpha": 0.01, "normalize": True}), "lasso")
bl.add_model(rfr(n_jobs=150, n_estimators=200), "Globalrfr50est")
bl.add_model(lclR.LocalRegression(k=750, regressor=sklm.Ridge, feature_selection=True, params={'alpha': 0.001, 'normalize': True}), "RidgeWithFeatureSelection")
bl.add_model(lclR.LocalRegression(k=750, regressor=sklm.Ridge, feature_selection=False, params={"alpha": 0.1, "normalize": True}), "Ridge")
bl.add_model(lclR.LocalRegression(k=250, regressor=sklm.Ridge, feature_selection=False, params={"alpha": 1.5, "normalize": True}), "LocalRidge")

lc = [training_location_data.values]
bl.fit(training_data.values, training_response.values,
       {"SmrtSVR": lc, "ElNet500": lc, "RidgeWithFeatureSelection": lc, "rfr50est": lc,
        "lasso": lc, "Ridge": lc, "LooseSmrtSVR": lc, "LocalRidge": lc})

lc = [test_location_data.values]
print()
print("Coefficients:")
print(bl.coef_)
print("begin prediction")
prediction = bl.predict(test_data.values,
                        {"lasso": lc, "rfr50est": lc, "SmrtSVR": lc, "ElNet500": lc,
                         "RidgeWithFeatureSelection": lc, "Ridge": lc,
                         "LooseSmrtSVR": lc, "LocalRidge": lc})
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Position_Salaries.csv')
x = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)"""

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Fitting the Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor as rfr
regressor = rfr(n_estimators=300, random_state=0)
regressor.fit(x, y)

# Predicting a new result
y_pred = regressor.predict([[6.5]])

# Visualising the Regression results (for higher resolution and smoother curve)
    # (top of the surrounding function is not shown in this excerpt)
    except IndexError:
        print("Need file name as argument")
    return x, y

x, y = loadData()
x_val = x[:1000]
y_val = y[:1000]
x = x[1000:]
y = y[1000:]

model = rfr(n_estimators=10, criterion='mse', n_jobs=7, verbose=0)
kf = KFold(n_splits=5)
d = {}
for train, test in kf.split(x):
    x_train, x_test = x[train], x[test]
    y_train, y_test = y[train], y[test]
    w = model.fit(x_train, y_train)
    pred_test = w.predict(x_test)
    m_test = msle(y_test, pred_test)
    print("msle for testing set is: ", m_test)
    pred = w.predict(x_val)
    # print(pred[:100], y_val[:100])
    m_val = msle(y_val, pred)
    print("msle for validation set is ", m_val)
    d[m_test * 0.3 + m_val * 0.7] = w
# A scratchpad of alternative regressors; each assignment overwrites the previous model.
model = RadiusNeighborsRegressor(radius=0.5, p=2)

from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=10, max_depth=8,
                              min_samples_split=2)

from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor(n_estimators=400)

from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(n_estimators=100,
                                  learning_rate=0.1, max_depth=10)

from sklearn.ensemble import BaggingRegressor
mb = model
model = BaggingRegressor(base_estimator=mb, n_estimators=20, bootstrap=1,
                         bootstrap_features=1, max_samples=0.3, max_features=0.3)

model = LR(C=0.004)
model = LR(C=0.01, penalty='l1')
model = rfr(n_estimators=2000, max_depth=1, min_samples_leaf=20,
            min_samples_split=100)
model = BR(alpha_1=1e2, alpha_2=3e2, lambda_1=1e-9,
           lambda_2=1e-9, compute_score=False)
model_dtr = dtr(max_depth=3, min_samples_leaf=5, max_leaf_nodes=200,
                min_weight_fraction_leaf=0.05)
model = dtr(max_depth=4, min_samples_leaf=5, max_leaf_nodes=200,
            min_weight_fraction_leaf=0.05)
model = dtr(max_depth=3, min_samples_leaf=2, max_leaf_nodes=20, splitter='random')
model = AdaBoostRegressor(n_estimators=50, learning_rate=0.01,
                          loss='square', base_estimator=mb)
with open(dataset, 'r') as fh:
    for line in fh.readlines():
        (time, luma, brightness) = line.split(' ')
        X.append([float(time), float(luma)])
        Y.append(float(brightness.rstrip()))
X = np.array(X)
Y = np.array(Y)
stderr.write(' done\n')
stderr.flush()

# create predictive model
trees = 2000
stderr.write('model fitting averaging votes of %s randomized'
             ' trees..' % trees)
stderr.flush()
mymodel = rfr(n_estimators=trees, random_state=0, n_jobs=-1)
mymodel.fit(X, Y)
stderr.write(' done\n')
stderr.flush()

# generate testing dataset
stderr.write('define test dataset..')
stderr.flush()
T = []
Tentries = 0
for time in range(int(np.min(X[:, 0]) + 0.5), int(np.max(X[:, 0]) + 1.5), 500):
    for luma in range(int(np.min(X[:, 1]) + 0.5), int(np.max(X[:, 1]) + 1.5), 100):
        T.append([time, luma])
        Tentries += 1
        if (Tentries % 5000) == 0:
            stderr.write('.')
                          scoring='neg_mean_absolute_error')
grid_result = gridsearch.fit(X_train, y_train)
grid_pred = gridsearch.predict(X_test)
grid_rmse = mean_squared_error(y_test, grid_pred)
print('Decision Tree MAE: ' + str(sum(abs(grid_pred - y_test)) / (len(y_test))))
print('Decision Tree RMSE: ' + str(np.sqrt(grid_rmse)))
print(grid_result.best_params_)
print(abs(grid_result.best_score_))

# --------------------------RANDOM FOREST GRIDSEARCH----------------------------------------
# The best values for each parameter came out as: 'max_depth'=14,
# 'min_samples_leaf'=1, 'min_samples_split'=2, and 'n_estimators'=12 using Dataset 3.
gridsearch = GridSearchCV(estimator=rfr(random_state=4), cv=5,
                          param_grid={
                              # we selected the n_estimators values up to approximately
                              # the point of diminishing performance returns, which is
                              # why the optimal value ended up being so low (a stray
                              # string literal here in the original corrupted the
                              # 'max_depth' key via implicit concatenation)
                              'n_estimators': [10, 20, 40, 70, 100, 500, 1000],
                              'max_depth': [10, 20, 30, 40, 50],
                              'min_samples_split': [2, 3, 4, 5],
                              'min_samples_leaf': [1, 2, 3, 4, 5]
                          },
                          scoring='neg_mean_absolute_error')
grid_result = gridsearch.fit(X_train, y_train)
grid_pred = gridsearch.predict(X_test)
grid_rmse = mean_squared_error(y_test, grid_pred)
print('Random Forest MAE: ' + str(sum(abs(grid_pred - y_test)) / (len(y_test))))
print('Random Forest RMSE: ' + str(np.sqrt(grid_rmse)))
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor as rfr

train = pd.read_csv('day.csv')
train = train.sample(frac=1, random_state=0, axis=0)  # shuffle the rows
ytrain_ = train['cnt']
Xtrain_ = train.drop(['cnt', 'registered', 'casual', 'instant', 'dteday'], axis=1)

m = Xtrain_.shape[0]
split_point = 600
Xtrain = Xtrain_[:split_point]
Xtest = Xtrain_[split_point:]
ytrain = ytrain_[:split_point]
ytest = ytrain_[split_point:]
mtrain = Xtrain.shape[0]
mtest = Xtest.shape[0]
n = Xtrain.shape[1]

r = rfr(random_state=0, n_estimators=9, max_depth=7, min_samples_split=4)
r.fit(Xtrain, ytrain)

important_features = pd.Series(data=r.feature_importances_, index=Xtrain.columns)
important_features.sort_values(ascending=False, inplace=True)
print(important_features)

# score() on a regressor is R^2, reported here as a percentage
train_acc = r.score(Xtrain, ytrain) * 100
test_acc = r.score(Xtest, ytest) * 100
print("Train accuracy : %.2f" % train_acc)
print("Test accuracy : %.2f" % test_acc)