def RandomForestModel(self, path_train, segs_path,
                      path_val, start_est,
                      end_est, step_est, start_dp,
                      end_dp, step_dp, field_model_train,
                      field_model_val, criterion_split, path_assess_file,
                      stateCheckBox, model_path, type_model):
    """Grid-search a random forest over every segmentation shapefile.

    For each ``.shp`` file found in ``segs_path`` the training and validation
    samples are spatially joined with the segments, one forest is fitted per
    (trees, depth) combination and scored on the validation samples.  Every
    score is appended to the text report ``path_assess_file``; the best
    combination is reported at the end and, when ``stateCheckBox`` is truthy,
    refitted and applied to the whole best data set, which is written to
    ``model_path``.

    Args:
        path_train: vector file with the training samples.
        segs_path: directory holding the segmentation ``.shp`` files.
        path_val: vector file with the validation samples.
        start_est, end_est, step_est: ``n_estimators`` values are
            ``range(start, end + step, step)`` after ``int()`` conversion.
        start_dp, end_dp, step_dp: same construction for ``max_depth``.
        field_model_train: target column of the training samples.
        field_model_val: target column of the validation samples.
        criterion_split: ``criterion`` handed to the forest.
        path_assess_file: path of the text report (overwritten).
        stateCheckBox: truthy to apply the best model to the best data set.
        model_path: output vector file for the applied model.
        type_model: ``'classification'`` selects the classifier (scored with
            the first value returned by ``self.pontius2011``); any other
            value selects the regressor (scored with ``roc_auc_score``).

    Returns:
        ``0`` when a readability/CRS/join check fails or no model scored
        above 0; ``None`` on success.
    """
    # `== False` is kept on purpose: the helpers' return types are not
    # visible here and `not value` would treat a None return differently.
    if vector_is_readable(path_train, 'Error reading the ') == False:
        return 0

    dft = gpd.read_file(path_train)
    dfv = gpd.read_file(path_val)

    crsT = dft.crs['init']
    crsV = dfv.crs['init']
    if is_crs(crsT, crsV,
              'CRS are different - (Training and validation samples)') == False:
        return 0

    segs_names = [f for f in os.listdir(segs_path) if f.endswith('.shp')]
    best_parameters = {'Trees': 0, 'Depth': 0}
    # Feature columns of the best data set; segmentations may differ in
    # their attribute tables, so the last loop value must not be reused.
    best_features = None
    acuracia = 0.0
    is_classification = type_model == 'classification'
    trees_range = range(int(start_est), int(end_est) + int(step_est),
                        int(step_est))
    depth_range = range(int(start_dp), int(end_dp) + int(step_dp),
                        int(step_dp))

    # The report is opened once: reopening it with 'w' for every
    # segmentation would discard the rows of all previous data sets, and the
    # context manager closes it on every early return.
    with open(path_assess_file, 'w') as f_txt:
        if is_classification:
            f_txt.write('Dataset;Trees;Depth;PC;QD;QA;kappa' + '\n')
        else:
            f_txt.write('Dataset;Trees;Depth;AUC' + '\n')

        for seg in segs_names:
            self.dlg.ui.progressBar.setValue(1)
            seg_file = segs_path + os.sep + seg
            print(seg_file)
            if vector_is_readable(segs_path, 'Data set is not readable') == False:
                return 0
            dfs = gpd.read_file(seg_file)

            crsS = dfs.crs['init']
            if is_crs(crsT, crsS, 'CRS are different - (data set)') == False:
                return 0

            # Attach the segment attributes to the samples.
            dfjv = gpd.sjoin(dfv, dfs, how="inner", op='intersects')
            dfjt = gpd.sjoin(dft, dfs, how="inner", op='intersects')

            if 'id_seg' in dfs.columns:
                dfs = dfs.drop(columns=['geometry', 'id_seg'])
                # Keep one sample per segment.
                dfjv = dfjv.drop_duplicates(subset='id_seg')
                dfjt = dfjt.drop_duplicates(subset='id_seg')
            else:
                dfs = dfs.drop(columns=['geometry'])

            # Only float columns are used as model features.
            features = dfs.select_dtypes(include='float').columns

            dfjt = dfjt.dropna(subset=features)
            if is_join(dfjt.shape,
                       'Data set and training samples do not overlap or contains NaN') == False:
                return 0
            dfjv = dfjv.dropna(subset=features)
            print('validation (rows, cols): ', dfjv.shape)
            print('training (rows, cols): ', dfjt.shape)
            if is_join(dfjv.shape,
                       'Data set and validation samples do not overlap or contains NaN') == False:
                return 0

            for t in trees_range:
                # Qt expects an int; same ratio as 100 / (end_est / t).
                progress = int(100 * t / int(end_est)) if int(end_est) else 100
                self.dlg.ui.progressBar.setValue(progress)
                for md in depth_range:
                    if is_classification:
                        clf = ensemble.RandomForestClassifier(
                            n_estimators=t, max_depth=md,
                            criterion=criterion_split)
                    else:
                        clf = ensemble.RandomForestRegressor(
                            n_estimators=t, max_depth=md,
                            criterion=criterion_split)
                    modelTree = clf.fit(dfjt[features].values,
                                        dfjt[field_model_train])
                    predicted = modelTree.predict(dfjv[features].values)

                    if is_classification:
                        kappa = metrics.cohen_kappa_score(
                            dfjv[field_model_val], predicted)
                        pc, qd, qa, matrix = self.pontius2011(
                            dfjv[field_model_val], predicted)
                        # One value per header column (no empty field).
                        row = [seg, t, md, round(pc, 4), round(qd, 4),
                               round(qa, 4), round(kappa, 4)]
                        score = pc
                    else:
                        auc = metrics.roc_auc_score(dfjv[field_model_val],
                                                    predicted)
                        row = [seg, t, md, round(auc, 4)]
                        score = auc
                    f_txt.write(';'.join(str(value) for value in row) + '\n')

                    if score > acuracia:
                        acuracia = score
                        best_parameters['Trees'] = t
                        best_parameters['Depth'] = md
                        best_parameters['Dataset'] = seg
                        best_features = features

            self.dlg.ui.progressBar.setValue(100)
            # Release the per-segmentation frames before loading the next.
            del dfs, dfjv, dfjt

        self.dlg.ui.progressBar.setValue(50)

        if 'Dataset' not in best_parameters:
            # No segmentation file, or no combination scored above 0.
            f_txt.write('No model scored above 0 - no best parameters found' + '\n')
            print('Best parameters: ', best_parameters)
            self.dlg.ui.progressBar.setValue(100)
            return 0

        f_txt.write('############# Best Parameters #############' + '\n')
        f_txt.write('Data set: ' + best_parameters['Dataset'] + ' - ' + 'Trees: ' +
                    str(best_parameters['Trees']) +
                    ' - Depth:' + str(best_parameters['Depth']) + '\n')

        # ------------------------ classify best case ------------------------
        if bool(stateCheckBox):
            df_dataset = gpd.read_file(
                segs_path + os.sep + best_parameters['Dataset'])
            df_dataset = df_dataset.dropna(subset=best_features)

            dfjv = gpd.sjoin(dfv, df_dataset, how="inner", op='intersects')
            dfjt = gpd.sjoin(dft, df_dataset, how="inner", op='intersects')
            dfjt = dfjt.dropna(subset=best_features)
            dfjv = dfjv.dropna(subset=best_features)
            if 'id_seg' in df_dataset.columns:
                dfjv = dfjv.drop_duplicates(subset='id_seg')
                dfjt = dfjt.drop_duplicates(subset='id_seg')

            # NOTE(review): this branch follows the radio button while the
            # grid search above follows `type_model` - confirm they always
            # agree.
            if self.dlg.ui.radioButtonClass.isChecked():
                clf = ensemble.RandomForestClassifier(
                    n_estimators=best_parameters['Trees'],
                    max_depth=best_parameters['Depth'],
                    criterion=criterion_split)
                model = clf.fit(dfjt[best_features].values,
                                dfjt[field_model_train])
                clas = model.predict(dfjv[best_features].values)
                pc, qd, qa, matrix = self.pontius2011(
                    dfjv[field_model_val], clas)
                # Use the model refitted with the best parameters, not the
                # last model left over from the grid search.
                classification = model.predict(
                    df_dataset[best_features].values)
                df_dataset['classes'] = classification
                df_dataset[['geometry', 'classes']].to_file(model_path)
                f_txt.write('############# Confusion Matrix #############' + '\n')
                f_txt.write(str(matrix) + '\n')
            else:
                clf = ensemble.RandomForestRegressor(
                    n_estimators=best_parameters['Trees'],
                    max_depth=best_parameters['Depth'],
                    criterion=criterion_split)
                model = clf.fit(dfjt[best_features].values,
                                dfjt[field_model_train])
                regress = model.predict(df_dataset[best_features].values)
                df_dataset['values'] = regress
                df_dataset[['geometry', 'values']].to_file(model_path)

            f_txt.write('############# Features #############' + '\n')
            f_txt.write(str(best_features.tolist()) + '\n')
            f_txt.write('############# Features Importances #############' + '\n')
            f_txt.write(str(model.feature_importances_.tolist()) + '\n')

        print('Best parameters: ', best_parameters)
        self.dlg.ui.progressBar.setValue(100)
from sklearn import linear_model, ensemble
import pandas as pd
from paths import *
import numpy as np
from utility import *
from process import collect_data


@timer
def regressor(file, cols=0, model=None, is_tree=False):
    """Fit ``model`` on the data in ``file`` and print a small report.

    Args:
        file: data source handed to ``collect_data``.
        cols: column selection handed to ``collect_data``.
        model: estimator exposing ``fit``/``predict``/``score``; a fresh
            ``LinearRegression`` is created when omitted, so repeated calls
            never share (and refit) one default instance.
        is_tree: when true the per-predictor weights are read from
            ``feature_importances_`` instead of ``coef_``.

    Prints the number of observations and predictors, one weight per
    predictor, the in-sample mean squared error and ``model.score(X, y)``.
    """
    if model is None:
        model = linear_model.LinearRegression()
    X, y, pred = collect_data(file, cols)
    print('Observations: {}, Predictors: {}'.format(*X.shape))
    model.fit(X, y)
    error = (y - model.predict(X))**2
    print('Predictor\tImportance\n{}\t{}'.format('-' * 9, '-' * 11))
    if is_tree:
        imp = model.feature_importances_
    else:
        # coef_ is 1-D for a 1-D target and 2-D otherwise; take the first
        # target's coefficients in both cases.
        imp = np.atleast_2d(model.coef_)[0]
    for i, v in enumerate(pred):
        print('{: <11}\t{:+f}'.format(v, imp[i]))
    print('Mean squared error = {}\nCoefficient of prediction = {}'\
        .format(np.mean(error), model.score(X, y)))


regressor(p2_sample_b, [9, 5], ensemble.RandomForestRegressor(), True)
#regressor(p2_sample_b, [9], linear_model.LogisticRegression())
# Read the daily bike-sharing CSV as a list of string-field rows
# (the last character of every line, the newline, is cut off).
with open('../ml_data/bike_day.csv', 'r') as f:
    data = [line[:-1].split(',') for line in f.readlines()]

# Arrange the input and output sets: columns 2..12 are the inputs,
# the last column is the output.
day_header = np.array(data[0][2:13])
table = np.array(data[1:])
x = table[:, 2:13].astype('f8')
y = table[:, -1].astype('f8')

# Shuffle the data set, then split it 90/10 into training and test sets.
x, y = su.shuffle(x, y, random_state=7)
train_size = int(len(x) * 0.9)
train_x, test_x = x[:train_size], x[train_size:]
train_y, test_y = y[:train_size], y[train_size:]

# Train the random forest model.
model = se.RandomForestRegressor(
    max_depth=10,
    n_estimators=1000,
    min_samples_split=2,
)
model.fit(train_x, train_y)

# Print the R2 score of the test predictions.
pred_test_y = model.predict(test_x)
print(sm.r2_score(test_y, pred_test_y))
day_fi = model.feature_importances_

# Same loading steps for the hourly table (one more input column).
with open('../ml_data/bike_hour.csv', 'r') as f:
    data = [line[:-1].split(',') for line in f.readlines()]

# Arrange the input and output sets.
hour_header = np.array(data[0][2:14])
table = np.array(data[1:])
x = table[:, 2:14].astype('f8')
y = table[:, -1].astype('f8')
# Shuffle the data set, split into training and test sets
# Exploration and model-fitting steps for the house-price training frame.
# All helpers used here are defined outside this fragment.
features = ['Neighborhood']
viz_cat_cont_box(house_train, features, target)

# Explore the relationship of living area and total basement area to the
# sale price.
features = ['GrLivArea', 'TotalBsmtSF']
viz_cont_cont(house_train, features, target)

# NOTE(review): the return value is ignored, so filter_features presumably
# drops the listed columns in place - confirm against its definition.
filter_features(house_train, ['Id'])

# Explore the relation of all continuous features to the sale price.
corr = get_heat_map_corr(house_train)
get_target_corr(corr, 'SalePrice')
get_target_corr(corr, 'log_sale_price')

# One-hot encode all the categorical features.
print(get_categorical_columns(house_train))
house_train1 = one_hot_encode(house_train)
# Bare expression: shows the shape only in an interactive session.
house_train1.shape
house_train1.info()

# Remove both target columns from the encoded frame before it is used as the
# feature matrix; the log target is read from the un-encoded frame.
filter_features(house_train1, ['SalePrice', 'log_sale_price'])
X_train = house_train1
y_train = house_train['log_sale_price']

# Random forest with a fixed seed, tuned over the grid below by fit_model.
rf_estimator = ensemble.RandomForestRegressor(random_state=2017)
rf_grid = {
    'max_features': [9, 10, 11, 12, 15, 16],
    'n_estimators': [50, 100, 200]
}
model = fit_model(rf_estimator, rf_grid, X_train, y_train)
# Predictor columns; 'Radiation' is the regression target.
features = [
    'Latitude', 'Longitude', 'Altitude', 'Min Temp', 'Max Temp',
    'Mean Sunshine Hours'
]

# DataFrame.as_matrix and the np.float alias no longer exist in current
# pandas / NumPy; to_numpy(dtype=float) yields the same float arrays.
X = df[features].to_numpy(dtype=float)
# Kept two-dimensional (n, 1), exactly like as_matrix(['Radiation']).
y = df[['Radiation']].to_numpy(dtype=float)

from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)  # features scaled to unit variance

from sklearn import ensemble
clf = ensemble.RandomForestRegressor()
# No `scoring` is passed, so the values are the estimator's default score.
scores = cross_val_score(clf, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
'''
Accuracy: 0.46 (+/- 0.22)
np.argwhere(np.isnan(X))
df1=preprocessing.normalize(df)
from wordsegment import segment
wo = segment("how are you")
from nltk.corpus import words
english_vocab = set(w.lower() for w in words.words())
'''
def run_regression(X, y):
    """Fit a 1000-tree random forest regressor on ``(X, y)`` and return it."""
    forest = ensemble.RandomForestRegressor(n_estimators=1000)
    # fit() returns the estimator itself, so the fitted forest is returned.
    return forest.fit(X, y)
# Convert both frames to arrays.
trainData = dc.convertPandasDataFrameToNumpyArray(traindf)
testData = dc.convertPandasDataFrameToNumpyArray(testdf)

# Training rows: column 1 is the target, columns 2.. are the features.
# Test rows carry no target column, so the features start at column 1.
trainX = trainData[:, 2:]
trainY = trainData[:, 1]
testX = testData[:, 1:]

print("Data loaded")

# Log-loss scorer (lower is better) used by the disabled grid search below.
LL = make_scorer(log_loss, greater_is_better=False)

# {'max_depth': 11, 'n_estimators': 75}
model = ensemble.RandomForestRegressor(n_estimators=75,
                                       max_depth=11,
                                       random_state=0)
# fit() needs the training data; calling it without arguments raises
# TypeError before any prediction can be made.
model.fit(trainX, trainY)
"""
params = {"max_depth" : [9,10,11,12,13],
          "n_estimators" : [25, 35, 40, 50, 75],
         }
grid = GridSearchCV(model, param_grid=params, scoring=LL, n_jobs=-1, cv=3, verbose=20)
grid.fit(trainX, trainY)
print("Best parameters found by grid search:")
print(grid.best_params_)
print("Best CV score:")
print(grid.best_score_)
"""
yPred = model.predict(testX)
# Average the collected bagging predictions and report their error.
res = sum(np.asarray(res)) / 30
print("Bagging Decision Tree with 50 different trees: ",
      .05 * sum(res.reshape(-1, 1) - y)**2)

# # neigh = neighbors.KNeighborsRegressor(n_neighbors=4)
# # neigh.fit(xTr, yTr)
# # preds = neigh.predict(xValid)
# # #plt.plot(preds,'r')
# # #plt.plot(yValid,'b')
# # #plt.show()
# # #plt.savefig('Bagging_Tree.png')

# Sweep the forest size from 1 to 100 trees and record the validation error.
# NOTE(review): `sum(d)**2` squares the *summed* residuals, so positive and
# negative errors cancel; a sum of squared residuals was probably intended.
# The stored series uses a factor 0.5 while the log uses .05 - confirm which
# one is meant before changing either.
random_forest_eps = []
for i in range(100):
    n_trees = i + 1
    f = ensemble.RandomForestRegressor(n_estimators=n_trees, max_features=2)
    f.fit(xTr, yTr)
    preds = f.predict(xValid)
    preds = preds.reshape(-1, 1)
    random_forest_eps.append(0.5 * sum(preds - yValid)**2)
    # Report the number of trees actually used (i + 1, not i).
    print("Random Forest with " + str(n_trees) + " trees: ",
          .05 * sum(preds - yValid)**2)
    # if .05*sum(preds-y)**2 < 0.00005:
    #     break

red_patch = mpatches.Patch(color='red', label='Decision Tree Errors')
blue_patch = mpatches.Patch(color='blue', label='KNN errors')
green_patch = mpatches.Patch(color='green', label='Random Forest Errors')
plt.legend(handles=[red_patch, blue_patch, green_patch])
plt.xlabel("Number of Parameters")
def train_sys(data_path, data_dir1, train_rate, LR):
    """Fit four regressors per data set and plot them against stored results.

    For every entry of ``data_dir1`` the features and the ``'sys_bp'`` target
    returned by ``input_data_pca`` are used to fit an SVR, a linear
    regression, a random forest and a gradient-boosting model.  Their test
    predictions are mapped back with the ``'min_max_scaler_SYS'`` scaler,
    low-pass filtered and plotted together with the predictions stored in
    ``./sysdata/data12.txt`` and the real values.

    Args:
        data_path: base path handed to ``input_data_pca``.
        data_dir1: iterable of data-set identifiers.
        train_rate: training ratio handed to ``input_data_pca``.
        LR: unused in this function; kept for interface compatibility.
    """
    for data in data_dir1:
        train_data, test_data, transform_data = input_data_pca(
            data_path, data, train_rate)
        standard_scaler_SYS = transform_data['min_max_scaler_SYS']

        X_train = train_data['features']
        Y_train = train_data['sys_bp'].reshape(-1)
        X_test = test_data['features']
        Y_test = test_data['sys_bp'].reshape(-1)

        def unscale(values):
            """Undo the target scaling and flatten to one dimension."""
            return standard_scaler_SYS.inverse_transform(
                values.reshape(-1, 1)).reshape(-1)

        # (legend label, plot style, estimator); all four are fitted first,
        # in this order, before any prediction is made.
        candidates = [
            ('svr', 'g-*', SVR(kernel='rbf', C=100, gamma=0.1)),
            ('lr', 'b-v', linear_model.LinearRegression()),
            # 20 decision trees are used here
            ('rf', 'c-h', ensemble.RandomForestRegressor(n_estimators=20)),
            ('gbrt', 'm-d',
             ensemble.GradientBoostingRegressor(n_estimators=100)),
        ]
        fitted = [(label, style, estimator.fit(X_train, Y_train))
                  for label, style, estimator in candidates]
        curves = [(label, style, unscale(model.predict(X_test)))
                  for label, style, model in fitted]

        # An earlier revision graded the errors here (share of absolute
        # errors within 5/10/15, counters a/b/c, and an `aami` counter for
        # mean <= 5 and std <= 8); that code was already disabled.

        # Data needed for the plot below.
        # SECURITY: pickle executes arbitrary code while loading - only read
        # this file when it comes from a trusted source.
        with open(os.path.join('./sysdata/', 'data12.txt'), 'rb') as f:
            stored = pickle.loads(f.read())
        pred_lstm_sys = np.array(stored["pred"]).reshape(-1)
        real_sys = unscale(Y_test)

        curves.append(('lstm', 'k-o', pred_lstm_sys))
        curves.append(('sys', 'r-D', real_sys))

        # NOTE(review): every series is reversed when the stored prediction
        # is shorter than the test set - confirm this is the intended
        # alignment.
        if len(pred_lstm_sys) < len(real_sys):
            curves = [(label, style, values[::-1])
                      for label, style, values in curves]

        # Zero-phase 3rd-order low-pass filter applied to every series.
        b, a = signal.butter(3, 0.5, 'low')
        plot_args = []
        for _, style, values in curves:
            plot_args.extend([signal.filtfilt(b, a, values), style])

        plt.plot(*plot_args)
        plt.legend([label for label, _, _ in curves])
        plt.ylabel(u'收缩压')
        plt.xlabel(u'心搏周期')
        plt.show()
# Parse every record: field 0 is the composition, field 8 the target value.
for line in trainFile:
    split = line.split(',')
    material = Composition(split[0])
    materials.append(material)
    tc.append(float(split[8]))

# Baseline: mean absolute deviation of the targets from their mean.
error = mean(abs(mean(tc) - tc))
print("MAE: {} K".format(round(error, 3)))

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn import linear_model, metrics, ensemble

# Random forest regression from sklearn
rfr = ensemble.RandomForestRegressor(n_estimators=10)

# Perform cross-validation: 10 random splits with 10% held out each time
cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

# Eleven independent, initially empty lists.
(pf1, pf2, pf3, pf4, pf5, pf6,
 pf7, pf8, pf9, pf10, pf11) = ([] for _ in range(11))
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# (The line above closes an ArgumentParser(...) call opened outside this
# fragment.)
parser.add_argument('configfn',
                    metavar='CONFIGFN',
                    type=str,
                    help="Settings for regression model training (.json)")
parser.add_argument('trainvecsfn',
                    metavar='TRAINVECSFN',
                    type=str,
                    help="Training vectors (.txt)")
args = parser.parse_args()

# Training settings come from the JSON file given on the command line.
with open(args.configfn) as infh:
    config = json.load(infh)

# Each row: feature columns followed by two targets (height, then slope).
trainvecs = np.loadtxt(args.trainvecsfn)
X = trainvecs[:, :-2]
height = trainvecs[:, -2]
slope = trainvecs[:, -1]

# One forest per target; only min_samples_leaf differs between the two.
clf = ensemble.RandomForestRegressor(
    n_estimators=config["n_estimators"],
    min_samples_leaf=config["minsamples_height"])
clf = clf.fit(X, height)
# NOTE(review): ttslab.tofile presumably serialises the fitted model to the
# given file name - confirm against ttslab.
ttslab.tofile(clf, "pitch_height_model.pickle")

clf = ensemble.RandomForestRegressor(
    n_estimators=config["n_estimators"],
    min_samples_leaf=config["minsamples_slope"])
clf = clf.fit(X, slope)
ttslab.tofile(clf, "pitch_slope_model.pickle")
import sklearn.utils as su
import sklearn.tree as st
import sklearn.ensemble as se
import sklearn.metrics as sm

# Load the data.
boston = sd.load_boston()

# Random seed used to shuffle the samples.
random_seed = 7
x, y = su.shuffle(
    boston.data,    # 13 features
    boston.target,  # labels: median house price
    random_state=random_seed,
)

# First 80% of the shuffled samples train the model, the rest test it.
train_size = int(len(x) * 0.8)
train_x, test_x = x[:train_size], x[train_size:]  # inputs
train_y, test_y = y[:train_size], y[train_size:]  # outputs

# Define the model: maximum depth, number of trees, minimum samples to split.
forest_settings = {
    'max_depth': 10,
    'n_estimators': 1000,
    'min_samples_split': 2,
}
model = se.RandomForestRegressor(**forest_settings)
model.fit(train_x, train_y)  # train

pred_test_y = model.predict(test_x)  # predict
r2 = sm.r2_score(test_y, pred_test_y)
print('r2:', r2)  # author's note: improved from 0.8 to 0.92
def random_forest(x_train, y_train, x_test):
    """Fit a 10-tree random forest regressor and predict ``x_test``."""
    forest = ensemble.RandomForestRegressor(n_estimators=10)
    # fit() returns the fitted estimator, so the calls can be chained.
    return forest.fit(x_train, y_train).predict(x_test)
from sklearn import ensemble

# Registry of default-configured regressors, keyed by short name:
# an extra-trees regressor and a random forest regressor.
MODELS = dict(
    extraTrees=ensemble.ExtraTreesRegressor(),
    randomForest=ensemble.RandomForestRegressor(),
)
# plot the important features # fig, ax = plt.subplots(figsize=(12, 18)) xgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax) plt.show() # Categorical occupy the top spots followed by binary variables. # # Let us also build a Random Forest model and check the important variables. # In[ ]: from sklearn import ensemble model = ensemble.RandomForestRegressor(n_estimators=200, max_depth=10, min_samples_leaf=4, max_features=0.2, n_jobs=-1, random_state=0) model.fit(train_X, train_y) feat_names = train_X.columns.values ## plot the importances ## importances = model.feature_importances_ std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0) indices = np.argsort(importances)[::-1][:20] plt.figure(figsize=(12, 12)) plt.title("Feature importances") plt.bar(range(len(indices)), importances[indices], color="r", align="center") plt.xticks(range(len(indices)), feat_names[indices], rotation='vertical') plt.xlim([-1, len(indices)])
# Support vector regressors with two different kernels.
for kernel in ('rbf', 'sigmoid'):
    testRegressor(train, svm.SVR(kernel=kernel), target, 'SVM ' + kernel)

# Nearest neighbors
for k in (1, 2, 3, 4, 8, 16, 32):
    testRegressor(train,
                  neighbors.KNeighborsRegressor(n_neighbors=k),
                  target,
                  'NearestNeighbor %d' % k)

# Gaussian process
# testRegressor( train, gaussian_process.GaussianProcess(), target, 'Gaussian process' )

# Regression trees: a single tree, a 1000-tree forest and extra trees.
tree_regressors = (
    (tree.DecisionTreeRegressor(), 'Regression tree'),
    (ensemble.RandomForestRegressor(n_estimators=1000),
     'RandomForestRegressor'),
    (ensemble.ExtraTreesRegressor(), 'ExtraTreesRegressor'),
)
for regressor_instance, title in tree_regressors:
    testRegressor(train, regressor_instance, target, title)

# Gradient tree Boosting
#testRegressor( train, ensemble.GradientBoostingRegressor(loss='ls'), target, 'Gradient tree boosting' )
def _fuse1_make_model(use_forest):
    """Return a fresh regressor: the fixed-seed forest or a linear model."""
    if use_forest:
        return ensemble.RandomForestRegressor(n_estimators=100,
                                              max_depth=32,
                                              verbose=0,
                                              max_features=0.33,
                                              random_state=99,
                                              n_jobs=-1)
    return LinearRegression()


def _fuse1_stack_bsif(rows, truncate, extra):
    """Stack the column-0 vectors, optionally cut to 20 values and extended.

    ``extra`` is a column slice whose values are appended to every vector,
    or ``None`` to use the column-0 vector alone.
    """
    stacked = []
    for row in rows:
        vector = row[0][0:20] if truncate else row[0]
        if extra is not None:
            vector = np.append(vector, row[extra])
        stacked.append(vector)
    return np.vstack(stacked)


def _fuse1_predict(model, inputs, take_median):
    """Predict all rows; reduce them to their median when requested."""
    predicted = model.predict(inputs)
    return np.median(predicted) if take_median else predicted


def fuse1(box, label, leaveout_lot, RF, BSIF, HAF, INST):
    """Leave-one-lot-out prediction of stress, strain and slope.

    Rows whose ``'lots'`` value equals ``leaveout_lot`` form the test set,
    all other rows the training set.  One model per target is fitted and the
    predictions are compared with the targets of the first test row.

    Args:
        box: frame used when ``BSIF`` is true; targets in columns 4, 5, 6.
        label: frame used otherwise; targets in columns 13, 14, 15.
        leaveout_lot: lot value held out for testing.
        RF: ``True`` selects random forests (and cuts the column-0 vector to
            its first 20 values); otherwise linear regression is used.
        BSIF, HAF, INST: feature-group switches.  With ``BSIF`` the column-0
            vector is used, extended by columns 7:19 (HAF and INST), 7:12
            (HAF only) or 12:19 (INST only).  Without ``BSIF`` the features
            are columns 1:13, 8:13 or 1:8 respectively.

    Returns:
        A one-row ``DataFrame`` indexed by ``leaveout_lot`` holding, for each
        target, the actual value, the prediction, the absolute error and the
        percentage error, each rounded to 2 decimals.

    Raises:
        ValueError: if none of ``BSIF``, ``HAF`` and ``INST`` is set (this
            used to surface as a ``NameError``).
    """
    # `== True` is kept so that truthy non-boolean flags behave as before.
    use_forest = RF == True
    patch_mode = BSIF == True

    models = [_fuse1_make_model(use_forest) for _ in range(3)]
    sys.stdout.flush()

    if patch_mode:
        data = box
        target_cols = (4, 5, 6)
    else:
        data = label
        target_cols = (13, 14, 15)

    training = data.values[data['lots'] != leaveout_lot]
    testing = data.values[data['lots'] == leaveout_lot]

    if BSIF:
        if HAF and INST:
            extra = slice(7, 19)
        elif HAF:
            extra = slice(7, 12)
        elif INST:
            extra = slice(12, 19)
        else:
            extra = None
        X = _fuse1_stack_bsif(training, use_forest, extra)
        X_ = _fuse1_stack_bsif(testing, use_forest, extra)
    else:
        if HAF and INST:
            columns = slice(1, 13)
        elif HAF:
            columns = slice(8, 13)
        elif INST:
            columns = slice(1, 8)
        else:
            raise ValueError(
                'at least one of BSIF, HAF and INST must be enabled')
        X = np.vstack(training[:, columns])
        X_ = np.vstack(testing[:, columns])

    # Fit all three models before predicting, as before.
    for model, col in zip(models, target_cols):
        model.fit(X, training[:, col])

    # With BSIF the test lot contributes several rows; their median is the
    # lot-level prediction.  The median is taken over the predictions only -
    # the previous accumulator started with a 0 that was included in it.
    names = ('Stress', 'Strain', 'Slope')
    temp = {}
    for index, (name, model, col) in enumerate(zip(names, models, target_cols)):
        actual = testing[0, col]
        predicted = _fuse1_predict(model, X_, patch_mode)
        error = abs(actual - predicted)
        p_error = 100 * error / actual
        # Round to 2 decimal places
        temp[name] = {leaveout_lot: np.around(actual, 2)}
        temp['Predicted ' + name] = {leaveout_lot: np.around(predicted, 2)}
        temp['Absolute error%d' % index] = {leaveout_lot: np.around(error, 2)}
        temp['%%Error%d' % index] = {leaveout_lot: np.around(p_error, 2)}

    info = pd.DataFrame.from_dict(temp)
    return info
def test_RandomForestRegressor(*args):
    """Fit a default random forest regressor and print both scores.

    ``args`` must unpack to ``X_train, X_test, y_train, y_test``.
    """
    X_train, X_test, y_train, y_test = args
    regr = ensemble.RandomForestRegressor().fit(X_train, y_train)
    train_score = regr.score(X_train, y_train)
    print("training score:%f" % train_score)
    test_score = regr.score(X_test, y_test)
    print("testing score: %f" % test_score)
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn import neighbors
from sklearn import ensemble

# 2. Linear regression
model_linear_regression = LinearRegression()

# 3. SVM regression
model_svm = svm.SVR()

# 4. kNN regression
model_k_neighbor = neighbors.KNeighborsRegressor()

# 5. Random forest regression (20 decision trees)
model_random_forest_regressor = ensemble.RandomForestRegressor(
    n_estimators=20)

# 6. AdaBoost regression (50 decision trees)
model_adaboost_regressor = ensemble.AdaBoostRegressor(n_estimators=50)

# 7. GBRT regression (100 decision trees)
model_gradient_boosting_regressor = ensemble.GradientBoostingRegressor(
    n_estimators=100)

# 8. Bagging regression
model_bagging_regressor = ensemble.BaggingRegressor()
from scipy import stats
from sklearn import ensemble
from . import base

# One estimator per task key; "cl" and "mcl" each get their own classifier
# instance, "rg" gets a regressor.
estimator = {
    "cl": ensemble.RandomForestClassifier(),
    "rg": ensemble.RandomForestRegressor(),
    "mcl": ensemble.RandomForestClassifier(),
}

# Discrete candidate values for an exhaustive search.
grid = {
    "max_depth": [2, 4, 10],
    "n_estimators": [100, 200],
    "max_features": ["auto", "sqrt"],
    "min_samples_leaf": [0.01, 0.05],
}

# Candidates for a sampled search: depths 2..10 and a uniform distribution
# for min_samples_leaf.
distributions = {
    "max_depth": [*range(2, 11)],
    "n_estimators": [100, 200],
    "max_features": ["auto", "sqrt"],
    "min_samples_leaf": stats.uniform(0.01, 0.1),
}

config = base.TunerConfig(estimator, distributions, distributions, grid, grid)
# Timed comparison of several regressors on the same hold-out split.
# NOTE: despite their names, SVMmse / KNNmse / RFmse hold predictions; the
# RMSE is computed inside each print call.
print("SVM_RMSE:", np.sqrt(mean_squared_error(y_crosstest, SVMmse)))
print(datetime.now() - start)

# k-nearest neighbours on the `_scl` feature matrices.
start = datetime.now()
from sklearn.neighbors import KNeighborsRegressor
Model_two = KNeighborsRegressor(n_neighbors=11)
Model_two.fit(X_crosstrain_scl, y_crosstrain)
KNNmse = Model_two.predict(X_crosstest_scl)
print("KNN_RMSE:", np.sqrt(mean_squared_error(y_crosstest, KNNmse)))
print(datetime.now() - start)

# Random forest; unlike the other models it is trained on the `_svd`
# feature matrices.
start = datetime.now()
from sklearn import ensemble
Model_three = ensemble.RandomForestRegressor(n_estimators=500,
                                             verbose=1,
                                             n_jobs=-1,
                                             random_state=120,
                                             max_depth=16)
Model_three.fit(X_crosstrain_svd, y_crosstrain)
RFmse = Model_three.predict(X_crosstest_svd)
print("RandomForest_RMSE:", np.sqrt(mean_squared_error(y_crosstest, RFmse)))
print(datetime.now() - start)

# Bayesian ridge regression on the `_scl` feature matrices.
start = datetime.now()
from sklearn.linear_model import BayesianRidge
BR = BayesianRidge(n_iter=500, tol=0.001,
                   normalize=True).fit(X_crosstrain_scl, y_crosstrain)
pred_BR = BR.predict(X_crosstest_scl)
print("BayesinRidge_RMSE:", np.sqrt(mean_squared_error(y_crosstest, pred_BR)))
print(datetime.now() - start)
def process():
    """Train on the training table, predict the test table, write a CSV.

    The training data comes from ``read_csv()``, the test data from
    ``path_test``; both go through ``DataProcess`` and the labels through
    ``LabelProcess``.  A 2-tree random forest is fitted and, for every user
    (first column of the test indices), the largest prediction is written to
    ``test.csv`` under ``path_test_out`` with the header ``Id, Pred``.
    Timings of every stage and an RFE feature ranking are printed.
    """
    # Default 0.0: a user's output never drops below zero.
    #output = defaultdict(lambda:500.0)
    output = defaultdict(float)

    data = pd.DataFrame(read_csv())
    dataPreBegin = time.time()
    trainData, trainDataIndices = DataProcess(data)
    print("[+]Data Preprocess:" + str(round(time.time() - dataPreBegin, 4)))

    labelPreBegin = time.time()
    trainLabel = LabelProcess(data)
    print("[+]Label Preprocess:" + str(round(time.time() - labelPreBegin, 4)))

    data = pd.read_csv(path_test)
    testDataBegin = time.time()
    testData, testDataIndices = DataProcess(data)
    print("[+]TestData:" + str(round(time.time() - testDataBegin, 4)))

    modelBegin = time.time()
    scaler = StandardScaler()
    # NOTE(review): the scaler is fitted on the *test* features and then
    # applied to both sets; fitting on the training features is the usual
    # practice - confirm this is intentional before changing it.
    scaler.fit(testData)
    trainData = scaler.transform(trainData)
    testData = scaler.transform(testData)

    # Earlier experiments (a 22-tree forest, MLPRegressor, SVR, gradient
    # boosting, LightGBM in 'rf' mode, XGBoost, AdaBoost, RFE over gradient
    # boosting / random forest) were already disabled and are not kept here.
    model = ensemble.RandomForestRegressor(n_estimators=2, n_jobs=2)

    # Feature ranking for information only; the model below still uses all
    # features.  n_features_to_select must be passed by keyword.
    estimator = LinearRegression(n_jobs=2)
    selector = RFE(estimator, n_features_to_select=10, step=1)
    selector = selector.fit(trainData, trainLabel)
    print("[+]RF" + str(selector.ranking_))

    model.fit(trainData, trainLabel)
    predict = model.predict(testData)
    print("[+]Model:" + str(round(time.time() - modelBegin, 4)))

    outBegin = time.time()
    # Keep the largest prediction seen for every user.
    for i in range(testData.shape[0]):
        user = testDataIndices.iloc[i, 0]
        if output[user] < predict[i]:
            output[user] = predict[i]

    # newline="" lets the csv module control line endings (otherwise blank
    # rows appear on Windows).
    with open(os.path.join(path_test_out, "test.csv"), mode="w",
              newline="") as outer:
        writer = csv.writer(outer)
        writer.writerow(["Id", "Pred"])
        for user_id, pred in output.items():
            #pred = 0 if pred < 0.5 else pred
            writer.writerow([user_id, pred])
    print("[+]Output:" + str(round(time.time() - outBegin, 4)))
def train_model(self, model_data, tar, var, algorithm, mod_type,
                train_start, train_end, val_start, val_end):
    """Fit a regression model on the training period minus the validation span.

    Args:
        model_data: DataFrame holding feature and target columns. It is
            sliced by label with ``.loc[train_start:train_end]``.
        tar: Label of the target column (a list of labels is only supported
            if it contains exactly one, because the comparison frame is
            given a single column name).
        var: List of base feature column names. It is not modified.
        algorithm: ``1`` for ``linear_model.LinearRegression``, ``2`` for
            ``ensemble.RandomForestRegressor``.
        mod_type: ``1`` uses ``var`` without ``'hdh'`` and ``'cdh'``;
            ``2`` uses ``var`` plus dummy columns generated from the
            ``"TOD"``, ``"DOW"`` and ``"MONTH"`` columns of ``model_data``.
        train_start: First label of the training slice.
        train_end: Last label of the training slice.
        val_start: First bound of the validation span removed from the
            training slice.
        val_end: Last bound of the validation span removed from the
            training slice.

    Returns:
        dict with keys ``"model"``, ``"data_train"``, ``"target_train"``,
        ``"model_data"`` (including any dummy columns added here),
        ``"model_col"``, ``"score"`` (``model.score`` on the training
        data), ``"target_modeled_train"`` and ``"compare"`` (actual versus
        predicted target, side by side).

    Raises:
        ValueError: if ``mod_type`` or ``algorithm`` is not 1 or 2.
    """
    # Validate the selectors before doing any work; previously an unknown
    # value surfaced later as a NameError on model_col / clf.
    if mod_type not in (1, 2):
        raise ValueError("mod_type must be 1 or 2, got %r" % (mod_type,))
    if algorithm not in (1, 2):
        raise ValueError("algorithm must be 1 or 2, got %r" % (algorithm,))

    if mod_type == 1:
        # 1) Simple model with flat internal load: drop the degree-hour
        # columns. (The old `var.remove['hdh', 'cdh']` subscripted a bound
        # method and always raised TypeError.)
        model_col = [name for name in var if name not in ('hdh', 'cdh')]
    else:
        # 2) Model with internal gain profile by time of day, day of week
        # and month: expand each into dummy columns and add them both to the
        # data and to the regressor list.
        model_col = list(var)
        for source_col in ("TOD", "DOW", "MONTH"):
            add_var = pd.get_dummies(model_data[source_col],
                                     prefix=source_col + "_")
            model_data = model_data.join(add_var)
            model_col = model_col + add_var.columns.tolist()

    # Slice the training period, then remove the validation interval.
    data_train = model_data.loc[train_start:train_end, model_col]
    data_train = data_train.drop(data_train[val_start:val_end].index)

    target_train = model_data.loc[train_start:train_end, tar]
    target_train = target_train.drop(target_train[val_start:val_end].index)

    if algorithm == 1:
        clf = linear_model.LinearRegression()
    else:
        clf = ensemble.RandomForestRegressor()
    model = clf.fit(data_train, target_train)

    # Predict on the training data and clamp negative (energy) values.
    target_modeled_train = model.predict(data_train)
    target_modeled_train = remove_negative(target_modeled_train)

    # Actual and predicted target side by side.
    compare = pd.DataFrame(target_train)
    compare.columns = ["target_actual"]
    compare["target_predicted"] = target_modeled_train

    score = model.score(data_train, target_train)

    return {"model": model,
            "data_train": data_train,
            "target_train": target_train,
            "model_data": model_data,
            "model_col": model_col,
            "score": score,
            "target_modeled_train": target_modeled_train,
            "compare": compare}
def missed_customers():
    """
    Returns a list of tuples of the customer name, the prediction, and the
    actual amount that the customer has bought.

    For every customer in the frame returned by ``get_dataframe()`` that has
    exactly one ``Sales`` row for the current month and exactly one for the
    previous month, a random forest is fitted on that customer's history and
    used to predict both months. The customer is reported when
    ``predict_heuristic`` returns a truthy value for those predictions and
    the two actual sales figures.

    Each tuple is ``(customer, current-month prediction, current-month
    actual sales)``. Customers appear in order of first occurrence.
    """
    raw = get_dataframe()
    vec = DictVectorizer()

    today = datetime.date.today()
    currentMonth = today.month
    currentYear = today.year
    last_month_end = today.replace(day=1) - datetime.timedelta(days=1)
    lastMonth = last_month_end.month
    lastMonthYear = last_month_end.year

    def single_sale(customer, year, month):
        """Return the one Sales value for the period, or None if not unique."""
        matches = raw.loc[
            (raw['CustomerName'] == customer) &
            (raw['Year'] == year) &
            (raw['Month'] == month),
            'Sales'
        ]
        # float(Series) used to do this check implicitly by raising
        # TypeError; that call is deprecated in pandas 2.1+, so test the
        # length explicitly and convert the scalar instead.
        if len(matches) != 1:
            return None
        try:
            return float(matches.iloc[0])
        except TypeError:
            return None

    results = []

    # unique() keeps first-occurrence order, so the result order is stable
    # (iterating a set of names was not).
    for customer in raw['CustomerName'].unique().tolist():
        current_month_sales = single_sale(customer, currentYear, currentMonth)
        if current_month_sales is None:
            # No (unique) sales in the target month: move on.
            continue
        last_month_sales = single_sale(customer, lastMonthYear, lastMonth)
        if last_month_sales is None:
            continue

        # Build the per-customer training data without mutating a slice of
        # the shared frame.
        history = raw.loc[raw['CustomerName'] == customer]
        targets = history['Sales']
        features = history.drop(
            ['CustomerName', 'Sales', 'PreviousMonthSales'], axis=1)

        X_train, X_test, y_train, y_test = train_test_split(
            features, targets, train_size=0.8, random_state=42)

        records = features.to_dict(orient="records")
        vec_data = vec.fit_transform(records).toarray()

        # Fit the regressor on the full history, using all available cores.
        regressor = ensemble.RandomForestRegressor(
            n_jobs=-1, oob_score=True, max_features=0.33)
        regressor.fit(vec_data, targets)
        r2_eval2(regressor, X_train, X_test, y_train, y_test)

        # Predict the previous and the current month.
        previous_predict = regressor.predict(vec.transform({
            'Year': lastMonthYear,
            'Month': lastMonth
        }).toarray())[0]
        month_predict = regressor.predict(vec.transform({
            'Year': currentYear,
            'Month': currentMonth
        }).toarray())[0]

        # NOTE(review): the original locals were named the other way round
        # ("actual_previous_value" held the current month's sales and
        # "actual_value" the previous month's). The positional order passed
        # to predict_heuristic is kept exactly as before; confirm it matches
        # that function's parameter order.
        if predict_heuristic(previous_predict, month_predict,
                             current_month_sales, last_month_sales):
            results.append((customer, month_predict, current_month_sales))

    return results
# NOTE(review): this assignment is the last statement of a loop whose header
# lies before this chunk (presumably the training-set twin of the loop
# below); it is reproduced unchanged.
train[i] = pd.factorize(train[i])[0]

# Factorize categorical (object-dtype) columns in the test set.
for i in test.columns:
    if test[i].dtype == 'object':
        # Function-call form works on Python 2 and 3; the bare
        # `print i` statement is a SyntaxError on Python 3.
        print(i)
        test[i] = pd.factorize(test[i])[0]

# Fill all NaN values with -1.
train = train.fillna(-1)
test = test.fillna(-1)

# Generate labels (two target columns) and drop them from the training set.
labels = np.array(train[['LATF', 'LONGF']])
train = train.drop(['LATF', 'LONGF'], axis=1)

# Convert both sets to float arrays for scikit-learn.
train = np.array(train).astype(float)
test = np.array(test).astype(float)

# Fit a multi-output random forest and predict both targets at once.
clf = ensemble.RandomForestRegressor(n_jobs=-1, n_estimators=100)
clf.fit(train, labels)
preds = clf.predict(test)

# Write predictions to file.
# NOTE(review): labels are ordered ['LATF', 'LONGF'], so preds[:, 0] is the
# LATF prediction, yet it is written to LONGITUDE (and vice versa). This may
# compensate for how LATF/LONGF were derived upstream -- verify before
# changing.
sample['LATITUDE'] = preds[:, 1]
sample['LONGITUDE'] = preds[:, 0]
sample.to_csv('benchmark.csv', index=False)
plot_learning_curve(estimator, title, X_train, Y_train)
plt.show()

# ---------------------------------------------------------------------------
# Support Vector Regression
# ---------------------------------------------------------------------------
svr = svm.SVR(kernel='linear').fit(X_train, Y_train)
for _caption, _features, _targets in (
        ("Training Score SVR: ", X_train, Y_train),
        ("Test Score SVR : ", X_test, Y_test)):
    print(_caption, str(svr.score(_features, _targets)))

# ---------------------------------------------------------------------------
# Random Forest Regression
# ---------------------------------------------------------------------------
# 30 trees, with out-of-bag estimation enabled.
rf = ensemble.RandomForestRegressor(n_estimators=30, oob_score=True)
rf = rf.fit(train, target)
print("Training Score RandomForest: ", str(rf.score(train, target)))
print("OOB Score RandomForest: ", str(rf.oob_score_))

# ---------------------------------------------------------------------------
# Improvements
# ---------------------------------------------------------------------------
# Look for the most influential parameter of the model, using the two
# algorithms already applied above: linear regression and random forest.
def param_import():
    col = list(train.columns.values)
    # First, find the coefficients of the linear regression.
    # NOTE(review): the rest of this function lies beyond this chunk.
# Load the model libraries used below.
from sklearn import (
    ensemble,
    linear_model,
    neighbors,
    svm,
    tree,
)
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import ExtraTreeRegressor

# One default-configured regressor per algorithm; only the three ensemble
# models from `sklearn.ensemble` set an explicit number of estimators.

# 1. Linear regression
model_LinearRegression = linear_model.LinearRegression()

# 2. Decision tree regression
model_DecisionTreeRegressor = tree.DecisionTreeRegressor()

# 3. Support vector machine regression
model_SVR = svm.SVR()

# 4. K-nearest-neighbours regression
model_KNeighborsRegressor = neighbors.KNeighborsRegressor()

# 5. Random forest regression
model_RandomForestRegressor = ensemble.RandomForestRegressor(n_estimators=20)

# 6. AdaBoost regression
model_AdaBoostRegressor = ensemble.AdaBoostRegressor(n_estimators=50)

# 7. Gradient boosting regression
model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(
    n_estimators=100)

# 8. Bagging regression
model_BaggingRegressor = BaggingRegressor()

# 9. ExtraTree regression
model_ExtraTreeRegressor = ExtraTreeRegressor()

# Read the .mat file and convert it to a .csv file
"name": "GBR", "model": ensemble.GradientBoostingRegressor(max_features=0.1, n_estimators=2100, max_depth=6, min_samples_leaf=1, learning_rate=0.02) }) # Random Forest estimators.append({ "name": "RF", "model": ensemble.RandomForestRegressor(max_features=0.1, n_estimators=512) }) # Support Vector Machine estimators.append({ "name": "SVR", "model": svm.SVR(cache_size=1000, kernel='poly', C=1, gamma=0.1, epsilon=0.1, degree=3) })
# Level-2 random forest blends.
#
# Each row describes one model that is fitted and blended through
# blend_proba():
#   (estimator class, blend category, output filename,
#    n_estimators multiplier applied to nET, max_features, max_depth)
# The blend tag is "<max_features>_<max_depth>". Classifiers additionally use
# the entropy split criterion.
_rf_blend_configs = (
    # Level 2 Score:
    (ensemble.RandomForestClassifier, "classifier", "RFC", 1, 50, 37),
    # Level 2 Score:
    (ensemble.RandomForestClassifier, "classifier", "RFC", 1, 60, 45),
    # Level 2 Score:
    (ensemble.RandomForestRegressor, "regressor", "RFR", 2, 10, 8),
    # Level 2 Score:
    (ensemble.RandomForestRegressor, "regressor", "RFR", 1.75, 20, 15),
    # Level 2 Score:
    (ensemble.RandomForestRegressor, "regressor", "RFR", 1.5, 30, 23),
)

for (_rf_class, _category, _filename,
     _trees_scale, _max_features, _max_depth) in _rf_blend_configs:
    _extra = {'criterion': 'entropy'} if _category == "classifier" else {}
    clf = _rf_class(
        # scikit-learn requires an integer tree count; nET*1.75 and nET*1.5
        # produced floats, so truncate explicitly.
        n_estimators=int(nET * _trees_scale),
        max_features=_max_features,
        max_depth=_max_depth,
        n_jobs=-1,
        random_state=rnd,
        verbose=0,
        **_extra
    )
    # model_sum is overwritten on every pass, as before: after the loop it
    # holds the result of the last configuration.
    model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test,
                            nfolds=5, seed=rnd, category=_category,
                            filename=_filename, setused=setused,
                            tag='%d_%d' % (_max_features, _max_depth))
import os

from sklearn import datasets
from sklearn import ensemble as en
from sklearn import model_selection as ms
from sklearn import externals as ex

# scikit-learn removed its bundled copy of joblib (sklearn.externals.joblib)
# in 0.23; prefer the standalone package and fall back to the bundled copy
# only on old installations that lack it.
try:
    import joblib
except ImportError:
    joblib = ex.joblib

# Load the iris data and split it into train and test partitions
# (train_test_split defaults; no fixed random_state, so splits vary by run).
iris_dataset = datasets.load_iris()
X = iris_dataset.data
y = iris_dataset.target
data = ms.train_test_split(X, y)
X_train, X_test, y_train, y_test = data

print("model: random forest")
model = en.RandomForestRegressor(max_depth=6)
model.fit(X_train, y_train)

# Save files into the /task location while the
# task is running to ensure you save all files
# If not present, save in project root
if os.path.isdir("/task"):
    model_path = os.path.join("/task", "model.pkl")
else:
    model_path = "model.pkl"

joblib.dump(model, model_path)

# NOTE: for a regressor, score() returns the R^2 coefficient of
# determination, not classification accuracy; the printed labels are kept
# unchanged because downstream tooling may parse them.
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)

print("training accuracy: " + str(train_acc))
print("test accuracy: " + str(test_acc))