def regtree(par, *data):
    """Fit a regression tree for one parameter vector and return its test RMSE.

    Parameters
    ----------
    par : sequence
        Positional settings for ``tree.DecisionTreeRegressor``:
        ``par[0]`` max_features, ``par[1]`` min_samples_split,
        ``par[2]`` min_samples_leaf, ``par[3]`` min_weight_fraction_leaf,
        ``par[4]`` max_leaf_nodes (converted with ``int``).
    *data
        Exactly four items: ``X_train, X_test, Y_train, Y_test``.

    Returns
    -------
    The first value returned by ``SSRS.RMSEcal`` for the test predictions.
    The same value is also printed.

    Notes
    -----
    Reads the module-level ``Field`` list and passes it to
    ``SSRS.RegressionTree``.
    """
    X_train, X_test, Y_train, Y_test = data

    tree_settings = dict(
        max_features=par[0],
        min_samples_split=par[1],
        min_samples_leaf=par[2],
        min_weight_fraction_leaf=par[3],
        max_leaf_nodes=int(par[4]),
    )
    tree_model = tree.DecisionTreeRegressor(**tree_settings)
    leaf_model = linear_model.LinearRegression()

    # Only the test-set prediction is used below; the other four return
    # values are unpacked so the expected arity is still enforced.
    test_pred, _train_pred, _fitted_tree, _leaf_fits, _pred_index = \
        SSRS.RegressionTree(X_train, X_test, Y_train, Y_test,
                            tree_model, leaf_model, Field,
                            doFitSelection=0, doMultiBand=1)

    # NOTE(review): testModel() calls SSRS.RMSECal (capital C) - confirm
    # which spelling the SSRS module actually exposes.
    rmse, _rmse_per_band = SSRS.RMSEcal(test_pred, Y_test)
    print(rmse)
    return rmse
# Script fragment: select attribute columns, standardize them, then try
# clustering, classification and regression models on the result.
Y = UCData

# Attribute columns to keep: 1..50 plus every third column from 62 to 77.
# list() keeps the concatenation valid on Python 3, where two range objects
# cannot be added together; on Python 2 the result is unchanged.
attrind = np.array(list(range(1, 51)) + list(range(62, 78, 3)))
Field = [Field[i] for i in attrind]
X = AttrData[:, attrind]
X[np.isnan(X)] = 0

# fit_transform() fits the scaler itself, so no separate fit() call is needed.
scaler = preprocessing.StandardScaler()
Xn = scaler.fit_transform(X)

### cluster
# Alternative clusterers; each assignment replaces the previous one, so only
# the last (MeanShift) is actually used.
model = KMeans(init='k-means++', n_clusters=6, n_init=10, max_iter=1000)
model = AffinityPropagation(preference=-150, verbose=True)
#model = Birch(branching_factor=10, n_clusters=4, threshold=0.3, compute_labels=True)
model = MeanShift(bandwidth=estimate_bandwidth(X, quantile=0.1, n_samples=100),
                  bin_seeding=True)
# NOTE(review): clustering runs on the unscaled X, not Xn - confirm intended.
label = SSRS.Cluster(X, model)

### classification
# Alternative classifiers; only the last assignment (SGDClassifier) is used.
model = tree.DecisionTreeClassifier()
model = GaussianNB()
model = svm.SVC()
model = SGDClassifier()
# NOTE(review): XXn is not defined in this fragment - confirm it exists
# elsewhere and is not a typo for Xn.
Tp = SSRS.Classification_cross(XXn, T=label, nfold=10, model=model)
SSRS.plotErrorMap(label, Tp)

### regression
# Alternative regressors; only the last assignment is kept.
regModel = linear_model.LinearRegression()
#regModel=svm.SVC()
regModel = KNeighborsRegressor(n_neighbors=10)
regModel = tree.DecisionTreeRegressor()
# Script fragment: standardize X, cluster it, cross-validate a classifier on
# the cluster labels, then run a regression learner.
Xn = scaler.fit_transform(X)

### cluster
# Candidate clusterers; every assignment overwrites the one before it, so the
# MeanShift instance is the one passed to SSRS.Cluster.
model = KMeans(
    init='k-means++',
    n_clusters=6,
    n_init=10,
    max_iter=1000,
)
model = AffinityPropagation(preference=-150, verbose=True)
#model = Birch(branching_factor=10, n_clusters=4, threshold=0.3, compute_labels=True)
model = MeanShift(
    bandwidth=estimate_bandwidth(X, quantile=0.1, n_samples=100),
    bin_seeding=True,
)
label = SSRS.Cluster(X, model)

### classification
# Candidate classifiers; SGDClassifier (the last one) is the one used.
model = tree.DecisionTreeClassifier()
model = GaussianNB()
model = svm.SVC()
model = SGDClassifier()
# NOTE(review): XXn is not defined in this fragment - verify its origin.
Tp = SSRS.Classification_cross(XXn, T=label, nfold=10, model=model)
SSRS.plotErrorMap(label, Tp)

### regression
# Candidate regressors; the last assignment wins.
regModel = linear_model.LinearRegression()
#regModel=svm.SVC()
regModel = KNeighborsRegressor(n_neighbors=10)
regModel = tree.DecisionTreeRegressor()
# NOTE(review): GaussianNB is a classifier - confirm it is really meant to be
# the model handed to SSRS.RegressionLearn.
regModel = GaussianNB()
rmse_band, Yp, Ytest = SSRS.RegressionLearn(X, XXn, 0.2, regModel)
## plot tree
# Fit the regressor, export the fitted tree to Graphviz, render it to PNG,
# then draw one box plot of the targets per tree node.
regModel.fit(X2_train, Y2_train)

# Two alternative output locations; the second assignment is the one in effect.
savedir = r"/Volumes/wrgroup/Kuai/USGSCorr/figure_tree/"
savedir = r"Y:\Kuai\USGSCorr\figure_tree\\"

dot_path = savedir + "tree.dot"
png_path = savedir + "tree.png"
# NOTE(review): the export labels features through attrsel while the node
# plots below use attrind - confirm both match the columns regModel was fit on.
with open(dot_path, 'w') as f:
    # The return value is deliberately not bound to f: f must keep referring
    # to the open file handle for the duration of the with-block.
    tree.export_graphviz(regModel, out_file=f,
                         feature_names=[Field[i] for i in attrsel],
                         label='none', node_ids=True)
# Run dot on the file that was just written. The previous command used the
# bare name "tree.dot", which only resolved when the current working
# directory happened to be savedir.
os.system('dot -Tpng "%s" -o "%s"' % (dot_path, png_path))

regTree = regModel.tree_
feature_names = [Field[i] for i in attrind]

# Box plots for the training samples, one saved figure per node.
Xin = X2_train
Yin = Y2_train
string, nodeind, leaf, label = SSRS.traverseTree(regTree, feature_names, Xin)
for i in range(0, regTree.node_count):
    plt.figure()
    plt.boxplot(Yin[nodeind[i], :])
    plt.title(string[i], fontsize=8)
    #plt.tight_layout()
    plt.savefig(savedir + "Train_node%i" % i)
    plt.close()

# Box plots for the test samples.
Xin = X2_test
Yin = Y2_test
string, nodeind, leaf, label = SSRS.traverseTree(regTree, feature_names, Xin)
for i in range(0, regTree.node_count):
    plt.figure()
    plt.boxplot(Yin[nodeind[i], :])
    plt.title(string[i], fontsize=8)
def _fitRegressionTree(X_train, X_test, Y_train, Y_test, predSel, regTreeModel):
    """Run SSRS.RegressionTree on the predictor columns ``predSel``.

    Returns ``(Yp, Yptrain)``, the first two values returned by
    ``SSRS.RegressionTree``. Reads the module-level ``Field`` list to build
    the predictor names.
    """
    fitModel = linear_model.LinearRegression()
    predName = [Field[jj] for jj in predSel]
    Yp, Yptrain, regTreeModel, fitModelList, predind = \
        SSRS.RegressionTree(X_train[:, predSel], X_test[:, predSel],
                            Y_train, Y_test, regTreeModel, fitModel, predName,
                            doFitSelection=0, doMultiBand=1)
    return Yp, Yptrain


def testModel(predListTest):
    """Evaluate each predictor subset under four train/test schemes.

    Parameters
    ----------
    predListTest : sequence
        One entry per model; each entry is a collection of column indices
        into ``Xn`` (and ``Field``) to use as predictors.

    Returns
    -------
    tuple
        ``(testErr, trainErr1_huc2, testErr1_huc2, trainErr1_huc2_rt,
        testErr1_huc2_rt, trainErr2_huc2, testErr2_huc2, hucTab)`` where

        * ``testErr`` has shape (nmodel, 5, 50): test RMSE per model, test
          size and random seed (test 1);
        * ``*Err1_huc2_rt`` have shape (nmodel, 18): random training split,
          one HUC2 value as test set (test 2);
        * ``*Err1_huc2`` have shape (nmodel, 18): one HUC2 value left out
          (test 3);
        * ``*Err2_huc2`` have shape (nmodel, 18 * 17): two HUC2 values left
          out (test 4);
        * ``hucTab`` has shape (18 * 17, 2): the zero-based (i, j) pair for
          each column of the test-4 arrays.

    Notes
    -----
    Reads the module-level names ``Xn``, ``dist``, ``nind``, ``indvalid`` and
    ``Field``, and loads the HUC table from a hard-coded ``.mat`` path.
    """
    nmodel = len(predListTest)
    nit = 50
    nhuc = 18

    # test 1: different size of training and test
    testsize = [0.2, 0.4, 0.5, 0.6, 0.7]
    testErr = np.ones([nmodel, len(testsize), nit])
    for k in range(0, nit):
        print(k)
        for j in range(0, nmodel):
            for i, size in enumerate(testsize):
                ind = range(0, nind)
                X_train, X_test, Y_train, Y_test, ind_train, ind_test = \
                    cross_validation.train_test_split(
                        Xn, dist, ind, test_size=size, random_state=k)
                regTreeModel = tree.DecisionTreeRegressor(
                    max_leaf_nodes=20, min_samples_leaf=20)
                Yp, Yptrain = _fitRegressionTree(
                    X_train, X_test, Y_train, Y_test,
                    predListTest[j], regTreeModel)
                rmse, rmse_band = SSRS.RMSECal(Yp, Y_test)
                testErr[j, i, k] = rmse

    # The HUC table is needed by tests 2-4; load it once instead of once per
    # test.
    IDhucfile = r"E:\work\SSRS\data\IDhuc_mb_4949.mat"
    mat = sio.loadmat(IDhucfile)
    IDhuc = mat["IDhuc"]
    huc2 = IDhuc[indvalid, 1]

    # test 2: use 1 HUC2 as test
    # NOTE(review): results are stored without a k index, so every iteration
    # overwrites the previous one and only the k = nit - 1 split survives -
    # confirm whether averaging over k was intended.
    # NOTE(review): the random training split is drawn from all rows, so it
    # may contain rows of the HUC2 value used as test set - confirm intended.
    testErr1_huc2_rt = np.ones([nmodel, nhuc])
    trainErr1_huc2_rt = np.ones([nmodel, nhuc])
    for k in range(0, nit):
        print(k)
        for i in range(0, nhuc):
            ind = range(0, nind)
            X_train, X_test, Y_train, Y_test, ind_train, ind_test = \
                cross_validation.train_test_split(
                    Xn, dist, ind, test_size=0.2, random_state=k)
            # Replace the random test set by all rows of HUC2 value i + 1.
            ind_test = np.where(huc2 == i + 1)[0]
            X_test = Xn[ind_test, :]
            Y_test = dist[ind_test, :]
            for j in range(0, nmodel):
                regTreeModel = tree.DecisionTreeRegressor(max_leaf_nodes=20)
                Yp, Yptrain = _fitRegressionTree(
                    X_train, X_test, Y_train, Y_test,
                    predListTest[j], regTreeModel)
                rmse, rmse_band = SSRS.RMSECal(Yptrain, Y_train)
                trainErr1_huc2_rt[j, i] = rmse
                rmse, rmse_band = SSRS.RMSECal(Yp, Y_test)
                testErr1_huc2_rt[j, i] = rmse

    # test 3: leave out 1 HUC2 one time
    testErr1_huc2 = np.ones([nmodel, nhuc])
    trainErr1_huc2 = np.ones([nmodel, nhuc])
    for i in range(0, nhuc):
        ind_test = np.where(huc2 == i + 1)[0]
        ind_train = np.where(huc2 != i + 1)[0]
        X_train = Xn[ind_train, :]
        X_test = Xn[ind_test, :]
        Y_train = dist[ind_train, :]
        Y_test = dist[ind_test, :]
        for j in range(0, nmodel):
            regTreeModel = tree.DecisionTreeRegressor(max_leaf_nodes=20)
            Yp, Yptrain = _fitRegressionTree(
                X_train, X_test, Y_train, Y_test,
                predListTest[j], regTreeModel)
            rmse, rmse_band = SSRS.RMSECal(Yptrain, Y_train)
            trainErr1_huc2[j, i] = rmse
            rmse, rmse_band = SSRS.RMSECal(Yp, Y_test)
            testErr1_huc2[j, i] = rmse

    # test 4: leave out 2 HUC2 one time
    npair = nhuc * (nhuc - 1)
    testErr2_huc2 = np.ones([nmodel, npair])
    trainErr2_huc2 = np.ones([nmodel, npair])
    hucTab = np.ones([npair, 2])
    n = -1
    for i in range(0, nhuc):
        print(i)
        for j in range(0, nhuc):
            if i == j:
                continue
            n = n + 1
            hucTab[n, 0] = i
            hucTab[n, 1] = j
            ind_test = np.where((huc2 == i + 1) | (huc2 == j + 1))[0]
            ind_train = np.where((huc2 != i + 1) & (huc2 != j + 1))[0]
            X_train = Xn[ind_train, :]
            X_test = Xn[ind_test, :]
            Y_train = dist[ind_train, :]
            Y_test = dist[ind_test, :]
            for k in range(0, nmodel):
                regTreeModel = tree.DecisionTreeRegressor(max_leaf_nodes=20)
                Yp, Yptrain = _fitRegressionTree(
                    X_train, X_test, Y_train, Y_test,
                    predListTest[k], regTreeModel)
                rmse, rmse_band = SSRS.RMSECal(Yptrain, Y_train)
                trainErr2_huc2[k, n] = rmse
                rmse, rmse_band = SSRS.RMSECal(Yp, Y_test)
                testErr2_huc2[k, n] = rmse

    return testErr, trainErr1_huc2, testErr1_huc2, \
        trainErr1_huc2_rt, testErr1_huc2_rt, \
        trainErr2_huc2, testErr2_huc2, hucTab
# Script fragment: drop the rows listed in indnan, standardize X, then score
# KMeans for a range of cluster counts before clustering with nc = 6.
X = np.delete(X, indnan, 0)
Y = np.delete(Y, indnan, 0)
indvalid = np.delete(indvalid, indnan, 0)

# fit_transform() fits the scaler itself, so no separate fit() call is needed.
scaler = preprocessing.StandardScaler()
Xn = scaler.fit_transform(X)
[nind, nband] = Y.shape
[nind, nattr] = X.shape

# test for k in kmean
# The candidate counts are defined once so the score array, the loop and the
# plot cannot drift apart.
kRange = list(range(2, 10))
score_cluster = np.zeros(len(kRange))
for i in kRange:
    print(i)
    nc = i
    model = KMeans(init='k-means++', n_clusters=nc, n_init=10, max_iter=1000)
    label, center = SSRS.Cluster(Y, model, doplot=0)
    score_cluster[i - kRange[0]] = metrics.silhouette_score(Y, label)
plt.plot(kRange, score_cluster, '-*')

## cluster
nc = 6
model = KMeans(init='k-means++', n_clusters=nc, n_init=15, max_iter=1000,
               tol=1e-15, verbose=True)
label, center = SSRS.Cluster(Y, model, doplot=0)

## PCA
pca = PCA(n_components=nband)
# Script fragment: standardize X, cluster Y with KMeans, project Y and the
# cluster centers with PCA, then rename and plot the clusters.

# fit_transform() fits the scaler itself, so no separate fit() call is needed.
scaler = preprocessing.StandardScaler()
Xn = scaler.fit_transform(X)
[nind, nband] = Y.shape
[nind, nattr] = X.shape

################################################################
# CLUSTER
################################################################
nc = 6
model = KMeans(init='k-means++', n_clusters=nc, n_init=15, max_iter=1000,
               tol=1e-15, verbose=True)
label, center = SSRS.Cluster(Y, model, doplot=0)

## PCA
pca = PCA(n_components=nband)
pca.fit(Y)
Ypca = pca.transform(Y)
Cpca = pca.transform(center)
# Flip the sign of the first component for both the samples and the centers.
Ypca[:, 0] = -Ypca[:, 0]
Cpca[:, 0] = -Cpca[:, 0]

## rename clusters
ythe = np.array([0])
label, Cpca, center = SSRS.Cluster_rename(label, ythe, Cpca, center)

## plot PCA and clusters after reassigning names
SSRS.Cluster_plot(Y, label, center)
# Script fragment: hold out 20% of the samples, score the current regModel,
# then score a regression tree built from a fixed parameter list.

# nn = Regressor(
#     layers=[
#         Layer("Sigmoid", units=200),
#         Layer("Sigmoid", units=200),
#         Layer("Linear")],
#     learning_rate=0.1,
#     n_iter=200,verbose=1)
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    Xn, Y, test_size=0.2, random_state=0)

# predict correlation
Yp, Yptrain, regModelList = SSRS.Regression(
    X_train, X_test, Y_train, Y_test,
    multiband=1, regModel=regModel, doplot=0)
rmse, rmse_band = SSRS.RMSEcal(Yp, Y_test)
rmse_train, rmse_band_train = SSRS.RMSEcal(Yptrain, Y_train)
print(rmse)
print(rmse_train)
print(np.corrcoef(Yp[:, 0], Y_test[:, 0]))

# Tree settings by position: max_features, max_depth, min_samples_split,
# min_samples_leaf, min_weight_fraction_leaf.
par = [1.0, 16, 6, 20, 0.15]
regTreeModel = tree.DecisionTreeRegressor(
    max_features=par[0],
    max_depth=par[1],
    min_samples_split=par[2],
    min_samples_leaf=par[3],
    min_weight_fraction_leaf=par[4],
    max_leaf_nodes=18)
fitModel = linear_model.LinearRegression()
Yp, Yptrain, regTreeModel, fitModelList, predind = SSRS.RegressionTree(
    X_train, X_test, Y_train, Y_test,
    regTreeModel, fitModel, Field, doFitSelection=0)
rmse, rmse_band = SSRS.RMSEcal(Yp, Y_test)
# The training RMSE is recomputed here but only the test RMSE is printed.
rmse_train, rmse_band_train = SSRS.RMSEcal(Yptrain, Y_train)
print(rmse)
# Script fragment: derive Y2 as the arg-max over columns 15..29 of Y, regress
# (Y1, Y2) on Xn, then cross-validate a classifier on Y1.
Y2 = np.argmax(Y[:, 15:30], axis=1)

## Regression
# Candidate regressors; each assignment replaces the previous one, so the
# RandomForestRegressor is the model actually used.
regModel = linear_model.LinearRegression()
#regModel=svm.SVC()
regModel = KNeighborsRegressor(n_neighbors=20)
regModel = tree.DecisionTreeRegressor()
regModel = GaussianNB()
regModel = sklearn.linear_model.SGDRegressor()
regModel = RandomForestRegressor()

X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    Xn, np.column_stack((Y1, Y2)), test_size=0.2, random_state=0)
Yp, rmse, rmse_train, rmse_band, rmse_band_train = SSRS.Regression(
    X_train, X_test, Y_train, Y_test,
    multiband=1, regModel=regModel, doplot=0)
print(rmse)
print(rmse_train)

## Classification
# Candidate classifiers; the RandomForestClassifier (last) is the one used.
model = tree.DecisionTreeClassifier()
model = GaussianNB()
model = svm.SVC()
model = SGDClassifier()
model = sklearn.ensemble.RandomForestClassifier()

Yin = Y1
Tp = SSRS.Classification_cross(Xn, T=Yin, nfold=10, model=model)
SSRS.plotErrorMap(Yin, Tp)

# The two expressions below are evaluated but their results are not stored or
# printed.
# NOTE(review): 4627. is presumably the number of samples - verify against
# len(Yin).
np.sqrt(((Yin - Tp) ** 2).mean())
np.count_nonzero(np.abs(Yin - Tp) < 2) / 4627.