def adaBoostModel(train_x, train_y, test_x, test_y, workOrFreeDay):
    rng = np.random.RandomState(1)
    adaBoost = AdaboostR(DTR(max_depth=5), n_estimators=300, random_state=rng)
    adaBoost.fit(train_x, train_y)
    predicted = adaBoost.predict(test_x)
    # show test results
    printEvaluationScores(predicted, test_y, "AdaBoost model with MSFsc", workOrFreeDay)
    # invokes the method that prints the tree structure of the 300 trained trees
    #saveTreeStrucutre(adaBoost)

    # Predict without MSFSC
    x_trainWithoutMSFSC = train_x.copy()
    x_testWithoutMSFSC = test_x.copy()
    del x_trainWithoutMSFSC['MSFSC']
    del x_testWithoutMSFSC['MSFSC']
    adaBoost.fit(x_trainWithoutMSFSC, train_y)
    predicted = adaBoost.predict(x_testWithoutMSFSC)
    # show test results
    printEvaluationScores(predicted, test_y, "AdaBoost model without MSFsc", workOrFreeDay)
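# Note: printEvaluationScores is used above and in the cross-validation
# variants further below but is not defined anywhere in these snippets.
# The following is a minimal hypothetical sketch, assuming the helper just
# reports a few standard regression metrics; the signature and metric choice
# are assumptions, not the original implementation.
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def printEvaluationScores(predicted, actual, modelName, workOrFreeDay):
    # Report MAE, RMSE and R^2 for a labelled model run.
    print("%s (%s)" % (modelName, workOrFreeDay))
    print("  MAE : %.3f" % mean_absolute_error(actual, predicted))
    print("  RMSE: %.3f" % np.sqrt(mean_squared_error(actual, predicted)))
    print("  R^2 : %.3f" % r2_score(actual, predicted))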
def test_DecisionTreeRegressor():
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_boston

    dataset = load_boston()
    X, y, features = dataset['data'], dataset['target'], dataset['feature_names']
    X = pd.DataFrame(X, columns=features)
    y = pd.DataFrame(y, columns=['target'])
    data = pd.concat([X, y], axis=1)
    features = data.columns[:-1]
    target = data.columns[-1]

    from sklearn.model_selection import train_test_split
    X_train, X_vali, y_train, y_vali = train_test_split(X, y, test_size=0.2, random_state=23)
    print('X_train shape: ', X_train.shape)
    print('X_vali shape: ', X_vali.shape)
    print('y_train shape: ', y_train.shape)
    print('y_vali shape: ', y_vali.shape)

    from ml.tree import DecisionTreeRegressor
    from sklearn.tree import DecisionTreeRegressor as DTR

    models = {}
    models['my_dtr'] = DecisionTreeRegressor(max_depth=5)
    models['sklearn_dtr'] = DTR(max_depth=5)
    for name, model in models.items():
        model.fit(X_train, y_train)
        print('%s score: %.8f' % (name, model.score(X_vali, y_vali)))
def get_new_model(self):
    # Model type names are hyphen-separated (e.g. "Linear-Regressor"),
    # so split on "-" to distinguish regressors from classifiers.
    if self.model_type.split("-")[-1] == "Regressor":
        if self.model_type == "Linear-Regressor":
            from sklearn.linear_model import LinearRegression
            self.model = LinearRegression(**self.model_args)
        elif self.model_type == "Support-Vector-Regressor":
            from sklearn.svm import SVR
            self.model = SVR(**self.model_args)
        elif self.model_type == "Decision-Tree-Regressor":
            from sklearn.tree import DecisionTreeRegressor as DTR
            self.model = DTR(**self.model_args)
        elif self.model_type == "Random-Forest-Regressor":
            from sklearn.ensemble import RandomForestRegressor as RFR
            self.model = RFR(**self.model_args)
    else:
        if self.model_type == "Logistic-Regression-Classifier":
            from sklearn.linear_model import LogisticRegression
            self.model = LogisticRegression(**self.model_args)
        elif self.model_type == "KNN-Classifier":
            from sklearn.neighbors import KNeighborsClassifier as KNN
            self.model = KNN(**self.model_args)
        elif self.model_type == "Support-Vector-Classifier":
            from sklearn.svm import SVC
            self.model = SVC(**self.model_args)
        elif self.model_type == "Naive-Bayes-Classifier":
            from sklearn.naive_bayes import GaussianNB as GNB
            self.model = GNB(**self.model_args)
        elif self.model_type == "Decision-Tree-Classifier":
            from sklearn.tree import DecisionTreeClassifier as DTC
            self.model = DTC(**self.model_args)
        elif self.model_type == "Random-Forest-Classifier":
            from sklearn.ensemble import RandomForestClassifier as RFC
            self.model = RFC(**self.model_args)
def show_bias_variance(feature_variables, n_test, n_train):
    target_variable = 'Class'
    X, Y = filter_data(feature_variables)
    n_x = len(X)
    random_test_index = random.sample(range(n_x), n_test)
    X_test = [X[i] for i in random_test_index]
    Y_test = [[Y[i]] for i in random_test_index]

    max_df = 30
    dfs = range(1, max_df + 1)  # candidate max_depth values
    result = {i: [] for i in dfs}

    train_pool_index = [i for i in range(n_x) if i not in random_test_index]
    train_indexes = []
    for i in range(50):
        train_indexes_sample = random.sample(train_pool_index, n_train)
        as_list = sorted(train_indexes_sample)
        train_indexes.append(as_list)

    for df in dfs:
        model = DTR(max_depth=df, max_features=len(feature_variables))
        prediction_errors = []
        training_errors = []
        for i in range(50):
            X_train = [X[j] for j in train_indexes[i]]
            Y_train = [[Y[j]] for j in train_indexes[i]]
            model.fit(X_train, Y_train)
            y_predict = model.predict(X_test)
            y_predict_train = model.predict(X_train)
            mse = statistics.mean([(y_predict[j] - Y_test[j][0]) ** 2 for j in range(n_test)])
            mse_train = statistics.mean([(y_predict_train[j] - Y_train[j][0]) ** 2 for j in range(n_train)])
            prediction_errors.append(mse)
            training_errors.append(mse_train)
        result[df].append(prediction_errors)
        result[df].append(statistics.mean(prediction_errors))
        result[df].append(training_errors)
        result[df].append(statistics.mean(training_errors))

    fig, ax = plt.subplots()
    sort_dfs = sorted(dfs)
    for i in range(max_df):
        ax.plot(sort_dfs, [result[df][0][i] for df in sort_dfs], '-', color=(1, 0.2, 0, 0.25), lw=2)
    p = ax.plot(sort_dfs, [result[df][1] for df in sort_dfs], '-', color=(1, 0, 0, 1), lw=1.5)
    p[0].set_label("Expected Test Error Estimate")
    for i in range(max_df):
        ax.plot(sort_dfs, [result[df][2][i] for df in sort_dfs], '-', color=(0, 0.2, 1, 0.25), lw=2)
    p = ax.plot(sort_dfs, [result[df][3] for df in sort_dfs], '-', color=(0, 0, 1, 1), lw=1.5)
    p[0].set_label("Expected Training Error Estimate")
    ax.legend()
    ax.set_xlabel('Max Depth')
    ax.set_ylabel('Prediction Error')
    fig.show()
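# Note: filter_data is not defined in this snippet. A minimal sketch of what
# it presumably does, assuming a module-level pandas DataFrame named data
# with a 'Class' target column (both names are assumptions, not taken from
# the original source):
def filter_data(feature_variables, target_variable='Class'):
    # Return plain Python lists of feature rows and target values.
    X = data[list(feature_variables)].values.tolist()
    Y = data[target_variable].values.tolist()
    return X, Y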
def function(X_train, X_test, y_train, y_test, weight, dynamic=False):
    # Manual gradient boosting: fit 50 trees, each on the gradient of the
    # squared error of the current ensemble's predictions.
    ests = []
    coef = []
    for i in range(50):
        estimator = DTR(max_depth=5, random_state=42)
        if i == 0:
            estimator.fit(X_train, y_train)
        else:
            estimator.fit(X_train, square_grad(y_train, gbm_predict(X_train, ests, coef)))
        ests.append(estimator)
        if dynamic:
            coef.append(weight / (1 + i))
        else:
            coef.append(weight)
    return MSE(y_test, gbm_predict(X_test, ests, coef))
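# Note: the helpers gbm_predict and square_grad used above are not shown in
# this snippet. A minimal sketch, assuming gbm_predict returns the
# coefficient-weighted sum of the base learners' predictions and square_grad
# returns the residuals (the anti-gradient of the squared-error loss); both
# are assumptions about the original helpers, not their actual code.
import numpy as np

def gbm_predict(X, ests, coef):
    # Weighted sum of every fitted base estimator's predictions.
    return sum(c * est.predict(X) for est, c in zip(ests, coef))

def square_grad(y_true, y_pred):
    # Residuals: the direction that reduces the squared error.
    return np.asarray(y_true) - np.asarray(y_pred)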
def test_gbdt():
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_boston

    dataset = load_boston()
    X, y, features = dataset['data'], dataset['target'], dataset['feature_names']
    X = pd.DataFrame(X, columns=features)
    y = pd.DataFrame(y, columns=['target'])
    data = pd.concat([X, y], axis=1)
    features = data.columns[:-1]
    target = data.columns[-1]

    from sklearn.model_selection import train_test_split
    X_train, X_vali, y_train, y_vali = train_test_split(X, y, test_size=0.2, random_state=25)
    print('X_train shape: ', X_train.shape)
    print('X_vali shape: ', X_vali.shape)
    print('y_train shape: ', y_train.shape)
    print('y_vali shape: ', y_vali.shape)

    from sklearn.tree import DecisionTreeRegressor as DTR
    dtr = DTR(max_depth=5)
    dtr.fit(X_train, y_train.values.reshape(-1))
    print('sklearn dtr score: ', dtr.score(X_vali, y_vali))

    from sklearn.ensemble import GradientBoostingRegressor as GBR
    import xgboost as xgb
    gbr = GBR(max_depth=5)
    gbr.fit(X_train, y_train)
    print('sklearn gbr score: ', gbr.score(X_vali, y_vali))

    from ml.tree import DecisionTreeRegressor
    mydtr = DecisionTreeRegressor(max_depth=5)
    mydtr.fit(X_train, y_train)
    print('my dtr score: ', mydtr.score(X_vali, y_vali))

    from ml.ensemble import GradientBoostingRegressor
    mygbr = GradientBoostingRegressor()
    mygbr.fit(X_train, y_train)
    print('my gbr score: ', mygbr.score(X_vali, y_vali))
def adaBoostModelWithCrossFoldValidation(inputData, outputData, workOrFreeDay):
    rng = np.random.RandomState(1)
    adaBoost = AdaboostR(DTR(max_depth=5), n_estimators=300, random_state=rng)

    # do leave-one-out cross prediction
    adaBoostPredict = cross_val_predict(adaBoost, inputData, outputData, cv=len(inputData))
    # show test results
    printEvaluationScores(adaBoostPredict, outputData, "AdaBoost model with MSFsc and LOO prediction", workOrFreeDay)

    # Predict without MSFSC
    dataWithoutMSFSC = inputData.copy()
    del dataWithoutMSFSC['MSFSC']
    adaBoostPredict = cross_val_predict(adaBoost, dataWithoutMSFSC, outputData, cv=len(inputData))
    # show test results
    printEvaluationScores(adaBoostPredict, outputData, "AdaBoost model without MSFsc and LOO prediction", workOrFreeDay)
def __init__(self, criterion='mse', splitter='best', max_depth=None,
             min_samples_split=2, min_samples_leaf=1,
             min_weight_fraction_leaf=0.0, max_features=None,
             random_state=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             ccp_alpha=0.0):
    self.max_leaf_nodes = max_leaf_nodes
    self.min_samples_split = min_samples_split
    self.random_state = random_state
    self.min_samples_leaf = min_samples_leaf
    self.ccp_alpha = ccp_alpha
    self.min_impurity_decrease = min_impurity_decrease
    self.max_features = max_features
    self.splitter = splitter
    self.max_depth = max_depth
    self.min_weight_fraction_leaf = min_weight_fraction_leaf
    self.min_impurity_split = min_impurity_split
    self.criterion = criterion
    self.model = DTR(
        ccp_alpha=self.ccp_alpha,
        min_impurity_decrease=self.min_impurity_decrease,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        min_impurity_split=self.min_impurity_split,
        splitter=self.splitter,
        min_samples_split=self.min_samples_split,
        max_leaf_nodes=self.max_leaf_nodes,
        max_depth=self.max_depth,
        min_samples_leaf=self.min_samples_leaf,
        max_features=self.max_features,
        criterion=self.criterion,
        random_state=self.random_state)
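# Note: only __init__ of this wrapper is shown above. A plausible sketch of
# the delegating methods such a wrapper would expose (assumed, not taken
# from the original class):
def fit(self, X, y):
    # Delegate training to the wrapped sklearn DecisionTreeRegressor.
    self.model.fit(X, y)
    return self

def predict(self, X):
    # Delegate prediction to the wrapped estimator.
    return self.model.predict(X)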
def bagging(X_train, y_train, X_test, boot_count, depth):
    trees = np.array([DTR(max_depth=depth) for _ in range(0, boot_count)])
    X_train_bootstrap = np.array([])
    y_train_bootstrap = np.array([])

    # Build the bootstrap training samples:
    for i in range(0, boot_count):
        for j in range(0, X_train.shape[0]):
            random_index = random.choice([k for k in range(0, X_train.shape[0])])
            X_train_bootstrap = np.append(X_train_bootstrap, X_train[random_index])
            y_train_bootstrap = np.append(y_train_bootstrap, y_train[random_index])
    X_train_bootstrap = X_train_bootstrap.reshape(boot_count, X_train.shape[0], X_train.shape[1])
    y_train_bootstrap = y_train_bootstrap.reshape(boot_count, X_train.shape[0])

    # Fit one tree per bootstrap sample:
    fitted_trees = np.array([
        trees[i].fit(X_train_bootstrap[i], y_train_bootstrap[i])
        for i in range(0, boot_count)
    ])

    # Predict with the tree ensemble:
    y_predicts = np.array([tree.predict(X_test) for tree in fitted_trees])
    y_predicts = y_predicts.reshape(boot_count, X_test.shape[0])

    # Average the per-tree predictions:
    y_pred = np.array([])
    for i in range(0, X_test.shape[0]):
        mean_value = 0
        for j in range(0, boot_count):
            mean_value += y_predicts[j][i]
        mean_value = mean_value / boot_count
        y_pred = np.append(y_pred, mean_value)
    return y_pred
    PAR(C=1.0, fit_intercept=False, tol=None, shuffle=True, verbose=1,
        loss='epsilon_insensitive', epsilon=0.01, random_state=rng),
    'svr_rbf': SVR(kernel='rbf', C=1e3, shrinking=True, verbose=True),
    'svr_ply': SVR(kernel='poly', C=1e3, degree=3, shrinking=True, verbose=True),
    'gpr': GPR(kernel=None, alpha=1e-10, optimizer='fmin_l_bfgs_b', random_state=rng),
    'dtr': DTR(max_depth=10),
    'kr_rbf': KernelRidge(kernel='rbf', gamma=0.1, alpha=1e-2),
    'kr_ply': KernelRidge(kernel='poly', gamma=10.1, alpha=1e-2, degree=3),
    'mlp_r': MLPRegressor(
        hidden_layer_sizes=(10, 8, 5,),
        activation='tanh',
        solver='adam',  # 'lbfgs',
        # alpha=0.0001,
        batch_size=32,
def getDTC(data, target, depth):
    Y = data[target]
    X = data.drop(target, axis=1)
    model = DTR(max_depth=depth)
    # Fit on the supplied data; without this, X and Y would go unused and the
    # returned model would be untrained.
    model.fit(X, Y)
    return model
})
df_['Category of interaction'] = df_['Category of interaction'].map({
    'positive': 1,
    'negative': -1,
    'neutral': 0
})

#### separating dependent and independent variables
x = df_.drop(['Churn date'], axis=1)
y = df_['Churn date']

# splitting into training and test data
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(x, y, random_state=23, test_size=0.3)

from sklearn.tree import DecisionTreeRegressor as DTR
dtr = DTR(max_depth=41, random_state=23)
dtr.fit(train_x, train_y)

pickle.dump(dtr, open('model.pkl', 'wb'))
model = pickle.load(open('model.pkl', 'rb'))
print(model.predict([[737000, 737002, 0]]))
for i in range(0, len(y)):
    if y[i] == 0:
        y[i] = 1

from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0)

from sklearn.linear_model import LinearRegression as LR
reg = LR()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
Table(y_pred, y_pred_train, y_train, y_test, 'LR', X_train, X_test)

from sklearn.tree import DecisionTreeRegressor as DTR
reg = DTR()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
Table(y_pred, y_pred_train, y_train, y_test, 'DTR', X_train, X_test)

from sklearn.ensemble import RandomForestRegressor as RF
reg = RF()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
Table(y_pred, y_pred_train, y_train, y_test, 'RFR', X_train, X_test)
plotting(y_pred, 'RFR')

from lightgbm import LGBMRegressor as lgb
reg = lgb()
# In[21]:

from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.model_selection import RandomizedSearchCV

parameters = {"splitter": ["best", "random"],
              "max_depth": [1, 3, 5, 7, 9, 11, 12],
              "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              "min_weight_fraction_leaf": [0.1, 0.2, 0.3, 0.4, 0.5],
              "max_features": ["auto", "log2", "sqrt", None],
              "max_leaf_nodes": [None, 10, 20, 30, 40, 50, 60, 70, 80, 90]}

randGrid = RandomizedSearchCV(DTR(), parameters, cv=10, scoring='r2',
                              n_iter=1000, verbose=1, n_jobs=3)
randGrid.fit(xall, yall)
print(randGrid.best_params_)
print(randGrid.best_score_)

# In[22]:

parameters = {"splitter": ["best", "random"],
              "max_depth": [1, 3, 5, 7, 9, 11, 12],
              "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              # "min_weight_fraction_leaf": [0.1, 0.2, 0.3, 0.4, 0.5],
              "max_features": ["auto", "log2", "sqrt", None],
              # "max_leaf_nodes": [None, 10, 20, 30, 40, 50, 60, 70, 80, 90]
              }
display(samples - np.round(data.mean()))
display(samples - np.round(data.median()))

from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Make a copy of the DataFrame, using the 'drop' function to drop the given feature
new_data = data.drop(['Frozen'], axis=1, inplace=False)

# Split the data into training and testing sets using the given feature as the target
new_y = data.drop(['Fresh', 'Milk', 'Grocery', 'Detergents_Paper', 'Delicatessen'],
                  axis=1, inplace=False)
X_train, X_test, y_train, y_test = train_test_split(new_data, new_y,
                                                    test_size=0.25, random_state=42)

# Create a decision tree regressor and fit it to the training set
regressor = DTR(random_state=42)
regressor.fit(X_train, y_train)

# Report the score of the prediction using the testing set
score = regressor.score(X_test, y_test)
print(score)

# Scale the data using the natural logarithm
log_data = np.log(data)

# Scale the sample data using the natural logarithm
log_samples = np.log(samples)

# Produce a scatter matrix for each pair of newly-transformed features
pd.plotting.scatter_matrix(log_data, alpha=0.3, figsize=(14, 8), diagonal='kde');
# loading original dataset
data = pd.read_csv(path + 'SPP+.csv')

# arranging features from original dataset for model learning
x = data.drop([
    'Wavelength (nm)', 'Width (nm)', 'AspectRatio', 'Length (nm)',
    'Linewidth (nm)', 'MaxCscat'
], axis=1)
w_y = data['Width (nm)']
l_y = data['Length (nm)']

# parameters for GridSearchCV class
param_grid = {'max_depth': range(1, 31)}

# Initialize GridSearchCV class
width_gs = GridSearchCV(estimator=DTR(), param_grid=param_grid, cv=10,
                        scoring='neg_mean_squared_error')
length_gs = GridSearchCV(estimator=DTR(), param_grid=param_grid, cv=10,
                         scoring='neg_mean_squared_error')
width_gs.fit(x, w_y)
length_gs.fit(x, l_y)

joblib_width_file = "joblib_width_gs.pkl"
joblib.dump(width_gs, joblib_width_file)
joblib_length_file = "joblib_length_gs.pkl"
def __init__(self, featureset=None, target=None, mode='predict', path=''):
    if mode == 'train':
        self.__svm = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                         decision_function_shape='ovr', degree=3, gamma='auto',
                         kernel='rbf', max_iter=-1, probability=False,
                         random_state=None, shrinking=True, tol=0.001,
                         verbose=False)
        self.__svr = SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                         epsilon=0.1, gamma='auto', kernel='rbf', max_iter=-1,
                         shrinking=True, tol=0.001, verbose=False)
        self.__nusvm = NuSVC(cache_size=200, class_weight=None, coef0=0.0,
                             decision_function_shape='ovr', degree=3,
                             gamma='auto', kernel='rbf', max_iter=-1, nu=0.5,
                             probability=False, random_state=None,
                             shrinking=True, tol=0.001, verbose=False)
        self.__nusvr = NuSVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                             gamma='auto', kernel='rbf', max_iter=-1, nu=0.5,
                             shrinking=True, tol=0.001, verbose=False)
        self.__linsvm = LinearSVC(C=1.0, class_weight=None, dual=True,
                                  fit_intercept=True, intercept_scaling=1,
                                  loss='squared_hinge', max_iter=1000,
                                  multi_class='ovr', penalty='l2',
                                  random_state=None, tol=0.0001, verbose=0)
        self.__linsvr = LinearSVR(C=1.0, dual=True, epsilon=0.0,
                                  fit_intercept=True, intercept_scaling=1.0,
                                  loss='epsilon_insensitive', max_iter=1000,
                                  random_state=None, tol=0.0001, verbose=0)
        self.__mlpc = MLPC(activation='relu', alpha=1e-05, batch_size='auto',
                           beta_1=0.9, beta_2=0.999, early_stopping=False,
                           epsilon=1e-08, hidden_layer_sizes=(100, 25),
                           learning_rate='constant', learning_rate_init=0.001,
                           max_iter=200, momentum=0.9, nesterovs_momentum=True,
                           power_t=0.5, random_state=1, shuffle=True,
                           solver='lbfgs', tol=0.0001, validation_fraction=0.1,
                           verbose=False, warm_start=False)
        self.__mlpr = MLPR(activation='relu', alpha=0.0001, batch_size='auto',
                           beta_1=0.9, beta_2=0.999, early_stopping=False,
                           epsilon=1e-08, hidden_layer_sizes=(100, 25),
                           learning_rate='constant', learning_rate_init=0.001,
                           max_iter=200, momentum=0.9, nesterovs_momentum=True,
                           power_t=0.5, random_state=None, shuffle=True,
                           solver='adam', tol=0.0001, validation_fraction=0.1,
                           verbose=False, warm_start=False)
        self.__dtc = DTC(class_weight=None, criterion='gini', max_depth=None,
                         max_features=None, max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, presort=False,
                         random_state=None, splitter='best')
        self.__dtr = DTR(criterion='mse', max_depth=None, max_features=None,
                         max_leaf_nodes=None, min_impurity_decrease=0.0,
                         min_impurity_split=None, min_samples_leaf=1,
                         min_samples_split=2, min_weight_fraction_leaf=0.0,
                         presort=False, random_state=None, splitter='best')
        self.__rfc = RFC(bootstrap=True, class_weight=None, criterion='gini',
                         max_depth=100, max_features='auto',
                         max_leaf_nodes=None, min_impurity_decrease=0.0,
                         min_impurity_split=None, min_samples_leaf=1,
                         min_samples_split=2, min_weight_fraction_leaf=0.0,
                         n_estimators=50, n_jobs=1, oob_score=False,
                         random_state=None, verbose=0, warm_start=False)
        self.__rfr = RFR(bootstrap=True, criterion='mse', max_depth=None,
                         max_features='auto', max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=10,
                         n_jobs=1, oob_score=False, random_state=None,
                         verbose=0, warm_start=False)
        (self.__svm, self.__svr, self.__nusvm, self.__nusvr, self.__linsvm,
         self.__linsvr, self.__mlpc, self.__mlpr, self.__dtc, self.__dtr,
         self.__rfc, self.__rfr) = self.__trainAll(X=list(featureset),
                                                   Y=list(target))
        self.__saveModelsToFile(path)
    else:
        self.__svm = joblib.load(path + 'Mel_SVM.pkl')
        self.__svr = joblib.load(path + 'Mel_SVR.pkl')
        self.__nusvm = joblib.load(path + 'Mel_NuSVM.pkl')
        self.__nusvr = joblib.load(path + 'Mel_NuSVR.pkl')
        self.__linsvm = joblib.load(path + 'Mel_LinSVM.pkl')
        self.__linsvr = joblib.load(path + 'Mel_LinSVR.pkl')
        self.__mlpc = joblib.load(path + 'Mel_MLPC.pkl')
        self.__mlpr = joblib.load(path + 'Mel_MLPR.pkl')
        self.__dtc = joblib.load(path + 'Mel_DTC.pkl')
        self.__dtr = joblib.load(path + 'Mel_DTR.pkl')
        self.__rfc = joblib.load(path + 'Mel_RFC.pkl')
        self.__rfr = joblib.load(path + 'Mel_RFR.pkl')
X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=123, shuffle=True)

# Compare hand-rolled bagging against a single tree and sklearn's random forest
y_pred = bagging(X_train, y_train, X_test, boot_count=200, depth=10)
y_dt_pred = DTR().fit(X_train, y_train).predict(X_test)
y_rf_pred = RandomForestRegressor().fit(X_train, y_train).predict(X_test)

print(mean_squared_error(y_rf_pred, y_test))
print(mean_squared_error(y_dt_pred, y_test))
print(mean_squared_error(y_pred, y_test))
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing the dataset
dataset = pd.read_csv(r"D:\work\ML A to Z\Own\Regression\Faces.csv")

# Matrix and vector
X = dataset.iloc[:, 1:5].values
y = dataset.iloc[:, 5].values

# training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Regressor
from sklearn.tree import DecisionTreeRegressor as DTR
regressor = DTR()
regressor.fit(X_train, y_train)

# prediction
y_pred = regressor.predict(X_test)

# Plotting: X_test has four feature columns, so plot against the first
# feature only (passing the full matrix to scatter would raise an error)
plt.scatter(X_test[:, 0], y_test, color="red")
plt.scatter(X_test[:, 0], regressor.predict(X_test), color="blue")
plt.show()
# some_prepared = full_pipeline.transform(some_data)
# print("Predictions: ", lr.predict(some_prepared))
# print("Original Labels: ", list(some_labels))
# The predictions are way off (which indicates the model is underfitting the data)

# Use Scikit-Learn's mean_squared_error to compute the regression model's
# RMSE over the whole training set
from sklearn.metrics import mean_squared_error
housing_predictions = lr.predict(housing_prepared)
# lr_mse = mean_squared_error(housing_labels, housing_predictions)
# rmse = np.sqrt(lr_mse)
# print("RMSE: ", rmse)
# Prediction error: 68628.19819848923 dollars
# For an underfitting model, we can choose a more powerful model, provide
# better features, loosen constraints, and so on
# Here we switch to the more powerful DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor as DTR
tree_reg = DTR()
tree_reg.fit(housing_prepared, housing_labels)
predicted = tree_reg.predict(housing_prepared)
# mse = mean_squared_error(housing_labels, predicted)
# rmse = np.sqrt(mse)
# print("RMSE: ", rmse)
# RMSE: 0.0
# A perfect score here, but we must consider whether the model is overfitting
# Therefore, use cross-validation to evaluate the model more reliably
from sklearn.model_selection import cross_val_score as cvs
scores = cvs(tree_reg, housing_prepared, housing_labels,
             scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)
# Scikit-Learn's cross-validation prefers a utility function (greater is
# better) over a cost function (lower is better), so the computed scores
# are actually negative MSE values
# Inspect the results
def display_score(scores):
    print("Score: ", scores)
z = sqrt(mean_squared_error(y_test, predicted))
print('RMS Evaluation: {}'.format(z))
print('Prediction/Fit Run Time: {}\n'.format(elapsed_time))

print("Bayesian Ridge:")
model = BR()
start_time = time.time()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
elapsed_time = time.time() - start_time
z = sqrt(mean_squared_error(y_test, predicted))
print('RMS Evaluation: {}'.format(z))
print('Prediction/Fit Run Time: {}\n'.format(elapsed_time))

print("Decision Tree Regression:")
model = DTR(max_depth=3)
start_time = time.time()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
elapsed_time = time.time() - start_time
z = sqrt(mean_squared_error(y_test, predicted))
print('RMS Evaluation: {}'.format(z))
print('Prediction/Fit Run Time: {}\n'.format(elapsed_time))

print("Linear Regression:")
model = LNR()
start_time = time.time()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
elapsed_time = time.time() - start_time
z = sqrt(mean_squared_error(y_test, predicted))
def main():
    # ----------------------------
    # Training data
    # ----------------------------
    # Loading training data
    trainingDataFile = 'Training_set.csv'
    trainingData = pd.read_csv(trainingDataFile)

    # Obtaining unique cases of events (Note: this remains the same for both
    # training and test data)
    myEventSet = []
    for x in trainingData.events:
        if x not in myEventSet:
            myEventSet.append(x)
    print('Unique events are as follows: \n', myEventSet, '\n')

    # Event string value reassignment based on unique event cases in 'myEventSet'
    newEvents = []
    for x in trainingData.events:
        for i in range(len(myEventSet)):
            if x == myEventSet[i]:
                newEvents.append(i)

    # Converting datetime to seconds and saving the day of the week
    day = []
    numDateTrainData = []
    for i in range(len(trainingData.date)):
        date_obj = datetime.strptime(str(trainingData.date[i]), '%Y-%m-%d')
        numDateTrainData.append(date_obj.timestamp())
        day.append(date_obj.weekday())
    #print(trainingData.date)

    dictReqCount = {}
    for i in range(len(trainingData.date)):
        if day[i] not in dictReqCount.keys():
            dictReqCount[day[i]] = []
        dictReqCount[day[i]].append(trainingData.request_count[i])
    #print(dictReqCount)

    dictAvgReqCount = {}
    for key, val in dictReqCount.items():
        dictAvgReqCount[key] = sum(val) / len(val)
    #print(dictAvgReqCount)

    maxValue = max(dictAvgReqCount.values())
    maxKey = [key for key, val in dictAvgReqCount.items() if val == maxValue]
    print('Day #{} of the week has the max mean request count'.format(maxKey[0]))

    minValue = min(dictAvgReqCount.values())
    minKey = [key for key, val in dictAvgReqCount.items() if val == minValue]
    print('Day #{} of the week has the min mean request count'.format(minKey[0]))

    # Assembling feature arrays
    features_trainingData = []
    for i in range(len(numDateTrainData)):
        row = [numDateTrainData[i], day[i], trainingData.calendar_code[i],
               trainingData.site_count[i], trainingData.max_temp[i],
               trainingData.min_temp[i], trainingData.precipitation[i],
               newEvents[i]]
        features_trainingData.append(row)
    #for i in range(len(features_trainingData)):
    #    print(len(features_trainingData[i]))

    #Y = list(trainingData.request_count)
    Y = trainingData.request_count
    X = features_trainingData
    #print('length of Y =', len(Y))
    #print(features_trainingData)

    # Models that work on both continuous and discrete data
    scoring = 'neg_mean_squared_error'
    models = [DTR(), GNB(), RFR(), KNR()]
    '''models = [[DTR(), DTR(max_depth=2), DTR(max_depth=5)],
               [GNB(), GNB(priors=None)],
               [RFR(), RFR(), RFR()],
               [KNR(), KNR(), KNR()]]
    '''
    seed = 7
    # shuffle=True is required when a random_state is given to KFold
    kfold = MS.KFold(n_splits=10, shuffle=True, random_state=seed)
    i = 0
    mErr = []
    for model in models:
        results = MS.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        mErr.append(results.mean())
        i += 1
    #print(mErr)

    # Pick the model with the smallest mean absolute CV error
    best_model_index = 0
    maxAbsErrInd = math.fabs(mErr[0])
    for i in range(1, len(mErr)):
        if math.fabs(mErr[i]) < maxAbsErrInd:
            best_model_index = i
            maxAbsErrInd = math.fabs(mErr[i])
    print('\nModel #%d (i.e. %s) performed best'
          % (best_model_index, str(models[best_model_index]).split('(')[0]))

    # -------------------------------------------------------
    # Test Data
    # -------------------------------------------------------
    # Loading test data
    testDataFile = 'Test_set.csv'
    testData = pd.read_csv(testDataFile)

    # Event string reassignment using myEventSet from the training data
    newEvents = []
    for x in testData.events:
        for i in range(len(myEventSet)):
            if x == myEventSet[i]:
                newEvents.append(i)

    # Converting datetime to seconds and determining the days of the week
    day = []
    numDateTestData = []
    for i in range(len(testData.date)):
        date_obj = datetime.strptime(str(testData.date[i]), '%Y-%m-%d')
        numDateTestData.append(date_obj.timestamp())
        day.append(date_obj.weekday())

    # Assembling feature arrays
    features_testData = []
    for i in range(len(numDateTestData)):
        row = [numDateTestData[i], day[i], testData.calendar_code[i],
               testData.site_count[i], testData.max_temp[i],
               testData.min_temp[i], testData.precipitation[i], newEvents[i]]
        features_testData.append(row)

    # Test data features
    X_test = features_testData

    # Test data prediction
    bestModel = models[best_model_index]
    Y_pred = bestModel.fit(X, Y).predict(X_test)
    Y_pred_train = bestModel.fit(X, Y).predict(X)
    print('\nThe predicted values for request count using the test data are as follows:\n', Y_pred)

    output_file = open('predicted_request_count.csv', 'w')
    for i in range(len(Y_pred)):
        output_file.write(str(Y_pred[i]) + '\n')
    output_file.close()

    # Plot the results
    plt.figure(1)
    plt.scatter(numDateTrainData, Y, c="darkorange", label="Training data")
    plt.scatter(numDateTestData, Y_pred, c="cornflowerblue", label="Test data model prediction")
    plt.scatter(numDateTrainData, Y_pred_train, c="red", label="Training data model prediction")
    plt.xlabel("Numerical Date")
    plt.ylabel("Page Count")
    plt.title("Best Model")
    plt.legend()
    plt.show()
for i in data1.columns:
    if data1[i].dtype == object:
        print(i)
        data1 = cat_to_num(data1, i)

data1.drop(['gender', 'ssc_b', 'hsc_b', 'degree_t'], inplace=True, axis=1)

# Drop the target from the feature matrix so it is not used to predict itself
x = data1.drop('salary', axis=1)
y = data1['salary']

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

from sklearn.tree import DecisionTreeRegressor as DTR
regr = DTR()
regr.fit(x_train, y_train)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train, y_train)

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
# fit on the training split, not the test split
rf.fit(x_train, y_train)

from sklearn.metrics import r2_score
print(r2_score(y_test, regr.predict(x_test)))

ax1 = sns.distplot(y_test, hist=False, color="r", label="Actual Value")
sns.distplot(regr.predict(x_test), color="b", hist=False, label="Predicted Value", ax=ax1)
def trainDecisionTreeModel(inputData, outputData, workOrFreeDay):
    regressor = DTR(random_state=0, max_depth=5)
    predicted = cross_val_predict(regressor, inputData, outputData, cv=10)
    printEvaluationScores(predicted, outputData, "Simple DecisionTree", workOrFreeDay)
plt.xlabel('Date');

# So this is as far as a linear regression will go. This gives us a baseline
# $R^2$ of 61.2% to build on with a more complicated model.
#
# The most obvious thing that will need to be improved is the fact that
# negative radiation is impossible. Therefore, we will need a model that can
# deal with this.

# ## Decision Tree Algorithm

# In[32]:

from sklearn.tree import DecisionTreeRegressor as DTR

# fit a decision tree
dt = DTR()
dt.fit(X_train, y_train)

# In[33]:

dt.score(X=X_test, y=y_test)

# In[34]:

# predict using the decision tree model
y_pred = dt.predict(X_test)
enet_msq = msq(enet.predict(X_test), y_test)
enet_r2 = r2(enet.predict(X_test), y_test)
print('\nThe mean squared error of the ElasticNet model is: \t\t\t%s' % enet_msq)
print('The R2 score of the ElasticNet model is: \t\t\t\t%s' % enet_r2)

# =============================================================================
# TREE CLASSIFICATIONS
# =============================================================================
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.neighbors import KNeighborsClassifier as KNC

# DECISION TREE REGRESSOR
dtr = DTR()
dtr.fit(X_train, y_train)
dtr_msq = msq(dtr.predict(X_test), y_test)
dtr_r2 = r2(dtr.predict(X_test), y_test)
print('\nThe mean squared error of the Decision Tree Regressor model is: \t%s' % dtr_msq)
print('The R2 score of the Decision Tree Regressor model is: \t\t\t%s' % dtr_r2)

# DECISION TREE CLASSIFIER
dtc = DTC()
dtc.fit(X_train, y_train)
dtc_msq = msq(dtc.predict(X_test), y_test)
dtc_r2 = r2(dtc.predict(X_test), y_test)
print(
""" ##############################Regression############################## """ from sklearn.metrics import mean_squared_error #k-Neighbors Regressor from sklearn.neighbors import KNeighborsRegressor as KNR knr_energy = KNR(weights='distance').fit(X_train_energy_pca, y_train_energy) y_pred_knr = knr_energy.predict(X_test_energy_pca) print("Mean squared error for kNN: {:.3f}.".format( mean_squared_error(y_pred_knr, y_test_energy))) #Decision Tree Regressor from sklearn.tree import DecisionTreeRegressor as DTR dtr_energy = DTR(max_depth=11, min_samples_split=16, min_samples_leaf=2, random_state=37).fit(X_train_energy_stand, y_train_energy) y_pred_dtr = dtr_energy.predict(X_test_energy_stand) print("Mean squared error for DTR: {:.3f}.".format( mean_squared_error(y_pred_dtr, y_test_energy))) #Random Forest Regressor from sklearn.ensemble import RandomForestRegressor as RFR rfr_energy = RFR(n_estimators=100, min_samples_leaf=2, max_leaf_nodes=1000, random_state=37).fit(X_train_energy, y_train_energy) y_pred_rfr = rfr_energy.predict(X_test_energy) print("Mean squared error for RFR: {:.3f}.".format( mean_squared_error(y_pred_rfr, y_test_energy)))
mydata = mydata.loc[myreturn.index]
myreturn = myreturn.sort_values(ascending=False)
total = len(myreturn.index)

# mark the first 1/3 of stocks (by return) as 1, and the last 1/3 as 0
top_index = myreturn.index[:int(total / 3)]
bottom_index = myreturn.index[int(total * 2 / 3):]
top = mydata.loc[top_index]
bottom = mydata.loc[bottom_index]
top['target'] = myreturn.loc[top_index]
bottom['target'] = myreturn.loc[bottom_index]
mydata = pd.concat([top, bottom], axis=0).dropna(axis=0)
target = mydata.pop('target')

# train new regression trees for today
mytrees = []
myscores = []
for i in range(0, num_trees):
    train_X, test_X, train_y, test_y = train_test_split(mydata, target,
                                                        test_size=0.2,
                                                        random_state=11 + 2 * i)
    # create a fresh tree for each split; refitting a single shared DTR
    # instance would leave every list entry pointing at the same tree
    dtr = DTR(max_depth=m_depth)
    newresult = dtr.fit(train_X, train_y)
    mytrees.append(newresult)
    y_predict = pd.Series(newresult.predict(test_X))
    test_y = pd.Series(test_y)
    test_y = test_y.sort_values(ascending=False)
    for rank in range(0, int(len(test_y) / 2)):
        test_y[rank] = 1
    for rank in range(int(len(test_y) / 2), len(test_y)):
        test_y[rank] = 0
    test_y = test_y.sort_index().tolist()
print("\nMean squared error for Linear Regression: {:.3f}.".format( mean_squared_error(LinReg_y_pred, y_test))) plt.scatter(LinReg_y_pred, y_test, c='green') """ ####################Ridge Regression#################### """ ridge = Ridge().fit(X_train, y_train) Ridge_y_pred = ridge.predict(X_test) print("\nMean squared error for Ridge Regression: {:.3f}.".format( mean_squared_error(Ridge_y_pred, y_test))) plt.scatter(Ridge_y_pred, y_test, c='red') """ ####################Regression Tree#################### """ #No changes R_tree = DTR(random_state=37).fit(X_train, y_train) RT_y_pred = R_tree.predict(X_test) print("\nMean squared error for Regression Tree: {:.3f}.".format( mean_squared_error(RT_y_pred, y_test))) #Depth set to 11 R_tree = DTR(max_depth=11, random_state=37).fit(X_train, y_train) RT_y_pred = R_tree.predict(X_test) print("\tDepth set to 11: {:.3f}.".format(mean_squared_error( RT_y_pred, y_test))) #Min samples split set to 16 R_tree = DTR(min_samples_split=16, random_state=37).fit(X_train, y_train) RT_y_pred = R_tree.predict(X_test) print("\tMin samples split set to 16: {:.3f}.".format( mean_squared_error(RT_y_pred, y_test)))
def train_some_model(X, X_cat, y, model='linear', params=None):
    ada_scores = []
    ridge_scores = []
    gbr_scores = []
    linear_scores = []
    knn_scores = []
    k = 0
    K = 10
    for train, test in model_selection.KFold(K, shuffle=True).split(X, y):
        X_train, X_test, y_train, y_test = (X.iloc[train], X.iloc[test],
                                            y.iloc[train], y.iloc[test])
        X_cat_train, X_cat_test = X_cat.iloc[train], X_cat.iloc[test]

        sale_price_mean = X_cat_train['SalePrice'].mean()
        for c in X_cat.columns:
            if c == 'SalePrice':
                continue
            hood_price = X_cat_train.groupby(c)['SalePrice'].mean().reset_index()
            hood_price.columns = [c, c + '_mean_price']
            merged_train = X_cat_train.reset_index().merge(
                hood_price, how='left', on=[c]).set_index('index')[c + '_mean_price']
            X_train = pd.concat((X_train, merged_train), axis=1)
            merged_test = X_cat_test.reset_index().merge(
                hood_price, how='left',
                on=[c]).set_index('index')[c + '_mean_price'].fillna(sale_price_mean)
            X_test = pd.concat((X_test, merged_test), axis=1)

        y_train = np.log(y_train)
        y_test = np.log(y_test)
        split = (X_train, y_train, X_test, y_test)

        dtr = DTR(criterion='mse', max_depth=None)
        ada = ensemble.AdaBoostRegressor(base_estimator=dtr, n_estimators=100,
                                         learning_rate=1.0, loss='exponential')
        score = get_score(ada, split)
        ada_scores.append(score)

        ridge = linear_model.Ridge(250)
        score = get_score(ridge, split)
        ridge_scores.append(score)

        gbr = ensemble.GradientBoostingRegressor(n_estimators=100,
                                                 learning_rate=0.1, max_depth=5)
        score = get_score(gbr, split)
        gbr_scores.append(score)

        linear = linear_model.LinearRegression()
        score = get_score(linear, split)
        linear_scores.append(score)

        # is bad:
        # knn = KNR(n_neighbors=10)
        # score = get_score(knn, split)
        # knn_scores.append(score)

        # print(dict(zip(X_train.columns, model.feature_importances_)))
        # print(X_train.columns[np.argpartition(model.feature_importances_, -4)[-4:]])
        # print(model.feature_importances_[np.argpartition(model.feature_importances_, -4)[-4:]])
        # print(model.feature_importances_)
        k += 1

    scores = [ada_scores, ridge_scores, gbr_scores, linear_scores, knn_scores]
    return scores, k