def KNN():
    """Fit a distance-weighted 10-NN regressor and report CV score, params and RMSE.

    Relies on module-level objects: the data splits Xtrain/Ytrain and
    Xvalid/Yvalid, and the CV splitter `kf`.  Prints everything; returns None.
    """
    print("--------------------KNeighbors-------------------------")
    neigh = KNeighborsRegressor(n_neighbors=10, weights='distance', n_jobs=-1)
    # Cross-validate on the validation split only (mirrors the original flow).
    valiN = cross_val_score(neigh, Xvalid, Yvalid, cv=kf, verbose=1, n_jobs=-1)
    print(valiN.mean())
    neigh.fit(Xtrain, Ytrain)
    print(neigh.get_params())
    print('Score:', neigh.score(Xvalid, Yvalid))
    Y = neigh.predict(Xvalid)
    # Bug fix: `squared=False` yields the ROOT mean squared error, yet the
    # label said "MSE".  Compute RMSE portably (the `squared` kwarg is
    # deprecated/removed in recent scikit-learn) and label it correctly.
    print("RMSE:", mean_squared_error(Yvalid, Y) ** 0.5)
def surrogateKNN(Xarchive, Farchive, X, file_loc, file_loc_general, toUpdate, first_iter=False, problem='LABS'):
    """Train a KNN surrogate on the evaluated archive and predict fitness for X.

    Parameters:
        Xarchive: archive of evaluated points, one point per column (transposed
            to samples-per-row before fitting).
        Farchive: fitness values matching Xarchive's columns.
        X: candidate points to score, one per column.
        file_loc, toUpdate, first_iter: unused here; kept for interface
            compatibility with sibling surrogate functions.
        file_loc_general: directory in which the surrogate configuration is
            recorded once per run.
        problem: problem name forwarded to the SMAC debug path.

    Returns:
        Predicted fitness values for the columns of X.
    """
    Xnew = Xarchive.T
    X_pred = X.T

    # Debug switch: dump the training data, run SMAC hyper-parameter tuning,
    # print the result and abort the run.
    SMAC = False
    if SMAC:
        with open("/home/naamah/Documents/CatES/result_All/X1.p", "wb") as fp:
            pickle.dump(Xnew, fp)
        with open("/home/naamah/Documents/CatES/result_All/F1.p", "wb") as fp:
            pickle.dump(Farchive, fp)
        anf = smac_KNN.main_loop(problem)
        print("SMAC {}".format(anf))
        sys.exit("Error message")

    # Hyper-parameters found by SMAC for LABS (configuration KNN_LABS_444).
    neigh = KNeighborsRegressor(n_neighbors=9, algorithm="ball_tree", p=1,
                                weights="distance", leaf_size=60)

    # Record the surrogate configuration once per run.  (The redundant
    # file.close() inside the `with` block was removed -- the context manager
    # already closes the file.)
    if not os.path.exists(file_loc_general + "/surrogate_configuration"):
        with open(file_loc_general + "/surrogate_configuration", 'a') as file:
            file.write("clf:\n{}\n\nTuning Algorithem: {} ".format(
                neigh.get_params(), "smac"))

    neigh.fit(Xnew, Farchive)
    F_pred = neigh.predict(X_pred)
    return F_pred
def knn_regression(df, significant_cols, target, cat_cols, num_cols):
    """Grid-search, cross-validate and hold-out-evaluate a KNN regressor.

    Categorical columns are one-hot encoded (first level dropped) and numeric
    columns standardized, both fitted on the training split only.

    Returns:
        (r2, rmse, r2_variance, rmse_variance, r2_validation,
         rmse_validation, params) -- CV means/variances, hold-out scores and
        the tuned estimator's parameters.
    """
    scaler = StandardScaler()
    encoder = OneHotEncoder(drop='first', sparse=False)

    features = df[significant_cols]
    labels = df[target]

    # Search k up to 10% of the dataset size, both weighting schemes, and
    # Manhattan vs Euclidean distance.
    search_space = {
        'n_neighbors': np.arange(5, int(features.shape[0] * 0.1)),
        'weights': ['uniform', 'distance'],
        'p': [1, 2],
    }

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.1, random_state=0)

    # Fit the transformers on the training split, then apply to both splits.
    train_data = np.c_[encoder.fit_transform(X_train[cat_cols]),
                       scaler.fit_transform(X_train[num_cols])]
    test_data = np.c_[encoder.transform(X_test[cat_cols]),
                      scaler.transform(X_test[num_cols])]

    search = GridSearchCV(KNeighborsRegressor(n_jobs=-1), search_space,
                          scoring='r2', cv=3)
    search.fit(train_data, y_train)
    best_knn = search.best_estimator_

    # Re-score the tuned model with 3-fold CV on the training data.
    r2_cv_scores = cross_val_score(best_knn, train_data, y_train,
                                   scoring='r2', cv=3, n_jobs=-1)
    rmse_cv_scores = cross_val_score(best_knn, train_data, y_train,
                                     scoring='neg_root_mean_squared_error',
                                     cv=3, n_jobs=-1)

    params = best_knn.get_params()
    r2 = np.mean(r2_cv_scores)
    rmse = np.abs(np.mean(rmse_cv_scores))  # neg_rmse -> positive magnitude
    r2_variance = np.var(r2_cv_scores, ddof=1)
    rmse_variance = np.abs(np.var(rmse_cv_scores, ddof=1))

    # Final fit on the full training split, then hold-out validation.
    best_knn.fit(train_data, y_train)
    y_pred = best_knn.predict(test_data)
    r2_validation = r2_score(y_test, y_pred)
    rmse_validation = np.sqrt(mean_squared_error(y_test, y_pred))

    return (r2, rmse, r2_variance, rmse_variance,
            r2_validation, rmse_validation, params)
data, target, test_size=0.10, random_state=42)  # NOTE(review): closes a split call (presumably train_test_split) whose opener is above this chunk
# drop column 'name'
x_train = x_train_with_names.drop('name', axis=1)
x_test = x_test_with_names.drop('name', axis=1)
# parameters
n_neighbors = 5
# building the model
neigh = KNeighborsRegressor(n_neighbors=n_neighbors)
# training the model
neigh.fit(x_train, y_train)
# looking at the methods
get_params = neigh.get_params()  # returns the parameters of the model
kneighbours = neigh.kneighbors(
    x_test.head(1), n_neighbors=n_neighbors
)  # the first array gives the distance between the new data point and the k neighbours, and the second array gives the sample number of the k neighbours
kneighbours_graph = neigh.kneighbors_graph(
    x_test.head(1), n_neighbors=n_neighbors, mode='distance'
)  # returns a sparce matrix for the k neighbours for the new data points
prediction_array = neigh.predict(x_test)  # predicted test values
# NOTE(review): for a regressor, .score() returns R^2, not "mean accuracy" --
# the comments and message below overstate it; confirm the intended metric.
train_score = neigh.score(x_train, y_train)  # the mean auuracy of the training dataset
test_score = neigh.score(x_test, y_test)  # the mean acccuracy for the test dataset
print(
    'The mean accuracy of the train dataset is: %.3f and the mean accuracy of the test dataset is: %.3f'
    % (train_score, test_score))
class KNearestNeighbor:
    """KNN regressor wrapper with optional cross-validated hyper-parameter search.

    Fix notes:
      * The class-level `model = KNeighborsRegressor()` attribute was removed:
        it constructed an estimator at import time for no benefit, and
        __init__ always rebinds self.model anyway.
      * `kneighbors` was missing `self`, so every call raised NameError.
    """

    def __init__(self, X, Y, neighbors=[1, 5, 20], weights='uniform', algorithm='auto', n_jobs=1):
        """
        X, Y: training data.
        neighbors: number of neighbors; if a list, 5-fold cross-validation is
            used to pick the best value.  (The mutable default is intentional:
            a list default triggers the CV search and is never mutated.)
        weights: one of ['uniform', 'distance']; if a list, 5-fold
            cross-validation picks the best value.
        algorithm: one of ['auto', 'ball_tree', 'kd_tree', 'brute'].
        n_jobs: number of parallel jobs for the neighbor search; -1 means one
            per CPU core.  Does not affect the fit method.
        """
        if isinstance(neighbors, list) or isinstance(weights, list):
            neighbors, weights, optimal_err = self.cross_validation(
                X, Y, neighbors, weights, algorithm, n_jobs)
        self.model = KNeighborsRegressor(n_neighbors=neighbors, weights=weights,
                                         algorithm=algorithm, n_jobs=n_jobs)
        self.model.fit(X, Y)

    def evaluate(self, X, Y):
        """Print and return (MSE, accuracy); accuracy after 0.5-thresholding.

        NOTE: mutates Y_pred in place only (the caller's data is untouched).
        """
        Y_pred = self.predict(X)
        mse = mean_squared_error(Y, Y_pred)
        # Binarize at 0.5 so the regression output can be scored as a
        # binary classification.
        Y_pred[Y_pred > 0.5] = 1
        Y_pred[Y_pred <= 0.5] = 0
        acs = accuracy_score(Y, Y_pred)
        print("\n%s: %.4f" % ("MSE", mse))
        print("%s: %.2f%%" % ("Accuracy", acs * 100))
        return mse, acs

    def predict(self, X):
        """Delegate to the fitted model's predict."""
        return self.model.predict(X)

    def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
        """Find the k nearest neighbors of X (delegates to the fitted model).

        Bug fix: `self` was missing from the signature, so the instance was
        bound to X and the body's `self.model` raised NameError on any call.
        """
        return self.model.kneighbors(X=X, n_neighbors=n_neighbors,
                                     return_distance=return_distance)

    def cross_validation(self, X, Y, neighbors, weights, algorithm, n_jobs, cv=5):
        """Grid-search (k, weights) by cv-fold MSE.

        Returns (optimal_k, optimal_weights, minimal_cv_mse).
        """
        if not isinstance(neighbors, list):
            neighbors = [neighbors]
        if not isinstance(weights, list):
            weights = [weights]
        cv_scores = []
        param = []
        for k in neighbors:
            for w in weights:
                self.model = KNeighborsRegressor(n_neighbors=k, weights=w,
                                                 algorithm=algorithm,
                                                 n_jobs=n_jobs)
                scores = cross_val_score(self.model, X, Y, cv=cv,
                                         scoring='neg_mean_squared_error')
                # Negate: sklearn reports negative MSE; we minimize plain MSE.
                cv_scores.append(-1 * scores.mean())
                param.append([k, w])
        optimal_p = param[cv_scores.index(min(cv_scores))]
        optimal_k = optimal_p[0]
        optimal_weights = optimal_p[1]
        print("Optimal value for neighbors is: %i" % (optimal_k))
        print("Optimal weight method is: %s" % (optimal_weights))
        return optimal_k, optimal_weights, min(cv_scores)

    def get_params(self, deep=True):
        """Delegate to the fitted model's get_params."""
        return self.model.get_params(deep)
print('CATB Best params: ', catb_cv_model.best_params_) # Final model print('CATB Baslangic zamani: ', datetime.now()) catb_tuned = CatBoostRegressor (**catb_cv_model.best_params_).fit(X_train, y_train) # ??? print('CATB Bitis zamani: ', datetime.now()) y_pred = catb_tuned.predict(X_test) print('CATB tuned: ', np.sqrt(mean_squared_error(y_test, y_pred))) # KNN Modelling and Prediction RMSE = [] knn_model = KNeighborsRegressor().fit(X_train, y_train) knn_model.get_params() y_pred = knn_model.predict(X_test) print('KNN Base: ', np.sqrt(mean_squared_error(y_test, y_pred))) for k in range(20): k += 2 knn_model = KNeighborsRegressor(n_neighbors = k).fit(X_train, y_train) y_pred = knn_model.predict(X_test) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) RMSE.append(rmse) print('KNN for k = ', k, ' RMSE value: ', rmse) # Find optimum k value with GridSearch knn_params = {'n_neighbors': np.arange(2, 30, 1)} knn_model = KNeighborsRegressor()
# находятся линейные модели, в neighbors - методы основанные на ближайших # соседях. from sklearn.neighbors import KNeighborsRegressor # Импортируем алгоритм knn из sklearn. Работа с алгоритмами машинного обучения # в библиотеке состоит из трех этапов. # Создание объекта, который будет реализовывать алгоритм. # Вызов fit: обучение модели на тренировочной подвыборке # Вызов predict: получение предсказаний на тестовой выборке knn = KNeighborsRegressor(n_neighbors=5, weights='uniform', p=2)#p - указывает # на метрику близости. р = 2 - евклидово расстояниеб р = 1 - манхэттенское # KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski', # metric_params=None, n_jobs=None, n_neighbors=5, p=2, # weights='uniform') knn.fit(X_train, y_train)# обучаем модель print(knn.get_params()) predictions = knn.predict(X_test)# получаем предсказания # Посчитаем метрику, соответствующая функция есть в scikit-learn! Будет считать # средне квадратичную ошибку, так как мы решаем задачу регрессии. from sklearn.metrics import mean_squared_error print(mean_squared_error(y_test, predictions)) # Давайте попробуем сделать лучше! У нашего алгоритма есть множество # гиперпараметров: количество соседей, параметры метрики и веса. Запустим поиск # по сетке гиперараметров, алгоритм переберет все возможные комбинации, # посчитает метрику для каждого набора и выдаст лучший набор. from sklearn.model_selection import GridSearchCV grid_searcher = GridSearchCV(KNeighborsRegressor(), param_grid={'n_neighbors': range(1, 40, 2), 'weights': ['uniform', 'distance'], 'p': [1, 2, 3]}, cv=5)
def surrogateKNN(Xarchive, Farchive, X, file_loc, file_loc_general, toUpdate, first_iter=False, problem='Pump', knn_parm=None):
    """Train a KNN surrogate on the evaluated archive and predict fitness for X.

    Parameters:
        Xarchive: archive of evaluated points, one per column (transposed to
            samples-per-row before fitting).
        Farchive: fitness values matching Xarchive's columns.
        X: candidate points to score, one per column.
        file_loc, toUpdate, first_iter: unused here; kept for interface
            compatibility with sibling surrogate functions.
        file_loc_general: directory in which the surrogate configuration is
            recorded once per run.
        problem: selects per-problem tuned hyper-parameters when knn_parm is
            not supplied ('Pump', 'NKL', otherwise QAP).
        knn_parm: optional dict of explicit KNeighborsRegressor parameters
            (keys: n_neighbors, algorithm, p, weights, leaf_size).

    Returns:
        Predicted fitness values for the columns of X.
    """
    Xnew = Xarchive.T
    X_pred = X.T

    # Debug switch: dump the training data, run SMAC tuning, then abort.
    SMAC = False
    if SMAC:
        with open("/home/naamah/Documents/CatES/result_All/X1.p", "wb") as fp:
            pickle.dump(Xnew, fp)
        with open("/home/naamah/Documents/CatES/result_All/F1.p", "wb") as fp:
            pickle.dump(Farchive, fp)
        anf = smac_KNN.main_loop(problem)
        print("SMAC {}".format(anf))
        sys.exit("Error message")

    # Fixed: `knn_parm == None` replaced with the idiomatic identity test.
    if knn_parm is None:
        # Per-problem hyper-parameters found by irace.
        if problem == "Pump":
            neigh = KNeighborsRegressor(n_neighbors=10, algorithm="ball_tree",
                                        p=3, weights="distance", leaf_size=10)
        elif problem == "NKL":
            neigh = KNeighborsRegressor(n_neighbors=9, algorithm="auto", p=1,
                                        weights="distance")
        else:  # problem == "QAP"
            neigh = KNeighborsRegressor(n_neighbors=10, algorithm="auto", p=3,
                                        weights="uniform", leaf_size=98)
    else:
        neigh = KNeighborsRegressor(n_neighbors=knn_parm.get("n_neighbors"),
                                    algorithm=knn_parm.get("algorithm"),
                                    p=knn_parm.get("p"),
                                    weights=knn_parm.get("weights"),
                                    leaf_size=knn_parm.get("leaf_size"))

    # Record the surrogate configuration once per run.  (The redundant
    # file.close() inside the `with` block was removed -- the context manager
    # already closes the file.)
    if not os.path.exists(file_loc_general + "/surrogate_configuration"):
        with open(file_loc_general + "/surrogate_configuration", 'a') as file:
            file.write("clf:\n{}\n\nTuning Algorithem: {} ".format(
                neigh.get_params(), "irace"))

    neigh.fit(Xnew, Farchive)
    F_pred = neigh.predict(X_pred)
    return F_pred
# Plot the validation MAE curve recorded during training.
plt.plot(list(evres['validation_1']['mae']))
plt.title('Model mae')
plt.ylabel('mae')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
#plt.savefig("Keras_NN_Accuracy.png")
plt.show()
plt.clf()
#print(trainbst.evals_result())

print("####################KNN")
# Distance-weighted 10-NN regressor, cross-validated on the validation split
# with the module-level splitter `kf`, then fitted on the training split.
neigh = KNeighborsRegressor(n_neighbors=10, weights='distance', n_jobs=-1)
valiN = cross_val_score(neigh, Xvalid, Yvalid, cv=kf, verbose=1, n_jobs=-1)
print(valiN.mean())
neigh.fit(Xtrain, Ytrain)
print(neigh.get_params())
print(neigh.score(Xvalid, Yvalid))

# Disabled RMSE plotting block:
# plt.plot(list(evres['validation_0']['rmse']))
# plt.plot(list(evres['validation_1']['rmse']))
# plt.title('Model rmse')
# plt.ylabel('rmse')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Test'], loc='upper left')
# #plt.savefig("Keras_NN_Accuracy.png")
# plt.show()
# plt.clf()

# NOTE(review): the triple-quoted string below disables the SVM section; its
# closing quotes lie past the end of this chunk.
"""
print("##################SVM")
s=SVR(verbose=True)
s.fit(Xtrain,Ytrain)
print(s.score(Xvalid,Yvalid))
class KNN(object):
    """Wrapper around sklearn's KNN classifier/regressor with a parallelism policy.

    task_type selects KNeighborsClassifier ("cla") or KNeighborsRegressor
    ("reg"); module_type maps to an n_jobs setting.
    """

    def __init__(self, task_type="cla", module_type="performance", **params):
        assert task_type in ["cla", "reg"]  # two task types
        assert module_type in ["balance", "debug", "performance", None]  # three performance modes
        self.module_type = module_type
        # Map the module type to a parallelism level.
        if self.module_type == "debug":
            params["n_jobs"] = 1
        elif self.module_type == "performance":  # performance mode
            params["n_jobs"] = cpu_count()       # all CPU cores
        elif self.module_type == "balance":      # balanced mode
            params["n_jobs"] = cpu_count() // 2  # half the cores
        else:
            params["n_jobs"] = None
        self.task_type = task_type
        # weights: {"uniform", "distance", None}; default "uniform".
        #   "uniform"  -> equal weight per neighbor
        #   "distance" -> weight by inverse distance
        # algorithm: {"auto", "ball_tree", "kd_tree", "brute", None}
        #   "ball_tree" -> BallTree, "kd_tree" -> KDTree, "brute" -> brute search
        # p: int, default 2 (Minkowski power parameter);
        #   p=1 -> manhattan_distance (l1), p=2 -> euclidean_distance (l2),
        #   arbitrary p -> minkowski_distance (l_p).
        if self.task_type == "cla":
            self.model = KNeighborsClassifier(
                n_neighbors=params.get("n_neighbors", 5),
                weights=params.get("weights", 'uniform'),
                algorithm=params.get("algorithm", 'auto'),
                leaf_size=params.get("leaf_size", 30),  # leaf size
                p=params.get("p", 2),
                metric=params.get("metric", 'minkowski'),
                metric_params=params.get("metric_params", None),
                n_jobs=params.get("n_jobs", None)  # parallel jobs
            )
        else:
            self.model = KNeighborsRegressor(
                n_neighbors=params.get("n_neighbors", 5),
                weights=params.get("weights", 'uniform'),
                algorithm=params.get("algorithm", 'auto'),
                leaf_size=params.get("leaf_size", 30),
                p=params.get("p", 2),
                metric=params.get("metric", 'minkowski'),
                metric_params=params.get("metric_params", None),
                n_jobs=params.get("n_jobs", None))

    def fit(self, x, y=None):
        self.model.fit(X=x, y=y)

    def get_params(self):
        return self.model.get_params(deep=True)

    def set_params(self, params):
        self.model.set_params(**params)

    def predict(self, x):
        return self.model.predict(X=x)

    def predict_proba(self, x):
        """Class probabilities; only valid for the classification task."""
        if self.task_type == "cla":
            return self.model.predict_proba(X=x)
        else:
            # Bug fix: the ValueError was constructed but never raised, so the
            # regression path silently returned None.
            raise ValueError("回归任务无法使用")

    def get_score(self, x, y, sample_weight):
        return self.model.score(X=x, y=y, sample_weight=sample_weight)

    def search_kneighbors(self, x=None, n_neighbors=None, return_distance=True):
        """Find the K nearest neighbors."""
        return self.model.kneighbors(X=x, n_neighbors=n_neighbors,
                                     return_distance=return_distance)

    def get_kneighbors_graph(self, x=None, n_neighbors=None, mode='connectivity'):
        """Graph of the nearest neighbors.

        :param x: query points
        :param n_neighbors: neighbors per point
        :param mode: "distance" or "connectivity"
        :return: sparse neighbor graph
        """
        return self.model.kneighbors_graph(X=x, n_neighbors=n_neighbors,
                                           mode=mode)