def fun_knn_fs(x, *args):
    """Objective function for a KNN regressor with optional feature selection.

    Parameters
    ----------
    x : sequence of float
        Decision vector. ``x[0]`` is rounded to ``n_neighbors``, ``x[1]`` is
        rounded to an index into the weight schemes (0 -> 'uniform',
        1 -> 'distance') and ``x[2]`` is rounded to the estimator's ``p``
        parameter. Any entries from ``x[3]`` onwards are per-feature flags:
        a column of ``X`` is kept when its flag is greater than 0.5. When
        ``len(x) <= 3`` every column is kept.
    *args : tuple
        ``(X, y, flag, n_splits, random_seed)``. ``X`` must be 2-D.

    Returns
    -------
    float or dict
        When ``flag == 'eval'``: the cross-validated error computed by
        ``RMSE`` (``1e12`` if the evaluation raised). Otherwise the estimator
        is fitted on the selected columns and a dict describing the run is
        returned.
    """
    X, y, flag, n_splits, random_seed = args
    _, n_var = X.shape

    weight_schemes = {
        0: 'uniform',
        1: 'distance',
    }
    p = {
        'p': int(round(x[2])),
        'n_neighbors': int(round(x[0])),
        'weights': weight_schemes[int(round(x[1]))],
    }
    clf = KNeighborsRegressor()
    clf.set_params(**p)

    if len(x) <= 3:
        # Only hyper-parameters were supplied: use every feature.
        mask = np.ones(n_var)
    else:
        # The first three entries are hyper-parameters, so the feature flags
        # start at index 3. (Slicing from index 2 would treat the `p`
        # hyper-parameter as the flag of the first feature and shift the
        # whole mask by one column.)
        mask = np.asarray(x[3:])
    # Kept as the tuple returned by np.where: X[:, ft] then has an extra
    # length-1 axis, which the .squeeze() calls below remove.
    ft = np.where(mask > 0.5)

    try:
        cv = KFold(n_splits=n_splits, shuffle=True,
                   random_state=int(random_seed))
        y_p = cross_val_predict(clf, X[:, ft].squeeze(), y, cv=cv, n_jobs=1)
        r = RMSE(y_p, y)
    except Exception:
        # Best-effort on purpose: an infeasible candidate gets a large
        # penalty instead of aborting the caller. KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        y_p = [None]
        r = 1e12

    if flag == 'eval':
        return r

    clf.fit(X[:, ft].squeeze(), y)
    return {
        'Y_TRUE': y,
        'Y_PRED': y_p,
        'EST_PARAMS': p,
        'PARAMS': x,
        'EST_NAME': 'KNN',
        'ESTIMATOR': clf,
        'ACTIVE_VAR': ft,
        'DATA': X,
        'SEED': random_seed,
    }
def knn(x_train, y_train, x_val, y_val, x_test):
    """Sweep a fixed list of neighbour counts and collect predictions.

    For every neighbour count the regressor is first fitted on the training
    split and used to predict the validation split. It is then re-fitted on
    the concatenation of training and validation data and used to predict
    both that combined set and the test set.

    Returns
    -------
    tuple of three lists
        ``(validation predictions, train+val predictions, test predictions)``,
        each holding one entry per neighbour count, in sweep order.
    """
    n_neighbors = [2, *range(5, 81, 5), 120]
    print('KNN')
    print(n_neighbors)
    print('Total Neighbors %i' % (len(n_neighbors)))

    print("Train VS Val")
    model = KNeighborsRegressor(algorithm='kd_tree', leaf_size=30,
                                weights='uniform', n_jobs=-1)

    def _refit(k, features, targets):
        # Reuse the single estimator instance: update k, then fit again.
        model.set_params(n_neighbors=k)
        model.fit(features, targets)
        return model

    y_val_predicted_list = [
        _refit(k, x_train, y_train).predict(x_val) for k in n_neighbors
    ]

    print("Train + Val VS Train + Val")
    x_train_val = np.concatenate((x_train, x_val), axis=0)
    y_train_val = np.concatenate((y_train, y_val), axis=0)

    y_train_val_predicted_list = []
    y_test_predicted_list = []
    for k in n_neighbors:
        fitted = _refit(k, x_train_val, y_train_val)
        y_train_val_predicted_list.append(fitted.predict(x_train_val))
        # Train + Val VS Test
        y_test_predicted_list.append(fitted.predict(x_test))

    return (y_val_predicted_list, y_train_val_predicted_list,
            y_test_predicted_list)
def KNN_regression(X_train, y_train, X_test, params):
    """Fit a KNN regressor and predict ``X_test``.

    The tuning strategy is selected by the name ``hyperparameters``, which is
    not a parameter of this function and is resolved from the enclosing
    scope (NOTE(review): confirm where it is defined).

    * ``'RandomGridSearch'`` - randomized search (5 candidates, 3-fold CV).
    * ``'GridSearch'`` - exhaustive search over the same grid (3-fold CV).
    * ``'Custom'`` - use ``params`` as given; score with shuffled 3-fold CV.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and targets.
    X_test : array-like
        Samples to predict.
    params : dict or None
        Estimator parameters, used only in ``'Custom'`` mode. ``None`` means
        estimator defaults.

    Returns
    -------
    tuple
        ``(predicted, validation_score)``. For the two search modes the score
        is the optimizer's ``best_score_`` (a scalar); in ``'Custom'`` mode it
        is the array of per-fold scores returned by ``cross_val_score``.
        Scoring is ``'neg_mean_absolute_error'`` in every mode.

    Raises
    ------
    ValueError
        If ``hyperparameters`` is not one of the three supported modes.
    """
    scoring = 'neg_mean_absolute_error'

    if hyperparameters in ('RandomGridSearch', 'GridSearch'):
        # Candidate grid shared by both search strategies.
        param_grid = {
            'weights': ['uniform', 'distance'],
            'n_neighbors': [2, 5, 10, 15, 20],
            'algorithm': ['auto', 'kd_tree'],
        }
        estimator = KNeighborsRegressor()
        # The `iid` keyword is intentionally not passed: 'deprecated' was
        # already its default value, and the keyword was removed from
        # scikit-learn in 0.24, where passing it raises TypeError.
        if hyperparameters == 'RandomGridSearch':
            optimizer = RandomizedSearchCV(estimator, param_grid, n_iter=5,
                                           cv=3, scoring=scoring)
        else:
            optimizer = GridSearchCV(estimator, param_grid, cv=3,
                                     scoring=scoring)
        optimizer.fit(X_train, y_train)
        regression = optimizer.best_estimator_
        predicted = regression.predict(X_test)
        validation_score = optimizer.best_score_
    elif hyperparameters == 'Custom':
        estimator = KNeighborsRegressor()
        # Apply the requested parameters (None -> keep defaults).
        estimator.set_params(**(params or {}))

        # Cross-validated check of the chosen parameters.
        fold = KFold(n_splits=3, shuffle=True)
        validation_score = cross_val_score(estimator=estimator, X=X_train,
                                           y=y_train, cv=fold,
                                           scoring=scoring)

        # Final model is trained on all of the training data.
        estimator.fit(X_train, np.ravel(y_train))
        predicted = estimator.predict(X_test)
    else:
        raise ValueError(
            "Unsupported hyperparameters mode: %r" % (hyperparameters,))

    return (predicted, validation_score)
def fit(self, Xs, y):
    r"""
    Fit the regressor object to the data in Xs, y.

    Two estimators (one per view) are first fitted on the labeled samples.
    Then, for at most ``self.num_iter`` rounds, each unlabeled sample in a
    working pool is scored per view by how much adding it (with the view's
    own prediction as its label) lowers the mean squared error on that
    sample's nearest labeled neighbors. The best positively-scoring sample
    of each view is handed to the *other* view's labeled set, the pool is
    refilled, and both estimators are refitted.

    Parameters
    ----------
    Xs : list of array-likes or numpy.ndarray
        - Xs length: n_views
        - Xs[i] shape: (n_samples, n_features_i)
        A list of the different views of data to train on.

    y : array, shape (n_samples,)
        The target values of the training data.
        Unlabeled examples should have label np.nan.

    Returns
    -------
    self : returns an instance of self
    """

    # Validate the inputs: Xs may contain NaN, and Xs and y must be
    # consistent with each other.
    Xs, y = check_Xs_y_nan_allowed(Xs, y, multiview=True,
                                   enforce_views=self.n_views)

    y = np.array(y)

    # Only the first two views are used.
    X1 = Xs[0]
    X2 = Xs[1]

    # Indexes of the unlabeled samples (label is NaN).
    U = [i[0] for i in enumerate(y) if np.isnan(i[1])]

    # Indexes of the labeled samples.
    L = [i[0] for i in enumerate(y) if not np.isnan(i[1])]

    # One label vector per view, so pseudo-labels can be written into
    # them without altering the original labels.
    y1 = y.copy()
    y2 = y.copy()

    # Per-view lists of the indexes currently treated as labeled.
    L1 = L.copy()
    L2 = L.copy()

    # Initial fit of both estimators on the labeled data.
    self.estimator1_.fit(X1[L1], y1[L1])
    self.estimator2_.fit(X2[L2], y2[L2])

    # Number of co-training iterations performed so far.
    it = 0

    # Randomly chosen indexes of unlabeled samples forming the working pool.
    unlabeled_pool = random.sample(U, min(len(U), self.unlabeled_pool_size))

    # Remove the samples that were moved into the pool from U.
    U = [i for i in U if i not in unlabeled_pool]

    while it < self.num_iter and unlabeled_pool:
        it += 1

        # k nearest neighbors of every pooled sample, per view. The
        # returned indexes are positions within the data each estimator
        # was last fitted on (X1[L1] / X2[L2]).
        neighbors1 = self.estimator1_.kneighbors(
            X1[unlabeled_pool],
            n_neighbors=self.k_neighbors_,
            return_distance=False)
        neighbors2 = self.estimator2_.kneighbors(
            X2[unlabeled_pool],
            n_neighbors=self.k_neighbors_,
            return_distance=False)

        # Per-view improvement score (MSE before minus MSE after) for
        # every pooled sample.
        delta1 = []
        delta2 = []

        for i, (u, neigh) in enumerate(zip(unlabeled_pool, neighbors1)):

            # Copy of L1 that additionally contains the unlabeled index.
            new_L1 = L1.copy()
            new_L1.append(u)

            # Predict the value of the unlabeled sample with view 1.
            pred = self.estimator1_.predict(np.expand_dims(X1[u], axis=0))

            # Copy of the labels with the prediction used as pseudo-label.
            new_y1 = y1.copy()
            new_y1[u] = pred

            # Predictions on the neighbors before including the sample.
            pred_before_inc = []
            pred_before_inc = self.estimator1_.predict((X1[L1])[neigh])

            # Fresh estimator trained on the enlarged labeled set, ...
            new_estimator = KNeighborsRegressor()
            # ... configured with the same parameters as estimator1.
            new_estimator.set_params(**self.estimator1_.get_params())
            new_estimator.fit(X1[new_L1], new_y1[new_L1])

            # Predictions on the neighbors after including the sample.
            pred_after_inc = []
            pred_after_inc = new_estimator.predict((X1[L1])[neigh])

            mse_before_inc = mean_squared_error((y1[L1])[neigh],
                                                pred_before_inc)
            mse_after_inc = mean_squared_error((y1[L1])[neigh],
                                               pred_after_inc)

            # Positive value: including the sample reduced the error.
            delta1.append(mse_before_inc - mse_after_inc)

        for i, (u, neigh) in enumerate(zip(unlabeled_pool, neighbors2)):

            # Copy of L2 that additionally contains the unlabeled index.
            new_L2 = L2.copy()
            new_L2.append(u)

            # Predict the value of the unlabeled sample with view 2.
            pred_before_inc = []
            pred = self.estimator2_.predict(np.expand_dims(X2[u], axis=0))

            # Copy of the labels with the prediction used as pseudo-label.
            new_y2 = y2.copy()
            new_y2[u] = pred

            # Predictions on the neighbors before including the sample.
            pred_before_inc = self.estimator2_.predict((X2[L2])[neigh])

            # Fresh estimator trained on the enlarged labeled set, ...
            new_estimator = KNeighborsRegressor()
            # ... configured with the same parameters as estimator2.
            new_estimator.set_params(**self.estimator2_.get_params())
            new_estimator.fit(X2[new_L2], new_y2[new_L2])

            # Predictions on the neighbors after including the sample.
            pred_after_inc = []
            pred_after_inc = new_estimator.predict((X2[L2])[neigh])

            mse_before_inc = mean_squared_error((y2[L2])[neigh],
                                                pred_before_inc)
            mse_after_inc = mean_squared_error((y2[L2])[neigh],
                                               pred_after_inc)

            # Positive value: including the sample reduced the error.
            delta2.append(mse_before_inc - mse_after_inc)

        # Pool positions sorted by ascending delta (best candidate last).
        delta1_index = np.argsort(delta1)
        delta2_index = np.argsort(delta2)

        # Pool positions selected for inclusion into L1 / L2.
        to_include1 = []
        to_include2 = []

        """
        If the length of both the delta's is equal to 1 then include
        the corresponding index whose value is positive and greater than
        the other values.
        Else selecting the indexes which have postive and maximum value
        from each delta's and incase both the indexes are equal then look
        at the second best positive value.
        The indexes which are selected from delta1 will be added to the
        labels of the estimator2 object. Similarly, the indexes which are
        selected from delta2 will be added to the labels of the
        estimator1 object.
        """
        if delta1_index.shape[0] == 1 and delta2_index.shape[0] == 1:
            # Single candidate in the pool: it can go to one view only.
            if delta1[0] > 0 and delta2[0] > 0:
                if delta1[0] >= delta2[0]:
                    L2.append(unlabeled_pool[0])
                    to_include2.append(0)
                else:
                    L1.append(unlabeled_pool[0])
                    to_include1.append(0)
            elif delta1[0] > 0:
                L2.append(unlabeled_pool[0])
                to_include2.append(0)
            elif delta2[0] > 0:
                L1.append(unlabeled_pool[0])
                to_include1.append(0)
        else:
            # Top two pool positions from each delta.
            index1_1, index1_2 = delta1_index[-1], delta1_index[-2]
            index2_1, index2_2 = delta2_index[-1], delta2_index[-2]

            if index1_1 != index2_1:
                # The views prefer different samples: take both, provided
                # each actually improves its view.
                if delta1[index1_1] > 0:
                    L2.append(unlabeled_pool[index1_1])
                    to_include2.append(index1_1)
                if delta2[index2_1] > 0:
                    L1.append(unlabeled_pool[index2_1])
                    to_include1.append(index2_1)
            else:
                # Both views prefer the same sample: the view with the
                # larger delta keeps it, the other falls back to its
                # second-best candidate if that one is positive.
                if delta1[index1_1] > 0 and delta2[index2_1] > 0:
                    if delta1[index1_1] >= delta2[index2_1]:
                        L2.append(unlabeled_pool[index1_1])
                        to_include2.append(index1_1)
                        if delta2[index2_2] > 0:
                            L1.append(unlabeled_pool[index2_2])
                            to_include1.append(index2_2)
                    else:
                        L1.append(unlabeled_pool[index2_1])
                        to_include1.append(index2_1)
                        if delta1[index1_2] > 0:
                            L2.append(unlabeled_pool[index1_2])
                            to_include2.append(index1_2)
                elif delta1[index1_1] > 0:
                    L2.append(unlabeled_pool[index1_1])
                    to_include2.append(index1_1)
                elif delta2[index2_1] > 0:
                    L1.append(unlabeled_pool[index2_1])
                    to_include1.append(index2_1)

        # Stop when neither view found a sample that improves it.
        if len(to_include1) == 0 and len(to_include2) == 0:
            break

        # Samples joining L1 are pseudo-labeled by estimator2 ...
        for i in to_include1:
            pred = self.estimator2_.predict(
                np.expand_dims(X2[unlabeled_pool[i]], axis=0))
            y1[unlabeled_pool[i]] = pred

        # ... and samples joining L2 are pseudo-labeled by estimator1.
        for i in to_include2:
            pred = self.estimator1_.predict(
                np.expand_dims(X1[unlabeled_pool[i]], axis=0))
            y2[unlabeled_pool[i]] = pred

        # Currently to_include contains the index of unlabeled samples
        # in the order in which they are stored in unlabeled_pool.
        # Convert them to the sample indexes which unlabeled_pool stores:
        # example unlabeled_pool = [10, 15, 17]
        # current to_include = [1, 2]
        # updated to_include = [15, 17]
        to_include1 = [unlabeled_pool[i] for i in to_include1]
        to_include2 = [unlabeled_pool[i] for i in to_include2]

        # Remove the selected samples from the pool.
        unlabeled_pool = [
            u for u in unlabeled_pool
            if (u not in to_include1) and (u not in to_include2)
        ]

        # Replenish the pool from U until it is back at its target size.
        for u in U:
            if len(unlabeled_pool) < self.unlabeled_pool_size:
                if u not in unlabeled_pool:
                    unlabeled_pool.append(u)
            else:
                break

        # Drop from U everything that is now in the pool.
        U = [i for i in U if i not in unlabeled_pool]

        # Refit both estimators on the enlarged labeled sets.
        self.estimator1_.fit(X1[L1], y1[L1])
        self.estimator2_.fit(X2[L2], y2[L2])

    return self
# Hold out 20% of the samples for evaluation; the seed is fixed so the
# split is reproducible.
X_knn_train, X_knn_test, y_knn_train, y_knn_test = train_test_split(
    knn_x,
    knn_y,
    test_size=0.20,
    random_state=10,
)

# Baseline model with 15 neighbours.
knn = KNeighborsRegressor(n_neighbors=15).fit(X_knn_train, y_knn_train)
# NOTE(review): for a regressor, score() is documented as R^2 rather than a
# classification accuracy - confirm the wording of the printed label.
accuracy_knn = knn.score(X_knn_test, y_knn_test)
print('Accuracy of knn is : ' + str(np.round(accuracy_knn*100, 2)) + '%')

# Re-fit the same estimator for k = 1..9 and record its test score each time.
n_neig = np.arange(1, 10)
n_scores = []
for i in n_neig:
    n_scores.append(
        knn.set_params(n_neighbors=i)
        .fit(X_knn_train, y_knn_train)
        .score(X_knn_test, y_knn_test)
    )

# Plot score against the number of neighbours.
plt.figure(figsize=(10, 8))
plt.xlabel("Number of Neighbors K")
plt.ylabel("Score")
plt.title("With different Estimators")
plt.plot(
    n_neig,
    n_scores,
    color='green',
    marker='o',
    linestyle='dashed',
    linewidth=2,
    markersize=12,
)

from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
# Find the mean accuracy of knn regression using X_test and y_test model.fit(X_train, y_train) # In[36]: # Calculate the mean accuracy of the KNN model accuracy = model.score(X_test, y_test) 'Accuracy: ' + str(np.round(accuracy * 100, 2)) + '%' # In[37]: # Try different numbers of n_estimators - this will take a minute or so n_neighbors = np.arange(1, 20, 1) scores = [] for n in n_neighbors: model.set_params(n_neighbors=n) model.fit(X_train, y_train) scores.append(model.score(X_test, y_test)) plt.figure(figsize=(7, 5)) plt.title("Effect of Estimators") plt.xlabel("Number of Neighbors K") plt.ylabel("Score") plt.plot(n_neighbors, scores) # In[38]: # The RandomForestRegressor # In[39]: model = RandomForestRegressor(n_jobs=-1)
def knn_regression(X_train, y_train, X_test, params, use_cv: bool = True):
    """Fit a KNN regressor and predict ``X_test``.

    With ``use_cv=False`` the model is fitted once and scored on the training
    data. Otherwise the tuning strategy is selected by the name
    ``hyperparameters``, which is not a parameter of this function and is
    resolved from the enclosing scope (NOTE(review): confirm where it is
    defined):

    * ``'RandomGridSearch'`` - randomized search (5 candidates, 3-fold CV).
    * ``'GridSearch'`` - exhaustive search over the same grid (3-fold CV).
    * ``'Custom'`` - use ``params`` as given; score with shuffled 3-fold CV.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and targets.
    X_test : array-like
        Samples to predict.
    params : dict or None
        Estimator parameters, used when ``use_cv`` is False or in
        ``'Custom'`` mode. ``None`` means estimator defaults.
    use_cv : bool, default True
        Pass exactly ``False`` when there are not enough points for
        cross-validation.

    Returns
    -------
    tuple
        ``(predicted, validation_score)``. The score is the training-set mean
        absolute error (positive) when ``use_cv`` is False, the optimizer's
        ``best_score_`` (negated MAE, scalar) for the two search modes, and
        the array of per-fold negated-MAE scores in ``'Custom'`` mode.

    Raises
    ------
    ValueError
        If ``hyperparameters`` is not one of the three supported modes.
    """
    # Not enough points for cross-validation: single fit, score on train.
    if use_cv is False:
        if params is None:
            model = KNeighborsRegressor()
        else:
            model = KNeighborsRegressor(**params)
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        train_predicted = model.predict(X_train)
        validation_score = mean_absolute_error(
            np.ravel(y_train), np.ravel(train_predicted))
        return predicted, validation_score

    scoring = 'neg_mean_absolute_error'

    if hyperparameters in ('RandomGridSearch', 'GridSearch'):
        # Candidate grid shared by both search strategies.
        param_grid = {
            'weights': ['uniform', 'distance'],
            'n_neighbors': [2, 5, 10, 15, 20],
            'algorithm': ['auto', 'kd_tree'],
        }
        estimator = KNeighborsRegressor()
        # The `iid` keyword is intentionally not passed: 'deprecated' was
        # already its default value, and the keyword was removed from
        # scikit-learn in 0.24, where passing it raises TypeError.
        if hyperparameters == 'RandomGridSearch':
            optimizer = RandomizedSearchCV(
                estimator, param_grid, n_iter=5, cv=3, scoring=scoring)
        else:
            optimizer = GridSearchCV(
                estimator, param_grid, cv=3, scoring=scoring)
        optimizer.fit(X_train, y_train)
        regression = optimizer.best_estimator_
        predicted = regression.predict(X_test)
        validation_score = optimizer.best_score_
    elif hyperparameters == 'Custom':
        estimator = KNeighborsRegressor()
        # Apply the requested parameters; None keeps the defaults, matching
        # the non-CV path above.
        estimator.set_params(**(params or {}))

        # Cross-validated check of the chosen parameters.
        fold = KFold(n_splits=3, shuffle=True)
        validation_score = cross_val_score(
            estimator=estimator, X=X_train, y=y_train, cv=fold,
            scoring=scoring)

        # Final model is trained on all of the training data.
        estimator.fit(X_train, np.ravel(y_train))
        predicted = estimator.predict(X_test)
    else:
        raise ValueError(
            "Unsupported hyperparameters mode: %r" % (hyperparameters,))

    return predicted, validation_score
# Load the recorded data; 'xk' and 'uk' are the arrays read from the file.
mat = loadmat(rfp)
mat  # bare expression: echoes the loaded dict when run as a notebook cell

# Inputs are the first four columns of 'xk'; targets are columns 6 onwards.
p_act = np.array(mat['xk'][:, :4])
p_ref = np.array(mat['uk'])
u_v = np.array(mat['xk'][:, 6:])

# First 75% of the rows are used for fitting, the remainder for prediction.
split = int(0.75 * len(p_act))

norm_p = normalize(p_act)
norm_u_v = normalize(u_v)

# Available option values; only the first entry of each list is used below.
weights = ['uniform', 'distance']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']

kn = KNeighborsRegressor()
kn.set_params(
    n_neighbors=2,
    weights=weights[0],
    algorithm=algorithm[0],
    p=1,
)
kn.fit(norm_p[:split], norm_u_v[:split])
prediction = kn.predict(norm_p[split:])

# First target column: actual vs. predicted.
plt.plot(norm_u_v[split:, 0])
plt.plot(prediction[:, 0])
plt.grid()
# NOTE(review): four labels are supplied but only two lines are drawn on
# this figure - confirm whether the 'v' entries are intended here.
plt.legend(['actual u', 'predicted u', 'actual v', 'predicted v'])
plt.show()

# Second target column: actual vs. predicted.
plt.plot(norm_u_v[split:, 1])
plt.plot(prediction[:, 1])
plt.legend(['actual v', 'predicted v'])
plt.show()
# Fragment: n_neighbors, x_train, y_train, x_val and y_val are defined
# outside this span.
print(n_neighbors)
print("Total Neighbors")
print(len(n_neighbors))
# Result accumulators; only rmse_val is filled within this span.
coefs = []
rmse_val = []
rmse_test = []
print("Train VS Val")
# A single estimator instance is reused; only n_neighbors changes per step.
neigh = KNeighborsRegressor(algorithm='kd_tree', leaf_size=30, weights='uniform', n_jobs=-1)
for a in n_neighbors:
    neigh.set_params(n_neighbors=a)
    neigh.fit(x_train, y_train)
    # coefs.append(neigh.coef_)
    # Validation RMSE for this neighbour count.
    y_val_predicted = neigh.predict(x_val)
    rmse = sqrt(mean_squared_error(y_val, y_val_predicted))
    rmse_val.append(rmse)
    print('RMSE KNN %.3f N. Neighbours: %i' % (rmse, a))
print("Train + Val VS Test")
# Merge training and validation data for the second sweep.
x_train_val = np.concatenate((x_train, x_val), axis=0)
y_train_val = np.concatenate((y_train, y_val), axis=0)
for a in n_neighbors:
    neigh.set_params(n_neighbors=a)
    neigh.fit(x_train_val, y_train_val)
    # coefs.append(neigh.coef_)
    # NOTE(review): nothing is predicted or stored after this fit within the
    # visible span - the loop body may continue beyond it.
# KNN model: 75% / 25% train-test split (fixed seed), 15 neighbours.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10,
)
model = KNeighborsRegressor(n_neighbors=15).fit(X_train, y_train)
print('\n-KNN Model Description-\n', model, '\n')

# Score on the held-out split.
# NOTE(review): for a regressor, score() is documented as R^2 rather than a
# classification accuracy - confirm the wording of the printed label.
accuracy = model.score(X_test, y_test)
# This expression builds a percentage string but its value is discarded.
'Accuracy: ' + str(np.round(accuracy * 100, 2)) + '%'
print('-Accuracy-\n', accuracy)

# Re-fit the same estimator for k = 1..19 and record its test score.
n_neighbors = np.arange(1, 20)
scores = []
for n in n_neighbors:
    scores.append(
        model.set_params(n_neighbors=n)
        .fit(X_train, y_train)
        .score(X_test, y_test)
    )

# Plot score against the number of neighbours.
plt.figure(figsize=(7, 5))
plt.xlabel("Number of Neighbors K")
plt.ylabel("Score")
plt.title("Effect of Estimators")
plt.plot(n_neighbors, scores)
plt.show()
class KNN(object):
    """Thin wrapper around scikit-learn's k-nearest-neighbours estimators.

    Parameters
    ----------
    task_type : {"cla", "reg"}
        "cla" wraps ``KNeighborsClassifier``; "reg" wraps
        ``KNeighborsRegressor``.
    module_type : {"debug", "performance", "balance", None}
        Controls the ``n_jobs`` value handed to the estimator: 1 for
        "debug", ``cpu_count()`` for "performance", ``cpu_count() // 2`` for
        "balance" and ``None`` otherwise. Any ``n_jobs`` passed in ``params``
        is overwritten by this choice.
    **params
        Optional estimator settings. Recognised keys and their defaults:
        ``n_neighbors`` (5), ``weights`` ('uniform'), ``algorithm`` ('auto'),
        ``leaf_size`` (30), ``p`` (2), ``metric`` ('minkowski') and
        ``metric_params`` (None). Other keys are ignored.
    """

    def __init__(self, task_type="cla", module_type="performance", **params):
        assert task_type in ["cla", "reg"]  # the two supported task types
        # the three performance modes (or None)
        assert module_type in ["balance", "debug", "performance", None]
        self.module_type = module_type
        if self.module_type == "debug":
            params["n_jobs"] = 1
        elif self.module_type == "performance":  # performance mode
            params["n_jobs"] = cpu_count()  # number of CPU cores
        elif self.module_type == "balance":  # balanced mode
            params["n_jobs"] = cpu_count() // 2
        else:
            params["n_jobs"] = None
        self.task_type = task_type

        # weights: {"uniform", "distance", None}; "uniform" is the default.
        #   uniform  - all neighbours weighted equally
        #   distance - neighbours weighted by the inverse of their distance
        # algorithm: {"auto", "ball_tree", "kd_tree", "brute", None}
        #   ball_tree - use the BallTree algorithm
        #   kd_tree   - use the KDTree algorithm
        #   brute     - use brute-force search
        # p: int, default 2; power parameter of the Minkowski metric.
        #   p=1 corresponds to manhattan_distance (l1) and p=2 to
        #   euclidean_distance (l2); any other p uses minkowski_distance (l_p).
        model_kwargs = dict(
            n_neighbors=params.get("n_neighbors", 5),
            weights=params.get("weights", 'uniform'),
            algorithm=params.get("algorithm", 'auto'),
            leaf_size=params.get("leaf_size", 30),  # leaf size
            p=params.get("p", 2),
            metric=params.get("metric", 'minkowski'),
            metric_params=params.get("metric_params", None),
            n_jobs=params.get("n_jobs", None),  # number of parallel jobs
        )
        # Both estimators take the same keyword set, so only the class varies.
        if self.task_type == "cla":
            self.model = KNeighborsClassifier(**model_kwargs)
        else:
            self.model = KNeighborsRegressor(**model_kwargs)

    def fit(self, x, y=None):
        """Fit the wrapped estimator on ``x`` / ``y``."""
        self.model.fit(X=x, y=y)

    def get_params(self):
        """Return the wrapped estimator's parameters (``deep=True``)."""
        return self.model.get_params(deep=True)

    def set_params(self, params):
        """Update the wrapped estimator from the ``params`` dict."""
        self.model.set_params(**params)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.model.predict(X=x)

    def predict_proba(self, x):
        """Return class probabilities for ``x``.

        Raises
        ------
        ValueError
            If the wrapper was created for a regression task.
        """
        if self.task_type == "cla":
            return self.model.predict_proba(X=x)
        # The exception used to be constructed but never raised, so
        # regression tasks silently received None.
        raise ValueError("回归任务无法使用")

    def get_score(self, x, y, sample_weight=None):
        """Return the wrapped estimator's ``score`` on ``x`` / ``y``."""
        return self.model.score(X=x, y=y, sample_weight=sample_weight)

    def search_kneighbors(self, x=None, n_neighbors=None,
                          return_distance=True):
        """Find the k nearest neighbours via the estimator's ``kneighbors``."""
        return self.model.kneighbors(X=x, n_neighbors=n_neighbors,
                                     return_distance=return_distance)

    def get_kneighbors_graph(self, x=None, n_neighbors=None,
                             mode='connectivity'):
        """Return the nearest-neighbour graph.

        :param x: query points passed as ``X``
        :param n_neighbors: number of neighbours per sample
        :param mode: "distance" or "connectivity"
        :return: result of the estimator's ``kneighbors_graph``
        """
        return self.model.kneighbors_graph(X=x, n_neighbors=n_neighbors,
                                           mode=mode)