def skcv_knn(coordinates, Xdata, Ydata, number_of_folds, dzradii, k_neighbors, visualization):
    """Evaluate a k-NN regressor with dead-zone cross-validation per radius.

    For every radius in ``dzradii`` the base folds are expanded with
    ``dzfolds``; a ``KNeighborsRegressor`` is fitted on the rows that are not
    in the expanded fold and its predictions for the base fold are collected.
    The collected predictions are then scored against ``Ydata`` with
    ``cindex``.

    Args:
        coordinates: Passed to ``distanceMatrix``; also row-indexed for the
            optional visualization.
        Xdata: Feature rows; indexed and row-deleted with fold indices.
        Ydata: Targets; must expose ``.shape`` and support fold indexing.
        number_of_folds: Fold count handed to ``random_folds`` and to the
            plotting helper.
        dzradii: Sequence of dead-zone radii to evaluate.
        k_neighbors: ``n_neighbors`` for the regressor.
        visualization: When truthy, ``visualize_skcv`` is called per fold.

    Returns:
        Array of shape ``(len(dzradii), 2)``: column 0 holds the radius,
        column 1 the ``cindex`` score for that radius.
    """
    print("Starting skcv-knn analysis...")

    # Sorted pairwise distance matrix and indexes, computed once up front.
    performanceTable = np.zeros((len(dzradii), 2))
    data_distances, data_distance_indexes = distanceMatrix(coordinates)
    folds = random_folds(len(Ydata), number_of_folds)

    for rind, dzradius in enumerate(dzradii):
        largest_radius = dzradii[len(dzradii) - 1]
        print("Analysis ongoing, dead zone radius: " + str(dzradius) + "m / " + str(largest_radius) + "m")

        # Base folds expanded by the dead zone for this radius.
        dz_folds = dzfolds(dzradius, folds, data_distances, data_distance_indexes)

        # Out-of-fold predictions for every sample.
        P = np.zeros(Ydata.shape)
        for fold_id, dz_fold in enumerate(dz_folds):
            train_features = np.delete(Xdata, dz_fold, axis=0)
            train_targets = np.delete(Ydata, dz_fold, axis=0)

            learner = KNeighborsRegressor(n_neighbors=k_neighbors)
            learner.fit(train_features, train_targets)
            preds = learner.predict(Xdata[dz_fold])

            # Only the leading len(fold) predictions are kept; this assumes
            # dzfolds lists the base fold's own indices first -- TODO confirm.
            P[folds[fold_id]] = preds if preds.ndim == 0 else preds[0:len(folds[fold_id])]

            if visualization:
                testcoords = coordinates[folds[fold_id], :]
                dzcoords = coordinates[dz_fold, :]
                visualize_skcv(coordinates, testcoords, dzcoords, dzradius)

        performanceTable[rind, 0] = dzradius
        performanceTable[rind, 1] = cindex(Ydata, P)
        plotRes_skcv(performanceTable, rind, number_of_folds, "K-nn")

    print("Analysis done.")
    return performanceTable
class KNeighborsRegressorImpl:
    """Wrapper that stores k-NN hyperparameters and builds ``SKLModel`` on fit."""

    def __init__(self, n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):
        """Record the hyperparameters; no model is created until ``fit``."""
        self._hyperparams = dict(
            n_neighbors=n_neighbors,
            weights=weights,
            algorithm=algorithm,
            leaf_size=leaf_size,
            p=p,
            metric=metric,
            metric_params=metric_params,
            n_jobs=n_jobs,
        )

    def fit(self, X, y=None):
        """Build a fresh ``SKLModel`` from the stored hyperparameters and fit it.

        ``y`` is forwarded only when it is not ``None``. Returns ``self``.
        """
        estimator = SKLModel(**self._hyperparams)
        self._sklearn_model = estimator
        if y is None:
            estimator.fit(X)
        else:
            estimator.fit(X, y)
        return self

    def predict(self, X):
        """Delegate prediction to the model created by the last ``fit`` call."""
        estimator = self._sklearn_model
        return estimator.predict(X)
def health_smoothing_rad_neighbors(df, health, cols, rad=10):
    """Add a k-NN-smoothed copy of the ``health`` column to ``df``.

    A ``KNeighborsRegressor`` is fitted on ``df[cols]`` against ``df[health]``
    and then evaluated on the same rows; the result is stored in a new column
    named ``health + '-smooth'``.

    Args:
        df: Data frame to read from and to extend in place.
        health: Name of the column to smooth.
        cols: Column selection used as features.
        rad: Passed as ``n_neighbors`` (a neighbour count, despite the name).

    Returns:
        The same ``df`` object, with the extra column added.
    """
    # see R2-plotter for details
    features = df[cols]
    target = df[health]
    smoother = KNeighborsRegressor(n_neighbors=rad).fit(features, target)
    df[health + '-smooth'] = smoother.predict(features)
    return df
def health_smoothing(df, health, cols, rad=10):
    """Smooth ``df[health]`` with k-NN regression over ``df[cols]``.

    The regressor is fitted and evaluated on the same rows, and its output is
    written to a new column called ``health + '-smooth'``.

    Args:
        df: Data frame that is read and modified in place.
        health: Name of the column holding the values to smooth.
        cols: Column selection used as regression features.
        rad: Forwarded as ``n_neighbors`` to the regressor.

    Returns:
        ``df`` itself, now carrying the smoothed column.
    """
    feature_frame = df[cols]
    observed = df[health]
    # Fit and predict on identical rows: this is smoothing, not forecasting.
    regressor = KNeighborsRegressor(n_neighbors=rad).fit(feature_frame, observed)
    smoothed = regressor.predict(feature_frame)
    smoothed_name = health + '-smooth'
    df[smoothed_name] = smoothed
    return df
def fit(self, X, y=None):
    """Create a new ``SKLModel`` from ``self._hyperparams`` and fit it.

    The target ``y`` is passed on only when it is not ``None``.
    Returns ``self`` so calls can be chained.
    """
    estimator = SKLModel(**self._hyperparams)
    self._sklearn_model = estimator
    if y is None:
        estimator.fit(X)
    else:
        estimator.fit(X, y)
    return self
def evalOne(parameters):
    """Score one k-NN configuration across every entry of ``locations``.

    For each location the data is split with ``splitDataForXValidation``, a
    ``KNeighborsRegressor`` is fitted on the training part and its
    predictions for the test part are pooled with those of the other
    locations.

    Args:
        parameters: Mapping with the keys ``"weights"``, ``"neighbors"`` and
            ``"p"``.

    Returns:
        Element ``[1]`` of ``rmseEval(observations, predictions)`` computed
        over the pooled test data.
    """
    observations = []
    predictions = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        model = KNeighborsRegressor(
            weights=parameters["weights"],
            n_neighbors=parameters["neighbors"],
            p=parameters["p"])
        model.fit(trainX, trainY)
        location_prediction = model.predict(testX)
        observations.extend(testY)
        predictions.extend(location_prediction)
    scores = rmseEval(observations, predictions)
    return scores[1]
def trainNearestNeighbors(data, columns, targetColumn, parameters):
    """Fit a k-NN regressor on column-oriented ``data``.

    Args:
        data: Mapping from column name to an indexable sequence of values.
        columns: Candidate column names; ``targetColumn`` is excluded from
            the features.
        targetColumn: Name of the column to predict.
        parameters: Mapping with the keys ``"weights"``, ``"neighbors"`` and
            ``"p"``.

    Returns:
        ``NearestNeighborsModel`` wrapping the fitted regressor together with
        the ordered list of feature columns it was trained on.
    """
    modelColumns = [name for name in columns if name != targetColumn]

    # Pivot the column-oriented mapping into one feature row per sample.
    row_count = len(data[targetColumn])
    modelData = [
        [data[name][row] for name in modelColumns]
        for row in range(row_count)
    ]

    model = KNeighborsRegressor(
        weights=parameters["weights"],
        n_neighbors=parameters["neighbors"],
        p=parameters["p"])
    model.fit(modelData, data[targetColumn])
    return NearestNeighborsModel(model, modelColumns)
def lposcv_knn(coordinates, Xdata, Ydata, dzradii, k, visualization):
    """Evaluate a k-NN regressor with dead-zone pair folds per radius.

    Folds come from ``lpo_folds`` and are expanded with ``dzfolds`` for each
    radius. For every expanded fold a ``KNeighborsRegressor`` is fitted on the
    remaining rows and the first two predictions are compared: 1.0 if the
    first is larger, 0.5 on a tie, 0.0 otherwise. The mean of these scores is
    the performance for the radius.

    Args:
        coordinates: Passed to ``distanceMatrix``; also row-indexed for the
            optional visualization.
        Xdata: Feature rows; indexed and row-deleted with fold indices.
        Ydata: Targets; the number of entries equal to 0 is passed to
            ``lpo_folds``.
        dzradii: Sequence of dead-zone radii to evaluate.
        k: ``n_neighbors`` for the regressor.
        visualization: When truthy, ``visualize_lposcv`` is called per fold.

    Returns:
        Array of shape ``(len(dzradii), 2)``: column 0 holds the radius,
        column 1 the mean pair score.
    """
    print("Starting lposcv-knn analysis...")

    # Sorted pairwise distance matrix and indexes, computed once up front.
    performanceTable = np.zeros((len(dzradii), 2))
    data_distances, data_distance_indexes = distanceMatrix(coordinates)
    zero_positions = np.where(Ydata == 0)[0]
    negcount = len(zero_positions)
    folds = lpo_folds(Ydata, negcount)

    for rind, dzradius in enumerate(dzradii):
        largest_radius = dzradii[len(dzradii) - 1]
        print("Analysis ongoing, dead zone radius: " + str(dzradius) + "m / " + str(largest_radius) + "m")

        # Base folds expanded by the dead zone for this radius.
        dz_folds = dzfolds(dzradius, folds, data_distances, data_distance_indexes)

        perfs = []
        for dz_fold in dz_folds:
            train_features = np.delete(Xdata, dz_fold, axis=0)
            train_targets = np.delete(Ydata, dz_fold, axis=0)

            learner = KNeighborsRegressor(n_neighbors=k)
            learner.fit(train_features, train_targets)
            preds = learner.predict(Xdata[dz_fold])

            # Only the first two predictions (the test pair) are compared.
            first, second = preds[0], preds[1]
            if first > second:
                pair_score = 1.
            elif first == second:
                pair_score = 0.5
            else:
                pair_score = 0.
            perfs.append(pair_score)

            if visualization:
                testcoords = coordinates[dz_fold[0:2], :]
                dzcoords = coordinates[dz_fold, :]
                visualize_lposcv(coordinates, testcoords, dzcoords, dzradius)

        performanceTable[rind, 0] = dzradius
        performanceTable[rind, 1] = np.mean(perfs)
        plotRes_lposcv(performanceTable, rind, len(folds), 'knn')

    print("Analysis done.")
    return performanceTable
def __init__(self, n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):
    """Store the hyperparameters and build the wrapped ``Op`` from them."""
    hyperparams = dict(
        n_neighbors=n_neighbors,
        weights=weights,
        algorithm=algorithm,
        leaf_size=leaf_size,
        p=p,
        metric=metric,
        metric_params=metric_params,
        n_jobs=n_jobs,
    )
    self._hyperparams = hyperparams
    self._wrapped_model = Op(**hyperparams)
# Report test MSE and elapsed time for predictions produced earlier in the
# script (y_pred_lgb, start and end are defined before this point).
print(metrics.mean_squared_error(y_test, y_pred_lgb))
print(end - start)

# Extra-trees baseline. The timer stops before predict(), so the printed
# duration covers fitting only.
start = time.time()
reg = ExtraTreesRegressor(n_estimators=100, max_depth=7, min_samples_leaf=10, n_jobs=8)
reg.fit(X_train, y_train)
end = time.time()
y_pred = reg.predict(X_test)
print(metrics.mean_squared_error(y_test, y_pred))
print(end - start)

# k-NN baseline (4 neighbours, kd-tree search), timed the same way.
start = time.time()
reg = KNeighborsRegressor(n_neighbors=4, algorithm='kd_tree')
reg.fit(X_train, y_train)
end = time.time()
y_pred = reg.predict(X_test)
print(metrics.mean_squared_error(y_test, y_pred))
print(end - start)

# Equal-weight average of two earlier prediction vectors; this overwrites the
# k-NN predictions held in y_pred.
y_pred = (y_pred_lr + y_pred_lgb) / 2
print(metrics.mean_squared_error(y_test, y_pred))


def cross_search(x, y, model, param_grid):
    """Fit a 5-fold ``GridSearchCV`` of ``model`` over ``param_grid`` on ``(x, y)``.

    The search runs with ``n_jobs=8`` and ``verbose=1``.
    """
    from sklearn.model_selection import GridSearchCV
    grid_search = GridSearchCV(model, param_grid, n_jobs=8, verbose=1, cv=5)
    grid_search.fit(x, y)
    # best_parameters = grid_search.best_estimator_.get_params()
from ex30.ex30_lib_graph import plot2
# Import from the public package: the private ``sklearn.neighbors.regression``
# module path is not available in current scikit-learn releases.
from sklearn.neighbors import KNeighborsRegressor

OUTPUT_PNG_FILE = '/experiments/ex30/ex30_knn.png'

# Training inputs: one single-feature sample per integer 0..23.
X = [[float(x)] for x in range(0, 24)]
# Observed value for each of the 24 training inputs.
Y = [
    12.0, 13.0, 13.0, 13.0, 28.0, 31.0, 38.0, 60.0,
    85.0, 80.0, 64.0, 60.0, 59.0, 58.0, 65.0, 70.0,
    80.0, 90.0, 110.0, 100.0, 85.0, 65.0, 45.0, 20.0
]
# Query inputs: 0.0, 0.1, ..., 23.0 (231 points).
X2 = [[float(x) / 10.0] for x in range(0, 231)]

# With n_neighbors=1 every query takes the value of its single closest
# training sample (p=1: Manhattan distance).
model = KNeighborsRegressor(n_neighbors=1, p=1)
model.fit(X, Y)
Y_pred = model.predict(X2)
print(str(Y_pred))

plot2(Y, Y_pred, OUTPUT_PNG_FILE, "Observed pollution concentration levels", "Predicted pollution concentration levels by NNR")
# Try k-nearest-neighbours regression on the wave dataset.
mg.plots.plot_knn_regression(n_neighbors=3)

# Now run it for real.
X, y = mg.datasets.make_wave(n_samples=1024)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
plt.plot(X_train, y_train, '^', label="train", markersize=8)
plt.plot(X_test, y_test, 'v', label="test", markersize=8)
plt.xlabel("Feature")
plt.ylabel("Target")

# Public import path; the private ``sklearn.neighbors.regression`` module is
# not available in current scikit-learn releases.
from sklearn.neighbors import KNeighborsRegressor

X, y = mg.datasets.make_wave(n_samples=1024)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
reg = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)
reg.score(X_test, y_test)  # regression problem, so this returns the R^2 score
# Increasing n_samples makes the trend in the data visible.
# It looks like a sine curve with a linear bias plus random noise.

# Evaluate while varying the k of k-NN.
X, y = mg.datasets.make_wave(n_samples=1024)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# plt.subplots(n_rows, n_cols, figsize=(5,10))
fig, axis = plt.subplots(5, 1, figsize=(8, 16))
# Also track how the score changes.
training_accuracy = []
# Kernel for the Gaussian-process predictor below.
kernel = RBF(length_scale=2.5)

# Fully connected network: four ReLU hidden layers and one linear output unit.
hidden = [256, 128, 64, 32]
# Flat input vector with len(data.stoxx) * WIN entries.
inp = (len(data.stoxx) * WIN,)
model = Sequential()
model.add(Dense(hidden[0], activation='relu', input_shape=inp))
for h in hidden[1:]:
    model.add(Dense(h, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile('adam', 'mse')

# (short label, base regressor) pairs, evaluated in this order.
predictors = [
    ('RF', RandomForestRegressor(n_estimators=250)),
    ('GP', GaussianProcessRegressor(kernel=kernel)),
    ('NN', KNeighborsRegressor(n_neighbors=80)),
    ('NE', KerasPredictor(model, 10, 512, False)),
    ('GB', GradientBoostingRegressor(n_estimators=250))
]

# One figure per entry of data.stoxx; every base regressor is wrapped in a
# WindowedPredictor for that target.
for target in range(0, len(data.stoxx)):
    window = SlidingWindow(WIN, target, LAG)
    plt.figure(num=None, figsize=(10, 5), dpi=200, facecolor='w', edgecolor='k')
    plt.title(
        'Stock: {} from {} to {}'.format(
            data.stoxx[target][0], split, dates[-1]
        )
    )
    i = 0
    for name, base in predictors:
        predictor = WindowedPredictor(base, window, target)
# Each entry: (model label, swept hyper-parameter name, value, estimator).
('Lasso', 'alpha', 1e-5, Lasso(alpha=1e-5)),
('Lasso', 'alpha', 1e-4, Lasso(alpha=1e-4)),
('Lasso', 'alpha', 1e-3, Lasso(alpha=1e-3)),
('Lasso', 'alpha', 1e-2, Lasso(alpha=1e-2)),
('Lasso', 'alpha', 1e-1, Lasso(alpha=1e-1)),
('Lasso', 'alpha', 1.0, Lasso(alpha=1.0)),
# The 1e-6 entry previously built Ridge(alpha=1e-5), duplicating the next
# row; the estimator now matches its recorded value.
('Ridge', 'alpha', 1e-6, Ridge(alpha=1e-6)),
('Ridge', 'alpha', 1e-5, Ridge(alpha=1e-5)),
('Ridge', 'alpha', 1e-4, Ridge(alpha=1e-4)),
('Ridge', 'alpha', 1e-3, Ridge(alpha=1e-3)),
('Ridge', 'alpha', 1e-2, Ridge(alpha=1e-2)),
('Ridge', 'alpha', 1e-1, Ridge(alpha=1e-1)),
('Ridge', 'alpha', 1.0, Ridge(alpha=1.0)),
# k-NN with uniform neighbour weights.
('KNeighborsRegressor', 'n_neighbors', 7, KNeighborsRegressor(n_neighbors=7)),
('KNeighborsRegressor', 'n_neighbors', 6, KNeighborsRegressor(n_neighbors=6)),
('KNeighborsRegressor', 'n_neighbors', 5, KNeighborsRegressor(n_neighbors=5)),
('KNeighborsRegressor', 'n_neighbors', 4, KNeighborsRegressor(n_neighbors=4)),
('KNeighborsRegressor', 'n_neighbors', 3, KNeighborsRegressor(n_neighbors=3)),
('KNeighborsRegressor', 'n_neighbors', 2, KNeighborsRegressor(n_neighbors=2)),
('KNeighborsRegressor', 'n_neighbors', 1, KNeighborsRegressor(n_neighbors=1)),
# k-NN with distance-weighted neighbours.
('KNeighborsRegressor_distance', 'n_neighbors', 7, KNeighborsRegressor(n_neighbors=7, weights='distance')),
('KNeighborsRegressor_distance', 'n_neighbors', 6, KNeighborsRegressor(n_neighbors=6, weights='distance')),
('KNeighborsRegressor_distance', 'n_neighbors', 5, KNeighborsRegressor(n_neighbors=5, weights='distance')),
('KNeighborsRegressor_distance', 'n_neighbors', 4, KNeighborsRegressor(n_neighbors=4, weights='distance')),
('KNeighborsRegressor_distance', 'n_neighbors', 3, KNeighborsRegressor(n_neighbors=3, weights='distance')),
('KNeighborsRegressor_distance', 'n_neighbors', 2, KNeighborsRegressor(n_neighbors=2, weights='distance')),
('KNeighborsRegressor_distance', 'n_neighbors', 1, KNeighborsRegressor(n_neighbors=1, weights='distance')),
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
# Public import path; the private ``sklearn.neighbors.regression`` module is
# not available in current scikit-learn releases.
from sklearn.neighbors import KNeighborsRegressor

# reads and cleans DataFrame
dc_listings = pd.read_csv("dc_airbnb.csv")
dc_listings.rename(columns=lambda col_header: col_header.strip(), inplace=True)
# regex=False: ',' and '$' are literal characters. Left implicit, '$' could be
# read as a regex end-of-string anchor depending on the pandas version.
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')

# Compare how the RMSE estimate and its spread change with the fold count.
num_folds = [3, 5, 7, 9, 10, 11, 13, 15, 17, 19, 21, 23]
for fold in num_folds:
    kf = KFold(fold, shuffle=True, random_state=1)
    model = KNeighborsRegressor()
    mses = cross_val_score(model, dc_listings[["accommodates"]], dc_listings["price"], scoring="neg_mean_squared_error", cv=kf)
    # Scores are negated MSEs; take the magnitude before the square root.
    rmses = np.sqrt(np.absolute(mses))
    avg_rmse = np.mean(rmses)
    std_rmse = np.std(rmses)
    print(str(fold), "folds: ", "avg RMSE: ", str(avg_rmse), "std RMSE: ", str(std_rmse))
# pca = PCA(n_components=10).fit(X_scaled) # scalers.append(pca) # X_pca = pca.transform(X_scaled) # X = X_pca X_train, X_test, y_train, y_test = train_test_split(X, y) # %% Apply models ------------------------------- models = [] # Ridge model = Ridge(alpha=.4).fit(X_train, y_train) models.append(model) # KNeighbors model = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train) models.append(model) # Lasso model = Lasso().fit(X_train, y_train) models.append(model) # GradientBoosting model = GradientBoostingRegressor().fit(X_train, y_train) models.append(model) # RandomForest model = RandomForestRegressor().fit(X_train, y_train) models.append(model) # SVM
# load the data
data = {}
columns = []
loadData("/data/york_hour_2013.csv", ["timestamp", "atc"], data, columns)

# Every loaded column except the target and the grouping key is a feature.
all_features = deepcopy(columns)
all_features.remove("target")
all_features.remove("location")

# The context manager guarantees the CSV is flushed and closed even when a
# fit, predict or write fails part-way through.
with open(OUTPUT_DATA_FILE, 'w') as output:
    output.write("location,observation,prediction\n")

    # One model per location, evaluated on the split returned for it.
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        model = KNeighborsRegressor(weights="distance", n_neighbors=23, p=2.0)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)

        for i in range(len(testY)):
            row = (str(location), str(testY[i]), str(prediction[i]))
            output.write(",".join(row) + "\n")
def _read_user_texts(folder):
    """Return ``(userid, text)`` pairs for every ``*.txt`` file in *folder*.

    Files are visited in ``listdir`` order and decoded as latin-1. The userid
    is the file name with every ``'.txt'`` occurrence removed.
    """
    pairs = []
    for name in listdir(folder):
        if not name.endswith(".txt"):
            continue
        # The context manager closes the handle even if the read fails.
        with io.open(folder + '/' + name, 'r', encoding='latin-1') as handle:
            pairs.append((name.replace('.txt', ''), handle.read()))
    return pairs


def predict_personality(test_path):
    """Predict five personality scores for every user text under ``test_path``.

    Training profiles are read from ``/data/train//Profile/Profile.csv`` and
    training texts from ``/data/train//text``; training users without a
    profile row are skipped. Texts are vectorised with TF-IDF (1-3 grams).
    For each trait the 50 best features are selected with ``f_regression``
    and a 500-neighbour ``KNeighborsRegressor`` is fitted.

    Args:
        test_path: Directory containing a ``text`` sub-folder with one
            ``<userid>.txt`` file per user.

    Returns:
        DataFrame with the columns ``userid``, ``ope``, ``con``, ``ext``,
        ``agr`` and ``neu``, one row per test user.
    """
    train_path = '/data/train/'
    # Single-argument call form: works as statement or function and emits
    # the same bytes as ``print 'train folder is: ', train_path``.
    print('train folder is:  ' + train_path)

    # Column order of the profile CSV traits and of the result frame.
    trait_names = ['ope', 'con', 'ext', 'agr', 'neu']

    # prepare training data
    personality_dic = {}
    profile_path = train_path + '/Profile/Profile.csv'
    with open(profile_path) as csvfile:
        for row in csv.DictReader(csvfile):
            personality_dic[row['userid']] = [float(row[trait]) for trait in trait_names]

    train_texts = []
    train_personalities = []
    for userid, text in _read_user_texts(train_path + '/text'):
        # get rid of users without personality data
        if userid in personality_dic:
            train_texts.append(text)
            train_personalities.append(personality_dic[userid])

    # prepare testing data
    test_userids = []
    test_texts = []
    for userid, text in _read_user_texts(test_path + '/text'):
        test_userids.append(userid)
        test_texts.append(text)

    # extract features from training data using tfidf_vectorizer
    tfidf_vectorizer = TfidfVectorizer(min_df=0.005, max_df=0.3, ngram_range=(1, 3),
                                       stop_words='english', use_idf=True, tokenizer=tokenize)
    trainX = tfidf_vectorizer.fit_transform(train_texts)
    print("Training data: n_samples: %d, n_features: %d" % trainX.shape)

    # extract features from testing data using the same tfidf_vectorizer
    testX = tfidf_vectorizer.transform(test_texts)
    print("Testing data: n_samples: %d, n_features: %d" % testX.shape)

    # get feature names extracted by tfidf_vectorizer
    # NOTE(review): newer scikit-learn releases expose this as
    # get_feature_names_out() -- confirm against the pinned version.
    feature_names = tfidf_vectorizer.get_feature_names()
    print(feature_names)

    result = {'userid': test_userids}
    # Built once; column i holds the scores for trait_names[i].
    targets = np.array(train_personalities)
    nkb_features = 50
    for column, trait in enumerate(trait_names):
        y_train = targets[:, column]

        # reduce features by SelectKBest
        skb = SelectKBest(score_func=f_regression, k=nkb_features)
        X_train = skb.fit_transform(trainX, y_train)
        X_test = skb.transform(testX)

        # train using KNNRegressor
        knn = KNeighborsRegressor(n_neighbors=500)
        knn.fit(X_train, y_train)
        result[trait] = knn.predict(X_test)

    return pd.DataFrame(result, columns=['userid'] + trait_names)
RandomForestRegressor(n_estimators=200, n_jobs=5, random_state=randomstate), ExtraTreesRegressor(n_estimators=200, n_jobs=5, random_state=randomstate), # GradientBoostingRegressor(random_state=randomstate), # learning_rate is a hyper-parameter in the range (0.0, 1.0] # HistGradientBoostingClassifier(random_state=randomstate), # learning_rate is a hyper-parameter in the range (0.0, 1.0] AdaBoostRegressor(n_estimators=200, random_state=randomstate), GaussianProcessRegressor(normalize_y=True), ARDRegression(), # HuberRegressor(), # epsilon: greater than 1.0, default 1.35 LinearRegression(n_jobs=5), PassiveAggressiveRegressor( random_state=randomstate), # C: 0.25, 0.5, 1, 5, 10 SGDRegressor(random_state=randomstate), TheilSenRegressor(n_jobs=5, random_state=randomstate), RANSACRegressor(random_state=randomstate), KNeighborsRegressor( weights='distance'), # n_neighbors: 3, 6, 9, 12, 15, 20 RadiusNeighborsRegressor(weights='distance'), # radius: 1, 2, 5, 10, 15 MLPRegressor(max_iter=10000000, random_state=randomstate), DecisionTreeRegressor( random_state=randomstate), # max_depth = 2, 3, 4, 6, 8 ExtraTreeRegressor(random_state=randomstate), # max_depth = 2, 3, 4, 6, 8 SVR() # C: 0.25, 0.5, 1, 5, 10 ] selectors = [ reliefF.reliefF, fisher_score.fisher_score, # chi_square.chi_square, JMI.jmi, CIFE.cife, DISR.disr,
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
# Public import path; the private ``sklearn.neighbors.regression`` module is
# not available in current scikit-learn releases.
from sklearn.neighbors import KNeighborsRegressor

# reads and cleans DataFrame
dc_listings = pd.read_csv("dc_airbnb.csv")
dc_listings.rename(columns=lambda col_header: col_header.strip(), inplace=True)
# regex=False: ',' and '$' are literal characters. Left implicit, '$' could be
# read as a regex end-of-string anchor depending on the pandas version.
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')

# 5-fold shuffled cross-validation of a default k-NN regressor on a single
# feature column.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
knn = KNeighborsRegressor()
mses = cross_val_score(estimator=knn, X=dc_listings[['accommodates']], y=dc_listings['price'], scoring="neg_mean_squared_error", cv=kf)

# Scores are negated MSEs; take the magnitude before the square root.
# mses_abs_sqrt = [x ** 0.5 for x in abs(mses)]
mses_abs_sqrt = np.sqrt(np.absolute(mses))
avg_rmse = np.mean(mses_abs_sqrt)
print('mses_abs_sqrt:', mses_abs_sqrt)
print('avg_rmse:', avg_rmse)
# mses_abs_sqrt: [ 117.38752023 137.27049219 146.0660213 106.37023494 145.75598127]
# avg_rmse: 130.570049986
# Class name -> instance built with default constructor arguments.
# NOTE(review): these look like scikit-learn estimators; GraphLasso,
# GraphLassoCV, Imputer and LSHForest presumably no longer exist under these
# names in recent releases -- confirm against the pinned version.
'GaussianProcessClassifier':GaussianProcessClassifier(),
'GaussianProcessRegressor':GaussianProcessRegressor(),
'GaussianRandomProjection':GaussianRandomProjection(),
'GenericUnivariateSelect':GenericUnivariateSelect(),
'GradientBoostingClassifier':GradientBoostingClassifier(),
'GradientBoostingRegressor':GradientBoostingRegressor(),
'GraphLasso':GraphLasso(),
'GraphLassoCV':GraphLassoCV(),
'HuberRegressor':HuberRegressor(),
'Imputer':Imputer(),
'IncrementalPCA':IncrementalPCA(),
'IsolationForest':IsolationForest(),
'Isomap':Isomap(),
'KMeans':KMeans(),
'KNeighborsClassifier':KNeighborsClassifier(),
'KNeighborsRegressor':KNeighborsRegressor(),
'KernelCenterer':KernelCenterer(),
'KernelDensity':KernelDensity(),
'KernelPCA':KernelPCA(),
'KernelRidge':KernelRidge(),
'LSHForest':LSHForest(),
'LabelPropagation':LabelPropagation(),
'LabelSpreading':LabelSpreading(),
'Lars':Lars(),
'LarsCV':LarsCV(),
'Lasso':Lasso(),
'LassoCV':LassoCV(),
'LassoLars':LassoLars(),
'LassoLarsCV':LassoLarsCV(),
'LassoLarsIC':LassoLarsIC(),
'LatentDirichletAllocation':LatentDirichletAllocation(),