def test_blob_classification_numpy(self):
    """
    Tests kNN for classification using randomly-generated points drawn
    from Gaussian-shaped clusters. Splits the data into training and
    testing sets.
    """
    k = 3
    X, y = generate_cluster_samples()
    train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y)

    knn = KNN(k)
    knn.fit(train_X, train_y)
    pred_y = knn.predict_numpy(test_X)

    # verify shape of output
    self.assertEqual(len(pred_y.shape), 1)
    self.assertEqual(pred_y.shape[0], test_X.shape[0])

    # the clusters are well separated, so accuracy should be perfect
    accuracy = accuracy_score(test_y, pred_y)
    self.assertAlmostEqual(accuracy, 1.0)
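# `generate_cluster_samples` is not shown anywhere in these snippets; a
# minimal sketch of what it presumably does, using sklearn's make_blobs.
# The sample count, cluster count, and spread are assumptions, chosen so
# that the accuracy assertion above can hold:
from sklearn.datasets import make_blobs

def generate_cluster_samples(n_samples=300, n_clusters=3, seed=0):
    # well-separated Gaussian blobs, so kNN can reach 100% test accuracy
    X, y = make_blobs(n_samples=n_samples, centers=n_clusters,
                      cluster_std=0.5, random_state=seed)
    return X, y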
def main():
    K = [1, 3]

    # load the KC1 data reduced with LVQ3
    data = pd.read_csv('./lvq_output/seed_137/kc1_lvq3.csv')
    X, Y = data.drop(columns=['defects']), data['defects']

    # normalize data
    # X = normalize_data(X)

    # create k-fold splits
    kf = KFold(n_splits=10)

    # instantiate classifier
    for k in K:
        clf = KNN(k=k)
        print("k equals {}".format(k))

        start_time = time.time()
        acc = []
        for train, test in kf.split(X):
            clf.fit(X.iloc[train], Y.iloc[train])
            predictions = clf.predict(X.iloc[test])
            acc.append((np.sum(predictions == Y.iloc[test]) / len(test)) * 100)
        end_time = time.time()

        acc = np.array(acc)
        print("mean accuracy: {}".format(np.mean(acc)))
        print("standard deviation: {}".format(np.std(acc)))
        print("time elapsed: {}".format(end_time - start_time))
def test_synthetic_data(self):
    """
    Test KNN.predict using some synthetic data.
    """
    x_train = np.array([[1, 2], [1, 3], [2, 2], [2, 3], [1, 1], [2, 1]])
    y_train = np.array([1, 1, 1, -1, -1, -1])
    model = KNN(k=3)
    model.fit(x_train, y_train)

    x_test = np.array([
        [1.8, 2.6],
        [2.0, 1.8],
        [1.5, 2.0],
        [1.0, 2.5],
        [1.5, 1.0],
        [2.0, 1.0],
    ])
    pred = model.predict(x_test)
    self.assertTrue(np.array_equal(pred, np.array([1, 1, 1, 1, -1, -1])))

    # one label should change when using 1-NN
    model.k = 1
    pred2 = model.predict(x_test)
    self.assertTrue(np.array_equal(pred2, np.array([-1, 1, 1, 1, -1, -1])))
def test_knn_regression():
    # fuzz test against sklearn's KNeighborsRegressor; runs until interrupted
    while True:
        N = np.random.randint(2, 100)
        M = np.random.randint(2, 100)
        k = np.random.randint(1, N)
        ls = np.min([np.random.randint(1, 10), N - 1])
        weights = np.random.choice(["uniform", "distance"])

        X = np.random.rand(N, M)
        X_test = np.random.rand(N, M)
        y = np.random.rand(N)

        knn = KNN(k=k, leaf_size=ls, metric=euclidean,
                  classifier=False, weights=weights)
        knn.fit(X, y)
        preds = knn.predict(X_test)

        gold = KNeighborsRegressor(
            p=2,
            leaf_size=ls,
            n_neighbors=k,
            weights=weights,
            metric="minkowski",
            algorithm="ball_tree",
        )
        gold.fit(X, y)
        gold_preds = gold.predict(X_test)

        for mine, theirs in zip(preds, gold_preds):
            np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
def main():
    trainSet = pd.read_csv('datasets/train_set.csv',
                           converters={'Trajectory': literal_eval})
    testSet = pd.read_csv('datasets/test_set_a2.csv',
                          converters={'Trajectory': literal_eval})

    # encode labels for the categories
    le = preprocessing.LabelEncoder()
    categoryIds = le.fit_transform(trainSet['journeyPatternId'])

    allSequences = []
    for trainIndex, trainRow in trainSet.iterrows():
        allSequences.append(trainRow['Trajectory'])

    # initialize KNN classifier
    clf = KNN(5, DTW)
    crossValidation(clf, allSequences, categoryIds, le)
    clf.fit(allSequences, categoryIds)

    # predict the categories for the testSet
    predIds = clf.predict(testSet['Trajectory'])
    predCategs = le.inverse_transform(predIds)
    writeInCsv(predCategs)
def foo(k_num=5, distance=distance_metric(p=1)):
    _data = [[i[0], i[3]] for i in data]

    # Split the data into train and test parts
    # train_d, train_l, test_d, test_l = tt_split(_data, label)
    train_d, train_l, test_d, test_l = (
        _data[0:30] + _data[50:80] + _data[100:130],
        label[0:30] + label[50:80] + label[100:130],
        _data[30:50] + _data[80:100] + _data[130:],
        label[30:50] + label[80:100] + label[130:])

    # Initialize the KNN object
    knn = KNN(neighbors_num=k_num, distance=distance)
    # Fill the data in KNN
    knn.fit(train_d, train_l)
    # Take prediction from KNN
    result = knn.predict(test_d)

    # Print the results on screen as data, real label, predicted label.
    # print("%20s - %20s | %20s | %s" % ("[Data]", "<Real Label>", "<Predicted Label>", "Truth"))
    n = 0
    for i, j, r in zip(test_d, test_l, result):
        truthness = j == r
        if truthness:
            n += 1
        # print("%20s - %20s | %20s | %s" % (i, j, r, truthness))
    # print("Acc:", n / len(test_d))
    return n / len(test_d), n, len(test_d)
class Stacking():

    def __init__(self):
        pass

    def fit(self, X, y):
        # fit the three base models and collect their training predictions
        self.rf = RandomForest(num_trees=15, max_depth=np.inf)
        self.rf.fit(X, y)
        y_rf = self.rf.predict(X)

        self.nb = NaiveBayes()
        self.nb.fit(X, y)
        y_nb = self.nb.predict(X)

        self.knn = KNN(k=3)
        self.knn.fit(X, y)
        y_knn = self.knn.predict(X)

        # stack the base-model predictions as features for the meta-learner
        newX = np.array([y_rf, y_nb, y_knn]).transpose()
        model = DecisionTree(max_depth=np.inf,
                             stump_class=DecisionStumpErrorRate)
        self.model = model
        model.fit(newX, y)

    def predict(self, X):
        y_rf = self.rf.predict(X)
        y_nb = self.nb.predict(X)
        y_knn = self.knn.predict(X)
        x_test = np.array([y_rf, y_nb, y_knn]).transpose()
        return self.model.predict(x_test)
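# A hypothetical end-to-end use of the Stacking class above (X_train,
# y_train, X_test, y_test and the base-model classes are assumed to be
# available from the surrounding project):
model = Stacking()
model.fit(X_train, y_train)
y_hat = model.predict(X_test)
print("stacking test error: %.3f" % np.mean(y_hat != y_test))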
def test_knn_clf():
    # fuzz test against sklearn's KNeighborsClassifier; runs until interrupted
    while True:
        N = np.random.randint(2, 100)
        M = np.random.randint(2, 100)
        k = np.random.randint(1, N)
        # at least two classes, so the label generation below cannot fail
        n_classes = np.random.randint(2, 10)
        ls = np.min([np.random.randint(1, 10), N - 1])
        weights = "uniform"

        X = np.random.rand(N, M)
        X_test = np.random.rand(N, M)
        y = np.random.randint(0, n_classes, size=N)

        knn = KNN(k=k, leaf_size=ls, metric=euclidean,
                  classifier=True, weights=weights)
        knn.fit(X, y)
        preds = knn.predict(X_test)

        gold = KNeighborsClassifier(
            p=2,
            leaf_size=ls,
            n_neighbors=k,
            weights=weights,
            metric="minkowski",
            algorithm="ball_tree",
        )
        gold.fit(X, y)
        gold_preds = gold.predict(X_test)

        for mine, theirs in zip(preds, gold_preds):
            np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
def fit(self, X, y):
    # instantiate the input models
    rf = RandomForest(num_trees=15)
    knn = KNN(k=3)
    nb = NaiveBayes(num_classes=2)

    # Random Forest fit and predict
    rf.create_splits(X)
    rf.fit(X, y)
    rf_pred = rf.predict(X)

    # K-Nearest Neighbors fit and predict
    knn.fit(X, y)
    knn_pred = knn.predict(X)

    # Naive Bayes fit and predict
    nb.fit(X, y)
    nb_pred = nb.predict(X)

    # use predictions from the input models as inputs for the meta-classifier
    meta_input = np.hstack((rf_pred.reshape((rf_pred.size, 1)),
                            knn_pred.reshape((knn_pred.size, 1)),
                            nb_pred.reshape((nb_pred.size, 1))))

    # use a Decision Tree as the meta-classifier
    dt = DecisionTree(max_depth=np.inf)
    dt.fit(meta_input, y)

    self.rf = rf
    self.knn = knn
    self.nb = nb
    self.meta_classifier = dt
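# The enclosing class is not shown here; a minimal sketch of the matching
# predict, mirroring Stacking.predict above. The stored attributes come
# from this fit; the method name and everything else is an assumption:
def predict(self, X):
    # rebuild the meta-features from the fitted base models
    rf_pred = self.rf.predict(X)
    knn_pred = self.knn.predict(X)
    nb_pred = self.nb.predict(X)
    meta_input = np.hstack((rf_pred.reshape((rf_pred.size, 1)),
                            knn_pred.reshape((knn_pred.size, 1)),
                            nb_pred.reshape((nb_pred.size, 1))))
    return self.meta_classifier.predict(meta_input)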
def eval_model(case):
    l, k = case
    results = {'precision': [], 'recall': [], 'f1': []}
    model = KNN(l, k)
    for i in range(folds):
        print(l, k, 'cross validation', i)
        training, testing = split_data(corpus, i, folds)
        print(l, k, 'fit model', i)
        model.fit([d.vector for d in training], [d.label for d in training])
        print(l, k, 'predict', i)
        preds = [model.predict(d.vector) for d in testing]
        labels = [d.label for d in testing]
        metrics = model_metrics(labels, preds)
        for m, key in zip(metrics, ['precision', 'recall', 'f1']):
            results[key].append(m)
    print(l, k, mean(results['precision']), mean(results['recall']),
          mean(results['f1']))
    return results
def knn(corpus, idf):
    query = read_folder('./query')
    tf_idf(query, idf)

    print('fit KNN model')
    classifier = KNN(5, 5)
    classifier.fit([d.vector for d in corpus], corpus)

    start_time = time.time()
    for i, d in enumerate(query):
        print('Query Doc', i)
        print(d.features)
        # neighbors = classifier.brute_force(d.vector)
        neighbors = classifier.neighbors(d.vector)
        print('Query Neighbors', i)
        for n in neighbors:
            print(n.features)
            print('\n')
        print('\n')
    print("--- %s seconds ---" % (time.time() - start_time))
def knn_validate(data, kernel, metric, k_neighbors, show_plot):
    plot = Plot()
    matrix_full = [[0, 0], [0, 0]]
    y_predict_arr = []

    # each index takes a turn as the held-out test point (leave-one-out)
    for i in range(len(data)):
        data.updateTrainTest(i)
        trainDots, trainClass = data.getDotsByMode('train', False)
        testDots, testClass = data.getDotsByMode('test', False)

        knn = KNN(kernel=kernel, metric=metric, neighbors=k_neighbors)
        knn.fit(trainDots, trainClass)
        y_predict, distance = knn.predict(testDots)
        y_predict_arr.append(y_predict[0])

        if show_plot:
            tDots = np.array(trainDots)
            tCls = np.array(trainClass)
            plot.knn(tDots[tCls == 1.0], tDots[tCls == -1.0],
                     distance, testDots[0], y_predict[0])

        # accumulate the confusion matrix across iterations
        matrix = get_metrics(y_predict, testClass)
        matrix_full[0][0] += matrix[0][0]
        matrix_full[0][1] += matrix[0][1]
        matrix_full[1][0] += matrix[1][0]
        matrix_full[1][1] += matrix[1][1]

    return y_predict_arr, get_f_measure(matrix_full), matrix_full
def main():
    K = [1, 2, 3, 5, 7, 9, 11, 13, 15]

    # load CM1
    data = arff.loadarff('./datasets/CM1.arff')
    X, Y = build_dataframe(data)

    # normalize data
    X = normalize_data(X)

    # create k-fold splits
    kf = KFold(n_splits=10)

    # instantiate classifier
    for k in K:
        clf = KNN(k=k)
        print("k equals {}".format(k))

        start_time = time.time()
        acc = []
        for train, test in kf.split(X):
            clf.fit(X.iloc[train], Y.iloc[train])
            predictions = clf.predict(X.iloc[test])
            acc.append((np.sum(predictions == Y.iloc[test]) / len(test)) * 100)
        end_time = time.time()

        acc = np.array(acc)
        print("mean accuracy: {}".format(np.mean(acc)))
        print("standard deviation: {}".format(np.std(acc)))
        print("time elapsed: {}".format(end_time - start_time))
def test_iris_regression(self):
    """
    Tests kNN for regression.
    """
    k = 1
    iris_dataset = load_iris()
    knn = KNN(k, "average")

    # get petal length as input; reshape to keep it 2D
    X = iris_dataset.data[:, 2].reshape(-1, 1)
    # get petal width as output
    y = iris_dataset.data[:, 3]

    knn.fit(X, y)
    predicted = knn.predict(X)

    # verify shape of output
    self.assertEqual(len(predicted.shape), 1)
    self.assertEqual(predicted.shape[0], iris_dataset.data.shape[0])

    # with k=1 each point should match itself, but with only one input
    # dimension some points share the same value, so allow a small error
    mse = mean_squared_error(y, predicted)
    self.assertLess(mse, 0.1)
def calc_accuracy_multiclass(train_X, train_y, test_X, test_y, num_folds, K):
    knn_classifier = KNN(k=K)
    knn_classifier.fit(train_X, train_y)
    predict = knn_classifier.predict(test_X)
    # print('predicted ', predict)
    # print('real value', test_y)
    accuracy = multiclass_accuracy(predict, test_y)
    print("Accuracy: %4.2f" % accuracy)
    return accuracy
def rerank_results(feedback, similar_images, similar_image_vectors,
                   query_image_vector):
    global feedback_imgs_g, feedback_vals_g, \
        similar_images_g, similar_image_vectors_g
    similar_images_g = similar_images
    similar_image_vectors_g = similar_image_vectors

    # DT-based relevance feedback: learn relevant/irrelevant from feedback
    clf = DecisionTree()
    feedback_imgs = list(feedback.keys())
    feedback_vals = list(feedback.values())
    x_train_old, y_train = get_training_set(feedback_imgs, feedback_vals)
    x_train = [i.tolist() for i in x_train_old]
    clf.fit(x_train, y_train)

    # x_test = similar_image_vectors_g.values()
    x_test = [i.tolist() for i in similar_image_vectors_g.values()]
    predictions = clf.predict(x_test)

    # relevant images
    indices_rel = [i for i, x in enumerate(predictions) if x == 1]
    print("Relevant", indices_rel)
    rel_len = len(indices_rel)
    x_train_knn_rel = [x_test[i] for i in indices_rel]

    knn = KNN(rel_len)
    # knn = KNeighborsClassifier(n_neighbors=rel_len)
    knn.fit(x_train_knn_rel)
    neighbours_rel = knn.get_neighbours([query_image_vector])
    print("Neighbours Rel", neighbours_rel)

    # irrelevant images
    indices_ir = [i for i, x in enumerate(predictions) if x == -1]
    print("Irrelevant", indices_ir)
    ir_len = len(indices_ir)
    x_train_knn_ir = [x_test[i] for i in indices_ir]

    knn = KNN(ir_len)
    knn.fit(x_train_knn_ir)
    neighbours_ir = knn.get_neighbours([query_image_vector])
    print("Neighbours IR", neighbours_ir)

    # rank relevant results first, then irrelevant ones
    ranked_indices = indices_rel + indices_ir
    rel_similar_images = [
        list(similar_image_vectors_g.keys())[index]
        for index in ranked_indices
    ]
    return rel_similar_images
def test_fit(self):
    """
    Test KNN.fit is actually storing the training data.
    """
    x_train = np.array([[1, 2], [1, 3], [2, 2], [2, 3], [1, 1], [2, 1]])
    y_train = np.array([1, 1, 1, -1, -1, -1])
    model = KNN()
    model.fit(x_train, y_train)
    self.assertTrue(np.array_equal(x_train, model.x_train))
    self.assertTrue(np.array_equal(y_train, model.y_train))
def run_knn(data, target_column):
    st.sidebar.title('Choose parameters for KNN')
    ts = st.sidebar.slider('Training size', min_value=0.0, max_value=1.0,
                           step=0.01, value=0.7)
    k = st.sidebar.number_input('k', min_value=1,
                                max_value=int(len(data) * ts),
                                step=1, value=3)
    run_status = st.sidebar.button('Run Algorithm')
    if run_status:
        with st.spinner('Running...'):
            x_train, x_test, y_train, y_test = train_test_split(
                data.drop([target_column], axis=1),
                data[target_column],
                test_size=1 - ts)
            clf = KNN(k=k)
            clf.fit(x_train, y_train)
            """
            ## :dart: Accuracy
            """
            st.subheader(accuracy_score(y_test, clf.predict(x_test)))
def cross_validation(corpus, idf):
    nb_results = {'precision': [], 'recall': [], 'f1': []}
    knn_results = {'precision': [], 'recall': [], 'f1': []}
    vocab = sorted(idf.keys())
    random.shuffle(corpus)

    for i in range(10):
        print('cross validation', i)
        training, testing = split_data(corpus, i, 10)

        nb = NaiveBayes(training, vocab, 0.1)
        knn = KNN(5, 5)
        knn.fit([d.vector for d in training], [d.label for d in training])

        labels = [d.label for d in testing]
        nb_preds = [nb.predict(d) for d in testing]
        knn_preds = [knn.predict(d.vector) for d in testing]

        metrics = model_metrics(labels, nb_preds)
        for m, k in zip(metrics, ['precision', 'recall', 'f1']):
            nb_results[k].append(m)
        metrics = model_metrics(labels, knn_preds)
        for m, k in zip(metrics, ['precision', 'recall', 'f1']):
            knn_results[k].append(m)

    for m in ['precision', 'recall', 'f1']:
        print('nb', m)
        print(nb_results[m])
        print(m, 'nb mean', mean(nb_results[m]))
        print('knn', m)
        print(knn_results[m])
        print(m, 'knn mean', mean(knn_results[m]))

        # paired t statistic over the 10 folds
        diff = [a - b for a, b in zip(nb_results[m], knn_results[m])]
        print(m, 'diff')
        print(diff)
        t = mean(diff) / (stdev(diff) / len(diff) ** 0.5)
        print(m, 't value:', t)
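# The hand-rolled paired t statistic above can be cross-checked with scipy
# (an extra dependency the original code does not use). For each metric m,
# stats.ttest_rel computes mean(diff) / (stdev(diff) / sqrt(n)), so its
# statistic should agree with the printed t value:
from scipy import stats

t_check, p_value = stats.ttest_rel(nb_results[m], knn_results[m])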
def test_iris_classification_loop(self):
    """
    Tests kNN for classification with loops.
    """
    k = 1
    iris_dataset = load_iris()
    knn = KNN(k)
    knn.fit(iris_dataset.data, iris_dataset.target)
    predicted = knn.predict_loop(iris_dataset.data)

    # verify shape of output
    self.assertEqual(len(predicted.shape), 1)
    self.assertEqual(predicted.shape[0], iris_dataset.data.shape[0])

    # with k=1, each point should match itself
    accuracy = accuracy_score(iris_dataset.target, predicted)
    self.assertAlmostEqual(accuracy, 1.0)
def fit(self, X, y):
    N, D = X.shape

    rfModel = RandomForestClassifier(n_estimators=50)
    nbModel = NaiveBayes(num_classes=2)
    knnModel = KNN(3)

    knnModel.fit(X, y)
    knn_y_pred = knnModel.predict(X).astype(int)
    nbModel.fit(X, y)
    nb_y_pred = nbModel.predict(X).astype(int)
    rfModel.fit(X, y)
    rf_y_pred = rfModel.predict(X).astype(int)

    # stack the base-model predictions column-wise for the meta-learner
    Xy_label_combined = np.array(
        (knn_y_pred, nb_y_pred, rf_y_pred)).transpose()
    self.Xy_label_combined = Xy_label_combined
    self.y = y
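# The rest of this class is not shown: fit stores only the stacked training
# predictions, and the fitted base models above are local variables, so
# they are lost when fit returns. A hedged sketch of one plausible
# completion, assuming fit is extended to keep the base models (e.g.
# self.knnModel = knnModel) and borrowing sklearn's decision tree as the
# meta-learner; none of this is confirmed by the original code:
from sklearn.tree import DecisionTreeClassifier

def predict(self, X):
    # recompute the meta-features from the (assumed stored) base models
    knn_y_pred = self.knnModel.predict(X).astype(int)
    nb_y_pred = self.nbModel.predict(X).astype(int)
    rf_y_pred = self.rfModel.predict(X).astype(int)
    meta_input = np.array((knn_y_pred, nb_y_pred, rf_y_pred)).transpose()

    # train the meta-learner lazily on the stacked training predictions
    meta = DecisionTreeClassifier()
    meta.fit(self.Xy_label_combined, self.y)
    return meta.predict(meta_input)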
def cosine():
    train_d, train_l, test_d, test_l = tt_split(data, label)

    # Initialize the KNN object
    knn = KNN(neighbors_num=5, distance=cosine_distance())
    # Fill the data in KNN
    knn.fit(train_d, train_l)
    # Take prediction from KNN
    result = knn.predict(test_d)

    # Print the results on screen as data, real label, predicted label.
    # print("%20s - %20s | %20s | %s" % ("[Data]", "<Real Label>", "<Predicted Label>", "Truth"))
    n = 0
    for i, j, r in zip(test_d, test_l, result):
        truthness = j == r
        if truthness:
            n += 1
        print("%20s - %20s | %20s | %s" % (i, j, r, truthness))
    print("Acc:", n / len(test_d))
    return n / len(test_d)
def test_predict(self):
    knn = KNN(3)
    model = knn.fit(self.X, self.y)
    md = model.predict(self.X.iloc[4:])
    exp_md = pd.DataFrame({
        4: ['b', 6],
        5: ['b', 6],
        6: ['b', 6]
    }, index=[0, 1]).T
    pdt.assert_frame_equal(exp_md, md)
def test_knn():
    df = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
        header=None)
    y = df.iloc[0:100, 4].values
    y = np.where(y == 'Iris-setosa', -1, 1)
    # note: the line below overwrites the Iris labels with random binary labels
    y = np.random.randint(2, size=100)
    x = df.iloc[0:100, [0, 2]].values

    print("Testing 2-D Iris data set with only one neighbor...")
    neighbor = KNN(k=1)
    neighbor.fit(x, y)
    neighbor.plot(x, y)

    print("Testing Iris data set with 15 neighbors...")
    iris = datasets.load_iris()
    x = iris.data[:, :2]
    y = iris.target
    neighbor = KNN(15)
    neighbor.fit(x, y)
    y_pred = neighbor.predict(x)
    neighbor.accuracy(y_pred, y)
    neighbor.plot(x, y)

    print("Adding a new point to the dataset and testing with the full "
          "Iris data set at k=1...")
    neighbor = KNN(1)
    neighbor.fit(x, y)
    y2 = np.array([1])
    y2 = np.append(y, y2)
    x2 = np.vstack([x, [5.0, 3.2]])
    neighbor.plot(x2, y2)

    print("Testing SKLearn's model...")
    clf = neighbors.KNeighborsClassifier(1)
    clf.fit(x, y)
    plot_decision_regions(x2, y2, clf)
def plot_knn():
    np.random.seed(12345)
    fig, axes = plt.subplots(4, 4)
    for i, ax in enumerate(axes.flatten()):
        n_in = 1
        n_out = 1
        d = np.random.randint(1, 5)
        n_ex = np.random.randint(5, 500)
        std = np.random.randint(0, 1000)
        intercept = np.random.rand() * np.random.randint(-300, 300)
        X_train, y_train, X_test, y_test, coefs = random_regression_problem(
            n_ex, n_in, n_out, d=d, intercept=intercept, std=std, seed=i)

        LR = LinearRegression(fit_intercept=True)
        LR.fit(X_train, y_train)
        y_pred = LR.predict(X_test)
        loss = np.mean((y_test.flatten() - y_pred.flatten()) ** 2)

        knn_1 = KNN(k=1, classifier=False, leaf_size=10, weights="uniform")
        knn_1.fit(X_train, y_train)
        y_pred_1 = knn_1.predict(X_test)
        loss_1 = np.mean((y_test.flatten() - y_pred_1.flatten()) ** 2)

        knn_5 = KNN(k=5, classifier=False, leaf_size=10, weights="uniform")
        knn_5.fit(X_train, y_train)
        y_pred_5 = knn_5.predict(X_test)
        loss_5 = np.mean((y_test.flatten() - y_pred_5.flatten()) ** 2)

        knn_10 = KNN(k=10, classifier=False, leaf_size=10, weights="uniform")
        knn_10.fit(X_train, y_train)
        y_pred_10 = knn_10.predict(X_test)
        loss_10 = np.mean((y_test.flatten() - y_pred_10.flatten()) ** 2)

        xmin = min(X_test) - 0.1 * (max(X_test) - min(X_test))
        xmax = max(X_test) + 0.1 * (max(X_test) - min(X_test))
        X_plot = np.linspace(xmin, xmax, 100)
        y_plot = LR.predict(X_plot)
        y_plot_1 = knn_1.predict(X_plot)
        y_plot_5 = knn_5.predict(X_plot)
        y_plot_10 = knn_10.predict(X_plot)

        ax.scatter(X_test, y_test, alpha=0.5)
        ax.plot(X_plot, y_plot, label="OLS", alpha=0.5)
        ax.plot(X_plot, y_plot_1, label="KNN (k=1)", alpha=0.5)
        ax.plot(X_plot, y_plot_5, label="KNN (k=5)", alpha=0.5)
        ax.plot(X_plot, y_plot_10, label="KNN (k=10)", alpha=0.5)
        ax.legend()
        # ax.set_title(
        #     "MSE\nLR: {:.2f} KR (poly): {:.2f}\nKR (rbf): {:.2f}".format(
        #         loss, loss_poly, loss_rbf
        #     )
        # )
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
    plt.tight_layout()
    plt.savefig("img/knn_plots.png", dpi=300)
    plt.close("all")
def test_query(self):
    knn = KNN(3)
    model = knn.fit(self.X, self.y)
    gen = model.query(self.X.iloc[4:])
    dist, md = next(gen)
    exp_dist = np.array([
        0,
        euclidean(self.X.iloc[4], self.X.iloc[6]),
        euclidean(self.X.iloc[4], self.X.iloc[5])
    ])
    exp_md = pd.DataFrame([['b', 5], ['b', 7], ['b', 6]], index=[4, 6, 5])
    npt.assert_allclose(exp_dist, dist)
    pdt.assert_frame_equal(exp_md, md)
def plot(h=.02):
    _data = [[i[0], i[3]] for i in data]

    # Split the data into train and test parts
    # train_d, train_l, test_d, test_l = tt_split(_data, label)
    train_d, train_l, test_d, test_l = (
        _data[0:30] + _data[50:80] + _data[100:130],
        label[0:30] + label[50:80] + label[100:130],
        _data[30:50] + _data[80:100] + _data[130:],
        label[30:50] + label[80:100] + label[130:])

    # Initialize the KNN object
    knn = KNN(neighbors_num=3, distance=cosine_distance())
    # Fill the data in KNN
    knn.fit(train_d, train_l)

    _t = np.array(train_d)
    x_min, x_max = _t[:, 0].min() - .2, _t[:, 0].max() + .2
    y_min, y_max = _t[:, 1].min() - .2, _t[:, 1].max() + .2
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = np.c_[xx.ravel(), yy.ravel()]
    Z = np.array(knn.predict(Z))
    Z = Z.reshape(xx.shape)
    print(Z)

    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot also the training points
    plt.scatter(_t[:, 0], _t[:, 1], c=train_l, cmap=cmap_bold,
                edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.show()
# https://deeplearningcourses.com/c/data-science-supervised-machine-learning-in-python
# https://www.udemy.com/data-science-supervised-machine-learning-in-python
from knn import KNN
from util import get_xor
import matplotlib.pyplot as plt

if __name__ == '__main__':
    X, Y = get_xor()

    # display the data
    plt.scatter(X[:, 0], X[:, 1], s=100, c=Y, alpha=0.5)
    plt.show()

    # get the accuracy
    model = KNN(3)
    model.fit(X, Y)
    print "Accuracy:", model.score(X, Y)
# https://deeplearningcourses.com/c/data-science-supervised-machine-learning-in-python
# https://www.udemy.com/data-science-supervised-machine-learning-in-python
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future

from knn import KNN
from util import get_xor
import matplotlib.pyplot as plt

if __name__ == '__main__':
    X, Y = get_xor()

    # display the data
    plt.scatter(X[:, 0], X[:, 1], s=100, c=Y, alpha=0.5)
    plt.show()

    # get the accuracy
    model = KNN(3)
    model.fit(X, Y)
    print("Accuracy:", model.score(X, Y))
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from knn import KNN

iris = datasets.load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = KNN(k=3)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

accuracy = np.sum(predictions == y_test) / len(y_test)
print(accuracy * 100)

# my_cmap = ListedColormap(["#FF781F", "#149414", "#52307C"])
# plt.figure()
# # Displaying 2 out of 4 features so that we can see in 2D
# plt.scatter(X[:, 0], X[:, 1], c=y, cmap=my_cmap, edgecolor='k', s=20)
# plt.show()
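# Several snippets above import a local `knn` module that is not included
# here. A minimal sketch of the interface they assume (brute-force
# Euclidean distances plus a majority vote); everything beyond the
# fit/predict signatures is an assumption:
import numpy as np
from collections import Counter

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        # lazy learner: just memorize the training data
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def predict(self, X):
        preds = []
        for x in np.asarray(X):
            # Euclidean distance from x to every training point
            dists = np.sqrt(((self.X_train - x) ** 2).sum(axis=1))
            # indices of the k nearest training points
            nearest = np.argsort(dists)[:self.k]
            # majority vote among their labels
            preds.append(Counter(self.y_train[nearest]).most_common(1)[0][0])
        return np.array(preds)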
        clf.fit(X, y)
        print("Naive Bayes (sklearn) validation error: %.3f"
              % (1 - clf.score(X, y)))

    elif question == '3':
        with open(os.path.join('..', 'data', 'citiesSmall.pkl'), 'rb') as f:
            dataset = pickle.load(f)
        X = dataset['X']
        y = dataset['y']
        Xtest = dataset['Xtest']
        ytest = dataset['ytest']

        for i in [1]:
            knn = KNN(i)
            knn.fit(X, y)
            print("Training")
            tr_prediction = knn.predict(X)
            knn.getError(tr_prediction, y)
            print("Testing")
            prediction = knn.predict(Xtest)
            knn.getError(prediction, ytest)
            utils.plotClassifier(knn, X, y)

        neigh = KNeighborsClassifier(n_neighbors=1)
        neigh.fit(X, y)
        # print(neigh.predict(ytest))
        print("Sklearn KNN: {}".format(1 - neigh.score(X, y)))
        utils.plotClassifier(neigh, X, y)

    elif question == '4':
        if elem[2] * av_score[elem[1] - 1] > 0 \
                or (elem[2] == 0 and av_score[elem[1] - 1] <= 0):
            accuracy += 1
    print "Simple Accuracy:", np.around(100.0 * accuracy / len(validationData)), "%"

    ############# PERSONAL PREF #############
    print 20 * "#", "Personal Pref", 20 * "#"
    jokeDataNew = jokeData  # note: this aliases jokeData rather than copying it
    # replace nan by 0
    for i in range(len(jokeData)):
        jokeDataNew[i] = [0 if np.isnan(x) else x for x in jokeData[i]]

    for k in [10, 100, 1000]:
        print "K Value:", k
        knn = KNN(k)
        knn.fit(jokeDataNew)
        neighbours = knn.neighbours

        av_score = []
        accuracy = 0
        for i in range(100):
            average_score = np.mean(
                [jokeDataNew[ind] for ind in neighbours[i]], 0)
            av_score.append(average_score)
        for elem in validationData:
            if (elem[2] * av_score[elem[0] - 1][elem[1] - 1] > 0) \
                    or (elem[2] == 0 and av_score[elem[0] - 1][elem[1] - 1] < 0):
                accuracy += 1
        print "Pref Accuracy:", np.around(100.0 * accuracy / len(validationData)), "%"

    ############# LATENT FACTOR ANALYSIS #############
    print 20 * "#", "PCA", 20 * "#"