class Stacking():
    """Stacked ensemble whose meta-model is a decision tree.

    Three base models (a random forest, naive Bayes and a 3-nearest-neighbour
    classifier) are fitted on the raw features. Their predictions, one column
    per base model, are the inputs of the decision-tree meta-model.
    """

    def __init__(self):
        # No state up front; every attribute is created by fit().
        pass

    @staticmethod
    def _train_then_predict(estimator, X, y):
        """Fit ``estimator`` on (X, y) and return its predictions for X."""
        estimator.fit(X, y)
        return estimator.predict(X)

    def fit(self, X, y):
        """Fit the base models on (X, y), then the meta-model on their outputs."""
        self.rf = RandomForest(num_trees=15, max_depth=np.inf)
        rf_out = self._train_then_predict(self.rf, X, y)

        self.nb = NaiveBayes()
        nb_out = self._train_then_predict(self.nb, X, y)

        self.knn = KNN(k=3)
        knn_out = self._train_then_predict(self.knn, X, y)

        # One row per sample, one column per base model.
        meta_features = np.array([rf_out, nb_out, knn_out]).transpose()

        self.model = DecisionTree(max_depth=np.inf,
                                  stump_class=DecisionStumpErrorRate)
        self.model.fit(meta_features, y)

    def predict(self, X):
        """Return the meta-model's prediction for the samples in X."""
        base_outputs = [
            estimator.predict(X)
            for estimator in (self.rf, self.nb, self.knn)
        ]
        return self.model.predict(np.array(base_outputs).transpose())
def execute():
    """Fill the gaps of the reduced dataset and print the result.

    Steps, as visible in the calls below: load the CSV, drop the
    identifier/time columns, normalise through ``StepByStepRegression``,
    copy the columns in the order given by ``get_least_nan_columns``,
    find related features with the configured similarity function,
    fill the missing data and print the un-normalised frame.
    """
    df = pd.read_csv('data/reduced_dataset_v3.csv')
    df = df.drop(
        ['Unnamed: 0', 'Time', 'Time Code', 'Country Name', 'Country Code'],
        axis=1)

    sbs_reg = StepByStepRegression(df, 'correlation')
    sbs_reg.normalize()

    sorted_columns = sbs_reg.get_least_nan_columns(sbs_reg.normalized_df)

    # Rebuild the frame column by column in the order of ``sorted_columns``.
    # ``.items()`` replaces ``.iteritems()``, which pandas 2.0 removed; it
    # also works on older pandas versions and on plain dicts.
    sorted_df = pd.DataFrame()
    for col, _nulls in sorted_columns.items():
        sorted_df[col] = sbs_reg.normalized_df[col]

    correlated_features_list = {}
    if sbs_reg.sim_fun == 'KNN':
        knn = KNN(sorted_df, 3)
        correlated_features_list = knn.findKNeighbors()
    elif sbs_reg.sim_fun == 'correlation':
        correlated_features_list = sbs_reg.correlation(sorted_df)

    print('correlated features', correlated_features_list)
    sbs_reg.fill_missing_data(sorted_df, correlated_features_list)
    un_normalized = sbs_reg.un_normalize(sorted_df)
    print(un_normalized)
def main():
    """Report 10-fold accuracy of KNN on the LVQ-reduced KC1 file.

    For each k in (1, 3) the classifier is fitted and evaluated on every
    fold; mean accuracy (in percent), its standard deviation and the elapsed
    wall-clock time are printed.
    """
    neighbour_counts = [1, 3]

    # Load the data set; the 'defects' column is the label.
    data = pd.read_csv('./lvq_output/seed_137/kc1_lvq3.csv')
    X = data.drop(columns=['defects'])
    Y = data['defects']

    # Normalisation is intentionally disabled for this run.
    kf = KFold(n_splits=10)

    for k in neighbour_counts:
        clf = KNN(k=k)
        print("k equals {}".format(k))

        start_time = time.time()
        fold_accuracies = []
        for train, test in kf.split(X):
            clf.fit(X.iloc[train], Y.iloc[train])
            predictions = clf.predict(X.iloc[test])
            hits = np.sum(predictions == Y.iloc[test])
            fold_accuracies.append((hits / len(test)) * 100)
        end_time = time.time()

        fold_accuracies = np.array(fold_accuracies)
        print("mean accuracy: {}".format(np.mean(fold_accuracies)))
        print("standard deviation: {}".format(np.std(fold_accuracies)))
        print("time elapsed: {}".format(end_time - start_time))
def main():
    """Predict journey patterns for the test trajectories with a 5-NN/DTW model.

    The 'Trajectory' column of both CSV files is parsed with ``literal_eval``.
    Labels are integer-encoded for training and decoded again before the
    predictions are handed to ``writeInCsv``.
    """
    train_set = pd.read_csv('datasets/train_set.csv',
                            converters={'Trajectory': literal_eval})
    test_set = pd.read_csv('datasets/test_set_a2.csv',
                           converters={'Trajectory': literal_eval})

    # Encode the journey pattern ids as integer categories.
    encoder = preprocessing.LabelEncoder()
    category_ids = encoder.fit_transform(train_set['journeyPatternId'])

    sequences = [row['Trajectory'] for _, row in train_set.iterrows()]

    classifier = KNN(5, DTW)
    crossValidation(classifier, sequences, category_ids, encoder)

    # Train on the full training set, then label the test set.
    classifier.fit(sequences, category_ids)
    predicted_ids = classifier.predict(test_set['Trajectory'])
    writeInCsv(encoder.inverse_transform(predicted_ids))
def main(n):
    """Evaluate KNN with DTW and Euclidean distance on partition ``n``.

    Each line of ``./particoes/ts<n>.txt`` is split on spaces: the first
    field is the expected class (compared as an int) and the remaining fields
    are parsed as floats. The row is classified against
    ``./particoes/cj<n>.txt`` for k in (1, 3, 5, 10), and the accuracies are
    written to ``./relatorios/relatorioFinal<n>.txt``.

    :param n: partition number used to build the file names
    """
    print('Teste: ' + str(n))
    arquivoTreino = './particoes/cj' + str(n) + '.txt'
    k = [1, 3, 5, 10]
    count = 0
    hitDTW = [0] * len(k)
    hitEuclidiana = [0] * len(k)

    # ``with`` guarantees the test file is closed even if a row fails.
    with open('./particoes/ts' + str(n) + '.txt', 'r') as teste:
        for testeLinha in teste:
            count += 1  # number of rows processed so far
            linhateste = testeLinha.split(" ")
            esperado = int(linhateste[0])
            vetordeteste = list(map(float, linhateste[1:]))
            ResultadoDTW = KNN(arquivoTreino, vetordeteste, k).runKNN_DTW()
            ResultadoEuclidiana = KNN(arquivoTreino, vetordeteste,
                                      k).runKNN_Euclidiana()

            # Hit counters: result entry ``pos`` is scored in slot ``pos``
            # (presumably one entry per value of k - confirm against KNN).
            for pos, resultado in enumerate(ResultadoDTW):
                if esperado == int(resultado[1]):
                    hitDTW[pos] += 1
            for pos, resultado in enumerate(ResultadoEuclidiana):
                if esperado == int(resultado[1]):
                    hitEuclidiana[pos] += 1

            # Progress indicator; 240 is the hard-coded number of test rows.
            print("Progresso" + str(n) + ": " + str(count * 100 / 240) + "%")

    # Test report; the context manager flushes and closes the file.
    with open("./relatorios/relatorioFinal" + str(n) + ".txt", "w") as relatorio:
        relatorio.write('Teste: ' + str(n) + "\n")
        for nome, hits in (("DTW", hitDTW), ("Euclidiana", hitEuclidiana)):
            for k_val, hit in zip(k, hits):
                relatorio.write("Accuracy " + nome + " K=" + str(k_val) + ": " +
                                str(hit * 100 / count) + "% \n")
            relatorio.write("Accuracy " + nome + " TOTAL: " +
                            str((sum(hits) * 100) / (len(k) * count)) + "% \n")
def test_knn(k, train_data, train_labels, test_data):
    """
    test_knn function

    Builds a KNN classifier from the given training set, classifies every
    entry of the testing data and writes the guesses to the CSV file
    'predictions_digit_recognizer.csv' (columns ImageId, Label; ids start
    at 1).

    Args
    ----
    k : integer
        number of neighbors to use for KNN
    train_data : np.array
        training dataset
    train_labels : np.array
        training dataset labels
    test_data : np.array
        testing dataset

    Returns
    -------
    None
    """
    print("Final k:" + str(k))
    classifier = KNN(k, train_data, train_labels)

    with open('predictions_digit_recognizer.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['ImageId', 'Label'])
        # Rows are produced lazily, so each sample is classified right
        # before its line is written.
        writer.writerows(
            [str(index + 1), str(int(classifier.classify(test_data[index])))]
            for index in range(len(test_data))
        )
def foo(k_num=5, distance=distance_metric(p=1)):
    """Train KNN on a fixed split of the module-level data and score it.

    Only elements 0 and 3 of every sample in the global ``data`` are kept.
    Positions 0-29, 50-79 and 100-129 form the training part; the remaining
    positions form the test part (same split for the global ``label``).

    :param k_num: number of neighbours passed to KNN
    :param distance: distance callable passed to KNN
    :return: tuple (accuracy, number of correct predictions, test size)
    """
    _data = [[sample[0], sample[3]] for sample in data]

    def _take(seq, first, second, third):
        # Join three slices with ``+`` exactly as the sequence type defines it.
        return seq[first] + seq[second] + seq[third]

    train_spans = (slice(0, 30), slice(50, 80), slice(100, 130))
    test_spans = (slice(30, 50), slice(80, 100), slice(130, None))

    train_d = _take(_data, *train_spans)
    train_l = _take(label, *train_spans)
    test_d = _take(_data, *test_spans)
    test_l = _take(label, *test_spans)

    # Fit the classifier and predict the held-out part.
    knn = KNN(neighbors_num=k_num, distance=distance)
    knn.fit(train_d, train_l)
    result = knn.predict(test_d)

    # Count predictions that equal the real label.
    n = sum(1 for _, actual, guess in zip(test_d, test_l, result)
            if actual == guess)
    total = len(test_d)
    return n / total, n, total
def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train, y_train, x_val, y_val):
    """Search scaler, distance function and k for the best validation f1.

    Every scaler is applied to the ORIGINAL training and validation data.
    The previous code overwrote ``x_train``/``x_val`` in place, so each
    scaler after the first ran on already-scaled data.

    :param distance_funcs: dict mapping a name to a distance function
    :param scaling_classes: dict mapping a name to a scaler class; instances
        are called directly on the data
    :param x_train: training features
    :param y_train: training labels
    :param x_val: validation features
    :param y_val: validation labels
    :return: tuple (best_k, best_distance_function, best_scaler, best_model);
        the same values are stored on ``self``
    """
    optimal_k = 0
    function = ""
    scalar = ""
    model = None
    f1_scores = -2 ** 32
    # Odd k values below min(len(x_train), 30) are tried.
    max_k = min(len(x_train), 30)

    for scaler_name, scaler_cls in scaling_classes.items():
        scaler = scaler_cls()
        scaled_train = scaler(x_train)
        scaled_val = scaler(x_val)
        for key, value in distance_funcs.items():
            for k_value in range(1, max_k, 2):
                train_model = KNN(k_value, value)
                train_model.train(scaled_train, y_train)
                pre_val = train_model.predict(scaled_val)
                cur_f1 = f1_score(y_val, pre_val)
                # Strict comparison: on ties the earliest candidate wins.
                if f1_scores < cur_f1:
                    optimal_k = k_value
                    function = key
                    model = train_model
                    f1_scores = cur_f1
                    scalar = scaler_name

    self.best_k = optimal_k
    self.best_distance_function = function
    self.best_scaler = scalar
    self.best_model = model
    return self.best_k, self.best_distance_function, self.best_scaler, self.best_model
def question_4(points):
    """
    question 4: print cross-validated accuracy per k and per normalizer

    :param points: list of Point
    """
    normalizers = (
        (DummyNormalizer, "DummyNormalizer"),
        (SumNormalizer, "SumNormalizer"),
        (MinMaxNormalizer, "MinMaxNormalizer"),
        (ZNormalizer, "ZNormalizer"),
    )

    print("Question 4:")
    for k in (5, 7):
        print("K=", k, sep="")
        m = KNN(k)
        m.train(points)
        cv = CrossValidation()
        for normalizer_cls, label in normalizers:
            normalizer = normalizer_cls()
            normalizer.fit(points)
            normalized_points = normalizer.transform(points)
            # 2 is the best n-fold
            average_score = cv.run_cv(normalized_points, 2, m,
                                      accuracy_score, False, True)
            print("Accuracy of", label, "is", "{:.2f}".format(average_score))
        print()
class TestKNN(unittest.TestCase):
    """Unit tests for KNN helpers with ``np.genfromtxt`` stubbed out."""

    def setUp(self):
        # Patch np.genfromtxt only while a test runs. Assigning the mock
        # directly onto numpy, as before, leaked it into every later test
        # in the process.
        patcher = mock.patch.object(np, 'genfromtxt',
                                    return_value=np.empty([2, 2]))
        patcher.start()
        self.addCleanup(patcher.stop)
        self.target = KNN("", "")

    def test_distinct(self):
        """_distinct keeps the first occurrence of each value, in order."""
        values = [
            1, 2, 3, 2, 2, 1, 5, 4, 4, 5, 4, 3, 2, 2, 1, 2, 5, 4, 6, 5, 4, 4,
            3, 2, 3, 4, 3, 1, 5, 2, 6, 4, 6
        ]
        expected = [1, 2, 3, 5, 4, 6]
        self.assertEqual(self.target._distinct(values), expected)

    def test_calculateDistance(self):
        """calculateDistance returns the expected value for two 2-D points."""
        instance1 = np.array([3, 104])
        instance2 = np.array([18, 90])
        self.assertEqual(self.target.calculateDistance(instance1, instance2),
                         20.518284528683193)

    def test_calculateDistances(self):
        """calculateDistances returns the distances and their argsort order."""
        distances = [1, 0.3, 2, 0.15, 3, 2, 6, 99, 0.015, 0.191]
        magicMock = mock.MagicMock()
        magicMock.side_effect = list(distances)
        self.target.calculateDistance = magicMock
        result = self.target.calculateDistances(np.empty(10),
                                                np.empty([10, 10]))
        self.assertEqual(result[0], distances)
        self.assertTrue(
            (result[1] == np.array([8, 3, 9, 1, 0, 2, 5, 4, 6, 7])).all())
def tuning_without_scaling(self, distance_funcs, x_train, y_train, x_val, y_val):
    """
    Try every distance function with k = 1, 3, ..., 29 and keep the model
    with the highest f1-score on the validation set.

    :param distance_funcs: dictionary of distance functions to try; the
        keys are names and the values are the functions.
    :param x_train: List[List[int]] training data set to train the KNN model
    :param y_train: List[int] train labels to train the KNN model
    :param x_val: List[List[int]] validation data the model predicts on
    :param y_val: List[int] validation labels

    The best k, distance function name and model (an instance of KNN) are
    stored in self.best_k, self.best_distance_function and self.best_model.
    NOTE: self.best_scaler is not touched here.
    NOTE: on a tie the earlier candidate wins, i.e. the distance function
        that comes first in ``distance_funcs`` and, for the same distance
        function, the smaller k.
    """
    # Start below any valid f1-score so the first candidate is always
    # recorded. With a start value of 0 the best_* attributes were never
    # assigned when every model scored exactly 0.
    max_score = -1
    for name, func in distance_funcs.items():
        for k in range(1, 30, 2):
            cur_model = KNN(k, func)
            cur_model.train(x_train, y_train)
            cur_score = f1_score(y_val, cur_model.predict(x_val))
            if cur_score > max_score:
                max_score = cur_score
                self.best_k = k
                self.best_distance_function = name
                self.best_model = cur_model
class TestKNN(unittest.TestCase):
    """Checks of the KNN model trained on a split of the iris data."""

    def setUp(self):
        """Train a fresh KNN on the iris training split (random_state=0)."""
        self.knn = KNN()
        iris = load_iris()
        splits = train_test_split(iris['data'], iris['target'],
                                  random_state=0)
        X_train, X_test, y_train, y_test = splits
        self.knn.train(X_train, y_train)
        self.X_test, self.y_test = X_test, y_test

    def test_distance_measure(self):
        """The distance between two random vectors must be truthy."""
        first = np.random.rand(5)
        second = np.random.rand(5)
        distance = self.knn.measure_distance(first, second)
        self.assertTrue(distance)

    def test_prediction_time(self):
        """Log how long predicting the test split takes with k=25."""
        started = time.time()
        self.knn.predict(self.X_test, k=25)
        finished = time.time()
        # Elapsed time in milliseconds.
        diff = (finished - started) * 1000
        logger.info("Running time for K={} is: {}".format(25, diff))
def test_iris_regression(self):
    """
    Tests kNN for regression: predict petal width from petal length
    """
    iris = load_iris()
    features = iris.data
    model = KNN(1, "average")

    # Petal length is the single input feature; keep it 2D.
    X = features[:, 2].reshape(-1, 1)
    # Petal width is the regression target.
    y = features[:, 3]

    model.fit(X, y)
    predicted = model.predict(X)

    # The output must be 1-D with one value per sample.
    self.assertEqual(len(predicted.shape), 1)
    self.assertEqual(predicted.shape[0], features.shape[0])

    # With k=1 each point should match itself, but with only one input
    # dimension several points share the same value, so a small error
    # is tolerated.
    error = mean_squared_error(y, predicted)
    self.assertLess(error, 0.1)
def eval_model(case):
    """Cross-validate one KNN configuration and collect its metrics.

    :param case: pair ``(l, k)`` forwarded to ``KNN(l, k)``
    :return: dict with one list per metric ('precision', 'recall', 'f1'),
        each holding one value per fold

    Uses the module-level ``corpus`` and ``folds``.
    """
    l, k = case
    metric_names = ['precision', 'recall', 'f1']
    results = {name: [] for name in metric_names}
    model = KNN(l, k)

    for fold in range(folds):
        print(l, k, 'cross validation', fold)
        training, testing = split_data(corpus, fold, folds)

        print(l, k, 'fit model', fold)
        train_vectors = [doc.vector for doc in training]
        train_labels = [doc.label for doc in training]
        model.fit(train_vectors, train_labels)

        print(l, k, 'predict', fold)
        preds = [model.predict(doc.vector) for doc in testing]
        labels = [doc.label for doc in testing]

        for value, name in zip(model_metrics(labels, preds), metric_names):
            results[name].append(value)

    print(l, k, mean(results['precision']), mean(results['recall']),
          mean(results['f1']))
    return results
def testknn():
    """Smoke-run a 3-NN with Euclidean distance on four corner points.

    Prints the prediction for the origin.
    """
    samples = [[1, 1], [1, -1], [-1, -1], [-1, 1]]
    targets = [0, 1, 0, 1]

    model = KNN(3, Distances.euclidean_distance)
    model.train(samples, targets)

    prediction = model.predict([[0, 0]])
    print(prediction)
def knn(corpus, idf):
    """Print the nearest corpus documents for every document in ./query.

    :param corpus: documents; each needs a ``vector`` attribute for fitting
    :param idf: forwarded to ``tf_idf`` together with the query documents
    """
    query = read_folder('./query')
    tf_idf(query, idf)

    print('fit KNN model')
    classifier = KNN(5, 5)
    vectors = [doc.vector for doc in corpus]
    # The documents themselves act as the labels.
    classifier.fit(vectors, corpus)

    start_time = time.time()
    for position, doc in enumerate(query):
        print('Query Doc', position)
        print(doc.features)
        neighbors = classifier.neighbors(doc.vector)
        print('Query Neighbors', position)
        for neighbor in neighbors:
            print(neighbor.features)
            print('\n')
        print('\n')
    print("--- %s seconds ---" % (time.time() - start_time))
def knn_validate(data, kernel, metric, k_neighbors, show_plot):
    """Validate KNN over ``len(data)`` train/test splits and sum the results.

    For every index ``i`` the data object switches to split ``i``, a new KNN
    is fitted on the train part and evaluated on the test part.

    :param data: object providing ``updateTrainTest`` and ``getDotsByMode``
    :param kernel: kernel passed to KNN
    :param metric: metric passed to KNN
    :param k_neighbors: number of neighbours passed to KNN
    :param show_plot: when truthy, plot each split through ``Plot.knn``
    :return: tuple (first prediction of every split, f-measure of the
        summed matrix, the summed 2x2 matrix)
    """
    plot = Plot()
    matrix_full = [[0, 0], [0, 0]]
    y_predict_arr = []

    for split_index in range(len(data)):
        data.updateTrainTest(split_index)
        trainDots, trainClass = data.getDotsByMode('train', False)
        testDots, testClass = data.getDotsByMode('test', False)

        model = KNN(kernel=kernel, metric=metric, neighbors=k_neighbors)
        model.fit(trainDots, trainClass)
        y_predict, distance = model.predict(testDots)
        y_predict_arr.append(y_predict[0])

        if show_plot:
            dots = np.array(trainDots)
            classes = np.array(trainClass)
            plot.knn(dots[classes == 1.0], dots[classes == -1.0],
                     distance, testDots[0], y_predict[0])

        # Add this split's 2x2 matrix to the running total, cell by cell.
        matrix = get_metrics(y_predict, testClass)
        for row in range(2):
            for col in range(2):
                matrix_full[row][col] += matrix[row][col]

    return y_predict_arr, get_f_measure(matrix_full), matrix_full
def test_knn_regression():
    """Fuzz KNN regression against scikit-learn's KNeighborsRegressor.

    Runs forever: each round draws random sizes, hyper-parameters and data,
    compares the predictions element-wise and prints "PASSED".
    """
    while True:
        # NOTE: the order of the random draws is kept so that seeded runs
        # reproduce the same cases.
        n_samples = np.random.randint(2, 100)
        n_features = np.random.randint(2, 100)
        k = np.random.randint(1, n_samples)
        leaf_size = np.min([np.random.randint(1, 10), n_samples - 1])
        weights = np.random.choice(["uniform", "distance"])

        X = np.random.rand(n_samples, n_features)
        X_test = np.random.rand(n_samples, n_features)
        y = np.random.rand(n_samples)

        knn = KNN(k=k, leaf_size=leaf_size, metric=euclidean,
                  classifier=False, weights=weights)
        knn.fit(X, y)
        preds = knn.predict(X_test)

        reference_params = dict(
            p=2,
            leaf_size=leaf_size,
            n_neighbors=k,
            weights=weights,
            metric="minkowski",
            algorithm="ball_tree",
        )
        gold = KNeighborsRegressor(**reference_params)
        gold.fit(X, y)
        gold_preds = gold.predict(X_test)

        for mine, theirs in zip(preds, gold_preds):
            np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
def main():
    """Train/test KNN on ``Data/<args.dataset>`` and print its metrics.

    The last CSV column is the target and all other columns are features.
    ``args.type`` selects classification ("clf") or regression ("reg").
    Reads the module-level ``args``.
    """
    import os

    # os.path.join gives the same ".\Data\<name>" on Windows and a valid
    # path elsewhere. The old literal relied on the invalid escapes "\D" and
    # "\{", which newer Python versions warn about.
    dataset_path = os.path.join(".", "Data", args.dataset)
    df = pd.read_csv(dataset_path)

    X = np.array(df.iloc[:, :-1])
    y = np.array(df.iloc[:, -1])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    knn = KNN(X_train, y_train, k=args.k)
    if args.type == "clf":
        y_pred = knn.predict(X_test, knn_type="clf")
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))
    elif args.type == "reg":
        y_pred = knn.predict(X_test, knn_type="reg")
        mse = np.mean((y_test - y_pred) ** 2)
        print(mse)
    else:
        print("Undefined knn type")
        return None

    # NOTE(review): this exact-match ratio is also printed for "reg", where
    # it is of little use; kept so the output stays unchanged.
    accuracy = np.mean(y_pred == y_test)
    print(accuracy)
def overSample(matall, vecall, mat, T, tar, n):
    """Generate synthetic rows from the "danger" samples among the first T.

    :param matall: matrix handed to the neighbour search and to
        ``BorderlineSmote.sampleType`` (presumably all samples - confirm)
    :param vecall: forwarded to ``BorderlineSmote.sampleType``
        (presumably the labels of ``matall`` - confirm)
    :param mat: matrix whose rows are used to generate the new samples
    :param T: number of leading indices examined
    :param tar: requested number of synthetic rows
    :param n: forwarded to the neighbour search and used as the column
        count of the result
    :return: ``mat`` itself when ``tar <= 0``; otherwise an array with
        ``N * danger`` rows and ``n`` columns, where ``N = int(tar / danger)``.
        When no danger sample is found the result has zero rows.
    """
    if tar <= 0:
        return mat

    # Collect the indices that sampleType classifies as type 1 ("danger").
    danger = 0
    choice = np.zeros(T, dtype=int)
    set1 = set()
    for i in range(T):
        if i % 20 == 0:
            print('%d of %d (in finding danger set...)' % (i, T))
        vec = KNN.kNearestNeighbours(matall, matall.shape[0], n, i)
        typ = BorderlineSmote.sampleType(vec, matall, vecall)
        if typ == 1:
            choice[danger] = i
            danger += 1
            set1.add(i)

    # Without danger samples nothing can be generated. Returning an empty
    # result avoids the ZeroDivisionError the division below used to raise.
    if danger == 0:
        return np.zeros((0, n))

    N = int(tar / danger)  # synthetic rows per danger sample
    tar = N * danger
    ret = np.zeros((tar, n))
    tot = 0
    for i in range(danger):
        vec = KNN.kNearestNeighbours(mat, T, n, choice[i])
        BorderlineSmote.pupulate(ret, mat, tot, i, vec, N, n, set1)
        tot += N
    return ret
def run_knn(points):
    """Train a 5-NN on ``points``, show one prediction, then cross-validate.

    :param points: sequence of points; the first one needs a ``label``
    """
    model = KNN(5)
    model.train(points)

    sample = points[0]
    print(f'predicted class: {model.predict(sample)}')
    print(f'true class: {sample.label}')

    CrossValidation().run_cv(points, 10, model, accuracy_score)
def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train, y_train, x_val, y_val):
    """Search distance function, scaler and k for the best validation f1.

    Candidates are visited distance function first, then scaler, then
    k = 1, 3, ..., 29 (never above ``len(x_train)``). On a tie the earliest
    candidate is kept.

    :param distance_funcs: dict mapping a name to a distance function
    :param scaling_classes: dict mapping a name to a scaler class whose
        instances are callable on the data
    :param x_train: training features
    :param y_train: training labels
    :param x_val: validation features
    :param y_val: validation labels

    The results are stored in self.best_scaler, self.best_k,
    self.best_distance_function and self.best_model.
    """
    best_score = -100
    best_k = None
    best_dist = ''
    best_model = None
    best_scaler = None

    # k may not exceed the number of training samples.
    k_values = range(1, min(30, len(x_train) + 1), 2)

    for dist_name, dist_func in distance_funcs.items():
        for scaler_name, scaler_cls in scaling_classes.items():
            if not k_values:
                continue
            # Scale once per (distance, scaler) pair instead of once per k;
            # the input is always the original, unscaled data.
            scaler = scaler_cls()
            scaled_train = scaler(x_train)
            scaled_val = scaler(x_val)
            for k in k_values:
                candidate = KNN(k, dist_func)
                candidate.train(scaled_train, y_train)
                score = f1_score(y_val, candidate.predict(scaled_val))
                if score > best_score:
                    best_score = score
                    best_k = k
                    best_scaler = scaler_name
                    best_dist = dist_name
                    best_model = candidate

    self.best_scaler = best_scaler
    self.best_k = best_k
    self.best_distance_function = best_dist
    self.best_model = best_model
def computeKNNCrossValidation(args, dict_algorithms):
    """Run KNN cross-validation and store the result under the key "knn".

    :param args: options object; ``args.debug`` enables progress output and
        the whole object is passed to ``KNN``
    :param dict_algorithms: dict updated in place with the result
    """
    def _debug(*values, **options):
        # Progress output only when debugging is switched on.
        if (args.debug):
            print(*values, **options)

    _debug("Running knn...", end='')
    dict_algorithms["knn"] = KNN(args).computeCrossValidation()
    _debug("ok!")
def test_knn_clf():
    """Fuzz KNN classification against scikit-learn's KNeighborsClassifier.

    Runs forever: each round draws random sizes, hyper-parameters and data,
    compares the predictions element-wise and prints "PASSED".
    """
    while True:
        N = np.random.randint(2, 100)
        M = np.random.randint(2, 100)
        k = np.random.randint(1, N)
        # At least one class is required. ``np.random.randint(10)`` could
        # return 0, and ``np.random.randint(0, 0, size=N)`` below would then
        # raise ValueError.
        n_classes = np.random.randint(1, 10)
        ls = np.min([np.random.randint(1, 10), N - 1])
        weights = "uniform"

        X = np.random.rand(N, M)
        X_test = np.random.rand(N, M)
        y = np.random.randint(0, n_classes, size=N)

        knn = KNN(k=k, leaf_size=ls, metric=euclidean, classifier=True,
                  weights=weights)
        knn.fit(X, y)
        preds = knn.predict(X_test)

        gold = KNeighborsClassifier(
            p=2,
            leaf_size=ls,
            n_neighbors=k,
            weights=weights,
            metric="minkowski",
            algorithm="ball_tree",
        )
        gold.fit(X, y)
        gold_preds = gold.predict(X_test)

        for mine, theirs in zip(preds, gold_preds):
            np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
def tuning_without_scaling(self, distance_funcs, x_train, y_train, x_val, y_val):
    """
    Try every distance function with k among 1, 3, 5, ..., 29 and keep the
    model with the highest f1-score on the given validation set.

    :param distance_funcs: dictionary of distance functions (key is the
        function name, value is the function) to try.
    :param x_train: List[List[int]] training data set to train the KNN model
    :param y_train: List[int] training labels to train the KNN model
    :param x_val: List[List[int]] validation data
    :param y_val: List[int] validation labels

    The best k, distance function (its name) and model (an instance of KNN)
    are assigned to self.best_k, self.best_distance_function and
    self.best_model respectively.
    NOTE: self.best_scaler is not touched here.
    NOTE: on a tie the earlier candidate wins: first by the insertion order
        of "distance_funcs", then by the smaller k.
    """
    # Start below any valid f1-score so the first candidate is always
    # recorded. With a start value of 0 nothing was assigned when every
    # model scored exactly 0.
    best_f1 = -1
    for name, func in distance_funcs.items():
        for k in range(1, 30, 2):
            model = KNN(k, func)
            model.train(x_train, y_train)
            valid_f1 = f1_score(y_val, model.predict(x_val))
            if valid_f1 > best_f1:
                self.best_distance_function = name
                self.best_k = k
                best_f1 = valid_f1
                self.best_model = model
def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train, y_train, x_val, y_val):
    """
    Same search as "tuning_without_scaling", but every scaler in
    scaling_classes is applied to both the training and the validation data
    before they are passed to the KNN model.

    :param distance_funcs: dictionary of distance functions (key is the
        function name, value is the function) to try.
    :param scaling_classes: dictionary of scalers (key is the scaler name,
        value is the scaler class) used to normalize the data
    :param x_train: List[List[int]] training data set to train the KNN model
    :param y_train: List[int] train labels to train the KNN model
    :param x_val: List[List[int]] validation data
    :param y_val: List[int] validation labels

    The best k, distance function (its name), scaler (its name) and model
    (an instance of KNN) are assigned to self.best_k,
    self.best_distance_function, self.best_scaler and self.best_model.
    NOTE: on a tie the earlier candidate wins: first by the insertion order
        of scaling_classes, then by the insertion order of distance_funcs,
        then by the smaller k.
    """
    # Start below any valid f1-score so the first candidate is always
    # recorded. With a start value of 0 nothing was assigned when every
    # model scored exactly 0.
    best_f1 = -1
    for scaling_name, scaling_func in scaling_classes.items():
        scaler = scaling_func()
        x_train_scaled = scaler(x_train)
        x_val_scaled = scaler(x_val)
        for name, func in distance_funcs.items():
            for k in range(1, 30, 2):
                model = KNN(k, func)
                model.train(x_train_scaled, y_train)
                valid_f1 = f1_score(y_val, model.predict(x_val_scaled))
                if valid_f1 > best_f1:
                    self.best_distance_function = name
                    self.best_k = k
                    best_f1 = valid_f1
                    self.best_model = model
                    self.best_scaler = scaling_name
def fit(self, X, y):
    """Fit the three input models and the decision-tree meta-classifier.

    The predictions of the random forest, the 3-nearest-neighbour model and
    naive Bayes on X (one column each, in that order) are the training
    inputs of the meta-classifier. The fitted models are stored on ``self``
    as ``rf``, ``knn``, ``nb`` and ``meta_classifier``.
    """
    # Input models, listed in the order they are trained.
    rf = RandomForest(num_trees=15)
    knn = KNN(k=3)
    nb = NaiveBayes(num_classes=2)

    rf.create_splits(X)

    # Fit each model and immediately collect its predictions on X.
    base_predictions = []
    for learner in (rf, knn, nb):
        learner.fit(X, y)
        base_predictions.append(learner.predict(X))

    # Turn every prediction vector into a column and place them side by side.
    meta_input = np.hstack(
        [pred.reshape((pred.size, 1)) for pred in base_predictions])

    # Decision tree as meta-classifier.
    dt = DecisionTree(max_depth=np.inf)
    dt.fit(meta_input, y)

    self.rf, self.knn, self.nb = rf, knn, nb
    self.meta_classifier = dt
def test_compare_to_scikit_learn_changing_k(self):
    """Compare our KNN accuracy with scikit-learn's for k = 1..11.

    The last 100 normalized samples are the test set and the rest the
    training set. For every k the two hit ratios must agree to 3 places.
    """
    normalizer = Normalizer(self.data)
    data = normalizer.normalize()
    testSize = 100
    trainSize = len(data.data) - testSize
    for neighbours in range(1, 12):
        with self.subTest(i=neighbours):
            print("k: ", neighbours)
            trainData = {
                'data': data.data[:trainSize],
                'target': data.target[:trainSize],
            }
            # The test targets must come from the same tail slice as the
            # test samples; the old code took them from the training slice.
            testData = {
                'data': data.data[trainSize:],
                'target': data.target[trainSize:],
            }
            knn = KNN(trainData)
            # scikit-learn model:
            model = KNeighborsClassifier(n_neighbors=neighbours)
            model.fit(trainData['data'], trainData['target'])
            ourCounter = 0
            sciCounter = 0
            for idx, e in enumerate(testData['data']):
                expected = testData['target'][idx]
                if knn.makeGuess(e, neighbours) == expected:
                    ourCounter += 1
                if model.predict([e]) == expected:
                    sciCounter += 1
            self.assertAlmostEqual(ourCounter / (testSize),
                                   sciCounter / (testSize), 3)
def test_blob_classification_numpy(self):
    """
    Tests kNN (numpy prediction path) for classification on the samples
    returned by generate_cluster_samples(). Splits the data into stratified
    training and testing sets and expects perfect accuracy with k=3.
    """
    X, y = generate_cluster_samples()
    train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y)

    model = KNN(3)
    model.fit(train_X, train_y)
    pred_y = model.predict_numpy(test_X)

    # One label per test sample, as a 1-D array.
    self.assertEqual(len(pred_y.shape), 1)
    self.assertEqual(pred_y.shape[0], test_X.shape[0])

    # Every held-out point is expected to be classified correctly.
    accuracy = accuracy_score(test_y, pred_y)
    self.assertAlmostEqual(accuracy, 1.0)
def main():
    """Report 10-fold accuracy of KNN on the CM1 data set for several k.

    The data is loaded from the ARFF file and normalized. For each k the
    mean accuracy (in percent), its standard deviation and the elapsed
    wall-clock time are printed.
    """
    neighbour_counts = [1, 2, 3, 5, 7, 9, 11, 13, 15]

    # Load CM1 and split it into features and labels.
    raw = arff.loadarff('./datasets/CM1.arff')
    X, Y = build_dataframe(raw)
    X = normalize_data(X)

    kf = KFold(n_splits=10)

    for k in neighbour_counts:
        clf = KNN(k=k)
        print("k equals {}".format(k))

        start_time = time.time()
        fold_accuracies = []
        for train, test in kf.split(X):
            clf.fit(X.iloc[train], Y.iloc[train])
            predictions = clf.predict(X.iloc[test])
            hits = np.sum(predictions == Y.iloc[test])
            fold_accuracies.append((hits / len(test)) * 100)
        end_time = time.time()

        fold_accuracies = np.array(fold_accuracies)
        print("mean accuracy: {}".format(np.mean(fold_accuracies)))
        print("standard deviation: {}".format(np.std(fold_accuracies)))
        print("time elapsed: {}".format(end_time - start_time))
def testscale(self, scale, trial = None):
    """Run ``validation`` on temporarily rescaled data.

    The training and test data are rescaled with ``scale`` and a KNN is
    built on them for the duration of the validation. The original data and
    a KNN built on it are restored afterwards, even if validation raises.

    :param scale: forwarded to ``self.rescale``
    :param trial: forwarded to ``self.validation``
    :return: whatever ``self.validation(trial)`` returns
    """
    backup = self.data, self.testdata
    self.data, self.testdata = self.rescale(self.data, scale), self.rescale(self.testdata, scale)
    try:
        self.knn = KNN(self.data)
        return self.validation(trial)
    finally:
        # Always restore the unscaled state so a failed trial cannot leave
        # the object working on rescaled data.
        self.data, self.testdata = backup
        self.knn = KNN(self.data)
def knncv(Xtrain, Ytrain, Xtest, Ytest):
    """Predict a class index for each of the first ``len(Ytest)`` test samples.

    A KNN is built from the training data. For every test sample the index
    of the largest entry returned by ``knn.predict`` (k=4) is stored.

    :param Xtrain: training samples
    :param Ytrain: training labels
    :param Xtest: test samples, indexable by position
    :param Ytest: test labels; only their count is used
    :return: numpy array with one predicted index per test sample

    Uses the module-level ``classes``.
    """
    knn = KNN(Xtrain, Ytrain)
    m = len(Ytest)
    Ypredict = np.zeros(m)
    # ``range`` works on Python 2 and 3; ``xrange`` exists only on Python 2.
    for i in range(m):
        results = knn.predict(Xtest[i], k=4, classes=classes)
        Ypredict[i] = results.argmax()
    return Ypredict
def train_and_val():
    """Evaluate a 23-NN (cosine, unweighted) on the validation split.

    Features are computed once for the whole training file. The samples
    listed in 'dataset/train.txt' train the model and those listed in
    'dataset/val.txt' are predicted. Prints the 3x3 confusion matrix
    (rows: prediction, columns: correct class) and the accuracy.
    """
    training_data = dp.read_data('dataset/splice-Xtrain.dat', 'dataset/splice-Ytrain.dat')
    training_set_indices, validation_set_indices = dp.read_training_val_set(
        'dataset/train.txt', 'dataset/val.txt')

    feature = Features()
    features_labels_pair = feature.amino_acid_count(training_data)

    training_set = [features_labels_pair[index] for index in training_set_indices]
    dp.remove_ambiguous_entry_plus(training_set)

    k_nn = KNN(training_set, 23)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0

    validation_set = [features_labels_pair[index] for index in validation_set_indices]
    dp.remove_ambiguous_entry_plus(validation_set)

    known_classes = (0, 1, 2)
    for feature_vector, correct_class in validation_set:
        prediction = k_nn.predict_codon_cosine(feature_vector, k_nn.no_weight)
        total += 1
        if prediction == correct_class:
            correct += 1
        # Same cells as the former nine ``if`` statements: pairs outside
        # classes 0..2 are not counted.
        if prediction in known_classes and correct_class in known_classes:
            confusion_matrix[int(prediction), int(correct_class)] += 1

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(confusion_matrix)
    print(correct / total)
def __init__(self, test_file, trained_file, global_file):
    """Open the test document and build the KNN from ``trained_file``.

    :param test_file: path of the document to open; the handle is kept in
        ``self._doc``
    :param trained_file: passed to ``KNN``
    :param global_file: currently unused; loading the global statistics
        from it is disabled
    """
    self._doc = open(test_file)
    self._knn = KNN(trained_file)
    # Containers filled later by the other methods.
    self.result, self.output = {}, []
def train_and_test():
    """Evaluate a 26-NN (cosine, unweighted) on the 40-sample test file.

    Ambiguous entries are removed from the training data before features
    are computed. Prints the 3x3 confusion matrix (rows: prediction,
    columns: correct class) and the accuracy.
    """
    training_data = dp.read_data("dataset/splice-Xtrain.dat", "dataset/splice-Ytrain.dat")
    test_data = dp.read_data("dataset/test40.txt", "dataset/ytest40.txt")

    feature = Features()
    dp.remove_ambiguous_entry_plus(training_data)
    training_set = feature.amino_acid_count(training_data)
    test_set = feature.amino_acid_count(test_data)

    k_nearest_neighbors = KNN(training_set, 26)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0

    known_classes = (0, 1, 2)
    for index in range(len(test_set)):
        feature_vector, correct_class = test_set[index]
        prediction = k_nearest_neighbors.predict_codon_cosine(
            feature_vector, k_nearest_neighbors.no_weight)
        total += 1
        if prediction == correct_class:
            correct += 1
        # Same cells as the former nine ``if`` statements: pairs outside
        # classes 0..2 are not counted.
        if prediction in known_classes and correct_class in known_classes:
            confusion_matrix[int(prediction), int(correct_class)] += 1

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(confusion_matrix)
    print(correct / total)
def __init__(self, data, testdata = None, k = 7, function = None, scale = None):
    """Store the data, build the KNN and pick the weighting function.

    :param data: training data; passed to ``KNN``
    :param testdata: test data; defaults to a new empty dict per instance
    :param k: stored as ``self.k``
    :param function: weighting function; when falsy,
        ``self.knn.inverseWeight`` is used
    :param scale: when truthy, both data sets go through ``self.rescale``
        first
    """
    # A fresh dict per call: a ``{}`` default would be shared by every
    # instance created without explicit test data.
    if testdata is None:
        testdata = {}
    if scale:
        data = self.rescale(data, scale)
        testdata = self.rescale(testdata, scale)
    self.data = data
    self.testdata = testdata
    self.knn = KNN(data)
    self.k = k
    if function:
        self.function = function
    else:
        self.function = self.knn.inverseWeight
def main(argv):
    """Run KNN on the iris and the Wisconsin breast cancer data sets.

    For each data set a KNN is created from its data/test files and the
    result is printed through ``print_result``. ``argv`` is not used.
    """
    # Iris
    iris_knn = KNN('iris.data', 'iris.test', 4)
    iris_train = iris_knn.get_training_set()
    iris_test = iris_knn.get_testing_set()
    print('**********\n***** IRIS ****\n***********')
    print_result(iris_knn, iris_train, iris_test)

    # Breast cancer (WDBC)
    wdbc_knn = KNN('wdbc.data', 'wdbc.test', 1, 0)
    print('**********\n***** Breast Cancer in Wisconsin ****\n***********')
    wdbc_train = wdbc_knn.get_training_set()
    wdbc_test = wdbc_knn.get_testing_set()
    print_result(wdbc_knn, wdbc_train, wdbc_test)
# https://deeplearningcourses.com/c/data-science-supervised-machine-learning-in-python
# https://www.udemy.com/data-science-supervised-machine-learning-in-python
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future

from knn import KNN
from util import get_xor
import matplotlib.pyplot as plt


def main():
    """Plot the XOR data set and print the training accuracy of a 3-NN."""
    X, Y = get_xor()

    # Show the two input columns, coloured by label.
    plt.scatter(X[:, 0], X[:, 1], s=100, c=Y, alpha=0.5)
    plt.show()

    # Fit on the full data set and score on the same data.
    model = KNN(3)
    model.fit(X, Y)
    print("Accuracy:", model.score(X, Y))


if __name__ == '__main__':
    main()
def Detection(self):
    """Sample database metrics in a loop and train or query the detector.

    The detector is a ``KNN`` or ``Perceptron`` chosen from
    ``self.algorithm`` ('knn'/'1' or 'perceptron'/'2').  Each sample is a
    tuple built from ``anomaly_metrics`` and appended to ``./out.txt``.

    * Batch mode (``self.batchfile`` non-empty): a ``BatchExecutor`` decides
      how long to measure and receives every sampled point; the method
      returns when the executor reports completion or failure.
    * Interactive mode: ``self.getTrainOrTest()`` returns ``(mode,
      duration)``.  Mode '1' stores the point as 'Normal' training data,
      '2' as 'Anomaly' training data, and '3' classifies the point and
      prints the label.  A duration of -1 means "sample forever".

    Raises:
        ValueError: if a measuring duration of 0.0 is requested.
    """
    # NOTE(review): ``db_type`` is read as a bare name (not ``self.db_type``);
    # presumably a module-level global -- confirm against the rest of the file.
    if db_type not in ('mongodb', '1', 'redis', '2'):
        return

    if self.algorithm == 'knn' or self.algorithm == '1':
        print("K nearest neighbor algorithm")
        detection_algorithm = KNN()
    elif self.algorithm == 'perceptron' or self.algorithm == '2':
        detection_algorithm = Perceptron()
        self.do_perceptron_learn = True  # only do once before the testing!
        #print('Invalid algorithm. SVM not yet supported')
        #return
    else:
        print('Invalid algorithm selection')
        return

    have_batchfile = len(self.batchfile) != 0
    if have_batchfile:
        batchex = BatchExecutor(self.batchfile, detection_algorithm,
                                self.do_perceptron_learn,
                                hostname=self.hostname, port=27017)
        batchex.start()

    metric_groups = self.metric_groups
    counter_keys = self.counter_keys
    new_metrics = {}
    old_metrics = {}
    anomaly_metrics = {}
    sleep = self.sleep_interval

    fp = open('./out.txt', 'a+')
    # The try/finally guarantees the log file is closed on every exit path,
    # including the ctrl-c that normally ends interactive mode.
    try:
        # just run forever until ctrl-c (in non-batch mode) or run until the
        # batch executor finishes (in batch mode)
        while True:
            if have_batchfile:
                allDone, duration = batchex.wait_for_measure_to_be_ready_all_done_or_failed()
                if allDone:
                    break
                if duration is None:
                    print("FATAL ERROR: Batch execution failed!")
                    break
            else:
                # train/test for a set duration
                traintest, duration = self.getTrainOrTest()

            if duration == -1:
                print("Running forever")
                forever = 1
            else:
                if duration == 0.0:
                    raise ValueError("invalid duration")
                print("Running for {} seconds".format(duration))
                forever = 0

            ii = 0
            # fetch the metrics
            data = self.getData()

            # Initial block is to set up old_metrics since we only care
            # about the changes in some values, not the aggregates.
            # Put all the new metrics in new_metrics.
            for metric_group, items in metric_groups.items():
                # If the item is not a list, then take it straight from data
                if not items:
                    try:
                        new_metrics[metric_group] = float(data[metric_group])
                    except KeyError:
                        pass
                else:
                    # set to 0 so that we can recalculate the aggregate
                    # values; this is for resetting the values on the
                    # subsequent iteration (e.g. testing -> training)
                    if (metric_group in new_metrics and
                            metric_group not in data):
                        new_metrics[metric_group] = 0
                        anomaly_metrics[metric_group] = 0
                    # iterate over the list of items
                    for item in items:
                        # if the metric_group is in data, then its items
                        # will be as well
                        if metric_group in data:
                            try:
                                new_metrics[metric_group + item] = float(data[metric_group][item])
                                anomaly_metrics[metric_group + item] = float(data[metric_group][item])
                            except KeyError:
                                pass
                        # if the metric_group isn't in data, but its items
                        # are, then aggregate all of the items into the
                        # metric_group.  This happens in Redis to aggregate
                        # all types of commands together.
                        elif item in data:
                            if metric_group not in new_metrics:
                                new_metrics[metric_group] = 0
                                anomaly_metrics[metric_group] = 0
                            try:
                                new_metrics[metric_group] += float(data[item]['calls'])
                                anomaly_metrics[metric_group] += float(data[item]['calls'])
                            except KeyError:
                                pass

            while duration > 0 or forever == 1:
                time.sleep(sleep)
                duration -= sleep
                point = ()
                # fetch the metrics
                data = self.getData()

                # put all the new metrics in new_metrics
                for metric_group, items in metric_groups.items():
                    # set old to new so that we can take the difference
                    # between the two measurements
                    if metric_group in new_metrics:
                        old_metrics[metric_group] = new_metrics[metric_group]
                        # set to 0 so that we can recalculate the aggregate
                        # values
                        new_metrics[metric_group] = 0
                        anomaly_metrics[metric_group] = 0
                    # If the item is not a list, then take it straight from
                    # data
                    if not items:
                        try:
                            new_metrics[metric_group] = float(data[metric_group])
                            anomaly_metrics[metric_group] = float(data[metric_group])
                        except KeyError:
                            pass
                    else:
                        # iterate over the list of items
                        for item in items:
                            # if the metric_group is in data, then its items
                            # will be as well
                            if metric_group in data:
                                try:
                                    old_metrics[metric_group + item] = new_metrics[metric_group + item]
                                    new_metrics[metric_group + item] = float(data[metric_group][item])
                                    anomaly_metrics[metric_group + item] = float(data[metric_group][item])
                                except KeyError:
                                    pass
                            # if the metric_group isn't in data, but its
                            # items are, then aggregate all of the items
                            # into the metric_group (Redis command types)
                            elif item in data:
                                try:
                                    new_metrics[metric_group] += float(data[item]['calls'])
                                    anomaly_metrics[metric_group] += float(data[item]['calls'])
                                except KeyError:
                                    pass

                # make per second values for the counters
                for counter_group, items in counter_keys.items():
                    # if the item is not a list, then we can just subtract
                    # the entire counter group.  This is in Redis where we
                    # aggregate all command types together.
                    if not items:
                        if counter_group in new_metrics:
                            try:
                                anomaly_metrics[counter_group] = (new_metrics[counter_group] - old_metrics[counter_group]) / sleep
                            except KeyError:
                                pass
                    else:
                        # iterate over all items in the list
                        for item in items:
                            if counter_group in data:
                                try:
                                    anomaly_metrics[counter_group+item] = (new_metrics[counter_group+item] - old_metrics[counter_group+item]) / sleep
                                except KeyError:
                                    pass

                # create a tuple from the anomaly_metrics dictionary
                # yes I know this is a slow and dumb way to do this
                for name in anomaly_metrics:
                    point += (anomaly_metrics[name],)
                    sys.stdout.write("{}, {}\n".format(name, anomaly_metrics[name]))
                    # every 100000th sample (including the first) also
                    # write the metric names to the file as a header row
                    if not ii % 100000:
                        fp.write("{}, ".format(name))
                sys.stdout.write("\n")
                if not ii % 50:
                    print(ii)
                if not ii % 100000:
                    fp.write("\n")
                ii += 1
                fp.write("{}\n".format(point))

                if have_batchfile:
                    batchex.signal_measuring_done(point, duration)
                elif traintest == '1':
                    detection_algorithm.trainSet.append({point: 'Normal'})
                    detection_algorithm.size_normal_train += 1
                elif traintest == '2':
                    detection_algorithm.trainSet.append({point: 'Anomaly'})
                    detection_algorithm.size_anomaly_train += 1
                elif traintest == '3':
                    # preProcess() runs once, before the first test point
                    if self.do_perceptron_learn:
                        detection_algorithm.preProcess()
                        self.do_perceptron_learn = False
                    label = detection_algorithm.getLabel(point)
                    if label == 'Normal' or label == 0:
                        print('Normal')
                    elif label == 'Anomaly' or label == 1:
                        print('Anomaly')
                print('\n')
                fp.flush()
    finally:
        fp.close()
# Spark driver script (Python 2 syntax): builds a KNN model from image RDDs
# and then walks the collected validation data in slices of ten.
import numpy as np
import numpy.matlib
from knn import KNN
from pyspark import SparkConf, SparkContext
from utils import cdist, vote, get_confusion_matrix, get_image_rdd

sc = SparkContext()

# train data
# Each element in x and y is (SubGroupKey, iterableResults)
# in which iterableResults are (PixelKey, features/labels)
x, y = get_image_rdd(sc, n_groups=1000, start=0, end=1300)

# knn model
knn = KNN(x,y)
# drop the driver-side names for the training RDDs; the model was built from them
del x, y

# test data
x_, y_ = get_image_rdd(sc, val=1)
# collect() materialises both RDDs as plain lists on the driver
x_list, y_list = x_.collect(), y_.collect()
del x_, y_
print 'Length of x_list', len(x_list)

# 2x2 matrix of zeros; by its name ``cm`` presumably the confusion matrix -- TODO confirm
cm = numpy.matlib.zeros((2,2), dtype=float)
pred = []

# iterate 10 pixels at a time
flag = True
counter = 0
while flag:
    # take the first ten entries, then delete them from the pending lists
    x__, y__ = x_list[:10], y_list[:10]
    x_list[:10] = []
    y_list[:10] = []
def printStats(self):
    """Poll MongoDB ``serverStatus`` forever, feeding each sample to a KNN.

    Every iteration asks on stdin whether to use the sample for normal
    training, anomaly training and/or testing, fetches ``serverStatus``
    through ``self.db.command``, builds a ``point`` tuple from the values,
    optionally appends it to ``knn_class.trainSet`` or classifies it with
    ``knn_class.getLabel``, prints one formatted row, then sleeps.

    Python 2 only (``raw_input`` and ``print`` statements).  The row is
    produced with ``eval(datastr)``, so the local variable names used in the
    ``datastr`` strings must not be renamed.
    """
    data = []
    knn_class = KNN()
    sleep = 1  # seconds between samples; also the divisor for per-second rates
    # current counter values; the p-prefixed copies made in the loop hold the
    # previous sample so that differences can be printed
    q = 0
    i = 0
    u = 0
    d = 0
    qcpu = 0
    icpu = 0
    ucpu = 0
    dcpu = 0
    ii = 0  # number of rows printed so far
    con = 0
    hostname = "localhost"
    idx_b_a = 0
    idx_b_h = 0
    idx_b_m = 0
    new_bytesIn = 0
    new_bytesOut = 0
    new_numRequests = 0
    bytesIn = 0
    bytesOut = 0
    numRequests = 0
    network_skip_flag = 0  # 0 until the first sample has been taken

    # just run forever until ctrl-c
    while True:
        do_normal_train = raw_input('Do normal training?: ')
        do_anomaly_train = raw_input('Do anomaly training?: ')
        do_test = raw_input('Do testing?: ')

        # only the exact answer 'y' enables a mode
        if do_normal_train == 'y':
            do_normal_train = True
        else:
            do_normal_train = False

        if do_anomaly_train == 'y':
            do_anomaly_train = True
        else:
            do_anomaly_train = False

        if do_test == 'y':
            do_test = True
        else:
            do_test = False

        # set previous values before overwriting
        # (note: the local ``pd`` shadows any module imported under that name)
        pq = q
        pi = i
        pu = u
        pd = d
        pqcpu = qcpu
        picpu = icpu
        pucpu = ucpu
        pdcpu = dcpu
        pidx_b_a = idx_b_a
        pidx_b_h = idx_b_h
        pidx_b_m = idx_b_m

        # fetch the stats
        data = ( self.db.command( { "serverStatus" : 1 } ) )
        #print data['indexCounters'];sys.exit()

        res = int(data['mem']['resident'])
        vir = int(data['mem']['virtual'])
        mapd = int(data['mem']['mapped'])

        old_bytesIn = new_bytesIn
        old_bytesOut = new_bytesOut
        old_numRequests = new_numRequests
        new_bytesIn = int(data['network']['bytesIn'])
        new_bytesOut = int(data['network']['bytesOut'])
        new_numRequests = int(data['network']['numRequests'])

        # the first sample has no predecessor, so the network deltas keep
        # their initial value of 0 for that iteration
        if(network_skip_flag == 0):
            network_skip_flag = 1
        else:
            bytesIn = new_bytesIn - old_bytesIn
            bytesOut = new_bytesOut - old_bytesOut
            numRequests = new_numRequests - old_numRequests

        # Fallback layout, used when "opcounters" is absent from data.
        # NOTE(review): this template has 6 fields but header and datastr
        # supply 9 values, so ``template % header`` would raise TypeError if
        # this fallback were ever the one printed.
        template="%12s%22s%12s%12s%12s%12s"
        header=('hostname', 'time', 'resident','virtual', 'mapped', 'load', 'bytesIn', 'bytesOut', 'numRequests')
        datastr="hostname, self.thetime(), res, vir, mapd, self.getload(), bytesIn, bytesOut, numRequests"
        point = (0, 0, 0, 0, 0, res, vir, mapd, 0, 0, 0, 0, self.getload(), bytesIn, bytesOut, numRequests)

        if "opcounters" in data:
            q = int(data['opcounters']['query'])
            i = int(data['opcounters']['insert'])
            u = int(data['opcounters']['update'])
            d = int(data['opcounters']['delete'])
            # the *CpuTime keys may be missing; fall back to zeros
            try:
                qcpu = int(data['opcounters']['queryCpuTime'])
                icpu = int(data['opcounters']['insertCpuTime'])
                ucpu = int(data['opcounters']['updateCpuTime'])
                dcpu = int(data['opcounters']['deleteCpuTime'])
            except KeyError:
                qcpu = 0
                icpu = 0
                ucpu = 0
                dcpu = 0
            con = int(data['connections']['current'])
            template="%12s%22s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s"
            header=('hostname', 'time', 'query', 'insert', 'update', \
                    'delete', 'active con', 'resident', \
                    'virtual','mapped','load', 'bytesIn', 'bytesOut', 'numRequests', \
                    'queryCpu', 'insertCpu', 'updateCpu', 'deleteCpu')
            datastr="hostname, self.thetime(), (q-pq)/sleep, (i-pi)/sleep,(u-pu)/sleep, (d-pd)/sleep, con, res, vir, mapd, self.getload(), bytesIn, bytesOut, numRequests, (qcpu-pqcpu)/sleep, (icpu-picpu)/sleep, (ucpu-pucpu)/sleep, (dcpu-pdcpu)/sleep"
            # the four index slots are zero-filled in this layout
            point = ((q-pq)/sleep, (i-pi)/sleep,(u-pu)/sleep, (d-pd)/sleep, con, res, vir, mapd, 0, 0, 0, 0, self.getload(), bytesIn, bytesOut, numRequests, (qcpu-pqcpu)/sleep, (icpu-picpu)/sleep, (ucpu-pucpu)/sleep, (dcpu-pdcpu)/sleep)

        # opcounters will be in data if indexcounters is
        if "indexCounters" in data:
            #idx_b_a = int(data['indexCounters']['btree']['accesses'])
            #idx_b_h = int(data['indexCounters']['btree']['hits'])
            #idx_b_m = int(data['indexCounters']['btree']['misses'])
            #idx_b_o = round(float(data['indexCounters']['btree']['missRatio']),2)
            idx_b_a = int(data['indexCounters']['accesses'])
            idx_b_h = int(data['indexCounters']['hits'])
            idx_b_m = int(data['indexCounters']['misses'])
            idx_b_o = round(float(data['indexCounters']['missRatio']),2)
            template="%12s%22s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s"
            header=('hostname', 'time', 'query', 'insert', 'update', \
                    'delete', 'active con', 'resident', \
                    'virtual','mapped','idx acc','idx hit','idx miss','idx ratio','load', 'bytesIn', 'bytesOut', 'numRequests', \
                    'queryCpu', 'insertCpu', 'updateCpu', 'deleteCpu')
            datastr="hostname, self.thetime(), (q-pq)/sleep, (i-pi)/sleep,(u-pu)/sleep, (d-pd)/sleep, \
                    con, res, vir, mapd, (idx_b_a-pidx_b_a)/sleep, (idx_b_h-pidx_b_h)/sleep, (idx_b_m-pidx_b_m)/sleep, idx_b_o, self.getload(), bytesIn, bytesOut, numRequests, (qcpu-pqcpu)/sleep, (icpu-picpu)/sleep, (ucpu-pucpu)/sleep, (dcpu-pdcpu)/sleep"
            point = ((q-pq)/sleep, (i-pi)/sleep,(u-pu)/sleep, (d-pd)/sleep, con, res, vir, mapd, (idx_b_a-pidx_b_a)/sleep, (idx_b_h-pidx_b_h)/sleep, (idx_b_m-pidx_b_m)/sleep, idx_b_o, self.getload(), bytesIn, bytesOut, numRequests, (qcpu-pqcpu)/sleep, (icpu-picpu)/sleep, (ucpu-pucpu)/sleep, (dcpu-pdcpu)/sleep)

        # each training sample is stored as a one-entry {point: label} dict
        if do_normal_train:
            knn_class.trainSet.append({point:'Normal'})
            knn_class.size_normal_train += 1
        if do_anomaly_train:
            knn_class.trainSet.append({point:'Anomaly'})
            knn_class.size_anomaly_train += 1
        if do_test:
            #print point
            label = knn_class.getLabel(point)

        # repeat the column header every 25 rows
        if (ii % 25 == 0):
            print template % header

        if do_test:
            # This is for testing, we print out the predicted label
            print template % (eval(datastr)), label
        else:
            print template % (eval(datastr))
        ii += 1

        time.sleep(sleep)
class Forecast():
    """Weather forecaster built on top of a ``KNN`` index.

    ``data`` and ``testdata`` are dicts whose values are records of the form
    ``{'input': [...], 'result': label}``.  ``estimate`` looks up neighbours
    in ``data`` and votes on the ``'result'`` found six hours after each
    neighbour's key (keys are presumably ``datetime`` objects -- confirm
    against the caller).
    """

    def __init__(self, data, testdata=None, k=7, function=None, scale=None):
        """Store the data sets and build the KNN index.

        Args:
            data: training records keyed as described on the class.
            testdata: held-out records used by ``validation``; defaults to
                an empty dict (a fresh one per instance).
            k: number of neighbours passed to ``KNN.estimate``.
            function: weight function; defaults to ``self.knn.inverseWeight``.
            scale: optional per-feature multipliers applied to both data
                sets before indexing.
        """
        if testdata is None:
            testdata = {}
        if scale:
            data = self.rescale(data, scale)
            testdata = self.rescale(testdata, scale)
        self.data = data
        self.testdata = testdata
        self.knn = KNN(data)
        self.k = k
        if function:
            self.function = function
        else:
            self.function = self.knn.inverseWeight

    def rescale(self, data, scale):
        """Return a copy of ``data`` with each input multiplied by ``scale``.

        Only the first ``len(scale)`` input components are kept; the
        ``'result'`` of every record is carried over unchanged and ``data``
        itself is not modified.
        """
        scaledata = {}
        for key, record in data.items():
            inputs = record['input']
            scaled = [inputs[i] * factor for i, factor in enumerate(scale)]
            scaledata[key] = {'input': scaled, 'result': record['result']}
        return scaledata

    def estimate(self, testdata):
        """Return ``(label, total_weight)`` pairs, heaviest first.

        ``testdata`` is a single input vector.  Each neighbour returned by
        ``self.knn.estimate`` is a ``(key, weight)`` pair; the label voted
        for is the ``'result'`` stored in ``self.data`` six hours after
        ``key``.
        """
        weathers = {}
        results = self.knn.estimate(testdata, self.k, self.function)
        for result in results:
            later = result[0] + datetime.timedelta(hours=6)
            weather = self.data[later]['result']
            weathers[weather] = weathers.get(weather, 0) + result[1]
        return sorted(weathers.items(), key=lambda x: x[1], reverse=True)

    def validation(self, trial=None):
        """Return the fraction of sampled test records predicted correctly.

        Up to ``trial`` (default 100) records are drawn at random from
        ``self.testdata``.  Returns 0.0 when there is nothing to test.
        """
        if not trial:
            trial = 100
        trial = min(trial, len(self.testdata))
        if trial == 0:
            # avoids a ZeroDivisionError on an empty test set
            return 0.0
        # list(): dict views cannot be shuffled in place on Python 3
        testdata = list(self.testdata.values())
        random.shuffle(testdata)
        count = 0
        for test in testdata[:trial]:
            result = self.estimate(test['input'])
            if result and result[0][0] == test['result']:
                count += 1
        return float(count) / trial

    def optimization(self, trial=None, krange=10):
        """Grid-search ``k`` in ``[1, krange)`` and every ``*Weight`` method.

        Prints each improvement and returns ``(k, method_name)`` of the best
        combination, or ``(None, None)`` if no score exceeded 0.0.  The
        instance's own ``k`` and ``function`` are restored afterwards.
        """
        best = 0.0
        backup = self.k, self.function
        k, method = None, None
        try:
            for i in range(1, krange):
                for j in dir(self.knn):
                    if not j.endswith('Weight'):
                        continue
                    self.k, self.function = i, getattr(self.knn, j)
                    result = self.validation(trial)
                    if result > best:
                        best = result
                        k, method = i, j
                        print("{} {} {}".format(i, j, best))
        finally:
            # restore the caller's settings even if validation raised
            self.k, self.function = backup
        return k, method

    def testscale(self, scale, trial=None):
        """Return the validation score obtained with ``scale`` applied.

        The data sets and KNN index are swapped for scaled versions during
        the measurement and put back afterwards.
        """
        backup = self.data, self.testdata, self.knn
        try:
            self.data = self.rescale(backup[0], scale)
            self.testdata = self.rescale(backup[1], scale)
            self.knn = KNN(self.data)
            return self.validation(trial)
        finally:
            # reinstating the saved index avoids rebuilding it from the
            # unchanged original data
            self.data, self.testdata, self.knn = backup

    def annealing(self, domain, T=10000, cool=0.95, step=1, trial=None,
                  vector=None):
        """Search for a good scale vector by simulated annealing.

        Args:
            domain: list of ``(low, high)`` bounds, one per scale component.
            T: starting temperature; the loop stops once it drops to 0.1.
            cool: factor the temperature is multiplied by each iteration.
            step: maximum integer change applied to one component per move.
            trial: forwarded to ``testscale``.
            vector: starting point; random within ``domain`` when omitted.

        Returns:
            The best-scoring vector seen.
        """
        if not vector:
            vector = [float(random.randint(low, high)) for low, high in
                      ((bounds[0], bounds[1]) for bounds in domain)]
        best = self.testscale(vector, trial)
        result = vector
        while T > 0.1:
            newvec = vector[:]
            i = random.randint(0, len(domain) - 1)
            newvec[i] += random.randint(-step, step)
            # clamp the moved component back into its domain
            if newvec[i] < domain[i][0]:
                newvec[i] = domain[i][0]
            elif newvec[i] > domain[i][1]:
                newvec[i] = domain[i][1]
            value = self.testscale(newvec, trial)
            p = 1 / pow(math.e, abs(best - value) / T)
            T *= cool
            print("{} {}".format(newvec, value))
            if best < value:
                # NOTE(review): an improving move updates best/result but
                # not ``vector``, so the walk continues from the old point;
                # kept as in the original -- confirm this is intended.
                best = value
                result = newvec
            elif random.random() < p:
                vector = newvec
        return result
def main():
    """Split the spambase data, then tune, save, reload and test a KNN.

    Reads ``../input_data/spambase.data`` and ``spambase.names``, builds a
    DataFrame whose ``spam`` column is remapped with ``2*x - 1``, shuffles
    the rows with a fixed seed and splits them 20% test / 10% tune / rest
    train.  The three splits are pickled under ``../proc_data``.  A ``KNN``
    is tuned for odd K from 1 to 41, dumped to disk, reloaded and evaluated
    on the test split.
    """
    #############################################
    # Set up the data as per the first Practicum
    #############################################

    spam_values = np.genfromtxt('../input_data/spambase.data', delimiter=',')
    # ``with`` closes the names file even if reading fails
    with open('../input_data/spambase.names', 'r') as fl:
        lines = [line.strip() for line in fl]  # J : strip from beginning and ending whitespace

    # keep the text before ':' of every line that is not empty and does not
    # start with '|' or '1'
    colnames = [line.partition(':')[0] for line in lines
                if not (len(line) == 0 or line[0] == '|' or line[0] == '1')]
    colnames.append('spam')

    spam_df = pd.DataFrame(spam_values, columns=colnames)
    spam_df['spam'] = 2*spam_df['spam']-1

    # J: DataFrame.shape[0] is the number of samples in the DataFrame
    nsamples = spam_df.shape[0]
    # int(): slice bounds must be integers; np.floor returns a float
    ntest = int(np.floor(.2 * nsamples))
    ntune = int(np.floor(.1 * nsamples))

    # we want to make this reproducible so we seed the random number generator
    np.random.seed(1)
    all_indices = np.arange(nsamples)

    # J: important to shuffle so that you don't know which portion is training, which is testing and which is tuning data
    np.random.shuffle(all_indices)
    test_indices = all_indices[:ntest]                 # J: Get shuffled test indices first
    tune_indices = all_indices[ntest:(ntest+ntune)]    # J: tune indices second
    train_indices = all_indices[(ntest+ntune):]        # J: train indices (the majority) last

    # J : now that the "*indices" arrays have been shuffled, you can actually
    # draw the relevant rows.  The second argument includes all columns,
    # labels included.  The indices are row positions, hence ``iloc`` (the
    # original ``.ix`` accessor no longer exists in pandas).
    spam_train = spam_df.iloc[train_indices, :]
    spam_tune = spam_df.iloc[tune_indices, :]
    spam_test = spam_df.iloc[test_indices, :]

    # ``pd.save`` was removed from pandas; ``to_pickle`` is its replacement
    spam_train.to_pickle('../proc_data/training_data/spam_train.pdat')
    spam_tune.to_pickle('../proc_data/training_data/spam_tune.pdat')
    spam_test.to_pickle('../proc_data/testing_data/spam_test.pdat')

    #######################################################################
    # See how features are sorted according to their Information Gain score
    #######################################################################

    # atestTree = DecisionTree(spam_train, 5, True)
    # print atestTree.__sortFeatures__(spam_train, spam_train.columns)

    ###############################################
    # Training classifiers and saving them on disk
    ###############################################

    # Already trained those two, it took about 4 hours total.

    # majVoteTree = DecTree.DecisionTree(spam_train, 5, False)
    # print "Tuning a majority vote classifier on all depths between 1 and 15 inclusive."
    # majVoteTree.tune(spam_tune,1, 15)
    # print "Saving this classifier to disk."
    # majVoteTree.dump("../proc_data/dtreeWithMajVote_1_to_15.pyobj")
    #
    # IGTree = DecTree.DecisionTree(spam_train, 5, True)
    # print "Tuning an information gain classifier on all depths between 1 and 15 inclusive."
    # IGTree.tune(spam_tune,1, 15)
    # print "Saving this classifier to disk."
    # IGTree.dump("../proc_data/dtreeWithIG_1_to_15.pyobj")

    HectorsKNN = KNN(spam_train, spam_train['spam'], 5)
    print("Tuning Hector's KNN classifier for all values of K between 1 and 41 inclusive:")
    # list(): keeps passing a real list on Python 3, as on Python 2
    HectorsKNN.tune(spam_tune, spam_tune['spam'], k=list(range(1, 42, 2)))
    print("Saving this classifier to disk.")
    HectorsKNN.dump("../proc_data/HectorsKNN_1_to_41.pyobj")

    ###########################################
    # Playing with stored classifiers
    ###########################################

    # Part 1: A decision tree classifier trained with Majority Vote, depths 1 to 10

    # print "Loading a decision tree trained with Majority Vote for depths 1 to 10..."
    # majVoteTree = load("../proc_data/dtreeWithMajVote_1_to_15.pyobj")
    # print "According to the tuning set, the optimal depth for this tree is: " + str(majVoteTree.depth)
    # classifications = majVoteTree.classify(spam_test)
    # testErrorRate = np.mean ( (spam_test['spam'].values * classifications) < 0)
    # print 'For this depth, the error on the test set was %0.3f' % testErrorRate
    # print "We will now test all different hyper-parameters found during tuning on the test data:"
    # majVoteTree.classifyWithAllDepths(spam_test)
    # print "\n===========================================================\n"
    #
    # # Part 2: A decision tree classifier trained with Information Gain, depths 1 to 10
    #
    # print "Loading a decision tree trained with Information Gain for depths 1 to 10..."
    # IGTree = load("../proc_data/dtreeWithIG_1_to_15.pyobj")
    # print "According to the tuning set, the optimal depth for this tree is: " + str(IGTree.depth)
    # classifications = IGTree.classify(spam_test)
    # testErrorRate = np.mean ( (spam_test['spam'].values * classifications) < 0)
    # print 'For this depth, the error on the test set was %0.3f' % testErrorRate
    # print "We will now test all different hyper-parameters found during tuning on the test data:"
    # IGTree.classifyWithAllDepths(spam_test)

    # Part 3: Hector's KNN-classifier
    print("Reloading Hector's classifier from disk:")
    # NOTE(review): ``load`` is defined outside this function; not visible here.
    HectorsKNN = load("../proc_data/HectorsKNN_1_to_41.pyobj")
    print("According to the tuning set, the optimal K for this classifier is: " + str(HectorsKNN.k) + ".")
    classifications = HectorsKNN.classify(spam_test)
    # a prediction is wrong when label * prediction is negative (opposite signs)
    testErrorRate = np.mean((spam_test['spam'].values * classifications) < 0)
    print('For this value of K, the error on the test set was %0.3f' % testErrorRate)
    print("We will now test all different hyper-parameters found during tuning on the test data:")
    HectorsKNN.classifyWithAllK(spam_test)

    # Part 4: Weighted Features KNN

    print("Exiting...")
# Entrada para o tf-idf, devemos anotar os documentos com suas classes. # Receberá como entrada um array de tuplas: ([tokens], classe) parsed_trainning_documents_with_classes = [] for k in parsed_trainning_documents.keys(): parsed_trainning_documents_with_classes += [(v, k) for v in parsed_trainning_documents[k]] # Execução tf-idf print('generating tf.idf...') tf_idf_calculator = TfIdf(parsed_trainning_documents_with_classes) tf_idf_calculator.run() # testa os parâmetros do knn: métrica de distância e valor de K for metric in ['cosine', 'euclid']: for k in range(5, 11, 2): knn = KNN(tf_idf_calculator.results, k, metric) # confusion_matrix[A][B] = quantas vezes um documento da classe A foi atribuído à classe B topics = ['baseball', 'christian', 'guns'] confusion_matrix = {topic:{t:0 for t in topics} for topic in topics} print_log = False i = 0 ytrue = [] ypred = [] for topic in topics: for doc in reader.test[topic]: ytrue.append(topic) # classifica os documentos de teste words = parser.process_sent(doc) query = tf_idf_calculator.generate_tf_vector(words)
def _read_line(prompt):
    """Read one line from stdin on Python 2 (raw_input) or Python 3 (input)."""
    try:
        reader = raw_input  # Python 2
    except NameError:
        reader = input  # Python 3
    return reader(prompt)


def _ask_count(prompt):
    """Ask for an integer in [1, 20]; fall back to 5 on any invalid answer."""
    try:
        value = int(_read_line(prompt))
    except (ValueError, EOFError):
        value = None
    if value is None or value < 1 or value > 20:
        print("Invalid Choice.. By default selected 5.")
        return 5
    return value


def recommendation(all_docs, test_docs, classifier_list):
    """Run the interactive article recommendation loop on stdin/stdout.

    The user picks one of a random sample of ``test_docs``; the classifiers
    vote on its topic, and the nearest documents of that topic are offered
    as recommendations that can be read in turn.

    Args:
        all_docs: mapping from predicted topic to the documents of that
            topic (indexed as ``all_docs[prediction]``).
        test_docs: sequence of documents exposing ``title`` and ``text``.
        classifier_list: classifiers exposing ``stats['f_measure']`` and
            ``classify([doc])``.
    """
    print("Recommendation System")
    print("---------------------")

    # ask user for the desired option count and recommendation count.
    # set default value in case invalid inputs.
    option_count = _ask_count("\nEnter number of articles to choose from. [number from 5 to 10 suggested]: ")
    k_n = _ask_count("\nEnter number of recommendation per article. [number from 5 to 10 suggested]: ")

    end = False
    # run the loop until user quits.
    while not end:
        # pick random documents from test docs and provide titles to the
        # user.  Never ask random.sample for more documents than exist.
        user_docs = random.sample(test_docs, min(option_count, len(test_docs)))
        while True:
            print("\n---Available Choices For Articles(Titles)---\n")
            for i, doc in enumerate(user_docs):
                print(str(i+1) + ": " + doc.title)
            print("r: Refresh List")
            print("q: Quit()\n")
            choice = _read_line("Enter Choice: ")

            if choice == 'q':
                end = True
                break
            if choice == 'r':
                break

            try:
                user_choice = int(choice) - 1
            except ValueError:
                print("Invalid Choice.. Try Again..")
                continue
            if user_choice < 0 or user_choice >= len(user_docs):
                print("Invalid Choice.. Try Again..")
                continue
            selected_doc = user_docs[user_choice]

            # classifiers are sorted according to their f_measure in
            # decreasing order. It helps when all three classifiers differ
            # in their predictions.
            classifier_list = sorted(classifier_list, key=lambda cl: cl.stats['f_measure'], reverse=True)

            prediction_list = [classifier.classify([selected_doc])[0]
                               for classifier in classifier_list]
            prediction_count = Counter(prediction_list)
            top_prediction = prediction_count.most_common(1)

            # majority vote; with no agreement, trust the best classifier
            if top_prediction[0][1] > 1:
                prediction = top_prediction[0][0]
            else:
                prediction = prediction_list[0]

            # create knn instance using documents of predicted topic. and
            # find k closest documents.
            knn = KNN(all_docs[prediction])
            k_neighbours = knn.find_k_neighbours(selected_doc, k_n)

            while True:
                print("\nRecommended Articles for : " + selected_doc.title)
                for i, neighbour in enumerate(k_neighbours):
                    print(str(i+1) + ": " + neighbour.title)
                next_choice = _read_line("\nEnter Next Choice: [Article num to read the article. "
                                         "'o' to read the original article. "
                                         "'b' to go back to article choice list.]  ")

                if next_choice == 'b':
                    break
                if next_choice == 'o':
                    text = selected_doc.text
                    print("\nArticle Text for original title : " + selected_doc.title)
                    print(text)
                    continue

                try:
                    n_choice = int(next_choice) - 1
                except ValueError:
                    print("Invalid Choice.. Try Again..")
                    continue
                # bound by what was actually listed: fewer than k_n
                # neighbours may have been returned
                if n_choice < 0 or n_choice >= len(k_neighbours):
                    print("Invalid Choice.. Try Again..")
                    continue
                text = k_neighbours[n_choice].text
                print("\nArticle Text for recommended title : " + k_neighbours[n_choice].title)
                print(text)
class ModelTester(object):
    """Classify the documents of a test file with a trained ``KNN`` model.

    Each line of the test file is turned into a tf-idf feature string and
    passed to ``KNN.find_knn``; the returned categories are collected in
    ``self.output`` as ``"<id>, <cat> <cat> ..."`` strings.
    """

    def __init__(self, test_file, trained_file, global_file):
        """Open the inputs and build the model.

        Args:
            test_file: path of the text file holding one document per line.
            trained_file: passed straight to ``KNN``.
            global_file: JSON file providing ``"N"`` and ``"freq"``, which
                ``get_tfidf`` needs for the inverse document frequency.
        """
        import json

        self._doc = open(test_file)
        # get_tfidf reads _n and _gf, so the global statistics must be loaded
        with open(global_file) as stats_file:
            self._stats = json.load(stats_file)
        self._n = self._stats["N"]
        self._gf = self._stats["freq"]
        self._knn = KNN(trained_file)
        self.result = {}
        self.output = []

    def test(self):
        """Classify every remaining line of the test file.

        Progress is printed every tenth document.

        Raises:
            ValueError: if a line has fewer than two whitespace-separated
                parts (no features after the leading field).
        """
        for c, line in enumerate(self._doc):
            doc_id = line.split(',')[0]
            # glue "a, b" lists together so the first token stays one field
            line = line.replace(', ', ',')
            parts = line.split()
            if len(parts) < 2:
                raise ValueError(
                    "expected '<labels> <features>', got %r" % line)
            features = ' '.join(parts[1:])
            tfidf = self.get_tfidf(features)
            top_cats = self._knn.find_knn(tfidf)
            stri = str(doc_id) + "," + "".join(
                " " + str(cat) for cat in top_cats)
            self.output.append(stri)
            if c % 10 == 0:
                print("{} {}".format(c, stri))

    def write(self, output_file):
        """Print every collected output line.

        ``output_file`` is currently unused: the results go to stdout only.
        """
        for o in self.output:
            print(o)

    def get_tfidf(self, features):
        """Convert ``"id:count ..."`` features into ``"id:tfidf ..."``.

        Each count is divided by the sum of all counts in the document and
        multiplied by ``log(N / (freq[id] + 1))``, or by ``log(N)`` when the
        id has no entry in the global frequencies.  Tokens that do not match
        ``digits:digits`` are ignored.  Every entry of the returned string
        is followed by a single space.
        """
        pairs = [f.split(':') for f in re.findall(r'\d+:\d+', features)]
        doc_sum = 0.0
        for _, raw in pairs:
            doc_sum += float(raw)

        entries = []
        for r, raw in pairs:
            w = float(raw) / doc_sum
            try:
                # float(): keep a true ratio under Python 2 integer division
                w *= log(float(self._n) / (self._gf[r] + 1))
            except KeyError:
                w *= log(self._n)
            entries.append(r + ":" + str(w) + " ")
        return "".join(entries)
# Score the predictions of the preceding section: a validation entry counts
# as correct when its value elem[2] and the average score of item elem[1]
# (1-based) have the same sign, or when the value is 0 and the average is
# not positive.
for elem in validationData:
    if elem[2]*av_score[elem[1]-1] > 0 or (elem[2]==0 and av_score[elem[1]-1] <= 0):
        accuracy+=1
print "Simple Accuracy:", np.around(100.0*accuracy/len(validationData)), "%"

############# PERSONAL PREF #############
print 20 * "#", "Personal Pref", 20 * "#"

# NOTE: this binds a second name to the same object, so the NaN replacement
# below also modifies jokeData in place.
jokeDataNew = jokeData
# replace nan by 0
for i in range(len(jokeData)):
    jokeDataNew[i] = [0 if np.isnan(x) else x for x in jokeData[i] ]

for k in [10, 100, 1000]:
    print "K Value:", k
    knn = KNN(k)
    knn.fit(jokeDataNew)
    neighbours = knn.neighbours
    av_score = []
    accuracy = 0
    # for each of the first 100 rows, average the rows of its neighbours
    # column-wise (axis 0)
    for i in range(100):
        average_score = (np.mean([jokeDataNew[ind] for ind in neighbours[i]], 0))
        av_score.append(average_score)
    # elem[0] and elem[1] are used as 1-based row and column indices here.
    # NOTE(review): the zero case uses ``< 0`` in this loop but ``<= 0`` in
    # the simple-accuracy loop above -- confirm which one is intended.
    for elem in validationData:
        if (elem[2]*av_score[elem[0]-1][elem[1]-1] > 0) or (elem[2]==0 and av_score[elem[0]-1][elem[1]-1] < 0):
            accuracy+=1
    print "Pref Accuracy:", np.around(100.0*accuracy/len(validationData)), "%"

############# LATENT FACTOR ANALYSIS #############
def get_data(width=8, height=8):
    """Build a checkerboard classification dataset on an integer grid.

    Args:
        width: number of distinct values of the first coordinate.
        height: number of distinct values of the second coordinate.

    Returns:
        ``(X, Y)`` where ``X`` has shape ``(width * height, 2)`` and holds
        the grid points ``[i, j]`` in row-major order, and ``Y`` holds the
        label ``(i + j) % 2`` of each point, so labels alternate along both
        axes.  Both arrays are float arrays.
    """
    N = width * height
    X = np.zeros((N, 2))
    Y = np.zeros(N)
    n = 0
    for i in range(width):
        for j in range(height):
            X[n] = [i, j]
            # alternate between 0 and 1 along j, with the starting label
            # flipping on every new i
            Y[n] = (i + j) % 2
            n += 1
    return X, Y


if __name__ == '__main__':
    X, Y = get_data()

    # display the data
    plt.scatter(X[:,0], X[:,1], s=100, c=Y, alpha=0.5)
    plt.show()

    # get the accuracy
    model = KNN(3)
    model.fit(X, Y)
    # single-argument print() gives the same output on Python 2 and 3
    print("Train accuracy: {}".format(model.score(X, Y)))
def main(self, algo="KNN", textview=None):
    """Run the PCA-based classification experiment with the chosen algorithm.

    Builds the train/test files through ``tools``, loads the "train" image
    data, fits a ``PCA`` on it, then either evaluates the KNN and
    Parzen-window classifiers (``algo == "KNN"``) or trains a
    ``NeuralNetwork`` and draws its training curves (``algo == "NNET"``).

    Args:
        algo: "KNN" or "NNET"; any other value skips both branches.
        textview: optional widget exposing ``get_buffer()`` and
            ``scroll_mark_onscreen()``; when given, messages are appended to
            its buffer, otherwise they are sent to ``log.info``.

    Returns:
        ``listeRes``: in the KNN branch, three error percentages
        (100 - accuracy) are appended in the order classic 1-NN, KNN, Parzen;
        the list stays empty in every other case.

    NOTE(review): the block nesting below was reconstructed from a flattened
    source; confirm the loop membership of the statements flagged further down.
    """
    # Stand-in for "print": routes messages to the text view or to the logger
    def print_output(text):
        if textview != None:
            buf = textview.get_buffer()
            buf.insert_at_cursor(text + "\n")
            textview.scroll_mark_onscreen(buf.get_insert())
        else:
            log.info(text)

    # list of the data-set kinds to evaluate
    if self.validation == 1:
        listeTypesSet = ["train", "validation", "test"]
    else:
        listeTypesSet = ["train", "test"]

    # list of the results used for the curves
    listeRes=[]

    # creation of the train and test files
    log.debug("Construction des fichiers d'entrainement")
    tools.constructLfwNamesCurrent( self.nbExemples )
    #TODO in the end this is no longer useful
    ( nbClassesLFW, nbClassesORL ) = tools.trainAndTestConstruction( self.pourcentageTrain, self.nbExemples )

    # Load the data
    dataTrain, dataTrainIndices, nClass = tools.loadImageData( "train", self.categorie)

    # PCA transform
    print_output("Calcul des vecteurs propres...")
    pca_model = PCA( dataTrain )
    pca_model.transform() # project the data into the "eigen space"

    ##### KNN search
    if algo == "KNN":
        print_output("Début de l'algorithme des K plus proches voisins...")

        # Build the model for the KNN search
        knn_model = KNN( pca_model.getWeightsVectors(), dataTrainIndices, nClass, self.K )

        # Build the model for Parzen windows
        parzen_model = ParzenWindows( pca_model.getWeightsVectors(), dataTrainIndices, nClass, self.Theta )

        ## TEST ###########################
        #TODO This whole part must be reworked to output graphs
        # for train, validation, test
        for trainTest in listeTypesSet:
            if trainTest == "train":
                dataTest, dataTestIndices = dataTrain, dataTrainIndices
            else :
                ### if no validation is performed, the initial test and validation
                ### inputs are concatenated to obtain the test set (disabled code below)
                #if "validation" not in listeTypesSet:
                    #dataTestInitial, dataTestInitialIndices, nClass = tools.loadImageData( "test", self.categorie )
                    #dataValidation, dataValidationIndices, nClass = tools.loadImageData( "validation", self.categorie )
                    #dataTest = np.zeros(dataTestInitial.size + dataValidation.size)
                    #dataTestIndices = np.zeros( dataTest.size )
                    #dataTest[ : dataTestInitial.size], dataTestIndices[ : dataTestInitial.size] = dataTestInitial, dataTestInitialIndices
                    #dataTest[dataTestInitial.size : ], dataTestIndices[dataTestInitial.size : ] = dataValidation, dataValidationIndices
                #else:
                dataTest, dataTestIndices, nClass = tools.loadImageData( trainTest, self.categorie )

            print_output("Projection des données de test...")
            dataTest_proj = pca_model.getProjection( dataTest )

            # counters of correct results (1-NN, KNN, Parzen)
            nbGoodResult = 0
            nbGoodResult2 = 0
            nbGoodResult3 = 0

            # NOTE(review): time.clock() no longer exists on Python >= 3.8
            t_start = time.clock()
            # one sample per column of the projected data
            for i in range(0, int( dataTest.shape[1] )):
                # k = 1, as a reference
                # force k
                knn_model.setK( 1 )
                result1NN = knn_model.compute_predictions( dataTest_proj[:,i] )
                if(result1NN == dataTestIndices[i]):
                    nbGoodResult += 1

                # k = n
                # put k back to its initial value
                knn_model.setK( self.K )
                resultKNN = knn_model.compute_predictions( dataTest_proj[:,i] )
                if(resultKNN == dataTestIndices[i]):
                    nbGoodResult2 += 1

                resultParzen = parzen_model.compute_predictions( dataTest_proj[:,i] )
                if(resultParzen == dataTestIndices[i]):
                    nbGoodResult3 += 1

                # original note: "+1 because the matrix index starts at 0"
                # (stale: no +1 is applied on this line)
                out_str = "Classic method: "+ str( result1NN ) +" | KNN method: "+ str( resultKNN ) +" | KNN+Parzen method: "+ str( resultParzen ) +" | Expected: "+ str( dataTestIndices[i] ) +"\n"
                print_output(out_str)

            # accuracies in percent; raises ZeroDivisionError if dataTest has no column
            resClassic = (float(nbGoodResult) / float(dataTest.shape[1])) * 100.
            out_str = "\nAccuracy with classic method: %.3f" % resClassic + "%\n"
            resKNN = (nbGoodResult2 / float(dataTest.shape[1])) * 100.
            out_str += "Accuracy with KNN method (k="+ str( self.K ) +"): %.3f" % resKNN + "%\n"
            res = (nbGoodResult3 / float(dataTest.shape[1])) * 100.
            out_str += "Accuracy with KNN + Parzen window method (theta="+ str( self.Theta ) +"): %.3f" % res + "%\n"
            print_output(out_str)

            t_stop = time.clock()
            log.info("Temps total: %.4fs\n" % float(t_stop-t_start))

            #### collect the final error values
            # NOTE(review): assumed to run once per evaluated set (inside the
            # trainTest loop) — confirm against the original indentation
            listeRes.append( 100 - resClassic )
            listeRes.append( 100 - resKNN )
            listeRes.append( 100 - res )

    #### NNET search
    elif algo == "NNET":
        print_output("Début de l'algorithme du Perceptron multicouche...")

        # parameters, data, etc.
        dataTrain = pca_model.getWeightsVectors()
        # targets are shifted by -1 and reshaped into a column
        dataTrainTargets = (dataTrainIndices - 1).reshape(dataTrainIndices.shape[0], -1)
        #! unlike KNN, the NNET takes feature vectors as rows, not as columns
        train_set = np.concatenate((dataTrain.T, dataTrainTargets), axis=1)

        # fetch the validation data
        dataValidation, dataValidationIndices, nClass = tools.loadImageData( "validation", self.categorie )
        print_output("Projection des données de validation...")
        dataValidation_proj = pca_model.getProjection( dataValidation )
        dataValidationTargets = (dataValidationIndices - 1).reshape(dataValidationIndices.shape[0], -1)
        validation_set = np.concatenate((dataValidation_proj.T, dataValidationTargets), axis=1)

        # fetch the test data
        dataTest, dataTestIndices, nClass = tools.loadImageData( "test", self.categorie )
        print_output("Projection des données de test...")
        dataTest_proj = pca_model.getProjection( dataTest )
        dataTestTargets = (dataTestIndices - 1).reshape(dataTestIndices.shape[0], -1)
        test_set = np.concatenate((dataTest_proj.T, dataTestTargets), axis=1)

        # Build and train the neural-network model
        nnet_model = NeuralNetwork( dataTrain.shape[0], self.n_hidden, nClass, self.lr, self.wd )
        if self.validation == 1:
            train_out, valid_out, test_out = nnet_model.train( train_set, self.n_epoch, self.batch_size, valid_set=validation_set, test_set=test_set)
        else :
            train_out, test_out = nnet_model.train( train_set, self.n_epoch, self.batch_size, test_set=test_set)

        # display of the training curves: one entry per curve in each list
        x = []
        y = []
        y_err = []
        color = []
        legend = []
        legend_err = []
        filename = IMG_DIR + "Risque__Epoch_"+ str(self.n_epoch) +"_Hidden_"+ str(self.n_hidden) +"_Lr_"+ str(self.lr) +"_L2_"+ str(self.wd) + "_Categorie_" + str(self.categorie) + "_Batch_" + str(self.batch_size) + "_"
        filename_err = IMG_DIR + "Erreur_classification__Epoch_"+ str(self.n_epoch) +"_Hidden_"+ str(self.n_hidden) +"_Lr_"+ str(self.lr) +"_L2_"+ str(self.wd) + "_Categorie_" + str(self.categorie) + "_Batch_" + str(self.batch_size) + "_"

        train_out = np.array(train_out)
        x.append(np.array(xrange(train_out.shape[0])))

        # train curve settings (column 0 feeds the "R" plot, column 1 the "Err" plot)
        color.append('g-')
        legend.append("R Train")
        filename += "_Train"
        y.append(train_out[:,0])
        y_err.append(train_out[:,1])
        legend_err.append("Err Train")
        filename_err += "_Train"

        # validation curve settings
        if self.validation == 1:
            valid_out = np.array(valid_out)
            x.append(np.array(xrange(valid_out.shape[0])))
            y.append(valid_out[:,0])
            y_err.append(valid_out[:,1])
            color.append('b-')
            legend.append("R Validation")
            legend_err.append("Err Validation")
            filename += "_Validation"
            filename_err += "_Validation"

        # test curve settings
        test_out = np.array(test_out)
        x.append(np.array(xrange(test_out.shape[0])))
        y.append(test_out[:,0])
        y_err.append(test_out[:,1])
        color.append('r-')
        legend.append("R Test")
        legend_err.append("Err Test")
        filename += "_Test"
        filename_err += "_Test"

        # display
        title = u"\nEpoque: " + str(self.n_epoch) + " - Taille du batch: " + str(self.batch_size) + u" - Neurones cachés: " + str(self.n_hidden) + "\nL2: " + str(self.wd) + " - Taux d'apprentissage: " + str(self.lr) + u" - Catégorie: " + str(self.categorie)
        tools.drawCurves(x, y, color, legend, bDisplay=True, filename=filename, title=title, xlabel="Epoque", ylabel=u"Risque régularisé")
        tools.drawCurves(x, y_err, color, legend_err, bDisplay=True, filename=filename_err, title=title, xlabel="Epoque", ylabel="Erreur classification")

        #### build a data file for improved curves
        if self.stock == 1 :
            # file name = "curvErrorNNet" + title with spaces and newlines removed
            fichier = open("curvErrorNNet"+''.join( ''.join( title.split(' ') ).split('\n') ),"w")
            fichier.write("#epoch errorTrain errorValidation errorTest\n")
            # rows are only written when train, validation and test curves all exist
            # NOTE(review): the header says "error" but the columns come from y
            # (the "R" curves), not y_err — confirm which one is intended
            if len(x) == 3:
                for j in range(len( x[0] )):
                    fichier.write(str( x[0][j] )+" "+str( y[0][j] )+" "+str( y[1][j] )+" "+str( y[2][j] )+"\n")
            fichier.close()

        # The bare string below is disabled legacy test code; it is evaluated as
        # a no-op expression and is kept verbatim.
        """ /!\ Cette partie n'est plus utile car effectué dans le nnet durant le train ## TEST ########################### #TODO Toute cette partie est a revoir pour sortir des graphes # de train, validation, test # compteurs de bons résultats nbGoodResult = 0 for i in range(0, int( dataTest.shape[1] )): # resultNNET = np.argmax(nnet_model.compute_predictions( dataTest_proj[:,i] ), axis=1)[0] if(resultNNET == dataTestTargets[i]): nbGoodResult += 1 out_str = "Result: "+ str( resultNNET ) + " | Expected: "+ str( dataTestTargets[i] ) +"\n" # +1 car l'index de la matrice commence a 0 print_output(out_str) res = (float(nbGoodResult) / float(dataTest.shape[1])) * 100. out_str = "\nAccuracy : %.3f" % res + "%\n" print_output(out_str) """

    return listeRes