# NOTE(review): the statements down to the `print` are the tail of a function
# whose header lies outside this view (presumably `roc_curve`, which is called
# at the bottom of this script). The original nesting was lost when the file
# was flattened -- confirm the indentation against the full source.

# Count the i-th example as a true positive when its label equals `pos`,
# otherwise as a false positive.
if y_true[i] == pos:
    TP += 1
else:
    FP += 1
# Turn the running counts into rates by dividing by `num_neg` / `num_pos`.
FPR = FP / num_neg
TPR = TP / num_pos
if verbose:
    # Emit the point as a comma-separated "FPR,TPR" pair.
    print("{},{}".format(FPR, TPR))

if __name__ == "__main__":
    # Usage: <script> [k train_json test_json]
    # With no command-line arguments, fall back to k=30 and the bundled
    # "votes" datasets; otherwise read k and both paths from argv[1..3].
    if len(sys.argv) == 1:
        k = 30
        train = "datasets/votes_train.json"
        test = "datasets/votes_test.json"
    else:
        k = int(sys.argv[1])
        train = str(sys.argv[2])
        test = str(sys.argv[3])

    # parse the json files for data
    # (parse_json's result is unpacked into three values: X, y, meta)
    X_train, y_train, meta_train = parse_json(train)
    X_test, y_test, meta_test = parse_json(test)

    # fit KNN and predict confidence
    knn = KNNClassifier(k=k)
    knn.fit(X_train, y_train, meta_train)
    # confidence=True: presumably returns per-example confidence scores rather
    # than class labels -- confirm against KNNClassifier.predict.
    y_conf = knn.predict(X_test, verbose=False, confidence=True)

    # verbose=True is passed so the ROC routine prints its output.
    roc_curve(y_test, y_conf, meta_test, verbose=True)
# NOTE(review): this fragment begins mid-script and stops before the
# "predict on TEST" step announced below; any enclosing `if`/`else` and the
# original indentation are outside this view -- confirm against the full file.

# Command-line arguments: largest k to try, then the TRAIN, VAL and TEST paths.
max_k = int(sys.argv[1])
train = str(sys.argv[2])
val = str(sys.argv[3])
test = str(sys.argv[4])

# parse the json files for data
# (parse_json's result is unpacked into three values: X, y, meta)
X_train, y_train, meta_train = parse_json(train)
X_val, y_val, meta_val = parse_json(val)
X_test, y_test, meta_test = parse_json(test)

# train classifier on TRAIN, predict on VAL (for k=1,2,...,max_k)
acc = {}  # maps k -> accuracy_score on the validation set
for k in range(1, max_k + 1):
    knn = KNNClassifier(k=k)
    knn.fit(X_train, y_train, meta_train)
    y_pred = knn.predict(X_val, verbose=False)
    acc[k] = accuracy_score(y_val, y_pred)
    # One "k,accuracy" line per candidate k.
    print("{},{}".format(k, acc[k]))

# Keys were inserted in ascending k, so on an insertion-ordered dict
# (Python 3.7+) a tie resolves to the smallest k.
best_k = max(acc, key=lambda key: acc[key])  # note that 'max' always returns first value in case of ties
print(best_k)

# train on TRAIN + VAL, predict on TEST
knn_best = KNNClassifier(k=best_k)
# ignore_index=True renumbers the stacked rows 0..n-1 so TRAIN and VAL row
# labels cannot collide.
X_train_val = pd.concat([X_train, X_val], ignore_index=True)
y_train_val = pd.concat([y_train, y_val], ignore_index=True)
# The metadata parsed from the TRAIN file is reused for the combined set.
knn_best.fit(X_train_val, y_train_val, meta_train)
# Create the k-NN object. knn = KNNClassifier(train_X[:, 1:], train_y[:, 1:], metric='euclidean') # Iterate through all possible values of k: for k in range(min_k, max_k + 1): knn.set_k(k) # 1. Perform KNN training and classify all the test points. In this step, you will # obtain a prediction for each test point. y_pred = [] for i in range(test_X.shape[0]): result = knn.predict(test_X[i, 1:]) if result: y_pred.append(result) else: knn.set_k(k - 1) y_pred.append(knn.predict(test_X[i, 1:])) knn.set_k(k) y_pred = np.array(y_pred) # 2. Compute performance metrics given the true-labels vector and the predicted- # labels vector (you might consider to use obtain_performance_metrics() function) perf = obtain_performance_metrics(test_y[:, 1], y_pred) # 3. Write performance results in the output file, as indicated the in homework
import pandas as pd

if __name__ == "__main__":
    # Learning-curve driver: train KNN on the first 10%, 20%, ..., 100% of the
    # training rows and print "num_training_rows,test_accuracy" for each size.
    #
    # Usage: <script> [k train_json test_json]
    # With no command-line arguments, fall back to k=10 and the bundled
    # "votes" datasets; otherwise read k and both paths from argv[1..3].
    if len(sys.argv) == 1:
        k = 10
        train = "datasets/votes_train.json"
        test = "datasets/votes_test.json"
    else:
        k = int(sys.argv[1])
        train = str(sys.argv[2])
        test = str(sys.argv[3])

    # parse the json files for data
    # (parse_json's result is unpacked into three values: X, y, meta)
    X_train, y_train, meta_train = parse_json(train)
    X_test, y_test, meta_test = parse_json(test)

    # The row count does not change between iterations, so read it once.
    N = X_train.shape[0]

    for i in range(10):
        # Label of the last row in the (i + 1)/10 prefix of the training set.
        ind = math.floor(
            (i + 1) * N / 10 - 1)  # subtract 1 since indexing starts at 0

        # `.ix` was deprecated in pandas 0.20 and removed in 1.0. `.loc` is
        # the label-based replacement and, like `.ix` on an integer index,
        # includes the end label -- which the `- 1` above relies on.
        # NOTE(review): assumes parse_json returns frames with the default
        # 0..N-1 integer index -- confirm.
        X_sub = X_train.loc[0:ind, :]
        y_sub = y_train.loc[0:ind]

        knn = KNNClassifier(k=k)
        knn.fit(X_sub, y_sub, meta_train)
        y_pred = knn.predict(X_test, verbose=False)
        acc = accuracy_score(y_test, y_pred)

        # Output format is unchanged: "<rows used>,<accuracy>" on one line.
        print(X_sub.shape[0], end="")
        print(",{}".format(acc))