Esempio n. 1
0
    def test_homo(self, gamma=3):
        """Evaluate a two-way (query->gallery and gallery->query) diffusion ranking.

        For each direction a ``Diffusion`` object is built on the "database"
        features, its Laplacian inverse is requested with ``n_trunc`` and
        ``kd``, and the "probe" features are matched against the database with
        a cosine ``KNN`` search of ``kq`` neighbours. The two score matrices
        are blended with weight ``lam`` and the negated result is passed to
        ``self.evaluate``.

        :param gamma: exponent applied to the row-normalised similarities.
        """
        n_trunc = 1000
        kq, kd = 15, 30
        lam = 0.2

        def directional_scores(database_fc, probe_fc, n_probe, n_database):
            # Score every probe row against the whole database.
            graph = Diffusion(database_fc)
            lap_inv = graph.get_laplacian_inverse(n_trunc, kd)
            index = KNN(database_fc, method='cosine')
            weights, neighbours = index.search(probe_fc, kq)
            # Clip negative similarities, normalise each row to sum to one,
            # then sharpen with the gamma exponent.
            weights[weights < 0] = 0
            weights /= np.sum(weights, axis=-1).reshape(-1, 1)
            weights = weights**gamma
            out = np.empty((n_probe, n_database), dtype=np.float32)
            for row in range(n_probe):
                out[row] = weights[row] @ lap_inv[neighbours[row]]
            return out

        n_query = len(self.test_query_set)
        n_gallery = len(self.test_gallery_set)

        scores_qg = directional_scores(self.cross_gallery_fc,
                                       self.cross_query_fc,
                                       n_query, n_gallery)
        scores_gq = directional_scores(self.cross_query_fc,
                                       self.cross_gallery_fc,
                                       n_gallery, n_query)

        # The gallery->query matrix is transposed so both terms are
        # (query, gallery) shaped before blending.
        blended = lam * scores_qg + (1 - lam) * scores_gq.T
        self.evaluate(-blended)
Esempio n. 2
0
File: utils.py Progetto: yujiegu/ML
    def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train,
                            y_train, x_val, y_val):
        """Grid-search scaler, distance function and k by validation F1 score.

        Every scaler in ``scaling_classes`` is instantiated and applied to the
        training and validation features; for each distance function and each
        odd k in ``range(1, 30, 2)`` a ``KNN`` is trained on the scaled
        training data and scored with ``f1_score`` on the scaled validation
        data. A candidate replaces the current best only on a strictly higher
        score, so ties keep the first candidate encountered.

        The returned model is the one that was actually evaluated, i.e. it is
        trained on the data produced by the winning scaler. (Previously a new
        model was retrained on the *unscaled* ``x_train``, which does not match
        the configuration that was selected.)

        :param distance_funcs: mapping of name -> distance function.
        :param scaling_classes: mapping of name -> scaler class; instances are
            called directly on the feature lists.
        :param x_train: training features.
        :param y_train: training labels.
        :param x_val: validation features.
        :param y_val: validation labels.
        :return: ``(best_k, best_distance_function, best_scaler, best_model)``;
            all ``None`` when there is nothing to search over.
        """
        best_f1 = -1
        best_k = None
        best_distance_function = None
        best_scaler = None
        best_model = None

        for scaler_name, scaler_cls in scaling_classes.items():
            scaler = scaler_cls()
            # Training data is scaled first, then validation data, with the
            # same scaler instance (the call order is kept from the original).
            x_train_scaled = scaler(x_train)
            x_val_scaled = scaler(x_val)
            for dist_name, dist_func in distance_funcs.items():
                for k in range(1, 30, 2):
                    candidate = KNN(k, dist_func)
                    candidate.train(x_train_scaled, y_train)
                    score = f1_score(y_val, candidate.predict(x_val_scaled))
                    if score > best_f1:
                        best_f1 = score
                        best_k = k
                        best_distance_function = dist_name
                        best_scaler = scaler_name
                        best_model = candidate

        # assign the final values to these variables
        self.best_k = best_k
        self.best_distance_function = best_distance_function
        self.best_scaler = best_scaler
        self.best_model = best_model
        return best_k, best_distance_function, best_scaler, best_model
Esempio n. 3
0
def test_knn():
    """Smoke-test the project KNN on Iris data and plot its decisions.

    Three scenarios are run in sequence: a 1-neighbour model on the first 100
    rows of the UCI Iris CSV, a 15-neighbour model on scikit-learn's Iris
    data, and a 1-neighbour model plotted with one extra point. Finally a
    scikit-learn ``KNeighborsClassifier`` is fitted and plotted on the same
    data for visual comparison.
    """
    iris_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
    frame = pd.read_csv(iris_url, header=None)

    labels = frame.iloc[0:100, 4].values
    labels = np.where(labels == 'Iris-setosa', -1, 1)
    # NOTE(review): the -1/1 labels built above are immediately replaced by
    # random 0/1 labels; confirm whether this overwrite is intentional.
    labels = np.random.randint(2, size=100)
    features = frame.iloc[0:100, [0, 2]].values

    print("Testing 2-D Iris data set with only one neighbor...")
    single = KNN(k=1)
    single.fit(features, labels)
    single.plot(features, labels)

    print("Testing Iris data set with 15 neighbor...")
    iris = datasets.load_iris()
    features = iris.data[:, :2]
    labels = iris.target
    wide = KNN(15)
    wide.fit(features, labels)
    predicted = wide.predict(features)
    wide.accuracy(predicted, labels)
    wide.plot(features, labels)

    print(
        "Adding new point to dataset and testing with full Iris 1-k data set..."
    )
    single = KNN(1)
    single.fit(features, labels)
    # Extend the plotted data (not the fitted data) with one labelled point.
    extra_label = np.array([1])
    labels_plus = np.append(labels, extra_label)
    features_plus = np.vstack([features, [5.0, 3.2]])
    single.plot(features_plus, labels_plus)

    print("Testing SKLearn's model...")
    reference = neighbors.KNeighborsClassifier(1)
    reference.fit(features, labels)
    plot_decision_regions(features_plus, labels_plus, reference)
Esempio n. 4
0
def main(n):
    """Run the KNN experiment for partition ``n`` and write an accuracy report.

    Each line of ``./particoes/ts<n>.txt`` is split on single spaces: the
    first field is the expected class and the remaining fields are the test
    vector. The vector is classified with DTW and Euclidean KNN against
    ``./particoes/cj<n>.txt`` for every k in ``k``; a hit is counted when the
    class at index 1 of a result entry equals the expected class. Per-k and
    overall accuracies are written to ``./relatorios/relatorioFinal<n>.txt``.

    Both files are handled with ``with`` blocks so they are always closed
    (the report in particular is now guaranteed to be flushed to disk).

    :param n: partition identifier used to build the file names.
    :raises ZeroDivisionError: if the test file contains no lines.
    """
    print('Teste: ' + str(n))
    arquivoTreino = './particoes/cj' + str(n) + '.txt'
    k = [1, 3, 5, 10]

    count = 0
    hitDTW = [0] * len(k)
    hitEuclidiana = [0] * len(k)
    with open('./particoes/ts' + str(n) + '.txt', 'r') as teste:
        for testeLinha in teste:
            count += 1  # number of test lines processed so far
            linhateste = testeLinha.split(" ")
            vetordeteste = list(map(float, linhateste[1:]))

            ResultadoDTW = KNN(arquivoTreino, vetordeteste, k).runKNN_DTW()
            ResultadoEuclidiana = KNN(arquivoTreino, vetordeteste,
                                      k).runKNN_Euclidiana()

            # Hit counters: result position `pos` lines up with k[pos].
            for pos, resultado in enumerate(ResultadoDTW):
                if int(linhateste[0]) == int(resultado[1]):
                    hitDTW[pos] += 1

            for pos, resultado in enumerate(ResultadoEuclidiana):
                if int(linhateste[0]) == int(resultado[1]):
                    hitEuclidiana[pos] += 1

            # Progress indicator; 240 is presumably the expected number of
            # test lines per partition -- TODO confirm.
            print("Progresso" + str(n) + ": " + str(count * 100 / 240) +
                  "%")

    # Test report: one line per k and a total, for each distance measure.
    with open("./relatorios/relatorioFinal" + str(n) + ".txt",
              "w") as relatorio:
        relatorio.write('Teste: ' + str(n) + "\n")
        for nome, hits in (("DTW", hitDTW), ("Euclidiana", hitEuclidiana)):
            for valor_k, acertos in zip(k, hits):
                relatorio.write("Accuracy " + nome + " K=" + str(valor_k) +
                                ": " + str(acertos * 100 / count) + "% \n")
            relatorio.write("Accuracy " + nome + " TOTAL: " +
                            str((sum(hits) * 100) / (len(k) * count)) +
                            "% \n")
Esempio n. 5
0
def plot_knn():
    """Plot OLS and KNN regression fits on a 4x4 grid of random problems.

    For every subplot a random 1-input/1-output regression problem is
    generated, a ``LinearRegression`` and three ``KNN`` regressors
    (k = 1, 5, 10) are fitted, and their predictions over a slightly widened
    x-range are drawn on top of the test points. The figure is saved to
    ``img/knn_plots.png``.
    """
    np.random.seed(12345)
    fig, axes = plt.subplots(4, 4)
    knn_specs = ((1, "KNN (k=1)"), (5, "KNN (k=5)"), (10, "KNN (k=10)"))

    for seed, ax in enumerate(axes.flatten()):
        n_in, n_out = 1, 1
        d = np.random.randint(1, 5)
        n_ex = np.random.randint(5, 500)
        std = np.random.randint(0, 1000)
        intercept = np.random.rand() * np.random.randint(-300, 300)
        X_train, y_train, X_test, y_test, coefs = random_regression_problem(
            n_ex, n_in, n_out, d=d, intercept=intercept, std=std, seed=seed)

        def mse(pred):
            # Mean squared error against the current test targets.
            return np.mean((y_test.flatten() - pred.flatten())**2)

        ols = LinearRegression(fit_intercept=True)
        ols.fit(X_train, y_train)
        loss = mse(ols.predict(X_test))

        # Losses are computed but not displayed (no title is set on the axes).
        knn_models = []
        knn_losses = []
        for k, _label in knn_specs:
            regressor = KNN(k=k, classifier=False, leaf_size=10,
                            weights="uniform")
            regressor.fit(X_train, y_train)
            knn_losses.append(mse(regressor.predict(X_test)))
            knn_models.append(regressor)

        # Extend the plotting range by 10% of the test span on both sides.
        lo, hi = min(X_test), max(X_test)
        margin = 0.1 * (hi - lo)
        X_plot = np.linspace(lo - margin, hi + margin, 100)
        ols_curve = ols.predict(X_plot)
        knn_curves = [regressor.predict(X_plot) for regressor in knn_models]

        ax.scatter(X_test, y_test, alpha=0.5)
        ax.plot(X_plot, ols_curve, label="OLS", alpha=0.5)
        for (_k, label), curve in zip(knn_specs, knn_curves):
            ax.plot(X_plot, curve, label=label, alpha=0.5)
        ax.legend()

        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])

    plt.tight_layout()
    plt.savefig("img/knn_plots.png", dpi=300)
    plt.close("all")
def rerank_results(feedback, similar_images, similar_image_vectors,
                   query_image_vector):
    """Re-rank similar images using decision-tree relevance feedback.

    A ``DecisionTree`` is trained on the feedback set returned by
    ``get_training_set`` and used to label every vector in
    ``similar_image_vectors`` as relevant (1) or irrelevant (-1). The result
    lists the keys predicted relevant first, followed by the keys predicted
    irrelevant, each group in its original order. KNN neighbours of the query
    are computed for both groups but only printed.

    :param feedback: mapping of image -> feedback value.
    :param similar_images: stored in the module-level ``similar_images_g``.
    :param similar_image_vectors: mapping of image key -> vector (objects
        exposing ``tolist()``); stored in ``similar_image_vectors_g``.
    :param query_image_vector: vector of the query image.
    :return: list of image keys in re-ranked order.
    """
    global similar_images_g, similar_image_vectors_g
    similar_images_g = similar_images
    similar_image_vectors_g = similar_image_vectors

    # Decision-tree based relevance feedback.
    tree = DecisionTree()
    raw_train, y_train = get_training_set(list(feedback.keys()),
                                          list(feedback.values()))
    x_train = [row.tolist() for row in raw_train]
    tree.fit(x_train, y_train)

    x_test = [vec.tolist() for vec in similar_image_vectors_g.values()]
    predictions = tree.predict(x_test)

    def neighbours_within(indices):
        # Fit a KNN on the selected vectors, using as many neighbours as
        # there are vectors, and query it with the query image.
        subset = [x_test[idx] for idx in indices]
        model = KNN(len(indices))
        model.fit(subset)
        return model.get_neighbours([query_image_vector])

    # Relevant images.
    indices_rel = [pos for pos, label in enumerate(predictions) if label == 1]
    print("Relevant", indices_rel)
    print("Neighbours Rel", neighbours_within(indices_rel))

    # Irrelevant images.
    indices_ir = [pos for pos, label in enumerate(predictions) if label == -1]
    print("Irrelevant", indices_ir)
    print("Neighbours IR", neighbours_within(indices_ir))

    image_keys = list(similar_image_vectors_g.keys())
    return [image_keys[pos] for pos in indices_rel + indices_ir]
Esempio n. 7
0
    def tuning_without_scaling(self, distance_funcs, x_train, y_train, x_val, y_val):
        """
        Try every distance function with every odd k in ``range(1, 30, 2)``
        and keep the model with the best f1-score on the validation set.

        :param distance_funcs: dictionary of distance functions (name -> function)
            to try.
        :param x_train: List[List[int]] training data set to train the KNN model
        :param y_train: List[int] train labels to train the KNN model
        :param x_val:  List[List[int]] validation data set used with KNN predict
            to produce predicted labels and tune k and the distance function.
        :param y_val: List[int] validation labels

        The best k, distance function name and model (a trained KNN instance)
        are assigned to self.best_k, self.best_distance_function and
        self.best_model respectively. self.best_scaler is not touched here.

        Tie-breaking: on equal f1-score the distance function with the higher
        value in ``self.distance_function_map`` wins
        (documented priority: euclidean > minkowski > gaussian > inner_prod >
        cosine_dist); for equal priority the smaller k wins.

        Fix over the previous version: on an equal-priority tie the stored
        model used to be replaced by a freshly constructed, *untrained* KNN,
        and the stored function name could disagree with the stored model.
        The three attributes are now always updated together and always refer
        to a trained model.
        """
        # You need to assign the final values to these variables
        self.best_k = None
        self.best_distance_function = None
        self.best_model = None

        max_f1 = -1
        for func_key in distance_funcs:
            for k in range(1, 30, 2):
                model = KNN(k, distance_funcs[func_key])
                model.train(x_train, y_train)
                predicted_y = model.predict(x_val)
                current_f1 = f1_score(y_val, predicted_y)

                if current_f1 > max_f1:
                    is_better = True
                elif current_f1 == max_f1:
                    # Tie on f1: compare function priority, then k.
                    current_rank = self.distance_function_map[func_key]
                    best_rank = self.distance_function_map[
                        self.best_distance_function]
                    is_better = (current_rank > best_rank
                                 or (current_rank == best_rank
                                     and k < self.best_k))
                else:
                    is_better = False

                if is_better:
                    max_f1 = current_f1
                    self.best_k = k
                    self.best_distance_function = func_key
                    self.best_model = model
Esempio n. 8
0
    def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train, y_train, x_val, y_val):
        """
        Same search as "tuning_without_scaling", but every scaler in
        scaling_classes is additionally tried: before the training and
        validation data are passed to the KNN model, the scaler is applied to
        both of them.

        :param distance_funcs: dictionary of distance functions (key is the function name, value is the function)
            to try.
        :param scaling_classes: dictionary of scalers (key is the scaler name, value is the scaler class)
            used to normalize the data
        :param x_train: List[List[int]] training data set to train the KNN model
        :param y_train: List[int] train labels to train the KNN model
        :param x_val: List[List[int]] validation data
        :param y_val: List[int] validation labels

        The best k, distance function (its name), scaler (its name) and model
        (an instance of KNN) are assigned to self.best_k,
        self.best_distance_function, self.best_scaler and self.best_model.
        All four stay None when there is nothing to search over.

        Tie-breaking: candidates are visited in scaler insertion order, then
        distance-function insertion order, then increasing k, and a candidate
        only replaces the current best on a strictly higher f1-score, so the
        earliest candidate wins ties (same choice as the previous
        sort-by-(score, -index) implementation).

        Fixes over the previous version:
        * the best model is looked up in the ``distance_funcs`` argument
          instead of a hard-coded three-entry dictionary, which raised
          KeyError for any other function name;
        * the stored model is the one trained on the data produced by the
          winning scaler rather than a model retrained on unscaled data.
        """
        # You need to assign the final values to these variables
        self.best_k = None
        self.best_distance_function = None
        self.best_scaler = None
        self.best_model = None

        best_f1 = None
        for scaler_name, scaler_cls in scaling_classes.items():
            scaler = scaler_cls()
            # Training data first, then validation data, same scaler instance.
            x_train_scaled = scaler(x_train)
            x_val_scaled = scaler(x_val)
            for dist_name, dist_func in distance_funcs.items():
                for k in range(1, 30, 2):
                    knn = KNN(k, dist_func)
                    knn.train(x_train_scaled, y_train)
                    score = f1_score(y_val, knn.predict(x_val_scaled))
                    if best_f1 is None or score > best_f1:
                        best_f1 = score
                        self.best_k = k
                        self.best_distance_function = dist_name
                        self.best_scaler = scaler_name
                        self.best_model = knn
Esempio n. 9
0
    def testscale(self, scale, trial=None):
        """Run ``self.validation`` on temporarily rescaled data.

        ``self.data`` and ``self.testdata`` are replaced by their rescaled
        versions, ``self.knn`` is rebuilt on the rescaled data and
        ``self.validation(trial)`` is evaluated. Afterwards the original data
        is put back and ``self.knn`` is rebuilt on it. The restore step runs
        in a ``finally`` block so the object is not left in the rescaled
        state when rescaling, model construction or validation raises.

        :param scale: scaling argument forwarded to ``self.rescale``.
        :param trial: forwarded unchanged to ``self.validation``.
        :return: whatever ``self.validation(trial)`` returns.
        """
        backup = self.data, self.testdata
        try:
            # Both rescaled sets are computed before either attribute is
            # overwritten, exactly as in the original tuple assignment.
            self.data, self.testdata = (self.rescale(self.data, scale),
                                        self.rescale(self.testdata, scale))
            self.knn = KNN(self.data)
            return self.validation(trial)
        finally:
            self.data, self.testdata = backup
            self.knn = KNN(self.data)
Esempio n. 10
0
def main(argv):
    """Run the KNN demo on the Iris and Wisconsin breast-cancer data files.

    For each data set a ``KNN`` is built from a training file and a test
    file, its training and testing sets are fetched, and ``print_result`` is
    called. The meaning of the extra numeric constructor arguments is not
    visible here.

    :param argv: command-line arguments (unused).
    """
    iris_model = KNN('iris.data', 'iris.test', 4)
    iris_train = iris_model.get_training_set()
    iris_test = iris_model.get_testing_set()
    print('**********\n***** IRIS ****\n***********')
    print_result(iris_model, iris_train, iris_test)

    wdbc_model = KNN('wdbc.data', 'wdbc.test', 1, 0)
    # For this data set the banner is printed before the sets are fetched.
    print('**********\n***** Breast Cancer in Wisconsin ****\n***********')
    wdbc_train = wdbc_model.get_training_set()
    wdbc_test = wdbc_model.get_testing_set()
    print_result(wdbc_model, wdbc_train, wdbc_test)
Esempio n. 11
0
    def tuning_without_scaling(self, distance_funcs, x_train, y_train, x_val,
                               y_val):
        """
        Try the given distance functions with every odd k below 30 (and not
        larger than the number of training points) and keep the model with
        the best f1-score on the validation set.

        :param distance_funcs: dictionary of distance functions (name -> function)
            to try.
        :param x_train: List[List[int]] training data set to train the KNN model
        :param y_train: List[int] train labels to train the KNN model
        :param x_val:  List[List[int]] validation data set used with KNN predict
            to produce predicted labels and tune k and the distance function.
        :param y_val: List[int] validation labels

        The best k, distance function name and model (a trained KNN instance)
        are assigned to self.best_k, self.best_distance_function and
        self.best_model respectively. self.best_scaler is not touched here.

        Tie-breaking: distance functions are visited in the priority order
        [canberra > minkowski > euclidean > gaussian > inner_prod > cosine_dist]
        and k in increasing order; a candidate only replaces the current best
        on a strictly higher f1-score, so ties go to the higher-priority
        function and then to the smaller k.

        Changes over the previous version: functions from the priority list
        that are missing in ``distance_funcs`` are skipped instead of raising
        KeyError, functions not in the priority list are tried last, and the
        index arithmetic based on a hard-coded count of six functions is
        replaced by direct tracking of the best candidate.

        :raises ValueError: if there is no candidate to evaluate (empty
            ``x_train`` or empty ``distance_funcs``).
        """
        priority = [
            'canberra', 'minkowski', 'euclidean', 'gaussian', 'inner_prod',
            'cosine_dist'
        ]
        ordered_names = [name for name in priority if name in distance_funcs]
        ordered_names += [
            name for name in distance_funcs if name not in priority
        ]

        best_f1 = None
        best_k = None
        best_name = None
        best_model = None
        # Odd k values that are < 30 and <= len(x_train).
        k_limit = min(30, len(x_train) + 1)
        for name in ordered_names:
            for k in range(1, k_limit, 2):
                classifier = KNN(k, distance_funcs[name])
                classifier.train(x_train, y_train)
                score = f1_score(y_val, classifier.predict(x_val))
                if best_f1 is None or score > best_f1:
                    best_f1 = score
                    best_k = k
                    best_name = name
                    best_model = classifier

        if best_model is None:
            raise ValueError(
                "no candidate model: x_train or distance_funcs is empty")

        self.best_k = best_k
        self.best_distance_function = best_name
        self.best_model = best_model
Esempio n. 12
0
    def test_compare_to_scikit_learn_changing_k(self):
        """Compare our KNN with scikit-learn's for k = 1..11.

        The normalised data set is split into a training part and the last
        ``testSize`` samples as test part. For every k both classifiers
        predict each test sample and the fractions of correct predictions
        must agree to three decimal places.

        Fix over the previous version: the test targets were sliced with
        ``[:trainSize]`` (the training labels) instead of ``[trainSize:]``,
        so predictions were compared against labels of unrelated samples.
        The inner loop also no longer reuses the outer loop variable name.
        """
        normalizer = Normalizer(self.data)
        data = normalizer.normalize()

        testSize = 100
        trainSize = len(data.data) - testSize
        for neighbours in range(1, 12):
            with self.subTest(i=neighbours):
                print("k: ", neighbours)

                trainData = {
                    'data': data.data[:trainSize],
                    'target': data.target[:trainSize],
                }
                # Test features and test labels come from the same tail slice.
                testData = {
                    'data': data.data[trainSize:],
                    'target': data.target[trainSize:],
                }
                knn = KNN(trainData)

                # scikit-learn model:
                model = KNeighborsClassifier(n_neighbors=neighbours)
                model.fit(trainData['data'], trainData['target'])

                ourCounter = 0
                sciCounter = 0
                for sample, expected in zip(testData['data'],
                                            testData['target']):
                    if knn.makeGuess(sample, neighbours) == expected:
                        ourCounter += 1

                    if model.predict([sample]) == expected:
                        sciCounter += 1

                self.assertAlmostEqual(ourCounter / testSize,
                                       sciCounter / testSize, 3)
Esempio n. 13
0
def question_4(points):
    """
    Question 4: print the 2-fold cross-validation accuracy of KNN for
    k = 5 and k = 7 under each of the four normalizers.

    :param points: list of Point
    """
    normalizers = (
        (DummyNormalizer, "DummyNormalizer"),
        (SumNormalizer, "SumNormalizer"),
        (MinMaxNormalizer, "MinMaxNormalizer"),
        (ZNormalizer, "ZNormalizer"),
    )
    print("Question 4:")
    for k in (5, 7):
        print("K=", k, sep="")
        model = KNN(k)
        model.train(points)
        cv = CrossValidation()
        for normalizer_cls, label in normalizers:
            normalizer = normalizer_cls()
            normalizer.fit(points)
            scaled_points = normalizer.transform(points)
            # 2 is the best n-fold
            average_score = cv.run_cv(scaled_points, 2, model,
                                      accuracy_score, False, True)
            print("Accuracy of", label, "is",
                  "{:.2f}".format(average_score))
            print()
Esempio n. 14
0
def execute():
    """Fill missing values in the reduced data set and print the result.

    The CSV is loaded, identifier columns are dropped, and the frame is
    normalised through ``StepByStepRegression``. Columns are then reordered
    according to ``get_least_nan_columns``, related features are determined
    either with ``KNN`` or with ``sbs_reg.correlation`` depending on
    ``sbs_reg.sim_fun``, missing data is filled, and the un-normalised frame
    is printed.

    ``.items()`` is used instead of ``.iteritems()``: the latter was removed
    from pandas in 2.0, while ``.items()`` works on pandas objects of any
    version as well as on plain dicts.
    """
    df = pd.read_csv('data/reduced_dataset_v3.csv')

    df = df.drop(
        ['Unnamed: 0', 'Time', 'Time Code', 'Country Name', 'Country Code'],
        axis=1)
    sbs_reg = StepByStepRegression(df, 'correlation')
    sbs_reg.normalize()

    # Presumably a Series/mapping of column name -> NaN count, ordered from
    # fewest to most NaNs -- TODO confirm against get_least_nan_columns.
    sorted_columns = sbs_reg.get_least_nan_columns(sbs_reg.normalized_df)
    sorted_df = pd.DataFrame()
    for col, _nulls in sorted_columns.items():
        sorted_df[col] = sbs_reg.normalized_df[col]

    correlated_features_list = {}
    if sbs_reg.sim_fun == 'KNN':
        knn = KNN(sorted_df, 3)
        correlated_features_list = knn.findKNeighbors()
    elif sbs_reg.sim_fun == 'correlation':
        correlated_features_list = sbs_reg.correlation(sorted_df)
    print('correlated features', correlated_features_list)
    sbs_reg.fill_missing_data(sorted_df, correlated_features_list)
    un_normalized = sbs_reg.un_normalize(sorted_df)
    print(un_normalized)
Esempio n. 15
0
    def test_iris_regression(self):
        """
        Tests kNN for regression: predict petal width (column 3) from petal
        length (column 2) of the Iris data with a single neighbour.
        """
        n_neighbours = 1
        iris = load_iris()

        regressor = KNN(n_neighbours, "average")

        # Petal length as a 2-D, single-column input.
        petal_length = iris.data[:, 2].reshape(-1, 1)
        # Petal width as the regression target.
        petal_width = iris.data[:, 3]

        regressor.fit(petal_length, petal_width)
        predicted = regressor.predict(petal_length)

        # Output must be 1-D with one value per sample.
        self.assertEqual(len(predicted.shape), 1)
        self.assertEqual(predicted.shape[0], iris.data.shape[0])

        # With one neighbour every point would ideally match itself, but
        # with a single input dimension several points share the same value,
        # so only a small error is required rather than an exact match.
        error = mean_squared_error(petal_width, predicted)
        self.assertLess(error, 0.1)
Esempio n. 16
0
def test_knn_regression(n_trials=None):
    """Fuzz-test the KNN regressor against scikit-learn's KNeighborsRegressor.

    Each trial draws random sizes, k, leaf size and weighting scheme, fits
    both models on the same random data and checks that every prediction on
    a random test set agrees (``np.testing.assert_almost_equal``).

    :param n_trials: number of random trials to run. ``None`` (the default)
        keeps the original behaviour of looping until an assertion fails;
        pass an integer to make the test terminate.
    """
    trial = 0
    while n_trials is None or trial < n_trials:
        N = np.random.randint(2, 100)
        M = np.random.randint(2, 100)
        k = np.random.randint(1, N)
        # Leaf size is capped at N - 1.
        ls = np.min([np.random.randint(1, 10), N - 1])
        weights = np.random.choice(["uniform", "distance"])

        X = np.random.rand(N, M)
        X_test = np.random.rand(N, M)
        y = np.random.rand(N)

        knn = KNN(k=k,
                  leaf_size=ls,
                  metric=euclidean,
                  classifier=False,
                  weights=weights)
        knn.fit(X, y)
        preds = knn.predict(X_test)

        # Reference model: Minkowski metric with p=2 on a ball tree.
        gold = KNeighborsRegressor(
            p=2,
            leaf_size=ls,
            n_neighbors=k,
            weights=weights,
            metric="minkowski",
            algorithm="ball_tree",
        )
        gold.fit(X, y)
        gold_preds = gold.predict(X_test)

        for mine, theirs in zip(preds, gold_preds):
            np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
        trial += 1
Esempio n. 17
0
def main():
    """Classify test trajectories with a 5-NN/DTW classifier and save them.

    The training and test CSVs are loaded with their ``Trajectory`` column
    parsed by ``literal_eval``. Journey pattern ids are label-encoded, the
    classifier is cross-validated and then fitted on all training
    trajectories, and the decoded predictions for the test set are passed to
    ``writeInCsv``.
    """
    trajectory_parser = {'Trajectory': literal_eval}
    trainSet = pd.read_csv('datasets/train_set.csv',
                           converters=trajectory_parser)
    testSet = pd.read_csv('datasets/test_set_a2.csv',
                          converters=trajectory_parser)

    # Integer labels for the journey pattern categories.
    encoder = preprocessing.LabelEncoder()
    categoryIds = encoder.fit_transform(trainSet['journeyPatternId'])

    allSequences = [row['Trajectory'] for _, row in trainSet.iterrows()]

    # KNN classifier with 5 neighbours and DTW as second argument.
    classifier = KNN(5, DTW)

    crossValidation(classifier, allSequences, categoryIds, encoder)
    classifier.fit(allSequences, categoryIds)

    # Predict the categories of the test set and map them back to names.
    predicted_ids = classifier.predict(testSet['Trajectory'])
    writeInCsv(encoder.inverse_transform(predicted_ids))
Esempio n. 18
0
    def fit(self, X, y):
        """Fit a stacking ensemble: three base models plus a tree on top.

        A random forest, a 3-NN and a naive Bayes model are fitted on
        ``(X, y)``. Their predictions on the same ``X`` are placed side by
        side as columns and used as the input of a ``DecisionTree``
        meta-classifier trained against ``y``. All four fitted models are
        stored on the instance.

        :param X: training features.
        :param y: training labels.
        """
        # Instantiate the base models.
        forest = RandomForest(num_trees=15)
        neighbours = KNN(k=3)
        bayes = NaiveBayes(num_classes=2)

        # Random forest: fit and predict.
        forest.create_splits(X)
        forest.fit(X, y)
        forest_pred = forest.predict(X)

        # K-nearest neighbours: fit and predict.
        neighbours.fit(X, y)
        neighbours_pred = neighbours.predict(X)

        # Naive Bayes: fit and predict.
        bayes.fit(X, y)
        bayes_pred = bayes.predict(X)

        # One column per base model, one row per prediction.
        columns = [
            pred.reshape((pred.size, 1))
            for pred in (forest_pred, neighbours_pred, bayes_pred)
        ]
        meta_input = np.hstack(columns)

        # Decision tree as meta-classifier.
        meta = DecisionTree(max_depth=np.inf)
        meta.fit(meta_input, y)

        self.rf = forest
        self.knn = neighbours
        self.nb = bayes
        self.meta_classifier = meta
Esempio n. 19
0
    def test_blob_classification_numpy(self):
        """
        Tests kNN classification (``predict_numpy``) on points produced by
        ``generate_cluster_samples``, using a stratified split into training
        and testing sets.
        """
        features, labels = generate_cluster_samples()
        train_X, test_X, train_y, test_y = train_test_split(
            features, labels, stratify=labels)

        n_neighbours = 3
        classifier = KNN(n_neighbours)
        classifier.fit(train_X, train_y)
        predicted = classifier.predict_numpy(test_X)

        # Output must be 1-D with one label per test sample.
        self.assertEqual(len(predicted.shape), 1)
        self.assertEqual(predicted.shape[0], test_X.shape[0])

        # Every held-out point is expected to be classified correctly.
        self.assertAlmostEqual(accuracy_score(test_y, predicted), 1.0)
Esempio n. 20
0
def main():
    """Evaluate KNN on the CM1 data set with 10-fold cross-validation.

    For every k in a fixed list the classifier is fitted and scored on each
    fold; mean accuracy (in percent), its standard deviation and the elapsed
    wall-clock time are printed.
    """
    neighbour_counts = [1, 2, 3, 5, 7, 9, 11, 13, 15]

    # Load CM1 and build the feature frame and the target.
    raw = arff.loadarff('./datasets/CM1.arff')
    X, Y = build_dataframe(raw)

    # Normalize the features.
    X = normalize_data(X)

    # 10-fold splitter.
    folds = KFold(n_splits=10)

    for k in neighbour_counts:
        clf = KNN(k=k)
        print("k equals {}".format(k))

        def fold_accuracy(train_idx, test_idx):
            # Fit on the training fold, return accuracy in percent on the
            # test fold.
            clf.fit(X.iloc[train_idx], Y.iloc[train_idx])
            predicted = clf.predict(X.iloc[test_idx])
            hits = np.sum(predicted == Y.iloc[test_idx])
            return (hits / len(test_idx)) * 100

        started = time.time()
        scores = [fold_accuracy(train, test) for train, test in folds.split(X)]
        finished = time.time()

        scores = np.array(scores)

        print("mean accuracy: {}".format(np.mean(scores)))
        print("standard deviation: {}".format(np.std(scores)))
        print("time elapsed: {}".format(finished - started))
Esempio n. 21
0
    def test_synthetic_data(self):
        """
        Test KNN.predict on a small hand-made 2-D data set, first with
        three neighbours and then with a single neighbour.
        """
        train_points = np.array([[1, 2], [1, 3], [2, 2], [2, 3], [1, 1],
                                 [2, 1]])
        train_labels = np.array([1, 1, 1, -1, -1, -1])

        classifier = KNN(k=3)
        classifier.fit(train_points, train_labels)

        queries = np.array([
            [1.8, 2.6],
            [2.0, 1.8],
            [1.5, 2.0],
            [1.0, 2.5],
            [1.5, 1.0],
            [2.0, 1.0],
        ])

        expected_3nn = np.array([1, 1, 1, 1, -1, -1])
        predicted_3nn = classifier.predict(queries)
        self.assertTrue(np.array_equal(predicted_3nn, expected_3nn))

        # Switching to 1-NN flips exactly the first label.
        classifier.k = 1
        expected_1nn = np.array([-1, 1, 1, 1, -1, -1])
        predicted_1nn = classifier.predict(queries)
        self.assertTrue(np.array_equal(predicted_1nn, expected_1nn))
Esempio n. 22
0
def computeKNNCrossValidation(args, dict_algorithms):
    """Run KNN cross-validation and store its result under the "knn" key.

    :param args: options object passed to ``KNN``; its ``debug`` flag
        enables progress output.
    :param dict_algorithms: dictionary updated in place with the result.
    """
    if args.debug:
        print("Running knn...", end='')

    knn_model = KNN(args)
    result = knn_model.computeCrossValidation()
    dict_algorithms["knn"] = result

    if args.debug:
        print("ok!")
Esempio n. 23
0
File: main.py Progetto: crab-a/lab4
def run_knn(points):
    """Train a 5-NN on ``points``, show one prediction and cross-validate.

    The predicted and the true class of the first point are printed, then a
    10-fold cross-validation with ``accuracy_score`` is run on the same
    model.

    :param points: sequence of points exposing a ``label`` attribute.
    """
    model = KNN(5)
    model.train(points)

    sample = points[0]
    print(f'predicted class: {model.predict(sample)}')
    print(f'true class: {sample.label}')

    validator = CrossValidation()
    validator.run_cv(points, 10, model, accuracy_score)
Esempio n. 24
0
 def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train, y_train, x_val, y_val):
     """Grid-search scaler, distance function and k by validation f1-score.

     Every scaler class is instantiated and applied to the original training
     and validation features; for each distance function and each odd k a
     ``KNN`` is trained on the scaled training data and scored with
     ``f1_score`` on the scaled validation data. A candidate replaces the
     current best only on a strictly higher score, so ties keep the first
     candidate encountered.

     Fixes over the previous version:
     * ``x_train``/``x_val`` were overwritten inside the scaler loop, so
       every scaler after the first was applied to data that had already
       been scaled by the previous ones; each scaler now works on the
       original inputs;
     * for training sets with at most 30 points the k range stopped one
       short of ``len(x_train)`` (and was empty for a single training
       point); k now runs over odd values < 30 and <= len(x_train).

     :param distance_funcs: mapping of name -> distance function.
     :param scaling_classes: mapping of name -> scaler class.
     :param x_train: training features.
     :param y_train: training labels.
     :param x_val: validation features.
     :param y_val: validation labels.
     :return: ``(best_k, best_distance_function, best_scaler, best_model)``.
     """
     optimal_k = 0
     function = ""
     scalar = ""
     model = None
     f1_scores = -2 ** 32
     max_k = min(30, len(x_train) + 1)

     for scaler_name, scaler_cls in scaling_classes.items():
         scaler = scaler_cls()
         # Scale copies; the original inputs stay untouched for the next
         # scaler.
         scaled_train = scaler(x_train)
         scaled_val = scaler(x_val)
         for key, value in distance_funcs.items():
             for k_value in range(1, max_k, 2):
                 train_model = KNN(k_value, value)
                 train_model.train(scaled_train, y_train)
                 cur_f1 = f1_score(y_val, train_model.predict(scaled_val))
                 if f1_scores < cur_f1:
                     optimal_k = k_value
                     function = key
                     model = train_model
                     f1_scores = cur_f1
                     scalar = scaler_name

     self.best_k = optimal_k
     self.best_distance_function = function
     self.best_scaler = scalar
     self.best_model = model
     return self.best_k, self.best_distance_function, self.best_scaler, self.best_model
Esempio n. 25
0
def main():
    """Load a CSV data set, fit a KNN model and print evaluation metrics.

    Reads the file named by ``args.dataset`` from the ``Data`` directory
    (relative to the current working directory). All columns but the last
    are used as features and the last column as the target. The data is
    split 80/20 with a fixed random seed, a ``KNN`` model is built from the
    training part with ``k=args.k``, and predictions for the test part are
    evaluated according to ``args.type``:

    * ``"clf"``: prints the confusion matrix and classification report.
    * ``"reg"``: prints the mean squared error.
    * anything else: prints an error message and returns.

    In both supported modes the fraction of predictions exactly equal to
    the true value is printed last.
    """
    import os  # local import keeps this block self-contained

    # Build the path with os.path.join instead of a ".\Data\..." literal:
    # the literal relied on invalid escape sequences (a SyntaxWarning on
    # recent Python versions) and only worked with Windows separators.
    df = pd.read_csv(os.path.join(".", "Data", args.dataset))

    X = np.array(df.iloc[:, :-1])
    y = np.array(df.iloc[:, -1])
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    knn = KNN(X_train, y_train, k=args.k)

    if args.type == "clf":
        y_pred = knn.predict(X_test, knn_type="clf")
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))

    elif args.type == "reg":
        y_pred = knn.predict(X_test, knn_type="reg")
        mse = np.mean((y_test - y_pred)**2)
        print(mse)

    else:
        print("Undefined knn type")
        return

    # NOTE(review): for "reg" this is the share of exact matches between
    # predictions and targets, which is rarely meaningful for continuous
    # values - kept so the printed output stays unchanged.
    accuracy = np.mean(y_pred == y_test)
    print(accuracy)
Esempio n. 26
0
    def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train,
                            y_train, x_val, y_val):
        """
        Same search as "tuning_without_scaling", but additionally tries every
        scaler in scaling_classes: each scaler is applied to both the training
        and the validation data before they are passed to the KNN model.

        :param distance_funcs: dictionary of distance functions (key is the function name, value is the function) to try. All distance functions are tried for every scaler.
        :param scaling_classes: dictionary of scalers (key is the scaler name, value is the scaler class) used to normalize the data
        :param x_train: List[List[int]] training data set to train the KNN model
        :param y_train: List[int] train labels to train the KNN model
        :param x_val: List[List[int]] validation data
        :param y_val: List[int] validation labels

        The best k (odd values 1, 3, ..., 29), distance function name, scaler
        name and model (an instance of KNN) are stored in self.best_k,
        self.best_distance_function, self.best_scaler and self.best_model.
        Nothing is returned.

        NOTE: On a tie the earliest candidate wins, because only a strictly
        greater f1-score replaces the current best. Priority is therefore:
        scaler (insertion order of scaling_classes), then distance function
        (insertion order of distance_funcs), then smaller k.
        """

        # Start below zero (F1 is never negative) so the first candidate is
        # always recorded; with an initial value of 0 the best_* attributes
        # stayed unassigned whenever every candidate scored an F1 of 0.
        best_f1 = -1
        for scaling_name, scaling_func in scaling_classes.items():
            scaler = scaling_func()
            x_train_scaled = scaler(x_train)
            x_val_scaled = scaler(x_val)
            for name, func in distance_funcs.items():
                for k in range(1, 30, 2):
                    model = KNN(k, func)
                    model.train(x_train_scaled, y_train)
                    valid_f1 = f1_score(y_val, model.predict(x_val_scaled))
                    if valid_f1 > best_f1:
                        best_f1 = valid_f1
                        self.best_k = k
                        self.best_distance_function = name
                        self.best_scaler = scaling_name
                        self.best_model = model
Esempio n. 27
0
def tuneK(dataMat, labels, folds, categ, cnnSub=None):
    """Pick the K in 1..10 that gives the smallest validation error.

    One fold is chosen at random as the validation set and the remaining
    folds are stacked into the training index. When ``cnnSub`` is given,
    the training data and labels are taken from ``dataMat[cnnSub]`` and
    ``labels[cnnSub]`` instead. ``KNN`` is run once per K and scored with
    ``errRate``.

    Returns a pair: the K with the smallest error (as selected by
    ``np.argmin``) and the array holding the error of every K tried.
    """
    candidates = np.arange(1, 11)  # K values under test
    errors = np.empty(len(candidates))  # one error slot per candidate K

    # Randomly choose which fold is held out for validation.
    held_out = np.random.randint(len(folds))
    train_idx = np.hstack(
        [fold for pos, fold in enumerate(folds) if pos != held_out])
    valid_idx = np.hstack([folds[held_out]])

    dataTrain = dataMat[train_idx, :]
    labelTrain = labels[train_idx]
    dataTest = dataMat[valid_idx, :]
    labelTest = labels[valid_idx]

    # An explicit subset replaces the fold-based training set.
    if cnnSub is not None:
        dataTrain, labelTrain = dataMat[cnnSub], labels[cnnSub]

    for slot, k in enumerate(candidates):
        predicted = KNN(dataTrain, labelTrain, dataTest, k, categorical=categ)
        errors[slot] = errRate(predicted, labelTest, categorical=categ)

    best = np.argmin(errors)
    return candidates[best], errors
Esempio n. 28
0
    def tuning_without_scaling(self, distance_funcs, x_train, y_train, x_val,
                               y_val):
        """
        Try every distance function with every odd k (1, 3, 5, ..., 29) and keep
        the model with the highest f1-score on the given validation set.

        :param distance_funcs: dictionary of distance functions (key is the function name, value is the function) to try. All k values are tried for each distance function.
        :param x_train: List[List[int]] training data set to train the KNN model
        :param y_train: List[int] training labels to train the KNN model
        :param x_val:  List[List[int]] validation data
        :param y_val: List[int] validation labels

        The best k, distance function name and model (an instance of KNN) are
        stored in self.best_k, self.best_distance_function and self.best_model.
        self.best_scaler is set to None. Nothing is returned.

        NOTE: On a tie the earliest candidate wins, because only a strictly
        greater f1-score replaces the current best. Priority is therefore:
        distance function (insertion order of distance_funcs), then smaller k.
        """

        # No scaler takes part in this search; reset the attribute so a value
        # left over from an earlier scaled search cannot leak through.
        self.best_scaler = None

        # Start below zero (F1 is never negative) so the first candidate is
        # always recorded; with an initial value of 0 the best_* attributes
        # stayed unassigned whenever every candidate scored an F1 of 0.
        best_f1 = -1
        for name, func in distance_funcs.items():
            for k in range(1, 30, 2):
                model = KNN(k, func)
                model.train(x_train, y_train)
                valid_f1 = f1_score(y_val, model.predict(x_val))
                if valid_f1 > best_f1:
                    best_f1 = valid_f1
                    self.best_k = k
                    self.best_distance_function = name
                    self.best_model = model
Esempio n. 29
0
def test_knn_clf():
    """Fuzz-test the KNN classifier against scikit-learn's KNeighborsClassifier.

    Runs forever: each iteration draws a random problem size, neighbour
    count, class count and leaf size, fits both the local ``KNN`` and the
    reference ball-tree classifier on the same random data, and asserts
    that their predictions agree element by element. "PASSED" is printed
    after every successful iteration; the loop only stops when an
    assertion (or any other error) is raised or the process is interrupted.
    """
    while True:
        N = np.random.randint(2, 100)
        M = np.random.randint(2, 100)
        k = np.random.randint(1, N)
        # At least one class is required: a draw of 0 would make the label
        # sampling below call randint(0, 0, ...), which raises ValueError.
        n_classes = np.random.randint(1, 10)
        ls = np.min([np.random.randint(1, 10), N - 1])
        weights = "uniform"

        X = np.random.rand(N, M)
        X_test = np.random.rand(N, M)
        y = np.random.randint(0, n_classes, size=N)

        knn = KNN(k=k,
                  leaf_size=ls,
                  metric=euclidean,
                  classifier=True,
                  weights=weights)
        knn.fit(X, y)
        preds = knn.predict(X_test)

        # Reference model: Minkowski distance with p=2 on a ball tree.
        gold = KNeighborsClassifier(
            p=2,
            leaf_size=ls,
            n_neighbors=k,
            weights=weights,
            metric="minkowski",
            algorithm="ball_tree",
        )
        gold.fit(X, y)
        gold_preds = gold.predict(X_test)

        for mine, theirs in zip(preds, gold_preds):
            np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
Esempio n. 30
0
def test_knn(k, train_data, train_labels, test_data):
    """ test_knn function

    Builds a KNN classifier from the given training set, then classifies
    every entry of the testing data. The predictions are written to the
    CSV file 'predictions_digit_recognizer.csv' with the header
    ('ImageId', 'Label') and 1-based row ids.

    Args
    ----
    k : integer
        number of neighbors to use for KNN
    train_data : np.array
        training dataset
    train_labels : np.array
        training dataset labels
    test_data : np.array
        testing dataset

    Returns
    -------
    Tuple (np.array, np.array)
        NOTE(review): no return statement is visible in this part of the
        function - confirm whether anything is actually returned.
    """
    print("Final k:" + str(k))
    knn = KNN(k, train_data, train_labels)

    # print to CSV
    # newline='' lets the csv module control line endings itself.
    with open('predictions_digit_recognizer.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['ImageId', 'Label'])
        for i in range(len(test_data)):
            data = test_data[i]
            guess = knn.classify(data)
            # Ids start at 1; the guess is written as an integer string.
            writer.writerow([str(i + 1), str(int(guess))])