Example #1
0
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    """Grid-search k and the distance function for a KNN classifier.

    Every odd k in [1, min(30, len(Xtrain))) is paired with every entry of
    distance_funcs. Each candidate is trained on the training split and
    scored with f1_score on the validation split.

    :param distance_funcs: dict mapping a name to a distance function
    :param Xtrain: training features
    :param ytrain: training labels
    :param Xval: validation features
    :param yval: validation labels
    :return: (best_model, best_k, best_func) - the trained KNN with the
        highest validation F1, its k, and the name of its distance function.
        On ties the candidate evaluated first wins, because np.argmax returns
        the first maximum.
    """
    k_values = range(1, min(30, len(Xtrain)), 2)
    candidates = []  # (trained model, distance-function name), one per trial
    scores = []      # validation F1 of the candidate at the same position
    for func_name, dist in distance_funcs.items():
        for k in k_values:
            candidate = KNN(k, dist)
            candidate.train(Xtrain, ytrain)
            scores.append(f1_score(yval, candidate.predict(Xval)))
            candidates.append((candidate, func_name))
    winner, winner_name = candidates[np.argmax(np.array(scores))]
    return winner, winner.k, winner_name
Example #2
0
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval,
                                          yval):
    """Pick the KNN hyper-parameters with the best validation F1 score.

    Tries every odd k in [1, min(30, len(Xtrain))) with every distance
    function, training on (Xtrain, ytrain) and scoring on (Xval, yval).
    Ties keep the candidate seen first: smaller k, then the earlier entry of
    distance_funcs.

    The table of scores (row = k, column = position of the distance function
    in distance_funcs) and the selected configuration are printed.

    :param distance_funcs: dict mapping a name to a distance function
    :param Xtrain: training features (converted to a float array)
    :param ytrain: training labels (converted to an int array)
    :param Xval: validation features (converted to a float array)
    :param yval: validation labels (converted to an int array)
    :return: (best_model, best_k, best_func); (None, -1, None) when no
        candidate could be evaluated
    """
    Xtrain = np.array(Xtrain, dtype=float)
    ytrain = np.array(ytrain, dtype=int)
    Xval = np.array(Xval, dtype=float)
    yval = np.array(yval, dtype=int)
    upper_k = min(30, len(Xtrain))
    # One column per distance function instead of a fixed width of 4, so any
    # number of functions fits.
    f1 = np.zeros((30, len(distance_funcs)))
    best_model, best_k, best_func = None, -1, None
    # Start below every possible score so that a run where all candidates
    # score 0 still selects the first candidate instead of leaving the
    # results unassigned.
    best_f1 = float("-inf")
    for k in range(1, upper_k, 2):
        for col, name in enumerate(distance_funcs):
            inst = KNN(k, distance_funcs[name])
            inst.train(Xtrain, ytrain)
            score = f1_score(yval, inst.predict(Xval))
            f1[k][col] = score
            if score > best_f1:
                best_f1 = score
                best_model, best_k, best_func = inst, k, name
    print(f1)
    print(best_model, best_k, best_func)
    return best_model, best_k, best_func
Example #3
0
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval,
                                          yval):
    """Select k and the distance function for KNN by validation F1 score.

    :param distance_funcs: dict mapping a name to a distance function
    :param Xtrain: training features
    :param ytrain: training labels
    :param Xval: validation features
    :param yval: validation labels
    :return: (best_model, best_k, best_func) - the trained KNN with the
        highest validation F1, its k and the name of its distance function;
        (None, -1, "*") when no candidate could be evaluated

    Odd k values below min(30, len(Xtrain)) are tried. Ties keep the first
    candidate evaluated: smaller k first, then the order described below.
    """
    # Well-known names are evaluated in this fixed order (it decides ties).
    # Names missing from distance_funcs are skipped rather than raising
    # KeyError, and any additional names are evaluated afterwards.
    preferred = ['euclidean', 'gaussian', 'inner_prod', 'cosine_dist']
    func_names = [name for name in preferred if name in distance_funcs]
    func_names += [name for name in distance_funcs if name not in preferred]

    # Below any real score, so that an all-zero run still returns a model.
    best_f1 = float("-inf")
    best_model = None
    best_k = -1
    best_func = "*"
    for k in range(1, min(30, len(Xtrain)), 2):
        for func_name in func_names:
            knn = KNN(k, distance_funcs[func_name])
            knn.train(Xtrain, ytrain)
            curr_f1 = f1_score(yval, knn.predict(Xval))
            if curr_f1 > best_f1:
                best_f1 = curr_f1
                best_model = knn
                best_k = k
                best_func = func_name
    return best_model, best_k, best_func
Example #4
0
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval,
                                          yval):
    """Search k and distance function for KNN, with explicit tie-breaking.

    Odd k values in [1, min(30, len(Xtrain) - 1)) are combined with every
    distance function. A candidate replaces the current best when its
    validation F1 is strictly higher, or when it is equal and the candidate's
    distance function appears earlier in distance_funcs than the current
    best's.

    :return: (best_model, best_model.k, best_name). If no candidate is
        evaluated, the placeholder KNN(k=np.inf, distance_function=None) and
        the last key of distance_funcs are returned.
    """
    reversed_names = list(distance_funcs.keys())[::-1]
    # Position in the reversed key list; a larger value wins a tie, i.e. the
    # function listed earlier in distance_funcs is preferred.
    priority = {fname: pos for pos, fname in enumerate(reversed_names)}

    best_model = KNN(k=np.inf, distance_function=None)
    best_name = reversed_names[0]
    best_valid_f1_score = -np.inf
    for k in range(1, min(30, len(Xtrain) - 1), 2):
        for name, distance_func in distance_funcs.items():
            model = KNN(k=k, distance_function=distance_func)
            model.train(Xtrain, ytrain)
            # The training-set score is still evaluated, exactly as before,
            # although only the validation score takes part in the selection.
            f1_score(ytrain, model.predict(Xtrain))
            valid_f1_score = f1_score(yval, model.predict(Xval))

            if (valid_f1_score > best_valid_f1_score
                    or (valid_f1_score == best_valid_f1_score
                        and priority[name] > priority[best_name])):
                best_valid_f1_score = valid_f1_score
                best_name = name
                best_model = model
    return best_model, best_model.k, best_name
Example #5
0
def model_selection_with_transformation(distance_funcs, scaling_classes,
                                        Xtrain, ytrain, Xval, yval):
    """Select scaler, distance function and k for KNN by validation F1.

    :param distance_funcs: dict mapping a name to a distance function
    :param scaling_classes: dict mapping a name to a scaler class; instances
        are called on a feature set to obtain the scaled features
    :param Xtrain: training features
    :param ytrain: training labels
    :param Xval: validation features
    :param yval: validation labels
    :return: (best_model, best_k, best_func, best_scaler) - the trained KNN
        with the highest validation F1, its k, and the names of its distance
        function and scaler; (None, -1, None, None) when no candidate could
        be evaluated

    Odd k values in [1, min(31, len(Xtrain) - 1)) are tried. Ties keep the
    candidate evaluated first (scaler order, then function order, then k).
    """
    best_model, best_k, best_func, best_scaler = None, -1, None, None
    # Below any real score: a run in which every candidate scores 0 still
    # yields a model instead of failing on unassigned result variables.
    best_f1_score = float("-inf")
    k_values = range(1, min(31, len(Xtrain) - 1), 2)
    for scaling_name, scaling_class in scaling_classes.items():
        # The scaled data depends only on the scaler, so it is computed once
        # per scaling class (with a fresh scaler instance) rather than once
        # per distance function.
        scaler = scaling_class()
        train_features_scaled = scaler(Xtrain)
        valid_features_scaled = scaler(Xval)
        for name, func in distance_funcs.items():
            for k in k_values:
                model = KNN(k=k, distance_function=func)
                model.train(train_features_scaled, ytrain)
                valid_f1_score = f1_score(yval,
                                          model.predict(valid_features_scaled))
                if valid_f1_score > best_f1_score:
                    best_f1_score, best_k = valid_f1_score, k
                    best_model = model
                    best_func = name
                    best_scaler = scaling_name
    return best_model, best_k, best_func, best_scaler
Example #6
0
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval,
                                          yval):
    """Find the KNN configuration with the highest validation F1 score.

    :param distance_funcs: dict mapping a name to a distance function
    :param Xtrain: training features
    :param ytrain: training labels
    :param Xval: validation features
    :param yval: validation labels
    :return: (best_model, best_k, best_function); (None, 0, "") when no
        candidate is evaluated

    Odd k values in [1, min(len(Xtrain) - 1, 31)) are tried for every
    distance function. Only a strictly better score replaces the current
    best. The selected configuration is printed before returning.
    """
    k_stop = min(len(Xtrain) - 1, 31)
    chosen_model, chosen_k, chosen_name = None, 0, ""
    top_score = -1
    for name, distance_func in distance_funcs.items():
        for k in range(1, k_stop, 2):
            candidate = KNN(k, distance_function=distance_func)
            candidate.train(Xtrain, ytrain)
            # The training-set score is still evaluated, as before, but only
            # the validation score drives the selection.
            f1_score(ytrain, candidate.predict(Xtrain))
            score = f1_score(yval, candidate.predict(Xval))
            if score > top_score:
                top_score = score
                chosen_model, chosen_k, chosen_name = candidate, k, name
    print("best_k:", chosen_k, "best_function:", chosen_name, "best_f1_score:",
          top_score)
    return chosen_model, chosen_k, chosen_name
Example #7
0
    def test_normalization(self):
        """Print train/validation/test F1 scores per scaler and distance.

        The data from generate_data_cancer() is split 400 / 60 / 109 into
        train, validation and test parts. For every (scaler, distance
        function) pair, k is chosen from [1, 3, 10, 20, 50] by validation F1.
        A model with that k is then retrained on train + validation, rescaled
        with a fresh scaler, and scored on the test part.
        """
        scaling_functions = {
            'min_max_scale': MinMaxScaler,
            'normalize': NormalizationScaler,
        }
        distance_funcs = {
            'euclidean': euclidean_distance,
            'gaussian': gaussian_kernel_distance,
            'inner_prod': inner_product_distance,
        }
        candidate_ks = [1, 3, 10, 20, 50]
        # Report templates; the emitted text must stay exactly as it is.
        per_k_template = ('[part 2.2] {name}\t{scaling_name}\tk: {k:d}\t'
                          'train: {train_f1_score:.5f}\t'
                          'valid: {valid_f1_score:.5f}')
        summary_template = ('[part 2.2] {name}\t{scaling_name}\t'
                            'best_k: {best_k:d}\ttest: {test_f1_score:.5f}')

        features, labels = generate_data_cancer()
        train_features, valid_features, test_features = (
            features[:400], features[400:460], features[460:])
        train_labels, valid_labels, test_labels = (
            labels[:400], labels[400:460], labels[460:])
        assert len(train_features) == len(train_labels) == 400
        assert len(valid_features) == len(valid_labels) == 60
        assert len(test_features) == len(test_labels) == 109

        for scaling_name, scaling_class in scaling_functions.items():
            for name, func in distance_funcs.items():
                scaler = scaling_class()
                train_features_scaled = scaler(train_features)
                valid_features_scaled = scaler(valid_features)

                best_f1_score, best_k = 0, -1
                for k in candidate_ks:
                    model = KNN(k=k, distance_function=func)
                    model.train(train_features_scaled, train_labels)
                    train_f1_score = f1_score(
                        train_labels, model.predict(train_features_scaled))
                    valid_f1_score = f1_score(
                        valid_labels, model.predict(valid_features_scaled))
                    print(per_k_template.format(
                        name=name, scaling_name=scaling_name, k=k,
                        train_f1_score=train_f1_score,
                        valid_f1_score=valid_f1_score))

                    if valid_f1_score > best_f1_score:
                        best_f1_score, best_k = valid_f1_score, k

                # The training set changes (train + validation), so the data
                # is rescaled with a brand-new scaler.
                scaler = scaling_class()
                combined_features_scaled = scaler(train_features +
                                                  valid_features)
                test_features_scaled = scaler(test_features)

                model = KNN(k=best_k, distance_function=func)
                model.train(combined_features_scaled,
                            train_labels + valid_labels)
                test_f1_score = f1_score(test_labels,
                                         model.predict(test_features_scaled))
                print()
                print(summary_template.format(
                    name=name, scaling_name=scaling_name,
                    best_k=best_k, test_f1_score=test_f1_score))
                print()
Example #8
0
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval,
                                          yval):
    """Choose k and the distance function for KNN by validation F1 score.

    :param distance_funcs: dict mapping a name to a distance function
    :param Xtrain: training features
    :param ytrain: training labels
    :param Xval: validation features
    :param yval: validation labels
    :return: (best_model, best_k, best_distance); (None, -1, "") when no
        candidate is evaluated

    Odd k values from 1 up to min(len(Xtrain) - 1, 30) inclusive are tried
    for every distance function. A progress line is printed for every
    candidate and a summary line at the end.
    """
    # Largest k tried: 30 for big training sets, otherwise len(Xtrain) - 1.
    k_limit = min(len(Xtrain) - 1, 30)
    best_model = None
    best_distance = ""
    best_k = -1
    best_score_val = -1
    for key, val in distance_funcs.items():
        for k in range(1, k_limit + 1, 2):
            kNN = KNN(k, val)
            kNN.train(Xtrain, ytrain)
            valid_f1_score = f1_score(yval, kNN.predict(Xval))
            # The training-set score is still evaluated, as before, although
            # it plays no part in the selection.
            f1_score(ytrain, kNN.predict(Xtrain))
            print(best_score_val, valid_f1_score, k, best_k)
            if valid_f1_score > best_score_val:
                best_score_val = valid_f1_score
                best_model, best_k, best_distance = kNN, k, key

    print('final', best_model, best_k, best_distance)
    return best_model, best_k, best_distance
Example #9
0
def model_selection_with_transformation(distance_funcs, scaling_classes,
                                        Xtrain, ytrain, Xval, yval):
    """Select scaler, distance function and k for KNN by validation F1.

    Every scaler is combined with every distance function and every odd k in
    [1, 30) that does not exceed len(Xtrain). Each candidate is trained on
    the scaled training split and scored on the scaled validation split; a
    progress line is printed per candidate.

    :param distance_funcs: dict mapping a name to a distance function
    :param scaling_classes: dict mapping a name to a scaler class
    :param Xtrain: training features
    :param ytrain: training labels
    :param Xval: validation features
    :param yval: validation labels
    :return: (model, best_k, best_function, best_scaler), where model is a
        new KNN using the winning k and distance function, trained on the
        training and validation splits together after scaling them with a
        fresh instance of the winning scaler
    :raises ValueError: if no candidate could be evaluated
    """
    #Dont change any print statement
    # Start below every possible score so a run where all candidates score 0
    # still selects the first candidate.
    best_f1_score, best_k = float("-inf"), -1
    best_function = None
    best_scaler = None
    for scaling_name, scaling_class in scaling_classes.items():
        for name, func in distance_funcs.items():
            scaler = scaling_class()
            train_features_scaled = scaler(Xtrain)
            valid_features_scaled = scaler(Xval)

            for k in range(1, 30, 2):
                if len(Xtrain) < k:
                    break
                model = KNN(k=k, distance_function=func)
                model.train(train_features_scaled, ytrain)
                train_f1_score = f1_score(ytrain,
                                          model.predict(train_features_scaled))

                # The model was trained on scaled features, so it has to be
                # validated on the scaled validation set, not the raw one.
                valid_f1_score = f1_score(
                    yval, model.predict(valid_features_scaled))
                print('[part 2.2] {name}\t{scaling_name}\tk: {k:d}\t'.format(
                    name=name, scaling_name=scaling_name, k=k) +
                      'train: {train_f1_score:.5f}\t'.format(
                          train_f1_score=train_f1_score) +
                      'valid: {valid_f1_score:.5f}'.format(
                          valid_f1_score=valid_f1_score))

                if valid_f1_score > best_f1_score:
                    best_f1_score, best_k = valid_f1_score, k
                    best_function = name
                    best_scaler = scaling_name

    if best_scaler is None:
        raise ValueError(
            "no (scaler, distance function, k) candidate could be evaluated")

    # now change it to new scaler, since the training set changes
    scaler = scaling_classes[best_scaler]()
    combined_features_scaled = scaler(np.concatenate((Xtrain, Xval), axis=0))

    # Use the winning distance function, not whichever one the search loop
    # happened to visit last.
    model = KNN(k=best_k, distance_function=distance_funcs[best_function])
    model.train(combined_features_scaled, np.concatenate((ytrain, yval),
                                                         axis=0))
    return model, best_k, best_function, best_scaler
Example #10
0
 def test_inner_product_knn(self):
     """Check the neighbour and response of a 1-NN using inner_product_distance.

     Six 2-D points (three labelled 0, three labelled 1) are used for
     training; the neighbours returned for [0, 0] and [10, 10] are compared
     with the row [1, 1, 0].
     """
     model = KNN(1, inner_product_distance)
     training_points = [[1, 1], [1, 2], [2, 2], [9, 9], [8, 8], [8, 9]]
     training_labels = [0, 0, 0, 1, 1, 1]
     model.train(training_points, training_labels)
     expected_neighbor = numpy.array([[1, 1, 0]])

     near_origin = model.get_neighbors([0, 0])
     self.assertEqual(0, model.get_response(near_origin))
     numpy.testing.assert_array_equal(expected_neighbor, near_origin)

     # NOTE(review): the same neighbour row [1, 1, 0] is expected here, yet
     # the expected response is 1 - confirm this pair of expectations is
     # intended.
     far_corner = model.get_neighbors([10, 10])
     numpy.testing.assert_array_equal(expected_neighbor, far_corner)
     self.assertEqual(1, model.get_response(far_corner))
Example #11
0
def model_selection_with_transformation(distance_funcs, scaling_classes,
                                        Xtrain, ytrain, Xval, yval):
    """Search scaler, distance function and k for KNN with tie-breaking.

    Both splits are scaled once per scaling class. Odd k values in
    [1, min(30, len(Xtrain) - 1)) are then combined with every distance
    function and every scaler. A candidate replaces the current best when
    its validation F1 is strictly higher; on an equal score the scaler listed
    earlier in scaling_classes wins, and for the same scaler the distance
    function listed earlier in distance_funcs wins.

    :return: (best_model, best_model.k, best_name, best_scaling_name). If no
        candidate is evaluated, a placeholder KNN with k=np.inf and the last
        keys of both dicts are returned.
    """
    # name -> (scaled training features, scaled validation features)
    scaled_sets = {}
    for s_name, s_class in scaling_classes.items():
        scaling = s_class()
        scaled_train = scaling(Xtrain)
        scaled_valid = scaling(Xval)
        scaled_sets[s_name] = (scaled_train, scaled_valid)

    reversed_funcs = list(distance_funcs.keys())[::-1]
    reversed_scalers = list(scaling_classes.keys())[::-1]
    # Position in the reversed key lists; the larger position wins a tie.
    func_rank = {fname: pos for pos, fname in enumerate(reversed_funcs)}
    scaler_rank = {sname: pos for pos, sname in enumerate(reversed_scalers)}

    best_model = KNN(k=np.inf, distance_function=None, scaling_class=None)
    best_name = reversed_funcs[0]
    best_scaling_name = reversed_scalers[0]
    best_valid_f1_score = -np.inf
    for k in range(1, min(30, len(Xtrain) - 1), 2):
        for name, distance_func in distance_funcs.items():
            for s_name in reversed_scalers:
                train_set, valid_set = scaled_sets[s_name]
                model = KNN(k=k,
                            distance_function=distance_func,
                            scaling_class=scaling_classes[s_name])
                model.train(train_set, ytrain)
                # The training-set score is still evaluated, as before, but
                # only the validation score takes part in the selection.
                f1_score(ytrain, model.predict(train_set))
                valid_f1_score = f1_score(yval, model.predict(valid_set))

                # Lexicographic comparison: score first, then scaler
                # priority, then distance-function priority.
                challenger = (valid_f1_score, scaler_rank[s_name],
                              func_rank[name])
                incumbent = (best_valid_f1_score,
                             scaler_rank[best_scaling_name],
                             func_rank[best_name])
                if challenger > incumbent:
                    best_valid_f1_score = valid_f1_score
                    best_scaling_name = s_name
                    best_name = name
                    best_model = model
    return best_model, best_model.k, best_name, best_scaling_name
Example #12
0
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    """Select k and the distance function for KNN by validation F1 score.

    :param distance_funcs: dict mapping a name to a distance function
    :param Xtrain: training features
    :param ytrain: training labels
    :param Xval: validation features
    :param yval: validation labels
    :return: (best_model, best_k, best_func) - the KNN trained on the
        training split with the winning k and distance function, that k, and
        the name of the distance function
    :raises ValueError: if no candidate could be evaluated

    Odd k values from 1 up to 29 are tried (up to len(Xtrain) - 1 when the
    training set has fewer than 29 points). A '[part 1.1]' line is printed
    for every candidate and a summary line at the end. The earlier dump of
    the complete data set to stdout has been removed.
    """
    maxk = 29
    if len(Xtrain) < maxk:
        maxk = len(Xtrain) - 1

    best_model = None
    # Below any real score, so an all-zero run still selects a candidate.
    optf1 = float("-inf")
    bestk = -1
    bestfunc = ''
    for key_func, dist in distance_funcs.items():
        for k in range(1, maxk + 1, 2):
            # A separate model per candidate: no dependency on an
            # 'euclidean' entry being present, and the winner can be kept
            # as trained.
            model = KNN(k, dist)
            model.train(Xtrain, ytrain)
            get_f1 = f1_score(yval, model.predict(Xval))

            print('[part 1.1] {name}\tk: {k:d}\t'.format(name=key_func, k=k) +
                  'valid: {valid_f1_score:.5f}'.format(valid_f1_score=get_f1))
            print()

            if get_f1 > optf1:
                optf1 = get_f1
                bestk = k
                bestfunc = key_func
                best_model = model

    if best_model is None:
        raise ValueError(
            "no (k, distance function) candidate could be evaluated")

    # Report the selected function, not the last one iterated over.
    print("bestk:  ", bestk, "bestfunc:  ", bestfunc)
    return best_model, bestk, bestfunc
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval,
                                          yval):
    """Select k and the distance function for KNN, then refit on all data.

    Every distance function is combined with every odd k in [1, 31) that
    does not exceed len(Xtrain). Candidates are trained on the training
    split and scored with f1_score on the validation split; the first
    candidate with the highest score wins.

    :param distance_funcs: dict mapping a name to a distance function
    :param Xtrain: training features
    :param ytrain: training labels
    :param Xval: validation features
    :param yval: validation labels
    :return: (best_model, best_k, best_distance_func_name), where best_model
        is a new KNN with the winning settings trained on the training and
        validation splits concatenated
    :raises ValueError: if no candidate could be evaluated
    """
    best_f1_score = float("-inf")
    best_k = 0
    # None (not a dict) marks "nothing selected yet".
    best_distance_func_name = None

    for name, dist_func in distance_funcs.items():
        for k in range(1, 31, 2):
            if len(Xtrain) < k:
                break
            model = KNN(k=k, distance_function=dist_func)
            model.train(Xtrain, ytrain)
            # Only the validation score is needed for the selection; the
            # training-set score that used to be computed here was never
            # read.
            valid_f1_score = f1_score(yval, model.predict(Xval))

            if valid_f1_score > best_f1_score:
                best_f1_score = valid_f1_score
                best_k = k
                best_distance_func_name = name

    if best_distance_func_name is None:
        raise ValueError(
            "no (k, distance function) candidate could be evaluated")

    best_model = KNN(
        k=best_k,
        distance_function=distance_funcs[best_distance_func_name])
    best_model.train(np.concatenate((Xtrain, Xval), axis=0),
                     np.concatenate((ytrain, yval), axis=0))

    return best_model, best_k, best_distance_func_name
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval,
                                          yval):
    """Select k and the distance function for KNN by validation F1 score.

    For each distance function the best odd k in [1, min(len(Xtrain), 30))
    is found (the smallest k wins among equal scores). The per-function
    results are sorted by descending score, passed through filter_ties, and
    the remaining entries are ordered by a fixed function priority.

    :param distance_funcs: dict mapping a name to a distance function
    :param Xtrain: training features
    :param ytrain: training labels
    :param Xval: validation features
    :param yval: validation labels
    :return: (best_model, best_k, best_func) taken from the first entry
        after tie-breaking
    """
    upper_bound = min(len(Xtrain), 30)

    # One (score, function name, k, model) tuple per distance function.
    max_f1 = []
    for key, distance_func in distance_funcs.items():
        max_score = -1
        min_k = 0
        best_model = []  # placeholder kept when no k is evaluated
        for k in range(1, upper_bound, 2):
            knn = KNN(k, distance_func)
            knn.train(Xtrain, ytrain)
            curr_f1 = f1_score(yval, knn.predict(Xval))
            if curr_f1 > max_score:
                max_score = curr_f1
                min_k = k
                best_model = knn
        max_f1.append((max_score, key, min_k, best_model))
    max_f1.sort(reverse=True)

    # filter ties
    # NOTE(review): filter_ties is defined elsewhere; presumably it keeps
    # only the entries tied for the top score - confirm.
    majority = filter_ties(max_f1)

    SORT_ORDER = {
        "euclidean": 0,
        "gaussian": 1,
        "inner_prod": 2,
        "cosine_dist": 3
    }
    # break ties
    # Names outside SORT_ORDER rank after all known ones instead of raising
    # KeyError; the stable sort keeps their relative order.
    majority.sort(key=lambda entry: SORT_ORDER.get(entry[1], len(SORT_ORDER)))

    return majority[0][3], majority[0][2], majority[0][1]
Example #15
0
def model_selection_with_transformation(distance_funcs, scaling_classes,
                                        Xtrain, ytrain, Xval, yval):
    """Select scaler, distance function and k for KNN by validation F1.

    :param distance_funcs: dict mapping a name to a distance function
    :param scaling_classes: dict mapping a name to a scaler class; an
        instance is called on a feature array to scale it
    :param Xtrain: training features (converted to a float array)
    :param ytrain: training labels (converted to an int array)
    :param Xval: validation features (converted to a float array)
    :param yval: validation labels (converted to an int array)
    :return: (best_model, best_k, best_func, best_scaler);
        (None, -1, None, None) when no candidate could be evaluated

    Odd k values in [1, min(30, len(Xtrain))) are tried. Ties keep the
    candidate evaluated first: smaller k, then distance-function order, then
    scaler order. The selection and the score table are printed; the table
    is indexed [k][position in distance_funcs][position in scaling_classes].
    """
    Xtrain = np.array(Xtrain, dtype=float)
    ytrain = np.array(ytrain, dtype=int)
    Xval = np.array(Xval, dtype=float)
    yval = np.array(yval, dtype=int)
    upper_k = min(30, len(Xtrain))

    # Scale both splits once per scaler, using the classes supplied in
    # scaling_classes rather than hard-coded names. Copies are handed to the
    # scaler so that one working in place cannot affect the next one.
    scaled = {}
    for scaler_name, scaling_class in scaling_classes.items():
        scale = scaling_class()
        scaled_train = scale(np.copy(Xtrain))
        scaled_val = scale(np.copy(Xval))
        scaled[scaler_name] = (scaled_train, scaled_val)

    f1 = np.zeros((30, len(distance_funcs), len(scaling_classes)))
    best_model, best_k, best_func, best_scaler = None, -1, None, None
    # Below any real score, so an all-zero run still selects a candidate.
    best_f1 = float("-inf")
    for k in range(1, upper_k, 2):
        for c, func_name in enumerate(distance_funcs):
            for c1, scaler_name in enumerate(scaled):
                train_x, val_x = scaled[scaler_name]
                # A fresh model per candidate: re-training one shared
                # instance for the next scaler would overwrite the training
                # data of a model already recorded as the best.
                inst = KNN(k, distance_funcs[func_name])
                inst.train(train_x, ytrain)
                score = f1_score(yval, inst.predict(val_x))
                f1[k][c][c1] = score
                if score > best_f1:
                    best_f1 = score
                    best_model = inst
                    best_k = k
                    best_func = func_name
                    best_scaler = scaler_name
    print(best_model, best_k, best_func, best_scaler)
    print(f1)
    return best_model, best_k, best_func, best_scaler
Example #16
0
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval,
                                          yval):
    """Select k and the distance function for KNN, then refit on all data.

    Every distance function is combined with every odd k in [1, 30) that
    does not exceed len(Xtrain). Each candidate is trained on the training
    split; its training and validation F1 scores are printed, and the first
    candidate with the highest validation score wins.

    :param distance_funcs: dict mapping a name to a distance function
    :param Xtrain: training features
    :param ytrain: training labels
    :param Xval: validation features
    :param yval: validation labels
    :return: (model, best_k, best_function), where model is a new KNN with
        the winning k and distance function trained on the training and
        validation splits concatenated
    :raises ValueError: if no candidate could be evaluated
    """
    best_f1_score, best_k = float("-inf"), 0
    best_function = None
    for name, func in distance_funcs.items():

        for k in range(1, 30, 2):
            if len(Xtrain) < k:
                break

            model = KNN(k=k, distance_function=func)
            model.train(Xtrain, ytrain)
            train_f1_score = f1_score(ytrain, model.predict(Xtrain))
            valid_f1_score = f1_score(yval, model.predict(Xval))
            print('[part 1.1] {name}\tk: {k:d}\t'.format(name=name, k=k) +
                  'train: {train_f1_score:.5f}\t'.format(
                      train_f1_score=train_f1_score) +
                  'valid: {valid_f1_score:.5f}'.format(
                      valid_f1_score=valid_f1_score))

            print()

            if valid_f1_score > best_f1_score:
                best_f1_score, best_k = valid_f1_score, k
                best_function = name

    if best_function is None:
        raise ValueError(
            "no (k, distance function) candidate could be evaluated")

    # The final model is built once, after the whole search, with the
    # winning distance function - not inside the loop with whichever
    # function happens to be the current (and finally the last) one.
    model = KNN(k=best_k, distance_function=distance_funcs[best_function])
    model.train(np.concatenate((Xtrain, Xval), axis=0),
                np.concatenate((ytrain, yval), axis=0))
    return model, best_k, best_function
def test_knn2():
    """Build the '[TEST KNN2]' result line for a 1-NN on random 2-D points.

    100 normally distributed points are drawn and de-duplicated; the first
    50 are used for training with random integer labels in [0, 5), and the
    rest are predicted.

    :return: list with one string per tested k (only k = 1 is tested)
    """
    from hw1_knn import KNN
    from utils import euclidean_distance

    raw_points = np.random.normal(size=(100, 2)).tolist()
    # Round-trip through a set of tuples to drop duplicate points.
    unique_points = list(map(list, set(map(tuple, raw_points))))
    labels = np.random.randint(low=0, high=5, size=(50)).flatten().tolist()

    train_points, query_points = unique_points[:50], unique_points[50:]

    result = []
    for k in [1]:
        model = KNN(k=k, distance_function=euclidean_distance)
        model.train(train_points, labels)
        predictions = model.predict(query_points)
        result.append('[TEST KNN2],' +
                      weights_to_string(predictions, is_int=True))
    return result
Example #18
0
def model_selection_with_transformation(distance_funcs, scaling_classes,
                                        Xtrain, ytrain, Xval, yval):
    """Select scaler, distance function and k for KNN by validation F1.

    :param distance_funcs: dict mapping a name to a distance function
    :param scaling_classes: dict mapping a name to a scaler class
    :param Xtrain: training features
    :param ytrain: training labels
    :param Xval: validation features
    :param yval: validation labels
    :return: (best_model, best_k, best_func, best_scaler);
        (None, -1, "*", "+") when no candidate could be evaluated

    Odd k values below min(30, len(Xtrain)) are tried. Ties keep the first
    candidate evaluated: smaller k, then distance-function order, then
    scaler order (see the preferred orders below). The selection is printed
    before returning.
    """
    # Well-known names are evaluated in these fixed orders (they decide
    # ties). Names missing from the dicts are skipped instead of raising
    # KeyError, and any additional names are evaluated afterwards.
    preferred_funcs = ['euclidean', 'gaussian', 'inner_prod', 'cosine_dist']
    preferred_scalers = ['min_max_scale', 'normalize']
    func_names = [n for n in preferred_funcs if n in distance_funcs]
    func_names += [n for n in distance_funcs if n not in preferred_funcs]
    scaler_names = [n for n in preferred_scalers if n in scaling_classes]
    scaler_names += [n for n in scaling_classes if n not in preferred_scalers]

    # The scaled data depends only on the scaler: compute it once per scaler
    # (each with its own instance) instead of once per candidate.
    scaled = {}
    for scaling_string in scaler_names:
        scalar_object = scaling_classes[scaling_string]()
        scaled_Xtrain = scalar_object(Xtrain)
        scaled_Xval = scalar_object(Xval)
        scaled[scaling_string] = (scaled_Xtrain, scaled_Xval)

    # Below any real score, so that an all-zero run still returns a model.
    best_f1 = float("-inf")
    best_model = None
    best_k = -1
    best_func = "*"
    best_scalar = "+"
    for k in range(1, min(30, len(Xtrain)), 2):
        for func_string in func_names:
            for scaling_string in scaler_names:
                scaled_Xtrain, scaled_Xval = scaled[scaling_string]
                knn = KNN(k, distance_funcs[func_string])
                knn.train(scaled_Xtrain, ytrain)
                curr_f1 = f1_score(yval, knn.predict(scaled_Xval))
                if curr_f1 > best_f1:
                    best_f1 = curr_f1
                    best_model = knn
                    best_k = k
                    best_func = func_string
                    best_scalar = scaling_string
    print(best_model, best_k, best_func, best_scalar)
    return best_model, best_k, best_func, best_scalar
Example #19
0
def model_selection_with_transformation(distance_funcs, scaling_classes,
                                        Xtrain, ytrain, Xval, yval):
    """Select scaler, distance function and k for KNN by validation F1.

    :param distance_funcs: dict mapping a name to a distance function
    :param scaling_classes: dict mapping a name to a scaler class
    :param Xtrain: training features
    :param ytrain: training labels
    :param Xval: validation features
    :param yval: validation labels
    :return: (best_model, best_k, best_function, best_scaler);
        (None, 0, "", "") when no candidate could be evaluated

    Odd k values in [1, min(len(Xtrain) - 1, 31)) are tried. Ties keep the
    candidate evaluated first (scaler order, then function order, then k).
    """
    best_model = None
    best_scaler = ""
    best_k = 0
    best_function = ""
    # Below any real score, so that a run in which every candidate scores 0
    # still returns a trained model rather than None.
    best_f1_score = float("-inf")
    highest_range = min(len(Xtrain) - 1, 31)
    for scaler_name, scaling_class in scaling_classes.items():
        # Scaling does not depend on the distance function: do it once per
        # scaling class, with a fresh scaler instance.
        scaler = scaling_class()
        Xtrain_scaled = scaler(Xtrain)
        Xval_scaled = scaler(Xval)
        for name, distance_func in distance_funcs.items():
            for k in range(1, highest_range, 2):
                model = KNN(k, distance_function=distance_func)
                model.train(Xtrain_scaled, ytrain)
                # Only the validation score is needed for the selection.
                valid_f1_score = f1_score(yval, model.predict(Xval_scaled))
                if valid_f1_score > best_f1_score:
                    best_f1_score = valid_f1_score
                    best_k = k
                    best_function = name
                    best_scaler = scaler_name
                    best_model = model
    return best_model, best_k, best_function, best_scaler
Example #20
0
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval,
                                          yval):
    """Pick the distance function and k with the best validation F1 score.

    For every distance function, odd k values in
    ``range(1, min(31, len(Xtrain) - 1), 2)`` are tried. Only a strictly
    higher score replaces the current best, so ties keep the first candidate
    met (distance-function order, then ascending k).

    distance_funcs: dictionary mapping a name to a distance function
    Xtrain: List[List[int]] train set
    ytrain: List[int] train labels
    Xval: List[List[int]] validation set
    yval: List[int] validation labels
    return: (best_model, best_k, best_func) where best_model is a trained
        KNN instance and best_func is its key in distance_funcs. When no
        candidate is evaluated (empty dictionary or too few training rows)
        the result is (None, 0, None) instead of an UnboundLocalError.
    """
    best_model, best_func = None, None
    best_f1_score, best_k = -1, 0
    # The upper bound does not depend on the distance function.
    k_stop = min(31, len(Xtrain) - 1)
    for name, func in distance_funcs.items():
        for k in range(1, k_stop, 2):
            model = KNN(k=k, distance_function=func)
            model.train(Xtrain, ytrain)
            valid_f1_score = f1_score(yval, model.predict(Xval))
            if valid_f1_score > best_f1_score:
                best_f1_score, best_k = valid_f1_score, k
                best_model, best_func = model, name
    return best_model, best_k, best_func
Example #21
0
 def test_knn(self):
     """Select k on the validation split and print the resulting test F1.

     The data returned by generate_data_cancer() is split 400/60/109 into
     train/validation/test. For every enabled distance function the best k
     (by validation F1) is chosen, a new model is trained on
     train + validation, and its F1 score on the test split is printed.
     """
     features, labels = generate_data_cancer()
     train_features, train_labels = features[:400], labels[:400]
     valid_features, valid_labels = features[400:460], labels[400:460]
     test_features, test_labels = features[460:], labels[460:]
     assert len(train_features) == len(train_labels) == 400
     assert len(valid_features) == len(valid_labels) == 60
     assert len(test_features) == len(test_labels) == 109
     distance_funcs = {
         # 'euclidean': euclidean_distance,
         # 'gaussian': gaussian_kernel_distance,
         'inner_prod': inner_product_distance,
     }
     for name, func in distance_funcs.items():
         best_f1_score, best_k = -1, 0
         for k in [1]:
             model = KNN(k=k, distance_function=func)
             model.train(train_features, train_labels)
             train_f1_score = f1_score(train_labels,
                                       model.predict(train_features))
             valid_f1_score = f1_score(valid_labels,
                                       model.predict(valid_features))
             print(f'[part 2.1] {name}\tk: {k:d}\t'
                   f'train: {train_f1_score:.5f}\t'
                   f'valid: {valid_f1_score:.5f}')
             if valid_f1_score > best_f1_score:
                 best_f1_score, best_k = valid_f1_score, k

         # The refit and test evaluation run once per distance function.
         # Previously they sat after the loop, so only the last function was
         # evaluated and an empty dictionary raised NameError. With a single
         # enabled function the printed output is unchanged.
         model = KNN(k=best_k, distance_function=func)
         model.train(train_features + valid_features,
                     train_labels + valid_labels)
         test_f1_score = f1_score(test_labels, model.predict(test_features))
         print()
         print(f'[part 2.1] {name}\tbest_k: {best_k:d}\t'
               f'test f1 score: {test_f1_score:.5f}')
         print()
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    """Pick the distance function and k with the best validation F1 score.

    Odd k values in ``range(1, n, 2)`` are tried for every distance function,
    where n is 30, or ``len(Xtrain) - 1`` when there are fewer than 30
    training rows. A candidate wins on a strictly higher score, or on an
    equal score with a smaller k.

    distance_funcs: dictionary mapping a name to a distance function
    Xtrain: List[List[int]] train set
    ytrain: List[int] train labels
    Xval: List[List[int]] validation set
    yval: List[int] validation labels
    return: (best_model, best_k, best_func) where best_model is a trained
        KNN instance and best_func is its key in distance_funcs. When no
        candidate is evaluated the result is (None, 0, None).
    """
    best_model = None
    best_k = 0
    # best_func is always a dictionary key (or None), never the function
    # object, and no particular key such as 'euclidean' is required.
    best_func = None
    # Start below any score f1_score is expected to return (assumed >= 0), so
    # a model is still selected when every candidate scores exactly 0.
    max_score = -1
    n = 30 if len(Xtrain) >= 30 else len(Xtrain) - 1

    for func in distance_funcs:
        for k in range(1, n, 2):
            model = KNN(k, distance_funcs[func])
            model.train(Xtrain, ytrain)
            temp_f1 = f1_score(yval, model.predict(Xval))

            # NOTE: the print formats are kept byte-for-byte; the value shown
            # under the 'train:' label is the validation F1 score.
            print('[part 1.1] {name}\tk: {k:d}\t'.format(name = func, k = k) +
                  'train: {train_f1_score:.5f}\t'.format(train_f1_score = temp_f1))
            print()

            is_better = temp_f1 > max_score
            is_tie_with_smaller_k = temp_f1 == max_score and k < best_k
            if is_better or is_tie_with_smaller_k:
                max_score = temp_f1
                best_model = model
                best_k = k
                best_func = func

    print('[part 1.1] {name}\tbest_k: {best_k:d}\t'.format(name = best_func, best_k = best_k) +
        'test f1 score: {test_f1_score:.5f}'.format(test_f1_score = max_score))
    print()

    return best_model, best_k, best_func
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    """Pick the scaler, distance function and k with the best validation F1.

    Each scaler class is instantiated and applied to the training set and
    then to the validation set. Odd k values in ``range(1, n, 2)`` are tried
    for every distance function, where n is 30, or ``len(Xtrain) - 1`` when
    there are fewer than 30 training rows. A candidate wins on a strictly
    higher score, or on an equal score with a smaller k.

    distance_funcs: dictionary mapping a name to a distance function
    scaling_classes: dictionary mapping a name to a scaler class
    Xtrain: List[List[int]] train set
    ytrain: List[int] train labels
    Xval: List[List[int]] validation set
    yval: List[int] validation labels
    return: (best_model, best_k, best_func, best_scaler) where best_model is
        a trained KNN instance and the last two items are dictionary keys.
        When no candidate is evaluated the result is (None, 0, None, None).
    """
    best_model = None
    best_k = 0
    # Names are always dictionary keys (or None), never the objects
    # themselves, and no particular key is required to exist.
    best_func = None
    best_scaler = None
    # Start below any score f1_score is expected to return (assumed >= 0), so
    # a model is still selected when every candidate scores exactly 0.
    max_score = -1
    n = 30 if len(Xtrain) >= 30 else len(Xtrain) - 1

    for sc in scaling_classes:
        scaler = scaling_classes[sc]()

        scaled_train = scaler(Xtrain)
        scaled_val = scaler(Xval)

        for func in distance_funcs:
            for k in range(1, n, 2):
                model = KNN(k, distance_funcs[func])
                model.train(scaled_train, ytrain)
                temp_f1 = f1_score(yval, model.predict(scaled_val))

                # NOTE: the print formats are kept byte-for-byte; the value
                # shown under the 'train:' label is the validation F1 score.
                print('[part 1.2] {name}\t{scaling_name}\tk: {k:d}\t'.format(name = func, scaling_name = sc, k = k) +
                      'train: {train_f1_score:.5f}\t'.format(train_f1_score = temp_f1))
                print()

                is_better = temp_f1 > max_score
                is_tie_with_smaller_k = temp_f1 == max_score and k < best_k
                if is_better or is_tie_with_smaller_k:
                    max_score = temp_f1
                    best_model = model
                    best_k = k
                    best_func = func
                    best_scaler = sc

    print('[part 1.2] {name}\t{scaling_name}\t'.format(name = best_func, scaling_name = best_scaler) +
          'best_k: {best_k:d}\ttest: {test_f1_score:.5f}'.format(best_k = best_k, test_f1_score = max_score))
    print()

    return best_model, best_k, best_func, best_scaler
def model_selection_with_transformation(distance_funcs, scaling_classes,
                                        Xtrain, ytrain, Xval, yval):
    """Pick the scaler, distance function and k with the best validation F1.

    For each (scaler, distance function) pair the best odd k in
    ``range(1, min(len(Xtrain), 30), 2)`` is recorded. The recorded entries
    are then sorted and narrowed with ``filter_ties`` (defined elsewhere;
    presumably it keeps the entries tied for first place - confirm), first
    by scaler priority and then by distance-function priority.

    distance_funcs: dictionary mapping a name to a distance function
    scaling_classes: dictionary mapping a name to a scaler class
    Xtrain: List[List[int]] train set
    ytrain: List[int] train labels
    Xval: List[List[int]] validation set
    yval: List[int] validation labels
    return: (best_model, best_k, best_func, best_scaler) taken from the
        first entry left after tie-breaking
    """
    k_stop = min(len(Xtrain), 30)
    scaler_priority = {"min_max_scale": 0, "normalize": 1}
    func_priority = {
        "euclidean": 0,
        "gaussian": 1,
        "inner_prod": 2,
        "cosine_dist": 3
    }

    results = []
    for scaler_name, scaler_cls in scaling_classes.items():
        scaler = scaler_cls()
        if scaler_name == 'normalize':
            # The 'normalize' scaler is fed the transposed data and its
            # output is transposed back.
            scaled_train = transpose_list(scaler(transpose_list(Xtrain)))
            scaled_val = transpose_list(scaler(transpose_list(Xval)))
        else:
            scaled_train = scaler(Xtrain)
            scaled_val = scaler(Xval)

        for func_name, func in distance_funcs.items():
            # Best result for this (scaler, distance function) pair; the
            # placeholder values are recorded if no k is tried.
            top_score, top_k, top_model = -1, 0, []
            for k in range(1, k_stop, 2):
                candidate = KNN(k, func)
                candidate.train(scaled_train, ytrain)
                score = f1_score(yval, candidate.predict(scaled_val))
                if score > top_score:
                    top_score, top_k, top_model = score, k, candidate
            results.append(
                (top_score, scaler_name, func_name, top_k, top_model))

    # Highest tuple first: score, then scaler name, function name and k.
    results.sort(reverse=True)

    # Narrow to ties, order by scaler priority, narrow again, then order by
    # distance-function priority.
    finalists = filter_ties(results)
    finalists.sort(key=lambda entry: scaler_priority[entry[1]])
    finalists = filter_ties(finalists)
    finalists.sort(key=lambda entry: func_priority[entry[2]])

    winner = finalists[0]
    return winner[4], winner[3], winner[2], winner[1]
Example #25
0
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    """Pick the scaler, distance function and k with the best validation F1.

    Each scaler class is instantiated and applied to the training set and
    then to the validation set. For every odd k in ``range(1, max_k, 2)``
    (max_k is 30, or ``len(ytrain) - 1`` for fewer than 30 labels) every
    distance function is evaluated. A candidate wins on a strictly higher
    score; on an equal score it wins only if its scaler ranks earlier in
    ``order2`` or, for the same scaler rank, its distance function ranks
    earlier in ``order1``. Names missing from those lists rank last.

    distance_funcs: dictionary mapping a name to a distance function
    scaling_classes: dictionary mapping a name to a scaler class
    Xtrain: List[List[int]] train set
    ytrain: List[int] train labels
    Xval: List[List[int]] validation set
    yval: List[int] validation labels
    return: (best_model, best_k, best_method, best_scaling_method) where
        best_model is a trained KNN instance and the last two items are
        dictionary keys. All four are None when no candidate is evaluated.
    """
    # Tie-break priorities: a lower index wins.
    order1 = ['euclidean', 'gaussian', 'inner_prod', 'cosine_dist']
    order2 = ['min_max_scale', 'normalize']

    def priority(order, name):
        # Unknown names sort after every listed name instead of raising.
        return order.index(name) if name in order else len(order)

    best_model = None
    best_f1_score = None
    best_k = None
    best_func = None
    best_scaler = None
    best_method = None
    best_scaling_method = None
    best_priority = None
    best_predictions = None

    max_k = len(ytrain) - 1 if len(ytrain) < 30 else 30

    for scaling_method, scaling_class in scaling_classes.items():
        print()
        print("Current scaling method: ", scaling_class)

        # Labels are not scaled; only the feature sets go through the scaler.
        scaler = scaling_class()
        scaled_Xtrain = scaler(Xtrain)
        scaled_Xval = scaler(Xval)

        for current_k in range(1, max_k, 2):
            for method, distance_func in distance_funcs.items():
                classifier = KNN(current_k, distance_func)
                classifier.train(scaled_Xtrain, ytrain)
                predictions = classifier.predict(scaled_Xval)
                score = f1_score(yval, predictions)

                # Dont change any print statement
                print()
                print('[part 1.2] {scaling_name}\t{distance_name}\tk: {k:d}\t'.format(distance_name=distance_func, scaling_name=scaling_class, k=current_k) +
                      'valid: {valid_f1_score:.5f}'.format(valid_f1_score=score))

                candidate_priority = (priority(order2, scaling_method),
                                      priority(order1, method))
                # The original tie-break repeated the scaler condition in its
                # elif (so the distance-function branch never ran) and left
                # the returned names stale; one tuple comparison covers both
                # levels and every field is updated together.
                wins = (best_f1_score is None
                        or score > best_f1_score
                        or (score == best_f1_score
                            and candidate_priority < best_priority))
                if wins:
                    best_f1_score = score
                    best_k = current_k
                    best_model = classifier
                    best_scaler = scaling_class
                    best_func = distance_func
                    best_scaling_method = scaling_method
                    best_method = method
                    best_priority = candidate_priority
                    best_predictions = predictions

    print("Best scaling method: ", str(best_scaler), " and best k is ", best_k)
    print("Best distance method: ", str(best_func), " and best k is ", best_k)
    print("Corresponding valid_f1_score: ", best_f1_score)
    # Show the predictions that produced the best score; they were made on the
    # scaled validation set, whereas re-predicting on raw Xval would not match
    # the data the model was trained on.
    print("predicted_yval: ", best_predictions)
    print("true_yval: ", yval)
    return best_model, best_k, best_method, best_scaling_method
Example #26
0
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    """Pick the distance function and k with the best validation F1 score.

    For every odd k in ``range(1, max_k, 2)`` (max_k is 30, or
    ``len(ytrain) - 1`` for fewer than 30 labels) every distance function is
    evaluated. A candidate wins on a strictly higher score; on an equal
    score it wins only if its name comes earlier in ``order``. Names missing
    from ``order`` rank last.

    distance_funcs: dictionary mapping a name to a distance function
    Xtrain: List[List[int]] train set
    ytrain: List[int] train labels
    Xval: List[List[int]] validation set
    yval: List[int] validation labels
    return: (best_model, best_k, best_method) where best_model is a trained
        KNN instance and best_method is its key in distance_funcs. All three
        are None when no candidate is evaluated.
    """
    # Order of preference used to break ties: a lower index wins.
    order = ["euclidean", "gaussian", "inner_prod", "cosine_dist"]

    def priority(name):
        # Unknown names sort after every listed name instead of raising.
        return order.index(name) if name in order else len(order)

    best_model = None
    best_k = None
    best_f1_score = None
    best_func = None
    best_method = None
    best_predictions = None

    max_k = len(ytrain) - 1 if len(ytrain) < 30 else 30

    for current_k in range(1, max_k, 2):
        for method, distance_func in distance_funcs.items():
            classifier = KNN(current_k, distance_func)
            classifier.train(Xtrain, ytrain)
            predictions = classifier.predict(Xval)
            score = f1_score(yval, predictions)

            # Dont change any print statement
            print()
            print('[part 1.1] {name}\tk: {k:d}\t'.format(name=distance_func, k=current_k) +
                   'valid: {valid_f1_score:.5f}'.format(valid_f1_score=score))

            wins = (best_f1_score is None
                    or score > best_f1_score
                    or (score == best_f1_score
                        and priority(method) < priority(best_method)))
            if wins:
                # Every field is updated together; the original tie-break
                # replaced the model but kept the old best_method, so the
                # returned name could disagree with the returned model.
                best_f1_score = score
                best_k = current_k
                best_model = classifier
                best_func = distance_func
                best_method = method
                best_predictions = predictions

    # The summary reuses the stored score and predictions of the winning
    # candidate rather than predicting on Xval two more times.
    print("Best distance method: ", str(best_func), " and best k is ", best_k)
    print("Corresponding valid_f1_score: ", str(best_f1_score))
    print("predicted_yval: ", best_predictions)
    print("true_yval: ", yval)
    return best_model, best_k, best_method
Example #27
0
def model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval):
    """Pick the scaler, distance function and k with the best validation F1.

    A single KNN instance is reconfigured and retrained for every candidate.
    Odd k values in ``range(1, kmax, 2)`` are tried, where kmax is 29, or
    ``len(Xtrain) - 1`` for fewer than 29 training rows. Only a strictly
    higher score replaces the current best. After the search the instance is
    set to the winning k and distance function and retrained on the training
    data as transformed by the winning scaler.

    distance_funcs: dictionary mapping a name to a distance function; must
        contain 'euclidean', which is used to build the initial model
    scaling_classes: dictionary mapping a name to a scaler class
    Xtrain: List[List[int]] train set
    ytrain: List[int] train labels
    Xval: List[List[int]] validation set
    yval: List[int] validation labels
    return: (model, bestk, bestfunc, bestscaler) where the last two items
        are dictionary keys
    """
    model = KNN(1, distance_funcs['euclidean'])
    # Defaults reported when no candidate is evaluated.
    bestk = 1
    bestfunc = 'euclidean'
    bestscaler = 'min_max_scale'
    best_train = None
    # Start below any score f1_score is expected to return (assumed >= 0), so
    # the first candidate is selected even when every score is exactly 0.
    optf1 = -1
    kmax = 29
    if len(Xtrain) < kmax:
        kmax = len(Xtrain) - 1

    for scaling_name in scaling_classes:
        scaling = scaling_classes[scaling_name]()
        New_Xtrain = scaling(Xtrain)
        New_Xval = scaling(Xval)

        print(scaling_name, New_Xval)

        for key_func in distance_funcs:
            for k in range(1, kmax, 2):
                model.k = k
                model.distance_function = distance_funcs[key_func]
                model.train(New_Xtrain, ytrain)

                get_f1 = f1_score(yval, model.predict(New_Xval))
                if get_f1 > optf1:
                    bestk = k
                    bestfunc = key_func
                    bestscaler = scaling_name
                    optf1 = get_f1
                    # Keep the winning training data for the final refit.
                    best_train = New_Xtrain

                print('[part 1.2] {name}\t{scaling_name}\tk: {k:d}\t'.format(name=key_func, scaling_name=scaling_name, k=k) +
                        'valid: {valid_f1_score:.5f}'.format(valid_f1_score=get_f1))

                print()

    # Configure the shared instance from the winning candidate. The original
    # used the last-iterated distance function here and left the model trained
    # on the last scaler's data.
    model.k = bestk
    model.distance_function = distance_funcs[bestfunc]
    model.scale = scaling_classes[bestscaler]
    if best_train is not None:
        model.train(best_train, ytrain)
    print("bestk:  ",bestk,"bestfunc:  ",bestfunc,"bestscale:  ",bestscaler)

    return model,bestk,bestfunc,bestscaler
Example #28
0
def model_selection_with_transformation(distance_funcs, scaling_classes,
                                        Xtrain, ytrain, Xval, yval):
    """Pick the scaler, distance function and k with the best validation F1.

    Each scaler class is instantiated and applied to the training set and
    then to the validation set. For every distance function, odd k values
    from 1 through K are tried, where K is 30, or ``len(Xtrain) - 1`` for at
    most 30 training rows. Only a strictly higher score replaces the current
    best, so ties keep the first candidate met.

    distance_funcs: dictionary mapping a name to a distance function
    scaling_classes: dictionary mapping a name to a scaler class
    Xtrain: List[List[int]] train set
    ytrain: List[int] train labels
    Xval: List[List[int]] validation set
    yval: List[int] validation labels
    return: (best_model, best_k, best_distance, scaling_name) where
        best_model is a trained KNN instance and the last two items are
        dictionary keys. When no candidate is evaluated the result is
        (None, -1, "", "") instead of an UnboundLocalError.
    """
    best_k = -1
    best_score_val = -1
    best_distance = ""
    # Bound up front so the final print and return work with no candidates.
    scaling_name = ""
    best_model = None
    K = len(Xtrain) - 1 if len(Xtrain) <= 30 else 30

    for scaler_key, scaler_class in scaling_classes.items():
        scaler = scaler_class()
        Xtrain_n = scaler(Xtrain)
        Xval_n = scaler(Xval)
        for key, val in distance_funcs.items():
            for k in range(1, K + 1, 2):
                kNN = KNN(k, val)
                kNN.train(Xtrain_n, ytrain)
                # Only the validation score drives the selection; the unused
                # training-set prediction and F1 are no longer computed.
                valid_f1_score = f1_score(yval, kNN.predict(Xval_n))
                if best_score_val < valid_f1_score:
                    best_k = k
                    best_score_val = valid_f1_score
                    best_distance = key
                    scaling_name = scaler_key
                    best_model = kNN
    print(best_k, best_distance, scaling_name)
    return best_model, best_k, best_distance, scaling_name
Example #29
0
# Ad-hoc check: print what cosine_sim_distance (defined elsewhere) returns for
# two sample three-element points.
point1 = [1, 2, 3]
point2 = [3, 5, 7]

print(cosine_sim_distance(point1, point2))

"""

model_selection_without_normalization(distance_funcs, minmax_test, labels,
                                      minmax_test2, labels)

knn_dataset = [[0, 0], [4, 4], [2, 2], [3, 3], [1, 1], [5, 5]]
knn_labels = [1, 0, 1, 1, 0, 0]

knn_test = KNN(2, distance_funcs['euclidean'])
knn_test.train(knn_dataset, knn_labels)

print("KNN mapping:")
print(knn_test.mapping)

pred_test = [[3, 3]]
print(knn_test.predict(pred_test))
print()

print("k nearest neighbors:")
print(knn_test.get_k_neighbors(pred_test[0]))
print()

for d in knn_dataset:
    print(euclidean_distance(pred_test[0], d))
Example #30
0
class TestKNN(TestCase):
    def setUp(self):
        """Give each test a fresh KNN with k=1 and euclidean_distance."""
        self.knn = KNN(k=1, distance_function=euclidean_distance)

    def test_train(self):
        """Train on two groups of points and check the nearest neighbour.

        A query at [0, 0] must return the stored row [1, 1, 0] and response
        0; a query at [10, 10] must return [9, 9, 1] and response 1.
        """
        samples = [[1, 1], [1, 2], [2, 2], [9, 9], [8, 8], [8, 9]]
        targets = [0, 0, 0, 1, 1, 1]
        self.knn.train(samples, targets)

        # Query beside the label-0 points.
        nearest = self.knn.get_neighbors([0, 0])
        self.assertEqual(0, self.knn.get_response(nearest))
        numpy.testing.assert_array_equal(numpy.array([[1, 1, 0]]), nearest)

        # Query beside the label-1 points.
        nearest = self.knn.get_neighbors([10, 10])
        numpy.testing.assert_array_equal(numpy.array([[9, 9, 1]]), nearest)
        self.assertEqual(1, self.knn.get_response(nearest))

    @skip("clarify inner product")
    def test_inner_product_knn(self):
        """Nearest-neighbour checks for a KNN using inner_product_distance.

        Currently skipped. NOTE(review): for the [10, 10] query the expected
        neighbour row carries label 0 while the expected response is 1 -
        these expectations look inconsistent; confirm before enabling.
        """
        classifier = KNN(k=1, distance_function=inner_product_distance)
        samples = [[1, 1], [1, 2], [2, 2], [9, 9], [8, 8], [8, 9]]
        targets = [0, 0, 0, 1, 1, 1]
        classifier.train(samples, targets)

        nearest = classifier.get_neighbors([0, 0])
        self.assertEqual(0, classifier.get_response(nearest))
        numpy.testing.assert_array_equal(numpy.array([[1, 1, 0]]), nearest)

        nearest = classifier.get_neighbors([10, 10])
        numpy.testing.assert_array_equal(numpy.array([[1, 1, 0]]), nearest)
        self.assertEqual(1, classifier.get_response(nearest))

    def test_predict(self):
        """predict() must label [0, 0] as 0 and [10, 10] as 1."""
        samples = [[1, 1], [1, 2], [2, 2], [9, 9], [8, 8], [8, 9]]
        targets = [0, 0, 0, 1, 1, 1]
        self.knn.train(samples, targets)

        queries = [[0, 0], [10, 10]]
        predicted = self.knn.predict(queries)
        self.assertListEqual([0, 1], predicted)

    def test_knn(self):
        """Select k on the validation split and print the resulting test F1.

        The data returned by generate_data_cancer() is split 400/60/109 into
        train/validation/test. For every enabled distance function the best
        k (by validation F1) is chosen, a new model is trained on
        train + validation, and its F1 score on the test split is printed.
        """
        features, labels = generate_data_cancer()
        train_features, train_labels = features[:400], labels[:400]
        valid_features, valid_labels = features[400:460], labels[400:460]
        test_features, test_labels = features[460:], labels[460:]
        assert len(train_features) == len(train_labels) == 400
        assert len(valid_features) == len(valid_labels) == 60
        assert len(test_features) == len(test_labels) == 109
        distance_funcs = {
            # 'euclidean': euclidean_distance,
            # 'gaussian': gaussian_kernel_distance,
            'inner_prod': inner_product_distance,
        }
        for name, func in distance_funcs.items():
            best_f1_score, best_k = -1, 0
            for k in [1]:
                model = KNN(k=k, distance_function=func)
                model.train(train_features, train_labels)
                train_f1_score = f1_score(train_labels,
                                          model.predict(train_features))
                valid_f1_score = f1_score(valid_labels,
                                          model.predict(valid_features))
                print(f'[part 2.1] {name}\tk: {k:d}\t'
                      f'train: {train_f1_score:.5f}\t'
                      f'valid: {valid_f1_score:.5f}')
                if valid_f1_score > best_f1_score:
                    best_f1_score, best_k = valid_f1_score, k

            # The refit and test evaluation run once per distance function.
            # Previously they sat after the loop, so only the last function
            # was evaluated and an empty dictionary raised NameError. With a
            # single enabled function the printed output is unchanged.
            model = KNN(k=best_k, distance_function=func)
            model.train(train_features + valid_features,
                        train_labels + valid_labels)
            test_f1_score = f1_score(test_labels, model.predict(test_features))
            print()
            print(f'[part 2.1] {name}\tbest_k: {best_k:d}\t'
                  f'test f1 score: {test_f1_score:.5f}')
            print()

    def test_normalization(self):
        """Select k per (scaler, distance function) pair and print test F1.

        The data returned by generate_data_cancer() is split 400/60/109 into
        train/validation/test. For each pair, k is chosen from
        [1, 3, 10, 20, 50] by validation F1 on scaled data. A fresh scaler is
        then applied to train + validation and to the test split, a new model
        is trained on the combined data, and its test F1 is printed.
        """
        scaling_functions = {
            'min_max_scale': MinMaxScaler,
            'normalize': NormalizationScaler,
        }
        distance_funcs = {
            'euclidean': euclidean_distance,
            'gaussian': gaussian_kernel_distance,
            'inner_prod': inner_product_distance,
        }
        features, labels = generate_data_cancer()
        train_features, train_labels = features[:400], labels[:400]
        valid_features, valid_labels = features[400:460], labels[400:460]
        test_features, test_labels = features[460:], labels[460:]
        assert len(train_features) == len(train_labels) == 400
        assert len(valid_features) == len(valid_labels) == 60
        assert len(test_features) == len(test_labels) == 109
        for scaling_name, scaling_class in scaling_functions.items():
            for name, func in distance_funcs.items():
                scaler = scaling_class()
                train_features_scaled = scaler(train_features)
                valid_features_scaled = scaler(valid_features)

                # Start below any score f1_score is expected to return
                # (assumed >= 0), matching test_knn, so best_k is always one
                # of the tried values; with the old start of 0, all-zero
                # scores left best_k at -1 for the final model.
                best_f1_score, best_k = -1, 0
                for k in [1, 3, 10, 20, 50]:
                    model = KNN(k=k, distance_function=func)
                    model.train(train_features_scaled, train_labels)
                    train_f1_score = f1_score(
                        train_labels, model.predict(train_features_scaled))

                    valid_f1_score = f1_score(
                        valid_labels, model.predict(valid_features_scaled))
                    print(f'[part 2.2] {name}\t{scaling_name}\tk: {k:d}\t'
                          f'train: {train_f1_score:.5f}\t'
                          f'valid: {valid_f1_score:.5f}')

                    if valid_f1_score > best_f1_score:
                        best_f1_score, best_k = valid_f1_score, k

                # now change it to new scaler, since the training set changes
                scaler = scaling_class()
                combined_features_scaled = scaler(train_features +
                                                  valid_features)
                test_features_scaled = scaler(test_features)

                model = KNN(k=best_k, distance_function=func)
                model.train(combined_features_scaled,
                            train_labels + valid_labels)
                test_f1_score = f1_score(test_labels,
                                         model.predict(test_features_scaled))
                print()
                print(f'[part 2.2] {name}\t{scaling_name}\t'
                      f'best_k: {best_k:d}\ttest: {test_f1_score:.5f}')
                print()