Example #1
class Stacking:
    def __init__(self):
        pass

    def fit(self, X, y):
        self.rf = RandomForest(num_trees=15, max_depth=np.inf)
        self.rf.fit(X, y)
        y_rf = self.rf.predict(X)

        self.nb = NaiveBayes()
        self.nb.fit(X, y)
        y_nb = self.nb.predict(X)

        self.knn = KNN(k=3)
        self.knn.fit(X, y)
        y_knn = self.knn.predict(X)

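        # stack the three base models' training predictions column-wise as meta-features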
        newX = np.array([y_rf, y_nb, y_knn]).transpose()

        model = DecisionTree(max_depth=np.inf,
                             stump_class=DecisionStumpErrorRate)
        self.model = model

        model.fit(newX, y)

    def predict(self, X):
        y_rf = self.rf.predict(X)
        y_nb = self.nb.predict(X)
        y_knn = self.knn.predict(X)
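        # assemble the meta-feature matrix from the base models' predictions, as in fit()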
        x_test = np.array([y_rf, y_nb, y_knn]).transpose()

        return self.model.predict(x_test)
Example #2
def execute():
    df = pd.read_csv('data/reduced_dataset_v3.csv')

    df = df.drop(
        ['Unnamed: 0', 'Time', 'Time Code', 'Country Name', 'Country Code'],
        axis=1)
    sbs_reg = StepByStepRegression(df, 'correlation')
    # print(emp_df.columns)
    sbs_reg.normalize()
    # print(normalized_df.head())

    sorted_columns = sbs_reg.get_least_nan_columns(sbs_reg.normalized_df)
    # print('sorted columns', sorted_columns)
    sorted_df = pd.DataFrame()
    correlated_features_list = dict()
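    # copy columns in ascending order of missing-value count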
    for col, nulls in sorted_columns.items():
        sorted_df[col] = sbs_reg.normalized_df[col]
    # print('sorted dataframe', sorted_df.columns)

    if sbs_reg.sim_fun == 'KNN':
        knn = KNN(sorted_df, 3)
        correlated_features_list = knn.findKNeighbors()
    elif sbs_reg.sim_fun == 'correlation':
        correlated_features_list = sbs_reg.correlation(sorted_df)
    print('correlated features', correlated_features_list)
    sbs_reg.fill_missing_data(sorted_df, correlated_features_list)
    un_normalized = sbs_reg.un_normalize(sorted_df)
    print(un_normalized)
Example #3
def main():
    K = [1, 3]

    # load KC1
    data = pd.read_csv('./lvq_output/seed_137/kc1_lvq3.csv')

    X, Y = data.drop(columns=['defects']), data['defects']

    # normalize data
    #X = normalize_data(X)

    # create k-fold splits
    kf = KFold(n_splits=10)

    # instantiate classifier
    for k in K:
        clf = KNN(k=k)
        print("k equals {}".format(k))

        start_time = time.time()
        acc = []
        for train, test in kf.split(X):

            clf.fit(X.iloc[train], Y.iloc[train])
            predictions = clf.predict(X.iloc[test])
            acc.append((np.sum(predictions == Y.iloc[test]) / len(test)) * 100)

        end_time = time.time()

        acc = np.array(acc)

        print("mean accuracy: {}".format(np.mean(acc)))
        print("standard deviation: {}".format(np.std(acc)))
        print("time elapsed: {}".format(end_time - start_time))
Example #4
def main():
    trainSet = pd.read_csv('datasets/train_set.csv',
                           converters={'Trajectory': literal_eval})

    testSet = pd.read_csv('datasets/test_set_a2.csv',
                          converters={'Trajectory': literal_eval})

    # labels for categories
    le = preprocessing.LabelEncoder()
    categoryIds = le.fit_transform(trainSet['journeyPatternId'])

    allSequences = []

    for trainIndex, trainRow in trainSet.iterrows():
        allSequences.append(trainRow['Trajectory'])

    # initialize KNN classifier
    clf = KNN(5, DTW)

    crossValidation(clf, allSequences, categoryIds, le)
    clf.fit(allSequences, categoryIds)

    # predict the categories for the testSet
    predIds = clf.predict(testSet['Trajectory'])
    predCategs = le.inverse_transform(predIds)

    writeInCsv(predCategs)
Example #5
def main(n):
    print('Teste: ' + str(n))
    teste = open('./particoes/ts' + str(n) + '.txt', 'r')
    arquivoTreino = './particoes/cj' + str(n) + '.txt'
    k = [1, 3, 5, 10]

    count = 0
    hitDTW = [0, 0, 0, 0]
    hitEuclidiana = [0, 0, 0, 0]
    for testeLinha in teste:
        count += 1  # count how many lines have been processed
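        # each line reads "<label> <v1> <v2> ...": field 0 is the class label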
        linhateste = testeLinha.split(" ")
        vetordeteste = list(map(float, linhateste[1:]))

        ResultadoDTW = KNN(arquivoTreino, vetordeteste, k).runKNN_DTW()
        ResultadoEuclidiana = KNN(arquivoTreino, vetordeteste,
                                  k).runKNN_Euclidiana()

        # hit counter:
        pos = 0
        for i in ResultadoDTW:
            if (int(linhateste[0]) == int(i[1])):
                hitDTW[pos] += 1
            pos += 1

        pos = 0
        for i in ResultadoEuclidiana:
            if (int(linhateste[0]) == int(i[1])):
                hitEuclidiana[pos] += 1
            pos += 1

        print("Progresso" + str(n) + ": " + str(count * 100 / 240) +
              "%")  # program progress indicator

    # test report:
    relatorio = open("./relatorios/relatorioFinal" + str(n) + ".txt", "w")
    relatorio.write('Teste: ' + str(n) + "\n")
    relatorio.write("Accuracy DTW K=1: " + str(hitDTW[0] * 100 / count) +
                    "% \n")
    relatorio.write("Accuracy DTW K=3: " + str(hitDTW[1] * 100 / count) +
                    "% \n")
    relatorio.write("Accuracy DTW K=5: " + str(hitDTW[2] * 100 / count) +
                    "% \n")
    relatorio.write("Accuracy DTW K=10: " + str(hitDTW[3] * 100 / count) +
                    "% \n")
    relatorio.write("Accuracy DTW TOTAL: " + str((
        (hitDTW[0] + hitDTW[1] + hitDTW[2] + hitDTW[3]) * 100) / (4 * count)) +
                    "% \n")
    relatorio.write("Accuracy Euclidiana K=1: " +
                    str(hitEuclidiana[0] * 100 / count) + "% \n")
    relatorio.write("Accuracy Euclidiana K=3: " +
                    str(hitEuclidiana[1] * 100 / count) + "% \n")
    relatorio.write("Accuracy Euclidiana K=5: " +
                    str(hitEuclidiana[2] * 100 / count) + "% \n")
    relatorio.write("Accuracy Euclidiana K=10: " +
                    str(hitEuclidiana[3] * 100 / count) + "% \n")
    relatorio.write("Accuracy Euclidiana TOTAL: " +
                    str(((hitEuclidiana[0] + hitEuclidiana[1] +
                          hitEuclidiana[2] + hitEuclidiana[3]) * 100) /
                        (4 * count)) + "% \n")
Example #6
def test_knn(k, train_data, train_labels, test_data):
    """ test_knn function

    Trains a KNN classifier on the given training set, then runs it
    on the testing data and writes the predictions to a CSV file.

    Args
    ----
    k : integer
        number of neighbors to use for KNN
    train_data : np.array
        training dataset
    train_labels : np.array
        training dataset labels
    test_data : np.array
        testing dataset

    Returns
    -------
    None; predictions are written to predictions_digit_recognizer.csv
    """
    print("Final k:" + str(k))
    knn = KNN(k, train_data, train_labels)

    # print to CSV
    with open('predictions_digit_recognizer.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['ImageId', 'Label'])
        for i in range(len(test_data)):
            data = test_data[i]
            guess = knn.classify(data)
            writer.writerow([str(i + 1), str(int(guess))])
Example #7
def foo(k_num=5, distance=distance_metric(p=1)):
    _data = [[i[0], i[3]] for i in data]
    # Split the data into train and test parts
    #train_d, train_l, test_d, test_l = tt_split(_data, label)
    train_d, train_l, test_d, test_l = (_data[0:30] + _data[50:80] +
                                        _data[100:130], label[0:30] +
                                        label[50:80] + label[100:130],
                                        _data[30:50] + _data[80:100] +
                                        _data[130:], label[30:50] +
                                        label[80:100] + label[130:])
    # Initialize the KNN object
    knn = KNN(neighbors_num=k_num, distance=distance)
    # Fill the data in KNN
    knn.fit(train_d, train_l)
    # Take prediction from KNN
    result = knn.predict(test_d)

    # Print the results on screen as data, real label, predicted label.
    #print("%20s - %20s | %20s | %s" %("[Data]", "<Real Label>", "<Predicted Label>", "Truth"))

    n = 0
    for i, j, r in zip(test_d, test_l, result):
        if j == r:
            n += 1
        #print("%20s - %20s | %20s | %s" %(i, j, r, j == r))
    #print("Acc:", n / len(test_d))
    return n / len(test_d), n, len(test_d)
Example #8
    def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train, y_train, x_val, y_val):
        optimal_k = 0
        function = ""
        scaler = ""
        model = None
        scaler_methods = []
        scaler_names = []
        best_f1 = -2 ** 32
        if len(x_train) <= 30:
            max_k = len(x_train)
        else:
            max_k = 30
        for name, cls in scaling_classes.items():
            scaler_methods.append(cls())
            scaler_names.append(name)
        for i in range(len(scaler_methods)):
            # scale fresh copies of the data so the scalers do not compound
            x_train_scaled = scaler_methods[i](x_train)
            x_val_scaled = scaler_methods[i](x_val)
            for key, value in distance_funcs.items():
                for k_value in range(1, max_k, 2):
                    train_model = KNN(k_value, value)
                    train_model.train(x_train_scaled, y_train)
                    pre_val = train_model.predict(x_val_scaled)
                    cur_f1 = f1_score(y_val, pre_val)
                    if best_f1 < cur_f1:
                        optimal_k = k_value
                        function = key
                        model = train_model
                        best_f1 = cur_f1
                        scaler = scaler_names[i]
        self.best_k = optimal_k
        self.best_distance_function = function
        self.best_scaler = scaler
        self.best_model = model
        return self.best_k, self.best_distance_function, self.best_scaler, self.best_model
Example #9
def question_4(points):
    """
    question 4
    :param points: list of Point
    """
    k_list = [5, 7]
    normalization_list = [[DummyNormalizer, "DummyNormalizer"],
                          [SumNormalizer, "SumNormalizer"],
                          [MinMaxNormalizer, "MinMaxNormalizer"],
                          [ZNormalizer, "ZNormalizer"]]
    print("Question 4:")
    for k in k_list:
        print("K=", k, sep="")
        m = KNN(k)
        m.train(points)
        cv = CrossValidation()
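        # evaluate the same trained model under each normalization scheme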
        for normalizer_class, name in normalization_list:
            normalize_object = normalizer_class()
            normalize_object.fit(points)
            new_points = normalize_object.transform(points)
            #  2 is the best n-fold
            average_score = cv.run_cv(new_points, 2, m, accuracy_score, False,
                                      True)
            formatted_average_score = "{:.2f}".format(average_score)
            print("Accuracy of", name, "is", formatted_average_score)
            print()
Example #10
class TestKNN(unittest.TestCase):
    def setUp(self):
        np.genfromtxt = mock.MagicMock(return_value=np.empty([2, 2]))
        self.target = KNN("", "")

    def test_distinct(self):
        input = [
            1, 2, 3, 2, 2, 1, 5, 4, 4, 5, 4, 3, 2, 2, 1, 2, 5, 4, 6, 5, 4, 4,
            3, 2, 3, 4, 3, 1, 5, 2, 6, 4, 6
        ]
        expected = [1, 2, 3, 5, 4, 6]

        self.assertEqual(self.target._distinct(input), expected)

    def test_calculateDistance(self):
        instance1 = np.array([3, 104])
        instance2 = np.array([18, 90])

        self.assertEqual(self.target.calculateDistance(instance1, instance2),
                         20.518284528683193)

    def test_calculateDistances(self):
        magicMock = mock.MagicMock()
        magicMock.side_effect = [1, 0.3, 2, 0.15, 3, 2, 6, 99, 0.015, 0.191]
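        # successive calls to calculateDistance return successive values from this list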
        self.target.calculateDistance = magicMock

        result = self.target.calculateDistances(np.empty(10),
                                                np.empty([10, 10]))

        self.assertEqual(result[0],
                         [1, 0.3, 2, 0.15, 3, 2, 6, 99, 0.015, 0.191])
        self.assertTrue((result[1] == np.array([8, 3, 9, 1, 0, 2, 5, 4, 6,
                                                7])).all())
Example #11
    def tuning_without_scaling(self, distance_funcs, x_train, y_train, x_val,
                               y_val):
        """
        In this part, you should try different distance function you implemented in part 1.1, and find the best k.
        Use k range from 1 to 30 and increment by 2. Use f1-score to compare different models.

        :param distance_funcs: dictionary of distance functions you must use to calculate the distance.
            Make sure you loop over all distance functions for each data point and each k value.
            You can refer to test.py file to see the format in which these functions will be
            passed by the grading script
        :param x_train: List[List[int]] training data set to train your KNN model
        :param y_train: List[int] train labels to train your KNN model
        :param x_val:  List[List[int]] Validation data set will be used on your KNN predict function to produce
            predicted labels and tune k and distance function.
        :param y_val: List[int] validation labels

        Find(tune) best k, distance_function and model (an instance of KNN) and assign to self.best_k,
        self.best_distance_function and self.best_model respectively.
        NOTE: self.best_scaler will be None

        NOTE: When there is a tie, choose model based on the following priorities:
        Then check distance function  [euclidean > minkowski > gaussian > inner_prod > cosine_dist]
        If they have same distance fuction, choose model which has a less k.
        """
        max_score = 0
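        # strict ">" keeps the first-seen distance function and the smaller k on ties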
        for i in distance_funcs:
            for k in range(1, 30, 2):
                cur_model = KNN(k, distance_funcs[i])
                cur_model.train(x_train, y_train)
                cur_score = f1_score(y_val, cur_model.predict(x_val))
                if cur_score > max_score:
                    max_score = cur_score
                    self.best_k = k
                    self.best_distance_function = i
                    self.best_model = cur_model
Example #12
class TestKNN(unittest.TestCase):

    def setUp(self):
        self.knn = KNN()

        data = load_iris()
        X_train, X_test, y_train, y_test = train_test_split(
            data['data'], data['target'], random_state=0)

        self.knn.train(X_train, y_train)
        self.X_test = X_test
        self.y_test = y_test

    def test_distance_measure(self):
        a = np.random.rand(5)
        b = np.random.rand(5)

        self.assertTrue(self.knn.measure_distance(a, b))

    def test_prediction_time(self):
        start_time = time.time()
        self.knn.predict(self.X_test, k=25)
        end_time = time.time()
        diff = (end_time - start_time) * 1000
        logger.info("Running time for K={} is: {}".format(25, diff))
Example #13
    def test_iris_regression(self):
        """
        Tests kNN for regression
        """

        k = 1
        iris_dataset = load_iris()

        knn = KNN(k, "average")

        # get petal length as input
        # ensure this is 2D
        X = iris_dataset.data[:, 2].reshape(-1, 1)

        # get petal width as output
        y = iris_dataset.data[:, 3]

        knn.fit(X, y)
        predicted = knn.predict(X)

        # verify shape of output
        self.assertEqual(len(predicted.shape), 1)
        self.assertEqual(predicted.shape[0], iris_dataset.data.shape[0])

        # with k=1, each point should match itself
        # but with only 1 dimension, some points have
        # the same values
        mse = mean_squared_error(y, predicted)
        self.assertLess(mse, 0.1)
Example #14
    def eval_model(case):
        l, k = case
        results = {'precision': [], 'recall': [], 'f1': []}

        model = KNN(l, k)

        for i in range(folds):
            print(l, k, 'cross validation', i)

            training, testing = split_data(corpus, i, folds)

            print(l, k, 'fit model', i)
            model.fit([d.vector for d in training],
                      [d.label for d in training])

            print(l, k, 'predict', i)
            preds = [model.predict(d.vector) for d in testing]

            labels = [d.label for d in testing]

            metrics = model_metrics(labels, preds)
            for m, key in zip(metrics, ['precision', 'recall', 'f1']):
                results[key].append(m)

        print(l, k, mean(results['precision']), mean(results['recall']),
              mean(results['f1']))

        return results
Example #15
def testknn():
    features = [[1, 1], [1, -1], [-1, -1], [-1, 1]]
    labels = [0, 1, 0, 1]

    knn = KNN(3, Distances.euclidean_distance)
    knn.train(features, labels)
    print(knn.predict([[0, 0]]))
Example #16
def knn(corpus, idf):
    query = read_folder('./query')
    tf_idf(query, idf)

    print('fit KNN model')

    classifier = KNN(5, 5)
    classifier.fit([d.vector for d in corpus], corpus)

    start_time = time.time()

    for i, d in enumerate(query):
        print('Query Doc', i)
        print(d.features)

        # neighbors = classifier.brute_force(d.vector)
        neighbors = classifier.neighbors(d.vector)
        print('Query Neighbors', i)

        for n in neighbors:
            print(n.features)
            print('\n')

        print('\n')

    print("--- %s seconds ---" % (time.time() - start_time))
Example #17
    def knn_validate(data, kernel, metric, k_neighbors, show_plot):
        plot = Plot()
        matrix_full = [[0, 0], [0, 0]]
        y_predict_arr = []
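        # hold out sample i on each iteration (leave-one-out evaluation)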
        for i in range(len(data)):
            data.updateTrainTest(i)
            trainDots, trainClass = data.getDotsByMode('train', False)
            testDots, testClass = data.getDotsByMode('test', False)

            knn = KNN(kernel=kernel, metric=metric, neighbors=k_neighbors)
            knn.fit(trainDots, trainClass)
            y_predict, distance = knn.predict(testDots)
            y_predict_arr.append(y_predict[0])

            if show_plot:
                tDots = np.array(trainDots)
                tCls = np.array(trainClass)
                plot.knn(tDots[tCls == 1.0], tDots[tCls == -1.0], distance, testDots[0], y_predict[0])

            matrix = get_metrics(y_predict, testClass)
            matrix_full[0][0] += matrix[0][0]
            matrix_full[0][1] += matrix[0][1]
            matrix_full[1][0] += matrix[1][0]
            matrix_full[1][1] += matrix[1][1]

        return y_predict_arr, get_f_measure(matrix_full), matrix_full
Example #18
def test_knn_regression():
    while True:
        N = np.random.randint(2, 100)
        M = np.random.randint(2, 100)
        k = np.random.randint(1, N)
        ls = np.min([np.random.randint(1, 10), N - 1])
        weights = np.random.choice(["uniform", "distance"])

        X = np.random.rand(N, M)
        X_test = np.random.rand(N, M)
        y = np.random.rand(N)

        knn = KNN(k=k,
                  leaf_size=ls,
                  metric=euclidean,
                  classifier=False,
                  weights=weights)
        knn.fit(X, y)
        preds = knn.predict(X_test)

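        # reference predictions from scikit-learn's ball-tree regressor with matching settings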
        gold = KNeighborsRegressor(
            p=2,
            leaf_size=ls,
            n_neighbors=k,
            weights=weights,
            metric="minkowski",
            algorithm="ball_tree",
        )
        gold.fit(X, y)
        gold_preds = gold.predict(X_test)

        for mine, theirs in zip(preds, gold_preds):
            np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
Example #19
def main():
    df = pd.read_csv(f".\\Data\\{args.dataset}")

    X = np.array(df.iloc[:, :-1])
    y = np.array(df.iloc[:, -1])
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    knn = KNN(X_train, y_train, k=args.k)

    if args.type == "clf":
        y_pred = knn.predict(X_test, knn_type="clf")
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))
        # exact-match accuracy only makes sense for classification
        accuracy = np.mean(y_pred == y_test)
        print(accuracy)

    elif args.type == "reg":
        y_pred = knn.predict(X_test, knn_type="reg")
        mse = np.mean((y_test - y_pred)**2)
        print(mse)

    else:
        print("Undefined knn type")
Example #20
    def overSample(matall, vecall, mat, T, tar, n):
        if tar <= 0:
            return mat

        danger = 0
        choice = np.zeros(T, dtype=int)

        set1 = set()

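        # first pass: collect the "danger" set of borderline samples via their k nearest neighbours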
        for i in range(T):
            if i % 20 == 0:
                print('%d of %d (in finding danger set...)' % (i, T))
            vec = KNN.kNearestNeighbours(matall, matall.shape[0], n, i)
            typ = BorderlineSmote.sampleType(vec, matall, vecall)
            if typ == 1:
                choice[danger] = i
                danger += 1
                set1.add(i)

        N = int(tar / danger)
        tar = N * danger
        ret = np.zeros((tar, n))
        tot = 0

        for i in range(danger):
            vec = KNN.kNearestNeighbours(mat, T, n, choice[i])
            BorderlineSmote.pupulate(ret, mat, tot, i, vec, N, n, set1)
            tot += N

        return ret
Example #21
def run_knn(points):
    m = KNN(5)
    m.train(points)
    print(f'predicted class: {m.predict(points[0])}')
    print(f'true class: {points[0].label}')
    cv = CrossValidation()
    cv.run_cv(points, 10, m, accuracy_score)
Example #22
    def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train, y_train, x_val, y_val):
        best_score = -100
        best_k = None
        best_dist = ''
        best_model = None
        best_scaler = None
        xtrain = x_train
        xval = x_val
        for key in distance_funcs:
            for i in scaling_classes:
                for k in range(1, 30, 2):
                    x_train = xtrain
                    x_val = xval
                    if k > len(x_train):
                        break
                    b = scaling_classes[i]()
                    x_train = b(x_train)
                    x_val = b(x_val)
                    a = KNN(k, distance_funcs[key])
                    a.train(x_train, y_train)
                    P = a.predict(x_val)
                    score = f1_score(y_val, P)
                    if score > best_score:
                        best_score = score
                        best_k = k
                        best_scaler = i
                        best_dist = key
                        best_model = a
        self.best_scaler = best_scaler
        self.best_k = best_k
        self.best_distance_function = best_dist
        self.best_model = best_model
Example #23
def computeKNNCrossValidation(args, dict_algorithms):
    if (args.debug):
        print("Running knn...", end='')
    model = KNN(args)
    dict_algorithms["knn"] = model.computeCrossValidation()
    if (args.debug):
        print("ok!")
Example #24
def test_knn_clf():
    while True:
        N = np.random.randint(2, 100)
        M = np.random.randint(2, 100)
        k = np.random.randint(1, N)
        n_classes = np.random.randint(2, 10)  # at least two classes, so randint below is valid
        ls = np.min([np.random.randint(1, 10), N - 1])
        weights = "uniform"

        X = np.random.rand(N, M)
        X_test = np.random.rand(N, M)
        y = np.random.randint(0, n_classes, size=N)

        knn = KNN(k=k,
                  leaf_size=ls,
                  metric=euclidean,
                  classifier=True,
                  weights=weights)
        knn.fit(X, y)
        preds = knn.predict(X_test)

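        # reference predictions from scikit-learn's ball-tree classifier with matching settings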
        gold = KNeighborsClassifier(
            p=2,
            leaf_size=ls,
            n_neighbors=k,
            weights=weights,
            metric="minkowski",
            algorithm="ball_tree",
        )
        gold.fit(X, y)
        gold_preds = gold.predict(X_test)

        for mine, theirs in zip(preds, gold_preds):
            np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
Example #25
    def tuning_without_scaling(self, distance_funcs, x_train, y_train, x_val,
                               y_val):
        """
        In this part, you need to try different distance functions you implemented in part 1.1 and different values of k (among 1, 3, 5, ..., 29), and find the best model with the highest f1-score on the given validation set.

        :param distance_funcs: dictionary of distance functions (key is the function name, value is the function) you need to try to calculate the distance. Make sure you loop over all distance functions for each k value.
        :param x_train: List[List[int]] training data set to train your KNN model
        :param y_train: List[int] training labels to train your KNN model
        :param x_val:  List[List[int]] validation data
        :param y_val: List[int] validation labels

        Find the best k, distance_function (its name), and model (an instance of KNN) and assign them to self.best_k,
        self.best_distance_function, and self.best_model respectively.
        NOTE: self.best_scaler will be None.

        NOTE: When there is a tie, choose the model based on the following priorities:
        First check the distance function: euclidean > Minkowski > cosine_dist
        (this will also be the insertion order in "distance_funcs", to make things easier).
        For the same distance function, further break tie by prioritizing a smaller k.
        """

        best_f1 = 0
        for name, func in distance_funcs.items():
            for k in range(1, 30, 2):
                model = KNN(k, func)
                model.train(x_train, y_train)
                valid_f1 = f1_score(y_val, model.predict(x_val))
                if valid_f1 > best_f1:
                    self.best_distance_function = name
                    self.best_k = k
                    best_f1 = valid_f1
                    self.best_model = model
Example #26
    def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train,
                            y_train, x_val, y_val):
        """
        This part is the same as "tuning_without_scaling", except that you also need to try two different scalers implemented in Part 1.3. More specifically, before passing the training and validation data to the KNN model, apply the scalers in scaling_classes to both of them.

        :param distance_funcs: dictionary of distance functions (key is the function name, value is the function) you need to try to calculate the distance. Make sure you loop over all distance functions for each k value.
        :param scaling_classes: dictionary of scalers (key is the scaler name, value is the scaler class) you need to try to normalize your data
        :param x_train: List[List[int]] training data set to train your KNN model
        :param y_train: List[int] train labels to train your KNN model
        :param x_val: List[List[int]] validation data
        :param y_val: List[int] validation labels

        Find the best k, distance_function (its name), scaler (its name), and model (an instance of KNN), and assign them to self.best_k, self.best_distance_function, best_scaler, and self.best_model respectively.
        
        NOTE: When there is a tie, choose the model based on the following priorities:
        First check scaler, prioritizing "min_max_scale" over "normalize" (which will also be the insertion order of scaling_classes). Then follow the same rule as in "tuning_without_scaling".
        """

        # You need to assign the final values to these variables
        best_f1 = 0
        for scaling_name, scaling_func in scaling_classes.items():
            scaler = scaling_func()
            x_train_scaled = scaler(x_train)
            x_val_scaled = scaler(x_val)
            for name, func in distance_funcs.items():
                for k in range(1, 30, 2):
                    model = KNN(k, func)
                    model.train(x_train_scaled, y_train)
                    valid_f1 = f1_score(y_val, model.predict(x_val_scaled))
                    if valid_f1 > best_f1:
                        self.best_distance_function = name
                        self.best_k = k
                        best_f1 = valid_f1
                        self.best_model = model
                        self.best_scaler = scaling_name
Example #27
    def fit(self, X, y):
        # instantiate the input models
        rf = RandomForest(num_trees=15)
        knn = KNN(k=3)
        nb = NaiveBayes(num_classes=2)

        # Random Forest fit and predict
        rf.create_splits(X)
        rf.fit(X, y)
        rf_pred = rf.predict(X)

        # K-Nearest Neighbors fit and predict
        knn.fit(X, y)
        knn_pred = knn.predict(X)

        # Naive Bayes fit and predict
        nb.fit(X, y)
        nb_pred = nb.predict(X)

        # use predictions from input models as inputs for meta-classifiers
        meta_input = np.hstack((rf_pred.reshape(
            (rf_pred.size, 1)), knn_pred.reshape(
                (knn_pred.size, 1)), nb_pred.reshape((nb_pred.size, 1))))

        # use Decision Tree as meta-classifier
        dt = DecisionTree(max_depth=np.inf)
        dt.fit(meta_input, y)

        self.rf = rf
        self.knn = knn
        self.nb = nb
        self.meta_classifier = dt
Example #28
    def test_compare_to_scikit_learn_changing_k(self):
        normalizer = Normalizer(self.data)
        data = normalizer.normalize()

        testSize = 100
        trainSize = len(data.data) - testSize
        for i in range(1, 12):
            with self.subTest(i=i):
                print("k: ", i)
                neighbours = i

                trainData = {}
                testData = {}

                trainData['data'] = data.data[:trainSize]
                trainData['target'] = data.target[:trainSize]

                testData['data'] = data.data[trainSize:]
                testData['target'] = data.target[trainSize:]
                knn = KNN(trainData)

                #scikit-learn model:
                model = KNeighborsClassifier(n_neighbors=neighbours)
                model.fit(trainData['data'], trainData['target'])

                ourCounter = 0
                sciCounter = 0
                for j, e in enumerate(testData['data']):
                    if knn.makeGuess(e, neighbours) == testData['target'][j]:
                        ourCounter += 1

                    if model.predict([e]) == testData['target'][j]:
                        sciCounter += 1

                self.assertAlmostEqual(ourCounter / testSize, sciCounter / testSize, 3)
Example #29
    def test_blob_classification_numpy(self):
        """
        Tests kNN for classification using
        randomly-generated points drawn from
        Gaussian-shaped clusters.
        
        Splits data into training and testing
        sets.
        """

        k = 3
        X, y = generate_cluster_samples()

        train_X, test_X, train_y, test_y = train_test_split(X, y, stratify=y)

        knn = KNN(k)
        knn.fit(train_X, train_y)
        pred_y = knn.predict_numpy(test_X)

        # verify shape of output
        self.assertEqual(len(pred_y.shape), 1)
        self.assertEqual(pred_y.shape[0], test_X.shape[0])

        # clusters are well separated, so held-out accuracy should be (near) perfect
        accuracy = accuracy_score(test_y, pred_y)
        self.assertAlmostEqual(accuracy, 1.0)
Example #30
def main():
    K = [1, 2, 3, 5, 7, 9, 11, 13, 15]

    #load CM1
    data = arff.loadarff('./datasets/CM1.arff')

    X, Y = build_dataframe(data)

    # normalize data
    X = normalize_data(X)

    # create k-fold splits
    kf = KFold(n_splits=10)

    # instantiate classifier
    for k in K:
        clf = KNN(k=k)
        print("k equals {}".format(k))

        start_time = time.time()
        acc = []
        for train, test in kf.split(X):

            clf.fit(X.iloc[train], Y.iloc[train])
            predictions = clf.predict(X.iloc[test])
            acc.append((np.sum(predictions == Y.iloc[test]) / len(test)) * 100)

        end_time = time.time()

        acc = np.array(acc)

        print("mean accuracy: {}".format(np.mean(acc)))
        print("standard deviation: {}".format(np.std(acc)))
        print("time elapsed: {}".format(end_time - start_time))
Example #31
    def testscale(self, scale, trial = None):
        backup = self.data, self.testdata

        self.data, self.testdata = self.rescale(self.data, scale), self.rescale(self.testdata, scale)
        self.knn = KNN(self.data)
        result = self.validation(trial)

        self.data, self.testdata = backup
        self.knn = KNN(self.data)
        return result
Example #32
def knncv(Xtrain, Ytrain, Xtest, Ytest):
    knn = KNN(Xtrain, Ytrain)

    m = len(Ytest)
    Ypredict = np.zeros(m)
    for i in xrange(m):
        x, y = Xtest[i], Ytest[i]
        results = knn.predict(x, k=4, classes=classes)
        prediction = results.argmax()
        Ypredict[i] = prediction
    return Ypredict
Example #33
def train_and_val():
    training_data = dp.read_data('dataset/splice-Xtrain.dat', 'dataset/splice-Ytrain.dat')
    training_set_indices, validation_set_indices = dp.read_training_val_set('dataset/train.txt', 'dataset/val.txt')
    feature = Features()
    features_labels_pair = feature.amino_acid_count(training_data)
    training_set = []
    for index in training_set_indices:
        training_set.append(features_labels_pair[index])
    
    dp.remove_ambiguous_entry_plus(training_set)
    k_nn = KNN(training_set, 23)
    
    confusion_matrix = np.zeros([3,3])
    correct = 0.0
    total = 0.0
    
    validation_set = []
    for index in validation_set_indices:
        validation_set.append(features_labels_pair[index])
    
    dp.remove_ambiguous_entry_plus(validation_set)
    for feature_vector, correct_class in validation_set: 
        prediction = k_nn.predict_codon_cosine(feature_vector, k_nn.no_weight)
        total += 1
        if prediction == correct_class:
            correct += 1
        # prediction indexes the row, correct_class the column (classes 0, 1, 2)
        confusion_matrix[prediction, correct_class] += 1
        #print prediction, correct_class
    print(confusion_matrix)
    print(correct / total)
Example #34
	def __init__(self, test_file, trained_file, global_file):
		self._doc = open(test_file)
		# self._stats = json.load(open(global_file))
		# self._n = self._stats["N"]
		# self._gf = self._stats["freq"]
		self._knn = KNN(trained_file)
		self.result = {}
		self.output = []
Example #35
def train_and_test():
    training_data = dp.read_data("dataset/splice-Xtrain.dat", "dataset/splice-Ytrain.dat")
    test_data = dp.read_data("dataset/test40.txt", "dataset/ytest40.txt")
    feature = Features()
    dp.remove_ambiguous_entry_plus(training_data)
    training_set = feature.amino_acid_count(training_data)
    test_set = feature.amino_acid_count(test_data)

    k_nearest_neighbors = KNN(training_set, 26)

    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0

    for index in range(len(test_set)):
        feature_vector, correct_class = test_set[index]
        prediction = k_nearest_neighbors.predict_codon_cosine(feature_vector, k_nearest_neighbors.no_weight)
        total += 1
        if prediction == correct_class:
            correct += 1
        # prediction indexes the row, correct_class the column (classes 0, 1, 2)
        confusion_matrix[prediction, correct_class] += 1

    print(confusion_matrix)
    print(correct / total)
Example #36
    def __init__(self, data, testdata = {}, k = 7, function = None, scale = None):
        if scale:
            data = self.rescale(data, scale)
            testdata = self.rescale(testdata, scale)

        self.data = data
        self.testdata = testdata
        self.knn = KNN(data)
        self.k = k

        if function:
            self.function = function
        else:
            self.function = self.knn.inverseWeight
Example #37
def main(argv):
    knn = KNN('iris.data', 'iris.test', 4)
    training_set = knn.get_training_set()
    testing_set = knn.get_testing_set()
    print('**********\n***** IRIS ****\n***********')
    print_result(knn, training_set, testing_set)

    knn = KNN('wdbc.data', 'wdbc.test', 1, 0)
    print('**********\n***** Breast Cancer in Wisconsin ****\n***********')
    training_set = knn.get_training_set()
    testing_set = knn.get_testing_set()
    print_result(knn, training_set, testing_set)
Example #38
# https://deeplearningcourses.com/c/data-science-supervised-machine-learning-in-python
# https://www.udemy.com/data-science-supervised-machine-learning-in-python
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future


from knn import KNN
from util import get_xor
import matplotlib.pyplot as plt

if __name__ == '__main__':
    X, Y = get_xor()

    # display the data
    plt.scatter(X[:,0], X[:,1], s=100, c=Y, alpha=0.5)
    plt.show()

    # get the accuracy
    model = KNN(3)
    model.fit(X, Y)
    print("Accuracy:", model.score(X, Y))
Example #39
    def Detection(self):
        if db_type not in ('mongodb', '1', 'redis', '2'):
            return

        if self.algorithm == 'knn' or self.algorithm == '1':
            print("K nearest neighbor algorithm")
            detection_algorithm = KNN()
        elif self.algorithm == 'perceptron' or self.algorithm == '2':
            detection_algorithm = Perceptron()
            self.do_perceptron_learn = True # only do once before the testing!
            #print('Invalid algorithm. SVM not yet supported')
            #return
        else:
            print('Invalid algorithm selection')
            return
    
        have_batchfile = len(self.batchfile) != 0
        if have_batchfile:
            batchex = BatchExecutor(self.batchfile, detection_algorithm,
                                    self.do_perceptron_learn,
                                    hostname=self.hostname, port=27017)
            batchex.start()

        metric_groups = self.metric_groups
        counter_keys = self.counter_keys

        data = []
        new_metrics = {}
        old_metrics = {}
        anomaly_metrics = {}
        sleep = self.sleep_interval
        fp = open('./out.txt', 'a+')

        # just run forever until ctrl-c (in non-batch mode) or run until the
        # batch executor finishes (in batch mode)
        while True:
            if have_batchfile:
                allDone, duration = batchex.wait_for_measure_to_be_ready_all_done_or_failed()
                if allDone:
                    break
                if duration is None:
                    print("FATAL ERROR: Batch execution failed!")
                    break
            else:
                traintest, duration = self.getTrainOrTest()

            #train/test for a set duration
            if duration == -1:
                print("Running forever") 
                forever = 1
            else:
                if duration == 0.0:
                    raise ValueError("invalid duration")
                print("Running for {} seconds".format(duration))
                forever = 0

            ii = 0
            # fetch the metrics
            data = self.getData()
    
            #Initial block is to set up old_metrics
            #since we only care about the changes in some values, not the 
            # aggregates

            #put all the new metrics in new_metrics
            for metric_group, items in metric_groups.items():
                #If the item is not a list, then take it straight from data
                if not items:
                    try:
                        new_metrics[metric_group] = float(data[metric_group])
                    except KeyError:
                        pass
                else:
                    #set to 0 so that we can recalculate the aggregate values

                    #this is for resetting the values on the subsequent
                    #iteration (e.g. testing -> training)
                    if (metric_group in new_metrics and 
                        metric_group not in data):
                        new_metrics[metric_group] = 0
                        anomaly_metrics[metric_group] = 0

                    #iterate over the list of items
                    for item in items:
                        #if the metric_group is in data, then its items will be
                        #as well
                        if metric_group in data:
                            try:
                                new_metrics[metric_group + item] = float(data[metric_group][item])
                                anomaly_metrics[metric_group + item] = float(data[metric_group][item])
                            except KeyError:
                                pass
                        #if the metric_group isn't in data, but its items are
                        #then aggregate all of the items into the metric_group
                        #This happens in Redis to aggregate all types of
                        #commands together
                        elif item in data:
                            if metric_group not in new_metrics:
                                new_metrics[metric_group] = 0
                                anomaly_metrics[metric_group] = 0
                            try:
                                new_metrics[metric_group] += float(data[item]['calls'])
                                anomaly_metrics[metric_group] += float(data[item]['calls'])
                            except KeyError:
                                pass

            while duration > 0 or forever == 1:
                time.sleep(sleep)
                duration -= sleep
                point = ()

                
                # fetch the metrics
                data = self.getData()

                #put all the new metrics in new_metrics
                for metric_group, items in metric_groups.items():
                    #set old to new so that we can take the difference
                    #between the two measurements 
                    if metric_group in new_metrics:
                        old_metrics[metric_group] = new_metrics[metric_group]
                        new_metrics[metric_group] = 0
                        anomaly_metrics[metric_group] = 0

                    #set to 0 so that we can recalculate the aggregate values
                    #If the item is not a list, then take it straight from data
                    if not items:
                        try:
                            new_metrics[metric_group] = float(data[metric_group])
                            anomaly_metrics[metric_group] = float(data[metric_group])
                        except KeyError:
                            pass

                    else:
                        #iterate over the list of items
                        for item in items:
                            #if the metric_group is in data, then its items 
                            #will be as well
                            if metric_group in data:
                                try:
                                    old_metrics[metric_group + item] = new_metrics[metric_group + item]
                                    new_metrics[metric_group + item] = float(data[metric_group][item])
                                    anomaly_metrics[metric_group + item] = float(data[metric_group][item])
                                except KeyError:
                                    pass
                            #if the metric_group isn't in data, but its items 
                            #are then aggregate all of the items into the 
                            #metric_group
                            #This happens in Redis to aggregate all types of 
                            #commands together
                            elif item in data:
                                try:
                                    new_metrics[metric_group] += float(data[item]['calls'])
                                    anomaly_metrics[metric_group] += float(data[item]['calls'])
                                except KeyError:
                                    pass


                #make per second values for the counters
                for counter_group, items in counter_keys.items():
                    #if the item is not a list, then we can just subtract the
                    #entire counter group. This is in Redis where we aggregate
                    #all command types together
                    if not items:
                        if counter_group in new_metrics:
                            try:
                                anomaly_metrics[counter_group] = (new_metrics[counter_group] - old_metrics[counter_group]) / sleep
                            except KeyError:
                                pass
                    else:
                        #iterate over all items in the list
                        for item in items:
                            if counter_group in data:
                                try:
                                    anomaly_metrics[counter_group+item] = (new_metrics[counter_group+item] - old_metrics[counter_group+item]) / sleep
                                except KeyError:
                                    pass
            
                #create a tuple from the anomaly_metrics dictionary
                #yes I know this is a slow and dumb way to do this
                for items in anomaly_metrics:
                    point += (anomaly_metrics[items],) 
                    sys.stdout.write("{}, {}\n".format(items, anomaly_metrics[items]))
                    if not ii % 100000:
                        #sys.stdout.write("{}, ".format(items))
                        fp.write("{}, ".format(items))
                sys.stdout.write("\n")

                if not ii % 50:
                    print(ii)
                if not ii % 100000:
                    #print('\n')
                    fp.write("\n")
                ii += 1
                            
                #print anomaly_metrics
                #sys.stdout.write("{}\n".format(point))
                fp.write("{}\n".format(point))

                if have_batchfile:
                    batchex.signal_measuring_done(point, duration)
                elif traintest == '1':
                    detection_algorithm.trainSet.append({point:'Normal'})
                    detection_algorithm.size_normal_train += 1
                elif traintest == '2':
                    detection_algorithm.trainSet.append({point:'Anomaly'})
                    detection_algorithm.size_anomaly_train += 1
                elif traintest == '3':
                    #print point
                    if self.do_perceptron_learn:
                        detection_algorithm.preProcess()
                        self.do_perceptron_learn = False
                    label = detection_algorithm.getLabel(point)
                    if label == 'Normal' or label == 0:
                        print('Normal')
                        #fp.write('Normal\n')
                    elif label == 'Anomaly' or label == 1:
                        print('Anomaly')
                        #fp.write('Anomaly\n')
                    print('\n')

                fp.flush()
Example #40
import numpy as np
import numpy.matlib
from knn import KNN
from pyspark import SparkConf, SparkContext
from utils import cdist, vote, get_confusion_matrix, get_image_rdd

sc = SparkContext()

# train data
# Each element in x and y is (SubGroupKey, iterableResults)
# in which iterableResults are (PixelKey, features/labels)
x, y = get_image_rdd(sc, n_groups=1000, start=0, end=1300)

# knn model
knn = KNN(x,y)
del x, y

# test data
x_, y_ = get_image_rdd(sc, val=1)
x_list, y_list = x_.collect(), y_.collect()
del x_, y_
print 'Length of x_list', len(x_list)
cm = numpy.matlib.zeros((2,2), dtype=float)
pred = []
# iterate 10 pixels at a time
flag = True
counter = 0
while flag:
    x__, y__ = x_list[:10], y_list[:10]
    x_list[:10] = []
    y_list[:10] = []
Example #41
    def printStats(self):
        data = []
        knn_class = KNN()
        sleep = 1
        q = 0
        i = 0
        u = 0
        d = 0
        qcpu = 0
        icpu = 0
        ucpu = 0
        dcpu = 0
        ii = 0
        con = 0
        hostname = "localhost"
        idx_b_a = 0
        idx_b_h = 0
        idx_b_m = 0
        new_bytesIn = 0
        new_bytesOut = 0
        new_numRequests = 0
        bytesIn = 0
        bytesOut = 0
        numRequests = 0
        network_skip_flag = 0

        # just run forever until ctrl-c
        while True:
            do_normal_train = raw_input('Do normal training?: ') == 'y'
            do_anomaly_train = raw_input('Do anomaly training?: ') == 'y'
            do_test = raw_input('Do testing?: ') == 'y'
            # set previous values before overwriting
            pq = q
            pi = i
            pu = u
            pd = d
            pqcpu = qcpu
            picpu = icpu
            pucpu = ucpu
            pdcpu = dcpu
            pidx_b_a = idx_b_a
            pidx_b_h = idx_b_h
            pidx_b_m = idx_b_m
            
            # fetch the stats
            data = ( self.db.command( { "serverStatus" : 1 } ) )
            #print data['indexCounters'];sys.exit()

            res = int(data['mem']['resident'])
            vir = int(data['mem']['virtual'])
            mapd = int(data['mem']['mapped'])

            old_bytesIn = new_bytesIn  
            old_bytesOut = new_bytesOut
            old_numRequests = new_numRequests 

            new_bytesIn = int(data['network']['bytesIn'])
            new_bytesOut = int(data['network']['bytesOut'])
            new_numRequests = int(data['network']['numRequests'])

            if network_skip_flag == 0:
                network_skip_flag = 1
            else:
                bytesIn = new_bytesIn - old_bytesIn
                bytesOut = new_bytesOut - old_bytesOut
                numRequests = new_numRequests - old_numRequests


            template = "%12s%22s%12s%12s%12s%12s%12s%12s%12s"
            header=('hostname', 'time', 'resident','virtual', 'mapped', 'load', 'bytesIn', 'bytesOut', 'numRequests')
            datastr="hostname, self.thetime(),  res, vir, mapd, self.getload(), bytesIn, bytesOut, numRequests"
            point = (0, 0, 0, 0, 0, res, vir, mapd, 0, 0, 0, 0, self.getload(), bytesIn, bytesOut, numRequests)

            if "opcounters" in data:
                q = int(data['opcounters']['query'])
                i = int(data['opcounters']['insert'])
                u = int(data['opcounters']['update'])
                d = int(data['opcounters']['delete'])
                try:
                    qcpu = int(data['opcounters']['queryCpuTime'])
                    icpu = int(data['opcounters']['insertCpuTime'])
                    ucpu = int(data['opcounters']['updateCpuTime'])
                    dcpu = int(data['opcounters']['deleteCpuTime'])
                except KeyError:
                    qcpu = 0
                    icpu = 0
                    ucpu = 0
                    dcpu = 0
                con = int(data['connections']['current'])
              
                template="%12s%22s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s"
                header=('hostname', 'time', 'query', 'insert', 'update',  \
                        'delete', 'active con', 'resident', \
                        'virtual','mapped','load', 'bytesIn', 'bytesOut', 'numRequests', \
                        'queryCpu', 'insertCpu', 'updateCpu', 'deleteCpu')
                datastr="hostname, self.thetime(), (q-pq)/sleep, (i-pi)/sleep,(u-pu)/sleep, (d-pd)/sleep, con,  res, vir, mapd, self.getload(), bytesIn, bytesOut, numRequests, (qcpu-pqcpu)/sleep, (icpu-picpu)/sleep, (ucpu-pucpu)/sleep, (dcpu-pdcpu)/sleep"
                point = ((q-pq)/sleep, (i-pi)/sleep,(u-pu)/sleep, (d-pd)/sleep, con, res, vir, mapd, 0, 0, 0, 0, self.getload(), bytesIn, bytesOut, numRequests, (qcpu-pqcpu)/sleep, (icpu-picpu)/sleep, (ucpu-pucpu)/sleep, (dcpu-pdcpu)/sleep)

            # opcounters will be in data if indexcounters is
            if "indexCounters" in data:
                #idx_b_a = int(data['indexCounters']['btree']['accesses'])
                #idx_b_h = int(data['indexCounters']['btree']['hits'])
                #idx_b_m = int(data['indexCounters']['btree']['misses'])
                #idx_b_o = round(float(data['indexCounters']['btree']['missRatio']),2)
                idx_b_a = int(data['indexCounters']['accesses'])
                idx_b_h = int(data['indexCounters']['hits'])
                idx_b_m = int(data['indexCounters']['misses'])
                idx_b_o = round(float(data['indexCounters']['missRatio']),2)
                template="%12s%22s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s%12s"
                header=('hostname', 'time', 'query', 'insert', 'update',  \
                        'delete', 'active con', 'resident', \
                        'virtual','mapped','idx acc','idx hit','idx miss','idx ratio','load', 'bytesIn', 'bytesOut', 'numRequests', \
                        'queryCpu', 'insertCpu', 'updateCpu', 'deleteCpu')
                datastr="hostname, self.thetime(), (q-pq)/sleep, (i-pi)/sleep,(u-pu)/sleep, (d-pd)/sleep, \
                         con,  res, vir, mapd, (idx_b_a-pidx_b_a)/sleep, (idx_b_h-pidx_b_h)/sleep, (idx_b_m-pidx_b_m)/sleep, idx_b_o, self.getload(), bytesIn, bytesOut, numRequests, (qcpu-pqcpu)/sleep, (icpu-picpu)/sleep, (ucpu-pucpu)/sleep, (dcpu-pdcpu)/sleep"
                point = ((q-pq)/sleep, (i-pi)/sleep,(u-pu)/sleep, (d-pd)/sleep, con, res, vir, mapd, (idx_b_a-pidx_b_a)/sleep, (idx_b_h-pidx_b_h)/sleep, (idx_b_m-pidx_b_m)/sleep, idx_b_o, self.getload(), bytesIn, bytesOut, numRequests, (qcpu-pqcpu)/sleep, (icpu-picpu)/sleep, (ucpu-pucpu)/sleep, (dcpu-pdcpu)/sleep)

            if do_normal_train:
                knn_class.trainSet.append({point: 'Normal'})
                knn_class.size_normal_train += 1
            if do_anomaly_train:
                knn_class.trainSet.append({point:'Anomaly'})
                knn_class.size_anomaly_train += 1
            if do_test:
                #print point
                label = knn_class.getLabel(point)

            if (ii % 25 == 0):
                print template % header
            if do_test: # This is for testing, we print out the predicted label
                print template % (eval(datastr)), label
            else:
                print template % (eval(datastr))

            ii += 1
            
            time.sleep(sleep) 
Example #42
class Forecast():
    def __init__(self, data, testdata = {}, k = 7, function = None, scale = None):
        if scale:
            data = self.rescale(data, scale)
            testdata = self.rescale(testdata, scale)

        self.data = data
        self.testdata = testdata
        self.knn = KNN(data)
        self.k = k

        if function:
            self.function = function
        else:
            self.function = self.knn.inverseWeight

    def rescale(self, data, scale):
        scaledata = {}
        for key in data.keys():
            scaled = [data[key]['input'][i] * scale[i] for i in range(len(scale))]
            scaledata[key] = {'input': scaled, 'result': data[key]['result']}
        return scaledata

    def estimate(self, testdata):
        weathers = {}
        results = self.knn.estimate(testdata, self.k, self.function)
        for result in results:
            weather = self.data[result[0] + datetime.timedelta(hours = 6)]['result']
            if weather not in weathers:
                weathers[weather] = 0
            weathers[weather] += result[1]
        return sorted(weathers.items(), key = lambda x: x[1], reverse = True)
    
    def validation(self, trial = None):
        if not trial:
            trial = 100

        count = 0
        trial = min(trial, len(self.testdata))
        # materialize as a list so it can be shuffled
        testdata = list(self.testdata.values())
        random.shuffle(testdata)
    
        for test in testdata[:trial]:
            result = self.estimate(test['input'])
            if result[0][0] == test['result']:
                count += 1
    
        return float(count) / trial
    
    def optimization(self, trial = None, krange = 10):
        best = 0.0
        backup = self.k, self.function
        k, method = None, None
    
        for i in range(1, krange):
            for j in dir(self.knn):
                if not j.endswith('Weight'):
                    continue
    
                self.k, self.function = i, getattr(self.knn, j)
                result = self.validation(trial)
                if result > best:
                    best = result
                    k, method = i, j

                print i, j, best

        self.k, self.function = backup
        return k, method
    
    def testscale(self, scale, trial = None):
        backup = self.data, self.testdata

        self.data, self.testdata = self.rescale(self.data, scale), self.rescale(self.testdata, scale)
        self.knn = KNN(self.data)
        result = self.validation(trial)

        self.data, self.testdata = backup
        self.knn = KNN(self.data)
        return result
    
    def annealing(self, domain, T = 10000, cool = 0.95, step = 1, trial = None, vector = None):
        if not vector:
            vector = [float(random.randint(domain[i][0], domain[i][1])) for i in range(len(domain))]
        best = self.testscale(vector, trial)
        result = vector
        
        while T > 0.1:
            newvec = vector[:]
            i = random.randint(0, len(domain) - 1)
            newvec[i] += random.randint(-step, step)
    
            if newvec[i] < domain[i][0]:
                newvec[i] = domain[i][0]
            elif newvec[i] > domain[i][1]:
                newvec[i] = domain[i][1]
    
            value = self.testscale(newvec, trial)
            p = 1 / pow(math.e, abs(best - value) / T)
            T *= cool
    
            print newvec, value
            if best < value:
                best = value
                result = newvec
                vector = newvec
            elif random.random() < p:
                vector = newvec
    
        return result
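
A note on the acceptance test in annealing above: it is the Metropolis-style rule, accepting a worse scale vector with probability p = exp(-|best - value| / T), so downhill moves become rarer as the temperature cools. A small, purely illustrative computation of that decay (the delta and temperature values are made up):

import math

def acceptance(delta, T):
    # same expression as in annealing(): 1 / e ** (|delta| / T)
    return 1.0 / math.e ** (abs(delta) / T)

for T in (10000.0, 100.0, 1.0, 0.1):
    print(T, acceptance(0.05, T))  # the probability shrinks as T cools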
Example #43
0
def main():
    
    #############################################
    # Set up the data as per the first Practicum
    #############################################
    
    spam_values = np.genfromtxt('../input_data/spambase.data', delimiter=',')
    fl = open('../input_data/spambase.names', 'r')
    lines = [line.strip() for line in fl] # J: strip leading and trailing whitespace
    fl.close()
    
    colnames = [line.partition(':')[0] for line in lines if not (len(line) == 0 or line[0] == '|' or line[0] == '1')]
    colnames.append('spam')
    
    spam_df = pd.DataFrame(spam_values,columns=colnames)
    spam_df['spam']=2*spam_df['spam']-1
    
    # J: DataFrame.shape is a tuple whose first element is the number of samples in the DataFrame
    nsamples = spam_df.shape[0]
    ntest = int(np.floor(.2 * nsamples))
    ntune = int(np.floor(.1 * nsamples))
    
    # we want to make this reproducible so we seed the random number generator
    np.random.seed(1)
    all_indices = np.arange(nsamples) 
    # J: shuffling matters so that the train/tune/test portions are a random split rather than contiguous chunks of the file
    np.random.shuffle(all_indices) 
    test_indices = all_indices[:ntest] # J: Get shuffled test indices first
    tune_indices = all_indices[ntest:(ntest+ntune)] # J: tune indices second
    train_indices = all_indices[(ntest+ntune):] # J: train indices (the majority) last
    
    # J : now that the "*indices" arrays have been shuffled, you can actually draw the relevant data through
    # DataFrame.iloc. The second argument includes all columns, labels included.
    spam_train = spam_df.iloc[train_indices,:]
    spam_tune = spam_df.iloc[tune_indices,:]
    spam_test = spam_df.iloc[test_indices,:]
    
    spam_train.to_pickle('../proc_data/training_data/spam_train.pdat')
    spam_tune.to_pickle('../proc_data/training_data/spam_tune.pdat')
    spam_test.to_pickle('../proc_data/testing_data/spam_test.pdat')
    
    
    #######################################################################
    # See how features are sorted according to their Information Gain score
    #######################################################################
    
    # atestTree = DecisionTree(spam_train, 5, True)
    # print atestTree.__sortFeatures__(spam_train, spam_train.columns)
    
    ###############################################
    #  Training classifiers and saving them on disk
    ###############################################
    
    # Already trained those two, it took about 4 hours total. 
     
#    majVoteTree = DecTree.DecisionTree(spam_train, 5, False)
#    print "Tuning a majority vote classifier on all depths between 1 and 15 inclusive."
#    majVoteTree.tune(spam_tune,1, 15)
#    print "Saving this classifier to disk."
#    majVoteTree.dump("../proc_data/dtreeWithMajVote_1_to_15.pyobj")
#    
#    IGTree = DecTree.DecisionTree(spam_train, 5, True)
#    print "Tuning an information gain classifier on all depths between 1 and 15 inclusive."
#    IGTree.tune(spam_tune,1, 15)
#    print "Saving this classifier to disk."
#    IGTree.dump("../proc_data/dtreeWithIG_1_to_15.pyobj")

    HectorsKNN = KNN(spam_train, spam_train['spam'], 5)
    print "Tuning Hector's KNN classifier for all values of K between 1 and 41 inclusive:"
    HectorsKNN.tune(spam_tune, spam_tune['spam'], k=range(1,42,2))
    print "Saving this classifier to disk."
    HectorsKNN.dump("../proc_data/HectorsKNN_1_to_41.pyobj") 
    
    ###########################################
    # Playing with stored classifiers
    ###########################################
    
    # Part 1: A decision tree classifier trained with Majority Vote, depths 1 to 10

#    print "Loading a decision tree trained with Majority Vote for depths 1 to 10..."
#    majVoteTree = load("../proc_data/dtreeWithMajVote_1_to_15.pyobj")
#    print "According to the tuning set, the optimal depth for this tree is: " + str(majVoteTree.depth)
#    classifications = majVoteTree.classify(spam_test)
#    testErrorRate = np.mean ( (spam_test['spam'].values * classifications) < 0)
#    print 'For this depth, the error on the test set was %0.3f' % testErrorRate
#    print "We will now test all different hyper-parameters found during tuning on the test data:"
#    majVoteTree.classifyWithAllDepths(spam_test)
#    print "\n===========================================================\n"
#    
#    # Part 2: A decision tree classifier trained with Information Gain, depths 1 to 10
#    
#    print "Loading a decision tree trained with Information Gain for depths 1 to 10..."
#    IGTree = load("../proc_data/dtreeWithIG_1_to_15.pyobj")
#    print "According to the tuning set, the optimal depth for this tree is: " + str(IGTree.depth)
#    classifications = IGTree.classify(spam_test)
#    testErrorRate = np.mean ( (spam_test['spam'].values * classifications) < 0)
#    print 'For this depth, the error on the test set was %0.3f' % testErrorRate
#    print "We will now test all different hyper-parameters found during tuning on the test data:"
#    IGTree.classifyWithAllDepths(spam_test)
    
    # Part 3: Hector's KNN-classifier
    
    print "Reloading Hector's classifier from disk:"
    HectorsKNN = load("../proc_data/HectorsKNN_1_to_41.pyobj")
    print "According to the tuning set, the optimal K for this classifier is: " + str(HectorsKNN.k) + "."
    classifications = HectorsKNN.classify(spam_test)
    testErrorRate = np.mean ( (spam_test['spam'].values * classifications) < 0)
    print 'For this value of K, the error on the test set was %0.3f' % testErrorRate
    print "We will now test all different hyper-parameters found during tuning on the test data:"
    HectorsKNN.classifyWithAllK(spam_test)
    
    # Part 4: Weighted Features KNN
    
    print "Exiting..."
Example #44
0
 
# Input for tf-idf: we must annotate the documents with their classes.
# It takes as input an array of tuples: ([tokens], class)
parsed_trainning_documents_with_classes = []
for k in parsed_trainning_documents.keys():
  parsed_trainning_documents_with_classes += [(v, k) for v in parsed_trainning_documents[k]]

# Run tf-idf
print('generating tf.idf...')
tf_idf_calculator = TfIdf(parsed_trainning_documents_with_classes)
tf_idf_calculator.run()

# try out the KNN parameters: distance metric and value of K
for metric in ['cosine', 'euclid']:
  for k in range(5, 11, 2):
    knn = KNN(tf_idf_calculator.results, k, metric)

    # confusion_matrix[A][B] = how many times a document of class A was assigned to class B
    topics = ['baseball', 'christian', 'guns']
    confusion_matrix = {topic:{t:0 for t in topics} for topic in topics}

    print_log = False
    i = 0
    ytrue = []
    ypred = []
    for topic in topics:
      for doc in reader.test[topic]:
        ytrue.append(topic)
        # classify the test documents
        words = parser.process_sent(doc)
        query = tf_idf_calculator.generate_tf_vector(words)
Example #45
0
def recommendation(all_docs, test_docs, classifier_list):

    print("Recommendation System")
    print("---------------------")

    # ask user for the desired option count and recommendation count. set default value in case invalid inputs.
    try:
        option_count = int(raw_input("\nEnter number of articles to choose from. [number from 5 to 10 suggested]: "))
        if option_count < 1 or option_count > 20:
            print("Invalid Choice.. By default selected 5.")
            option_count = 5
    except:
        print("Invalid Choice.. By default selected 5.")
        option_count = 5

    try:
        k_n = int(raw_input("\nEnter number of recommendation per article. [number from 5 to 10 suggested]: "))
        if k_n < 1 or k_n > 20:
            print("Invalid Choice.. By default selected 5.")
            k_n = 5
    except:
        print("Invalid Choice.. By default selected 5.")
        k_n = 5

    end = False

    # run the loop until user quits.
    while not end:

        # pick random documents from test docs and provide titles to the user.
        user_docs = random.sample(test_docs, option_count)

        while True:
            print("\n---Available Choices For Articles(Titles)---\n")

            for i in range(len(user_docs)):
                print(str(i+1) + ": " + user_docs[i].title)

            print("r: Refresh List")
            print("q: Quit()\n")

            choice = raw_input("Enter Choice: ")

            if choice == 'q':
                end = True
                break
            elif choice == 'r':
                break
            else:
                try:
                    user_choice = int(choice) - 1
                    if user_choice < 0 or user_choice >= len(user_docs):
                        print("Invalid Choice.. Try Again..")
                        continue
                except:
                    print("Invalid Choice.. Try Again..")
                    continue
                selected_doc = user_docs[user_choice]

                # classifiers are sorted according to their f_measure in decreasing order. It helps when all
                # three classifiers differ in their predictions.
                classifier_list = sorted(classifier_list, key=lambda cl: cl.stats['f_measure'], reverse=True)

                prediction_list = list()
                for classifier in classifier_list:
                    prediction_list.append(classifier.classify([selected_doc])[0])

                prediction_count = Counter(prediction_list)
                top_prediction = prediction_count.most_common(1)

                if top_prediction[0][1] > 1:
                    prediction = top_prediction[0][0]
                else:
                    prediction = prediction_list[0]

                # create knn instance using documents of predicted topic. and find k closest documents.
                knn = KNN(all_docs[prediction])
                k_neighbours = knn.find_k_neighbours(selected_doc, k_n)

                while True:
                    print("\nRecommended Articles for : " + selected_doc.title)
                    for i in range(len(k_neighbours)):
                        print(str(i+1) + ": " + k_neighbours[i].title)
                    next_choice = raw_input("\nEnter Next Choice: [Article num to read the article. "
                                            "'o' to read the original article. "
                                            "'b' to go back to article choice list.]  ")

                    if next_choice == 'b':
                        break
                    elif next_choice == 'o':
                        text = selected_doc.text
                        print("\nArticle Text for original title : " + selected_doc.title)
                        print(text)
                    else:
                        try:
                            n_choice = int(next_choice) - 1
                            if n_choice < 0 or n_choice >= k_n:
                                print("Invalid Choice.. Try Again..")
                                continue
                        except:
                            print("Invalid Choice.. Try Again..")
                            continue
                        text = k_neighbours[n_choice].text
                        print("\nArticle Text for recommended title : " + k_neighbours[n_choice].title)
                        print(text)
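
The voting rule in recommendation() takes the majority label whenever at least two classifiers agree, and otherwise falls back to the first classifier in the list, which is the most reliable one since the list is sorted by f_measure. A compact sketch of that rule with hypothetical topic labels:

from collections import Counter

predictions = ['sports', 'politics', 'sports']   # hypothetical classifier outputs
top = Counter(predictions).most_common(1)[0]     # ('sports', 2)
prediction = top[0] if top[1] > 1 else predictions[0]
print(prediction)                                # 'sports'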
Example #46
0
class ModelTester(object):
	def __init__(self, test_file, trained_file, global_file):
		self._doc = open(test_file)
		self._stats = json.load(open(global_file))
		self._n = self._stats["N"]
		self._gf = self._stats["freq"]
		self._knn = KNN(trained_file)
		self.result = {}
		self.output = []

	def test(self):
		c = 0 
		while True:
			line = self._doc.readline()
			if not line:
				break
			print "dsf"
			id = line.split(',')[0]
			line = line.replace(', ', ',')
			parts = line.split()
			if len(parts) < 2:
				raise ValueError

			labels,features = parts[0], ' '.join(parts[1:])
			tfidf = self.get_tfidf(features)
			top_cats = self._knn.find_knn(tfidf)

			# for i in top_cats:
			# 	l = line.split(" ")
			# 	try :
			# 		self.result[str(i)].append(line.replace('\n', ''))
			# 	except : 
			# 		self.result[str(i)] = []
			# 		self.result[str(i)].append(line.replace('\n', ''))
			
			stri = str(id)+","
			for i in top_cats : 
				stri=stri+" "+str(i)
			self.output.append(stri)
			if c % 10== 0 :print c,stri
			c =c +1 

	def write(self, output_file):
		'''
		fp = open(output_file+'.json', 'w')
		json.dump(self.result, fp)
		fp.close()	
		'''

		for o in self.output:
			print o



	def get_tfidf(self, features):
		features = re.findall(r'\d+:\d+', features)
		fs = ""
		doc_sum = 0.0
		for f in features:
			# print f
			r,w = f.split(':')
			w = float(w)
			doc_sum += w
		
		for f in features:
			r,w = f.split(':')
			w = float(w)
			w /= doc_sum
			try:
				w *= log(self._n/(self._gf[r]+1))
			except KeyError:
				w *= log(self._n)
			fs += r+":"+str(w)+" "
		return fs
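
For reference, get_tfidf above normalizes each raw term weight by the document's total weight (a term-frequency term) and scales it by log(N / (df + 1)), falling back to log(N) for terms missing from the global frequency table. A toy recomputation of that weighting, with made-up N and document frequencies:

from math import log

N = 1000.0
doc = {'12': 3.0, '47': 1.0}    # feature_id: raw weight, as parsed from "12:3 47:1"
df = {'12': 200.0, '47': 4.0}   # hypothetical corpus document frequencies
doc_sum = sum(doc.values())
for term, w in doc.items():
    tfidf = (w / doc_sum) * log(N / (df[term] + 1))
    print(term, round(tfidf, 3))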
Example #47
0
for elem in validationData:
    if elem[2]*av_score[elem[1]-1] > 0 or (elem[2]==0 and av_score[elem[1]-1] <= 0):
        accuracy+=1

print "Simple Accuracy:", np.around(100.0*accuracy/len(validationData)), "%"

############# PERSONAL PREF #############
print 20 * "#", "Personal Pref", 20 * "#"
jokeDataNew = list(jokeData)  # shallow copy; each row is replaced below, so jokeData is left intact
# replace nan by 0
for i in range(len(jokeData)):
    jokeDataNew[i] = [0 if np.isnan(x) else x for x in jokeData[i] ]
    
for k in [10, 100, 1000]:
    print "K Value:", k
    knn = KNN(k)
    knn.fit(jokeDataNew)
    neighbours = knn.neighbours
    av_score = []
    accuracy = 0
    for i in range(100):
        average_score = (np.mean([jokeDataNew[ind] for ind in neighbours[i]], 0))
        av_score.append(average_score)
        
    for elem in validationData:
        if (elem[2]*av_score[elem[0]-1][elem[1]-1] > 0) or (elem[2]==0 and av_score[elem[0]-1][elem[1]-1] <= 0):
            accuracy+=1
    
    print "Pref Accuracy:", np.around(100.0*accuracy/len(validationData)), "%"
        
############# LATENT FACTOR ANALYSIS #############
Example #48
0
def get_data():
    width = 8
    height = 8
    N = width * height
    X = np.zeros((N, 2))
    Y = np.zeros(N)
    n = 0
    start_t = 0
    for i in xrange(width):
        t = start_t
        for j in xrange(height):
            X[n] = [i, j]
            Y[n] = t
            n += 1
            t = (t + 1) % 2 # alternate between 0 and 1
        start_t = (start_t + 1) % 2
    return X, Y


if __name__ == '__main__':
    X, Y = get_data()

    # display the data
    plt.scatter(X[:,0], X[:,1], s=100, c=Y, alpha=0.5)
    plt.show()

    # get the accuracy
    model = KNN(3)
    model.fit(X, Y)
    print "Train accuracy:", model.score(X, Y)
Example #49
0
    def main(self, algo="KNN", textview=None):
        
        # Replaces "print"
        def print_output(text):
            if textview is not None:
                buf = textview.get_buffer()
                buf.insert_at_cursor(text + "\n")
                textview.scroll_mark_onscreen(buf.get_insert())
            else:
                log.info(text)
        
        
        # list of set types
        if self.validation == 1:
            listeTypesSet = ["train", "validation", "test"]
        else:
            listeTypesSet = ["train", "test"]

        # list of results used for the curves
        listeRes=[]

        # build the trainFile and testFile
        log.debug("Building the training files")
        tools.constructLfwNamesCurrent( self.nbExemples )   

        #TODO this actually no longer serves any purpose
        ( nbClassesLFW, nbClassesORL ) = tools.trainAndTestConstruction( self.pourcentageTrain, self.nbExemples )

        # Load the data
        dataTrain, dataTrainIndices, nClass = tools.loadImageData( "train", self.categorie)
        
        # PCA transformation
        print_output("Computing the eigenvectors...")
        pca_model = PCA( dataTrain )
        pca_model.transform() # project the data into the "eigen space"

        ##### KNN search
        if algo == "KNN":
            print_output("Starting the K-nearest-neighbours algorithm...")
            
            # Build the model for the KNN search
            knn_model = KNN( pca_model.getWeightsVectors(), dataTrainIndices, nClass, self.K )
            
            # Build the model for the Parzen windows
            parzen_model = ParzenWindows( pca_model.getWeightsVectors(), dataTrainIndices, nClass, self.Theta )

            ## TEST ###########################
            #TODO This whole part needs reworking to produce train,
            # validation and test graphs
            for trainTest in listeTypesSet:
                if trainTest == "train":
                    dataTest, dataTestIndices = dataTrain, dataTrainIndices
                else :
                    ### if no validation is performed, concatenate the original test and validation inputs to form the test set
                    #if "validation" not in listeTypesSet:
                        #dataTestInitial, dataTestInitialIndices, nClass = tools.loadImageData( "test", self.categorie )
                        #dataValidation, dataValidationIndices, nClass = tools.loadImageData( "validation", self.categorie )
                        #dataTest = np.zeros(dataTestInitial.size + dataValidation.size)
                        #dataTestIndices = np.zeros( dataTest.size )
                        #dataTest[ : dataTestInitial.size], dataTestIndices[ : dataTestInitial.size] = dataTestInitial, dataTestInitialIndices
                        #dataTest[dataTestInitial.size : ], dataTestIndices[dataTestInitial.size : ] = dataValidation, dataValidationIndices
                        
                        
                    #else:
                        dataTest, dataTestIndices, nClass = tools.loadImageData( trainTest, self.categorie )
                print_output("Projection des données de test...")
                dataTest_proj = pca_model.getProjection( dataTest )
                

                # counters of correct results
                nbGoodResult = 0
                nbGoodResult2 = 0 
                nbGoodResult3 = 0

                t_start = time.clock()
                for i in range(0, int( dataTest.shape[1] )):

                    # k = 1, for reference
                    # force k
                    knn_model.setK( 1 )
                    result1NN = knn_model.compute_predictions( dataTest_proj[:,i] )
                    if(result1NN == dataTestIndices[i]):
                        nbGoodResult += 1

                    # k = n
                    # restore k to its initial value
                    knn_model.setK( self.K )
                    resultKNN = knn_model.compute_predictions( dataTest_proj[:,i] )
                    if(resultKNN == dataTestIndices[i]):
                        nbGoodResult2 += 1

                
                    resultParzen = parzen_model.compute_predictions( dataTest_proj[:,i] )
                    if(resultParzen == dataTestIndices[i]):
                        nbGoodResult3 += 1
     
                    out_str = "Classic method: "+ str( result1NN ) +" | KNN method: "+ str( resultKNN ) +" | KNN+Parzen method: "+ str( resultParzen ) +" | Expected: "+ str( dataTestIndices[i] ) +"\n" # +1 car l'index de la matrice commence a 0
                    print_output(out_str)

                resClassic = (float(nbGoodResult) / float(dataTest.shape[1])) * 100.
                out_str = "\nAccuracy with classic method: %.3f" % resClassic + "%\n"
                resKNN = (nbGoodResult2 / float(dataTest.shape[1])) * 100.
                out_str += "Accuracy with KNN method (k="+ str( self.K ) +"): %.3f" % resKNN + "%\n"
                res = (nbGoodResult3 / float(dataTest.shape[1])) * 100.
                out_str += "Accuracy with KNN + Parzen window method (theta="+ str( self.Theta ) +"): %.3f" % res + "%\n"
                print_output(out_str)
                
                t_stop = time.clock()
                log.info("Temps total: %.4fs\n" % float(t_stop-t_start)) 

                #### collect the final error values
                listeRes.append( 100 - resClassic )
                listeRes.append( 100 - resKNN )
                listeRes.append( 100 - res )

            
        
        #### NNET search
        elif algo == "NNET":
			print_output("Début de l'algorithme du Perceptron multicouche...")
			
			# parameters, data, etc...
			dataTrain = pca_model.getWeightsVectors()
			dataTrainTargets = (dataTrainIndices - 1).reshape(dataTrainIndices.shape[0], -1)
			#! unlike KNN, the NNET takes the feature vectors as rows rather than columns
			train_set = np.concatenate((dataTrain.T, dataTrainTargets), axis=1)

			# fetch the validation data
			dataValidation, dataValidationIndices, nClass = tools.loadImageData( "validation", self.categorie )
			print_output("Projection des données de validation...")
			dataValidation_proj = pca_model.getProjection( dataValidation )
			dataValidationTargets = (dataValidationIndices - 1).reshape(dataValidationIndices.shape[0], -1)
			validation_set = np.concatenate((dataValidation_proj.T, dataValidationTargets), axis=1)

			# fetch the test data
			dataTest, dataTestIndices, nClass = tools.loadImageData( "test", self.categorie )
			print_output("Projection des données de test...")
			dataTest_proj = pca_model.getProjection( dataTest )
			dataTestTargets = (dataTestIndices - 1).reshape(dataTestIndices.shape[0], -1)
			test_set = np.concatenate((dataTest_proj.T, dataTestTargets), axis=1)

			# Build and train the NNET model
			nnet_model = NeuralNetwork( dataTrain.shape[0], self.n_hidden, nClass, self.lr, self.wd )
                        if self.validation == 1:
                            train_out, valid_out, test_out = nnet_model.train( train_set, self.n_epoch, self.batch_size, valid_set=validation_set, test_set=test_set)
                        else :
                            train_out, test_out = nnet_model.train( train_set, self.n_epoch, self.batch_size, test_set=test_set)

			# display the training curves
			x = []
			y = []
			y_err = []
			color = []
			legend = []
			legend_err = []
			filename = IMG_DIR + "Risque__Epoch_"+ str(self.n_epoch) +"_Hidden_"+ str(self.n_hidden) +"_Lr_"+ str(self.lr) +"_L2_"+ str(self.wd) + "_Categorie_" + str(self.categorie) + "_Batch_" + str(self.batch_size) + "_"
			filename_err = IMG_DIR + "Erreur_classification__Epoch_"+ str(self.n_epoch) +"_Hidden_"+ str(self.n_hidden) +"_Lr_"+ str(self.lr) +"_L2_"+ str(self.wd) + "_Categorie_" + str(self.categorie) + "_Batch_" + str(self.batch_size) + "_"

			train_out = np.array(train_out)
			x.append(np.array(xrange(train_out.shape[0])))
		
			# train curve parameters
			color.append('g-')
			legend.append("R Train")
			filename += "_Train"
			y.append(train_out[:,0])
			y_err.append(train_out[:,1])
			legend_err.append("Err Train")
			filename_err += "_Train"

                        # validation curve parameters
                        if self.validation == 1:
                            valid_out = np.array(valid_out)
                            x.append(np.array(xrange(valid_out.shape[0])))
                            y.append(valid_out[:,0])
                            y_err.append(valid_out[:,1])
                            color.append('b-')
                            legend.append("R Validation")
                            legend_err.append("Err Validation")
                            filename += "_Validation"
                            filename_err += "_Validation"

			# test curve parameters
			test_out = np.array(test_out)
			x.append(np.array(xrange(test_out.shape[0])))
			y.append(test_out[:,0])
			y_err.append(test_out[:,1])
			color.append('r-')
			legend.append("R Test")
			legend_err.append("Err Test")
			filename += "_Test"
			filename_err += "_Test"

			
			# display
			title = u"\nEpoch: " + str(self.n_epoch) + " - Batch size: " + str(self.batch_size) + u" - Hidden units: " + str(self.n_hidden) + "\nL2: " + str(self.wd) + " - Learning rate: " + str(self.lr) + u" - Category: " + str(self.categorie)
			tools.drawCurves(x, y, color, legend, bDisplay=True, filename=filename, title=title, xlabel="Epoch", ylabel=u"Regularized risk")
			tools.drawCurves(x, y_err, color, legend_err, bDisplay=True, filename=filename_err, title=title, xlabel="Epoch", ylabel="Classification error")

                         #### build a file for improved curves
                        if self.stock == 1 :
                            fichier = open("curvErrorNNet"+''.join( ''.join( title.split(' ') ).split('\n') ),"w")
                            fichier.write("#epoch errorTrain errorValidation errorTest\n")
                            
                            if len(x) == 3:
                            	for j in range(len( x[0] )):
                            	    fichier.write(str( x[0][j] )+" "+str( y[0][j] )+" "+str( y[1][j] )+" "+str( y[2][j] )+"\n")

                            fichier.close()

                        
			"""
			/!\ This part is no longer useful because it is done in the nnet during training
			
			## TEST ###########################
			#TODO This whole part needs reworking to produce train,
			# validation and test graphs
			
			# counters of correct results
			nbGoodResult = 0

			for i in range(0, int( dataTest.shape[1] )):

				#
				resultNNET = np.argmax(nnet_model.compute_predictions( dataTest_proj[:,i] ), axis=1)[0]
				if(resultNNET == dataTestTargets[i]):
					nbGoodResult += 1
				out_str = "Result: "+ str( resultNNET ) + " | Expected: "+ str( dataTestTargets[i] ) +"\n" # +1 car l'index de la matrice commence a 0
				print_output(out_str)

			res = (float(nbGoodResult) / float(dataTest.shape[1])) * 100.
			out_str = "\nAccuracy : %.3f" % res + "%\n"
			print_output(out_str)
            """            
   
        return listeRes
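
For readers who want a self-contained reference for the KNN branch above (PCA projection followed by nearest-neighbour classification in the eigenspace), here is a minimal sketch with scikit-learn stand-ins for the custom pca_model and knn_model classes; the data is synthetic:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.RandomState(0)
X_train = rng.rand(100, 64)               # e.g. flattened face images
y_train = rng.randint(0, 5, size=100)     # class indices
X_test = rng.rand(20, 64)

pca = PCA(n_components=10).fit(X_train)   # learn the "eigen space"
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(pca.transform(X_train), y_train)  # classify the projected weight vectors
print(knn.predict(pca.transform(X_test)))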