Ejemplo n.º 1
0
 def test_model_selection_nb_best_b(self):
     """Check that ``model_selection_nb`` selects the expected ``b`` value.

     Inputs and the reference value are read from the
     ``test_data['model_selection_NB']`` fixture; only the third element
     of the returned 4-tuple (``best_b``) is verified here.
     """
     data = test_data['model_selection_NB']
     _, _, best_b, _ = model_selection_nb(data['Xtrain'], data['Xval'], data['ytrain'],
                                          data['yval'], data['a_values'], data['b_values'])
     # assertEquals is a deprecated alias that was removed in Python 3.12;
     # assertEqual is the supported spelling.
     self.assertEqual(best_b, data['best_b'])
Ejemplo n.º 2
0
    def test_model_selection_nb_errors(self):
        """Check the error array returned by ``model_selection_nb``.

        Inputs and the reference array are read from the
        ``test_data['model_selection_NB']`` fixture. The fourth element of
        the returned 4-tuple must have the same shape as the reference and
        its largest absolute deviation must round to zero at 8 decimal
        places.
        """
        data = test_data['model_selection_NB']
        expected_errors = data['errors']
        _, _, _, errors = model_selection_nb(data['Xtrain'], data['Xval'], data['ytrain'],
                                             data['yval'], data['a_values'], data['b_values'])

        # The subtraction below broadcasts, so a scalar or otherwise
        # wrong-shaped result could pass unnoticed without this check.
        self.assertEqual(np.shape(errors), np.shape(expected_errors))

        max_diff = np.max(np.abs(expected_errors - errors))
        self.assertAlmostEqual(max_diff, 0, 8)
Ejemplo n.º 3
0
    def test_model_selection_nb_best_a(self):
        """Verify the ``a`` value chosen by ``model_selection_nb``.

        The second element of the returned 4-tuple must be a single value
        equal to ``best_a`` stored in the ``model_selection_NB`` fixture.
        """
        fixture = TEST_DATA['model_selection_NB']
        wanted_a = fixture['best_a']

        input_keys = ('Xtrain', 'Xval', 'ytrain', 'yval', 'a_values', 'b_values')
        call_args = [fixture[key] for key in input_keys]
        outcome = model_selection_nb(*call_args)
        _, chosen_a, _, _ = outcome

        self.assertEqual(np.size(chosen_a), 1)
        self.assertEqual(chosen_a, wanted_a)
Ejemplo n.º 4
0
    def test_model_selection_nb_errors(self):
        """Compare the error grid from ``model_selection_nb`` with the fixture.

        The fourth element of the returned 4-tuple must have shape
        ``(3, 3)`` and match ``errors`` from the ``model_selection_NB``
        fixture to numpy's default ``assert_almost_equal`` precision.
        """
        fixture = TEST_DATA['model_selection_NB']
        reference_grid = fixture['errors']

        input_keys = ('Xtrain', 'Xval', 'ytrain', 'yval', 'a_values', 'b_values')
        call_args = [fixture[key] for key in input_keys]
        outcome = model_selection_nb(*call_args)
        _, _, _, error_grid = outcome

        self.assertEqual(np.shape(error_grid), (3, 3))
        np.testing.assert_almost_equal(error_grid, reference_grid)
Ejemplo n.º 5
0
    def test_model_selection_nb_best_error(self):
        """Verify the best error reported by ``model_selection_nb``.

        The first element of the returned 4-tuple must be a single value
        that is almost equal to ``error_best`` from the
        ``model_selection_NB`` fixture.
        """
        fixture = TEST_DATA['model_selection_NB']
        wanted_error = fixture['error_best']

        outcome = model_selection_nb(
            fixture['Xtrain'], fixture['Xval'],
            fixture['ytrain'], fixture['yval'],
            fixture['a_values'], fixture['b_values'],
        )
        lowest_error, _, _, _ = outcome

        self.assertEqual(np.size(lowest_error), 1)
        self.assertAlmostEqual(lowest_error, wanted_error)
Ejemplo n.º 6
0
    def test_model_selection_nb_errors(self):
        """Compare the error grid from ``model_selection_nb`` with the fixture.

        The fourth element of the returned 4-tuple must have shape
        ``(3, 3)`` and match the fixture's ``errors`` entry to numpy's
        default ``assert_almost_equal`` precision.
        """
        fixture = TEST_DATA['model_selection_NB']
        call_args = (
            fixture['Xtrain'],
            fixture['Xval'],
            fixture['ytrain'],
            fixture['yval'],
            fixture['a_values'],
            fixture['b_values'],
        )
        reference_grid = fixture['errors']

        outcome = model_selection_nb(*call_args)
        _, _, _, error_grid = outcome

        self.assertEqual(np.shape(error_grid), (3, 3))
        np.testing.assert_almost_equal(error_grid, reference_grid)
Ejemplo n.º 7
0
    def test_model_selection_nb_best_b(self):
        """Verify the ``b`` value chosen by ``model_selection_nb``.

        The third element of the returned 4-tuple must be a single value
        equal to the fixture's ``best_b`` entry.
        """
        fixture = TEST_DATA['model_selection_NB']
        call_args = (
            fixture['Xtrain'],
            fixture['Xval'],
            fixture['ytrain'],
            fixture['yval'],
            fixture['a_values'],
            fixture['b_values'],
        )
        wanted_b = fixture['best_b']

        outcome = model_selection_nb(*call_args)
        _, _, chosen_b, _ = outcome

        self.assertEqual(np.size(chosen_b), 1)
        self.assertEqual(chosen_b, wanted_b)
Ejemplo n.º 8
0
    def test_model_selection_nb_best_error(self):
        """Verify the best error reported by ``model_selection_nb``.

        The first element of the returned 4-tuple must be a single value
        that is almost equal to the fixture's ``error_best`` entry.
        """
        fixture = TEST_DATA['model_selection_NB']
        call_args = (
            fixture['Xtrain'],
            fixture['Xval'],
            fixture['ytrain'],
            fixture['yval'],
            fixture['a_values'],
            fixture['b_values'],
        )
        wanted_error = fixture['error_best']

        outcome = model_selection_nb(*call_args)
        lowest_error, _, _, _ = outcome

        self.assertEqual(np.size(lowest_error), 1)
        self.assertAlmostEqual(lowest_error, wanted_error)
Ejemplo n.º 9
0
def run_training():
    """Run the KNN / naive Bayes model-selection demo from start to finish.

    Steps, in order:

    1. Load the dataset with ``load_data``.
    2. Select ``k`` with ``model_selection_knn`` over the odd values
       produced by ``range(1, 201, 2)`` and plot the errors.
    3. Select ``a`` and ``b`` with ``model_selection_nb`` and plot the
       error grid.
    4. Estimate ``p_x_y`` with the selected ``a``/``b`` and try to draw
       word clouds of the 50 highest-probability words per class.
    5. Compute the KNN and NB errors on ``Xtest``/``ytest`` and plot them
       together.

    Progress messages (in Polish) are printed to stdout. Returns ``None``.
    """
    dataset = load_data()

    # Messages that are printed more than once.
    wait_notice = '--------------------- To moze potrwac ok. 1 min ------------------------'
    continue_prompt = '\n--- Wcisnij klawisz, aby kontynuowac ---'

    # ---- KNN model selection ----
    k_values = range(1, 201, 2)
    print('\n------------- Selekcja liczby sasiadow dla modelu dla KNN -------------')
    print('-------------------- Wartosci k: 1, 3, ..., 200 -----------------------')
    print(wait_notice)

    knn_best_error, best_k, knn_errors = model_selection_knn(
        dataset['Xval'], dataset['Xtrain'], dataset['yval'], dataset['ytrain'],
        k_values)
    knn_summary = 'Najlepsze k: {num1} i najlepszy blad: {num2:.4f}'
    print(knn_summary.format(num1=best_k, num2=knn_best_error))
    print(continue_prompt)
    classification_KNN_vs_no_neighbours(k_values, knn_errors)

    # ---- NB model selection ----
    a_values = [1, 3, 10, 30, 100, 300, 1000]
    b_values = [1, 3, 10, 30, 100, 300, 1000]

    print('\n----------------- Selekcja parametrow a i b dla NB --------------------')
    print('--------- Wartosci a i b: 1, 3, 10, 30, 100, 300, 1000 -----------------')
    print(wait_notice)

    nb_best_error, best_a, best_b, nb_errors = model_selection_nb(
        dataset['Xtrain'], dataset['Xval'], dataset['ytrain'], dataset['yval'],
        a_values, b_values)

    nb_summary = 'Najlepsze a: {}, b: {} i najlepszy blad: {:.4f}'
    print(nb_summary.format(best_a, best_b, nb_best_error))
    print(continue_prompt)
    plot_a_b_errors(nb_errors, a_values, b_values)
    p_x_y = estimate_p_x_y_nb(dataset['Xtrain'], dataset['ytrain'], best_a,
                              best_b)

    # ---- Word clouds of the most probable words per class ----
    n_classes = p_x_y.shape[0]
    print('\n------Wizualizacja najbardziej popularnych slow dla poszczegolnych klas------')
    print('--Sa to slowa o najwyzszym prawdopodobienstwie w danej klasie dla modelu NB--')

    # Best effort: any failure in this section is reported as a word-cloud
    # problem and the demo carries on.
    try:
        class_names = dataset['groupnames']
        top_words = {}
        for class_idx in range(n_classes):
            # Indices of the 50 largest probabilities, in descending order.
            top_idx = np.argsort(p_x_y[class_idx, :])[::-1][:50]
            top_words[class_names[class_idx]] = dict(
                zip(dataset['wordlist'][top_idx], p_x_y[class_idx, top_idx]))
        word_clouds(top_words.values(), top_words.keys())
    except Exception:
        print('---Wystapil problem z biblioteka wordcloud--- ')

    print(continue_prompt)

    # ---- Test-set comparison of KNN and NB ----
    print('\n----------------Porownanie bledow dla KNN i NB---------------------')

    distances = hamming_distance(dataset['Xtest'], dataset['Xtrain'])
    sorted_labels = sort_train_labels_knn(distances, dataset['ytrain'])
    knn_posteriors = p_y_x_knn(sorted_labels, best_k)
    error_KNN = classification_error(knn_posteriors, dataset['ytest'])

    p_y = estimate_a_priori_nb(dataset['ytrain'])
    nb_posteriors = p_y_x_nb(p_y, p_x_y, dataset['Xtest'])
    error_NB = classification_error(nb_posteriors, dataset['ytest'])

    plot_error_NB_KNN(error_NB, error_KNN)
    print(continue_prompt)
Ejemplo n.º 10
0
def run_training():
    """Run the KNN / naive Bayes model-selection demo from start to finish.

    Steps, in order:

    1. Load the dataset with ``load_data``.
    2. Select ``k`` with ``model_selection_knn`` over the odd values
       produced by ``range(1, 201, 2)`` and plot the errors.
    3. Select ``a`` and ``b`` with ``model_selection_nb`` and plot the
       error grid.
    4. Estimate ``p_x_y`` with the selected ``a``/``b`` and try to draw
       word clouds of the 50 highest-probability words per class.
    5. Compute the KNN and NB errors on ``Xtest``/``ytest`` and plot them
       together.

    Progress messages are printed to stdout. Returns ``None``.
    """
    dataset = load_data()

    # Messages that are printed more than once.
    wait_notice = '--------------------- Calculation may take up to 1 min ------------------------'
    continue_prompt = '\n--- Press any key to continue ---'

    # ---- KNN model selection ----
    k_values = range(1, 201, 2)
    print('\n------------- Model selection for KNN -------------')
    print('-------------------- Values k: 1, 3, ..., 200 -----------------------')
    print(wait_notice)

    knn_best_error, best_k, knn_errors = model_selection_knn(
        dataset['Xval'], dataset['Xtrain'], dataset['yval'], dataset['ytrain'],
        k_values)
    knn_summary = 'The best k: {num1} and the best error: {num2:.4f}'
    print(knn_summary.format(num1=best_k, num2=knn_best_error))
    print(continue_prompt)
    classification_KNN_vs_no_neighbours(k_values, knn_errors)

    # ---- NB model selection ----
    a_values = [1, 3, 10, 30, 100, 300, 1000]
    b_values = [1, 3, 10, 30, 100, 300, 1000]

    print('\n----------------- Model selection for a and b --------------------')
    print('--------- Values a and b: 1, 3, 10, 30, 100, 300, 1000 -----------------')
    print(wait_notice)

    nb_best_error, best_a, best_b, nb_errors = model_selection_nb(
        dataset['Xtrain'], dataset['Xval'], dataset['ytrain'], dataset['yval'],
        a_values, b_values)

    nb_summary = 'The best a: {}, b: {} and the best error: {:.4f}'
    print(nb_summary.format(best_a, best_b, nb_best_error))
    print(continue_prompt)
    plot_a_b_errors(nb_errors, a_values, b_values)
    p_x_y = estimate_p_x_y_nb(dataset['Xtrain'], dataset['ytrain'], best_a,
                              best_b)

    # ---- Word clouds of the most probable words per class ----
    n_classes = p_x_y.shape[0]
    print('\n------ Visualization of most popular words for each class ------')
    print('-- These are words that are most probable for each class and NB model --')

    # Best effort: any failure in this section is reported as a word-cloud
    # problem and the demo carries on.
    try:
        class_names = dataset['groupnames']
        top_words = {}
        for class_idx in range(n_classes):
            # Indices of the 50 largest probabilities, in descending order.
            top_idx = np.argsort(p_x_y[class_idx, :])[::-1][:50]
            top_words[class_names[class_idx]] = dict(
                zip(dataset['wordlist'][top_idx], p_x_y[class_idx, top_idx]))
        word_clouds(top_words.values(), top_words.keys())
    except Exception:
        print('--- A problem with wordcloud library --- ')

    print(continue_prompt)

    # ---- Test-set comparison of KNN and NB ----
    print('\n---------------- Comparison of KNN and NB errors ---------------------')

    distances = hamming_distance(dataset['Xtest'], dataset['Xtrain'])
    sorted_labels = sort_train_labels_knn(distances, dataset['ytrain'])
    knn_posteriors = p_y_x_knn(sorted_labels, best_k)
    error_KNN = classification_error(knn_posteriors, dataset['ytest'])

    p_y = estimate_a_priori_nb(dataset['ytrain'])
    nb_posteriors = p_y_x_nb(p_y, p_x_y, dataset['Xtest'])
    error_NB = classification_error(nb_posteriors, dataset['ytest'])

    plot_error_NB_KNN(error_NB, error_KNN)
    print(continue_prompt)