Code example #1
import numpy as np
# Assumed module path, mirroring the import used in code example #3 below.
from linear_classifier import LinearClassifier


def main():
    num_classes = 3
    X, y = get_master_data()
    train_X, train_y, val_X, val_y, test_X, test_y = split_data(X, y)

    # hyperparams
    learning_rates = [1]
    reg_strengths = [0]
    num_iters = 5000

    best_model = None
    best_accuracy = -1

    results = {}

    for lr in learning_rates:
        for rs in reg_strengths:

            this_model = LinearClassifier(X.shape[1], num_classes)
            this_model.train(train_X, train_y, lr, rs, num_iters)
            y_pred = this_model.predict(val_X)
            val_accuracy = np.mean(y_pred == val_y)

            y_pred = this_model.predict(train_X)
            train_accuracy = np.mean(y_pred == train_y)
            print('This val accuracy: ' + str(val_accuracy))

            if val_accuracy > best_accuracy:
                best_model = this_model
                best_accuracy = val_accuracy

            results[(lr, rs)] = (train_accuracy, val_accuracy)
            this_model.print_model()
            this_model.print_model()

    for lr, rs in sorted(results):
        train_accuracy, val_accuracy = results[(lr, rs)]
        print('lr %e reg %e train_accuracy %f val_accuracy %f' %
              (lr, rs, train_accuracy, val_accuracy))

    print(test_X[0])  # debug: peek at the first test sample
    y_pred_test = best_model.predict(test_X)

    test_accuracy = np.mean(y_pred_test == test_y)

    print('This test accuracy: ' + str(test_accuracy))
    best_model.toFile('rowan_rest_grip_flex_linear_classifier_3_' +
                      str(int(test_accuracy * 100)) + 'p.csv')
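Note: get_master_data() and split_data() are project-specific helpers that
this script does not include. As a rough sketch only (the split ratios, the
seed, and the helper body are assumptions inferred from the call site, not
the original implementation), split_data() could look like this:

def split_data(X, y, train_frac=0.7, val_frac=0.15, seed=0):
    # Shuffle once, then carve the indices into train / validation / test.
    rng = np.random.default_rng(seed)
    idx = rng.permutation(len(X))
    n_train = int(len(X) * train_frac)
    n_val = int(len(X) * val_frac)
    train = idx[:n_train]
    val = idx[n_train:n_train + n_val]
    test = idx[n_train + n_val:]
    return X[train], y[train], X[val], y[val], X[test], y[test]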
Code example #2
                if quantiles is not None:
                    options['quantiles'] = int(quantiles)
                model = SIL(n_jobs=n_jobs, **options)
                model.fit(X_train,
                          y_train,
                          calibrate=calibrate,
                          param_search=True,
                          sample_weight=sw)

            # Save model for future use.
            pickle_filename = cat_name + '_model_' + str(int(
                time.time())) + '.pkl'
            with open(pickle_filename, 'wb') as output_file:
                pickle.dump(model, output_file)

            p_predict = model.predict(X_test)
            y_predict = np.argmax(p_predict, axis=1)
            acc = sklearn.metrics.accuracy_score(y_test, y_predict)
            if len(y_test) == 1:
                auc = 0.0  # AUC is undefined for a single test sample
            elif len(np.unique(y_train)) == 2:
                # Binary case: AUC from the positive-class probability.
                auc = sklearn.metrics.roc_auc_score(y_test, p_predict[:, 1])
            else:
                # Multi-class case: macro-average of one-vs-rest AUCs.
                auc = 0.0
                for i in range(p_predict.shape[1]):
                    auc += sklearn.metrics.roc_auc_score(
                        y_test == i, p_predict[:, i])
                auc /= p_predict.shape[1]
            kappa = sklearn.metrics.cohen_kappa_score(y_test, y_predict)
            # np.unique already returns sorted labels; the original follow-up
            # np.sort(classes) call discarded its result and did nothing.
            classes = np.unique(y_train)
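Note: the multi-class branch above computes a macro-averaged one-vs-rest AUC
by hand. With scikit-learn >= 0.22 the same quantity can be obtained in one
call (a sketch, assuming the columns of p_predict are ordered by sorted class
label and every class appears in y_test):

auc = sklearn.metrics.roc_auc_score(y_test, p_predict,
                                    multi_class='ovr', average='macro')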
Code example #3
def main():

    # CELL 1
    # Import the general Python libraries.
    import numpy as np
    import matplotlib.pyplot as plt
    import itertools
    import sklearn.datasets
    from sklearn.datasets import make_classification

    # Import the assignment-specific modules.
    import utils
    from linear_classifier import LinearClassifier
    from two_layer_classifier import TwoLayerClassifier

    plt.rcParams['figure.figsize'] = (14.0, 8.0)  # set default size of plots

    # Notebook-only magics, kept for reference; they have no effect in a script.
    # %matplotlib inline
    # %load_ext autoreload
    # %autoreload 2

    # CELL 2
    # Generate the data
    X_, y_ = make_classification(1000,
                                 n_features=2,
                                 n_redundant=0,
                                 n_informative=2,
                                 n_clusters_per_class=1,
                                 n_classes=3,
                                 random_state=6)

    # Standardize the data (mean = 0, standard deviation = 1)
    mean = np.mean(X_, axis=0)
    std = np.std(X_, axis=0)
    X_ = (X_ - mean) / std
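    # Note (added): strictly speaking, mean and std should be computed on the
    # training split only and then applied to validation/test, otherwise test
    # statistics leak into the preprocessing; with this simple synthetic
    # dataset the difference is negligible, which is presumably why the data
    # is standardized before splitting.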

    # Plot
    plt.figure(figsize=(8, 6))
    plt.scatter(X_[:, 0], X_[:, 1], c=y_, edgecolors='k', cmap=plt.cm.Paired)
    plt.show(block=False)

    # CELL 3
    num_val = 200
    num_test = 200
    num_train = 600
    np.random.seed(1)
    idx = np.random.permutation(len(X_))

    train_idx = idx[:num_train]
    val_idx = idx[num_train:num_train + num_val]
    test_idx = idx[-num_test:]

    X_train = X_[train_idx]
    y_train = y_[train_idx]
    X_val = X_[val_idx]
    y_val = y_[val_idx]
    X_test = X_[test_idx]
    y_test = y_[test_idx]

    # Plot
    plt.figure(figsize=(8, 6))
    plt.scatter(X_train[:, 0],
                X_train[:, 1],
                c=y_train,
                edgecolors='k',
                cmap=plt.cm.Paired)
    plt.title('Training data')
    plt.show(block=False)

    plt.figure(figsize=(8, 6))
    plt.scatter(X_val[:, 0],
                X_val[:, 1],
                c=y_val,
                edgecolors='k',
                cmap=plt.cm.Paired)
    plt.title('Validation data')
    plt.show(block=False)

    plt.figure(figsize=(8, 6))
    plt.scatter(X_test[:, 0],
                X_test[:, 1],
                c=y_test,
                edgecolors='k',
                cmap=plt.cm.Paired)
    plt.title('Test data')
    plt.show(block=False)

    # CELL 4
    accu = utils.test_sklearn_svm(X_train, y_train, X_test, y_test)
    print('Test accuracy: {:.3f}'.format(accu))
    if accu < 0.7:
        print('ERROR: accuracy is too low; something is wrong with the data. '
              'Try reshuffling (cell above).')

    # CELL 5
    # First, check the model's prediction, the "forward pass".
    # 1. Build the model with random weights W.
    model = LinearClassifier(X_train,
                             y_train,
                             X_val,
                             y_val,
                             num_classes=3,
                             bias=True)

    # 2. Call the function that computes the accuracy and the mean loss over the whole training set.
    _, loss = model.global_accuracy_and_cross_entropy_loss(X_train, y_train)

    # 3. Compare with the expected value.
    loss_attendu = -np.log(
        1.0 / 3.0)  # expected loss for random weights, i.e. -log(1/num_classes)
    print('Output: {}  Expected: {}'.format(loss, loss_attendu))
    if abs(loss - loss_attendu) > 0.05:
        print('ERROR: the function output is incorrect.')
    else:
        print('SUCCESS')
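    # Quick numeric check (added): with uniform softmax outputs over 3
    # classes, every sample's cross-entropy is -log(1/3) ~= 1.0986, whatever
    # the true label, so the expected value above is a constant.
    assert abs(loss_attendu - 1.0986) < 1e-3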

    # CELL 6
    # Check: you should be able to overfit a handful of samples.
    # If the accuracy stays low, your implementation has a bug.
    n_check = 5
    X_check = X_train[:n_check]
    y_check = y_train[:n_check]
    model = LinearClassifier(X_check,
                             y_check,
                             X_val,
                             y_val,
                             num_classes=3,
                             bias=True)
    loss_train_curve, loss_val_curve, accu_train_curve, accu_val_curve = model.train(
        num_epochs=10, lr=1.0, l2_reg=0.0)
    accu_train_finale = accu_train_curve[-1]
    print('Training accuracy, should be 1.0: {:.3f}'.format(
        accu_train_finale))
    if accu_train_finale < 0.9999:
        print('WARNING: accuracy is not 100%.')
        utils.plot_curves(loss_train_curve, loss_val_curve, accu_train_curve,
                          accu_val_curve)
    else:
        print('SUCCESS')

    # CELL 7
    # Take a small sample again and test different values of l2_reg.
    n_check = 5
    X_check = X_train[:n_check]
    y_check = y_train[:n_check]
    model = LinearClassifier(X_check,
                             y_check,
                             X_val,
                             y_val,
                             num_classes=3,
                             bias=True)

    for l2_r in np.arange(0, 1, 0.05):
        loss_train_curve, loss_val_curve, accu_train_curve, accu_val_curve = model.train(
            num_epochs=10, lr=1.0, l2_reg=l2_r)
        print('l2_reg= {:.4f} >> Training loss/accuracy: {:.3f} {:.3f}'.format(
            l2_r, loss_train_curve[-1], accu_train_curve[-1]))

    # CELL 8
    # Instantiate and train the model, this time on the full dataset.
    model = LinearClassifier(X_train,
                             y_train,
                             X_val,
                             y_val,
                             num_classes=3,
                             bias=True)
    loss_train_curve, loss_val_curve, accu_train_curve, accu_val_curve = model.train(
        lr=0.001, num_epochs=25, l2_reg=0.01)

    # Plot the loss and the accuracy (fraction correctly classified) at each epoch.
    utils.plot_curves(loss_train_curve, loss_val_curve, accu_train_curve,
                      accu_val_curve)

    print('[Training]   Loss: {:.3f}   Accuracy: {:.3f}'.format(
        loss_train_curve[-1], accu_train_curve[-1]))
    print('[Validation] Loss: {:.3f}   Accuracy: {:.3f}'.format(
        loss_val_curve[-1], accu_val_curve[-1]))

    # CELL 9
    lr_choices = [1e-2, 1e-1, 1.0, 10.0]
    reg_choices = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
    lr_decay = 0.995  # learning rate is multiplied by this factor after each step

    best_accu = -1
    best_params = None
    best_model = None
    best_curves = None

    for lr, reg in itertools.product(lr_choices, reg_choices):
        params = (lr, reg)
        # Train a fresh classifier for each configuration; reusing the same
        # instance would let later runs start from already-trained weights
        # and bias the search.
        model = LinearClassifier(X_train, y_train, X_val, y_val,
                                 num_classes=3, bias=True)
        curves = model.train(num_epochs=25,
                             lr=lr,
                             l2_reg=reg,
                             lr_decay=lr_decay)
        loss_train_curve, loss_val_curve, accu_train_curve, accu_val_curve = curves

        val_accu = accu_val_curve[-1]
        if val_accu > best_accu:
            print('Best val accuracy: {:.3f} | lr: {:.0e} | l2_reg: {:.0e}'.
                  format(val_accu, lr, reg))
            best_accu = val_accu
            best_params = params
            best_model = model
            best_curves = curves

    model = best_model
    utils.plot_curves(*best_curves)

    # CELL 10
    # Re-train the model with the best hyperparameters.
    lr, reg = best_params
    model.train(num_epochs=25, lr=lr, l2_reg=reg, lr_decay=lr_decay)

    pred = model.predict(X_test)
    accu = (pred == y_test).mean()
    print('Test accuracy: {:.3f}'.format(accu))

    # CELL 11
    h = 0.01  # controls the grid resolution
    x_min, x_max = X_[:, 0].min() - .5, X_[:, 0].max() + .5  # grid limits
    y_min, y_max = X_[:, 1].min() - .5, X_[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))  # build the grid

    X_predict = np.c_[xx.ravel(), yy.ravel()]  # flatten the grid into a list of points
    Z = model.predict(X_predict)  # classify every grid point
    Z = Z.reshape(xx.shape)  # reshape back to 2D

    plt.figure(figsize=(14, 8))
    plt.pcolormesh(xx, yy, Z,
                   cmap=plt.cm.Paired)  # color each cell by its prediction

    X_plot, y_plot = X_train, y_train
    plt.scatter(X_plot[:, 0],
                X_plot[:, 1],
                c=y_plot,
                edgecolors='k',
                cmap=plt.cm.Paired)  # plot the data points

    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())

    plt.title('Decision boundaries')
    plt.show(block=False)
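    # Note (added): the mesh holds roughly ((x_max - x_min) / h) *
    # ((y_max - y_min) / h) points, all classified in a single predict()
    # call; with h = 0.01 on standardized data that is easily several
    # hundred thousand points, so raise h if this cell is slow.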

    # CELL 12
    # Choose the type of dataset you want.

    # IMPORTANT NOTE: you are encouraged to try different datasets. That said,
    # your solution will be graded with Ncircles (N=4), so be sure to test
    # that option.
    dataset_type = 'Ncircles'
    if dataset_type == 'moons':
        X_, y_ = sklearn.datasets.make_moons(n_samples=200, noise=0.5)
        num_classes = 2
    elif dataset_type == 'gaussian_quantiles':
        X_, y_ = sklearn.datasets.make_gaussian_quantiles(n_samples=200,
                                                          n_classes=2)
        num_classes = 2
    elif dataset_type == '4blobs':
        d = 4
        c1a = np.random.randn(50, 2)
        c1b = np.random.randn(50, 2) + (d, d)
        c2a = np.random.randn(50, 2) + (0, d)
        c2b = np.random.randn(50, 2) + (d, 0)
        X_ = np.concatenate([c1a, c1b, c2a, c2b], axis=0)
        y_ = np.array([0] * 100 + [1] * 100)
        num_classes = 2
    elif dataset_type == '2circles':
        X_, y_ = sklearn.datasets.make_circles(n_samples=200)
        num_classes = 2
    elif dataset_type == 'Ncircles':
        samples_per_class = 100
        num_classes = 4
        angles = np.linspace(0, 2 * np.pi, samples_per_class)
        radius = 1.0 + np.arange(num_classes) * 0.3
        px = np.cos(angles[:, None]) * radius[None, :]  # (samples_per_class, num_classes) = (100, 4)
        py = np.sin(angles[:, None]) * radius[None, :]  # (samples_per_class, num_classes) = (100, 4)
        X_ = np.stack([px, py], axis=-1).reshape(
            (samples_per_class * num_classes, 2))
        X_ += np.random.randn(len(X_[:, 0]), 2) / 8
        y_ = np.array(list(range(num_classes)) * samples_per_class)
    else:
        print('Invalid dataset type')
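    # Sanity check (added): whichever branch ran above, features and labels
    # must stay aligned.
    assert len(X_) == len(y_), 'X_ and y_ have mismatched lengths'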

    # CELL 13
    plt.figure()
    plt.scatter(X_[:, 0], X_[:, 1], c=y_, cmap=plt.cm.Paired)
    plt.title('Full dataset')

    plt.show(block=False)

    # CELL 14
    train_proportion = 0.5
    val_proportion = 0.2
    num_train = int(len(X_) * train_proportion)
    num_val = int(len(X_) * val_proportion)

    np.random.seed(0)
    idx = np.random.permutation(len(X_))

    train_idx = idx[:num_train]
    val_idx = idx[num_train:num_train + num_val]
    test_idx = idx[num_train + num_val:]

    X_train = X_[train_idx]
    y_train = y_[train_idx]
    X_val = X_[val_idx]
    y_val = y_[val_idx]
    X_test = X_[test_idx]
    y_test = y_[test_idx]

    # CELL 15
    # Now plot the training, validation, and test data.
    plt.figure()
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.Paired)
    plt.title('Train')
    plt.show(block=False)

    plt.figure()
    plt.scatter(X_val[:, 0], X_val[:, 1], c=y_val, cmap=plt.cm.Paired)
    plt.title('Validation')
    plt.show(block=False)

    plt.figure()
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=plt.cm.Paired)
    plt.title('Test')
    plt.show(block=False)

    # CELL 16
    # Instantiate a two-layer network (re-created identically in CELL 17).
    num_hidden_neurons = 10
    model = TwoLayerClassifier(X_train,
                               y_train,
                               X_val,
                               y_val,
                               num_features=2,
                               num_hidden_neurons=num_hidden_neurons,
                               num_classes=num_classes)

    # CELL 17
    # Check that a randomly initialized network gives an equal prediction for each class.
    num_hidden_neurons = 10
    model = TwoLayerClassifier(X_train,
                               y_train,
                               X_val,
                               y_val,
                               num_features=2,
                               num_hidden_neurons=num_hidden_neurons,
                               num_classes=num_classes)

    # 2. Call the function that computes the accuracy and the mean loss over the whole training set.
    _, loss = model.global_accuracy_and_cross_entropy_loss(X_train, y_train, 0)

    # 3. Compare with the expected value.
    loss_attendu = -np.log(
        1.0 /
        num_classes)  # expected loss for random weights, i.e. -log(1/num_classes)
    print('Output: {}  Expected: {}'.format(loss, loss_attendu))
    if abs(loss - loss_attendu) > 0.05:
        print('ERROR: the function output is incorrect.')
    else:
        print('SUCCESS')

    # CELL 18
    # Check that increasing the L2 regularization also increases the loss.
    for l2_r in np.arange(0, 2, 0.1):
        _, loss = model.global_accuracy_and_cross_entropy_loss(
            X_train, y_train, l2_r)
        print('l2_reg= {:.4f} >> Training loss: {:.3f}'.format(l2_r, loss))

    # CELL 19
    # Check: you should be able to overfit a handful of samples.
    # If the accuracy stays low, your implementation has a bug.
    n_check = 5
    X_check = X_train[:n_check]
    y_check = y_train[:n_check]
    model = TwoLayerClassifier(X_check,
                               y_check,
                               X_val,
                               y_val,
                               num_features=2,
                               num_hidden_neurons=num_hidden_neurons,
                               num_classes=num_classes)

    loss_train_curve, loss_val_curve, accu_train_curve, accu_val_curve = model.train(
        num_epochs=200, lr=0.01, l2_reg=0.0)
    print('Training accuracy, should be 1.0: {:.3f}'.format(
        accu_train_curve[-1]))
    if accu_train_curve[-1] < 0.98:
        print('WARNING: accuracy is not 100%.')
        utils.plot_curves(loss_train_curve, loss_val_curve, accu_train_curve,
                          accu_val_curve)
    else:
        print('SUCCESS')

    # CELL 20
    # Check that training with increasing L2 regularization raises the loss
    # and eventually lowers the accuracy.
    for l2_r in np.arange(0, 1, 0.1):
        loss_train_curve, loss_val_curve, accu_train_curve, accu_val_curve = model.train(
            num_epochs=200, lr=0.01, l2_reg=l2_r)
        print('l2_reg= {:.4f} >> Training loss/accuracy: {:.3f} {:.3f}'.format(
            l2_r, loss_train_curve[-1], accu_train_curve[-1]))

    # CELL 21
    # Instantiate the model, this time with the full dataset.
    num_hidden_neurons = 20
    model = TwoLayerClassifier(X_train,
                               y_train,
                               X_val,
                               y_val,
                               num_features=2,
                               num_hidden_neurons=num_hidden_neurons,
                               num_classes=num_classes,
                               activation='relu')

    # CELL 22
    loss_train_curve, loss_val_curve, accu_train_curve, accu_val_curve = model.train(
        num_epochs=200, lr=1e-2, l2_reg=0.0, momentum=0.5)
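    # Note (added): momentum is assumed here to follow the classical update
    # v <- momentum * v - lr * grad,  w <- w + v, which smooths the descent
    # direction across epochs; the exact rule depends on the course's
    # TwoLayerClassifier implementation.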

    # Plot the loss and the accuracy (fraction correctly classified) at each epoch.
    utils.plot_curves(loss_train_curve, loss_val_curve, accu_train_curve,
                      accu_val_curve)

    print('[Training]   Loss: {:.3f}   Accuracy: {:.3f}'.format(
        loss_train_curve[-1], accu_train_curve[-1]))
    print('[Validation] Loss: {:.3f}   Accuracy: {:.3f}'.format(
        loss_val_curve[-1], accu_val_curve[-1]))

    # CELL 23
    # Find the best hyperparameters lr and l2_reg
    lr_choices = [1e-4, 1e-3, 1e-2]
    reg_choices = [1e-1, 1e-2, 1e-3, 1e-4, 0]
    lr_decay = 1.0  # 0.995  # learning rate is multiplied by this factor after each step

    best_accu = -1
    best_params = None
    best_model = None
    best_curves = None

    for lr, reg in itertools.product(lr_choices, reg_choices):
        params = (lr, reg)
        # As in CELL 9, train a fresh network for each configuration so that
        # every run starts from random weights.
        model = TwoLayerClassifier(X_train, y_train, X_val, y_val,
                                   num_features=2,
                                   num_hidden_neurons=num_hidden_neurons,
                                   num_classes=num_classes,
                                   activation='relu')
        curves = model.train(num_epochs=50,
                             lr=lr,
                             l2_reg=reg,
                             lr_decay=lr_decay,
                             momentum=0.5)
        loss_train_curve, loss_val_curve, accu_train_curve, accu_val_curve = curves

        val_accu = accu_val_curve[-1]
        if val_accu > best_accu:
            print('Best val accuracy: {:.3f} | lr: {:.0e} | l2_reg: {:.0e}'.
                  format(val_accu, lr, reg))
            best_accu = val_accu
            best_params = params
            best_model = model
            best_curves = curves
        else:
            print('accuracy: {:.3f} | lr: {:.0e} | l2_reg: {:.0e}'.format(
                val_accu, lr, reg))

    model = best_model
    utils.plot_curves(*best_curves)

    # CELL 24
    # Re-train the model with the best hyperparameters.
    lr, reg = best_params
    print(best_params)
    curves = model.train(num_epochs=200, lr=lr, l2_reg=reg, momentum=0.5)
    loss_train_curve, loss_val_curve, accu_train_curve, accu_val_curve = curves

    pred = model.predict(X_test)
    accu = (pred == y_test).mean()
    print('Test accuracy: {:.3f}'.format(accu))
    utils.plot_curves(*curves)

    # CELL 25
    # Visualize the results.

    h = 0.05  # controls the grid resolution
    x_min, x_max = X_[:, 0].min() - .5, X_[:, 0].max() + .5  # grid limits
    y_min, y_max = X_[:, 1].min() - .5, X_[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))  # build the grid

    X_predict = np.c_[xx.ravel(), yy.ravel()]  # flatten the grid into a list of points
    Z = model.predict(X_predict)  # classify every grid point
    Z = Z.reshape(xx.shape)  # reshape back to 2D

    plt.figure(figsize=(14, 8))
    plt.pcolormesh(xx, yy, Z,
                   cmap=plt.cm.Paired)  # color each cell by its prediction

    X_plot, y_plot = X_, y_
    plt.scatter(X_plot[:, 0],
                X_plot[:, 1],
                c=y_plot,
                edgecolors='k',
                cmap=plt.cm.Paired)  # plot the data points

    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())

    plt.title('Decision boundaries')
    plt.show()