Example #1
def main(datapath, modelpath, idxs):
    dataset_names = get_filenames(datapath)
    print(f'using sets {dataset_names} from {datapath}')
    print('looking for idx files')
    idxs = load_pkl(idxs)  # 'idxs' is the path to a pickled dict of train/test index arrays
    idxs_train = idxs['train']
    idxs_test = idxs['test']

    for dataset_name in dataset_names:
        dataset: Dataset = load_pkl(datapath+dataset_name)
        X = dataset.data.detach().numpy()
        Y = dataset.get_labels_numerical()
        x_train, x_test = X[idxs_train], X[idxs_test]
        y_train, y_test = Y[idxs_train], Y[idxs_test]
        hiddim = dataset_name.split('/')[-1].split('_')[2]  # embedding dim encoded in the filename
        print(f'training gmlvq on {hiddim} dim embedding')
        gmlvq = GmlvqModel()
        gmlvq.fit(x_train, y_train)
        train_error = get_error(gmlvq, x_train, y_train)
        test_error = get_error(gmlvq, x_test, y_test)
        var = gmlvq_covered_variance(gmlvq, thresh=1, verbose=True)
        misc = {'train_error': train_error, 'test_error': test_error, 'matrix_var': var}
        print(f'adding misc data to gmlvq model {misc}')
        gmlvq.misc = misc
        modelname = f'gmlvq{hiddim}.pkl'
        print(f'saving model to {modelname}')
        with open(modelpath + modelname, 'wb') as f:
            pkl.dump(gmlvq, f)
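Example #1 relies on helpers that the snippet does not include (get_filenames, load_pkl, gmlvq_covered_variance). A minimal sketch of the first two, with behavior inferred from their names and call sites, not the original implementations:

import os
import pickle as pkl

def load_pkl(path):
    # Unpickle and return the object stored at `path` (assumed behavior).
    with open(path, 'rb') as f:
        return pkl.load(f)

def get_filenames(datapath):
    # Names of the pickled dataset files in the data directory (assumed behavior).
    return sorted(f for f in os.listdir(datapath) if f.endswith('.pkl'))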
Example #2
def do_experiment(title, cols, labels):
    '''Perform the classification experiment on the given data.'''
    print(f'{title} experiment')
    data = read_cols(cols)

    avg_cm = np.zeros((num_classes, num_classes))
    relevances = np.empty((NUM_CV, data.shape[1]))
    scores = np.empty(NUM_CV)

    for i, (train, test) in enumerate(KFold(n_splits=NUM_CV).split(data)):
        gmlvq = GmlvqModel(prototypes_per_class=[1, 1, 1, 1])
        gmlvq.fit(data[train], labels[train])

        score = gmlvq.score(data[test], labels[test])
        scores[i] = score

        # Relevance matrix Lambda = Omega^T Omega; its diagonal holds the feature relevances
        rel_matrix = np.dot(gmlvq.omega_.T, gmlvq.omega_)
        relevances[i] = np.diag(rel_matrix)

        label_pred = gmlvq.predict(data[test])
        avg_cm += confusion_matrix(labels[test], label_pred)

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # normalize confusion matrix
    avg_cm = avg_cm.astype('float') / avg_cm.sum(axis=1)[:, np.newaxis]
    avg_acc = np.mean(scores)

    print(f'mean score: {avg_acc} - score variance: {np.var(scores)}')

    # save confusion matrix figure
    plot_confusion_matrix(title, avg_cm)
    plt.savefig(f'{OUTPUT_DIR}/CM_{title}.pdf')
    plt.clf()

    plot_relevances(title, cols, relevances)
    plt.savefig(f'{OUTPUT_DIR}/REL_{title}.pdf')
    plt.clf()
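plot_confusion_matrix and plot_relevances are project-specific helpers that are not shown. A minimal sketch of what plot_relevances might look like, assuming it draws the mean diagonal relevance per feature across the CV folds (the name and signature come from the call above; the body is an assumption):

import numpy as np
import matplotlib.pyplot as plt

def plot_relevances(title, cols, relevances):
    # Bar chart of the mean relevance per feature, with the standard
    # deviation across the CV folds as error bars.
    mean_rel = relevances.mean(axis=0)
    plt.bar(range(len(cols)), mean_rel, yerr=relevances.std(axis=0))
    plt.xticks(range(len(cols)), cols, rotation=90)
    plt.title(f'{title}: GMLVQ feature relevances')
    plt.tight_layout()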
Example #3
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn_lvq import GmlvqModel
# generate_counterfactual is assumed to come from the ceml package
from ceml.sklearn import generate_counterfactual


def test_gmlvq():
    # Load data
    X, y = load_iris(return_X_y=True)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=4242)

    # Create and fit model
    model = GmlvqModel(prototypes_per_class=3,
                       max_iter=200,
                       random_state=4242,
                       dim=2)
    model.fit(X_train, y_train)

    # Select data point for explaining its prediction
    x_orig = X_test[1, :]
    assert model.predict([x_orig]) == 2

    # Compute counterfactual
    features_whitelist = None

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization="l1",
        C=0.01,
        optimizer="bfgs",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization="l1",
        C=1.0,
        optimizer="nelder-mead",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization=None,
        optimizer="bfgs",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization=None,
        optimizer="nelder-mead",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0

    features_whitelist = [0, 1, 2, 3]
    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization="l1",
        C=0.01,
        optimizer="bfgs",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0
    assert all(delta[i] == 0.
               for i in range(x_orig.shape[0])
               if i not in features_whitelist)

    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization="l1",
        C=1.0,
        optimizer="nelder-mead",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0
    assert all(delta[i] == 0.
               for i in range(x_orig.shape[0])
               if i not in features_whitelist)

    features_whitelist = [0, 2]
    x_cf, y_cf, delta = generate_counterfactual(
        model,
        x_orig,
        0,
        features_whitelist=features_whitelist,
        regularization=None,
        optimizer="nelder-mead",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0
    assert all(delta[i] == 0.
               for i in range(x_orig.shape[0])
               if i not in features_whitelist)
Example #4
glvq.fit(x, y)
p2 = plt.subplot(232)
p2.set_title('GLVQ')
plot(PCA().fit_transform(x), y, glvq.predict(x), glvq.w_, glvq.c_w_, p2)

# GRLVQ
grlvq = GrlvqModel()
grlvq.fit(x, y)
p3 = plt.subplot(233)
p3.set_title('GRLVQ')
plot(grlvq.project(x, 2),
     y, grlvq.predict(x), grlvq.project(grlvq.w_, 2),
     grlvq.c_w_, p3)

# GMLVQ
gmlvq = GmlvqModel()
gmlvq.fit(x, y)
p4 = plt.subplot(234)
p4.set_title('GMLVQ')
plot(gmlvq.project(x, 2),
     y, gmlvq.predict(x), gmlvq.project(gmlvq.w_, 2),
     gmlvq.c_w_, p4)

# LGMLVQ
lgmlvq = LgmlvqModel()
lgmlvq.fit(x, y)
p5 = plt.subplot(235)
elem_set = list(set(lgmlvq.c_w_))
p5.set_title('LGMLVQ 1')
plot(lgmlvq.project(x, 1, 2, True),
     y, lgmlvq.predict(x), lgmlvq.project(np.array([lgmlvq.w_[1]]), 1, 2),
     lgmlvq.c_w_, p5)
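The plot helper used throughout this fragment is not included. A hypothetical stand-in matching the call signature plot(projected_data, true_labels, predicted_labels, prototypes, prototype_labels, axis) and the figure description from Example #11 (big circle = target class, smaller circle = prediction, diamond = prototype):

def plot(data, y_true, y_pred, prototypes, proto_labels, ax):
    # Big circles: target class; small dots: predicted class.
    ax.scatter(data[:, 0], data[:, 1], c=y_true, s=60, alpha=0.4)
    ax.scatter(data[:, 0], data[:, 1], c=y_pred, s=15)
    # Diamonds: prototypes, colored by their class labels.
    ax.scatter(prototypes[:, 0], prototypes[:, 1], c=proto_labels,
               s=90, marker='D', edgecolors='k')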
Example #5
    show_plot = False

    # Evaluation scores
    accuracy_noadjust = []
    accuracy_adjust = []

    # Create initial data set
    class_means = np.array([[0.0, 0.0], [5.0, 8.0]])
    #class_means = np.array([[0.0, 0.0], [5.0, 8.0, ], [8.0, 0.0]])
    cov = np.array([[1.0, 0.0], [0.0, 1.0]])
    #cov = np.array([[0.1, 0.0], [0.0, 5.0]])

    X, y = sample_from_classdist(class_means, cov)
    #plot_classification_dataset(X, y)

    model = GmlvqModel(prototypes_per_class=1,
                       random_state=4242)  # Fit model to initial data set
    #model = MrslvqModel(prototypes_per_class=1, random_state=4242)
    model.fit(X, y)
    print(np.dot(model.omega_.T, model.omega_))  # learned metric Lambda = Omega^T Omega

    mymodel = MyModel(model.w_, np.dot(model.omega_.T, model.omega_))
    y_pred = mymodel.predict(X)
    print("MyModel: {0}".format(accuracy_score(y, y_pred)))

    #model2 = LgmlvqModel(prototypes_per_class=1, random_state=4242)
    model2 = LmrslvqModel(prototypes_per_class=1, random_state=4242)
    model2.fit(X, y)
    print([np.dot(o.T, o) for o in model2.omegas_])

    mymodel2 = MyModel2(model2.w_, [np.dot(o.T, o) for o in model2.omegas_])
    y_pred2 = mymodel2.predict(X)
    print("MyModel2: {0}".format(accuracy_score(y, y_pred2)))
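MyModel and MyModel2 are custom classes that the snippet does not define. A plausible sketch of MyModel, assuming it re-implements the nearest-prototype rule under the metric Lambda = Omega^T Omega learned by GMLVQ (the class name and constructor arguments come from the snippet; the body is an assumption):

import numpy as np

class MyModel:
    def __init__(self, prototypes, metric):
        self.prototypes = prototypes  # model.w_
        self.metric = metric          # Lambda = Omega^T Omega

    def predict(self, X):
        # Squared distance under the learned metric: d(x, w) = (x - w)^T Lambda (x - w).
        dists = np.array([[(x - w) @ self.metric @ (x - w)
                           for w in self.prototypes] for x in X])
        # With one prototype per class, the index of the nearest prototype
        # coincides with the class label.
        return np.argmin(dists, axis=1)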
Example #6
metric = ['euclidean', 'minkowski']
param_grid = dict(n_neighbors=k_list, metric=metric)
knn = GridSearchCV(classifier, param_grid)
knn.fit(val_x, val_y)
print('The best parameters were:', knn.best_params_)
best_n = knn.best_params_['n_neighbors']

"""#### **Classificador LVQ**

Utilizou-se o pacote *sklearn* para implementação do LVQ e verificou-se os melhores valores para regularização.
"""

#!pip install sklearn-lvq

warnings.filterwarnings(action='ignore')
gmlvq = GmlvqModel(gtol=1e-1, max_iter=150)
param_grid = {'regularization': [0.0, 0.1, 0.5]}  #, 'beta': [1, 2]}
clf = GridSearchCV(gmlvq, param_grid)
clf.fit(val_x, val_y)
print('The best:', clf.best_params_)
best_reg = clf.best_params_['regularization']
#best_beta = clf.best_params_['beta']
warnings.filterwarnings(action='default')  # re-enable warnings

"""#### **Classificador SVM**

Fez uso do pacote do *sklearn* para implementação do SVM e verificou-se os melhores valores para o parâmetro de regularização (C) e também qual o melhor kernel.
"""

svm = SVC()
param_grid = {'C': [0.5, 1.0, 10], 'kernel': ['rbf', 'sigmoid']}
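The snippet is cut off here; by analogy with the KNN and LVQ searches above, the SVM grid search would plausibly continue like this (variable names best_C and best_kernel are illustrative, not from the original):

svm_cv = GridSearchCV(svm, param_grid)
svm_cv.fit(val_x, val_y)
print('The best:', svm_cv.best_params_)
best_C = svm_cv.best_params_['C']
best_kernel = svm_cv.best_params_['kernel']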
Example #7
        scores_cf_perturbation_dist = []
        results = {'notFound': 0, 'found': 0}

        kf = KFold(n_splits=n_kf_splits)
        for train_index, test_index in kf.split(X):
            # Split data into training and test set
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Fit and evaluate classifier
            model = None
            if modeldesc == "glvq":
                model = GlvqModel(prototypes_per_class=n_prototypes)
            elif modeldesc == "gmlvq":
                model = GmlvqModel(prototypes_per_class=n_prototypes)
            elif modeldesc == "logreg":
                model = LogisticRegression(multi_class='multinomial')
            elif modeldesc == "dectree":
                model = DecisionTreeClassifier(max_depth=7)
            model.fit(X_train, y_train)

            # Compute accuracy on test set
            y_pred = model.predict(X_test)
            print(f"F1-score: {f1_score(y_test, y_pred, average='weighted')}")

            labels = np.unique(y)
            # Compute counterfactual of each test sample
            for i in range(X_test.shape[0]):
                x_orig_orig = X_test[i, :]
                y_orig = y_test[i]
Example #8
                   np.array([0 for _ in range(n_samples)]).reshape(-1, 1)))
    y = [0 for _ in range(n_samples)]

    X = np.vstack(
        (X,
         np.hstack((np.random.uniform(7, 12, n_samples).reshape(-1, 1),
                    np.array([5 for _ in range(n_samples)]).reshape(-1, 1)))))
    y += [1 for _ in range(n_samples)]
    y = np.array(y)

    from plotting import plot_classification_dataset, export_as_png
    plot_classification_dataset(X, y, show=False)
    export_as_png("toydata.png")

    # Fit model
    model = GmlvqModel(prototypes_per_class=1, random_state=4242)
    model.fit(X, y)

    # Evaluate
    y_pred = model.predict(X)
    y_, y_pred_ = encode_labels(y.reshape(-1, 1), y_pred.reshape(-1, 1))
    print("ROC-AUC: {0}".format(roc_auc_score(y_, y_pred_,
                                              average="weighted")))

    print("Omega\n{0}".format(np.dot(model.omega_.T, model.omega_)))
    print()

    # Compute counterfactual metric
    x_orig = np.array([10.0, 0])
    y_target = 1
    Omega_cf = compute_change_in_distmat_gmlvq(model, x_orig, y_target)[0]
Example #9
                    pca = PCA(n_components=pca_dim)
                    pca.fit(X_train)

                    projection_matrix = pca.components_ # Projection matrix
                    projection_mean_sub = pca.mean_
                    #print(projection_matrix)

                    X_train = np.dot(X_train - projection_mean_sub, projection_matrix.T)
                    X_test = np.dot(X_test - projection_mean_sub, projection_matrix.T)
                
                # Fit classifier
                model = None
                if modeldesc == "glvq":
                    model = GlvqModel(prototypes_per_class=n_prototypes, random_state=4242)
                elif modeldesc == "gmlvq":
                    model = GmlvqModel(prototypes_per_class=n_prototypes, random_state=4242)
                elif modeldesc == "logreg":
                    model = LogisticRegression(multi_class='multinomial')
                elif modeldesc == "dectree":
                    model = DecisionTreeClassifier(max_depth=7, random_state=42)
                model.fit(X_train, y_train)

                # Compute accuracy on test set
                y_pred = model.predict(X_test)
                print(f"F1-score: {f1_score(y_test, y_pred, average='weighted')}")

                # Fit model for finding closest samples
                closest_samples = ClosestSample(X_train_orig, y_train)

                # For each class, fit density estimators
                density_estimators = {}
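The fragment breaks off right after announcing the per-class density estimators. One plausible continuation, using scikit-learn's KernelDensity (the original estimator type and its parameters are unknown):

from sklearn.neighbors import KernelDensity

for label in np.unique(y_train):
    kde = KernelDensity(kernel='gaussian', bandwidth=1.0)
    kde.fit(X_train[y_train == label, :])
    density_estimators[label] = kde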
Example #10
def get_error(lvq_model: GmlvqModel, x, y) -> float:
    # Misclassification rate: fraction of samples whose prediction differs from y.
    y_ = lvq_model.predict(x)
    return 1 - np.mean(y_ == y)
Example #11
The plot shows the target class of each data point (big circle) and which class was predicted (smaller circle). It also
shows the prototypes (black diamond) and their labels (small point inside the diamond).
The projected data is shown in the right plot.

"""
import matplotlib.pyplot as plt
import numpy as np

from sklearn_lvq import GmlvqModel
from sklearn_lvq.utils import plot2d

print(__doc__)

nb_ppc = 100
toy_label = np.append(np.zeros(nb_ppc), np.ones(nb_ppc), axis=0)

print('GMLVQ:')
toy_data = np.append(np.random.multivariate_normal([0, 0],
                                                   np.array([[5, 4], [4, 6]]),
                                                   size=nb_ppc),
                     np.random.multivariate_normal([9, 0],
                                                   np.array([[5, 4], [4, 6]]),
                                                   size=nb_ppc),
                     axis=0)
gmlvq = GmlvqModel()
gmlvq.fit(toy_data, toy_label)
plot2d(gmlvq, toy_data, toy_label, 1, 'gmlvq')

print('classification accuracy:', gmlvq.score(toy_data, toy_label))
plt.show()
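As a small follow-up (not part of the original example), the learned relevance matrix Lambda = Omega^T Omega can be inspected to see how much each input dimension contributes to the learned distance:

lambda_ = gmlvq.omega_.T.dot(gmlvq.omega_)
print('relevance matrix:\n', lambda_)
print('normalized feature relevances:', np.diag(lambda_) / np.trace(lambda_))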