Ejemplo n.º 1
0
def experiment2c():
    """Run the MNIST "experiment 2c" sweep over cluster-subsample fractions.

    For each fraction in a fixed list the training set is labelled with
    ``experiment2cLabelleingMethod``, a model is trained and evaluated with
    ``runTrials``, and one result document is inserted into the
    ``experiment2c`` collection of the database returned by
    ``getMACResultsDB``.

    Returns:
        A ``(fractions, results, recovery)`` tuple: the list of sampled
        fractions, the ``runTrials`` result for each fraction, and for each
        fraction the value ``calculateLabellingAccuracy`` reports between the
        dominant cluster classes and the sampled dominant cluster classes,
        divided by 100.
    """
    n_classes = 10
    n_trials = 5
    epochs = 20

    db = getMACResultsDB()
    collection = db['experiment2c']

    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()

    # One-hot encode the gold labels for both splits.
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    clustering = clusterData(x_train_flattened)

    fractions = [0.00, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0]

    recovery = []
    results = []
    for fraction in fractions:
        (x_labelled, labels, labelled_indices, unlabelled_indices,
         sampled_dominant_cluster_classes, dominant_cluster_classes) = \
            experiment2cLabelleingMethod(x_train, y_train, clustering,
                                         n_classes, fraction)

        result = runTrials(
            x_labelled, labels, x_test, y_test, n_trials, n_classes,
            data='mnist_experiment2c_fraction%.2lf' % (fraction),
            epochs=epochs)
        results.append(result)

        # Computed once and shared by the returned list and the stored
        # document (the original evaluated the same expression twice).
        # NOTE(review): to_categorical is called without num_classes here, so
        # the two encodings may end up with different widths when the largest
        # class id differs between the arrays - confirm that
        # calculateLabellingAccuracy tolerates this.
        recovery_rate = calculateLabellingAccuracy(
            np_utils.to_categorical(dominant_cluster_classes),
            np_utils.to_categorical(sampled_dominant_cluster_classes)) / 100.0
        recovery.append(recovery_rate)

        doc = {
            'name': 'majority class recovery by cluster subsample',
            'm': labels.shape[0],
            'fraction sampled': fraction,
            'accuracy': result[0],
            'error': result[1],
            'trials accuracy': result[2],
            'labelling accuracy':
                calculateLabellingAccuracy(y_train[labelled_indices], labels),
            'training set class distribution':
                calculateClassDistribution(labels).tolist(),
            'cluster majority class recovery rate': recovery_rate,
        }

        collection.insert_one(doc)

    return fractions, results, recovery
def experiment2b():
    """Run the MNIST "experiment 2b" sweep over labelling fractions.

    For each fraction in a fixed list the training set is labelled with
    ``experiment2bLabelleingMethod``, a model is trained and evaluated with
    ``runTrials``, and one result document is inserted into the
    ``experiment2b`` collection of the database returned by
    ``getMACResultsDB``.  The number of labelled samples is printed for each
    fraction.

    Returns:
        A ``(fractions, results)`` tuple: the list of fractions and the
        ``runTrials`` result obtained for each of them, in the same order.
    """
    n_classes = 10
    n_trials = 5
    epochs = 20

    db = getMACResultsDB()
    collection = db['experiment2b']

    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()

    # One-hot encode the gold labels for both splits.
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    clustering = clusterData(x_train_flattened)

    fractions = [0.11, 0.2, 0.25, 0.3, 0.4, 0.5,
                 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1.0]

    results = []
    for fraction in fractions:
        x_labelled, labels, labelled_indices, unlabelled_indices = \
            experiment2bLabelleingMethod(x_train, y_train, clustering,
                                         n_classes, fraction)

        # Progress output: size of the labelled training set.
        print(labels.shape[0])

        result = runTrials(
            x_labelled, labels, x_test, y_test, n_trials, n_classes,
            data='mnist_experiment2b_fraction%.2lf' % (fraction),
            epochs=epochs)
        results.append(result)

        doc = {
            'name': 'majority class label assignment by fraction',
            'm': labels.shape[0],
            'fraction': fraction,
            'accuracy': result[0],
            'error': result[1],
            'trials accuracy': result[2],
            'labelling accuracy':
                calculateLabellingAccuracy(y_train[labelled_indices], labels),
            'training set class distribution':
                calculateClassDistribution(labels).tolist(),
        }

        collection.insert_one(doc)

    return fractions, results
Ejemplo n.º 3
0
from nn import NN, Relu, Linear, SquaredLoss
from utils import data_loader, acc, save_plot, loadMNIST, onehot
# Load the MNIST training and test splits from their image/label files.
x_train, label_train = loadMNIST('data/train-images.idx3-ubyte', 'data/train-labels.idx1-ubyte')
x_test, label_test = loadMNIST('data/t10k-images.idx3-ubyte', 'data/t10k-labels.idx1-ubyte')
# Encode the labels with the project's onehot helper.
y_train = onehot(label_train)
y_test = onehot(label_test)
# Network with ReLU activation, squared loss, two hidden layers of 128 units,
# 784 inputs and 10 outputs.
model = NN(Relu(), SquaredLoss(), hidden_layers=[128, 128], input_d=784, output_d=10)
model.print_model()
# The test split is passed to training as the dev set.
training_data, dev_data = {"X":x_train, "Y":y_train}, {"X":x_test, "Y":y_test}
# NOTE(review): imported mid-script; kept in place in case run_nn has
# import-time side effects - confirm before moving it to the top.
from run_nn import train_1pass
# train_1pass returns the (trained) model together with a plot_dict.
model, plot_dict = train_1pass(model, training_data, dev_data, learning_rate=1e-2, batch_size=64)
Ejemplo n.º 4
0
def experiment2aNoise(filename):
    """Measure how labelling noise affects experiment 2a and plot the results.

    For every noise level in ``np.arange(0, 1.1, 0.1)`` and every trial, two
    labelling variants are evaluated through ``experiment2aNoiseLabelling``:
    one called with ``noise=<level>`` ("random noise") and one called with
    ``intelligent_noise=<level>`` ("class weighted noise").  Each trial is
    stored as a document in the ``experiment2d`` collection; when a document
    with the trial's name already exists its stored accuracies are reused
    instead of recomputing the trial.

    Two figures are written, each as PDF and PNG: labelling accuracy versus
    noise (``filename + 'labelling_noise.*'``) and machine accuracy versus
    noise (``filename + '_machine_accuracy.*'``).  Both draw benchmark values
    read from the ``experiment2a`` collection.

    Args:
        filename: Prefix prepended to the names of the saved figures.

    Raises:
        LookupError: If the ``experiment2a`` collection holds no
            'majority class label assignment' or 'gold benchmark' document
            (previously this surfaced as an ``UnboundLocalError``).
    """
    n_classes = 10
    n_trials = 5
    # NOTE(review): unlike the other experiment drivers in this file, no
    # ``epochs`` value is forwarded to runTrials here - confirm intended.

    db = getMACResultsDB()

    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()

    # One-hot encode the gold labels for both splits.
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    clustering = clusterData(x_train_flattened)

    noise_levels = np.arange(0, 1.1, 0.1)

    collection_name = 'experiment2d'
    collection = db[collection_name]

    # (key, experiment-name format, runTrials data-tag format, name of the
    #  experiment2aNoiseLabelling keyword that receives the noise level)
    variants = (
        ('random',
         'experiment 2d - experiment2a random noise %.2lf trial %d',
         'mnist_experiment2d_noise%.2lf_trial%d',
         'noise'),
        ('weighted',
         'experiment 2d - experiment2a class weighted noise %.2lf trial %d',
         'mnist_experiment2d_class_weighted_noise%.2lf_trial%d',
         'intelligent_noise'),
    )
    keys = [variant[0] for variant in variants]

    def _trial_accuracies(experiment_name, data_tag, noise, trial,
                          noise_kwargs):
        """Return (labelling accuracy, machine accuracy) for one trial.

        Reads the stored document when one exists, otherwise runs the trial
        and stores its document.
        """
        # find_one returns None both when the collection does not exist and
        # when nothing matches, so no assert / collection listing is needed.
        cached = collection.find_one({'name': experiment_name})
        if cached is not None:
            return cached['labelling accuracy'], cached['machine accuracy']

        x_labelled, labels, labelled_indices, unlabelled_indices = \
            experiment2aNoiseLabelling(x_train, y_train, clustering,
                                       n_classes, **noise_kwargs)
        labelling_accuracy = calculateLabellingAccuracy(
            y_train[labelled_indices], labels)
        r = runTrials(x_labelled, labels, x_test, y_test, 1, n_classes,
                      data=data_tag)
        collection.insert_one({
            'name': experiment_name,
            'm': labels.shape[0],
            'noise': noise,
            'trial': trial,
            'labelling accuracy': labelling_accuracy,
            'machine accuracy': r[0],
            'training set class distribution':
                calculateClassDistribution(labels).tolist(),
        })
        return labelling_accuracy, r[0]

    def _latest_benchmark(name):
        """Return the last 'experiment2a' document with the given name."""
        latest = None
        # Exhaust the cursor so the last matching document wins, as before.
        for latest in db['experiment2a'].find({'name': name}):
            pass
        if latest is None:
            raise LookupError(
                "no %r document in the 'experiment2a' collection; "
                "run experiment2a() first" % name)
        return latest

    def _draw_noise_series(ax, means, errors):
        """Draw the random and class-weighted error-bar series on ``ax``."""
        ax.errorbar(noise_levels, means['random'], yerr=errors['random'],
                    fmt='o', mfc='None', color='#B8336A',
                    label='majority class - random noise')
        ax.errorbar(noise_levels, means['weighted'], yerr=errors['weighted'],
                    fmt='o', mfc='None', color='#726DA8',
                    label='majority class - class weighted noise',
                    zorder=100)

    def _finish_axes(ax, ylabel):
        """Apply the labels, limits and legend shared by both figures."""
        ax.set_xlabel('labelling noise')
        ax.set_ylabel(ylabel)
        ax.set_ylim(-2, 100)
        ax.set_xlim(-0.02, 1.03)
        plt.legend(loc='lower left')

    # Mean and standard deviation over trials, one entry per noise level.
    labelling_mean = {key: [] for key in keys}
    labelling_std = {key: [] for key in keys}
    machine_mean = {key: [] for key in keys}
    machine_std = {key: [] for key in keys}

    for noise in noise_levels:
        trial_labelling = {key: [] for key in keys}
        trial_machine = {key: [] for key in keys}
        for i in range(n_trials):
            # Within a trial the random variant runs before the weighted one.
            for key, name_format, data_format, noise_keyword in variants:
                l_acc, m_acc = _trial_accuracies(
                    name_format % (noise, i), data_format % (noise, i),
                    noise, i, {noise_keyword: noise})
                trial_labelling[key].append(l_acc)
                trial_machine[key].append(m_acc)

        for key in keys:
            labelling_mean[key].append(np.mean(trial_labelling[key]))
            labelling_std[key].append(np.std(trial_labelling[key]))
            machine_mean[key].append(np.mean(trial_machine[key]))
            machine_std[key].append(np.std(trial_machine[key]))

    majority_doc = _latest_benchmark('majority class label assignment')

    # x positions for the horizontal benchmark lines; the matching y arrays
    # take their length from this array rather than a hard-coded 105.
    baseline_x = np.arange(-0.02, 1.03, 0.01)
    baseline_ones = np.ones(baseline_x.shape)

    # Figure 1: labelling accuracy versus noise.
    fig, ax = plt.subplots()
    ax.plot(baseline_x, baseline_ones * majority_doc['labelling accuracy'],
            'k--')
    _draw_noise_series(ax, labelling_mean, labelling_std)
    _finish_axes(ax, 'labelling accuracy')
    # NOTE(review): this prefix has no '_' separator, unlike
    # '_machine_accuracy' below; left unchanged so existing output paths
    # stay the same - confirm whether that is intended.
    plt.savefig(filename + 'labelling_noise.pdf')
    plt.savefig(filename + 'labelling_noise.png')

    gold_doc = _latest_benchmark('gold benchmark')
    gold_benchmark = gold_doc['accuracy']
    gold_benchmark_error = gold_doc['error']
    majority_class_benchmark = majority_doc['accuracy']
    majority_class_error = majority_doc['error']

    # Figure 2: machine accuracy versus noise, with benchmark bands.
    fig, ax = plt.subplots()

    ax.plot(baseline_x, baseline_ones * gold_benchmark,
            color='#726DA8', label='gold benchmark')
    ax.axhspan(gold_benchmark - gold_benchmark_error,
               gold_benchmark + gold_benchmark_error,
               facecolor='#726DA8', alpha=0.5)

    ax.plot(baseline_x, baseline_ones * majority_class_benchmark,
            color='#A0D2DB', label='majority class benchmark')
    ax.axhspan(majority_class_benchmark - majority_class_error,
               majority_class_benchmark + majority_class_error,
               facecolor='#A0D2DB', alpha=0.5)

    _draw_noise_series(ax, machine_mean, machine_std)
    _finish_axes(ax, 'machine accuracy')
    plt.savefig(filename + '_machine_accuracy.pdf')
    plt.savefig(filename + '_machine_accuracy.png')
Ejemplo n.º 5
0
import math
import pickle

# Utils
from sklearn.metrics import accuracy_score
import utils

# Classifiers
from sklearn import tree
from sklearn import svm
from sklearn import linear_model

# Module-level ensemble state. predict() reads `classifiers` and
# `classifierWeights`.
classifiers = []
classifierWeights = []
# NOTE(review): presumably the intended ensemble size - not used in the
# visible code; confirm.
numClassifiers = 5
# Loads the MNIST split at import time via the project's utils.loadMNIST.
# NOTE(review): test=0.15 is presumably the held-out fraction - confirm.
X_train, X_test, Y_train, Y_test = utils.loadMNIST(test=0.15)
# NOTE(review): presumably flags whether a saved model has been loaded -
# not used in the visible code; confirm.
loaded = False

# NOTE(review): presumably the file name used when saving/loading the model
# (pickle is imported above) - confirm.
modelFname = 'clf'


def predict(x):
    """Return the class (0-9) chosen by a weighted vote of the ensemble.

    Every classifier in the module-level ``classifiers`` list predicts a
    label for ``x``; each label receives the weight found at the same
    position in ``classifierWeights``, and the label with the largest total
    weight wins (ties go to the smallest label, as ``np.argmax`` does).

    The previous implementation multiplied the (n,) weight vector by an
    (n, 1) comparison array; broadcasting turned that into an (n, n) matrix
    whose sum is ``sum(weights) * number_of_votes``, so the weights never
    influenced the outcome.  Votes are now reduced to scalars first.

    Args:
        x: A single sample; it is wrapped in a one-element list before being
            handed to each classifier's ``predict``.

    Returns:
        The index in ``range(10)`` with the highest summed weight.
    """
    # predict() returns one label per input row; exactly one row is passed,
    # so [0] extracts that classifier's vote as a scalar.
    votes = [(clf.predict([x])[0], weight)
             for clf, weight in zip(classifiers, classifierWeights)]
    scores = [sum(weight for label, weight in votes if label == digit)
              for digit in range(10)]
    return np.argmax(scores)

Ejemplo n.º 6
0
def experiment2a():
    """Run the MNIST "experiment 2a" baseline and majority-class labelling.

    Two result documents are inserted into the ``experiment2a`` collection of
    the database returned by ``getMACResultsDB``:

    * 'gold benchmark' - ``runTrials`` on the full training set with its
      gold labels;
    * 'majority class label assignment' - ``runTrials`` on the data and
      labels produced by ``experiment2aLabelleingMethod``.

    The ``runTrials`` results and the labelling accuracy of the second run
    are printed.  Nothing is returned.
    """
    n_classes = 10
    n_trials = 5
    epochs = 20

    db = getMACResultsDB()
    collection = db['experiment2a']

    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()

    # calculate performance on gold labels
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    gold_benchmark = runTrials(
        x_train, y_train, x_test, y_test, n_trials, n_classes,
        data='mnist_gold', epochs=epochs)

    print(gold_benchmark)

    collection.insert_one({
        'name': 'gold benchmark',
        'm': y_train.shape[0],
        'accuracy': gold_benchmark[0],
        'error': gold_benchmark[1],
        'trials accuracy': gold_benchmark[2],
        # Gold labels are compared with themselves for this entry.
        'labelling accuracy': calculateLabellingAccuracy(y_train, y_train),
        'training set class distribution':
            calculateClassDistribution(y_train).tolist(),
    })

    clustering = clusterData(x_train_flattened)

    x_labelled, labels, labelled_indices, unlabelled_indices = \
        experiment2aLabelleingMethod(x_train, y_train,
                                     clustering, n_classes)

    result = runTrials(
        x_labelled, labels, x_test, y_test, n_trials, n_classes,
        data='mnist_experiment2a', epochs=epochs)

    print(result)

    # Computed once and used for both the console output and the document
    # (the original evaluated the same expression twice).
    labelling_accuracy = calculateLabellingAccuracy(
        y_train[labelled_indices], labels)
    print(labelling_accuracy)

    collection.insert_one({
        'name': 'majority class label assignment',
        'm': labels.shape[0],
        'accuracy': result[0],
        'error': result[1],
        'trials accuracy': result[2],
        'labelling accuracy': labelling_accuracy,
        'training set class distribution':
            calculateClassDistribution(labels).tolist(),
    })

# The MLP works better when the pixels are values between 0.0 and 1.0 as opposed to 0 to 255?
# Could that be because the expected output is in the range 0.0 to 1.0?
# Does it work best when normalized, or when input and output are of the same order of magnitude?
def _image_to_array(image):
    """Flatten a 2-D image into a 1-D float64 vector scaled by 1/255.

    The output length is ``len(image) * len(image[0])`` (rows times columns).
    """
    n_rows = len(image)
    n_cols = len(image[0])
    flat = image.reshape(n_rows * n_cols)
    as_float = flat.astype(np.float64)
    return as_float / 255.0


def _label_to_array(label):
    """Return a length-10 float64 vector that is 1.0 at ``label``, else 0.0."""
    one_hot = np.zeros(10, dtype=np.float64)
    one_hot[label] = 1.0
    return one_hot


# Load the raw MNIST labels and images for both splits at import time.
# NOTE(review): the label paths use "mnist//" while the image paths use
# "MNIST//"; on a case-sensitive filesystem these are different directories -
# confirm which spelling is correct.
_labels_training, _images_training = utils.loadMNIST(
    "mnist//train-labels-idx1-ubyte", "MNIST//train-images-idx3-ubyte")
_labels_test, _images_test = utils.loadMNIST("mnist//t10k-labels-idx1-ubyte",
                                             "MNIST//t10k-images-idx3-ubyte")

# Every DataSet built below is appended to this list.
datasets = []

# Training set: 784 and 10 match the flattened 28x28 image length and the
# length of the vector returned by _label_to_array.
set_mnist_training = DataSet("MNIST training set", 784, 10)
datasets.append(set_mnist_training)
# One DataEntry per sample: scaled, flattened image plus one-hot label.
for i in range(len(_labels_training)):
    set_mnist_training.entries.append(
        DataEntry(_image_to_array(_images_training[i]),
                  _label_to_array(_labels_training[i])))

# Test set, declared with the same dimensions as the training set.
set_mnist_test = DataSet("MNIST test set", 784, 10)
datasets.append(set_mnist_test)
for i in range(len(_labels_test)):