def experiment2c():
    """Run experiment 2c: majority class recovery when clusters are sub-sampled.

    For every sampling fraction the labelled set is produced by
    ``experiment2cLabelleingMethod``, passed to ``runTrials`` together with
    the MNIST test split, and one summary document is inserted into the
    ``experiment2c`` collection of the results database.

    Returns:
        A ``(fractions, results, recovery)`` tuple holding the sampled
        fractions, the ``runTrials`` result for each fraction and the
        cluster majority class recovery rate for each fraction.
    """
    n_classes = 10
    n_trials = 5
    epochs = 20

    db = getMACResultsDB()
    collection = db['experiment2c']

    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    clustering = clusterData(x_train_flattened)

    fractions = [0.00, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0]
    recovery = []
    results = []
    for fraction in fractions:
        (x_labelled, labels, labelled_indices, unlabelled_indices,
         sampled_dominant_cluster_classes, dominant_cluster_classes) = \
            experiment2cLabelleingMethod(x_train, y_train, clustering,
                                         n_classes, fraction)

        result = runTrials(x_labelled, labels, x_test, y_test,
                           n_trials, n_classes,
                           data='mnist_experiment2c_fraction%.2lf' % (fraction),
                           epochs=epochs)
        results.append(result)

        # Computed once and shared by the returned list and the stored
        # document (it used to be evaluated twice with identical arguments).
        # NOTE(review): both to_categorical calls infer the class count from
        # their own input, so the two encodings may differ in width if one
        # side lacks the highest class - confirm that
        # calculateLabellingAccuracy tolerates this.
        recovery_rate = calculateLabellingAccuracy(
            np_utils.to_categorical(dominant_cluster_classes),
            np_utils.to_categorical(sampled_dominant_cluster_classes)) / 100.0
        recovery.append(recovery_rate)

        doc = {
            'name': 'majority class recovery by cluster subsample',
            'm': labels.shape[0],
            'fraction sampled': fraction,
            'accuracy': result[0],
            'error': result[1],
            'trials accuracy': result[2],
            'labelling accuracy':
                calculateLabellingAccuracy(y_train[labelled_indices], labels),
            'training set class distribution':
                calculateClassDistribution(labels).tolist(),
            'cluster majority class recovery rate': recovery_rate,
        }
        collection.insert_one(doc)

    return fractions, results, recovery
def experiment2b():
    """Run experiment 2b: majority class label assignment at several fractions.

    For every fraction the labelled set is produced by
    ``experiment2bLabelleingMethod``, passed to ``runTrials`` together with
    the MNIST test split, and one summary document is inserted into the
    ``experiment2b`` collection of the results database.

    Returns:
        A ``(fractions, results)`` tuple holding the evaluated fractions and
        the ``runTrials`` result for each of them.
    """
    n_classes = 10
    n_trials = 5
    epochs = 20

    db = getMACResultsDB()
    collection = db['experiment2b']

    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    clustering = clusterData(x_train_flattened)

    fractions = [0.11, 0.2, 0.25, 0.3, 0.4, 0.5,
                 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1.0]
    results = []
    for fraction in fractions:
        x_labelled, labels, labelled_indices, unlabelled_indices = \
            experiment2bLabelleingMethod(x_train, y_train, clustering,
                                         n_classes, fraction)
        # Progress output: size of the labelled set for this fraction.
        print(labels.shape[0])

        result = runTrials(x_labelled, labels, x_test, y_test,
                           n_trials, n_classes,
                           data='mnist_experiment2b_fraction%.2lf' % (fraction),
                           epochs=epochs)
        results.append(result)

        doc = {
            'name': 'majority class label assignment by fraction',
            'm': labels.shape[0],
            'fraction': fraction,
            'accuracy': result[0],
            'error': result[1],
            'trials accuracy': result[2],
            'labelling accuracy':
                calculateLabellingAccuracy(y_train[labelled_indices], labels),
            'training set class distribution':
                calculateClassDistribution(labels).tolist(),
        }
        collection.insert_one(doc)

    return fractions, results
# Script: build a network with two 128-unit hidden layers (784 inputs,
# 10 outputs, squared loss) and train it on MNIST through train_1pass.
from nn import NN, Relu, Linear, SquaredLoss
from utils import data_loader, acc, save_plot, loadMNIST, onehot

# Load the raw train / test splits from the IDX files.
x_train, label_train = loadMNIST(
    'data/train-images.idx3-ubyte',
    'data/train-labels.idx1-ubyte',
)
x_test, label_test = loadMNIST(
    'data/t10k-images.idx3-ubyte',
    'data/t10k-labels.idx1-ubyte',
)

# Encode the labels with the project's onehot helper.
y_train = onehot(label_train)
y_test = onehot(label_test)

model = NN(
    Relu(),
    SquaredLoss(),
    hidden_layers=[128, 128],
    input_d=784,
    output_d=10,
)
model.print_model()

# The test split is passed to the trainer as the development set.
training_data = {"X": x_train, "Y": y_train}
dev_data = {"X": x_test, "Y": y_test}

from run_nn import train_1pass

model, plot_dict = train_1pass(
    model,
    training_data,
    dev_data,
    learning_rate=1e-2,
    batch_size=64,
)
def _runNoiseTrial(collection, experiment_name, data_tag, noise_kwargs,
                   x_train, y_train, x_test, y_test, clustering, n_classes,
                   noise, trial):
    """Return (labelling accuracy, machine accuracy) for one noise trial.

    A document named ``experiment_name`` already stored in ``collection`` is
    reused; otherwise the trial is run and its result is inserted.
    """
    # find_one yields None both for a missing collection and for a missing
    # document, so no separate collection-existence check is required.
    cached = collection.find_one({'name': experiment_name})
    if cached is not None:
        return cached['labelling accuracy'], cached['machine accuracy']

    x_labelled, labels, labelled_indices, unlabelled_indices = \
        experiment2aNoiseLabelling(x_train, y_train, clustering,
                                   n_classes, **noise_kwargs)
    labelling_accuracy = \
        calculateLabellingAccuracy(y_train[labelled_indices], labels)
    # NOTE(review): no epochs argument is passed here, so runTrials uses
    # whatever default it defines - confirm that this is intended.
    r = runTrials(x_labelled, labels, x_test, y_test, 1, n_classes,
                  data=data_tag)
    collection.insert_one({
        'name': experiment_name,
        'm': labels.shape[0],
        'noise': noise,
        'trial': trial,
        'labelling accuracy': labelling_accuracy,
        'machine accuracy': r[0],
        'training set class distribution':
            calculateClassDistribution(labels).tolist(),
    })
    return labelling_accuracy, r[0]


def _lastBenchmarkDoc(db, name):
    """Return the last 'experiment2a' document called ``name`` (LookupError if none)."""
    doc = None
    for doc in db['experiment2a'].find({'name': name}):
        pass
    if doc is None:
        raise LookupError(
            "no %r document found in the 'experiment2a' collection" % name)
    return doc


def experiment2aNoise(filename):
    """Run the labelling-noise experiment (2d) and save two summary plots.

    For each noise level in ``np.arange(0, 1.1, 0.1)`` and each of 5 trials,
    a randomly-noised and a class-weighted-noised labelling are evaluated.
    Per-trial results are cached in the ``experiment2d`` collection and
    reused on later runs. Mean and standard deviation over the trials are
    plotted against benchmark documents read from the ``experiment2a``
    collection.

    Args:
        filename: Prefix of the output files. The labelling plot is saved as
            ``filename + 'labelling_noise.{pdf,png}'`` and the machine plot
            as ``filename + '_machine_accuracy.{pdf,png}'``.
            NOTE(review): the first suffix has no leading underscore, unlike
            the second - left unchanged to keep existing output paths.

    Raises:
        LookupError: If a required benchmark document is missing from the
            ``experiment2a`` collection.
    """
    n_classes = 10
    n_trials = 5

    db = getMACResultsDB()
    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()

    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    clustering = clusterData(x_train_flattened)

    # experiment 2a labelling accuracy
    labelling_accuracies_2a = []
    labelling_accuracies_2a_errors = []
    labelling_accuracies_2a_2 = []
    labelling_accuracies_2a_2_errors = []

    # experiment 2a machine accuracy
    machine_accuracies_2a = []
    machine_accuracies_2a_errors = []
    machine_accuracies_2a_2 = []
    machine_accuracies_2a_2_errors = []

    noise_levels = np.arange(0, 1.1, 0.1)
    collection = db['experiment2d']

    # (document name template, runTrials data tag template, labelling kwarg)
    modes = (
        ('experiment 2d - experiment2a random noise %.2lf trial %d',
         'mnist_experiment2d_noise%.2lf_trial%d',
         'noise'),
        ('experiment 2d - experiment2a class weighted noise %.2lf trial %d',
         'mnist_experiment2d_class_weighted_noise%.2lf_trial%d',
         'intelligent_noise'),
    )

    for noise in noise_levels:
        # Index 0: random noise, index 1: class weighted noise.
        l_results = ([], [])
        m_results = ([], [])
        for i in range(n_trials):
            for mode, (name_format, data_format, kwarg) in enumerate(modes):
                labelling_accuracy, machine_accuracy = _runNoiseTrial(
                    collection,
                    name_format % (noise, i),
                    data_format % (noise, i),
                    {kwarg: noise},
                    x_train, y_train, x_test, y_test, clustering, n_classes,
                    noise, i)
                l_results[mode].append(labelling_accuracy)
                m_results[mode].append(machine_accuracy)

        labelling_accuracies_2a.append(np.mean(l_results[0]))
        labelling_accuracies_2a_errors.append(np.std(l_results[0]))
        labelling_accuracies_2a_2.append(np.mean(l_results[1]))
        labelling_accuracies_2a_2_errors.append(np.std(l_results[1]))

        machine_accuracies_2a.append(np.mean(m_results[0]))
        machine_accuracies_2a_errors.append(np.std(m_results[0]))
        machine_accuracies_2a_2.append(np.mean(m_results[1]))
        machine_accuracies_2a_2_errors.append(np.std(m_results[1]))

    # x grid for the horizontal benchmark lines; the y arrays are derived
    # from it so their lengths always match.
    benchmark_x = np.arange(-0.02, 1.03, 0.01)
    benchmark_ones = np.ones_like(benchmark_x)

    # --- labelling accuracy against noise -------------------------------
    majority_doc = _lastBenchmarkDoc(db, 'majority class label assignment')
    majority_class_benchmark = majority_doc['labelling accuracy']

    fig, ax = plt.subplots()
    ax.plot(benchmark_x, benchmark_ones * majority_class_benchmark, 'k--')
    ax.errorbar(noise_levels, labelling_accuracies_2a,
                yerr=labelling_accuracies_2a_errors, fmt='o', mfc='None',
                color='#B8336A', label='majority class - random noise')
    ax.errorbar(noise_levels, labelling_accuracies_2a_2,
                yerr=labelling_accuracies_2a_2_errors, fmt='o', mfc='None',
                color='#726DA8',
                label='majority class - class weighted noise', zorder=100)
    ax.set_xlabel('labelling noise')
    ax.set_ylabel('labelling accuracy')
    ax.set_ylim(-2, 100)
    ax.set_xlim(-0.02, 1.03)
    plt.legend(loc='lower left')
    plt.savefig(filename + 'labelling_noise.pdf')
    plt.savefig(filename + 'labelling_noise.png')

    # --- machine accuracy against noise ---------------------------------
    gold_doc = _lastBenchmarkDoc(db, 'gold benchmark')
    gold_benchmark = gold_doc['accuracy']
    gold_benchmark_error = gold_doc['error']

    majority_class_benchmark = majority_doc['accuracy']
    majority_class_error = majority_doc['error']

    fig, ax = plt.subplots()
    ax.plot(benchmark_x, benchmark_ones * gold_benchmark,
            color='#726DA8', label='gold benchmark')
    ax.axhspan(gold_benchmark - gold_benchmark_error,
               gold_benchmark + gold_benchmark_error,
               facecolor='#726DA8', alpha=0.5)
    ax.plot(benchmark_x, benchmark_ones * majority_class_benchmark,
            color='#A0D2DB', label='majority class benchmark')
    ax.axhspan(majority_class_benchmark - majority_class_error,
               majority_class_benchmark + majority_class_error,
               facecolor='#A0D2DB', alpha=0.5)
    ax.errorbar(noise_levels, machine_accuracies_2a,
                yerr=machine_accuracies_2a_errors, fmt='o', mfc='None',
                color='#B8336A', label='majority class - random noise')
    ax.errorbar(noise_levels, machine_accuracies_2a_2,
                yerr=machine_accuracies_2a_2_errors, fmt='o', mfc='None',
                color='#726DA8',
                label='majority class - class weighted noise', zorder=100)
    ax.set_xlabel('labelling noise')
    ax.set_ylabel('machine accuracy')
    ax.set_ylim(-2, 100)
    ax.set_xlim(-0.02, 1.03)
    plt.legend(loc='lower left')
    plt.savefig(filename + '_machine_accuracy.pdf')
    plt.savefig(filename + '_machine_accuracy.png')
import math
import pickle

import numpy as np

# Utils
from sklearn.metrics import accuracy_score
import utils

# Classifiers
from sklearn import tree
from sklearn import svm
from sklearn import linear_model

# Ensemble state: predict() reads classifiers[k] together with
# classifierWeights[k], so the two lists must stay the same length.
classifiers = []
classifierWeights = []
numClassifiers = 5
X_train, X_test, Y_train, Y_test = utils.loadMNIST(test=0.15)
loaded = False
modelFname = 'clf'


def predict(x):
    """Classify one sample by weighted vote over ``classifiers``.

    Args:
        x: A single sample; it is wrapped in a one-element list before being
            passed to each classifier's ``predict``.

    Returns:
        The class index in ``range(10)`` with the largest summed weight
        (``np.argmax`` picks the lowest index on ties, including the
        all-zero case of an empty ensemble).

    Raises:
        ValueError: If ``classifiers`` and ``classifierWeights`` differ in
            length.
    """
    # Each predict([x]) returns a one-element array; keep only the scalar so
    # the votes form a 1-D vector. Leaving them as an (n, 1) column made the
    # product with the (n,) weights broadcast to (n, n), which scaled every
    # vote by the total weight and discarded the individual weights.
    votes = np.array([np.ravel(clf.predict([x]))[0] for clf in classifiers])
    weights = np.asarray(classifierWeights)
    if weights.shape != votes.shape:
        raise ValueError(
            'classifierWeights has shape %s but there are %d classifiers'
            % (weights.shape, len(classifiers)))

    values = [
        np.sum(np.multiply(weights, np.equal(votes, digit)))
        for digit in range(10)
    ]
    return np.argmax(values)
def experiment2a():
    """Run experiment 2a: gold-label benchmark and majority class labelling.

    Two documents are inserted into the ``experiment2a`` collection of the
    results database: ``'gold benchmark'`` (``runTrials`` on the original
    MNIST training labels) and ``'majority class label assignment'``
    (``runTrials`` on the labels produced by
    ``experiment2aLabelleingMethod``). Intermediate results are printed.
    """
    n_classes = 10
    n_trials = 5
    epochs = 20

    db = getMACResultsDB()
    collection = db['experiment2a']

    x_train, y_train, x_train_flattened, x_test, y_test = loadMNIST()

    # calculate performance on gold labels
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    gold_benchmark = runTrials(x_train, y_train, x_test, y_test,
                               n_trials, n_classes,
                               data='mnist_gold', epochs=epochs)
    print(gold_benchmark)

    doc = {
        'name': 'gold benchmark',
        'm': y_train.shape[0],
        'accuracy': gold_benchmark[0],
        'error': gold_benchmark[1],
        'trials accuracy': gold_benchmark[2],
        'labelling accuracy': calculateLabellingAccuracy(y_train, y_train),
        'training set class distribution':
            calculateClassDistribution(y_train).tolist(),
    }
    collection.insert_one(doc)

    clustering = clusterData(x_train_flattened)

    x_labelled, labels, labelled_indices, unlabelled_indices = \
        experiment2aLabelleingMethod(x_train, y_train, clustering, n_classes)

    result = runTrials(x_labelled, labels, x_test, y_test,
                       n_trials, n_classes,
                       data='mnist_experiment2a', epochs=epochs)
    print(result)

    # Evaluated once; the same value is printed and stored.
    labelling_accuracy = \
        calculateLabellingAccuracy(y_train[labelled_indices], labels)
    print(labelling_accuracy)

    doc = {
        'name': 'majority class label assignment',
        'm': labels.shape[0],
        'accuracy': result[0],
        'error': result[1],
        'trials accuracy': result[2],
        'labelling accuracy': labelling_accuracy,
        'training set class distribution':
            calculateClassDistribution(labels).tolist(),
    }
    collection.insert_one(doc)
#The MLP works better when the pixels are a value between 0.0f and 1.0f as oposed to 0 to 255? #Can it be because the expected output is in the range 0.0f to 1.0f? #Works best when normalized or input and output is in the same order of magnitude? def _image_to_array(image): return image.reshape(len(image) * len(image[0])).astype(np.float64) / 255.0 def _label_to_array(label): res = np.zeros(10).astype(np.float64) res[label] = 1.0 return res _labels_training, _images_training = utils.loadMNIST( "mnist//train-labels-idx1-ubyte", "MNIST//train-images-idx3-ubyte") _labels_test, _images_test = utils.loadMNIST("mnist//t10k-labels-idx1-ubyte", "MNIST//t10k-images-idx3-ubyte") datasets = [] set_mnist_training = DataSet("MNIST training set", 784, 10) datasets.append(set_mnist_training) for i in range(len(_labels_training)): set_mnist_training.entries.append( DataEntry(_image_to_array(_images_training[i]), _label_to_array(_labels_training[i]))) set_mnist_test = DataSet("MNIST test set", 784, 10) datasets.append(set_mnist_test) for i in range(len(_labels_test)):