def get_classifier_agreement_increase_table(target_weight_list, n_simulations=1000):
    agreement_before = np.zeros(n_simulations)
    agreement_after = np.zeros(n_simulations)
    annotations, labels = load_ambiguous_annotations_labeled(
        annotations_labeled_filename)
    result = ""
    for weight in target_weight_list:
        for i in xrange(n_simulations):
            # reload a fresh classifier for every run so simulations stay independent
            classifier = joblib.load(classifier_pickle_filename)
            pool_annotations, test_annotations, pool_labels, test_labels = train_test_split(
                annotations, labels, test_size=0.33)
            # validate the initial state of the classifier
            agreement_before[i] = get_agreement(
                classifier, (test_annotations, test_labels))
            # test: target-train on the entire pool, validate again
            classifier.target_weight = weight
            classifier.train_target_online(pool_annotations, pool_labels)
            agreement_after[i] = get_agreement(
                classifier, (test_annotations, test_labels))
        # append one summary row per weight; formatting explicitly avoids the
        # TypeError raised by concatenating a tuple to a string
        result += "%s\t%f\n" % (weight, np.mean(agreement_after - agreement_before))
    return result
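def _demo_agreement_increase_table():
    # Illustrative usage sketch only: it assumes the module-level
    # classifier_pickle_filename / annotations_labeled_filename paths point at
    # real files; the weight list below is arbitrary.
    print get_classifier_agreement_increase_table([1, 10, 100, 1000],
                                                  n_simulations=100)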
def get_mturk_pickled_classifier_agreement(classifier_pickle_file,
                                           mturk_vote_file_path,
                                           classifier_class, **kwargs):
    # classifier_class and **kwargs are accepted for interface compatibility
    # but unused: the classifier comes fully trained from the pickle
    classifier = joblib.load(classifier_pickle_file)
    mturk_labeled_data = data.load_ambiguous_annotations_labeled(
        mturk_vote_file_path)
    return get_agreement(classifier, mturk_labeled_data)
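def _demo_joblib_roundtrip():
    # Self-contained sketch of the pickling pattern this module relies on
    # (assumes scikit-learn's bundled joblib; the /tmp path is a placeholder).
    from sklearn.externals import joblib
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression().fit([[0.0], [1.0]], [0, 1])
    joblib.dump(clf, '/tmp/demo_classifier.pkl')
    restored = joblib.load('/tmp/demo_classifier.pkl')
    print restored.predict([[0.2]])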
def plot_learning_curves(classifier_pickle_filename, target_weight=1000,
                         n_simulations=100, test_size=0.33):
    classifier_loaded = joblib.load(
        CLASSIFIER_PICKLE_FOLDER + classifier_pickle_filename)
    annotations_loaded, labels_loaded = load_ambiguous_annotations_labeled(
        ANNOTATIONS_LABELED_FILENAME)
    pool, _, _, _ = train_test_split(
        annotations_loaded, labels_loaded, test_size=test_size)
    n_iterations = len(pool) + 1
    passive_accuracy = np.zeros((n_simulations, n_iterations))
    active_accuracy = np.zeros((n_simulations, n_iterations))
    counter = CountPrinter(n_simulations)
    for run_number in range(n_simulations):
        # securing statelessness: deep-copy so runs cannot leak state into each other
        classifier = deepcopy(classifier_loaded)
        annotations = deepcopy(annotations_loaded)
        labels = deepcopy(labels_loaded)
        train_test_set = train_test_split(annotations, labels, test_size=test_size)
        passive_accuracy[run_number] = get_accuracy_progression(
            train_test_set, classifier, annotations, labels, target_weight,
            PassiveLearner)
        active_accuracy[run_number] = get_accuracy_progression(
            train_test_set, classifier, annotations, labels, target_weight,
            UncertaintySamplingLeastConfidenceActiveLearner)
        counter.count()
    passive_avg_accuracy_progression = np.mean(passive_accuracy, axis=0)
    active_avg_accuracy_progression = np.mean(active_accuracy, axis=0)
    plot_filename = (PLOT_FOLDER + classifier_pickle_filename +
                     '_weight' + str(target_weight))
    plot_curves.plot_curves(
        plot_filename,
        title="Average iteration accuracy for %s simulations" % n_simulations,
        PassiveLearner=passive_avg_accuracy_progression,
        ActiveLearner=active_avg_accuracy_progression)
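def _demo_average_learning_curve():
    # Minimal sketch of the averaging step above: rows are simulation runs,
    # columns are iterations; the mean over axis 0 is the averaged curve.
    # (matplotlib here is an assumption for illustration; the repo's own
    # plotting lives in plot_curves.plot_curves)
    import numpy as np
    import matplotlib.pyplot as plt
    runs = np.array([[0.5, 0.6, 0.7],
                     [0.4, 0.7, 0.8]])
    avg_curve = np.mean(runs, axis=0)  # -> [0.45, 0.65, 0.75]
    plt.plot(avg_curve)
    plt.xlabel("iteration")
    plt.ylabel("mean accuracy")
    plt.savefig("/tmp/demo_curve.png")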
def crossvalidation(mturk_vote_file_path, classifier_class, n_folds=2,
                    verbose=False, **kwargs):
    # cross-validate a classifier on ambiguous annotations
    ambig_annotations, labels = data.load_ambiguous_annotations_labeled(
        mturk_vote_file_path)
    ambig_annotations = np.array(ambig_annotations)
    labels = np.array(labels)
    folds = cross_validation.KFold(len(ambig_annotations), n_folds=n_folds,
                                   indices=True)
    counter = CountPrinter(n_folds)
    fold_agreements = []
    for train_indices, test_indices in folds:
        if verbose:
            counter.count()
        classifier = classifier_class(**kwargs)
        classifier.train(ambig_annotations[train_indices], labels[train_indices])
        predicted_group_numbers = classifier.predict(
            ambig_annotations[test_indices])
        # compare predictions against the votes of the *test* fold only;
        # mapping over all labels would silently misalign the two lists
        voted_group_numbers = [data.Annotation.GROUP_MAPPING[label]
                               for label in labels[test_indices]]
        agreement = [int(predicted == voted)
                     for predicted, voted in zip(predicted_group_numbers,
                                                 voted_group_numbers)]
        fold_agreements.append(np.mean(agreement))
    return np.mean(fold_agreements)
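def _demo_kfold_split():
    # Self-contained sketch of the KFold indexing used above; assumes the
    # pre-0.18 scikit-learn API (sklearn.cross_validation) that this module
    # already imports.
    import numpy as np
    from sklearn import cross_validation
    X = np.array([10, 20, 30, 40])
    for train_idx, test_idx in cross_validation.KFold(len(X), n_folds=2):
        print X[train_idx], X[test_idx]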
def get_mturk_classifier_agreement(ssc_file_path, mturk_vote_file_path,
                                   classifier_class, **kwargs):
    # train a classifier on unambiguous annotations
    unambig_annotations = data.load_unambiguous_annotations(ssc_file_path)
    classifier = classifier_class(**kwargs)
    classifier.train(unambig_annotations)
    # read mturk annotations and score the classifier against the votes
    mturk_labeled_data = data.load_ambiguous_annotations_labeled(
        mturk_vote_file_path)
    return get_agreement(classifier, mturk_labeled_data)
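def _demo_unambiguous_vs_mturk(classifier_class):
    # Hypothetical usage sketch: both paths are placeholders, not files from
    # this codebase; pass any class exposing the train()/predict() interface
    # used throughout this module.
    score = get_mturk_classifier_agreement('../corpus.ssc',
                                           '../vote_results.csv',
                                           classifier_class)
    print "agreement on mturk votes: %.2f" % score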
def get_mturk_classifier_agreement_label(mturk_vote_file_path,
                                         classifier_class, **kwargs):
    # train a classifier on ambiguous annotations
    ambig_annotations, labels = data.load_ambiguous_annotations_labeled(
        mturk_vote_file_path)
    classifier = classifier_class(**kwargs)
    classifier.train(ambig_annotations, labels)
    # classify the same annotations and output the agreement with the votes
    predicted_group_numbers = classifier.predict(ambig_annotations)
    voted_group_numbers = [data.Annotation.GROUP_MAPPING[label]
                           for label in labels]
    agreement = [int(predicted == voted)
                 for predicted, voted in zip(predicted_group_numbers,
                                             voted_group_numbers)]
    return np.mean(agreement)
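def _demo_agreement_score():
    # Minimal sketch of the agreement metric above: the fraction of positions
    # where prediction and majority vote coincide.
    import numpy as np
    predicted = [0, 1, 1, 2]
    voted = [0, 1, 2, 2]
    print np.mean([int(p == v) for p, v in zip(predicted, voted)])  # 0.75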
def diff_iter(seq):
    # generator of successive differences: seq[1]-seq[0], seq[2]-seq[1], ...
    return (y - x for x, y in itertools.izip(
        itertools.islice(seq, 0, len(seq) - 1),
        itertools.islice(seq, 1, len(seq))))


def format_float_list(seq, sep=" "):
    result = ""
    for item in seq:
        result += "%.2f" % item
        result += sep
    return result


# classifier_pickle_filename, annotations_labeled_filename and the learner
# classes are assumed to be defined earlier in this module
classifier = joblib.load(classifier_pickle_filename)
annotations, labels = load_ambiguous_annotations_labeled(
    annotations_labeled_filename)

N_SIMULATIONS = 100
accuracy_diffs = np.zeros((2, N_SIMULATIONS))
accuracy_diff_gains = np.zeros(N_SIMULATIONS)
for i in range(N_SIMULATIONS):
    accuracy_progression_passive = get_accuracy_progression(
        classifier, annotations, labels, 1000, PassiveLearner)
    accuracy_diff_passive = (accuracy_progression_passive[-1] -
                             accuracy_progression_passive[0])
    accuracy_progression_active = get_accuracy_progression(
        classifier, annotations, labels, 1000,
        UncertaintySamplingLeastConfidenceActiveLearner)
    accuracy_diff_active = (accuracy_progression_active[-1] -
                            accuracy_progression_active[0])
    accuracy_diff_gains[i] = accuracy_diff_active - accuracy_diff_passive
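def _demo_diff_iter():
    # Quick sketch of the helpers above (lists only, since diff_iter calls
    # len() on its argument).
    print list(diff_iter([1, 3, 6, 10]))  # [2, 3, 4]
    print format_float_list([0.5, 0.25])  # "0.50 0.25 "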
#!/usr/bin/env python
from data import load_ambiguous_annotations_labeled
from mturk_classifier_agreement import get_agreement
from sklearn.externals.joblib import load, Parallel, delayed
from train_and_serialize import train_and_serialize
from copy import deepcopy
from sklearn.cross_validation import train_test_split
import numpy as np

MTURK_VOTE_FILE = '../vote_results_thr0.75-new6.csv'

annotations, labels = load_ambiguous_annotations_labeled(MTURK_VOTE_FILE)


def get_accuracy_gain(loaded_classifier):
    '''Gets the accuracy gain for a RANDOM train/test split of the data.'''
    classifier = deepcopy(loaded_classifier)
    pool_annotations, test_annotations, pool_labels, test_labels = train_test_split(
        annotations, labels, test_size=0.33)
    accuracy_before = get_agreement(classifier, (test_annotations, test_labels))
    classifier.train_target_online(pool_annotations, pool_labels)
    accuracy_after = get_agreement(classifier, (test_annotations, test_labels))
    return (accuracy_after, accuracy_before)


def get_mean_accuracy_gain(classifier_pickle_file, n_runs, **kwargs):
    loaded_classifier = load(classifier_pickle_file)
    # override classifier attributes (e.g. target_weight) from kwargs
    for k, v in kwargs.items():
        loaded_classifier.__dict__[k] = v
    accuracies_before = np.zeros(n_runs)
    # the original source breaks off here; the rest is a plausible
    # reconstruction that averages the per-run gains from get_accuracy_gain
    accuracies_after = np.zeros(n_runs)
    for i in range(n_runs):
        accuracies_after[i], accuracies_before[i] = get_accuracy_gain(
            loaded_classifier)
    return np.mean(accuracies_after - accuracies_before)
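def _demo_parallel_runs():
    # Sketch of the Parallel/delayed pattern the imports above suggest; it is
    # an assumption that the script's missing tail fanned runs out this way.
    from sklearn.externals.joblib import Parallel, delayed
    from math import sqrt
    results = Parallel(n_jobs=2)(delayed(sqrt)(x) for x in range(4))
    print results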