def test_n_queries(self):
    """Check `n_queries()` after querying annotator 0 with `query_value=3`."""
    oracle = StandardAnnot(self.X, self.Y, self.C)

    # ask only the first annotator for the labels of the first two samples
    selected_annotators = [0]
    oracle.class_labels(self.X[:2], selected_annotators, query_value=3)

    # annotator 0 must report 3, the untouched annotator 1 must report 0
    expected_counts = [3, 0]
    np.testing.assert_array_equal(expected_counts, oracle.n_queries())
def test_queried_samples(self):
    """Check `queried_samples()` after querying only annotator 0."""
    oracle = StandardAnnot(self.X, self.Y, self.C)

    # ask only the first annotator for the labels of the first two samples
    selected_annotators = [0]
    first_two = self.X[:2]
    oracle.class_labels(first_two, selected_annotators)

    # annotator 0 has seen exactly the two queried samples
    np.testing.assert_array_equal(first_two, oracle.queried_samples()[0])

    # annotator 1 was never asked: an empty array with two feature columns
    nothing_queried = np.array([]).reshape(0, 2)
    np.testing.assert_array_equal(nothing_queried,
                                  oracle.queried_samples()[1])
def test_class_labels(self):
    """Check `class_labels()` for known samples, unknown samples and a
    restricted set of annotators.

    The returned array is compared row-wise against `self.Y`, i.e. rows are
    samples and columns are annotators.
    """
    annotator = StandardAnnot(self.X, self.Y, self.C)

    # test querying class labels: known samples return their stored labels
    ids = [0, 2, 3]
    X = self.X[ids]
    Y = annotator.class_labels(X)
    np.testing.assert_array_equal(self.Y[ids], Y)

    # test querying class labels of missing samples: every entry is NaN
    X = np.array([[-1, -1], [-2, -3]])
    Y = annotator.class_labels(X)
    np.testing.assert_array_equal(
        np.array([[np.nan, np.nan], [np.nan, np.nan]]), Y)

    # test querying class labels of selected annotators: only column 0
    # (annotator 0) is filled, the column of the unselected annotator is NaN.
    # The second row belongs to sample 1, so its label is self.Y[1, 0]; the
    # previous expectation used the transposed index self.Y[0, 1], which is
    # the label of annotator 1 for sample 0.
    ids = [0]
    Y = annotator.class_labels(self.X[0:2], ids)
    np.testing.assert_array_equal(
        np.array([[self.Y[0, 0], np.nan], [self.Y[1, 0], np.nan]]), Y)
def run(results_path, data_set, query_strategy, budget, test_ratio, seed):
    """
    Run experiments to compare query selection strategies. Experimental results are stored in a .csv-file.

    Parameters
    ----------
    results_path: str
        Absolute path to store results.
    data_set: str
        Name of the data set. Names containing 'reports' use the cosine metric instead of a standardized
        RBF kernel.
    query_strategy: str
        Determines query strategy. One of 'ceal', 'alio', 'proactive', 'ie-adj-cost', 'ie-thresh', 'random' or a
        string containing 'mapal' of the form
        'mapal-<mean_prior>-<sum_prior|mean>-<m_max>-<alpha>-<weights_type>'.
    budget: int or float
        Number of label acquisitions. A value in (0, 1] is interpreted as a fraction of all available
        (training sample, annotator) pairs, a value > 1 as an absolute number. The value is clipped to the number of
        available pairs and additionally capped at 1000.
    test_ratio: float in (0, 1)
        Ratio of test samples.
    seed: int
        Random seed. It is advanced in steps of 1000 until the train and test split contain the same classes; the
        seed that produced the accepted split is used for all random states and for the name of the .csv-file.

    Raises
    ------
    ValueError
        If `budget` is not positive or `query_strategy` is unknown.
    """
    # --------------------------------------------- LOAD DATA ----------------------------------------------------------
    is_cosine = 'reports' in data_set
    X, y_true, y = load_data(data_set_name=data_set)
    n_features = np.size(X, axis=1)
    n_classes = len(np.unique(y))
    n_annotators = np.size(y, axis=1)
    print(data_set + ': ' + str(investigate_data_set(data_set)))

    # keep the budget as passed by the caller for the name of the .csv-file
    budget_str = str(budget)
    if budget > len(X) * n_annotators * (1 - test_ratio):
        # more acquisitions requested than (training sample, annotator) pairs exist
        budget = int(math.floor(len(X) * n_annotators * (1 - test_ratio)))
    elif budget > 1:
        budget = int(budget)
    elif 0 < budget <= 1:
        # relative budget: fraction of all (training sample, annotator) pairs
        budget = int(
            math.floor(len(X) * n_annotators * (1 - test_ratio) * budget))
    else:
        raise ValueError(
            "'budget' must be a float in (0, 1] or an integer in [0, n_samples]"
        )
    # hard upper limit on the number of acquisitions
    budget = np.min((budget, 1000))

    # --------------------------------------------- STATISTICS ---------------------------------------------------------
    # define storage for performances
    results = {}

    # define performance functions (0/1 cost matrix: every misclassification costs 1)
    C = 1 - np.eye(n_classes)
    perf_funcs = {
        'micro-misclf-rate':
        [partial(misclassification_costs, C=C, average='micro'), {}],
        'macro-misclf-rate':
        [partial(misclassification_costs, C=C, average='macro'), {}]
    }

    # ------------------------------------------- SPLIT DATA ----------------------------------------------------
    print('seed: {}'.format(str(seed)))
    X_train, X_test, y_true_train, y_true_test, y_train, y_test = train_test_split(
        X, y_true, y, test_size=test_ratio, random_state=seed)
    while not np.array_equal(np.unique(y_true_train), np.unique(y_true_test)):
        # Advance the seed BEFORE re-splitting: splitting again with the unchanged seed would only reproduce the
        # rejected split, and `seed` would end up 1000 ahead of the seed that actually produced the accepted split.
        # NOTE(review): there is no retry limit; the loop does not terminate if no split can satisfy the condition.
        seed += 1000
        print('new seed: {}'.format(seed))
        X_train, X_test, y_true_train, y_true_test, y_train, y_test = train_test_split(
            X, y_true, y, random_state=seed, test_size=test_ratio)
    n_samples = len(X_train)

    # --------------------------------------------- CSV NAMES ----------------------------------------------------------
    csv_name = '{}_{}_{}_{}_{}.csv'.format(data_set, query_strategy,
                                           budget_str, test_ratio, seed)

    # ------------------------------------------ PREPROCESS DATA -------------------------------------------------------
    if is_cosine:
        # cosine similarity is used on the raw features
        kwargs = {'metric': 'cosine'}
    else:
        # standardize data (statistics are estimated on the training split only)
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # compute bandwidth and derive the gamma of the RBF kernel
        bandwidth = estimate_bandwidth(n_samples=n_samples,
                                       n_features=n_features)
        print('bandwidth: {}'.format(str(bandwidth)))
        gamma = 0.5 * (bandwidth**(-2))
        kwargs = {'metric': 'rbf', 'gamma': gamma}

    # setup classifiers: `pwc_train` is handed to the query strategies, `pwc_test` is evaluated on the precomputed
    # similarity matrices
    pwc_train = PWC(n_classes=n_classes,
                    combine_labels=False,
                    random_state=seed,
                    **kwargs)
    S_train = pairwise_kernels(X_train, X_train, **kwargs)
    pwc_test = PWC(n_classes=n_classes,
                   metric='precomputed',
                   combine_labels=False,
                   probabilistic=False,
                   random_state=seed)
    S_test = pairwise_kernels(X_test, X_train, **kwargs)

    # set up data set (from here on `data_set` is the DataSet object, no longer the name)
    data_set = DataSet(X_train, n_annotators=n_annotators)
    annotators = StandardAnnot(X=X_train, Y=y_train)

    # create query strategy (from here on `query_strategy` is the strategy object, no longer the name)
    if query_strategy == 'ceal':
        query_strategy = CEAL(data_set=data_set,
                              n_classes=n_classes,
                              clf=pwc_train,
                              n_neighbors=10,
                              label_proportion=0.2 * budget / n_annotators,
                              random_state=seed,
                              **kwargs)
    elif query_strategy == 'alio':
        query_strategy = ALIO(data_set=data_set,
                              n_classes=n_classes,
                              clf=pwc_train,
                              label_proportion=0.2 * budget / n_annotators,
                              random_state=seed)
    elif query_strategy == 'proactive':
        query_strategy = Proactive(data_set=data_set,
                                   n_classes=n_classes,
                                   clf=pwc_train,
                                   n_components=20,
                                   label_proportion=0.2 * budget /
                                   n_annotators,
                                   random_state=seed)
    elif 'mapal' in query_strategy:
        # parameters are encoded in the name: mapal-<mean_prior>-<sum_prior|mean>-<m_max>-<alpha>-<weights_type>
        params = query_strategy.split('-')
        mean_prior = float(params[1])
        if params[2] == 'mean':
            # mean of the off-diagonal similarities; subtracting n_samples removes the diagonal
            # (assumes every self-similarity equals 1 -- TODO confirm for the chosen metric)
            sum_prior = (np.sum(S_train) - n_samples) / (n_samples**2 -
                                                         n_samples)
        else:
            sum_prior = float(params[2])
        prior = np.array([mean_prior, 1 - mean_prior])
        prior /= np.sum(prior)
        prior *= sum_prior
        print('prior = {}'.format(prior))
        m_max = int(params[3])
        alpha = float(params[4])
        weights_type = str(params[5])
        bam = BAM(n_classes=n_classes,
                  weights_type=weights_type,
                  prior=prior,
                  random_state=seed,
                  **kwargs)
        query_strategy = MAPAL(data_set=data_set,
                               m_max=m_max,
                               n_classes=n_classes,
                               S=S_train,
                               bam=bam,
                               alpha_x=alpha,
                               alpha_c=alpha,
                               random_state=seed)
    elif query_strategy == 'ie-adj-cost':
        query_strategy = IEAdjCost(data_set=data_set,
                                   clf=pwc_train,
                                   n_classes=n_classes,
                                   delta=0.4,
                                   lmbda=0.4,
                                   alpha=0.05,
                                   epsilon=0.8,
                                   random_state=seed)
    elif query_strategy == 'ie-thresh':
        query_strategy = IEThresh(data_set=data_set,
                                  clf=pwc_train,
                                  n_classes=n_classes,
                                  epsilon=0.8,
                                  alpha=0.05,
                                  random_state=seed)
    elif query_strategy == 'random':
        query_strategy = RS(data_set=data_set, random_state=seed)
    else:
        # the message lists exactly the strategies handled by the branches above
        raise ValueError(
            "query strategy must be in ['ceal', 'alio', 'proactive', 'mapal-...', 'ie-adj-cost', 'ie-thresh', "
            "'random']")

    # ----------------------------------------- ACTIVE LEARNING CYCLE --------------------------------------------------
    # `times` starts with a dummy 0 so that it has budget + 1 entries, matching the budget evaluations inside the
    # loop plus the final evaluation after it
    times = [0]
    for b in range(budget):
        print("budget: {}".format(b))

        # evaluate results before acquiring the next label
        eval_perfs(clf=pwc_test,
                   X_train=S_train,
                   y_train=y_true_train,
                   X_test=S_test,
                   y_test=y_true_test,
                   perf_results=results,
                   perf_funcs=perf_funcs)
        eval_annot_stats(y=data_set.y_, y_true=y_true_train, results=results)

        # select sample and annotator (only the query selection itself is timed)
        t = time()
        selection = query_strategy.make_query()
        times.append(time() - t)
        sample_id = selection[0, 0]
        annotator_id = [selection[0, 1]]
        print("selected sample: {}".format(sample_id))
        print("selected annotator: {}".format(annotator_id))

        # query selected annotator for labeling selected sample
        X_query = [X_train[sample_id]]
        y_query = annotators.class_labels(X_query, annotator_ids=annotator_id)
        print('class label: {}'.format(y_query[0, annotator_id[0]]))

        # update training data
        data_set.update_entries(sample_id, y_query)
        print(data_set.len_labeled(per_annotator=True))

        # retrain classifier
        pwc_test.fit(X=data_set.X_, y=data_set.y_, c=data_set.c_)

    # evaluate results after the last acquisition
    eval_perfs(clf=pwc_test,
               X_train=S_train,
               y_train=y_true_train,
               X_test=S_test,
               y_test=y_true_test,
               perf_results=results,
               perf_funcs=perf_funcs)
    eval_annot_stats(y=data_set.y_, y_true=y_true_train, results=results)

    # store performance results
    results['times'] = times
    df = pd.DataFrame(results)
    df.to_csv('{}/{}'.format(results_path, csv_name), index_label='index')