def test_speedup_greedy(self):
    """Fast and slow quasi-greedy batch implementations must agree.

    Both variants are run on the same data with the same precomputed
    distance matrix; they should pick identical indices and produce
    (numerically) identical scores.
    """
    X = np.random.uniform(-1, 1, size=(1000, 2))
    Y = np.ones(X.shape[0])
    # Label by the sign of the first coordinate.
    Y[np.where(X[:, 0] < 0)] = -1

    Y_obstructed = ObstructedY(Y)
    Y_obstructed.query(range(100))

    model = Perceptron(alpha=0, n_iter=100).fit(X, Y)
    dist = construct_normalized_euc(X)
    D = pairwise_distances(X, metric=dist)

    fast_result = quasi_greedy_batch(X, Y_obstructed, model, rng=None,
                                     batch_size=20, D=D)
    slow_result = quasi_greedy_batch_slow(X, Y_obstructed, model, rng=None,
                                          batch_size=20, dist=dist, D=D)

    # Same picked ids, same batch score.
    self.assertTrue(np.array_equal(fast_result[0], slow_result[0]))
    self.assertAlmostEqual(fast_result[1], slow_result[1])
def test_greedy_unc(self):
    """With c=0.0 quasi-greedy batch must reduce to uncertainty sampling.

    The distance term gets weight 0, so the picked set should equal the
    one chosen by plain ``uncertainty_sampling``.
    """
    mean_1 = np.array([-2, 0])
    mean_2 = np.array([2, 0])
    cov = np.array([[1, 0], [0, 1]])
    X_1 = np.random.multivariate_normal(mean_1, cov, 100)
    X_2 = np.random.multivariate_normal(mean_2, cov, 200)
    X = np.vstack([X_1, X_2])
    y = np.ones(X.shape[0])
    # X_1 fills rows 0..99, so the negative class starts at index 100.
    # (The previous `y[101:]` off-by-one mislabeled the first X_2 sample.)
    y[100:] = -1

    # shuffle data
    p = np.random.permutation(X.shape[0])
    X = X[p]
    y = y[p]

    y = ObstructedY(y)
    y.query(np.random.randint(0, X.shape[0], 50))

    model = SVC(C=1, kernel='linear')
    model.fit(X[y.known], y[y.known])

    picked, _ = quasi_greedy_batch_slow(X, y, current_model=model, c=0.0,
                                        rng=self.rng, batch_size=10,
                                        dist='cosine_distance_normalized',
                                        base_strategy='uncertainty_sampling')
    unc_pick, _ = uncertainty_sampling(X, y, model, batch_size=10,
                                       rng=self.rng)

    self.assertTrue(set(picked) == set(unc_pick))
def test_qbc(self):
    """Query-by-bagging should prefer examples near the decision boundary.

    Picked examples must on average lie closer to the SVM hyperplane than
    the unpicked ones.
    """
    mean_1 = np.array([-2, 0])
    mean_2 = np.array([2, 0])
    cov = np.array([[1, 0], [0, 1]])
    X_1 = np.random.multivariate_normal(mean_1, cov, 100)
    X_2 = np.random.multivariate_normal(mean_2, cov, 200)
    X = np.vstack([X_1, X_2])
    y = np.ones(X.shape[0])
    # X_1 fills rows 0..99, so the negative class starts at index 100.
    # (The previous `y[101:]` off-by-one mislabeled the first X_2 sample.)
    y[100:] = -1

    # shuffle data
    p = np.random.permutation(X.shape[0])
    X = X[p]
    y = y[p]

    y = ObstructedY(y)
    y.query(np.random.randint(0, X.shape[0], 50))

    model = SVC(C=1, kernel='linear')
    model.fit(X[y.known], y[y.known])

    pick, _ = query_by_bagging(X, y, current_model=None, base_model=model,
                               batch_size=50, rng=self.rng, n_bags=5,
                               method='entropy')

    mean_picked_dist = np.abs(model.decision_function(X[pick])).mean()
    # Build the set once; evaluating `set(pick)` inside the comprehension
    # condition rebuilt it for every candidate index (O(n*k)).
    picked_set = set(pick)
    not_picked = [i for i in xrange(X.shape[0]) if i not in picked_set]
    mean_unpicked_dist = np.abs(model.decision_function(X[not_picked])).mean()

    self.assertTrue(mean_picked_dist < mean_unpicked_dist)
def test_unknown_id(self):
    """Querying labels must shrink `unknown_ids` accordingly."""
    oy = ObstructedY(self.y)
    # Initially every id is unknown.
    self.assertTrue(all(oy.unknown_ids == np.arange(self.y.shape[0])))
    # After querying the first half, only ids 50..99 remain unknown.
    oy.query(range(50))
    self.assertTrue(all(oy.unknown_ids == np.arange(50, 100)))
    # After querying the rest, nothing remains unknown.
    oy.query(range(50, 100))
    self.assertTrue(len(oy.unknown_ids) == 0)
def test_unknown_id(self):
    """`unknown_ids` tracks exactly the ids that were never queried."""
    obstructed = ObstructedY(self.y)
    total = self.y.shape[0]
    # Nothing queried yet: every id is unknown.
    self.assertTrue(all(obstructed.unknown_ids == np.arange(total)))
    # Reveal the first 50 labels; ids 50..99 stay unknown.
    obstructed.query(range(50))
    self.assertTrue(all(obstructed.unknown_ids == 50 + np.arange(50)))
    # Reveal the remainder; the unknown set is now empty.
    obstructed.query(range(50, 100))
    self.assertEqual(len(obstructed.unknown_ids), 0)
def test_element_access(self):
    """`query()` and `__getitem__` must both expose the underlying labels."""
    oy = ObstructedY(self.y)
    # Scalar access.
    self.assertEqual(oy.query(42), self.y[42])
    self.assertEqual(oy[42], self.y[42])
    # Access via a plain list of indices.
    idx = [6, 66]
    self.assertTrue(all(oy.query(idx) == self.y[idx]))
    self.assertTrue(all(oy[idx] == self.y[idx]))
    # Access via a numpy index array.
    arr = np.array([1, 2])
    self.assertTrue(all(oy.query(arr) == self.y[[1, 2]]))
    self.assertTrue(all(oy[arr] == self.y[[1, 2]]))
    # Slice access over ids that were already queried.
    oy.query([3, 4, 5, 6])
    self.assertTrue(all(oy[3:7] == self.y[3:7]))
def test_element_access(self):
    """Labels are reachable both through `query` and through indexing."""
    obstructed = ObstructedY(self.y)

    # Single-element access returns the underlying scalar label.
    self.assertEqual(obstructed.query(42), self.y[42])
    self.assertEqual(obstructed[42], self.y[42])

    # List indexing.
    self.assertTrue(all(obstructed.query([6, 66]) == self.y[[6, 66]]))
    self.assertTrue(all(obstructed[[6, 66]] == self.y[[6, 66]]))

    # ndarray indexing.
    self.assertTrue(all(obstructed.query(np.array([1, 2])) == self.y[[1, 2]]))
    self.assertTrue(all(obstructed[np.array([1, 2])] == self.y[[1, 2]]))

    # Slicing works once the ids in the slice have been queried.
    obstructed.query([3, 4, 5, 6])
    self.assertTrue(all(obstructed[3:7] == self.y[3:7]))
def test_qbc(self):
    """Query-by-bagging picks should be closer to the boundary on average."""
    mean_1 = np.array([-2, 0])
    mean_2 = np.array([2, 0])
    cov = np.array([[1, 0], [0, 1]])
    X_1 = np.random.multivariate_normal(mean_1, cov, 100)
    X_2 = np.random.multivariate_normal(mean_2, cov, 200)
    X = np.vstack([X_1, X_2])
    y = np.ones(X.shape[0])
    # Negatives are the X_2 rows, i.e. indices 100 and up.
    # (The previous `y[101:]` off-by-one mislabeled the first X_2 sample.)
    y[100:] = -1

    # shuffle data
    p = np.random.permutation(X.shape[0])
    X = X[p]
    y = y[p]

    y = ObstructedY(y)
    y.query(np.random.randint(0, X.shape[0], 50))

    model = SVC(C=1, kernel='linear')
    model.fit(X[y.known], y[y.known])

    pick, _ = query_by_bagging(X, y, current_model=None, base_model=model,
                               batch_size=50, rng=self.rng, n_bags=5,
                               method='entropy')

    mean_picked_dist = np.abs(model.decision_function(X[pick])).mean()
    # Hoisted: `set(pick)` used to be rebuilt on every iteration of the
    # comprehension condition.
    picked_set = set(pick)
    not_picked = [i for i in xrange(X.shape[0]) if i not in picked_set]
    mean_unpicked_dist = np.abs(
        model.decision_function(X[not_picked])).mean()

    self.assertTrue(mean_picked_dist < mean_unpicked_dist)
# Demo fragment: run query-by-bagging (KL criterion) on a two-Gaussian
# toy problem and collect the picked / unpicked indices.
mean_1 = np.array([-2, 0])
mean_2 = np.array([2, 0])
cov = np.array([[1, 0], [0, 1]])
X_1 = np.random.multivariate_normal(mean_1, cov, 100)
X_2 = np.random.multivariate_normal(mean_2, cov, 100)
X = np.vstack([X_1, X_2])
y = np.ones(X.shape[0])
# X_1 fills rows 0..99, so the negative class starts at index 100.
# (The previous `y[101:]` off-by-one left the first X_2 sample as +1.)
y[100:] = -1

# shuffle data
p = np.random.permutation(X.shape[0])
X = X[p]
y = y[p]

y = ObstructedY(y)
y.query(np.random.randint(0, X.shape[0], 50))

# probability=True is required by the KL disagreement criterion.
model = SVC(C=1, kernel='linear', probability=True)
model.fit(X[y.known], y[y.known])

pick = query_by_bagging(X, y, model, batch_size=20,
                        rng=np.random.RandomState(666), n_bags=10,
                        method='KL')
# Build the membership set once instead of per comprehension iteration.
picked_set = set(pick)
not_picked = [i for i in xrange(X.shape[0]) if i not in picked_set]
y_plot = y._y
def test_peeking(self):
    """Querying an empty id list must be a harmless no-op."""
    labels = ObstructedY(self.y)
    labels.query([])
def test_full_query(self):
    """After querying every id the view equals y and is fully known."""
    oy = ObstructedY(self.y)
    # The query itself returns all labels.
    revealed = oy.query(range(100))
    self.assertTrue(all(revealed == self.y))
    # Full-slice indexing now matches the underlying array...
    self.assertTrue(all(oy[:] == self.y))
    # ...and every id is marked as known.
    self.assertTrue(all(oy.known))
def fit_AL_on_folds(model_cls, base_model_cls, base_model_kwargs, projector_cls, \
                    folds, base_seed=1, warm_start_percentage=0, id_folds=-1, logger=main_logger):
    """Run an active-learning experiment over cross-validation folds.

    For each fold: build a per-fold seeded base model, optionally fix its
    projection, warm-start the label oracle, fit the AL model, and collect
    per-fold binary metrics plus monitor summaries.

    Parameters
    ----------
    model_cls : active-learning model class; called with random_state and
        base_model_cls keyword arguments.
    base_model_cls : underlying classifier class.
    base_model_kwargs : dict of extra kwargs for base_model_cls.
    projector_cls : projector factory used for EEM/TWELM/RandomNB models.
    folds : sequence of dicts with 'X_train'/'Y_train'/'X_valid'/'Y_valid';
        the X entries are dicts with at least a "data" key and optional
        "cluster_A"/"cluster_B" index arrays (assumed — confirm with caller).
    base_seed : int; per-fold seeds are base_seed + fold index.
    warm_start_percentage : fraction of the pool revealed before fitting
        (at least 100 examples are always revealed).
    id_folds : list of fold indices to run, or -1 for all folds.
    logger : logger used for progress and warnings.

    Returns
    -------
    (metrics, monitors) : dict of metric-name -> list over folds, and a
        list of per-fold monitor dicts (with mean_/auc_ summaries added).
    """
    metrics = defaultdict(list)
    monitors = []

    # -1 is the sentinel for "run every fold".
    if id_folds == -1:
        id_folds = range(len(folds))

    for i in id_folds:
        start_time = time.time()
        # Per-fold RNG so folds are reproducible independently.
        rng = np.random.RandomState(base_seed+i)

        X = folds[i]['X_train']
        y = folds[i]['Y_train']["data"]
        y_obst = ObstructedY(y)
        X_valid = folds[i]['X_valid']
        y_valid = folds[i]['Y_valid']["data"]

        # Add fixed projection to models that accept projector
        base_model_cls_fold = partial(base_model_cls, random_state=base_seed+i, **base_model_kwargs)
        if "EEM" in base_model_cls.__name__ or "TWELM" in base_model_cls.__name__ or "RandomNB" in base_model_cls.__name__:
            # These model families take a projector; freeze it per fold.
            base_model_cls_fold = partial(base_model_cls_fold,
                                          projector=projector_cls(rng=base_seed+i, X=X["data"]))
        elif hasattr(base_model_cls, "transform"):
            logger.warning("base_model_cls has transform, but didn't fix projection")

        logger.info("Fitting fold on "+str(X["data"].shape))

        # Important to seed model based on fold, because part of strategies might be independent of data
        model = model_cls(random_state=base_seed + i, base_model_cls=base_model_cls_fold)

        # Error is tracked on the validation concept and, when available,
        # on the per-cluster subsets of both validation and training pools.
        test_error_datasets = [("concept", (X_valid["data"], y_valid))]
        if "cluster_A" in X_valid:
            test_error_datasets.append(("cluster_A_concept",
                                        (X_valid["data"][X_valid["cluster_A"]], y_valid[X_valid["cluster_A"]])))
        if "cluster_B" in X_valid:
            test_error_datasets.append(("cluster_B_concept",
                                        (X_valid["data"][X_valid["cluster_B"]], y_valid[X_valid["cluster_B"]])))
        if "cluster_A" in X:
            logger.info("cluster A training size: "+str(len(X["cluster_A"])))
            test_error_datasets.append(("cluster_A_unlabeled",
                                        (X["data"][X["cluster_A"]], y[X["cluster_A"]])))
        if "cluster_B" in X:
            test_error_datasets.append(("cluster_B_unlabeled",
                                        (X["data"][X["cluster_B"]], y[X["cluster_B"]])))

        # Warm start: reveal an initial batch of labels before AL begins.
        # Drawn from cluster A when defined, otherwise from the whole pool;
        # always at least 100 examples.
        if "cluster_A" in X:
            warm_start_size = max(100, int(warm_start_percentage * len(X["cluster_A"])))
            warm_start = rng.choice(X["cluster_A"], warm_start_size, replace=False)
            y_obst.query(warm_start)
        else:
            warm_start_size = max(100, int(warm_start_percentage * X["data"].shape[0]))
            warm_start = rng.choice(range(X["data"].shape[0]), warm_start_size, replace=False)
            y_obst.query(warm_start)

        model.fit(X, y_obst, test_error_datasets=test_error_datasets)

        y_valid_pred = model.predict(X_valid["data"])
        y_pred = model.predict(X["data"])

        # Accumulate valid- and train-split metrics across folds.
        for metric_name, metric_value in chain(
                binary_metrics(y_valid, y_valid_pred, "valid").items(),
                binary_metrics(y, y_pred, "train").items()):
            metrics[metric_name].append(metric_value)

        # Summarize each per-iteration monitor list with its mean and AUC
        # over iterations (the 'iter' entry itself is skipped).
        fold_monitors = copy.deepcopy(model.monitors)
        for key, values in dict(fold_monitors).iteritems():
            if key != 'iter':
                assert isinstance(values, list), "monitor %s is not a list: %s" % (key, type(values))
                fold_monitors['mean_' + key] = np.mean(values)
                fold_monitors['auc_' + key] = auc(np.arange(len(values)), values)
        fold_monitors['fold_time'] = time.time() - start_time
        monitors.append(fold_monitors)

    return metrics, monitors
def fit_AL_on_folds(model_cls, base_model_cls, base_model_kwargs, projector_cls, \
                    folds, base_seed=1, warm_start_percentage=0, id_folds=-1, logger=main_logger):
    """Fit an active-learning model on each requested fold and score it.

    Each fold gets its own RNG and model seed (base_seed + fold index), an
    optional frozen projector for EEM/TWELM/RandomNB base models, a warm-start
    batch of revealed labels, and a set of tracked test-error datasets.

    Parameters
    ----------
    model_cls : AL model class accepting random_state and base_model_cls.
    base_model_cls : underlying classifier class.
    base_model_kwargs : dict of extra kwargs passed to base_model_cls.
    projector_cls : projector factory (called with rng and X).
    folds : sequence of dicts with 'X_train'/'Y_train'/'X_valid'/'Y_valid';
        X entries carry a "data" array and optional "cluster_A"/"cluster_B"
        index arrays (assumed from usage here — confirm with data loader).
    base_seed : int seed offset per fold.
    warm_start_percentage : fraction of pool labels revealed up front
        (never fewer than 100 examples).
    id_folds : fold indices to run; -1 means all.
    logger : progress/warning logger.

    Returns
    -------
    (metrics, monitors) : per-metric lists over folds, and per-fold monitor
        dicts augmented with mean_*/auc_* summaries and fold_time.
    """
    metrics = defaultdict(list)
    monitors = []

    # -1 acts as the "all folds" sentinel.
    if id_folds == -1:
        id_folds = range(len(folds))

    for i in id_folds:
        start_time = time.time()
        # Fold-local RNG keeps each fold reproducible on its own.
        rng = np.random.RandomState(base_seed + i)

        X = folds[i]['X_train']
        y = folds[i]['Y_train']["data"]
        y_obst = ObstructedY(y)
        X_valid = folds[i]['X_valid']
        y_valid = folds[i]['Y_valid']["data"]

        # Add fixed projection to models that accept projector
        base_model_cls_fold = partial(base_model_cls, random_state=base_seed + i,
                                      **base_model_kwargs)
        if "EEM" in base_model_cls.__name__ or "TWELM" in base_model_cls.__name__ or "RandomNB" in base_model_cls.__name__:
            # These families accept a projector argument; fix it per fold.
            base_model_cls_fold = partial(base_model_cls_fold, projector=projector_cls(
                rng=base_seed + i, X=X["data"]))
        elif hasattr(base_model_cls, "transform"):
            logger.warning(
                "base_model_cls has transform, but didn't fix projection")

        logger.info("Fitting fold on " + str(X["data"].shape))

        # Important to seed model based on fold, because part of strategies might be independent of data
        model = model_cls(random_state=base_seed + i,
                          base_model_cls=base_model_cls_fold)

        # Track error on the validation concept plus any cluster subsets
        # present in the validation and training pools.
        test_error_datasets = [("concept", (X_valid["data"], y_valid))]
        if "cluster_A" in X_valid:
            test_error_datasets.append(
                ("cluster_A_concept", (X_valid["data"][X_valid["cluster_A"]], y_valid[X_valid["cluster_A"]])))
        if "cluster_B" in X_valid:
            test_error_datasets.append(
                ("cluster_B_concept", (X_valid["data"][X_valid["cluster_B"]], y_valid[X_valid["cluster_B"]])))
        if "cluster_A" in X:
            logger.info("cluster A training size: " +
                        str(len(X["cluster_A"])))
            test_error_datasets.append(
                ("cluster_A_unlabeled", (X["data"][X["cluster_A"]], y[X["cluster_A"]])))
        if "cluster_B" in X:
            test_error_datasets.append(
                ("cluster_B_unlabeled", (X["data"][X["cluster_B"]], y[X["cluster_B"]])))

        # Warm start: reveal an initial label batch (>= 100 examples) drawn
        # from cluster A when it exists, otherwise from the whole pool.
        if "cluster_A" in X:
            warm_start_size = max(
                100, int(warm_start_percentage * len(X["cluster_A"])))
            warm_start = rng.choice(X["cluster_A"], warm_start_size, replace=False)
            y_obst.query(warm_start)
        else:
            warm_start_size = max(
                100, int(warm_start_percentage * X["data"].shape[0]))
            warm_start = rng.choice(range(X["data"].shape[0]),
                                    warm_start_size, replace=False)
            y_obst.query(warm_start)

        model.fit(X, y_obst, test_error_datasets=test_error_datasets)

        y_valid_pred = model.predict(X_valid["data"])
        y_pred = model.predict(X["data"])

        # Collect valid- and train-split metrics into the shared dict.
        for metric_name, metric_value in chain(
                binary_metrics(y_valid, y_valid_pred, "valid").items(),
                binary_metrics(y, y_pred, "train").items()):
            metrics[metric_name].append(metric_value)

        # Summarize each monitor list (except 'iter') with its mean and its
        # AUC over the iteration axis.
        fold_monitors = copy.deepcopy(model.monitors)
        for key, values in dict(fold_monitors).iteritems():
            if key != 'iter':
                assert isinstance(
                    values, list), "monitor %s is not a list: %s" % (key, type(values))
                fold_monitors['mean_' + key] = np.mean(values)
                fold_monitors['auc_' + key] = auc(np.arange(len(values)), values)
        fold_monitors['fold_time'] = time.time() - start_time
        monitors.append(fold_monitors)

    return metrics, monitors
# Demo script: run query-by-bagging (KL criterion) on a two-Gaussian toy
# problem and visualize which unlabeled points the strategy picked.
np.random.seed(666)
mean_1 = np.array([-2, 0])
mean_2 = np.array([2, 0])
cov = np.array([[1, 0], [0, 1]])
X_1 = np.random.multivariate_normal(mean_1, cov, 100)
X_2 = np.random.multivariate_normal(mean_2, cov, 100)
X = np.vstack([X_1, X_2])
y = np.ones(X.shape[0])
# X_1 fills rows 0..99, so the negative class starts at index 100.
# (The previous `y[101:]` off-by-one left the first X_2 sample as +1.)
y[100:] = -1

# shuffle data
p = np.random.permutation(X.shape[0])
X = X[p]
y = y[p]

y = ObstructedY(y)
y.query(np.random.randint(0, X.shape[0], 50))

# probability=True is required by the KL disagreement criterion.
model = SVC(C=1, kernel='linear', probability=True)
model.fit(X[y.known], y[y.known])

pick = query_by_bagging(X, y, model, batch_size=20,
                        rng=np.random.RandomState(666), n_bags=10,
                        method='KL')
# Build the membership set once instead of per comprehension iteration.
# NOTE(review): not_picked is currently unused by the plot below.
picked_set = set(pick)
not_picked = [i for i in xrange(X.shape[0]) if i not in picked_set]

# Mark picked examples with a third color value for the scatter plot.
y_plot = y._y
y_plot[pick] = 2
plt.figure(figsize=(10, 10))
plt.scatter(X[y.unknown_ids, 0], X[y.unknown_ids, 1],
            c=y_plot[y.unknown_ids], s=100, linewidths=0)
plt.ylim(-6, 6)
plt.show()