Beispiel #1
0
    def test_speedup_greedy(self):
        """The fast quasi-greedy batch must agree with the slow reference one."""
        # Synthetic 2-class problem: label follows the sign of the first coordinate.
        data = np.random.uniform(-1, 1, size=(1000, 2))
        labels = np.ones(data.shape[0])
        labels[np.where(data[:, 0] < 0)] = -1

        obstructed = ObstructedY(labels)
        obstructed.query(range(100))
        model = Perceptron(alpha=0, n_iter=100).fit(data, labels)

        metric = construct_normalized_euc(data)
        distances = pairwise_distances(data, metric=metric)

        fast_result = quasi_greedy_batch(data, obstructed, model, rng=None,
                                         batch_size=20, D=distances)
        slow_result = quasi_greedy_batch_slow(data, obstructed, model, rng=None,
                                              batch_size=20, dist=metric,
                                              D=distances)

        # Identical picked indices and (almost) identical batch score.
        self.assertTrue(np.array_equal(fast_result[0], slow_result[0]))
        self.assertAlmostEqual(fast_result[1], slow_result[1])
Beispiel #2
0
    def test_greedy_unc(self):
        """With c=0.0 the quasi-greedy batch degenerates to uncertainty sampling."""
        # Two Gaussian blobs: 100 positive points and 200 negative points.
        mean_1 = np.array([-2, 0])
        mean_2 = np.array([2, 0])
        cov = np.array([[1, 0], [0, 1]])
        X_1 = np.random.multivariate_normal(mean_1, cov, 100)
        X_2 = np.random.multivariate_normal(mean_2, cov, 200)
        X = np.vstack([X_1, X_2])
        y = np.ones(X.shape[0])
        # FIX: the first blob occupies indices 0..99, so negatives start at 100.
        # The previous `y[101:] = -1` left sample 100 mislabeled as positive.
        y[X_1.shape[0]:] = -1

        # shuffle data
        p = np.random.permutation(X.shape[0])
        X = X[p]
        y = y[p]

        y = ObstructedY(y)
        y.query(np.random.randint(0, X.shape[0], 50))

        model = SVC(C=1, kernel='linear')
        model.fit(X[y.known], y[y.known])

        # c=0.0 removes the diversity (distance) term, so the picked batch must
        # match plain uncertainty sampling exactly.
        picked, _ = quasi_greedy_batch_slow(X, y, current_model=model, c=0.0,
                                            rng=self.rng, batch_size=10,
                                            dist='cosine_distance_normalized',
                                            base_strategy='uncertainty_sampling')
        unc_pick, _ = uncertainty_sampling(X,
                                           y,
                                           model,
                                           batch_size=10,
                                           rng=self.rng)

        self.assertTrue(set(picked) == set(unc_pick))
Beispiel #3
0
    def test_qbc(self):
        """QBC-picked points should lie closer to the decision boundary."""
        # Two Gaussian blobs: 100 positive points and 200 negative points.
        mean_1 = np.array([-2, 0])
        mean_2 = np.array([2, 0])
        cov = np.array([[1, 0], [0, 1]])
        X_1 = np.random.multivariate_normal(mean_1, cov, 100)
        X_2 = np.random.multivariate_normal(mean_2, cov, 200)
        X = np.vstack([X_1, X_2])
        y = np.ones(X.shape[0])
        # FIX: negatives start at index 100 (the first blob has 100 points);
        # the previous `y[101:] = -1` left sample 100 mislabeled as positive.
        y[X_1.shape[0]:] = -1

        # shuffle data
        p = np.random.permutation(X.shape[0])
        X = X[p]
        y = y[p]

        y = ObstructedY(y)
        y.query(np.random.randint(0, X.shape[0], 50))

        model = SVC(C=1, kernel='linear')
        model.fit(X[y.known], y[y.known])

        pick, _ = query_by_bagging(X, y, current_model=None, base_model=model,
                                   batch_size=50, rng=self.rng, n_bags=5, method='entropy')
        # Entropy-based committee disagreement should concentrate on the
        # uncertain region, i.e. smaller mean |decision_function| than the rest.
        mean_picked_dist = np.abs(model.decision_function(X[pick])).mean()

        not_picked = [i for i in xrange(X.shape[0]) if i not in set(pick)]
        mean_unpicked_dist = np.abs(model.decision_function(X[not_picked])).mean()

        self.assertTrue(mean_picked_dist < mean_unpicked_dist)
Beispiel #4
0
 def test_unknown_id(self):
     """unknown_ids shrinks as labels are revealed and empties at full coverage."""
     oy = ObstructedY(self.y)
     total = self.y.shape[0]
     self.assertTrue(all(oy.unknown_ids == np.arange(total)))
     # Revealing the first half leaves exactly the second half unknown.
     oy.query(range(50))
     self.assertTrue(all(oy.unknown_ids == np.arange(50, 100)))
     oy.query(range(50, 100))
     self.assertEqual(len(oy.unknown_ids), 0)
Beispiel #5
0
 def test_unknown_id(self):
     """Queried indices disappear from unknown_ids until none remain."""
     oy = ObstructedY(self.y)
     self.assertTrue(all(oy.unknown_ids == np.arange(self.y.shape[0])))
     first_half = range(50)
     oy.query(first_half)
     # The remaining unknowns are exactly indices 50..99.
     self.assertTrue(all(oy.unknown_ids == np.arange(50) + 50))
     oy.query(range(50, 100))
     self.assertTrue(len(oy.unknown_ids) == 0)
Beispiel #6
0
    def test_greedy_unc(self):
        """With c=0.0 the quasi-greedy criterion reduces to uncertainty sampling."""
        # 100 positive and 200 negative Gaussian samples.
        mean_1 = np.array([-2, 0])
        mean_2 = np.array([2, 0])
        cov = np.array([[1, 0], [0, 1]])
        X_1 = np.random.multivariate_normal(mean_1, cov, 100)
        X_2 = np.random.multivariate_normal(mean_2, cov, 200)
        X = np.vstack([X_1, X_2])
        y = np.ones(X.shape[0])
        # FIX: negatives start at index 100 (first blob size); the previous
        # `y[101:] = -1` left sample 100 mislabeled as positive.
        y[X_1.shape[0]:] = -1

        # shuffle data
        p = np.random.permutation(X.shape[0])
        X = X[p]
        y = y[p]

        y = ObstructedY(y)
        y.query(np.random.randint(0, X.shape[0], 50))

        model = SVC(C=1, kernel='linear')
        model.fit(X[y.known], y[y.known])

        # With no diversity weight the quasi-greedy picks must equal the plain
        # uncertainty-sampling picks.
        picked, _ = quasi_greedy_batch_slow(X, y, current_model=model, c=0.0,
                                            rng=self.rng, batch_size=10,
                                            dist='cosine_distance_normalized',
                                            base_strategy='uncertainty_sampling')
        unc_pick, _ = uncertainty_sampling(X, y, model, batch_size=10, rng=self.rng)

        self.assertTrue(set(picked) == set(unc_pick))
Beispiel #7
0
    def test_element_access(self):
        """Label access via query() and via indexing must agree with ground truth."""
        oy = ObstructedY(self.y)

        # Scalar index.
        self.assertEqual(oy.query(42), self.y[42])
        self.assertEqual(oy[42], self.y[42])

        # Python list of indices.
        pair = [6, 66]
        self.assertTrue(all(oy.query(pair) == self.y[pair]))
        self.assertTrue(all(oy[pair] == self.y[pair]))

        # NumPy array of indices.
        arr = np.array([1, 2])
        self.assertTrue(all(oy.query(arr) == self.y[arr]))
        self.assertTrue(all(oy[arr] == self.y[arr]))

        # Slicing works once the covered indices have been queried.
        oy.query([3, 4, 5, 6])
        self.assertTrue(all(oy[3:7] == self.y[3:7]))
Beispiel #8
0
    def test_element_access(self):
        """query() and __getitem__ expose identical, correct label values."""
        oy = ObstructedY(self.y)

        self.assertEqual(oy.query(42), self.y[42])
        self.assertEqual(oy[42], self.y[42])

        # Both list and ndarray index collections are supported.
        for idx in ([6, 66], np.array([1, 2])):
            self.assertTrue(all(oy.query(idx) == self.y[idx]))
            self.assertTrue(all(oy[idx] == self.y[idx]))

        # Slice access after the covered range has been revealed.
        oy.query([3, 4, 5, 6])
        self.assertTrue(all(oy[3:7] == self.y[3:7]))
Beispiel #9
0
    def test_speedup_greedy(self):
        """Optimized quasi-greedy batch matches the slow implementation."""
        X = np.random.uniform(-1, 1, size=(1000, 2))
        Y = np.ones(X.shape[0])
        # Negative class: points with a negative first coordinate.
        Y[np.where(X[:, 0] < 0)] = -1
        Y_obstructed = ObstructedY(Y)
        Y_obstructed.query(range(100))
        m = Perceptron(alpha=0, n_iter=100).fit(X, Y)

        dist = construct_normalized_euc(X)
        D = pairwise_distances(X, metric=dist)

        # Shared keyword arguments for both implementations.
        common = dict(rng=None, batch_size=20, D=D)
        r1 = quasi_greedy_batch(X, Y_obstructed, m, **common)
        r2 = quasi_greedy_batch_slow(X, Y_obstructed, m, dist=dist, **common)

        # Same batch of indices and the same batch utility.
        self.assertTrue(np.array_equal(r1[0], r2[0]))
        self.assertAlmostEqual(r1[1], r2[1])
Beispiel #10
0
    def test_qbc(self):
        """QBC-selected points should be nearer the decision boundary than the rest."""
        # Two Gaussian blobs: 100 positive points and 200 negative points.
        mean_1 = np.array([-2, 0])
        mean_2 = np.array([2, 0])
        cov = np.array([[1, 0], [0, 1]])
        X_1 = np.random.multivariate_normal(mean_1, cov, 100)
        X_2 = np.random.multivariate_normal(mean_2, cov, 200)
        X = np.vstack([X_1, X_2])
        y = np.ones(X.shape[0])
        # FIX: the first blob holds indices 0..99, so negatives begin at 100;
        # the previous `y[101:] = -1` left sample 100 wrongly labeled positive.
        y[X_1.shape[0]:] = -1

        # shuffle data
        p = np.random.permutation(X.shape[0])
        X = X[p]
        y = y[p]

        y = ObstructedY(y)
        y.query(np.random.randint(0, X.shape[0], 50))

        model = SVC(C=1, kernel='linear')
        model.fit(X[y.known], y[y.known])

        pick, _ = query_by_bagging(X,
                                   y,
                                   current_model=None,
                                   base_model=model,
                                   batch_size=50,
                                   rng=self.rng,
                                   n_bags=5,
                                   method='entropy')
        # Entropy-based committee disagreement should target the uncertain
        # region near the hyperplane, i.e. smaller mean |decision_function|.
        mean_picked_dist = np.abs(model.decision_function(X[pick])).mean()

        not_picked = [i for i in xrange(X.shape[0]) if i not in set(pick)]
        mean_unpicked_dist = np.abs(model.decision_function(
            X[not_picked])).mean()

        self.assertTrue(mean_picked_dist < mean_unpicked_dist)
Beispiel #11
0
# Demo script: query-by-bagging with KL divergence on two Gaussian blobs.
# Two blobs: 100 positive points around (-2, 0), 100 negative around (2, 0).
mean_1 = np.array([-2, 0])
mean_2 = np.array([2, 0])
cov = np.array([[1, 0], [0, 1]])
X_1 = np.random.multivariate_normal(mean_1, cov, 100)
X_2 = np.random.multivariate_normal(mean_2, cov, 100)
X = np.vstack([X_1, X_2])
y = np.ones(X.shape[0])
# FIX: the first blob has exactly 100 points, so negatives start at index 100;
# the previous `y[101:] = -1` left sample 100 mislabeled as positive.
y[X_1.shape[0]:] = -1

# shuffle data
p = np.random.permutation(X.shape[0])
X = X[p]
y = y[p]

# Hide the labels and reveal a random starting subset of 50.
y = ObstructedY(y)
y.query(np.random.randint(0, X.shape[0], 50))

# probability=True is required for KL-based committee disagreement.
model = SVC(C=1, kernel='linear', probability=True)
model.fit(X[y.known], y[y.known])

pick = query_by_bagging(X,
                        y,
                        model,
                        batch_size=20,
                        rng=np.random.RandomState(666),
                        n_bags=10,
                        method='KL')

not_picked = [i for i in xrange(X.shape[0]) if i not in set(pick)]

y_plot = y._y
Beispiel #12
0
 def test_peeking(self):
     """Querying with an empty index list must succeed as a harmless no-op."""
     oy = ObstructedY(self.y)
     oy.query([])
Beispiel #13
0
 def test_full_query(self):
     """Querying the whole index range reveals every true label."""
     oy = ObstructedY(self.y)
     revealed = oy.query(range(100))
     self.assertTrue(all(revealed == self.y))
     # Full slice and the known mask both reflect complete coverage.
     self.assertTrue(all(oy[:] == self.y))
     self.assertTrue(all(oy.known))
Beispiel #14
0
def fit_AL_on_folds(model_cls,  base_model_cls, base_model_kwargs, projector_cls, \
                    folds, base_seed=1, warm_start_percentage=0, id_folds=-1, logger=main_logger):
    """Run an active-learning experiment over pre-computed CV folds.

    For each selected fold: seed a per-fold RNG, build the base classifier
    (fixing a random projection for model families that accept one), reveal a
    warm-start subset of labels, fit the AL strategy, and collect train/valid
    binary metrics plus per-fold monitor summaries.

    Parameters
    ----------
    model_cls : callable
        Active-learning strategy; called with random_state and base_model_cls.
    base_model_cls : class
        Underlying classifier class.
    base_model_kwargs : dict
        Extra keyword arguments forwarded to base_model_cls.
    projector_cls : callable
        Projection factory attached to EEM/TWELM/RandomNB base models.
    folds : list of dict
        Each fold provides 'X_train', 'Y_train', 'X_valid', 'Y_valid'
        (the Y entries and X entries are dicts with a "data" key, and
        optionally "cluster_A"/"cluster_B" index sets).
    base_seed : int
        Seed offset; fold i uses base_seed + i for reproducibility.
    warm_start_percentage : float
        Fraction of points whose labels are revealed up front
        (at least 100 labels are always revealed).
    id_folds : list or int
        Fold indices to run; -1 (default) runs all folds.
    logger : logging.Logger
        Destination for progress messages.

    Returns
    -------
    (metrics, monitors)
        metrics: defaultdict(list) mapping metric name -> per-fold values.
        monitors: list of per-fold monitor dicts, each curve summarized by
        its mean and AUC, plus the fold wall-clock time.
    """
    metrics = defaultdict(list)
    monitors = []

    if id_folds == -1:
        id_folds = range(len(folds))

    for i in id_folds:

        start_time = time.time()
        # Per-fold RNG: warm starts differ between folds but are reproducible.
        rng = np.random.RandomState(base_seed+i)

        X = folds[i]['X_train']
        y = folds[i]['Y_train']["data"]
        y_obst = ObstructedY(y)

        X_valid = folds[i]['X_valid']
        y_valid = folds[i]['Y_valid']["data"]

        # Add fixed projection to models that accept projector
        base_model_cls_fold = partial(base_model_cls, random_state=base_seed+i, **base_model_kwargs)
        if "EEM" in base_model_cls.__name__ or "TWELM" in base_model_cls.__name__ or "RandomNB" in base_model_cls.__name__:
            base_model_cls_fold = partial(base_model_cls_fold, projector=projector_cls(rng=base_seed+i, X=X["data"]))
        elif hasattr(base_model_cls, "transform"):
            logger.warning("base_model_cls has transform, but didn't fix projection")
        logger.info("Fitting fold on "+str(X["data"].shape))

        # Important to seed model based on fold, because part of strategies might be independent of data
        model = model_cls(random_state=base_seed + i, base_model_cls=base_model_cls_fold)

        # Datasets on which test error is tracked during fitting; cluster-wise
        # views are added when the fold defines cluster index sets.
        test_error_datasets = [("concept", (X_valid["data"], y_valid))]

        if "cluster_A" in X_valid:
            test_error_datasets.append(("cluster_A_concept", (X_valid["data"][X_valid["cluster_A"]], y_valid[X_valid["cluster_A"]])))
        if "cluster_B" in X_valid:
            test_error_datasets.append(("cluster_B_concept", (X_valid["data"][X_valid["cluster_B"]], y_valid[X_valid["cluster_B"]])))
        if "cluster_A" in X:
            logger.info("cluster A training size: "+str(len(X["cluster_A"])))
            test_error_datasets.append(("cluster_A_unlabeled", (X["data"][X["cluster_A"]], y[X["cluster_A"]])))
        if "cluster_B" in X:
            test_error_datasets.append(("cluster_B_unlabeled", (X["data"][X["cluster_B"]], y[X["cluster_B"]])))

        # Warm start: reveal an initial batch of labels (>= 100) before fitting,
        # drawn from cluster A when the fold defines clusters.
        if "cluster_A" in X:
            warm_start_size = max(100, int(warm_start_percentage * len(X["cluster_A"])))
            warm_start = rng.choice(X["cluster_A"], warm_start_size, replace=False)
            y_obst.query(warm_start)
        else:
            warm_start_size = max(100, int(warm_start_percentage * X["data"].shape[0]))
            warm_start = rng.choice(range(X["data"].shape[0]), warm_start_size, replace=False)
            y_obst.query(warm_start)

        model.fit(X, y_obst, test_error_datasets=test_error_datasets)
        y_valid_pred = model.predict(X_valid["data"])
        y_pred = model.predict(X["data"])

        # Accumulate both validation and training metrics per fold.
        for metric_name, metric_value in chain(
                binary_metrics(y_valid, y_valid_pred, "valid").items(),
                binary_metrics(y, y_pred, "train").items()):

            metrics[metric_name].append(metric_value)

        fold_monitors = copy.deepcopy(model.monitors)

        # Summarize every monitored curve (except the iteration counter) by
        # its mean and the area under the curve over iterations.
        for key, values in dict(fold_monitors).iteritems():
            if key != 'iter':
                assert isinstance(values, list), "monitor %s is not a list: %s" % (key, type(values))
                fold_monitors['mean_' + key] = np.mean(values)
                fold_monitors['auc_' + key] = auc(np.arange(len(values)), values)

        fold_monitors['fold_time'] = time.time() - start_time
        monitors.append(fold_monitors)



    return metrics, monitors
Beispiel #15
0
def fit_AL_on_folds(model_cls,  base_model_cls, base_model_kwargs, projector_cls, \
                    folds, base_seed=1, warm_start_percentage=0, id_folds=-1, logger=main_logger):
    """Run an active-learning experiment over pre-computed CV folds.

    For each selected fold: seed a per-fold RNG, build the base classifier
    (fixing a random projection for model families that accept one), reveal a
    warm-start subset of labels, fit the AL strategy, and collect train/valid
    binary metrics plus per-fold monitor summaries (mean and AUC of every
    monitored curve, and fold wall-clock time).

    Parameters
    ----------
    model_cls : callable
        Active-learning strategy; called with random_state and base_model_cls.
    base_model_cls : class
        Underlying classifier class.
    base_model_kwargs : dict
        Extra keyword arguments forwarded to base_model_cls.
    projector_cls : callable
        Projection factory attached to EEM/TWELM/RandomNB base models.
    folds : list of dict
        Each fold provides 'X_train', 'Y_train', 'X_valid', 'Y_valid'
        (dicts carrying a "data" key and optional "cluster_A"/"cluster_B").
    base_seed : int
        Seed offset; fold i uses base_seed + i for reproducibility.
    warm_start_percentage : float
        Fraction of points whose labels are revealed up front
        (never fewer than 100 labels).
    id_folds : list or int
        Fold indices to run; -1 (default) runs all folds.
    logger : logging.Logger
        Destination for progress messages.

    Returns
    -------
    (metrics, monitors)
        metrics: defaultdict(list) mapping metric name -> per-fold values.
        monitors: list of per-fold monitor dicts.
    """
    metrics = defaultdict(list)
    monitors = []

    if id_folds == -1:
        id_folds = range(len(folds))

    for i in id_folds:

        start_time = time.time()
        # Per-fold RNG: warm starts differ between folds but are reproducible.
        rng = np.random.RandomState(base_seed + i)

        X = folds[i]['X_train']
        y = folds[i]['Y_train']["data"]
        y_obst = ObstructedY(y)

        X_valid = folds[i]['X_valid']
        y_valid = folds[i]['Y_valid']["data"]

        # Add fixed projection to models that accept projector
        base_model_cls_fold = partial(base_model_cls,
                                      random_state=base_seed + i,
                                      **base_model_kwargs)
        if "EEM" in base_model_cls.__name__ or "TWELM" in base_model_cls.__name__ or "RandomNB" in base_model_cls.__name__:
            base_model_cls_fold = partial(base_model_cls_fold,
                                          projector=projector_cls(
                                              rng=base_seed + i, X=X["data"]))
        elif hasattr(base_model_cls, "transform"):
            logger.warning(
                "base_model_cls has transform, but didn't fix projection")
        logger.info("Fitting fold on " + str(X["data"].shape))

        # Important to seed model based on fold, because part of strategies might be independent of data
        model = model_cls(random_state=base_seed + i,
                          base_model_cls=base_model_cls_fold)

        # Datasets on which test error is tracked during fitting; cluster-wise
        # views are added when the fold defines cluster index sets.
        test_error_datasets = [("concept", (X_valid["data"], y_valid))]

        if "cluster_A" in X_valid:
            test_error_datasets.append(
                ("cluster_A_concept", (X_valid["data"][X_valid["cluster_A"]],
                                       y_valid[X_valid["cluster_A"]])))
        if "cluster_B" in X_valid:
            test_error_datasets.append(
                ("cluster_B_concept", (X_valid["data"][X_valid["cluster_B"]],
                                       y_valid[X_valid["cluster_B"]])))
        if "cluster_A" in X:
            logger.info("cluster A training size: " + str(len(X["cluster_A"])))
            test_error_datasets.append(
                ("cluster_A_unlabeled", (X["data"][X["cluster_A"]],
                                         y[X["cluster_A"]])))
        if "cluster_B" in X:
            test_error_datasets.append(
                ("cluster_B_unlabeled", (X["data"][X["cluster_B"]],
                                         y[X["cluster_B"]])))

        # Warm start: reveal an initial batch of labels (>= 100) before
        # fitting, drawn from cluster A when the fold defines clusters.
        if "cluster_A" in X:
            warm_start_size = max(
                100, int(warm_start_percentage * len(X["cluster_A"])))
            warm_start = rng.choice(X["cluster_A"],
                                    warm_start_size,
                                    replace=False)
            y_obst.query(warm_start)
        else:
            warm_start_size = max(
                100, int(warm_start_percentage * X["data"].shape[0]))
            warm_start = rng.choice(range(X["data"].shape[0]),
                                    warm_start_size,
                                    replace=False)
            y_obst.query(warm_start)

        model.fit(X, y_obst, test_error_datasets=test_error_datasets)
        y_valid_pred = model.predict(X_valid["data"])
        y_pred = model.predict(X["data"])

        # Accumulate both validation and training metrics per fold.
        for metric_name, metric_value in chain(
                binary_metrics(y_valid, y_valid_pred, "valid").items(),
                binary_metrics(y, y_pred, "train").items()):

            metrics[metric_name].append(metric_value)

        fold_monitors = copy.deepcopy(model.monitors)

        # Summarize every monitored curve (except the iteration counter) by
        # its mean and the area under the curve over iterations.
        for key, values in dict(fold_monitors).iteritems():
            if key != 'iter':
                assert isinstance(
                    values,
                    list), "monitor %s is not a list: %s" % (key, type(values))
                fold_monitors['mean_' + key] = np.mean(values)
                fold_monitors['auc_' + key] = auc(np.arange(len(values)),
                                                  values)

        fold_monitors['fold_time'] = time.time() - start_time
        monitors.append(fold_monitors)

    return metrics, monitors
Beispiel #16
0
 def test_peeking(self):
     """An empty query must not raise and must not reveal any labels."""
     oy = ObstructedY(self.y)
     oy.query([])
Beispiel #17
0
 def test_full_query(self):
     """After a full query, slicing and the known mask show complete labels."""
     oy = ObstructedY(self.y)
     all_ids = range(100)
     self.assertTrue(all(oy.query(all_ids) == self.y))
     self.assertTrue(all(oy[:] == self.y))
     self.assertTrue(all(oy.known))
Beispiel #18
0
# Demo script: query-by-bagging (KL) on two Gaussian blobs, with a scatter
# plot highlighting the picked points among the still-unlabeled pool.
np.random.seed(666)
mean_1 = np.array([-2, 0])
mean_2 = np.array([2, 0])
cov = np.array([[1, 0], [0, 1]])
X_1 = np.random.multivariate_normal(mean_1, cov, 100)
X_2 = np.random.multivariate_normal(mean_2, cov, 100)
X = np.vstack([X_1, X_2])
y = np.ones(X.shape[0])
# FIX: the first blob has exactly 100 points, so negatives start at index 100;
# the previous `y[101:] = -1` left sample 100 mislabeled as positive.
y[X_1.shape[0]:] = -1

# shuffle data
p = np.random.permutation(X.shape[0])
X = X[p]
y = y[p]

# Hide the labels and reveal a random starting subset of 50.
y = ObstructedY(y)
y.query(np.random.randint(0, X.shape[0], 50))

# probability=True is required for KL-based committee disagreement.
model = SVC(C=1, kernel='linear', probability=True)
model.fit(X[y.known], y[y.known])

pick = query_by_bagging(X, y, model, batch_size=20, rng=np.random.RandomState(666), n_bags=10, method='KL')

not_picked = [i for i in xrange(X.shape[0]) if i not in set(pick)]

# Mark picked points with a third class value (2) so they stand out in the plot.
y_plot = y._y
y_plot[pick] = 2
plt.figure(figsize=(10,10))
plt.scatter(X[y.unknown_ids, 0], X[y.unknown_ids, 1], c=y_plot[y.unknown_ids], s=100, linewidths=0)
plt.ylim(-6,6)
plt.show()