Exemple #1
0
 def test_unknown_id(self):
     """Check that ``unknown_ids`` shrinks as labels are queried.

     Arrays are compared with ``np.array_equal`` so that a length mismatch
     fails the assertion cleanly instead of broadcasting or raising inside
     the built-in ``all``.
     """
     oy = ObstructedY(self.y)
     n_samples = self.y.shape[0]

     # Nothing has been queried yet, so every index must still be unknown.
     self.assertTrue(np.array_equal(oy.unknown_ids, np.arange(n_samples)))

     # After querying indices 0..49, indices 50..99 must remain unknown.
     oy.query(range(50))
     self.assertTrue(np.array_equal(oy.unknown_ids, np.arange(50, 100)))

     # After also querying indices 50..99, nothing may remain unknown.
     oy.query(range(50, 100))
     self.assertEqual(len(oy.unknown_ids), 0)
Exemple #2
0
    def test_speedup_greedy(self):
        """Check that the fast and slow quasi-greedy batch pickers agree.

        Both implementations receive the same data, labels, model and
        precomputed distance matrix. Their first outputs must be equal and
        their second outputs almost equal.
        """
        X = np.random.uniform(-1, 1, size=(1000, 2))

        # Label by the sign of the first feature: -1 where negative, else +1.
        Y = np.ones(X.shape[0])
        Y[np.where(X[:, 0] < 0)] = -1

        Y_obstructed = ObstructedY(Y)
        Y_obstructed.query(range(100))

        # The model is fitted on the full label vector, not the obstructed one.
        m = Perceptron(alpha=0, n_iter=100).fit(X, Y)

        dist = construct_normalized_euc(X)
        D = pairwise_distances(X, metric=dist)

        # Arguments common to both implementations.
        shared_kwargs = dict(rng=None, batch_size=20, D=D)
        fast = quasi_greedy_batch(X, Y_obstructed, m, **shared_kwargs)
        slow = quasi_greedy_batch_slow(X, Y_obstructed, m, dist=dist,
                                       **shared_kwargs)

        self.assertTrue(np.array_equal(fast[0], slow[0]))
        self.assertAlmostEqual(fast[1], slow[1])
Exemple #3
0
    def test_greedy_unc(self):
        """Check quasi-greedy with ``c=0.0`` picks the uncertainty-sampling batch.

        Two Gaussian blobs are generated, shuffled and partially labelled; a
        linear SVC is fitted on the known labels. The batch picked by
        ``quasi_greedy_batch_slow`` with ``c=0.0`` and
        ``base_strategy='uncertainty_sampling'`` must contain the same
        indices as the batch picked by ``uncertainty_sampling``.
        """
        mean_1 = np.array([-2, 0])
        mean_2 = np.array([2, 0])
        cov = np.array([[1, 0], [0, 1]])
        X_1 = np.random.multivariate_normal(mean_1, cov, 100)
        X_2 = np.random.multivariate_normal(mean_2, cov, 200)
        X = np.vstack([X_1, X_2])

        # Rows [0, len(X_1)) come from the first blob (+1), the rest from the
        # second (-1). The boundary is taken from X_1 itself so that the first
        # row of X_2 is not left labelled +1.
        y = np.ones(X.shape[0])
        y[X_1.shape[0]:] = -1

        # shuffle data
        p = np.random.permutation(X.shape[0])
        X = X[p]
        y = y[p]

        # Reveal 50 randomly drawn labels (drawn with replacement, so the
        # number of distinct known labels may be smaller).
        y = ObstructedY(y)
        y.query(np.random.randint(0, X.shape[0], 50))

        model = SVC(C=1, kernel='linear')
        model.fit(X[y.known], y[y.known])

        picked, _ = quasi_greedy_batch_slow(
            X,
            y,
            current_model=model,
            c=0.0,
            rng=self.rng,
            batch_size=10,
            dist='cosine_distance_normalized',
            base_strategy='uncertainty_sampling')
        unc_pick, _ = uncertainty_sampling(X,
                                           y,
                                           model,
                                           batch_size=10,
                                           rng=self.rng)

        self.assertTrue(set(picked) == set(unc_pick))
Exemple #4
0
    def setUp(self):
        """Build the shared fixture: dummy models, 1-D inputs and labels."""
        self.batch_size = 3
        self.rng = np.random.RandomState(666)

        self.decision_model = DecisionDummy()
        self.prob_model = ProbDummy()

        # 20 evenly spaced values in [0.6, 1], shaped as a single column.
        self.X = np.linspace(0.6, 1, 20).reshape(-1, 1)

        # Start with all labels at +1, then set 15 randomly drawn positions
        # to -1. The positions come from the global NumPy RNG and are drawn
        # with replacement, so fewer than 15 distinct labels may be flipped.
        labels = np.ones(self.X.shape[0])
        flipped = np.random.randint(0, 20, 15)
        labels[flipped] = -1

        self.y = ObstructedY(labels)
Exemple #5
0
    def test_element_access(self):
        """Check that ``query`` and indexing both return the raw label values."""
        labels = self.y
        oy = ObstructedY(labels)

        # Single integer index.
        self.assertEqual(oy.query(42), labels[42])
        self.assertEqual(oy[42], labels[42])

        # A Python list of indices, then a NumPy array of indices.
        index_cases = (
            ([6, 66], [6, 66]),
            (np.array([1, 2]), [1, 2]),
        )
        for ids, expected_ids in index_cases:
            self.assertTrue(all(oy.query(ids) == labels[expected_ids]))
            self.assertTrue(all(oy[ids] == labels[expected_ids]))

        # A slice covering indices that were queried beforehand.
        oy.query([3, 4, 5, 6])
        self.assertTrue(all(oy[3:7] == labels[3:7]))
Exemple #6
0
    def test_qbc(self):
        """Check query-by-bagging picks points close to the decision boundary.

        Two Gaussian blobs are generated, shuffled and partially labelled; a
        linear SVC is fitted on the known labels. The mean absolute decision
        value of the picked samples must be smaller than that of the samples
        that were not picked.
        """
        mean_1 = np.array([-2, 0])
        mean_2 = np.array([2, 0])
        cov = np.array([[1, 0], [0, 1]])
        X_1 = np.random.multivariate_normal(mean_1, cov, 100)
        X_2 = np.random.multivariate_normal(mean_2, cov, 200)
        X = np.vstack([X_1, X_2])

        # Rows [0, len(X_1)) come from the first blob (+1), the rest from the
        # second (-1). The boundary is taken from X_1 itself so that the first
        # row of X_2 is not left labelled +1.
        y = np.ones(X.shape[0])
        y[X_1.shape[0]:] = -1

        # shuffle data
        p = np.random.permutation(X.shape[0])
        X = X[p]
        y = y[p]

        # Reveal 50 randomly drawn labels (drawn with replacement, so the
        # number of distinct known labels may be smaller).
        y = ObstructedY(y)
        y.query(np.random.randint(0, X.shape[0], 50))

        model = SVC(C=1, kernel='linear')
        model.fit(X[y.known], y[y.known])

        pick, _ = query_by_bagging(X,
                                   y,
                                   current_model=None,
                                   base_model=model,
                                   batch_size=50,
                                   rng=self.rng,
                                   n_bags=5,
                                   method='entropy')
        mean_picked_dist = np.abs(model.decision_function(X[pick])).mean()

        # Build the lookup set once instead of once per candidate index.
        # ``range`` works on both Python 2 and 3 and is cheap at this size.
        picked = set(pick)
        not_picked = [i for i in range(X.shape[0]) if i not in picked]
        mean_unpicked_dist = np.abs(model.decision_function(
            X[not_picked])).mean()

        self.assertTrue(mean_picked_dist < mean_unpicked_dist)
Exemple #7
0
 def test_bad_access_slice(self):
     """Slice a freshly built ``ObstructedY`` without querying anything first.

     NOTE(review): presumably this access is expected to raise; any
     expected-exception decorator lies outside this snippet -- confirm.
     """
     unqueried = ObstructedY(self.y)
     unqueried[6:66]
Exemple #8
0
 def test_bad_access_single(self):
     """Read one element of a freshly built ``ObstructedY`` without querying.

     NOTE(review): presumably this access is expected to raise; any
     expected-exception decorator lies outside this snippet -- confirm.
     """
     unqueried = ObstructedY(self.y)
     unqueried[42]
Exemple #9
0
 def test_nad_index_list(self):
     """Index a freshly built ``ObstructedY`` with the list ``[666, 777]``.

     NOTE(review): these indices are presumably out of range and the access
     is presumably expected to raise; any expected-exception decorator lies
     outside this snippet -- confirm.
     """
     unqueried = ObstructedY(self.y)
     unqueried[[666, 777]]
Exemple #10
0
 def test_nad_index_slice(self):
     """Index a freshly built ``ObstructedY`` with the slice ``666:777``.

     NOTE(review): this range is presumably out of bounds and the access is
     presumably expected to raise; any expected-exception decorator lies
     outside this snippet -- confirm.
     """
     unqueried = ObstructedY(self.y)
     unqueried[666:777]
Exemple #11
0
 def test_nad_index_single(self):
     """Index a freshly built ``ObstructedY`` with the single index ``666``.

     NOTE(review): this index is presumably out of range and the access is
     presumably expected to raise; any expected-exception decorator lies
     outside this snippet -- confirm.
     """
     unqueried = ObstructedY(self.y)
     unqueried[666]
Exemple #12
0
 def test_full_query(self):
     """Check that querying indices 0..99 reveals the complete label vector."""
     labels = self.y
     oy = ObstructedY(labels)

     # The query result itself must match the raw labels ...
     revealed = oy.query(range(100))
     self.assertTrue(all(revealed == labels))

     # ... and afterwards a full slice must too, with every entry known.
     self.assertTrue(all(oy[:] == labels))
     self.assertTrue(all(oy.known))
Exemple #13
0
    def test_constructor(self):
        """Check the initial state: labels are stored and none is known."""
        labels = self.y
        oy = ObstructedY(labels)

        # The wrapped labels are kept in the private ``_y`` attribute.
        self.assertTrue(all(oy._y == labels))

        # No entry is flagged as known, so masking with it selects nothing.
        self.assertFalse(any(oy.known))
        self.assertEqual(len(labels[oy.known]), 0)
Exemple #14
0
 def test_bad_access_list(self):
     """Index a freshly built ``ObstructedY`` with ``[6, 66]`` without querying.

     NOTE(review): presumably this access is expected to raise; any
     expected-exception decorator lies outside this snippet -- confirm.
     """
     unqueried = ObstructedY(self.y)
     unqueried[[6, 66]]
Exemple #15
0
 def test_peeking(self):
     """Call ``query`` with an empty index list on a fresh ``ObstructedY``.

     NOTE(review): the expected outcome (for example a raised exception) is
     not visible in this snippet -- confirm against the test's decorator.
     """
     fresh = ObstructedY(self.y)
     fresh.query([])
Exemple #16
0
def fit_AL_on_folds(model_cls, base_model_cls, base_model_kwargs, projector_cls,
                    folds, base_seed=1, warm_start_percentage=0, id_folds=-1,
                    logger=main_logger):
    """Fit an active-learning model on each requested fold and collect results.

    For every fold a model is built with seed ``base_seed + fold_index``,
    warm-started by querying a random subset of labels, fitted, and scored
    on the fold's training and validation data.

    Parameters
    ----------
    model_cls : callable
        Called as ``model_cls(random_state=..., base_model_cls=...)``. The
        result must provide ``fit``, ``predict`` and a ``monitors`` mapping.
    base_model_cls : callable
        Base model factory. It is bound to the fold seed and to
        ``base_model_kwargs``; if its ``__name__`` contains ``"EEM"``,
        ``"TWELM"`` or ``"RandomNB"`` a projector is bound as well.
    base_model_kwargs : dict
        Extra keyword arguments bound to ``base_model_cls``.
    projector_cls : callable
        Called as ``projector_cls(rng=..., X=...)`` for the base models
        named above.
    folds : sequence or mapping
        ``folds[i]`` must provide ``'X_train'``, ``'Y_train'``, ``'X_valid'``
        and ``'Y_valid'``. Each entry is indexed with ``"data"``; the X
        entries may also contain ``"cluster_A"`` / ``"cluster_B"`` index
        collections.
    base_seed : int
        Added to the fold index to seed the RNG, base model, projector and
        model of that fold.
    warm_start_percentage : float
        Multiplied directly by the warm-start pool size (so it acts as a
        fraction). At least 100 labels are always queried.
    id_folds : iterable of int, or -1
        Folds to process; ``-1`` means all of them.
    logger
        Object with ``info`` and ``warning`` methods.

    Returns
    -------
    metrics : defaultdict(list)
        Maps each metric name produced by ``binary_metrics`` to a list with
        one value per processed fold.
    monitors : list
        One entry per processed fold: a deep copy of ``model.monitors``
        extended with ``mean_<key>`` and ``auc_<key>`` for every key except
        ``'iter'``, plus ``'fold_time'`` in seconds.
    """
    metrics = defaultdict(list)
    monitors = []

    if id_folds == -1:
        id_folds = range(len(folds))

    cluster_names = ("cluster_A", "cluster_B")

    for i in id_folds:
        start_time = time.time()
        fold_seed = base_seed + i
        rng = np.random.RandomState(fold_seed)

        fold = folds[i]
        X = fold['X_train']
        y = fold['Y_train']["data"]
        y_obst = ObstructedY(y)

        X_valid = fold['X_valid']
        y_valid = fold['Y_valid']["data"]

        # Add fixed projection to models that accept projector
        base_model_cls_fold = partial(base_model_cls,
                                      random_state=fold_seed,
                                      **base_model_kwargs)
        base_model_name = base_model_cls.__name__
        if any(tag in base_model_name for tag in ("EEM", "TWELM", "RandomNB")):
            base_model_cls_fold = partial(base_model_cls_fold,
                                          projector=projector_cls(
                                              rng=fold_seed, X=X["data"]))
        elif hasattr(base_model_cls, "transform"):
            logger.warning(
                "base_model_cls has transform, but didn't fix projection")
        logger.info("Fitting fold on " + str(X["data"].shape))

        # Important to seed model based on fold, because part of strategies
        # might be independent of data
        model = model_cls(random_state=fold_seed,
                          base_model_cls=base_model_cls_fold)

        # Datasets passed to ``model.fit`` as ``test_error_datasets``: the
        # whole validation set, then per-cluster validation subsets, then
        # per-cluster training subsets.
        test_error_datasets = [("concept", (X_valid["data"], y_valid))]
        for cluster in cluster_names:
            if cluster in X_valid:
                ids = X_valid[cluster]
                test_error_datasets.append(
                    (cluster + "_concept", (X_valid["data"][ids],
                                            y_valid[ids])))
        if "cluster_A" in X:
            logger.info("cluster A training size: " + str(len(X["cluster_A"])))
        for cluster in cluster_names:
            if cluster in X:
                ids = X[cluster]
                test_error_datasets.append(
                    (cluster + "_unlabeled", (X["data"][ids], y[ids])))

        # Warm start: reveal a random subset of labels, drawn from cluster A
        # when it exists and from all training rows otherwise.
        if "cluster_A" in X:
            pool = X["cluster_A"]
            pool_size = len(pool)
        else:
            pool_size = X["data"].shape[0]
            pool = range(pool_size)
        warm_start_size = max(100, int(warm_start_percentage * pool_size))
        warm_start = rng.choice(pool, warm_start_size, replace=False)
        y_obst.query(warm_start)

        model.fit(X, y_obst, test_error_datasets=test_error_datasets)
        y_valid_pred = model.predict(X_valid["data"])
        y_pred = model.predict(X["data"])

        fold_metrics = chain(
            binary_metrics(y_valid, y_valid_pred, "valid").items(),
            binary_metrics(y, y_pred, "train").items())
        for metric_name, metric_value in fold_metrics:
            metrics[metric_name].append(metric_value)

        fold_monitors = copy.deepcopy(model.monitors)

        # Iterate over a snapshot because summary keys are added in the loop.
        # ``.items()`` is used instead of ``.iteritems()`` so this runs on
        # both Python 2 and Python 3.
        for key, values in dict(fold_monitors).items():
            if key == 'iter':
                continue
            assert isinstance(
                values,
                list), "monitor %s is not a list: %s" % (key, type(values))
            fold_monitors['mean_' + key] = np.mean(values)
            fold_monitors['auc_' + key] = auc(np.arange(len(values)),
                                              values)

        fold_monitors['fold_time'] = time.time() - start_time
        monitors.append(fold_monitors)

    return metrics, monitors
Exemple #17
0
# Reproducible toy problem: two unit-covariance Gaussian blobs, 100 points each.
np.random.seed(666)
mean_1 = np.array([-2, 0])
mean_2 = np.array([2, 0])
cov = np.array([[1, 0], [0, 1]])
X_1 = np.random.multivariate_normal(mean_1, cov, 100)
X_2 = np.random.multivariate_normal(mean_2, cov, 100)
X = np.vstack([X_1, X_2])

# Rows [0, len(X_1)) come from the first blob (+1), the rest from the second
# (-1). The boundary is taken from X_1 itself so that the first row of X_2 is
# not left labelled +1.
y = np.ones(X.shape[0])
y[X_1.shape[0]:] = -1

# shuffle data
p = np.random.permutation(X.shape[0])
X = X[p]
y = y[p]

# Reveal 50 randomly drawn labels (drawn with replacement, so the number of
# distinct known labels may be smaller).
y = ObstructedY(y)
y.query(np.random.randint(0, X.shape[0], 50))

model = SVC(C=1, kernel='linear', probability=True)
model.fit(X[y.known], y[y.known])

# NOTE(review): ``pick`` is used below as an iterable of selected indices. If
# query_by_bagging returns more than the indices, unpack the result here --
# confirm against its definition.
pick = query_by_bagging(X,
                        y,
                        model,
                        batch_size=20,
                        rng=np.random.RandomState(666),
                        n_bags=10,
                        method='KL')

# Build the lookup set once instead of once per candidate index. ``range``
# works on both Python 2 and 3 and is cheap at this size.
picked = set(pick)
not_picked = [i for i in range(X.shape[0]) if i not in picked]