    def make_query(self):
        dataset = self.dataset
        labeled_pool, Y = dataset.get_labeled_entries()
        unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries()
        labeled_pool = np.array(labeled_pool)
        Y = np.array(Y)
        X_pool = np.array(X_pool)

        # Train one binary classifier per label on the labeled pool.
        br = BinaryRelevance(self.br_base)
        br.train(Dataset(labeled_pool, Y))

        trnf = br.predict_proba(labeled_pool)
        poolf = br.predict_proba(X_pool)
        # Map per-label probabilities in [0, 1] to signed confidences in [-1, 1].
        f = poolf * 2 - 1

        # Sort each row's label probabilities in descending order and
        # normalize them; these become features for predicting how many
        # labels an instance carries.
        trnf = np.sort(trnf, axis=1)[:, ::-1]
        trnf /= trnf.sum(axis=1, keepdims=True)
        if len(np.unique(Y.sum(axis=1))) == 1:
            # Every training instance has the same label count, so a
            # constant predictor suffices.
            lr = DummyClf()
        else:
            lr = self.logistic_regression_
        lr.train(Dataset(trnf, Y.sum(axis=1)))

        idx_poolf = np.argsort(poolf, axis=1)[:, ::-1]
        poolf = np.sort(poolf, axis=1)[:, ::-1]
        poolf /= poolf.sum(axis=1, keepdims=True)
        pred_num_lbl = lr.predict(poolf).astype(int)

        # Mark the predicted number of most-confident labels positive.
        yhat = -1 * np.ones((len(X_pool), self.n_labels), dtype=int)
        for i, p in enumerate(pred_num_lbl):
            yhat[i, idx_poolf[i, :p]] = 1

        # Expected Hamming-style loss between the predicted assignment and
        # the signed confidences; query the instance that maximizes it.
        score = ((1 - yhat * f) / 2).sum(axis=1)
        ask_id = self.random_state_.choice(np.where(score == np.max(score))[0])
        return unlabeled_entry_ids[ask_id]
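This method has the shape of libact's MMC (Maximum loss reduction with Maximal Confidence) multilabel strategy, which the tests further down exercise. A minimal sketch of the surrounding query loop, assuming libact's documented Dataset/IdealLabeler API; trn_ds, lbr, and quota are placeholders:

# Hedged sketch of the active-learning loop around make_query.
qs = MMC(trn_ds, random_state=1126)          # as instantiated in the tests below
for _ in range(quota):
    ask_id = qs.make_query()                 # strategy picks a pool index
    lb = lbr.label(trn_ds.data[ask_id][0])   # oracle reveals the true label
    trn_ds.update(ask_id, lb)                # entry moves to the labeled set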
def split_train_test(dataset_filepath, test_size, n_labeled):
    X, y = init_data(dataset_filepath)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size)
    # Reveal only the first n_labeled training labels; the remaining entries
    # are None, i.e. the unlabeled pool for active learning.
    trn_ds = Dataset(
        X_train,
        np.concatenate(
            [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(X_test, y_test)
    fully_labeled_trn_ds = Dataset(X_train, y_train)

    return trn_ds, tst_ds, y_train, fully_labeled_trn_ds
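A hedged usage sketch for the helper above; the file path is a placeholder and init_data is assumed to return a feature matrix and label vector:

# Hypothetical caller: split, build an ideal labeler, ask one query.
trn_ds, tst_ds, y_train, fully_labeled_trn_ds = split_train_test(
    'dataset.txt', test_size=0.33, n_labeled=10)
lbr = IdealLabeler(fully_labeled_trn_ds)   # oracle backed by the full labels
qs = UncertaintySampling(
    trn_ds, model=LogisticRegression(solver='liblinear'))
ask_id = qs.make_query()
trn_ds.update(ask_id, lbr.label(trn_ds.data[ask_id][0]))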
Example #3
    def check_functions(self, adapter, clf):
        adapter.train(Dataset(self.X_train, self.y_train))
        clf.fit(self.X_train, self.y_train)

        assert_array_equal(adapter.predict(self.X_train),
                           clf.predict(self.X_train))
        assert_array_equal(adapter.predict(self.X_test),
                           clf.predict(self.X_test))
        self.assertEqual(adapter.score(Dataset(self.X_train, self.y_train)),
                         clf.score(self.X_train, self.y_train))
        self.assertEqual(adapter.score(Dataset(self.X_test, self.y_test)),
                         clf.score(self.X_test, self.y_test))
    def test_svm(self):
        svc_clf = SVC(gamma="auto")
        svc_clf.fit(self.X_train, self.y_train)
        svm = SVM()
        svm.train(Dataset(self.X_train, self.y_train))

        assert_array_equal(svc_clf.predict(self.X_train),
                           svm.predict(self.X_train))
        assert_array_equal(svc_clf.predict(self.X_test),
                           svm.predict(self.X_test))
        self.assertEqual(svc_clf.score(self.X_train, self.y_train),
                         svm.score(Dataset(self.X_train, self.y_train)))
        self.assertEqual(svc_clf.score(self.X_test, self.y_test),
                         svm.score(Dataset(self.X_test, self.y_test)))
    def test_binary_relevance_parallel(self):
        br = BinaryRelevance(base_clf=LogisticRegression(solver='liblinear',
                                                         multi_class="ovr",
                                                         random_state=1126),
                             n_jobs=1)
        br.train(Dataset(self.X_train, self.Y_train))
        # Use the same base classifier configuration as the serial model so
        # the two predictions are directly comparable.
        br_par = BinaryRelevance(base_clf=LogisticRegression(
            solver='liblinear', multi_class="ovr", random_state=1126),
                                 n_jobs=2)
        br_par.train(Dataset(self.X_train, self.Y_train))

        assert_array_equal(
            br.predict(self.X_test).astype(int),
            br_par.predict(self.X_test).astype(int))
    def test_perceptron(self):
        clf = sklearn.linear_model.Perceptron()
        clf.fit(self.X_train, self.y_train)
        perceptron = Perceptron()
        perceptron.train(Dataset(self.X_train, self.y_train))

        assert_array_equal(clf.predict(self.X_train),
                           perceptron.predict(self.X_train))
        assert_array_equal(clf.predict(self.X_test),
                           perceptron.predict(self.X_test))
        self.assertEqual(clf.score(self.X_train, self.y_train),
                         perceptron.score(Dataset(self.X_train, self.y_train)))
        self.assertEqual(clf.score(self.X_test, self.y_test),
                         perceptron.score(Dataset(self.X_test, self.y_test)))
 def test_hs_report_entry_label(self):
     ds = Dataset(self.X, self.y)
     qs = HS(ds, self.classes, random_state=1126)
     y_report = []
     for i in range(len(self.y)):
         y_report.append(qs.report_entry_label(i))
     assert_array_equal(y_report, self.y)
    def make_query(self):
        dataset = self.dataset
        X, y = dataset.get_labeled_entries()
        unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries()

        classes = np.unique(y)
        n_classes = len(classes)

        self.model.train(dataset)
        proba = self.model.predict_proba(X_pool)

        scores = []
        for i, x in enumerate(X_pool):
            score = []
            # Labels are assumed to be encoded as 0..n_classes-1, matching
            # the column order of predict_proba.
            for yi in range(n_classes):
                m = copy.deepcopy(self.model)
                # np.append works whether y is a list or an ndarray;
                # `y + [yi]` would broadcast-add on an ndarray.
                m.train(Dataset(np.vstack((X, [x])), np.append(y, yi)))
                p = m.predict_proba(X_pool)

                if self.loss == '01':  # 0/1 loss
                    score.append(proba[i, yi] * np.sum(1 - np.max(p, axis=1)))
                elif self.loss == 'log':  # log loss
                    score.append(proba[i, yi] * -np.sum(p * np.log(p)))
            scores.append(np.sum(score))

        # Query the instance with the smallest expected loss after retraining.
        choices = np.where(np.array(scores) == np.min(scores))[0]
        ask_idx = self.random_state_.choice(choices)

        return unlabeled_entry_ids[ask_idx]
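The score sums, over the hypothesized labels yi, the model's current belief proba[i, yi] times the estimated pool risk after retraining with (x, yi); this is expected error reduction. A hedged one-query instantiation, mirroring the EER tests below:

# Hedged sketch: one query with this strategy (libact's EER, per the tests).
qs = EER(trn_ds, LogisticRegression(solver='liblinear', multi_class="ovr"),
         loss='01', random_state=1126)   # '01' or 'log', as handled above
ask_id = qs.make_query()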
Example #9
 def setUp(self):
     self.X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [0, 1],
               [0, -2], [1.5, 1.5], [-2, -2]]
     self.y = [-1, -1, -1, 1, 1, 1, -1, -1, 1, 1]
     self.quota = 4
     self.fully_labeled_trn_ds = Dataset(self.X, self.y)
     self.lbr = IdealLabeler(self.fully_labeled_trn_ds)
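The tests that follow drive a run_qs helper that is not shown in this excerpt. A minimal version consistent with how they call it (query, answer from the ground truth, update, record the asked id) could be:

def run_qs(trn_ds, qs, truth, quota):
    # Ask `quota` queries, labeling each from the fully known labels,
    # and return the sequence of queried ids for comparison.
    ret = []
    for _ in range(quota):
        ask_id = qs.make_query()
        trn_ds.update(ask_id, truth[ask_id])
        ret.append(ask_id)
    return np.array(ret)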
Example #10
 def test_HintSVM(self):
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:5], [None] * (len(self.y) - 5)]))
     qs = HintSVM(trn_ds, random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([24, 235, 228, 209, 18, 143, 119, 90, 149, 207]))
Example #11
 def test_DensityWeightedUncertaintySampling(self):
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = DWUS(trn_ds, random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([30, 179, 104, 186, 28, 65, 142, 62, 257, 221]))
Example #12
 def test_quire(self):
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:5], [None] * (len(self.y) - 5)]))
     qs = QUIRE(trn_ds)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([117, 175, 256, 64, 103, 118, 180, 159, 129, 235]))
Example #13
 def test_mmc(self):
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     qs = MMC(trn_ds, random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq,
         np.array([117, 655, 1350, 909, 1003, 1116, 546, 1055, 165, 1441]))
Example #14
 def test_RandomSampling(self):
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:5], [None] * (len(self.y) - 5)]))
     qs = RandomSampling(trn_ds, random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([150, 16, 122, 157, 233, 160, 114, 163, 155, 56]))
    def test_logistic_regression(self):
        clf = sklearn.linear_model.LogisticRegression(
            solver='liblinear', multi_class="ovr")
        clf.fit(self.X_train, self.y_train)
        lr = LogisticRegression(solver='liblinear', multi_class="ovr")
        lr.train(Dataset(self.X_train, self.y_train))

        assert_array_equal(
            clf.predict(self.X_train), lr.predict(self.X_train))
        assert_array_equal(
            clf.predict(self.X_test), lr.predict(self.X_test))
        self.assertEqual(
            clf.score(self.X_train, self.y_train),
            lr.score(Dataset(self.X_train, self.y_train)))
        self.assertEqual(
            clf.score(self.X_test, self.y_test),
            lr.score(Dataset(self.X_test, self.y_test)))
Example #16
    def check_proba(self, adapter, clf):
        adapter.train(Dataset(self.X_train, self.y_train))
        clf.fit(self.X_train, self.y_train)

        assert_array_equal(adapter.predict_proba(self.X_train),
                           clf.predict_proba(self.X_train))
        assert_array_equal(adapter.predict_real(self.X_train),
                           clf.predict_proba(self.X_train))
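A hedged example of how these helpers might be driven from a concrete test; SklearnProbaAdapter is libact's wrapper for probabilistic scikit-learn estimators, and the chosen classifier and test name are illustrative:

    def test_sklearn_proba_adapter(self):   # hypothetical caller
        from sklearn.linear_model import LogisticRegression as sk_LR
        from libact.models import SklearnProbaAdapter
        adapter = SklearnProbaAdapter(sk_LR(solver='liblinear'))
        self.check_functions(adapter, sk_LR(solver='liblinear'))
        self.check_proba(adapter, sk_LR(solver='liblinear'))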
 def test_alce_lr(self):
     cost_matrix = np.random.RandomState(1126).rand(3, 3)
     np.fill_diagonal(cost_matrix, 0)
     ds = Dataset(self.X + self.X_pool,
                  self.y[:3] + [None for _ in range(len(self.X_pool))])
     qs = ALCE(ds, cost_matrix, LinearRegression(), random_state=1126)
     qseq = run_qs(ds, qs, self.y_truth, self.quota)
     assert_array_equal(
         qseq, np.array([58, 118, 134, 43, 60, 139, 87, 78, 67, 146]))
 def test_eer(self):
     ds = Dataset(self.X + self.X_pool,
                  self.y[:3] + [None for _ in range(len(self.X_pool))])
     qs = EER(ds,
              LogisticRegression(solver='liblinear', multi_class="ovr"),
              random_state=1126)
     qseq = run_qs(ds, qs, self.y_truth, self.quota)
     assert_array_equal(
         qseq, np.array([131, 20, 129, 78, 22, 139, 88, 43, 141, 133]))
 def test_hs_random_selecting(self):
     ds = Dataset(self.X, self.y[:10] + [None] * (len(self.y) - 10))
     qs = HS(ds, self.classes, active_selecting=False, random_state=1126)
     qseq = run_qs(ds, qs, self.y, len(self.y) - 10)
     assert_array_equal(
         np.concatenate([qseq[:10], qseq[-10:]]),
         np.array([
             48, 143, 13, 142, 88, 130, 29, 87, 36, 28, 58, 137, 49, 105,
             76, 71, 63, 47, 64, 55
         ]))
 def test_hs_active_selecting(self):
     ds = Dataset(self.X, self.y[:10] + [None] * (len(self.y) - 10))
     qs = HS(ds, self.classes, active_selecting=True, random_state=1126)
     qseq = run_qs(ds, qs, self.y, len(self.y) - 10)
     assert_array_equal(
         np.concatenate([qseq[:10], qseq[-10:]]),
         np.array([
             48, 143, 13, 64, 101, 108, 51, 87, 36, 28, 43, 118, 47, 25, 81,
             82, 95, 40, 67, 120
         ]))
Example #21
 def test_adaptive_active_learning(self):
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     qs = AdaptiveActiveLearning(trn_ds,
                                 base_clf=LogisticRegression(
                                     solver='liblinear', multi_class="ovr"),
                                 n_jobs=-1,
                                 random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([594, 827, 1128, 419, 1223, 484, 96, 833, 37, 367]))
Example #22
 def test_binary_minimization(self):
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     qs = BinaryMinimization(trn_ds,
                             LogisticRegression(solver='liblinear',
                                                multi_class="ovr"),
                             random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([936, 924, 1211, 1286, 590, 429, 404, 962, 825,
                         30]))
 def test_eer_01(self):
     ds = Dataset(self.X + self.X_pool,
                  self.y[:3] + [None for _ in range(len(self.X_pool))])
     qs = EER(ds,
              LogisticRegression(solver='liblinear', multi_class="ovr"),
              loss='01',
              random_state=1126)
     qseq = run_qs(ds, qs, self.y_truth, self.quota)
     assert_array_equal(
         qseq, np.array([105, 16, 131, 117, 109, 148, 136, 115, 144, 121]))
Example #24
    def test_quire_mykernel(self):
        def my_kernel(X, Y):
            return np.dot(X, Y.T)

        np.random.seed(1126)
        trn_ds = Dataset(
            self.X, np.concatenate([self.y[:5], [None] * (len(self.y) - 5)]))
        qs = QUIRE(trn_ds, kernel=my_kernel)
        qseq = run_qs(trn_ds, qs, self.y, self.quota)
        assert_array_equal(
            qseq, np.array([9, 227, 176, 110, 52, 117, 228, 205, 103, 175]))
Example #25
 def test_UncertaintySamplingEntropy(self):
     random.seed(1126)
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = UncertaintySampling(trn_ds,
                              method='entropy',
                              model=LogisticRegression(solver="liblinear",
                                                       multi_class="ovr"))
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([145, 66, 82, 37, 194, 60, 191, 211, 245, 131]))
 def test_variance_reduction(self):
     trn_ds = Dataset(self.X,
                      np.concatenate([self.y[:2],
                                      [None] * (len(self.y) - 2)]))
     qs = VarianceReduction(
         trn_ds,
         model=LogisticRegression(solver='liblinear', multi_class="ovr"),
         sigma=0.1)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(qseq, np.array([4, 5, 2, 3]))
Example #27
 def test_multilabel_with_auxiliary_learner_mmr(self):
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     qs = MultilabelWithAuxiliaryLearner(
         trn_ds,
         major_learner=BinaryRelevance(
             LogisticRegression(solver='liblinear', multi_class="ovr")),
         auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
         criterion='mmr',
         random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq,
         np.array([1258, 1461, 231, 1198, 1498, 1374, 955, 1367, 265, 144]))
 def test_alce_lr_embed5(self):
     cost_matrix = np.random.RandomState(1126).rand(3, 3)
     np.fill_diagonal(cost_matrix, 0)
     ds = Dataset(self.X + self.X_pool,
                  self.y[:3] + [None for _ in range(len(self.X_pool))])
     qs = ALCE(ds,
               cost_matrix,
               LinearRegression(),
               embed_dim=5,
               random_state=1126)
     qseq = run_qs(ds, qs, self.y_truth, self.quota)
     assert_array_equal(
         qseq, np.array([106, 118, 141, 43, 63, 99, 65, 89, 26, 52]))
 def test_hs_subsampling(self):
     ds = Dataset(self.X, self.y[:10] + [None] * (len(self.y) - 10))
     sub_qs = UncertaintySampling(ds,
                                  model=SVM(gamma='auto',
                                            decision_function_shape='ovr'))
     qs = HS(ds, self.classes, subsample_qs=sub_qs, random_state=1126)
     qseq = run_qs(ds, qs, self.y, len(self.y) - 10)
     assert_array_equal(
         np.concatenate([qseq[:10], qseq[-10:]]),
         np.array([
             120, 50, 33, 28, 78, 133, 52, 124, 102, 109, 81, 108, 12, 10,
             89, 114, 92, 126, 48, 25
         ]))
Example #30
def _E(args):
    # Expected-variance term for one candidate query point `qx`.
    # `_Phi` is defined elsewhere in the module (not shown in this excerpt).
    X, y, qx, clf, label_count, sigma, model = args

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    query_point = sigmoid(clf.predict_real([qx]))
    feature_count = len(X[0])
    ret = 0.0
    for i in range(label_count):
        # Retrain a copy of the model as if `qx` were labeled with class i.
        clf_ = copy.copy(model)
        clf_.train(Dataset(np.vstack((X, [qx])), np.append(y, i)))
        PI = sigmoid(clf_.predict_real(np.vstack((X, [qx]))))
        ret += query_point[-1][i] * _Phi(sigma, PI[:-1], X, PI[-1], qx,
                                         label_count, feature_count)
    return ret
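_E takes a single argument tuple so it can be handed to a process pool. A hedged sketch of fanning it out over the unlabeled pool; the helper name and n_jobs default are illustrative:

# Hypothetical dispatcher: score every pool point with _E in parallel.
from multiprocessing import Pool

def _score_pool(X, y, X_pool, clf, label_count, sigma, model, n_jobs=2):
    # One argument tuple per candidate query point, in _E's expected order.
    args = [(X, y, qx, clf, label_count, sigma, model) for qx in X_pool]
    with Pool(n_jobs) as pool:
        return pool.map(_E, args)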