Example #1
 def test_uncertainty_entropy(self):
     trn_ds = init_toyexample(self.X, self.y)
     qs = UncertaintySampling(trn_ds,
                              method='entropy',
                              model=LogisticRegression(solver='liblinear',
                                                       multi_class="ovr"))
     model = LogisticRegression(solver='liblinear', multi_class="ovr")
     qseq = run_qs(trn_ds, self.lbr, model, qs, self.quota)
     assert_array_equal(qseq, np.array([6, 7, 8, 9]))
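These snippets are excerpted from libact's test suite and call a run_qs driver that the excerpts themselves do not define. Below is a minimal sketch of the four-argument variant most examples use, assuming only libact's standard make_query/update API; Example #1's file uses its own variant that also takes a labeler and a model, and the exact upstream helper may differ in detail.

import numpy as np

def run_qs(trn_ds, qs, truth, quota):
    # Ask the strategy for `quota` queries, reveal each queried label
    # from `truth`, and return the sequence of queried indices.
    ids = []
    for _ in range(quota):
        ask_id = qs.make_query()              # index the strategy picks
        trn_ds.update(ask_id, truth[ask_id])  # label it in the dataset
        ids.append(ask_id)
    return np.array(ids)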
Example #2
    def test_binary_relevance_parallel(self):
        br = BinaryRelevance(base_clf=LogisticRegression(solver='liblinear',
                                                         multi_class="ovr",
                                                         random_state=1126),
                             n_jobs=1)
        br.train(Dataset(self.X_train, self.Y_train))
        br_par = BinaryRelevance(base_clf=LogisticRegression(
            solver='liblinear', random_state=1126),
                                 n_jobs=2)
        br_par.train(Dataset(self.X_train, self.Y_train))

        assert_array_equal(
            br.predict(self.X_test).astype(int),
            br_par.predict(self.X_test).astype(int))
Example #3
 def test_ActiveLearningByLearning(self):
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = ActiveLearningByLearning(
         trn_ds,
         T=self.quota,
         query_strategies=[
             UncertaintySampling(
                 trn_ds,
                 model=LogisticRegression(solver="liblinear",
                                          multi_class="ovr")),
             HintSVM(trn_ds, random_state=1126)
         ],
         model=LogisticRegression(solver="liblinear", multi_class="ovr"),
         random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33]))
Example #4
 def test_eer(self):
     ds = Dataset(self.X + self.X_pool,
                  self.y[:3] + [None for _ in range(len(self.X_pool))])
     qs = EER(ds,
              LogisticRegression(solver='liblinear', multi_class="ovr"),
              random_state=1126)
     qseq = run_qs(ds, qs, self.y_truth, self.quota)
     assert_array_equal(
         qseq, np.array([131, 20, 129, 78, 22, 139, 88, 43, 141, 133]))
Example #5
 def test_cost_sensitive_random_pair_encoding(self):
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     model = BinaryRelevance(
         LogisticRegression(solver='liblinear', multi_class="ovr"))
     base_model = LogisticRegression(solver='liblinear',
                                     multi_class="ovr",
                                     random_state=1126)
     qs = CostSensitiveReferencePairEncoding(trn_ds,
                                             scoring_fn=pairwise_f1_score,
                                             model=model,
                                             base_model=base_model,
                                             n_models=10,
                                             n_jobs=1,
                                             random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq,
         np.array([149, 434, 1126, 719, 983, 564, 816, 732, 101, 1242]))
Example #6
 def test_binary_minimization(self):
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     qs = BinaryMinimization(trn_ds,
                             LogisticRegression(solver='liblinear',
                                                multi_class="ovr"),
                             random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([936, 924, 1211, 1286, 590, 429, 404, 962, 825,
                         30]))
Example #7
def main():
    # Specify the parameters here:
    # path to your binary classification dataset
    dataset_filepath = '../../data/musk_csv.mat'
    test_size = 0.33  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    n_labeled = 10  # number of samples that are initially labeled

    # Load dataset
    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
        split_train_test(dataset_filepath, test_size, n_labeled)
    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = len(y_train) - n_labeled  # number of samples to query

    # Compare the UncertaintySampling strategy with RandomSampling.
    # model is the base learner, e.g. LogisticRegression, SVM, etc.
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    model = LogisticRegression()
    E_in_1, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota)

    qs2 = RandomSampling(trn_ds2)
    model = LogisticRegression()
    E_in_2, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)

    # Plot the learning curves of UncertaintySampling and RandomSampling.
    # The x-axis is the number of queries, and the y-axis is the
    # corresponding error rate.
    query_num = np.arange(1, quota + 1)
    plt.plot(query_num, E_in_1, 'b', label='qs Ein')
    plt.plot(query_num, E_in_2, 'r', label='random Ein')
    plt.plot(query_num, E_out_1, 'g', label='qs Eout')
    plt.plot(query_num, E_out_2, 'k', label='random Eout')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.show()
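main() above depends on two helpers, split_train_test and run, that the excerpt omits. The sketches below follow the usual pattern of libact's example scripts; the loadmat key names 'X' and 'y' are assumptions about the .mat file layout, so adjust them to your dataset.

import numpy as np
from scipy.io import loadmat
from sklearn.model_selection import train_test_split
from libact.base.dataset import Dataset

def split_train_test(dataset_filepath, test_size, n_labeled):
    data = loadmat(dataset_filepath)     # assumed keys: 'X' and 'y'
    X, y = data['X'], data['y'].ravel()
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size)
    # Hide all but the first n_labeled training labels from the learner.
    trn_ds = Dataset(X_train, np.concatenate(
        [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(X_test, y_test)
    fully_labeled_trn_ds = Dataset(X_train, y_train)
    return trn_ds, tst_ds, y_train, fully_labeled_trn_ds

def run(trn_ds, tst_ds, lbr, model, qs, quota):
    E_in, E_out = [], []
    for _ in range(quota):
        ask_id = qs.make_query()         # strategy picks a pool index
        X, _ = zip(*trn_ds.data)         # features of the training pool
        trn_ds.update(ask_id, lbr.label(X[ask_id]))
        model.train(trn_ds)
        E_in.append(1 - model.score(trn_ds))    # in-sample error
        E_out.append(1 - model.score(tst_ds))   # out-of-sample error
    return E_in, E_out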
Example #8
 def test_eer_01(self):
     ds = Dataset(self.X + self.X_pool,
                  self.y[:3] + [None for _ in range(len(self.X_pool))])
     qs = EER(ds,
              LogisticRegression(solver='liblinear', multi_class="ovr"),
              loss='01',
              random_state=1126)
     qseq = run_qs(ds, qs, self.y_truth, self.quota)
     assert_array_equal(
         qseq, np.array([105, 16, 131, 117, 109, 148, 136, 115, 144, 121]))
Example #9
 def test_adaptive_active_learning(self):
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     qs = AdaptiveActiveLearning(trn_ds,
                                 base_clf=LogisticRegression(
                                     solver='liblinear', multi_class="ovr"),
                                 n_jobs=-1,
                                 random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([594, 827, 1128, 419, 1223, 484, 96, 833, 37, 367]))
Example #10
 def test_UncertaintySamplingEntropy(self):
     random.seed(1126)
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = UncertaintySampling(trn_ds,
                              method='entropy',
                              model=LogisticRegression(solver="liblinear",
                                                       multi_class="ovr"))
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([145, 66, 82, 37, 194, 60, 191, 211, 245, 131]))
Example #11
 def test_query_by_committee_vote(self):
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = QueryByCommittee(trn_ds,
                           disagreement='vote',
                           models=[
                               LogisticRegression(C=1.0,
                                                  solver="liblinear",
                                                  multi_class="ovr"),
                               LogisticRegression(C=0.01,
                                                  solver="liblinear",
                                                  multi_class="ovr"),
                               LogisticRegression(C=100,
                                                  solver="liblinear",
                                                  multi_class="ovr")
                           ],
                           random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([267, 210, 229, 220, 134, 252, 222, 142, 245, 228]))
Example #12
 def test_variance_reduction(self):
     trn_ds = Dataset(self.X,
                      np.concatenate([self.y[:2],
                                      [None] * (len(self.y) - 2)]))
     qs = VarianceReduction(
             trn_ds,
             model=LogisticRegression(solver='liblinear', multi_class="ovr"),
             sigma=0.1
         )
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(qseq, np.array([4, 5, 2, 3]))
Example #13
 def test_density_weighted_meta_uncertainty_lc(self):
     trn_ds = Dataset(self.X[:20], np.concatenate([self.y[:6],
                                                   [None] * 14]))
     base_qs = UncertaintySampling(trn_ds,
                                   method='lc',
                                   model=LogisticRegression(
                                       solver='liblinear',
                                       multi_class="ovr"))
     similarity_metric = cosine_similarity
     clustering_method = KMeans(n_clusters=3, random_state=1126)
     qs = DensityWeightedMeta(dataset=trn_ds,
                              base_query_strategy=base_qs,
                              similarity_metric=similarity_metric,
                              clustering_method=clustering_method,
                              beta=1.0,
                              random_state=1126)
     model = LogisticRegression(solver='liblinear', multi_class="ovr")
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(qseq,
                        np.array([13, 18, 9, 12, 8, 16, 10, 19, 15, 17]))
Example #14
 def test_query_by_committee_kl_divergence(self):
     trn_ds = Dataset(
         self.X, np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
     qs = QueryByCommittee(trn_ds,
                           disagreement='kl_divergence',
                           models=[
                               LogisticRegression(C=1.0,
                                                  solver="liblinear",
                                                  multi_class="ovr"),
                               LogisticRegression(C=0.01,
                                                  solver="liblinear",
                                                  multi_class="ovr"),
                               LogisticRegression(C=100,
                                                  solver="liblinear",
                                                  multi_class="ovr")
                           ],
                           random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([228, 111, 162, 243, 213, 122, 110, 108, 156, 37]))
Example #15
 def test_multilabel_with_auxiliary_learner_mmr(self):
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     qs = MultilabelWithAuxiliaryLearner(
         trn_ds,
         major_learner=BinaryRelevance(
             LogisticRegression(solver='liblinear', multi_class="ovr")),
         auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
         criterion='mmr',
         random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq,
         np.array([1258, 1461, 231, 1198, 1498, 1374, 955, 1367, 265, 144]))
Example #16
 def test_multilabel_with_auxiliary_learner_shlr(self):
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     qs = MultilabelWithAuxiliaryLearner(
         trn_ds,
         major_learner=BinaryRelevance(
             LogisticRegression(solver='liblinear', multi_class="ovr")),
         auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
         criterion='shlr',
         b=1.,
         random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq, np.array([1258, 805, 459, 550, 783, 964, 736, 1004, 38,
                         750]))
Example #17
def get_active_svm_index(X_train, y_train, total_labels):
    # Randomly label samples until both classes are represented.
    y_labels = np.array([None] * len(y_train))
    random_labels = []
    while len(np.unique(y_train[random_labels])) < 2:
        random_labels.append(np.random.choice(np.arange(len(y_labels))))

    y_labels[random_labels] = y_train[random_labels]
    fully_labeled_trn_ds = Dataset(X_train, y_train)
    lbr = IdealLabeler(fully_labeled_trn_ds)
    trn_ds = Dataset(X_train, y_labels)
    # Actively query the remaining labels with uncertainty sampling.
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    idx = run(trn_ds, lbr, qs, total_labels - len(random_labels))

    return idx + random_labels
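The run helper called above is likewise not shown. Assuming it returns the list of indices the strategy queried (so the caller can append the randomly labeled ones), a minimal sketch:

def run(trn_ds, lbr, qs, quota):
    # Hypothetical driver matching the call above: query `quota` times,
    # label each pick with the ideal labeler, and collect the indices.
    idx = []
    for _ in range(quota):
        ask_id = qs.make_query()
        X, _ = zip(*trn_ds.data)
        trn_ds.update(ask_id, lbr.label(X[ask_id]))
        idx.append(ask_id)
    return idx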
Example #18
 def test_multilabel_with_auxiliary_learner_hlr(self):
     trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
     qs = MultilabelWithAuxiliaryLearner(
         trn_ds,
         major_learner=BinaryRelevance(
             LogisticRegression(solver='liblinear',
                                multi_class="ovr",
                                random_state=1126)),
         auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
         criterion='hlr',
         random_state=1126)
     qseq = run_qs(trn_ds, qs, self.y, self.quota)
     assert_array_equal(
         qseq,
         np.array([701, 1403, 147, 897, 974, 1266, 870, 703, 292, 1146]))
Example #19
    def test_logistic_regression(self):
        clf = sklearn.linear_model.LogisticRegression(
            solver='liblinear', multi_class="ovr")
        clf.fit(self.X_train, self.y_train)
        lr = LogisticRegression(solver='liblinear', multi_class="ovr")
        lr.train(Dataset(self.X_train, self.y_train))

        assert_array_equal(
            clf.predict(self.X_train), lr.predict(self.X_train))
        assert_array_equal(
            clf.predict(self.X_test), lr.predict(self.X_test))
        self.assertEqual(
            clf.score(self.X_train, self.y_train),
            lr.score(Dataset(self.X_train, self.y_train)))
        self.assertEqual(
            clf.score(self.X_test, self.y_test),
            lr.score(Dataset(self.X_test, self.y_test)))
Example #20
    def test_uncertainty_entropy_exceptions(self):
        trn_ds = init_toyexample(self.X, self.y)

        with self.assertRaises(TypeError):
            qs = UncertaintySampling(trn_ds, method='entropy', model=SVM())

        with self.assertRaises(TypeError):
            qs = UncertaintySampling(trn_ds,
                                     method='entropy',
                                     model=Perceptron())

        with self.assertRaises(TypeError):
            qs = UncertaintySampling(
                trn_ds,
                method='not_exist',
                model=LogisticRegression(solver='liblinear',
                                         multi_class="ovr"))
    def __init__(self, *args, **kwargs):
        super(MaximumLossReductionMaximalConfidence, self).__init__(*args, **kwargs)

        # get_labeled_entries() returns an (X, y) tuple in current libact,
        # so y[0] is a single label vector; the commented line targeted the
        # older API that returned a list of (x, y) pairs.
        # self.n_labels = len(self.dataset.get_labeled_entries()[0][1])
        self.n_labels = len(self.dataset.get_labeled_entries()[1][0])

        random_state = kwargs.pop('random_state', None)
        self.random_state_ = seed_random_state(random_state)

        self.logreg_param = kwargs.pop('logreg_param',
                                       {'multi_class': 'multinomial',
                                        'solver': 'newton-cg',
                                        'random_state': random_state})
        self.logistic_regression_ = LogisticRegression(**self.logreg_param)

        self.br_base = kwargs.pop('br_base',
              SklearnProbaAdapter(SVC(kernel='linear',
                                      probability=True,
                                      gamma="auto",
                                      random_state=random_state)))
Example #22
    def test_binary_relevance_lr(self):
        br = BinaryRelevance(base_clf=LogisticRegression(
            solver='liblinear', multi_class="ovr", random_state=1126))
        br.train(Dataset(self.X_train, self.Y_train))

        br_pred_train = br.predict(self.X_train).astype(int)
        br_pred_test = br.predict(self.X_test).astype(int)

        br_pred_proba_train = br.predict_proba(self.X_train).astype(float)
        br_pred_proba_test = br.predict_proba(self.X_test).astype(float)

        for i in range(np.shape(self.Y_train)[1]):
            clf = sklearn.linear_model.LogisticRegression(solver='liblinear',
                                                          multi_class="ovr",
                                                          random_state=1126)
            clf.fit(self.X_train, self.Y_train[:, i])

            assert_array_almost_equal(
                clf.predict(self.X_train).astype(int), br_pred_train[:, i])
            assert_array_almost_equal(
                clf.predict(self.X_test).astype(int), br_pred_test[:, i])

            assert_array_almost_equal(
                clf.predict_proba(self.X_train)[:, 1].astype(float),
                br_pred_proba_train[:, i].astype(float))
            assert_array_almost_equal(
                clf.predict_proba(self.X_test)[:, 1].astype(float),
                br_pred_proba_test[:, i].astype(float))

        self.assertEqual(
            np.mean(np.abs(self.Y_test - br_pred_test).mean(axis=1)),
            br.score(Dataset(self.X_test, self.Y_test), 'hamming'))

        self.assertRaises(
            NotImplementedError,
            lambda: br.score(Dataset(self.X_test, self.Y_test),
                             criterion='not_exist'))