def test_uncertainty_entropy(self):
    """Entropy-based uncertainty sampling on the toy example should
    query the remaining points in a fixed order."""
    dataset = init_toyexample(self.X, self.y)
    strategy = UncertaintySampling(
        dataset, method='entropy',
        model=LogisticRegression(solver='liblinear', multi_class="ovr"))
    clf = LogisticRegression(solver='liblinear', multi_class="ovr")
    query_order = run_qs(dataset, self.lbr, clf, strategy, self.quota)
    assert_array_equal(query_order, np.array([6, 7, 8, 9]))
def test_binary_relevance_parallel(self):
    """Sequential (n_jobs=1) and parallel (n_jobs=2) BinaryRelevance
    training must produce identical predictions.

    Fix: the parallel learner's base classifier previously omitted
    ``multi_class="ovr"`` while the sequential one set it; the
    comparison only held because liblinear defaults to one-vs-rest.
    Both base classifiers are now configured identically so the test
    compares parallelism alone.
    """
    br = BinaryRelevance(
        base_clf=LogisticRegression(solver='liblinear', multi_class="ovr",
                                    random_state=1126),
        n_jobs=1)
    br.train(Dataset(self.X_train, self.Y_train))

    br_par = BinaryRelevance(
        base_clf=LogisticRegression(solver='liblinear', multi_class="ovr",
                                    random_state=1126),
        n_jobs=2)
    br_par.train(Dataset(self.X_train, self.Y_train))

    assert_array_equal(br.predict(self.X_test).astype(int),
                       br_par.predict(self.X_test).astype(int))
def test_ActiveLearningByLearning(self):
    """ALBL blending uncertainty sampling and HintSVM should reproduce
    a fixed query sequence under random_state=1126."""
    labels = np.concatenate([self.y[:10], [None] * (len(self.y) - 10)])
    dataset = Dataset(self.X, labels)
    strategy = ActiveLearningByLearning(
        dataset,
        T=self.quota,
        query_strategies=[
            UncertaintySampling(
                dataset,
                model=LogisticRegression(solver="liblinear",
                                         multi_class="ovr")),
            HintSVM(dataset, random_state=1126),
        ],
        model=LogisticRegression(solver="liblinear", multi_class="ovr"),
        random_state=1126)
    query_order = run_qs(dataset, strategy, self.y, self.quota)
    assert_array_equal(
        query_order,
        np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33]))
def test_eer(self):
    """Expected error reduction (default loss) reproduces a fixed
    query sequence."""
    unlabeled = [None] * len(self.X_pool)
    ds = Dataset(self.X + self.X_pool, self.y[:3] + unlabeled)
    strategy = EER(
        ds,
        LogisticRegression(solver='liblinear', multi_class="ovr"),
        random_state=1126)
    query_order = run_qs(ds, strategy, self.y_truth, self.quota)
    assert_array_equal(
        query_order,
        np.array([131, 20, 129, 78, 22, 139, 88, 43, 141, 133]))
def test_cost_sensitive_random_pair_encoding(self):
    """CSRPE with pairwise-F1 scoring reproduces a fixed query order."""
    dataset = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    learner = BinaryRelevance(
        LogisticRegression(solver='liblinear', multi_class="ovr"))
    base = LogisticRegression(solver='liblinear', multi_class="ovr",
                              random_state=1126)
    strategy = CostSensitiveReferencePairEncoding(
        dataset,
        scoring_fn=pairwise_f1_score,
        model=learner,
        base_model=base,
        n_models=10,
        n_jobs=1,
        random_state=1126)
    query_order = run_qs(dataset, strategy, self.y, self.quota)
    assert_array_equal(
        query_order,
        np.array([149, 434, 1126, 719, 983, 564, 816, 732, 101, 1242]))
def test_binary_minimization(self):
    """BinaryMinimization reproduces a fixed query sequence."""
    dataset = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    strategy = BinaryMinimization(
        dataset,
        LogisticRegression(solver='liblinear', multi_class="ovr"),
        random_state=1126)
    query_order = run_qs(dataset, strategy, self.y, self.quota)
    assert_array_equal(
        query_order,
        np.array([936, 924, 1211, 1286, 590, 429, 404, 962, 825, 30]))
def main():
    """Compare UncertaintySampling against RandomSampling on a binary
    classification dataset and plot both learning curves."""
    # Specify the parameters here:
    # path to your binary classification dataset
    dataset_filepath = '../../data/musk_csv.mat'
    test_size = 0.33   # fraction of samples randomly assigned to the test set
    n_labeled = 10     # number of samples that are initially labeled

    # Load the dataset; keep a deep copy of the training pool so both
    # strategies start from the same initial labeled set.
    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
        split_train_test(dataset_filepath, test_size, n_labeled)
    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = len(y_train) - n_labeled   # number of samples to query

    # UncertaintySampling run; model is the base learner
    # (e.g. LogisticRegression, SVM, ...).
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    model = LogisticRegression()
    E_in_1, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota)

    # RandomSampling baseline on the copied pool.
    qs2 = RandomSampling(trn_ds2)
    model = LogisticRegression()
    E_in_2, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)

    # Learning curves: x-axis is the number of queries, y-axis is the
    # corresponding error rate.
    query_num = np.arange(1, quota + 1)
    plt.plot(query_num, E_in_1, 'b', label='qs Ein')
    plt.plot(query_num, E_in_2, 'r', label='random Ein')
    plt.plot(query_num, E_out_1, 'g', label='qs Eout')
    plt.plot(query_num, E_out_2, 'k', label='random Eout')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
def test_eer_01(self):
    """EER with 0/1 loss yields a different fixed query sequence than
    the default loss."""
    unlabeled = [None] * len(self.X_pool)
    ds = Dataset(self.X + self.X_pool, self.y[:3] + unlabeled)
    strategy = EER(
        ds,
        LogisticRegression(solver='liblinear', multi_class="ovr"),
        loss='01',
        random_state=1126)
    query_order = run_qs(ds, strategy, self.y_truth, self.quota)
    assert_array_equal(
        query_order,
        np.array([105, 16, 131, 117, 109, 148, 136, 115, 144, 121]))
def test_adaptive_active_learning(self):
    """AdaptiveActiveLearning (all cores, fixed seed) reproduces a
    fixed query sequence."""
    dataset = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    strategy = AdaptiveActiveLearning(
        dataset,
        base_clf=LogisticRegression(solver='liblinear', multi_class="ovr"),
        n_jobs=-1,
        random_state=1126)
    query_order = run_qs(dataset, strategy, self.y, self.quota)
    assert_array_equal(
        query_order,
        np.array([594, 827, 1128, 419, 1223, 484, 96, 833, 37, 367]))
def test_UcertaintySamplingEntropy(self):
    """Entropy-based uncertainty sampling reproduces a fixed query
    sequence.

    NOTE(review): the method name's historical misspelling
    ("Ucertainty") is kept so test discovery and reports stay stable.
    """
    random.seed(1126)
    labels = np.concatenate([self.y[:10], [None] * (len(self.y) - 10)])
    dataset = Dataset(self.X, labels)
    strategy = UncertaintySampling(
        dataset, method='entropy',
        model=LogisticRegression(solver="liblinear", multi_class="ovr"))
    query_order = run_qs(dataset, strategy, self.y, self.quota)
    assert_array_equal(
        query_order,
        np.array([145, 66, 82, 37, 194, 60, 191, 211, 245, 131]))
def test_query_by_committee_vote(self):
    """QueryByCommittee with 'vote' disagreement over three LR members
    differing only in regularization strength."""
    labels = np.concatenate([self.y[:10], [None] * (len(self.y) - 10)])
    dataset = Dataset(self.X, labels)
    committee = [
        LogisticRegression(C=1.0, solver="liblinear", multi_class="ovr"),
        LogisticRegression(C=0.01, solver="liblinear", multi_class="ovr"),
        LogisticRegression(C=100, solver="liblinear", multi_class="ovr"),
    ]
    strategy = QueryByCommittee(dataset, disagreement='vote',
                                models=committee, random_state=1126)
    query_order = run_qs(dataset, strategy, self.y, self.quota)
    assert_array_equal(
        query_order,
        np.array([267, 210, 229, 220, 134, 252, 222, 142, 245, 228]))
def test_variance_reduction(self):
    """VarianceReduction queries the four remaining unlabeled points in
    a fixed order."""
    labels = np.concatenate([self.y[:2], [None] * (len(self.y) - 2)])
    dataset = Dataset(self.X, labels)
    strategy = VarianceReduction(
        dataset,
        model=LogisticRegression(solver='liblinear', multi_class="ovr"),
        sigma=0.1)
    query_order = run_qs(dataset, strategy, self.y, self.quota)
    assert_array_equal(query_order, np.array([4, 5, 2, 3]))
def test_density_weighted_meta_uncertainty_lc(self):
    """DensityWeightedMeta wrapping least-confidence uncertainty
    sampling reproduces a fixed query sequence.

    Fix: removed an unused local LogisticRegression instance that was
    created after the strategy but never passed to ``run_qs``.
    """
    trn_ds = Dataset(self.X[:20],
                     np.concatenate([self.y[:6], [None] * 14]))
    base_qs = UncertaintySampling(
        trn_ds, method='lc',
        model=LogisticRegression(solver='liblinear', multi_class="ovr"))
    similarity_metric = cosine_similarity
    clustering_method = KMeans(n_clusters=3, random_state=1126)
    qs = DensityWeightedMeta(
        dataset=trn_ds,
        base_query_strategy=base_qs,
        similarity_metric=similarity_metric,
        clustering_method=clustering_method,
        beta=1.0,
        random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(qseq,
                       np.array([13, 18, 9, 12, 8, 16, 10, 19, 15, 17]))
def test_query_by_committee_kl_divergence(self):
    """QueryByCommittee with 'kl_divergence' disagreement over three LR
    members differing only in regularization strength."""
    labels = np.concatenate([self.y[:10], [None] * (len(self.y) - 10)])
    dataset = Dataset(self.X, labels)
    committee = [
        LogisticRegression(C=1.0, solver="liblinear", multi_class="ovr"),
        LogisticRegression(C=0.01, solver="liblinear", multi_class="ovr"),
        LogisticRegression(C=100, solver="liblinear", multi_class="ovr"),
    ]
    strategy = QueryByCommittee(dataset, disagreement='kl_divergence',
                                models=committee, random_state=1126)
    query_order = run_qs(dataset, strategy, self.y, self.quota)
    assert_array_equal(
        query_order,
        np.array([228, 111, 162, 243, 213, 122, 110, 108, 156, 37]))
def test_multilabel_with_auxiliary_learner_mmr(self):
    """MultilabelWithAuxiliaryLearner, criterion='mmr', reproduces a
    fixed query sequence."""
    dataset = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    strategy = MultilabelWithAuxiliaryLearner(
        dataset,
        major_learner=BinaryRelevance(
            LogisticRegression(solver='liblinear', multi_class="ovr")),
        auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
        criterion='mmr',
        random_state=1126)
    query_order = run_qs(dataset, strategy, self.y, self.quota)
    assert_array_equal(
        query_order,
        np.array([1258, 1461, 231, 1198, 1498, 1374, 955, 1367, 265, 144]))
def test_multilabel_with_auxiliary_learner_shlr(self):
    """MultilabelWithAuxiliaryLearner, criterion='shlr' with b=1.,
    reproduces a fixed query sequence."""
    dataset = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    strategy = MultilabelWithAuxiliaryLearner(
        dataset,
        major_learner=BinaryRelevance(
            LogisticRegression(solver='liblinear', multi_class="ovr")),
        auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
        criterion='shlr',
        b=1.,
        random_state=1126)
    query_order = run_qs(dataset, strategy, self.y, self.quota)
    assert_array_equal(
        query_order,
        np.array([1258, 805, 459, 550, 783, 964, 736, 1004, 38, 750]))
def get_active_svm_index(X_train, y_train, total_labels):
    """Select ``total_labels`` training indices via active learning.

    Randomly labels points until both classes are represented, then
    lets least-confidence uncertainty sampling pick the remainder.

    NOTE(review): assumes ``run`` returns a Python list of queried
    indices; if it returned a numpy array, ``idx + seed_indices`` would
    add element-wise instead of concatenating -- confirm against
    ``run``'s definition.
    """
    y_labels = np.array([None] * len(y_train))
    seed_indices = []
    # Keep drawing random points until at least two classes are labeled.
    while len(np.unique(y_train[seed_indices])) < 2:
        seed_indices.append(np.random.choice(np.arange(len(y_labels))))
    y_labels[seed_indices] = y_train[seed_indices]

    fully_labeled_trn_ds = Dataset(X_train, y_train)
    lbr = IdealLabeler(fully_labeled_trn_ds)
    trn_ds = Dataset(X_train, y_labels)
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    idx = run(trn_ds, lbr, qs, total_labels - len(seed_indices))
    return idx + seed_indices
def test_multilabel_with_auxiliary_learner_hlr(self):
    """MultilabelWithAuxiliaryLearner, criterion='hlr', reproduces a
    fixed query sequence."""
    dataset = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    strategy = MultilabelWithAuxiliaryLearner(
        dataset,
        major_learner=BinaryRelevance(
            LogisticRegression(solver='liblinear', multi_class="ovr",
                               random_state=1126)),
        auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
        criterion='hlr',
        random_state=1126)
    query_order = run_qs(dataset, strategy, self.y, self.quota)
    assert_array_equal(
        query_order,
        np.array([701, 1403, 147, 897, 974, 1266, 870, 703, 292, 1146]))
def test_logistic_regression(self):
    """The library's LogisticRegression wrapper must agree with the
    underlying scikit-learn classifier on predictions and scores."""
    sk_clf = sklearn.linear_model.LogisticRegression(
        solver='liblinear', multi_class="ovr")
    sk_clf.fit(self.X_train, self.y_train)

    wrapped = LogisticRegression(solver='liblinear', multi_class="ovr")
    wrapped.train(Dataset(self.X_train, self.y_train))

    assert_array_equal(sk_clf.predict(self.X_train),
                       wrapped.predict(self.X_train))
    assert_array_equal(sk_clf.predict(self.X_test),
                       wrapped.predict(self.X_test))
    self.assertEqual(sk_clf.score(self.X_train, self.y_train),
                     wrapped.score(Dataset(self.X_train, self.y_train)))
    self.assertEqual(sk_clf.score(self.X_test, self.y_test),
                     wrapped.score(Dataset(self.X_test, self.y_test)))
def test_uncertainty_entropy_exceptions(self):
    """Constructing UncertaintySampling with an incompatible model or
    an unknown method name must raise TypeError."""
    dataset = init_toyexample(self.X, self.y)
    # entropy mode with an SVM model is rejected
    with self.assertRaises(TypeError):
        UncertaintySampling(dataset, method='entropy', model=SVM())
    # entropy mode with a Perceptron model is rejected
    with self.assertRaises(TypeError):
        UncertaintySampling(dataset, method='entropy', model=Perceptron())
    # an unrecognized method name is rejected even with a valid model
    with self.assertRaises(TypeError):
        UncertaintySampling(
            dataset, method='not_exist',
            model=LogisticRegression(solver='liblinear', multi_class="ovr"))
def __init__(self, *args, **kwargs): super(MaximumLossReductionMaximalConfidence, self).__init__(*args, **kwargs) # self.n_labels = len(self.dataset.get_labeled_entries()[0][1]) self.n_labels = len(self.dataset.get_labeled_entries()[1][0]) random_state = kwargs.pop('random_state', None) self.random_state_ = seed_random_state(random_state) self.logreg_param = kwargs.pop('logreg_param', {'multi_class': 'multinomial', 'solver': 'newton-cg', 'random_state': random_state}) self.logistic_regression_ = LogisticRegression(**self.logreg_param) self.br_base = kwargs.pop('br_base', SklearnProbaAdapter(SVC(kernel='linear', probability=True, gamma="auto", random_state=random_state)))
def test_binary_relevance_lr(self):
    """BinaryRelevance with an LR base classifier must match per-label
    scikit-learn classifiers on predictions and probabilities, compute
    the hamming score, and reject unknown scoring criteria."""
    br = BinaryRelevance(base_clf=LogisticRegression(
        solver='liblinear', multi_class="ovr", random_state=1126))
    br.train(Dataset(self.X_train, self.Y_train))

    pred_train = br.predict(self.X_train).astype(int)
    pred_test = br.predict(self.X_test).astype(int)
    proba_train = br.predict_proba(self.X_train).astype(float)
    proba_test = br.predict_proba(self.X_test).astype(float)

    # Each label column should behave like an independently trained LR.
    for label_idx in range(np.shape(self.Y_train)[1]):
        ref = sklearn.linear_model.LogisticRegression(
            solver='liblinear', multi_class="ovr", random_state=1126)
        ref.fit(self.X_train, self.Y_train[:, label_idx])
        assert_array_almost_equal(ref.predict(self.X_train).astype(int),
                                  pred_train[:, label_idx])
        assert_array_almost_equal(ref.predict(self.X_test).astype(int),
                                  pred_test[:, label_idx])
        assert_array_almost_equal(
            ref.predict_proba(self.X_train)[:, 1].astype(float),
            proba_train[:, label_idx].astype(float))
        assert_array_almost_equal(
            ref.predict_proba(self.X_test)[:, 1].astype(float),
            proba_test[:, label_idx].astype(float))

    # Hamming score equals the mean per-sample label disagreement.
    self.assertEqual(
        np.mean(np.abs(self.Y_test - pred_test).mean(axis=1)),
        br.score(Dataset(self.X_test, self.Y_test), 'hamming'))
    # Unknown scoring criteria must raise NotImplementedError.
    self.assertRaises(
        NotImplementedError,
        lambda: br.score(Dataset(self.X_test, self.Y_test),
                         criterion='not_exist'))