def libact_QBC(X, y, n_queries):
    """Run a libact query-by-committee loop for ``n_queries`` rounds.

    Every label starts hidden except three seed labels written at indices
    0, 50 and 100, so ``y`` must hold at least 101 entries.  A committee of
    two logistic-regression learners is trained on the seeds; each round
    asks the query strategy for an index, obtains its label from an
    ``IdealLabeler`` built on the fully labelled dataset, records it in the
    training dataset, and retrains every committee member.

    Args:
        X: feature collection, indexed by the queried id.
        y: complete labels for ``X``.
        n_queries: number of query/label/retrain rounds to run.

    Returns:
        None.  The function is executed for its side effects only.
    """
    # Hide every label, then reveal the three seed labels.
    # NOTE(review): presumably one seed per class of a 150-sample,
    # class-ordered dataset -- confirm against the caller.
    y_train = np.array([None] * len(y))
    for seed_index, seed_label in ((0, 0), (50, 1), (100, 2)):
        y_train[seed_index] = seed_label

    libact_train_dataset = Dataset(X, y_train)
    libact_full_dataset = Dataset(X, y)

    libact_learner_list = [
        LogisticRegressionLibact(solver='liblinear', n_jobs=1, multi_class='ovr')
        for _ in range(2)
    ]

    # ``method='lc'`` is forwarded exactly as the benchmark wrote it.
    libact_qs = QueryByCommittee(
        libact_train_dataset,
        models=libact_learner_list,
        method='lc',
    )
    libact_labeler = IdealLabeler(libact_full_dataset)

    def retrain_committee():
        # Fit each committee member on the labels known so far.
        for member in libact_learner_list:
            member.train(libact_train_dataset)

    retrain_committee()

    remaining = n_queries
    while remaining > 0:
        query_idx = libact_qs.make_query()
        query_label = libact_labeler.label(X[query_idx])
        libact_train_dataset.update(query_idx, query_label)
        retrain_committee()
        remaining -= 1
def test_QueryByCommittee(self):
    """Pin the query sequence of QueryByCommittee built without a
    ``disagreement`` argument.

    Only the first ten labels of ``self.y`` are revealed; the rest are
    ``None``.  The committee holds three logistic regressions with
    C = 1.0, 0.01 and 100, and the strategy is seeded with 1126.
    """
    n_revealed = 10
    revealed = self.y[:n_revealed]
    hidden = [None] * (len(self.y) - n_revealed)
    trn_ds = Dataset(self.X, np.concatenate([revealed, hidden]))

    committee = [LogisticRegression(C=c) for c in (1.0, 0.01, 100)]
    qs = QueryByCommittee(trn_ds, models=committee, random_state=1126)

    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array([267, 210, 229, 220, 134, 252, 222, 142, 245, 228])
    assert_array_equal(qseq, expected)
def test_query_by_committee_kl_divergence(self):
    """Pin the query sequence of QueryByCommittee with
    ``disagreement='kl_divergence'``.

    Only the first ten labels of ``self.y`` are revealed; the rest are
    ``None``.  The committee holds three logistic regressions with
    C = 1.0, 0.01 and 100, and the strategy is seeded with 1126.
    """
    n_revealed = 10
    revealed = self.y[:n_revealed]
    hidden = [None] * (len(self.y) - n_revealed)
    trn_ds = Dataset(self.X, np.concatenate([revealed, hidden]))

    committee = [LogisticRegression(C=c) for c in (1.0, 0.01, 100)]
    qs = QueryByCommittee(
        trn_ds,
        disagreement='kl_divergence',
        models=committee,
        random_state=1126,
    )

    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array([228, 111, 162, 243, 213, 122, 110, 108, 156, 37])
    assert_array_equal(qseq, expected)
def test_query_by_committee_vote(self):
    """Pin the query sequence of QueryByCommittee with
    ``disagreement='vote'``.

    Only the first ten labels of ``self.y`` are revealed; the rest are
    ``None``.  The committee holds three logistic regressions with
    C = 1.0, 0.01 and 100, and the strategy is seeded with 1126.
    """
    # NOTE: a skipTest("In this version we randomize make queries") call
    # used to sit here and is currently disabled, so the test runs.
    n_revealed = 10
    revealed = self.y[:n_revealed]
    hidden = [None] * (len(self.y) - n_revealed)
    trn_ds = Dataset(self.X, np.concatenate([revealed, hidden]))

    committee = [LogisticRegression(C=c) for c in (1.0, 0.01, 100)]
    qs = QueryByCommittee(
        trn_ds,
        disagreement='vote',
        models=committee,
        random_state=1126,
    )

    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array([10, 12, 11, 13, 16, 14, 17, 18, 19, 21])
    assert_array_equal(qseq, expected)
def test_query_by_committee_vote(self):
    """Pin the query sequence of QueryByCommittee with
    ``disagreement='vote'`` and liblinear one-vs-rest committee members.

    Only the first ten labels of ``self.y`` are revealed; the rest are
    ``None``.  The committee holds three logistic regressions with
    C = 1.0, 0.01 and 100, and the strategy is seeded with 1126.
    """
    n_revealed = 10
    revealed = self.y[:n_revealed]
    hidden = [None] * (len(self.y) - n_revealed)
    trn_ds = Dataset(self.X, np.concatenate([revealed, hidden]))

    committee = [
        LogisticRegression(C=c, solver="liblinear", multi_class="ovr")
        for c in (1.0, 0.01, 100)
    ]
    qs = QueryByCommittee(
        trn_ds,
        disagreement='vote',
        models=committee,
        random_state=1126,
    )

    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array([267, 210, 229, 220, 134, 252, 222, 142, 245, 228])
    assert_array_equal(qseq, expected)
def initialQuerySetup(train_dataset, queryStrategyID, queryParams=None, fixRandomState=False):
    """Build the query strategy selected by ``queryStrategyID``.

    Strategy ids and the ``queryParams`` entries each one reads:

        0  RandomSampling             (no params)
        1  UncertaintySampling 'sm'   queryParams[0] = model
        2  QueryByCommittee 'vote'    queryParams[0] = list of models
        3  RandomBatchQuery           queryParams[0] = batch size
        4  LeastCertainBatchQuery     queryParams[0] = model, [1] = batch size
        5  SemiSupervisedBatchQuery   queryParams[0] = model, [1] = batch size

    Args:
        train_dataset: dataset handed to the strategy constructor.
        queryStrategyID: integer id from the table above.
        queryParams: indexable container of strategy-specific parameters;
            must be supplied for every id except 0.
        fixRandomState: when truthy, each randomised strategy receives its
            own fixed seed; otherwise ``random_state=None`` is passed.
            Id 1 takes no ``random_state`` at all.

    Returns:
        The constructed query-strategy object.

    Raises:
        ValueError: if ``queryStrategyID`` is not one of the ids above.
            (Previously this surfaced as an ``UnboundLocalError`` on the
            final ``return``.)
    """
    def seed_or_none(seed):
        # A fixed seed is only used when reproducibility was requested.
        return seed if fixRandomState else None

    if queryStrategyID == 0:
        return RandomSampling(train_dataset, random_state=seed_or_none(137))
    if queryStrategyID == 1:
        return UncertaintySampling(train_dataset, method='sm',
                                   model=queryParams[0])
    if queryStrategyID == 2:
        return QueryByCommittee(train_dataset, models=queryParams[0],
                                disagreement='vote',
                                random_state=seed_or_none(23))
    if queryStrategyID == 3:
        return RandomBatchQuery(train_dataset, batch_size=queryParams[0],
                                random_state=seed_or_none(2311))
    if queryStrategyID == 4:
        return LeastCertainBatchQuery(train_dataset, model=queryParams[0],
                                      batch_size=queryParams[1],
                                      random_state=seed_or_none(2317))
    if queryStrategyID == 5:
        return SemiSupervisedBatchQuery(train_dataset, model=queryParams[0],
                                        batch_size=queryParams[1],
                                        random_state=seed_or_none(3112))

    raise ValueError(
        "Unknown queryStrategyID: %r (expected an integer from 0 to 5)"
        % (queryStrategyID,)
    )
def _count_entries(path):
    """Return how many items iterating ``openfile_txt(path)`` yields."""
    entries = openfile_txt(path)
    try:
        return sum(1 for _ in entries)
    finally:
        # openfile_txt is defined elsewhere in the project; release its
        # result only when it actually exposes a close() method.
        close = getattr(entries, "close", None)
        if callable(close):
            close()


def _new_committee():
    """Return a fresh committee of three logistic regressions (C = 1.0, 0.01, 100)."""
    return [LogisticRegression(C=1.0),
            LogisticRegression(C=0.01),
            LogisticRegression(C=100)]


def _query_and_retrain(title, qs, trn_ds, model, tst_ds, text_file):
    """Ask ``qs`` for one query, report it, label it, retrain, return the test error.

    The same ``title`` is used for the console header and the log header so
    the two can no longer drift apart.  The tweet text and the simulated
    label are looked up once per query and reused for every output line.
    """
    ask_id = qs.make_query()
    tweet = define_tweet_by_id(ask_id)
    label = simulate_human_decision(ask_id)

    # \033[4m ... \033[0m underlines the header on ANSI terminals.
    print("\033[4m" + title + "\033[0m")
    print("Tweet :" + tweet, end='', flush=True)
    print("Simulating human response : " + str(label) + " \n")

    text_file.write(title + "\n")
    text_file.write("Tweet : %s \n" % str(tweet))
    text_file.write("Simulating human response : %s \n\n" % str(label))

    trn_ds.update(ask_id, label)
    model.train(trn_ds)
    return 1 - model.score(tst_ds)


def main():
    """Compare six query strategies on the tweet corpus and plot their test error.

    Sets the module-level path and vector globals, trains one shared linear
    SVM per strategy on its own copy of the training set, then runs
    ``quota`` rounds in which every strategy issues one query that is
    labelled by ``simulate_human_decision``.  Progress is echoed to the
    console and to ``task_<timestamp>.txt``; the error curves are drawn
    live and saved to ``task_<timestamp>.png`` once the user confirms.

    Returns:
        None.
    """
    global pos_filepath, dataset_filepath, csv_filepath, vectors_list, ids_list
    dataset_filepath = "/Users/dndesign/Desktop/active_learning/vecteurs_et_infos/vectors_2015.txt"
    csv_filepath = "/Users/dndesign/Desktop/active_learning/donnees/corpus_2015_id-time-text.csv"
    pos_filepath = "/Users/dndesign/Desktop/active_learning/donnees/oriane_pos_id-time-text.csv"
    vectors_list, ids_list = get_vectors_list(dataset_filepath)

    timestr = time.strftime("%Y%m%d_%H%M%S")

    # The context manager guarantees the log is flushed and closed even if
    # a query strategy or the model raises half-way through the run.
    with codecs.open("task_" + str(timestr) + ".txt", "w", "utf-8") as text_file:
        print("Loading data...")
        text_file.write("Loading data...\n")

        t0 = time.time()
        num_lines = _count_entries(dataset_filepath)
        print("Treating " + str(num_lines) + " entries...")
        text_file.write("Treating : %s entries...\n" % str(num_lines))

        # Number of queries to ask the (simulated) human to label.
        quota = 10

        trn_ds, tst_ds = split_train_test(csv_filepath)
        # One SVM instance is retrained on each strategy's dataset in turn.
        model = SVM(kernel='linear')

        # (header text, legend label, line colour, strategy factory)
        strategy_specs = [
            # Uncertainty sampling, least confident: queries the instance
            # the model is least certain how to label.
            ("Using Uncertainty Sampling (Least confident) :",
             'Least Confident', 'red',
             lambda ds: UncertaintySampling(ds, method='lc',
                                            model=LogisticRegression(C=.01))),
            # Uncertainty sampling, max margin.
            ("Using Uncertainty Sampling (Max Margin) :",
             'Max Margin', 'blue',
             lambda ds: USampling(ds, method='mm',
                                  model=SVM(kernel='linear'))),
            # CMB: combination of distance-based (DIST) and
            # diversity-based (DIV) active-learning criteria.
            ("Using CMB Distance-Diversity Sampling :",
             'Distance Diversity CMB', 'green',
             lambda ds: CMBSampling(ds, model=SVM(kernel='linear'))),
            # Random sampling: picks a query at random.
            ("Using Random Sampling :",
             'Random Sampling', 'orange',
             lambda ds: RandomSampling(ds, random_state=1126)),
            # Query by committee, vote entropy: queries the instance the
            # committee members disagree on; it does not consider the
            # members' class distributions.
            ("Using QueryByCommittee (Vote Entropy) :",
             'Vote Entropy', 'black',
             lambda ds: QueryByCommittee(ds, disagreement='vote',
                                         models=_new_committee(),
                                         random_state=1126)),
            # Query by committee, Kullback-Leibler divergence: selects the
            # examples judged most informative; it can miss examples on
            # which the committee members disagree.
            ("Using QueryByCommittee (KL Divergence) :",
             'KL Divergence', 'purple',
             lambda ds: QueryByCommittee(ds, disagreement='kl_divergence',
                                         models=_new_committee(),
                                         random_state=1126)),
        ]

        runs = []
        for position, (title, legend, colour, make_qs) in enumerate(strategy_specs):
            # The first strategy works on trn_ds itself; every other one
            # gets a private deep copy taken before any query is answered.
            ds = trn_ds if position == 0 else copy.deepcopy(trn_ds)
            qs = make_qs(ds)
            model.train(ds)
            runs.append({
                'title': title,
                'legend': legend,
                'colour': colour,
                'qs': qs,
                'ds': ds,
                'errors': [1 - model.score(tst_ds)],
            })

        with sns.axes_style("darkgrid"):
            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)

        query_num = np.arange(0, 1)
        for run in runs:
            run['line'] = ax.plot(query_num, run['errors'], run['colour'])[0]
        plt.legend(tuple(run['legend'] for run in runs), loc=1)
        # The curves hold 1 - score, i.e. the error rate, not the accuracy.
        plt.ylabel('Error')
        plt.xlabel('Number of Queries')
        plt.title('Active Learning - Query choice strategies')
        plt.ylim([0, 1])
        plt.show(block=False)

        for i in range(quota):
            print("\n#################################################")
            print("Query number " + str(i) + " : ")
            print("#################################################\n")
            text_file.write("\n#################################################\n")
            text_file.write("Query number %s : " % str(i))
            text_file.write("\n#################################################\n")

            for run in runs:
                run['errors'].append(
                    _query_and_retrain(run['title'], run['qs'], run['ds'],
                                       model, tst_ds, text_file))

            # Rescale the axes and push the extended curves to the figure.
            ax.set_xlim((0, i + 1))
            ax.set_ylim((0, max(max(run['errors']) for run in runs) + 0.2))
            query_num = np.arange(0, i + 2)
            for run in runs:
                run['line'].set_xdata(query_num)
                run['line'].set_ydata(run['errors'])
            plt.draw()

        time_total = time.time() - t0
        print("\n\n\n#################################################\n")
        print("Execution time : %fs \n\n" % time_total)
        text_file.write("\n\n\n#################################################\n")
        text_file.write("Execution time : %fs \n" % time_total)

    input("Press any key to save the plot...")
    plt.savefig('task_' + str(timestr) + '.png')
    print("Done")
def getQueryStrategy(query_strategy, train_ds, disagreement, estimator_name=None):
    """Create the query strategy named by ``query_strategy``.

    Supported names:

        'uncertainty'            UncertaintySampling ('lc') baseline
        'random'                 RandomSampling baseline
        'lr_lsvc_rf_dt'          committee containing LinearSVC
        'lr_svc_rf_dt'           committee with SVC(probability=True)
        'lr_svc_dt_xgb'          committee with SVC(probability=True)
        'lr_svc_dt_xgb_rf'       committee of five
        'lr_lsvc_dt_gpc'         committee containing LinearSVC
        'lr_lsvc_dt_xgb'         committee containing LinearSVC
        'homogeneous_committee'  committee built by CommitteeModels

    Args:
        query_strategy: one of the names above.
        train_ds: training dataset handed to the strategy constructor.
        disagreement: disagreement measure forwarded to QueryByCommittee
            for the heterogeneous committees.
        estimator_name: passed to ``CommitteeModels``; only read for
            'homogeneous_committee'.

    Returns:
        The query-strategy object, or ``None`` (after printing a notice)
        when ``query_strategy`` is not recognised.

    Raises:
        ValueError: if ``disagreement`` is 'kl_divergence' and the chosen
            committee contains LinearSVC.
    """
    print('Initialize Query Strategy')

    # Baselines: no committee involved.
    if query_strategy == 'uncertainty':
        return UncertaintySampling(train_ds, method='lc',
                                   model=la.LogisticRegression_())
    if query_strategy == 'random':
        return RandomSampling(train_ds)

    if query_strategy == 'homogeneous_committee':
        committee = CommitteeModels(estimator_name)
        # NOTE(review): ``disagreement`` is not forwarded on this path, so
        # whatever default QueryByCommittee applies is used -- confirm
        # that this is intended.
        return QueryByCommittee(train_ds, models=committee.committee['models'])

    def logistic():
        return la.LogisticRegression_(solver='liblinear', max_iter=1000)

    def prob_svc():
        # SVC with probability=True stands in for LinearSVC where class
        # probabilities are required.
        return la.SVC_(kernel='linear', probability=True)

    def xgb():
        return la.XGBClassifier_(objective="binary:logistic")

    # Builders are lazy so that models are only instantiated for the
    # committee that was actually requested.
    committee_builders = {
        'lr_lsvc_rf_dt': lambda: [la.RandomForest_(), la.DecisionTree_(),
                                  logistic(), la.LinearSVC_()],
        'lr_svc_rf_dt': lambda: [la.RandomForest_(), la.DecisionTree_(),
                                 logistic(), prob_svc()],
        'lr_svc_dt_xgb': lambda: [logistic(), prob_svc(),
                                  la.DecisionTree_(), xgb()],
        'lr_svc_dt_xgb_rf': lambda: [logistic(), prob_svc(),
                                     la.DecisionTree_(), xgb(),
                                     la.RandomForest_()],
        'lr_lsvc_dt_gpc': lambda: [logistic(), la.LinearSVC_(),
                                   la.DecisionTree_(), la.GaussianProcess_()],
        'lr_lsvc_dt_xgb': lambda: [logistic(), la.LinearSVC_(),
                                   la.DecisionTree_(), xgb()],
    }
    linear_svc_committees = ('lr_lsvc_rf_dt', 'lr_lsvc_dt_gpc', 'lr_lsvc_dt_xgb')

    build_models = committee_builders.get(query_strategy)
    if build_models is None:
        print("Query strategy not defined!")
        return None

    # Reject the incompatible combination before any model is constructed.
    if query_strategy in linear_svc_committees and disagreement == 'kl_divergence':
        raise ValueError(
            'when using kl_divergence lsvc cannot be in the committee as '
            'linearSVC does not provide predict_proba(). '
            'Use svc instead or change disagreement to vote!')

    return QueryByCommittee(train_ds, models=build_models(),
                            disagreement=disagreement)