def build_query_strategy(sent_df, col_names):
    # type: (DataFrame, ColumnNames) -> QueryStrategy
    """Build and return a QueryStrategy from a feature extractor and a base DataFrame."""
    init_extractor = SynStateALHeuristic.build_feature_extractor(sent_df, col_names)
    combined_features = init_extractor.transform(sent_df, col_names)
    trn_ds = TextDataset(sent_df, col_names, None, features=combined_features)
    return ActiveLearningByLearning(
        trn_ds,
        query_strategies=[
            UncertaintySampling(trn_ds,
                                model=SVM(C=100, gamma=3.1, kernel='rbf',
                                          decision_function_shape='ovr')),
            QUIRE(trn_ds),
            HintSVM(trn_ds, cl=1.0, ch=1.0),
        ],
        T=1000,
        uniform_sampler=True,
        model=SVM(C=100, gamma=3.1, kernel='rbf',
                  decision_function_shape='ovr'))
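# --- Hedged usage sketch (not part of the original module): how the ALBL
# strategy returned by build_query_strategy() is typically driven. It relies
# on libact's QueryStrategy.dataset property and Dataset.update(); `labeler`
# (any libact Labeler, e.g. IdealLabeler) and `quota` are illustrative
# assumptions, not names from the original code.
def query_loop(qs, labeler, quota):
    trn_ds = qs.dataset  # the Dataset the strategy was built around
    for _ in range(quota):
        ask_id = qs.make_query()                       # entry id chosen by ALBL
        feature = trn_ds.data[ask_id][0]               # its feature vector
        trn_ds.update(ask_id, labeler.label(feature))  # reveal the label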
def train_for_user(user_id=None, device_type=None, n_class=None):
    test_data = waterloo_iv_processing.get_per_user_data(
        user_id=user_id, device=device_type,
        video_name=['sports', 'document', 'nature', 'game', 'movie'])
    X, y = processing_training_data(n_class=n_class, train_data=test_data)
    test_size = 0.2  # the percentage of samples in the dataset that will be
                     # randomly selected and assigned to the test set
    quota = 350      # number of samples to query
    result = {'E1': [], 'E2': [], 'E3': []}
    for i in range(20):
        print('exp:', i)
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = split_train_test(
            X=X, y=y, test_size=test_size, n_class=n_class)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')

        qs = UncertaintySampling(
            trn_ds, method='sm', model=SVM(decision_function_shape='ovr'))
        _, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota, cost_matrix)
        result['E1'].append(E_out_1)

        qs2 = RandomSampling(trn_ds2)
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota, cost_matrix)
        result['E2'].append(E_out_2)

        qs3 = ALCE(trn_ds3, cost_matrix, SVR())
        _, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota, cost_matrix)
        result['E3'].append(E_out_3)

    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    E_out_3 = np.mean(result['E3'], axis=0)
    save_file('results/' + device_type + '_user_' + str(user_id) +
              '_E1_class_' + str(n_class) + '.txt', result['E1'])
    save_file('results/' + device_type + '_user_' + str(user_id) +
              '_E2_class_' + str(n_class) + '.txt', result['E2'])
    save_file('results/' + device_type + '_user_' + str(user_id) +
              '_E3_class_' + str(n_class) + '.txt', result['E3'])
    print("Uncertainty: ", E_out_1[::5].tolist())
    print("Random: ", E_out_2[::5].tolist())
    print("ALCE: ", E_out_3[::5].tolist())

    query_num = np.arange(0, quota + 1)
    uncert, = plt.plot(query_num, E_out_1, 'g', label='Uncertainty sampling')
    rd, = plt.plot(query_num, E_out_2, 'k', label='Random')
    alce, = plt.plot(query_num, E_out_3, 'r', label='ALCE')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result (user ' + str(user_id) + ')')
    plt.legend(handles=[uncert, rd, alce], loc=3)
    plt.show()
def main():
    test_size = 0.25  # the percentage of samples in the dataset that will be
                      # randomly selected and assigned to the test set
    result = {'E1': [], 'E2': [], 'E3': []}
    for i in range(2):
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = \
            split_train_test(test_size)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')

        quota = 100  # number of samples to query

        qs = UncertaintySampling(
            trn_ds, method='sm', model=SVM(decision_function_shape='ovr'))
        _, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota, cost_matrix)
        result['E1'].append(E_out_1)

        qs2 = RandomSampling(trn_ds2)
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota, cost_matrix)
        result['E2'].append(E_out_2)

        qs3 = ALCE(trn_ds3, cost_matrix, SVR())
        _, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota, cost_matrix)
        result['E3'].append(E_out_3)

    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    E_out_3 = np.mean(result['E3'], axis=0)
    # print("Uncertainty: ", E_out_1[::5].tolist())
    # print("Random: ", E_out_2[::5].tolist())
    # print("ALCE: ", E_out_3[::5].tolist())

    query_num = np.arange(0, quota + 1)
    plt.figure(figsize=(10, 8))
    plt.plot(query_num, E_out_1, 'g', label='Uncertainty sampling')
    plt.plot(query_num, E_out_2, 'k', label='Random')
    plt.plot(query_num, E_out_3, 'r', label='ALCE')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, ncol=5)
    plt.show()
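# --- The run() helper called by the experiment scripts above is not shown.
# A minimal sketch consistent with libact's bundled examples: it records the
# error once before the first query (so the curve has quota + 1 points, as
# the plots above expect) and then once per query. Callers that pass a
# cost_matrix presumably compute a cost-sensitive error instead of
# 1 - model.score(); that variant is omitted here, so treat this as an
# assumption, not the original implementation.
def run(trn_ds, tst_ds, lbr, model, qs, quota, cost_matrix=None):
    E_in, E_out = [], []
    model.train(trn_ds)
    E_in = np.append(E_in, 1 - model.score(trn_ds))    # error before any query
    E_out = np.append(E_out, 1 - model.score(tst_ds))
    for _ in range(quota):
        ask_id = qs.make_query()           # strategy picks an entry to label
        X, _ = zip(*trn_ds.data)
        lb = lbr.label(X[ask_id])          # oracle provides the true label
        trn_ds.update(ask_id, lb)
        model.train(trn_ds)
        E_in = np.append(E_in, 1 - model.score(trn_ds))
        E_out = np.append(E_out, 1 - model.score(tst_ds))
    return E_in, E_out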
def test_ALBLTestCase(self):
    trn_ds = Dataset(
        self.X,
        np.concatenate([self.y[:10], [None] * (len(self.y) - 10)]))
    qs = ActiveLearningByLearning(
        trn_ds,
        T=self.quota,
        query_strategies=[
            UncertaintySampling(
                trn_ds,
                model=SVM(kernel="linear", decision_function_shape="ovr")),
            QUIRE(trn_ds),
            RandomSampling(trn_ds)
        ],
        model=SVM(kernel="linear", decision_function_shape="ovr"),
        random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq, np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33]))
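# --- run_qs() used by this and the following tests is a small driver that
# labels each queried entry with the ground truth and records the query
# sequence. A sketch matching how libact's own test utilities behave
# (an assumption, not the original helper):
def run_qs(trn_ds, qs, truth, quota):
    ret = []
    for _ in range(quota):
        ask_id = qs.make_query()              # id chosen by the strategy
        trn_ds.update(ask_id, truth[ask_id])  # label it with the ground truth
        ret.append(ask_id)
    return np.array(ret)                      # the sequence asserted on above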
def test_hs_subsampling(self):
    ds = Dataset(self.X, self.y[:10] + [None] * (len(self.y) - 10))
    sub_qs = UncertaintySampling(
        ds, model=SVM(gamma='auto', decision_function_shape='ovr'))
    qs = HS(ds, self.classes, subsample_qs=sub_qs, random_state=1126)
    qseq = run_qs(ds, qs, self.y, len(self.y) - 10)
    assert_array_equal(
        np.concatenate([qseq[:10], qseq[-10:]]),
        np.array([120, 50, 33, 28, 78, 133, 52, 124, 102, 109,
                  81, 108, 10, 89, 126, 114, 92, 48, 25, 13]))
def test_multilabel_with_auxiliary_learner_mmr(self):
    trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    qs = MultilabelWithAuxiliaryLearner(
        trn_ds,
        major_learner=BinaryRelevance(
            LogisticRegression(solver='liblinear', multi_class="ovr")),
        auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
        criterion='mmr',
        random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([1258, 1461, 231, 1198, 1498, 1374, 955, 1367, 265, 144]))
def test_multilabel_with_auxiliary_learner_hlr(self):
    trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    qs = MultilabelWithAuxiliaryLearner(
        trn_ds,
        major_learner=BinaryRelevance(LogisticRegression()),
        auxiliary_learner=BinaryRelevance(SVM()),
        criterion='hlr',
        random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([701, 1403, 147, 897, 974, 1266, 870, 703, 292, 1146]))
def test_svm(self):
    svc_clf = SVC(gamma="auto")
    svc_clf.fit(self.X_train, self.y_train)
    svm = SVM(gamma="auto")
    svm.train(Dataset(self.X_train, self.y_train))

    assert_array_equal(
        svc_clf.predict(self.X_train), svm.predict(self.X_train))
    assert_array_equal(
        svc_clf.predict(self.X_test), svm.predict(self.X_test))
    self.assertEqual(
        svc_clf.score(self.X_train, self.y_train),
        svm.score(Dataset(self.X_train, self.y_train)))
    self.assertEqual(
        svc_clf.score(self.X_test, self.y_test),
        svm.score(Dataset(self.X_test, self.y_test)))
def test_multilabel_with_auxiliary_learner_shlr(self):
    trn_ds = Dataset(self.X, self.y[:5] + [None] * (len(self.y) - 5))
    qs = MultilabelWithAuxiliaryLearner(
        trn_ds,
        major_learner=BinaryRelevance(
            LogisticRegression(solver='liblinear', multi_class="ovr")),
        auxiliary_learner=BinaryRelevance(SVM(gamma="auto")),
        criterion='shlr',
        b=1.,
        random_state=1126)
    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    assert_array_equal(
        qseq,
        np.array([1258, 805, 459, 550, 783, 964, 736, 1004, 38, 750]))
def test_uncertainty_entropy_exceptions(self):
    trn_ds = init_toyexample(self.X, self.y)
    with self.assertRaises(TypeError):
        qs = UncertaintySampling(trn_ds, method='entropy', model=SVM())
    with self.assertRaises(TypeError):
        qs = UncertaintySampling(trn_ds, method='entropy',
                                 model=Perceptron())
    with self.assertRaises(TypeError):
        qs = UncertaintySampling(trn_ds, method='not_exist',
                                 model=LogisticRegression())
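# --- For contrast with the failure cases above: method='entropy' needs a
# model that exposes predict_proba(), which libact's LogisticRegression does
# (SVM and Perceptron do not, hence the TypeErrors). A minimal sketch,
# assuming the same toy-dataset helper:
def entropy_qs_ok(X, y):
    ds = init_toyexample(X, y)
    return UncertaintySampling(ds, method='entropy',
                               model=LogisticRegression())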
def test_svm(self):
    svc_clf = SVC()
    svc_clf.fit(self.X_train, self.y_train)
    svm = SVM()
    svm.train(Dataset(self.X_train, self.y_train))

    assert_array_equal(
        svc_clf.predict(self.X_train), svm.predict(self.X_train))
    assert_array_equal(
        svc_clf.predict(self.X_test), svm.predict(self.X_test))
    self.assertEqual(
        svc_clf.score(self.X_train, self.y_train),
        svm.score(Dataset(self.X_train, self.y_train)))
    self.assertEqual(
        svc_clf.score(self.X_test, self.y_test),
        svm.score(Dataset(self.X_test, self.y_test)))
def run_active_learning():
    logger = SimpleLogger(LOG_FILE)
    dm = DataManager()
    im = InterpretableDataManager()
    drp_model = SVM(kernel=KERNEL, probability=True)
    lime_model = svm.SVC(kernel=KERNEL, probability=True)
    accs = [[], [], []]
    mccs = [[], [], []]
    labeled_indices = dm.get_labeled_indices()
    logger.log(0, labeled_indices)
    for strategy in STRATEGIES:
        trn_ds = dm.trn_ds_list[strategy]
        drp_model.train(trn_ds)
        update_accs_mccs(accs, mccs, dm, drp_model.model.predict, strategy)
    print_last_round_mcc(0, accs, mccs)
    assert AL_ROUNDS <= len(dm.y_train) - INITIAL_INSTANCES

    for round in xrange(1, AL_ROUNDS + 1):
        print "================================================="
        print "Round", round
        print "================================================="
        for strategy in STRATEGIES:
            trn_ds = dm.trn_ds_list[strategy]
            exclusion = set()
            batch = set()
            unlabeled_indices, unlabeled_X_scaled = zip(
                *trn_ds.get_unlabeled_entries())
            certainties = get_certainties(drp_model.model, dm.X_train_scaled)
            if strategy == EAL:
                threshold = get_certainty_threshold(
                    drp_model.model, dm.X_train_scaled, THRESHOLD)
                y_certainty = discretize_certainties(certainties, threshold)
                lime_model.fit(dm.X_train_scaled_e, y_certainty)
                if SHOW_LIME:
                    certainties_test = get_certainties(
                        drp_model.model, dm.X_test_scaled)
                    y_certainty_test = discretize_certainties(
                        certainties_test, threshold)
                    print_lime_model_performance(lime_model, dm,
                                                 y_certainty_test)
                while len(batch) < BATCH_SIZE:
                    query_id = query_least_confident(
                        unlabeled_indices, certainties, exclusion)
                    query = dm.X_train_scaled[query_id]
                    query_unscaled = dm.X_train_e[query_id]
                    instance_certainty = get_certainty(drp_model.model, query)
                    print "Explaining Query with id #{:d}".format(query_id)
                    print "Certainty {:.3f}".format(instance_certainty)
                    explainer = LimeTabularExplainer(
                        dm.X_train_e,
                        training_labels=y_certainty,
                        feature_names=dm.feature_names_e,
                        class_names=["uncertain", "certain"],
                        discretize_continuous=True,
                        discretizer="entropy")
                    predict_fn = lambda x: lime_model.predict_proba(
                        dm.scaler_e.transform(x)).astype(float)
                    for i in xrange(0, MAX_EXP_FEATURE, 2):
                        exp = explainer.explain_instance(
                            query_unscaled, predict_fn,
                            num_features=NUM_FEATURES + i)
                        uncertain_exp_list = get_uncertain_exps(exp)
                        if len(uncertain_exp_list) >= NUM_FEATURES - 2:
                            break
                        print "INFO: looping"
                    if SHOW_LIME:
                        print_lime_model_prediction(predict_fn, query_unscaled)
                    exp_indices = get_indices_exp_region(
                        exp, dm, unlabeled_indices, y_certainty)
                    exp_instances = get_values_of_indices(
                        exp_indices, dm.X_train_scaled)
                    exp_certainties = get_values_of_indices(
                        exp_indices, certainties)
                    batch_indices = select_batch(
                        min(BATCH_SIZE, BATCH_SIZE - len(batch)),
                        exp_indices, exp_instances, exp_certainties,
                        "k-means-uncertain")
                    if len(batch_indices) == 0:
                        exclusion.add(query_id)
                        continue
                    print ""
                    print_explanation_drp(uncertain_exp_list, False)
                    print ""
                    print "Instances in the batch: {}".format(
                        len(batch_indices))
                    im.describe_instances(batch_indices)
                    print ""
                    im.describe_instance(query_id)
                    print ""
                    exclusion.update(set(exp_indices))
                    if ask_expert():
                        batch.update(set(batch_indices))
                    else:
                        print "INFO: Not including in the batch"
                logger.log(round, batch)
                print "INFO: Labeling the batch"
                label_batch(trn_ds, dm.y_train, batch)
            elif strategy == AL:  # AL + k-means-uncertain
                unlabeled_X_scaled = get_values_of_indices(
                    unlabeled_indices, dm.X_train_scaled)
                unlabeled_certainties = get_values_of_indices(
                    unlabeled_indices, certainties)
                batch_indices = select_batch(
                    BATCH_SIZE, unlabeled_indices, unlabeled_X_scaled,
                    unlabeled_certainties, "k-means-uncertain")
                label_batch(trn_ds, dm.y_train, batch_indices)
            elif strategy == PL:  # Passive Learning
                batch_indices = random.sample(unlabeled_indices, BATCH_SIZE)
                label_batch(trn_ds, dm.y_train, batch_indices)
            drp_model.train(trn_ds)
            update_accs_mccs(accs, mccs, dm, drp_model.model.predict, strategy)
        print_mcc_summary(mccs)
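# --- select_batch() with mode "k-means-uncertain" is defined elsewhere. A
# plausible reading, sketched purely as an illustration (an assumption about
# the idea, not the original implementation): cluster the candidate pool with
# k-means and take the least-certain instance from each cluster, trading off
# informativeness against diversity.
from sklearn.cluster import KMeans
import numpy as np

def kmeans_uncertain_sketch(batch_size, indices, instances, certainties):
    km = KMeans(n_clusters=batch_size).fit(np.asarray(instances))
    picked = []
    for c in range(batch_size):
        members = [i for i, lab in enumerate(km.labels_) if lab == c]
        if members:
            best = min(members, key=lambda i: certainties[i])  # least certain
            picked.append(indices[best])
    return picked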
def main():
    test_size = 0.25  # the percentage of samples in the dataset that will be
                      # randomly selected and assigned to the test set
    result = {'E1': [], 'E2': [], 'E3': [], 'E4': [], 'E5': [], 'E6': []}
    for i in range(10):  # repeat experiment
        trn_ds, tst_ds, fully_labeled_trn_ds = split_train_test(test_size)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        trn_ds4 = copy.deepcopy(trn_ds)
        trn_ds5 = copy.deepcopy(trn_ds)
        trn_ds6 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = BinaryRelevance(LogisticRegression())

        quota = 150  # number of samples to query

        qs = MMC(trn_ds, br_base=LogisticRegression())
        _, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota)
        result['E1'].append(E_out_1)

        qs2 = RandomSampling(trn_ds2)
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)
        result['E2'].append(E_out_2)

        qs3 = MultilabelWithAuxiliaryLearner(
            trn_ds3,
            BinaryRelevance(LogisticRegression()),
            BinaryRelevance(SVM()),
            criterion='hlr')
        _, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota)
        result['E3'].append(E_out_3)

        qs4 = MultilabelWithAuxiliaryLearner(
            trn_ds4,
            BinaryRelevance(LogisticRegression()),
            BinaryRelevance(SVM()),
            criterion='shlr')
        _, E_out_4 = run(trn_ds4, tst_ds, lbr, model, qs4, quota)
        result['E4'].append(E_out_4)

        qs5 = MultilabelWithAuxiliaryLearner(
            trn_ds5,
            BinaryRelevance(LogisticRegression()),
            BinaryRelevance(SVM()),
            criterion='mmr')
        _, E_out_5 = run(trn_ds5, tst_ds, lbr, model, qs5, quota)
        result['E5'].append(E_out_5)

        qs6 = BinaryMinimization(trn_ds6, LogisticRegression())
        _, E_out_6 = run(trn_ds6, tst_ds, lbr, model, qs6, quota)
        result['E6'].append(E_out_6)

    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    E_out_3 = np.mean(result['E3'], axis=0)
    E_out_4 = np.mean(result['E4'], axis=0)
    E_out_5 = np.mean(result['E5'], axis=0)
    E_out_6 = np.mean(result['E6'], axis=0)
    print("MMC: ", E_out_1[::5].tolist())
    print("Random: ", E_out_2[::5].tolist())
    print("MultilabelWithAuxiliaryLearner_hlr: ", E_out_3[::5].tolist())
    print("MultilabelWithAuxiliaryLearner_shlr: ", E_out_4[::5].tolist())
    print("MultilabelWithAuxiliaryLearner_mmr: ", E_out_5[::5].tolist())
    print("BinaryMinimization: ", E_out_6[::5].tolist())

    query_num = np.arange(1, quota + 1)
    fig = plt.figure(figsize=(9, 6))
    ax = plt.subplot(111)
    ax.plot(query_num, E_out_1, 'g', label='MMC')
    ax.plot(query_num, E_out_2, 'k', label='Random')
    ax.plot(query_num, E_out_3, 'r', label='AuxiliaryLearner_hlr')
    ax.plot(query_num, E_out_4, 'b', label='AuxiliaryLearner_shlr')
    ax.plot(query_num, E_out_5, 'c', label='AuxiliaryLearner_mmr')
    ax.plot(query_num, E_out_6, 'm', label='BinaryMinimization')
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.75, box.height])
    plt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)
    plt.xlabel('Number of Queries')
    plt.ylabel('Loss')
    plt.title('Experiment Result (Hamming Loss)')
    plt.show()
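# --- In this multilabel script, run() differs from the multiclass version
# mainly in scoring. A sketch assuming (as in libact's multilabel example)
# that BinaryRelevance.score(ds, criterion='hamming') returns the Hamming
# loss being plotted; the labeler returns a full label vector per query:
def run(trn_ds, tst_ds, lbr, model, qs, quota):
    E_in, E_out = [], []
    for _ in range(quota):
        ask_id = qs.make_query()
        lb = lbr.label(trn_ds.data[ask_id][0])  # full multilabel vector
        trn_ds.update(ask_id, lb)
        model.train(trn_ds)
        E_in = np.append(E_in, model.score(trn_ds, criterion='hamming'))
        E_out = np.append(E_out, model.score(tst_ds, criterion='hamming'))
    return E_in, E_out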
''' MAIN FUNCTION '''
result = {'Hamming': [], 'F1': []}
model = BinaryRelevance(LogisticRegression())
quota = 20  # number of samples to query

# EXECUTE FROM HERE FOR ITERATIONS
qs1 = MultilabelWithAuxiliaryLearner(
    trn_ds,
    BinaryRelevance(LogisticRegression()),
    BinaryRelevance(SVM()),
    criterion='hlr')
run(data_CV_train, trn_ds, qs1, quota)

model.train(trn_ds)
X, y = zip(*tst_ds.get_labeled_entries())
pred = model.predict(X)

# one output column per label: UE, BR and FR predictions
output = pd.DataFrame()
output['UE_pred'] = [pred[i][0] for i in range(len(pred))]
output['BR_pred'] = [pred[i][1] for i in range(len(pred))]
output['FR_pred'] = [pred[i][2] for i in range(len(pred))]
def main():
    global pos_filepath, dataset_filepath, csv_filepath, vectors_list, ids_list
    dataset_filepath = "/Users/dndesign/Desktop/active_learning/vecteurs_et_infos/vectors_2015.txt"
    csv_filepath = "/Users/dndesign/Desktop/active_learning/donnees/corpus_2015_id-time-text.csv"
    pos_filepath = "/Users/dndesign/Desktop/active_learning/donnees/oriane_pos_id-time-text.csv"
    vectors_list, ids_list = get_vectors_list(dataset_filepath)

    timestr = time.strftime("%Y%m%d_%H%M%S")
    text_file = codecs.open("task_" + str(timestr) + ".txt", "w", "utf-8")

    print("Loading data...")
    text_file.write("Loading data...\n")

    # Open the dataset file and count its entries
    t0 = time.time()
    file = openfile_txt(dataset_filepath)
    num_lines = sum(1 for line in file)
    print("Treating " + str(num_lines) + " entries...")
    text_file.write("Treating : %s entries...\n" % str(num_lines))

    # Number of queries to ask the human to label
    quota = 10
    E_out1, E_out2, E_out3, E_out4, E_out6, E_out7 = [], [], [], [], [], []

    trn_ds, tst_ds = split_train_test(csv_filepath)

    model = SVM(kernel='linear')
    # model = LogisticRegression()

    ''' UncertaintySampling (Least Confident)

        UncertaintySampling queries the instances about whose labels it is
        least certain. Least Confident queries the instance whose posterior
        probability of being positive is nearest 0.5.
    '''
    qs = UncertaintySampling(trn_ds, method='lc',
                             model=LogisticRegression(C=.01))
    model.train(trn_ds)
    E_out1 = np.append(E_out1, 1 - model.score(tst_ds))

    ''' UncertaintySampling (Max Margin) '''
    trn_ds2 = copy.deepcopy(trn_ds)
    qs2 = USampling(trn_ds2, method='mm', model=SVM(kernel='linear'))
    model.train(trn_ds2)
    E_out2 = np.append(E_out2, 1 - model.score(tst_ds))

    ''' CMB Sampling

        Combination of active learning algorithms: distance-based (DIST) and
        diversity-based (DIV).
    '''
    trn_ds3 = copy.deepcopy(trn_ds)
    qs3 = CMBSampling(trn_ds3, model=SVM(kernel='linear'))
    model.train(trn_ds3)
    E_out3 = np.append(E_out3, 1 - model.score(tst_ds))

    ''' Random Sampling

        Random chooses a query at random.
    '''
    trn_ds4 = copy.deepcopy(trn_ds)
    qs4 = RandomSampling(trn_ds4, random_state=1126)
    model.train(trn_ds4)
    E_out4 = np.append(E_out4, 1 - model.score(tst_ds))

    ''' QueryByCommittee (Vote Entropy)

        QueryByCommittee keeps a committee of classifiers and queries the
        instance on which the committee members disagree most; it examines
        unlabeled examples and selects only those that are most informative
        for labeling. Vote Entropy is one way of measuring that disagreement.
        Disadvantage: it does not consider the committee members' class
        distributions, so it misses some informative unlabeled examples.
    '''
    trn_ds6 = copy.deepcopy(trn_ds)
    qs6 = QueryByCommittee(trn_ds6, disagreement='vote',
                           models=[LogisticRegression(C=1.0),
                                   LogisticRegression(C=0.01),
                                   LogisticRegression(C=100)],
                           random_state=1126)
    model.train(trn_ds6)
    E_out6 = np.append(E_out6, 1 - model.score(tst_ds))

    ''' QueryByCommittee (Kullback-Leibler Divergence)

        QueryByCommittee examines unlabeled examples and selects only those
        that are most informative for labeling.
        Disadvantage: it misses some examples on which the committee members
        disagree.
    '''
    trn_ds7 = copy.deepcopy(trn_ds)
    qs7 = QueryByCommittee(trn_ds7, disagreement='kl_divergence',
                           models=[LogisticRegression(C=1.0),
                                   LogisticRegression(C=0.01),
                                   LogisticRegression(C=100)],
                           random_state=1126)
    model.train(trn_ds7)
    E_out7 = np.append(E_out7, 1 - model.score(tst_ds))

    with sns.axes_style("darkgrid"):
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)

    query_num = np.arange(0, 1)
    p1, = ax.plot(query_num, E_out1, 'red')
    p2, = ax.plot(query_num, E_out2, 'blue')
    p3, = ax.plot(query_num, E_out3, 'green')
    p4, = ax.plot(query_num, E_out4, 'orange')
    p6, = ax.plot(query_num, E_out6, 'black')
    p7, = ax.plot(query_num, E_out7, 'purple')
    plt.legend(('Least Confident', 'Max Margin', 'Distance Diversity CMB',
                'Random Sampling', 'Vote Entropy', 'KL Divergence'), loc=1)
    plt.ylabel('Accuracy')
    plt.xlabel('Number of Queries')
    plt.title('Active Learning - Query choice strategies')
    plt.ylim([0, 1])
    plt.show(block=False)

    for i in range(quota):
        print("\n#################################################")
        print("Query number " + str(i) + " : ")
        print("#################################################\n")
        text_file.write("\n#################################################\n")
        text_file.write("Query number %s : " % str(i))
        text_file.write("\n#################################################\n")

        ask_id = qs.make_query()
        print("\033[4mUsing Uncertainty Sampling (Least confident) :\033[0m")
        print("Tweet :" + define_tweet_by_id(ask_id), end='', flush=True)
        print("Simulating human response : " +
              str(simulate_human_decision(ask_id)) + " \n")
        text_file.write("Using Uncertainty Sampling (Least confident) :\n")
        text_file.write("Tweet : %s \n" % str(define_tweet_by_id(ask_id)))
        text_file.write("Simulating human response : %s \n\n" %
                        str(simulate_human_decision(ask_id)))
        trn_ds.update(ask_id, simulate_human_decision(ask_id))
        model.train(trn_ds)
        E_out1 = np.append(E_out1, 1 - model.score(tst_ds))

        ask_id = qs2.make_query()
        print("\033[4mUsing Uncertainty Sampling (Max Margin) :\033[0m")
        print("Tweet :" + define_tweet_by_id(ask_id), end='', flush=True)
        print("Simulating human response : " +
              str(simulate_human_decision(ask_id)) + " \n")
        text_file.write("Using Uncertainty Sampling (Max Margin) :\n")
        text_file.write("Tweet : %s \n" % str(define_tweet_by_id(ask_id)))
        text_file.write("Simulating human response : %s \n\n" %
                        str(simulate_human_decision(ask_id)))
        trn_ds2.update(ask_id, simulate_human_decision(ask_id))
        model.train(trn_ds2)
        E_out2 = np.append(E_out2, 1 - model.score(tst_ds))

        ask_id = qs3.make_query()
        print("\033[4mUsing CMB Distance-Diversity Sampling :\033[0m")
        print("Tweet :" + define_tweet_by_id(ask_id), end='', flush=True)
        print("Simulating human response : " +
              str(simulate_human_decision(ask_id)) + " \n")
        text_file.write("Using CMB Distance-Diversity Sampling :\n")
        text_file.write("Tweet : %s \n" % str(define_tweet_by_id(ask_id)))
        text_file.write("Simulating human response : %s \n\n" %
                        str(simulate_human_decision(ask_id)))
        trn_ds3.update(ask_id, simulate_human_decision(ask_id))
        model.train(trn_ds3)
        E_out3 = np.append(E_out3, 1 - model.score(tst_ds))

        ask_id = qs4.make_query()
        print("\033[4mUsing Random Sampling :\033[0m")
        print("Tweet :" + define_tweet_by_id(ask_id), end='', flush=True)
        print("Simulating human response : " +
              str(simulate_human_decision(ask_id)) + " \n")
        text_file.write("Using Random Sampling :\n")
        text_file.write("Tweet : %s \n" % str(define_tweet_by_id(ask_id)))
        text_file.write("Simulating human response : %s \n\n" %
                        str(simulate_human_decision(ask_id)))
        trn_ds4.update(ask_id, simulate_human_decision(ask_id))
        model.train(trn_ds4)
        E_out4 = np.append(E_out4, 1 - model.score(tst_ds))

        ask_id = qs6.make_query()
        print("\033[4mUsing QueryByCommittee (Vote Entropy) :\033[0m")
        print("Tweet :" + define_tweet_by_id(ask_id), end='', flush=True)
        print("Simulating human response : " +
              str(simulate_human_decision(ask_id)) + " \n")
        text_file.write("Using QueryByCommittee (Vote Entropy) :\n")
        text_file.write("Tweet : %s \n" % str(define_tweet_by_id(ask_id)))
        text_file.write("Simulating human response : %s \n\n" %
                        str(simulate_human_decision(ask_id)))
        trn_ds6.update(ask_id, simulate_human_decision(ask_id))
        model.train(trn_ds6)
        E_out6 = np.append(E_out6, 1 - model.score(tst_ds))

        ask_id = qs7.make_query()
        print("\033[4mUsing QueryByCommittee (KL Divergence) :\033[0m")
        print("Tweet :" + define_tweet_by_id(ask_id), end='', flush=True)
        print("Simulating human response : " +
              str(simulate_human_decision(ask_id)) + " \n")
        text_file.write("Using QueryByCommittee (KL Divergence) :\n")
        text_file.write("Tweet : %s \n" % str(define_tweet_by_id(ask_id)))
        text_file.write("Simulating human response : %s \n\n" %
                        str(simulate_human_decision(ask_id)))
        trn_ds7.update(ask_id, simulate_human_decision(ask_id))
        model.train(trn_ds7)
        E_out7 = np.append(E_out7, 1 - model.score(tst_ds))

        # Refresh the live plot with the error curves so far
        ax.set_xlim((0, i + 1))
        ax.set_ylim((0, max(max(E_out1), max(E_out2), max(E_out3),
                            max(E_out4), max(E_out6), max(E_out7)) + 0.2))
        query_num = np.arange(0, i + 2)
        p1.set_xdata(query_num)
        p1.set_ydata(E_out1)
        p2.set_xdata(query_num)
        p2.set_ydata(E_out2)
        p3.set_xdata(query_num)
        p3.set_ydata(E_out3)
        p4.set_xdata(query_num)
        p4.set_ydata(E_out4)
        p6.set_xdata(query_num)
        p6.set_ydata(E_out6)
        p7.set_xdata(query_num)
        p7.set_ydata(E_out7)
        plt.draw()

    t2 = time.time()
    time_total = t2 - t0
    print("\n\n\n#################################################\n")
    print("Execution time : %fs \n\n" % time_total)
    text_file.write("\n\n\n#################################################\n")
    text_file.write("Execution time : %fs \n" % time_total)
    text_file.close()

    input("Press any key to save the plot...")
    plt.savefig('task_' + str(timestr) + '.png')
    print("Done")
def main():
    # Specify the parameters here:
    # path to your binary classification dataset
    ds_name = 'australian'
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), '%s.txt' % ds_name)
    test_size = 0.33  # the percentage of samples in the dataset that will be
                      # randomly selected and assigned to the test set
    n_labeled = 10    # number of samples that are initially labeled

    results = []
    for T in range(20):  # repeat the experiment 20 times
        print("%dth experiment" % (T + 1))
        trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
            split_train_test(dataset_filepath, test_size, n_labeled)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        trn_ds4 = copy.deepcopy(trn_ds)
        trn_ds5 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)

        quota = len(y_train) - n_labeled  # number of samples to query

        # Compare five query strategies; model is the base learner,
        # e.g. LogisticRegression, SVM, etc.
        qs = UncertaintySampling(
            trn_ds, model=SVM(decision_function_shape='ovr'))
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_1 = run(trn_ds, tst_ds, lbr, model, qs, quota)
        results.append(E_out_1.tolist())

        qs2 = RandomSampling(trn_ds2)
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_2 = run(trn_ds2, tst_ds, lbr, model, qs2, quota)
        results.append(E_out_2.tolist())

        qs3 = QUIRE(trn_ds3)
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_3 = run(trn_ds3, tst_ds, lbr, model, qs3, quota)
        results.append(E_out_3.tolist())

        qs4 = HintSVM(trn_ds4, cl=1.0, ch=1.0)
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_4 = run(trn_ds4, tst_ds, lbr, model, qs4, quota)
        results.append(E_out_4.tolist())

        qs5 = ActiveLearningByLearning(
            trn_ds5,
            query_strategies=[
                UncertaintySampling(trn_ds5,
                                    model=SVM(kernel='linear',
                                              decision_function_shape='ovr')),
                QUIRE(trn_ds5),
                HintSVM(trn_ds5, cl=1.0, ch=1.0),
            ],
            T=quota,
            uniform_sampler=True,
            model=SVM(kernel='linear', decision_function_shape='ovr'))
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_5 = run(trn_ds5, tst_ds, lbr, model, qs5, quota)
        results.append(E_out_5.tolist())

    # Average the five interleaved result sequences over the 20 repetitions.
    result = []
    for i in range(5):
        _temp = []
        for j in range(i, len(results), 5):
            _temp.append(results[j])
        result.append(np.mean(_temp, axis=0))

    # Plot the learning curve of each strategy: the x-axis is the number of
    # queries, and the y-axis is the corresponding error rate.
    query_num = np.arange(1, quota + 1)
    plt.plot(query_num, result[0], 'g', label='uncertainty sampling')
    plt.plot(query_num, result[1], 'k', label='random')
    plt.plot(query_num, result[2], 'r', label='QUIRE')
    plt.plot(query_num, result[3], 'b', label='HintSVM')
    plt.plot(query_num, result[4], 'c', label='ALBL')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
def main():
    X_train, y_train = load_data(DATA_TRAIN)
    X_test, y_test = load_data(DATA_TEST)
    X_all, y_all = load_data(DATA_ALL)
    trn_ds_eal = make_active_learning_dataset(len(y_train), X_all, y_all)
    trn_ds_al = copy.deepcopy(trn_ds_eal)
    trn_ds_pl = copy.deepcopy(trn_ds_eal)
    svm_model = SVM(kernel=KERNEL, probability=True)
    trn_datasets = [trn_ds_al, trn_ds_eal, trn_ds_pl]
    accs_list = [[], [], []]
    mccs_list = [[], [], []]

    for strategy in STRATEGIES:
        trn_ds = trn_datasets[strategy]
        svm_model.train(trn_ds)
        acc, mcc = compute_acc_mcc(svm_model.model, X_test, y_test)
        accs_list[strategy].append(acc)
        mccs_list[strategy].append(mcc)

    for i in range(ROUNDS):
        for strategy in STRATEGIES:
            trn_ds = trn_datasets[strategy]
            svm_model.train(trn_ds)
            pool_indices, X_pool = zip(*trn_ds.get_unlabeled_entries())
            pool_indices = list(pool_indices)
            certainties = get_certainties(svm_model.model, X_pool)
            if strategy == AL:
                query_indices = select_batch(1, pool_indices, X_pool,
                                             certainties, "q-best")
                query_index = query_indices[0]
                x1, x2 = X_all[query_index]
            elif strategy == EAL:
                query_indices = select_batch(CANDIDATES, pool_indices, X_pool,
                                             certainties, "k-means-uncertain")
                query_indices_q2_q4 = []
                for q in query_indices:
                    x1, x2 = X_all[q]
                    if quadrant(x1, x2) in ["Q2", "Q4"]:
                        query_indices_q2_q4.append(q)
                if query_indices_q2_q4:
                    query_indices = query_indices_q2_q4
                query_index = query_indices[randint(0, len(query_indices) - 1)]
            elif strategy == PL:
                query_index = choice(pool_indices)
                x1, x2 = X_all[query_index]
            trn_ds.update(query_index, y_all[query_index])
            svm_model.train(trn_ds)
            acc, mcc = compute_acc_mcc(svm_model.model, X_test, y_test)
            accs_list[strategy].append(acc)
            mccs_list[strategy].append(mcc)

    for strategy in STRATEGIES:
        strategy_name = STRATEGIY_NAMES[strategy]
        accs_list[strategy] = map(lambda x: pretty_float(x),
                                  accs_list[strategy])
        mccs_list[strategy] = map(lambda x: pretty_float(x),
                                  mccs_list[strategy])
        print "{0}_ACC,".format(strategy_name) + ",".join(accs_list[strategy])
        print "{0}_MCC,".format(strategy_name) + ",".join(mccs_list[strategy])
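# --- quadrant() used in the EAL branch above is defined elsewhere. The
# obvious reading, sketched under the standard mathematical convention
# (an illustrative assumption, not the original helper):
def quadrant(x1, x2):
    if x1 >= 0:
        return "Q1" if x2 >= 0 else "Q4"
    return "Q2" if x2 >= 0 else "Q3"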