def main():
    """Interactively compare uncertainty sampling with random sampling.

    Two copies of the same training pool are queried side by side: one by
    ``UncertaintySampling`` (least confident) and one by ``RandomSampling``.
    Every queried sample is shown to the human through
    ``InteractiveLabeler``; after each round the logistic-regression model
    is retrained on each pool and both test-error curves are redrawn.
    """
    quota = 10  # ask human to label 10 samples (per strategy)
    n_classes = 5
    E_out1, E_out2 = [], []

    trn_ds, tst_ds, ds = split_train_test(n_classes)
    # Independent copy so that both strategies start from the same pool.
    trn_ds2 = copy.deepcopy(trn_ds)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    qs2 = RandomSampling(trn_ds2)

    model = LogisticRegression()

    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Error')

    # Error of each pool before any query has been made.
    model.train(trn_ds)
    E_out1 = np.append(E_out1, 1 - model.score(tst_ds))
    model.train(trn_ds2)
    E_out2 = np.append(E_out2, 1 - model.score(tst_ds))

    query_num = np.arange(0, 1)
    p1, = ax.plot(query_num, E_out1, 'g', label='qs Eout')
    p2, = ax.plot(query_num, E_out2, 'k', label='random Eout')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show(block=False)

    # Second subplot, shrunk slightly to leave room for the legend above it.
    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    img_ax.set_position(
        [box.x0, box.y0 - box.height * 0.1, box.width, box.height * 0.9])

    # Give each label its name (labels are from 0 to n_classes-1)
    lbr = InteractiveLabeler(label_name=[str(lbl) for lbl in range(n_classes)])

    for i in range(quota):
        ask_id = qs.make_query()
        print("asking sample from Uncertainty Sampling")
        # reshape the image to its width and height
        lb = lbr.label(trn_ds.data[ask_id][0].reshape(8, 8))
        trn_ds.update(ask_id, lb)
        model.train(trn_ds)
        E_out1 = np.append(E_out1, 1 - model.score(tst_ds))

        ask_id = qs2.make_query()
        print("asking sample from Random Sample")
        lb = lbr.label(trn_ds2.data[ask_id][0].reshape(8, 8))
        trn_ds2.update(ask_id, lb)
        model.train(trn_ds2)
        E_out2 = np.append(E_out2, 1 - model.score(tst_ds))

        # Push the new points into the existing lines; without this the
        # figure would keep showing only the initial, pre-query point.
        ax.set_xlim((0, i + 1))
        ax.set_ylim((0, max(max(E_out1), max(E_out2)) + 0.2))
        query_num = np.arange(0, i + 2)
        p1.set_xdata(query_num)
        p1.set_ydata(E_out1)
        p2.set_xdata(query_num)
        p2.set_ydata(E_out2)
        plt.draw()

    # The figure was shown non-blocking, so hold the process open until the
    # user has had a chance to look at the final curves.
    input("Press any key to continue...")
def train_for_user(user_id=None, device_type=None, n_class=None):
    """Run the three-strategy active-learning experiment for one user.

    Loads the user's data for ``device_type`` through
    ``waterloo_iv_processing``, repeats the experiment 20 times
    (uncertainty sampling, random sampling, ALCE), saves the per-run error
    curves under ``results/`` and plots the curves averaged over the runs.
    """
    video_names = ['sports', 'document', 'nature', 'game', 'movie']
    user_data = waterloo_iv_processing.get_per_user_data(
        user_id=user_id, device=device_type, video_name=video_names)
    X, y = processing_training_data(n_class=n_class, train_data=user_data)

    test_size = 0.2  # fraction of the samples assigned to the test set
    quota = 350  # number of samples to query
    curve_keys = ('E1', 'E2', 'E3')
    result = {key: [] for key in curve_keys}

    for exp_idx in range(20):
        print('exp:', exp_idx)
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = split_train_test(
            X=X, y=y, test_size=test_size, n_class=n_class)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')

        # Strategies are built lazily, right before their own run, so the
        # construct/run interleaving is: build 1, run 1, build 2, run 2, ...
        experiments = (
            ('E1', trn_ds, lambda: UncertaintySampling(
                trn_ds, method='sm',
                model=SVM(decision_function_shape='ovr'))),
            ('E2', trn_ds2, lambda: RandomSampling(trn_ds2)),
            ('E3', trn_ds3, lambda: ALCE(trn_ds3, cost_matrix, SVR())),
        )
        for key, pool, build_strategy in experiments:
            strategy = build_strategy()
            _, errors = run(pool, tst_ds, lbr, model, strategy, quota,
                            cost_matrix)
            result[key].append(errors)

    # Average every curve over the repeated experiments.
    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    E_out_3 = np.mean(result['E3'], axis=0)

    path_head = 'results/' + device_type + '_user_' + str(user_id)
    path_tail = '_class_' + str(n_class) + '.txt'
    for key in curve_keys:
        save_file(path_head + '_' + key + path_tail, result[key])

    print("Uncertainty: ", E_out_1[::5].tolist())
    print("Random: ", E_out_2[::5].tolist())
    print("ALCE: ", E_out_3[::5].tolist())

    query_num = np.arange(0, quota + 1)
    uncert, = plt.plot(query_num, E_out_1, 'g', label='Uncertainty sampling')
    rd, = plt.plot(query_num, E_out_2, 'k', label='Random')
    alce, = plt.plot(query_num, E_out_3, 'r', label='ALCE')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result (user ' + str(user_id) + ')')
    plt.legend(handles=[uncert, rd, alce], loc=3)
    plt.show()
def test_RandomSampling(self):
    """RandomSampling with a fixed seed must yield the pinned query order."""
    n_labeled = 5
    # Keep the first labels and hide the rest behind None (unlabeled).
    hidden = [None] * (len(self.y) - n_labeled)
    partial_labels = np.concatenate([self.y[:n_labeled], hidden])
    trn_ds = Dataset(self.X, partial_labels)

    sampler = RandomSampling(trn_ds, random_state=1126)
    observed = run_qs(trn_ds, sampler, self.y, self.quota)

    expected = np.array([150, 16, 122, 157, 233, 160, 114, 163, 155, 56])
    assert_array_equal(observed, expected)
def test_RandomSampling(self):
    """RandomSampling with a fixed seed must yield the pinned query order."""
    n_labeled = 5
    # Keep the first labels and hide the rest behind None (unlabeled).
    hidden = [None] * (len(self.y) - n_labeled)
    partial_labels = np.concatenate([self.y[:n_labeled], hidden])
    trn_ds = Dataset(self.X, partial_labels)

    sampler = RandomSampling(trn_ds, random_state=1126)
    observed = run_qs(trn_ds, sampler, self.y, self.quota)

    expected = np.array([33, 143, 198, 29, 248, 92, 236, 212, 185, 163])
    assert_array_equal(observed, expected)
def main():
    """Compare uncertainty sampling, random sampling and ALCE.

    The experiment is repeated twice on fresh train/test splits; the error
    curves returned by ``run`` are averaged over the repetitions and
    plotted against the number of queries.
    """
    # Fraction of the dataset randomly selected and assigned to the test set.
    test_size = 0.25
    quota = 100  # number of samples to query
    result = {'E1': [], 'E2': [], 'E3': []}

    for _ in range(2):
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = \
            split_train_test(test_size)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')

        # Each strategy is created immediately before its own run.
        experiments = (
            ('E1', trn_ds, lambda: UncertaintySampling(
                trn_ds, method='sm',
                model=SVM(decision_function_shape='ovr'))),
            ('E2', trn_ds2, lambda: RandomSampling(trn_ds2)),
            ('E3', trn_ds3, lambda: ALCE(trn_ds3, cost_matrix, SVR())),
        )
        for key, pool, build_strategy in experiments:
            strategy = build_strategy()
            _, errors = run(pool, tst_ds, lbr, model, strategy, quota,
                            cost_matrix)
            result[key].append(errors)

    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    E_out_3 = np.mean(result['E3'], axis=0)

    query_num = np.arange(0, quota + 1)
    plt.figure(figsize=(10, 8))
    curves = (
        (E_out_1, 'g', 'Uncertainty sampling'),
        (E_out_2, 'k', 'Random'),
        (E_out_3, 'r', 'ALCE'),
    )
    for errors, color, label in curves:
        plt.plot(query_num, errors, color, label=label)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, ncol=5)
    plt.show()
def heuristic_score_fun(inst_idx, ss_type):
    """Return the score of instance ``inst_idx`` under ``ss_type``.

    ``ss_type`` is either the string ``"Random"`` (scored by a cached
    ``RandomSampling`` strategy) or a synthetic-state class, which is
    instantiated with a freshly prepared "previous state" object and asked
    for its state score. Expensive helpers are built once and kept in
    ``shared_variables``.
    """
    def cached(key, build):
        # Build on first use, store, and always read the value back from
        # shared_variables.
        if key not in shared_variables:
            shared_variables[key] = build()
        return shared_variables[key]

    if ss_type == "Random":
        def build_random_sampler():
            extractor = SynStateALHeuristic.build_feature_extractor(
                enriched_train_df, col_names)
            return RandomSampling(
                TextDataset(enriched_train_df, col_names, extractor))

        return cached("qs2", build_random_sampler).get_score(inst_idx)

    class Object(object):
        pass

    # python hack for naming a type: a throwaway class named after ss_type
    PS_type = type(ss_type.__name__, (object,), dict(orig_state=Object()))
    ss_prev_state = PS_type()

    if issubclass(ss_type, SynStateALHeuristic):
        qs = cached(
            str(ss_type) + "qs",
            lambda: ss_type.build_query_strategy(enriched_train_df, col_names))
        ss_prev_state.build_next_states_qs = lambda _: qs
    elif ss_type == SynStateTestDataGain:
        enriched_labeled_train_df = cached(
            "en_labeled_train_df",
            lambda: SynStateTestDataGain.label_dataframe_with_expert(
                enriched_train_df, col_names, labeled_df))
        ss_prev_state.build_next_states_labeled_df = \
            lambda _: enriched_labeled_train_df
    elif ss_type == SynStateRandom:
        pass  # the previous state is used as it is

    state = ss_type(inst_idx, enriched_train_df, col_names, ss_prev_state)
    return state.get_state_score()
def main():
    """Plot learning curves of uncertainty sampling versus random sampling.

    Uses the binary classification dataset ``diabetes.txt`` located next
    to this file. Both strategies start from the same partially labeled
    pool and query until every training sample is labeled.
    """
    # Specify the parameters here:
    here = os.path.dirname(os.path.realpath(__file__))
    dataset_filepath = os.path.join(here, 'diabetes.txt')
    # Fraction of the dataset randomly selected and assigned to the test set.
    test_size = 0.33
    n_labeled = 10  # number of samples that are initially labeled

    # Load dataset
    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
        split_train_test(dataset_filepath, test_size, n_labeled)
    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = len(y_train) - n_labeled  # number of samples to query

    # The model passed to run() is the base learner, rebuilt for each run.
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    E_in_1, E_out_1 = run(
        trn_ds, tst_ds, lbr, LogisticRegression(), qs, quota)

    qs2 = RandomSampling(trn_ds2)
    E_in_2, E_out_2 = run(
        trn_ds2, tst_ds, lbr, LogisticRegression(), qs2, quota)

    # x-axis: number of queries; y-axis: corresponding error rate.
    query_num = np.arange(1, quota + 1)
    curves = (
        (E_in_1, 'b', 'qs Ein'),
        (E_in_2, 'r', 'random Ein'),
        (E_out_1, 'g', 'qs Eout'),
        (E_out_2, 'k', 'random Eout'),
    )
    for errors, color, label in curves:
        plt.plot(query_num, errors, color, label=label)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
def test_ALBLTestCase(self):
    """ActiveLearningByLearning with a fixed seed yields the pinned order."""
    n_labeled = 10
    # First labels stay visible, the remainder is marked unlabeled (None).
    hidden = [None] * (len(self.y) - n_labeled)
    trn_ds = Dataset(self.X, np.concatenate([self.y[:n_labeled], hidden]))

    sub_strategies = [
        UncertaintySampling(
            trn_ds,
            model=SVM(kernel="linear", decision_function_shape="ovr")),
        QUIRE(trn_ds),
        RandomSampling(trn_ds),
    ]
    albl = ActiveLearningByLearning(
        trn_ds,
        T=self.quota,
        query_strategies=sub_strategies,
        model=SVM(kernel="linear", decision_function_shape="ovr"),
        random_state=1126)

    observed = run_qs(trn_ds, albl, self.y, self.quota)
    expected = np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33])
    assert_array_equal(observed, expected)
def initialQuerySetup(train_dataset, queryStrategyID, queryParams=None,
                      fixRandomState=False):
    """Build the query strategy selected by ``queryStrategyID``.

    Strategy IDs:
        0 -- RandomSampling
        1 -- UncertaintySampling ('sm'), model = queryParams[0]
        2 -- QueryByCommittee ('vote'), models = queryParams[0]
        3 -- RandomBatchQuery, batch_size = queryParams[0]
        4 -- LeastCertainBatchQuery, model = queryParams[0],
             batch_size = queryParams[1]
        5 -- SemiSupervisedBatchQuery, model = queryParams[0],
             batch_size = queryParams[1]

    Args:
        train_dataset: dataset handed to the strategy constructor.
        queryStrategyID: integer strategy selector (see above).
        queryParams: indexable of strategy parameters; required for every
            ID except 0.
        fixRandomState: when true, the randomized strategies receive a
            fixed per-strategy seed; otherwise ``random_state=None``.

    Returns:
        The constructed query strategy.

    Raises:
        ValueError: if ``queryStrategyID`` is not one of the IDs above
            (previously this surfaced as an UnboundLocalError).
    """
    def seed(value):
        # Fixed seed only when reproducibility was requested.
        return value if fixRandomState else None

    if queryStrategyID == 0:
        return RandomSampling(train_dataset, random_state=seed(137))
    if queryStrategyID == 1:
        return UncertaintySampling(train_dataset, method='sm',
                                   model=queryParams[0])
    if queryStrategyID == 2:
        return QueryByCommittee(train_dataset,
                                models=queryParams[0],
                                disagreement='vote',
                                random_state=seed(23))
    if queryStrategyID == 3:
        return RandomBatchQuery(train_dataset,
                                batch_size=queryParams[0],
                                random_state=seed(2311))
    if queryStrategyID == 4:
        return LeastCertainBatchQuery(train_dataset,
                                      model=queryParams[0],
                                      batch_size=queryParams[1],
                                      random_state=seed(2317))
    if queryStrategyID == 5:
        return SemiSupervisedBatchQuery(train_dataset,
                                        model=queryParams[0],
                                        batch_size=queryParams[1],
                                        random_state=seed(3112))

    raise ValueError(
        "Unknown queryStrategyID: %r (expected an integer from 0 to 5)"
        % (queryStrategyID,))
def test_quire(self):
    """Seeded RandomSampling must query the unlabeled part in pinned order.

    NOTE(review): despite its name this test exercises RandomSampling, not
    QUIRE -- confirm whether that is intended.
    """
    # First ten labels visible, followed by ten unlabeled (None) entries.
    partial_labels = np.concatenate([self.y[:10], [None] * 10])
    trn_ds = Dataset(self.X, partial_labels)

    sampler = RandomSampling(trn_ds, random_state=2019)
    observed = run_qs(trn_ds, sampler, self.y, self.quota)

    expected = np.array([18, 12, 19, 16, 10, 11, 14, 13, 15, 17])
    assert_array_equal(observed, expected)
def main():
    """Compare five query strategies on the ``australian`` dataset.

    The experiment (uncertainty sampling, random, QUIRE, HintSVM and
    ActiveLearningByLearning) is repeated 20 times; the error curves
    returned by ``run`` are averaged per strategy and plotted against the
    number of queries.
    """
    # Specify the parameters here:
    ds_name = 'australian'
    here = os.path.dirname(os.path.realpath(__file__))
    dataset_filepath = os.path.join(here, '%s.txt' % ds_name)
    # Fraction of the dataset randomly selected and assigned to the test set.
    test_size = 0.33
    n_labeled = 10  # number of samples that are initially labeled
    n_strategies = 5

    def linear_svm():
        return SVM(kernel='linear', decision_function_shape='ovr')

    # Flat list: strategy k of repetition r lives at index r * 5 + k.
    results = []

    for T in range(20):  # repeat the experiment 20 times
        print("%dth experiment" % (T + 1))

        trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
            split_train_test(dataset_filepath, test_size, n_labeled)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        trn_ds4 = copy.deepcopy(trn_ds)
        trn_ds5 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)

        quota = len(y_train) - n_labeled  # number of samples to query

        # Each strategy is built right before its run, followed by a fresh
        # base learner for that run.
        experiments = (
            (trn_ds, lambda: UncertaintySampling(
                trn_ds, model=SVM(decision_function_shape='ovr'))),
            (trn_ds2, lambda: RandomSampling(trn_ds2)),
            (trn_ds3, lambda: QUIRE(trn_ds3)),
            (trn_ds4, lambda: HintSVM(trn_ds4, cl=1.0, ch=1.0)),
            (trn_ds5, lambda: ActiveLearningByLearning(
                trn_ds5,
                query_strategies=[
                    UncertaintySampling(trn_ds5, model=linear_svm()),
                    QUIRE(trn_ds5),
                    HintSVM(trn_ds5, cl=1.0, ch=1.0),
                ],
                T=quota,
                uniform_sampler=True,
                model=linear_svm())),
        )
        for pool, build_strategy in experiments:
            strategy = build_strategy()
            model = linear_svm()
            _, errors = run(pool, tst_ds, lbr, model, strategy, quota)
            results.append(errors.tolist())

    # Every 5th entry belongs to the same strategy; average over the runs.
    result = [np.mean(results[k::n_strategies], axis=0)
              for k in range(n_strategies)]

    # x-axis: number of queries; y-axis: corresponding error rate.
    query_num = np.arange(1, quota + 1)
    styles = (
        ('g', 'uncertainty sampling'),
        ('k', 'random'),
        ('r', 'QUIRE'),
        ('b', 'HintSVM'),
        ('c', 'ALBL'),
    )
    for errors, (color, label) in zip(result, styles):
        plt.plot(query_num, errors, color, label=label)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
def main():
    """Compare six query strategies on the tweet corpus and log every query.

    For each of ``quota`` rounds every strategy picks one sample, the
    label is obtained from ``simulate_human_decision``, the shared SVM is
    retrained on that strategy's pool and the test error
    (``1 - model.score``) is appended to its curve. Progress is printed,
    mirrored into ``task_<timestamp>.txt`` and drawn live; the final plot
    is saved as ``task_<timestamp>.png``.

    Side effects: sets the module globals ``pos_filepath``,
    ``dataset_filepath``, ``csv_filepath``, ``vectors_list`` and
    ``ids_list``.
    """
    global pos_filepath, dataset_filepath, csv_filepath, vectors_list, ids_list
    dataset_filepath = "/Users/dndesign/Desktop/active_learning/vecteurs_et_infos/vectors_2015.txt"
    csv_filepath = "/Users/dndesign/Desktop/active_learning/donnees/corpus_2015_id-time-text.csv"
    pos_filepath = "/Users/dndesign/Desktop/active_learning/donnees/oriane_pos_id-time-text.csv"
    vectors_list, ids_list = get_vectors_list(dataset_filepath)

    timestr = time.strftime("%Y%m%d_%H%M%S")
    banner = "#################################################"

    # Number of queries to ask human to label
    quota = 10

    def committee():
        # Fresh committee members for each QueryByCommittee instance.
        return [LogisticRegression(C=1.0),
                LogisticRegression(C=0.01),
                LogisticRegression(C=100)]

    # (title used on console AND in the log, legend entry, line color,
    #  factory building the strategy on its own pool), in plotting order.
    strategy_specs = [
        # Least confident: queries the instance the model is least certain
        # how to label.
        ("Uncertainty Sampling (Least confident)", 'Least Confident', 'red',
         lambda pool: UncertaintySampling(
             pool, method='lc', model=LogisticRegression(C=.01))),
        ("Uncertainty Sampling (Max Margin)", 'Max Margin', 'blue',
         lambda pool: USampling(
             pool, method='mm', model=SVM(kernel='linear'))),
        # Combination of distance-based (DIST) and diversity-based (DIV)
        # active learning.
        ("CMB Distance-Diversity Sampling", 'Distance Diversity CMB', 'green',
         lambda pool: CMBSampling(pool, model=SVM(kernel='linear'))),
        ("Random Sampling", 'Random Sampling', 'orange',
         lambda pool: RandomSampling(pool, random_state=1126)),
        # Committee of classifiers; queries where the members disagree.
        ("QueryByCommittee (Vote Entropy)", 'Vote Entropy', 'black',
         lambda pool: QueryByCommittee(
             pool, disagreement='vote', models=committee(),
             random_state=1126)),
        ("QueryByCommittee (KL Divergence)", 'KL Divergence', 'purple',
         lambda pool: QueryByCommittee(
             pool, disagreement='kl_divergence', models=committee(),
             random_state=1126)),
    ]

    # The context manager closes the log even if a strategy raises midway.
    with codecs.open("task_" + str(timestr) + ".txt", "w", "utf-8") as text_file:
        print("Loading data...")
        text_file.write("Loading data...\n")

        t0 = time.time()
        vectors_file = openfile_txt(dataset_filepath)
        try:
            num_lines = sum(1 for _ in vectors_file)
        finally:
            # openfile_txt is presumably a thin wrapper around open();
            # close the handle when it offers close().
            close_handle = getattr(vectors_file, "close", None)
            if close_handle is not None:
                close_handle()

        print("Treating " + str(num_lines) + " entries...")
        text_file.write("Treating : %s entries...\n" % str(num_lines))

        trn_ds, tst_ds = split_train_test(csv_filepath)
        model = SVM(kernel='linear')

        # Build strategies in order: the first one works on trn_ds itself,
        # every later one on a deep copy taken at that moment.
        strategies = []
        for position, (title, legend_name, color, build) in enumerate(strategy_specs):
            pool = trn_ds if position == 0 else copy.deepcopy(trn_ds)
            query_strategy = build(pool)
            model.train(pool)
            strategies.append({
                'title': title,
                'legend': legend_name,
                'color': color,
                'pool': pool,
                'qs': query_strategy,
                # Error before any query.
                'errors': np.append([], 1 - model.score(tst_ds)),
                'line': None,
            })

        with sns.axes_style("darkgrid"):
            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)
            query_num = np.arange(0, 1)
            for entry in strategies:
                entry['line'], = ax.plot(query_num, entry['errors'],
                                         entry['color'])
            plt.legend(tuple(entry['legend'] for entry in strategies), loc=1)
            # The curves hold 1 - score, i.e. error, not accuracy.
            plt.ylabel('Error')
            plt.xlabel('Number of Queries')
            plt.title('Active Learning - Query choice strategies')
            plt.ylim([0, 1])
            plt.show(block=False)

        for i in range(quota):
            print("\n" + banner)
            print("Query number " + str(i) + " : ")
            print(banner + "\n")
            text_file.write("\n" + banner + "\n")
            text_file.write("Query number %s : " % str(i))
            text_file.write("\n" + banner + "\n")

            for entry in strategies:
                ask_id = entry['qs'].make_query()
                print("\033[4mUsing " + entry['title'] + " :\033[0m")
                tweet = define_tweet_by_id(ask_id)
                # Ask once so the printed, logged and applied label agree.
                answer = simulate_human_decision(ask_id)
                print("Tweet :" + tweet, end='', flush=True)
                print("Simulating human response : " + str(answer) + " \n")
                text_file.write("Using " + entry['title'] + " :\n")
                text_file.write("Tweet : %s \n" % str(tweet))
                text_file.write("Simulating human response : %s \n\n" % str(answer))

                entry['pool'].update(ask_id, answer)
                model.train(entry['pool'])
                entry['errors'] = np.append(entry['errors'],
                                            1 - model.score(tst_ds))

            # Refresh the live plot with the new points.
            highest = max(max(entry['errors']) for entry in strategies)
            ax.set_xlim((0, i + 1))
            ax.set_ylim((0, highest + 0.2))
            query_num = np.arange(0, i + 2)
            for entry in strategies:
                entry['line'].set_xdata(query_num)
                entry['line'].set_ydata(entry['errors'])
            plt.draw()

        time_total = time.time() - t0
        print("\n\n\n" + banner + "\n")
        print("Execution time : %fs \n\n" % time_total)
        text_file.write("\n\n\n" + banner + "\n")
        text_file.write("Execution time : %fs \n" % time_total)

    input("Press any key to save the plot...")
    plt.savefig('task_' + str(timestr) + '.png')

    print("Done")
def main():
    """Interactively compare uncertainty sampling with random sampling.

    The human labels ``quota`` samples for each strategy through
    ``InteractiveLabeler``; after every round the model is retrained on
    each pool and both test-error curves are redrawn.
    """
    quota = 10  # ask human to label 10 samples
    n_classes = 5

    trn_ds, tst_ds, ds = split_train_test(n_classes)
    trn_ds2 = copy.deepcopy(trn_ds)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    qs2 = RandomSampling(trn_ds2)

    model = LogisticRegression()

    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Error')

    def current_error(pool):
        # Retrain on the pool and return the error on the test set.
        model.train(pool)
        return 1 - model.score(tst_ds)

    # Starting point of both curves, before any query.
    E_out1 = np.append([], current_error(trn_ds))
    E_out2 = np.append([], current_error(trn_ds2))

    query_num = np.arange(0, 1)
    p1, = ax.plot(query_num, E_out1, 'g', label='qs Eout')
    p2, = ax.plot(query_num, E_out2, 'k', label='random Eout')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show(block=False)

    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    img_ax.set_position(
        [box.x0, box.y0 - box.height * 0.1, box.width, box.height * 0.9])

    # Give each label its name (labels are from 0 to n_classes-1)
    label_names = [str(lbl) for lbl in range(n_classes)]
    lbr = InteractiveLabeler(label_name=label_names)

    def query_round(strategy, pool, announcement, errors):
        # One query: pick a sample, ask the human, update, retrain, score.
        ask_id = strategy.make_query()
        print(announcement)
        # reshape the image to its width and height
        lb = lbr.label(pool.data[ask_id][0].reshape(8, 8))
        pool.update(ask_id, lb)
        return np.append(errors, current_error(pool))

    for i in range(quota):
        E_out1 = query_round(
            qs, trn_ds, "asking sample from Uncertainty Sampling", E_out1)
        E_out2 = query_round(
            qs2, trn_ds2, "asking sample from Random Sample", E_out2)

        ax.set_xlim((0, i + 1))
        ax.set_ylim((0, max(max(E_out1), max(E_out2)) + 0.2))
        query_num = np.arange(0, i + 2)
        for line, errors in ((p1, E_out1), (p2, E_out2)):
            line.set_xdata(query_num)
            line.set_ydata(errors)
        plt.draw()

    input("Press any key to continue...")
L_test = get_label(D_test, landmark, threshold) testset = Dataset(D_test, L_test) sigma = np.mean(pairwise_distances(D0)) qs = EpsilonMarginSampling( dataset, # Dataset object model=GPC(RBF(1), optimizer=None), margin=margin) qs1 = UncertSampling( dataset, # Dataset object model=GPC(RBF(1), optimizer=None), method='sm') qs2 = RandomSampling(dataset) center0 = np.mean(D[L == 1], axis=0) center = center0 bounds_old = np.vstack((np.min(D0, axis=0), np.max(D0, axis=0))) i = 0 clf = GPC(RBF(1), optimizer=None) while i < n_iter + 1: print 'Iteration: %d/%d' % (i, n_iter) # Generate a pool and expand dataset pool, bounds_new = expand_pool(D, bounds_old, expansion_rate) for entry in pool: dataset.append(entry)
def getQueryStrategy(query_strategy, train_ds, disagreement,
                     estimator_name=None):
    """Create the query strategy named by ``query_strategy``.

    Args:
        query_strategy: one of ``'uncertainty'``, ``'random'``,
            ``'homogeneous_committee'`` or a committee name
            (``'lr_lsvc_rf_dt'``, ``'lr_svc_rf_dt'``, ``'lr_svc_dt_xgb'``,
            ``'lr_svc_dt_xgb_rf'``, ``'lr_lsvc_dt_gpc'``,
            ``'lr_lsvc_dt_xgb'``).
        train_ds: dataset handed to the strategy constructor.
        disagreement: disagreement measure passed to ``QueryByCommittee``
            for the named committees.
        estimator_name: passed to ``CommitteeModels`` for
            ``'homogeneous_committee'``; unused otherwise.

    Returns:
        The query strategy, or ``None`` (after printing a message) when
        ``query_strategy`` is not recognised.

    Raises:
        ValueError: when a committee containing ``LinearSVC_`` is combined
            with ``disagreement == 'kl_divergence'``.
    """
    print('Initialize Query Strategy')

    # Built from adjacent literals so that no backslash or source
    # indentation leaks into the message.
    lsvc_kl_message = (
        'when using kl_divergence lsvc cannot be in the committee as '
        'linearSVC does not provide predict_proba(). '
        'Use svc instead or change disagreement to vote!')

    def logistic():
        return la.LogisticRegression_(solver='liblinear', max_iter=1000)

    def svc():
        # SVC with probability=True is used in place of LinearSVC for the
        # probabilistic committees.
        return la.SVC_(kernel='linear', probability=True)

    def xgb():
        return la.XGBClassifier_(objective="binary:logistic")

    # no committee but baseline query strategy
    if query_strategy == 'uncertainty':
        return UncertaintySampling(train_ds, method='lc',
                                   model=la.LogisticRegression_())
    # no committee but baseline query strategy
    if query_strategy == 'random':
        return RandomSampling(train_ds)
    if query_strategy == 'homogeneous_committee':
        committee = CommitteeModels(estimator_name)
        # NOTE(review): `disagreement` is not forwarded for this committee,
        # so QueryByCommittee falls back to its own default -- confirm
        # that this is intended.
        return QueryByCommittee(train_ds, models=committee.committee['models'])

    # name -> (contains LinearSVC_, factory producing the committee members)
    committees = {
        'lr_lsvc_rf_dt': (True, lambda: [
            la.RandomForest_(), la.DecisionTree_(), logistic(),
            la.LinearSVC_()]),
        'lr_svc_rf_dt': (False, lambda: [
            la.RandomForest_(), la.DecisionTree_(), logistic(), svc()]),
        'lr_svc_dt_xgb': (False, lambda: [
            logistic(), svc(), la.DecisionTree_(), xgb()]),
        # committee of five
        'lr_svc_dt_xgb_rf': (False, lambda: [
            logistic(), svc(), la.DecisionTree_(), xgb(),
            la.RandomForest_()]),
        'lr_lsvc_dt_gpc': (True, lambda: [
            logistic(), la.LinearSVC_(), la.DecisionTree_(),
            la.GaussianProcess_()]),
        'lr_lsvc_dt_xgb': (True, lambda: [
            logistic(), la.LinearSVC_(), la.DecisionTree_(), xgb()]),
    }

    entry = committees.get(query_strategy)
    if entry is None:
        print("Query strategy not defined!")
        return None

    has_linear_svc, build_models = entry
    # Reject the invalid combination before any model is instantiated.
    if has_linear_svc and disagreement == 'kl_divergence':
        raise ValueError(lsvc_kl_message)

    return QueryByCommittee(train_ds, models=build_models(),
                            disagreement=disagreement)
def run_featureselection(trn_dss, tst_ds, y_train, model, method_, qs, X_test,
                         y_test, all_cols, save_name, save, type_, part=20):
    """Batch active learning algorithm with feature selection.

    Until every entry of ``trn_dss`` is labeled, each iteration:
    selects features on the currently labeled entries, rebuilds a reduced
    dataset, creates the query strategy chosen by ``type_`` on it, queries
    up to ``part`` samples (labels come from ``y_train``), retrains and
    records F1 score and confusion-matrix counts on the test data.

    Args:
        trn_dss: training dataset; it is updated with the queried labels.
        tst_ds: unused here; kept for interface compatibility.
        y_train: labels indexed by the ids returned by ``make_query``.
        model: model for the strategy (for ``'qbc'``: the committee models).
        method_: strategy method (for ``'qbc'``: the model that is trained
            and evaluated).
        qs: ignored; the strategy is rebuilt every iteration.
        X_test, y_test: test samples and labels.
        all_cols: column names passed to ``feature_selection``.
        save_name: CSV path used when ``save`` is true.
        save: write the collected results to ``save_name`` when ``True``.
        type_: ``'random'``, ``'unc'``, ``'qbc'`` or ``'dens'``.
        part: batch size, i.e. queries per iteration.

    Returns:
        Tuple ``(q, iterations, f1score, tn, fp, fn, tp, k, trn_dss.data,
        label_holder, asked_id, features_ls)``.

    Raises:
        ValueError: if ``type_`` is not one of the supported strategy types
            (previously this surfaced late as an UnboundLocalError).
    """
    known_types = ('random', 'unc', 'qbc', 'dens')
    if type_ not in known_types:
        raise ValueError(
            "Unknown strategy type %r; expected one of %s"
            % (type_, ', '.join(known_types)))

    f1score = []
    features_ls = []
    label_holder, asked_id = [], []
    tn, fp, fn, tp = [], [], [], []

    k = trn_dss.len_labeled()
    k_beg = k
    quota = len(trn_dss.data)
    iter_ = 0

    while k < quota:
        clear_output(wait=True)

        lbls, asks = [], []
        # The last batch may be smaller than `part`.
        batch_size = min(part, trn_dss.len_unlabeled())

        # -------------------> Feature Selection
        # Select features using only the entries labeled so far.
        labeled_entries = list(trn_dss.get_labeled_entries())
        X_train_feature = [entry[0] for entry in labeled_entries]
        y_train_feature = [entry[1] for entry in labeled_entries]
        col_index, features_f = feature_selection(
            X_train_feature, y_train_feature, all_cols, f_class=True)
        features_ls.append(features_f)

        # Rebuild the training set restricted to the selected columns.
        X_train_updated = [entry[0][col_index] for entry in trn_dss.data]
        y_train_updated = [entry[1] for entry in trn_dss.data]
        trn_dss_updated = Dataset(X_train_updated, y_train_updated)

        # Restrict the test samples to the same columns.
        X_test_feature = [row[col_index] for row in X_test]

        if type_ == 'random':
            qs = RandomSampling(trn_dss_updated, method=method_, model=model)
            model1 = model
        elif type_ == 'unc':
            qs = UncertaintySampling(trn_dss_updated, method=method_,
                                     model=model)
            model1 = model
        elif type_ == 'qbc':
            # For a committee, `model` holds the members and `method_` the
            # model that is actually trained and evaluated.
            qs = QueryByCommittee(trn_dss_updated, models=model)
            model1 = method_
        else:  # 'dens'
            qs = DWUS(trn_dss_updated, model=model)
            model1 = model

        for _ in range(batch_size):
            # make_query returns the index of the sample the strategy would
            # like to have labeled next.
            ask_id = qs.make_query()
            asks.append(ask_id)
            lb = y_train[ask_id]
            lbls.append(lb)
            # Keep the full and the reduced dataset in sync.
            trn_dss.update(ask_id, lb)
            trn_dss_updated.update(ask_id, lb)

        label_holder.append(lbls)
        asked_id.append(asks)

        # Train only on the labeled examples and the chosen features.
        model1.train(trn_dss_updated)
        pred_y = model1.predict(X_test_feature)

        # Compute each metric once per batch and reuse it.
        current_f1 = f1_score(y_test, pred_y)
        matrix = confusion_matrix(y_test, pred_y)
        f1score.append(current_f1)
        tn.append(matrix[0][0])
        fp.append(matrix[0][1])
        fn.append(matrix[1][0])
        tp.append(matrix[1][1])

        k = trn_dss_updated.len_labeled()

        print(k)
        print(quota)
        print('iteration:', iter_)
        print(len(f1score))
        print('train dataset labeled:', trn_dss.len_labeled())
        print('train dataset shape:', trn_dss.format_sklearn()[0].shape)
        print('train dataset sum:', trn_dss.format_sklearn()[1].sum())
        print('Current f1 score:', current_f1)
        print('Current progress:', np.round(k / quota * 100, 2), '%')
        print('Chosen_features:', features_f)

        iter_ = iter_ + 1

    # Number of labeled samples at the start of each batch, and batch index.
    q = list(range(k_beg, quota, part))
    iterations = list(range(len(f1score)))

    if save == True:  # noqa: E712 -- keep the original comparison semantics
        saved_file = pd.DataFrame({
            'iter': iterations,
            'quota': q,
            'f1_score': f1score,
            'tn': tn,
            'fp': fp,
            'fn': fn,
            'tp': tp,
            'id_index': asked_id,
            'label': label_holder,
            'features': features_ls
        })
        saved_file.to_csv(save_name)

    return (q, iterations, f1score, tn, fp, fn, tp, k, trn_dss.data,
            label_holder, asked_id, features_ls)
def main():
    """Compare six multilabel query strategies by averaged loss curves.

    The experiment is repeated 10 times on fresh splits. Every strategy
    queries ``quota`` samples on its own copy of the training pool; the
    curves returned by ``run`` are averaged per strategy, printed (every
    fifth value) and plotted.
    """
    # Fraction of the dataset randomly selected and assigned to the test set.
    test_size = 0.25
    quota = 150  # number of samples to query
    keys = ('E1', 'E2', 'E3', 'E4', 'E5', 'E6')
    result = {key: [] for key in keys}

    def auxiliary_learner(pool, criterion):
        return MultilabelWithAuxiliaryLearner(
            pool,
            BinaryRelevance(LogisticRegression()),
            BinaryRelevance(SVM()),
            criterion=criterion)

    for _ in range(10):  # repeat experiment
        trn_ds, tst_ds, fully_labeled_trn_ds = split_train_test(test_size)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        trn_ds4 = copy.deepcopy(trn_ds)
        trn_ds5 = copy.deepcopy(trn_ds)
        trn_ds6 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = BinaryRelevance(LogisticRegression())

        # Each strategy is created immediately before its own run.
        experiments = (
            ('E1', trn_ds,
             lambda: MMC(trn_ds, br_base=LogisticRegression())),
            ('E2', trn_ds2, lambda: RandomSampling(trn_ds2)),
            ('E3', trn_ds3, lambda: auxiliary_learner(trn_ds3, 'hlr')),
            ('E4', trn_ds4, lambda: auxiliary_learner(trn_ds4, 'shlr')),
            ('E5', trn_ds5, lambda: auxiliary_learner(trn_ds5, 'mmr')),
            ('E6', trn_ds6,
             lambda: BinaryMinimization(trn_ds6, LogisticRegression())),
        )
        for key, pool, build_strategy in experiments:
            strategy = build_strategy()
            _, losses = run(pool, tst_ds, lbr, model, strategy, quota)
            result[key].append(losses)

    averaged = [np.mean(result[key], axis=0) for key in keys]

    console_labels = (
        "MMC: ",
        "Random: ",
        "MultilabelWithAuxiliaryLearner_hlr: ",
        "MultilabelWithAuxiliaryLearner_shlr: ",
        "MultilabelWithAuxiliaryLearner_mmr: ",
        "BinaryMinimization: ",
    )
    for label, curve in zip(console_labels, averaged):
        print(label, curve[::5].tolist())

    query_num = np.arange(1, quota + 1)
    fig = plt.figure(figsize=(9, 6))
    ax = plt.subplot(111)
    plot_styles = (
        ('g', 'MMC'),
        ('k', 'Random'),
        ('r', 'AuxiliaryLearner_hlr'),
        ('b', 'AuxiliaryLearner_shlr'),
        ('c', 'AuxiliaryLearner_mmr'),
        ('m', 'BinaryMinimization'),
    )
    for curve, (color, label) in zip(averaged, plot_styles):
        ax.plot(query_num, curve, color, label=label)

    # Narrow the axes so the legend fits to the right of the plot.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.75, box.height])
    plt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)
    plt.xlabel('Number of Queries')
    plt.ylabel('Loss')
    plt.title('Experiment Result (Hamming Loss)')
    plt.show()