def libact_first_try_second_run(self, enriched_train_df, extractor, ideal_df,
                                lbr, quota, validation_data_df, return_dict):
    """Label up to ``quota`` entries by uncertainty sampling, scoring after each.

    A ``TextDataset`` is built from ``enriched_train_df`` and queried with a
    least-confidence ``UncertaintySampling`` strategy.  Every label returned by
    ``lbr`` is asserted equal to ``ideal_df[cn.tag_col][ask_id]`` before it is
    written into the dataset.  The ``f1`` attribute of ``run_classifier`` on the
    labeled rows is recorded once up front and once after every update; the
    resulting array is stored in ``return_dict[2]``.
    """
    dataset = TextDataset(enriched_train_df, cn.col_names, extractor)
    sampler = UncertaintySampling(
        dataset, method='lc', model=LogisticRegression())

    def current_f1():
        # Score a classifier trained on whatever is labeled right now.
        labeled_df = dataset.extract_labeled_dataframe()
        return run_classifier(labeled_df, validation_data_df).f1

    scores = np.append([], current_f1())
    for _ in range(quota):
        remaining = dataset.get_unlabeled_entries()
        if len(remaining) == 0:
            # Finished labeling all examples.
            break
        ask_id = sampler.make_query()
        label = lbr.label(dataset.extract_sentence(ask_id))
        self.assertEqual(label, ideal_df[cn.tag_col][ask_id])
        dataset.update(ask_id, label)
        scores = np.append(scores, current_f1())

    return_dict[2] = scores
def make_query(self):
    """Return ``self.batch_size_`` entry ids chosen by repeated uncertainty sampling.

    Works on deep copies of ``self.dataset`` and ``self.model`` so neither is
    modified.  After each pick, a label (0 or 1) is drawn for the picked entry
    from the copied model's predicted probability and written into the copied
    dataset, so the next pick sees that entry as labeled.
    """
    scratch_ds = copy.deepcopy(self.dataset)
    scratch_model = copy.deepcopy(self.model)
    # Per the original author's note, the model is fit when the strategy is
    # constructed and again on every make_query call, so no explicit train
    # call is issued here.
    sampler = UncertaintySampling(scratch_ds, model=scratch_model)

    picked_ids = []
    for _ in range(self.batch_size_):
        entry_id = sampler.make_query()
        picked_ids.append(entry_id)

        feature_row = scratch_ds.get_entries()[entry_id][0]
        proba = scratch_model.predict_proba(feature_row.reshape(1, -1))
        # Hard-coded two-label draw: label 0 with probability proba[0][0],
        # otherwise label 1.
        draw = self.random_state_.rand()
        if draw < proba[0][0]:
            sampled_label = 0
        else:
            sampled_label = 1
        scratch_ds.update(entry_id, sampled_label)

    return picked_ids
def getUncertaintyIndex(self, trn_ds, method, clf):
    """Return unlabeled indices ordered from highest to lowest uncertainty score.

    Builds an ``UncertaintySampling`` strategy over ``trn_ds`` with the given
    ``method`` and model ``clf`` and asks it for its scores.  Each score item
    is treated as a pair whose first element indexes ``self.unlabeled_index_``
    and whose second element is the sort key.

    Returns:
        list: entries of ``self.unlabeled_index_`` in descending score order.
    """
    # Parenthesised single-argument print runs on both Python 2 and Python 3.
    print("[Trainer-Selection] Get uncertainty sampling index.")
    qs = UncertaintySampling(trn_ds, method=method, model=clf)
    _, score = qs.make_query(return_score=True)
    ranked = sorted(score, key=lambda pair: pair[1], reverse=True)
    return [self.unlabeled_index_[pair[0]] for pair in ranked]
def strategies_to_try(tp):
    """Return a factory ``(dataset, libact_model) -> query strategy`` for ``tp``.

    Supported names are ``'uncertainty'``, ``'random'`` and
    ``'positivelesscertain'``.  Nothing is constructed until the returned
    callable is invoked.

    Raises:
        ValueError: if ``tp`` is not one of the supported names.
    """
    if tp == 'uncertainty':
        return lambda trn_ds, libact_model: UncertaintySampling(
            trn_ds, model=libact_model, method='lc')

    if tp == 'random':
        return lambda trn_ds, libact_model: RandomSamplingWithRetraining(
            trn_ds, model=libact_model, method='lc')

    if tp == 'positivelesscertain':
        # Same sampler as 'uncertainty', with the model wrapped in
        # PositiveLessCertain.
        return lambda tr_ds, libact_model: UncertaintySampling(
            tr_ds, model=PositiveLessCertain(libact_model), method='lc')

    raise ValueError('Wrong strategy')
def __init__(self, X, y, labs, n=2):
    """Wrap ``X``/``y`` in a ``Dataset`` and attach a least-confidence sampler.

    Any value in ``y`` that is not ``>= 0`` is stored as ``None`` before the
    dataset is built.  ``labs`` and ``n`` are stored unchanged on the instance.
    """
    cleaned = []
    for value in y:
        cleaned.append(value if value >= 0 else None)

    self.dataset = Dataset(X, cleaned)
    self.labs = labs
    self.uc = UncertaintySampling(
        self.dataset, method='lc', model=LinearSVC())
    self.n = n
def main():
    """Interactively label digits with uncertainty sampling and random sampling.

    Both strategies start from identical copies of the training set.  The test
    error (``1 - model.score``) is recorded before any query and after each of
    the ``quota`` queries per strategy; the initial errors are drawn on the
    upper subplot.
    """
    quota = 10  # number of samples the human is asked to label per strategy
    n_classes = 5

    trn_ds, tst_ds, _ = split_train_test(n_classes)
    trn_ds2 = copy.deepcopy(trn_ds)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    qs2 = RandomSampling(trn_ds2)
    model = LogisticRegression()

    def test_error(dataset):
        # Retrain the shared model on `dataset` and return its test error.
        model.train(dataset)
        return 1 - model.score(tst_ds)

    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Error')

    errors_qs = np.append([], test_error(trn_ds))
    errors_random = np.append([], test_error(trn_ds2))

    query_num = np.arange(0, 1)
    p1, = ax.plot(query_num, errors_qs, 'g', label='qs Eout')
    p2, = ax.plot(query_num, errors_random, 'k', label='random Eout')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show(block=False)

    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    img_ax.set_position([box.x0, box.y0 - box.height * 0.1,
                         box.width, box.height * 0.9])

    # Give each label its name (labels are from 0 to n_classes-1).
    lbr = InteractiveLabeler(label_name=[str(lbl) for lbl in range(n_classes)])

    def ask_and_score(strategy, dataset, banner):
        # One query round: pick a sample, show it as an 8x8 image to the
        # labeler, store the answer, then retrain and score.
        ask_id = strategy.make_query()
        print(banner)
        label = lbr.label(dataset.data[ask_id][0].reshape(8, 8))
        dataset.update(ask_id, label)
        return test_error(dataset)

    for _ in range(quota):
        errors_qs = np.append(errors_qs, ask_and_score(
            qs, trn_ds, "asking sample from Uncertainty Sampling"))
        errors_random = np.append(errors_random, ask_and_score(
            qs2, trn_ds2, "asking sample from Random Sample"))
def test_uncertainty_entropy_exceptions(self):
    """Constructing UncertaintySampling must raise TypeError for these setups.

    Covers method ``'entropy'`` with ``SVM`` and with ``Perceptron``, and the
    unknown method ``'not_exist'`` with ``LogisticRegression``.
    """
    trn_ds = init_toyexample(self.X, self.y)
    rejected = (
        ('entropy', SVM),
        ('entropy', Perceptron),
        ('not_exist', LogisticRegression),
    )
    for method, model_cls in rejected:
        # The model is instantiated inside the context, as in each original
        # case, so a TypeError from either constructor satisfies the check.
        with self.assertRaises(TypeError):
            UncertaintySampling(trn_ds, method=method, model=model_cls())
def build_query_strategy(sent_df, col_names):
    # type: (DataFrame, ColumnNames) -> QueryStrategy
    """Build an ActiveLearningByLearning strategy over features of ``sent_df``.

    A feature extractor is built from ``sent_df`` and applied to it; the
    resulting features back a ``TextDataset``.  The returned strategy combines
    UncertaintySampling, QUIRE and HintSVM over that dataset with ``T=1000``
    and ``uniform_sampler=True``.
    """
    def make_svm():
        # A fresh instance per use, with the shared hyper-parameters.
        return SVM(C=100, gamma=3.1, kernel='rbf',
                   decision_function_shape='ovr')

    extractor = SynStateALHeuristic.build_feature_extractor(
        sent_df, col_names)
    features = extractor.transform(sent_df, col_names)
    dataset = TextDataset(sent_df, col_names, None, features=features)

    candidates = [
        UncertaintySampling(dataset, model=make_svm()),
        QUIRE(dataset),
        HintSVM(dataset, cl=1.0, ch=1.0),
    ]
    return ActiveLearningByLearning(
        dataset,
        query_strategies=candidates,
        T=1000,
        uniform_sampler=True,
        model=make_svm())
def train_for_user(user_id=None, device_type=None, n_class=None):
    """Run 20 repetitions of three query strategies for one user and plot them.

    Each repetition makes a fresh train/test split and runs uncertainty
    sampling (``'sm'``), random sampling and ALCE on identical copies of the
    training set.  The per-repetition results are saved with ``save_file``
    under ``results/``, and the mean over repetitions is printed (every fifth
    value) and plotted.
    """
    user_data = waterloo_iv_processing.get_per_user_data(
        user_id=user_id,
        device=device_type,
        video_name=['sports', 'document', 'nature', 'game', 'movie'])
    X, y = processing_training_data(n_class=n_class, train_data=user_data)

    test_size = 0.2  # passed to split_train_test as the test fraction
    quota = 350  # number of samples to query
    keys = ('E1', 'E2', 'E3')
    result = {'E1': [], 'E2': [], 'E3': []}

    for i in range(20):
        print('exp:', i)
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = split_train_test(
            X=X, y=y, test_size=test_size, n_class=n_class)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')

        # Each strategy is created immediately before its own run.
        arms = (
            ('E1', trn_ds, lambda ds: UncertaintySampling(
                ds, method='sm', model=SVM(decision_function_shape='ovr'))),
            ('E2', trn_ds2, lambda ds: RandomSampling(ds)),
            ('E3', trn_ds3, lambda ds: ALCE(ds, cost_matrix, SVR())),
        )
        for key, dataset, make_strategy in arms:
            strategy = make_strategy(dataset)
            _, e_out = run(dataset, tst_ds, lbr, model, strategy, quota,
                           cost_matrix)
            result[key].append(e_out)

    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    E_out_3 = np.mean(result['E3'], axis=0)

    for key in keys:
        out_path = ('results/' + device_type + '_user_' + str(user_id) +
                    '_' + key + '_class_' + str(n_class) + '.txt')
        save_file(out_path, result[key])

    print("Uncertainty: ", E_out_1[::5].tolist())
    print("Random: ", E_out_2[::5].tolist())
    print("ALCE: ", E_out_3[::5].tolist())

    query_num = np.arange(0, quota + 1)
    curves = (
        (E_out_1, 'g', 'Uncertainty sampling'),
        (E_out_2, 'k', 'Random'),
        (E_out_3, 'r', 'ALCE'),
    )
    handles = []
    for values, color, label in curves:
        line, = plt.plot(query_num, values, color, label=label)
        handles.append(line)

    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result (user ' + str(user_id) + ')')
    plt.legend(handles=handles, loc=3)
    plt.show()
def score_per_add_al(labeled_pool_df, base_training_df, validation_data_df):
    # type: (DataFrame, DataFrame, DataFrame) -> tuple
    """Score a classifier as pool examples are labeled by uncertainty sampling.

    The pool is copied and its tag column blanked, then appended to
    ``base_training_df`` to form the dataset that is queried.  An ideal labeler
    built from the same frames with the original tags answers the queries.
    ``run_active_learning`` is asked to label as many entries as the combined
    frame has rows.

    Returns:
        tuple: ``(ex_added_list, res_list)`` exactly as returned by
        ``run_active_learning``.
    """
    gen_pool_df = labeled_pool_df.copy(deep=True)
    # Clear all tags.  ``np.nan`` is used because the ``np.NaN`` alias was
    # removed in NumPy 2.0.
    gen_pool_df[cn.col_names.tag] = [np.nan] * len(gen_pool_df)
    enriched_train_df = pd.concat([base_training_df, gen_pool_df],
                                  ignore_index=True)

    # Build the feature extractor on the combined (partly untagged) frame.
    extractor = cn.Feature_Extractor(enriched_train_df, cn.col_names)

    trn_ds = TextDataset(enriched_train_df, cn.col_names, extractor)
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())

    ideal_df = pd.concat([base_training_df, labeled_pool_df],
                         ignore_index=True)
    lbr = IdealTextLabeler(TextDataset(ideal_df, cn.col_names, extractor))

    def scoring_fun(ds):
        return run_classifier(ds.extract_labeled_dataframe(),
                              validation_data_df)

    # Quota equals the number of rows, i.e. label the whole frame.
    ex_added_list, res_list = run_active_learning(
        trn_ds, scoring_fun, lbr, qs, len(enriched_train_df))
    return ex_added_list, res_list
def test_uncertainty_entropy(self):
    """Entropy-based uncertainty sampling queries entries 6, 7, 8, 9 in order."""
    dataset = init_toyexample(self.X, self.y)
    strategy = UncertaintySampling(
        dataset, method='entropy', model=LogisticRegression())
    queried = run_qs(
        dataset, self.lbr, LogisticRegression(), strategy, self.quota)
    expected = np.array([6, 7, 8, 9])
    assert_array_equal(queried, expected)
def test_uncertainty_sm(self):
    """Smallest-margin uncertainty sampling queries entries 6, 7, 8, 9 in order."""
    def make_model():
        # Same configuration for the strategy's model and the trained model.
        return LogisticRegression(solver='liblinear', multi_class="ovr")

    dataset = init_toyexample(self.X, self.y)
    strategy = UncertaintySampling(dataset, method='sm', model=make_model())
    queried = run_qs(dataset, self.lbr, make_model(), strategy, self.quota)
    expected = np.array([6, 7, 8, 9])
    assert_array_equal(queried, expected)
def test_UcertaintySamplingSm(self):
    """Smallest-margin sampling reproduces the recorded query sequence.

    Only the first ten labels are kept; the remaining entries start
    unlabeled.  The ``random`` module is seeded with 1126 first.
    """
    random.seed(1126)
    n_hidden = len(self.y) - 10
    partial_labels = np.concatenate([self.y[:10], [None] * n_hidden])
    dataset = Dataset(self.X, partial_labels)
    strategy = UncertaintySampling(
        dataset, method='sm', model=LogisticRegression())
    queried = run_qs(dataset, strategy, self.y, self.quota)
    expected = np.array([145, 66, 82, 37, 194, 60, 191, 211, 245, 131])
    assert_array_equal(queried, expected)
def libact_uncertainty(X, y, n_queries):
    """Run ``n_queries`` rounds of least-confidence sampling with an ideal labeler.

    Training starts with three labeled entries (index 0 as class 0, index 50
    as class 1, index 100 as class 2); every other entry starts unlabeled.
    The learner is retrained after each labeled query.  Nothing is returned.
    """
    seed_labels = np.array([None] * len(y))
    for position, label in ((0, 0), (50, 1), (100, 2)):
        seed_labels[position] = label

    train_ds = Dataset(X, seed_labels)
    full_ds = Dataset(X, y)

    learner = LogisticRegressionLibact(
        solver='liblinear', n_jobs=1, multi_class='ovr')
    strategy = UncertaintySampling(train_ds, model=learner, method='lc')
    oracle = IdealLabeler(full_ds)

    learner.train(train_ds)
    rounds_done = 0
    while rounds_done < n_queries:
        picked = strategy.make_query()
        train_ds.update(picked, oracle.label(X[picked]))
        learner.train(train_ds)
        rounds_done += 1
def build_query_strategy(sent_df, col_names):
    # type: (DataFrame, ColumnNames) -> QueryStrategy
    """Build a least-confidence UncertaintySampling strategy over ``sent_df``.

    A feature extractor is built from ``sent_df`` and applied to it; the
    resulting features back the ``TextDataset`` handed to the strategy.
    """
    extractor = SynStateALHeuristic.build_feature_extractor(
        sent_df, col_names)
    features = extractor.transform(sent_df, col_names)
    dataset = TextDataset(sent_df, col_names, None, features=features)
    return UncertaintySampling(
        dataset, method='lc', model=LogisticRegression())
def test_hs_subsampling(self):
    """HS with an UncertaintySampling sub-sampler reproduces the recorded order.

    The first ten entries are labeled and all others are queried; the first
    ten and last ten queried ids are compared with the recorded values.
    """
    n_hidden = len(self.y) - 10
    ds = Dataset(self.X, self.y[:10] + [None] * n_hidden)
    sub_qs = UncertaintySampling(
        ds, model=SVM(gamma='auto', decision_function_shape='ovr'))
    qs = HS(ds, self.classes, subsample_qs=sub_qs, random_state=1126)

    qseq = run_qs(ds, qs, self.y, n_hidden)

    head_and_tail = np.concatenate([qseq[:10], qseq[-10:]])
    expected = np.array([120, 50, 33, 28, 78, 133, 52, 124, 102, 109,
                         81, 108, 10, 89, 126, 114, 92, 48, 25, 13])
    assert_array_equal(head_and_tail, expected)
def libact_first_try_first_run(self, enriched_train_df, extractor, lbr, quota,
                               validation_data_df, return_dict):
    """Run ``run_active_learning`` with least-confidence sampling, keeping the scores.

    The scoring function is the ``f1`` attribute of ``run_classifier`` applied
    to the dataset's labeled rows and ``validation_data_df``.  The second value
    returned by ``run_active_learning`` is stored in ``return_dict[1]``.
    """
    dataset = TextDataset(enriched_train_df, cn.col_names, extractor)
    sampler = UncertaintySampling(
        dataset, method='lc', model=LogisticRegression())

    def f1_of(ds):
        labeled_df = ds.extract_labeled_dataframe()
        return run_classifier(labeled_df, validation_data_df).f1

    _, scores = run_active_learning(dataset, f1_of, lbr, sampler, quota)
    return_dict[1] = scores
class UncertaintySampler(object):
    """Thin wrapper pairing a ``Dataset`` with a least-confidence sampler.

    Labels that are not ``>= 0`` are stored as ``None`` (unlabeled) in the
    dataset and are reported back as ``-1`` by ``get_data``.
    """

    def __init__(self, X, y, labs, n=2):
        """Build the dataset from ``X``/``y`` and attach the query strategy.

        ``labs`` is stored unchanged and written out by ``save``; ``n`` is
        forwarded to ``make_query`` by ``get_next``.
        """
        y = [yy if yy >= 0 else None for yy in y]
        self.dataset = Dataset(X, y)
        self.labs = labs
        self.uc = UncertaintySampling(
            self.dataset, method='lc', model=LinearSVC())
        self.n = n

    def _labels(self):
        """Return the label of every dataset entry, in entry order."""
        return [label for _, label in self.dataset.get_entries()]

    def get_next(self):
        """Return the strategy's next query, logging start/done to stderr."""
        sys.stderr.write('get_next: start\n')
        out = self.uc.make_query(n=self.n)
        sys.stderr.write('get_next: done\n')
        return out

    def set_label(self, idx, label):
        """Store ``label`` for entry ``idx``; returns what ``Dataset.update`` returns."""
        sys.stderr.write('set_label: start\n')
        out = self.dataset.update(idx, label)
        sys.stderr.write('set_label: done\n')
        return out

    def get_data(self):
        """Return ``(X, y)`` arrays with unlabeled entries encoded as ``-1``."""
        X, y = zip(*self.dataset.get_entries())
        X = np.vstack(X)
        y = np.array([yy if yy is not None else -1 for yy in y])
        return X, y

    def n_hits(self):
        """Return how many entries currently carry the label ``1``."""
        # Building the list explicitly avoids subscripting zip(), which only
        # works on Python 2.
        labels = np.array(self._labels())
        return (labels == 1).sum()

    def n_labeled(self):
        """Return ``Dataset.len_labeled()`` for the wrapped dataset."""
        return self.dataset.len_labeled()

    def is_labeled(self, idx):
        """Return True when entry ``idx`` has a label, including label ``0``.

        An out-of-range or negative ``idx`` yields False.
        """
        # Compare against None instead of relying on truthiness: 0 is a valid
        # label but is falsy.
        labels = self._labels()
        return 0 <= idx < len(labels) and labels[idx] is not None

    def save(self, outpath):
        """Write ``X``, ``y`` and ``labs`` to a timestamped HDF5 file.

        The file name is ``<outpath>-uncertainty-<YYYYmmdd_HHMMSS>.h5``.

        TODO: this should be updated to save in the same format as simple_las.
        """
        X, y = self.get_data()
        filename = '%s-%s-%s.h5' % (
            outpath, 'uncertainty',
            datetime.now().strftime('%Y%m%d_%H%M%S'))
        # Mode 'a' (create if missing, read/write otherwise) is given
        # explicitly because h5py 3 changed the default mode to read-only.
        # The context manager closes the file even if a write fails.
        with h5py.File(filename, 'a') as f:
            f['X'] = X
            f['y'] = y
            f['labs'] = self.labs
def main():
    """Compare uncertainty sampling, random sampling and ALCE over two runs.

    Each run makes a fresh split, executes the three strategies on identical
    copies of the training set with a quota of 100 queries, and collects the
    second value returned by ``run``.  The mean over runs is plotted per
    strategy.
    """
    # Passed to split_train_test as the fraction of samples assigned to the
    # test set.
    test_size = 0.25
    result = {'E1': [], 'E2': [], 'E3': []}

    for _ in range(2):
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = \
            split_train_test(test_size)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')
        quota = 100  # number of samples to query

        # Each strategy is created immediately before its own run.
        arms = (
            ('E1', trn_ds, lambda ds: UncertaintySampling(
                ds, method='sm', model=SVM(decision_function_shape='ovr'))),
            ('E2', trn_ds2, lambda ds: RandomSampling(ds)),
            ('E3', trn_ds3, lambda ds: ALCE(ds, cost_matrix, SVR())),
        )
        for key, dataset, make_strategy in arms:
            strategy = make_strategy(dataset)
            _, e_out = run(dataset, tst_ds, lbr, model, strategy, quota,
                           cost_matrix)
            result[key].append(e_out)

    E_out_1 = np.mean(result['E1'], axis=0)
    E_out_2 = np.mean(result['E2'], axis=0)
    E_out_3 = np.mean(result['E3'], axis=0)

    query_num = np.arange(0, quota + 1)
    plt.figure(figsize=(10, 8))
    curves = (
        (E_out_1, 'g', 'Uncertainty sampling'),
        (E_out_2, 'k', 'Random'),
        (E_out_3, 'r', 'ALCE'),
    )
    for values, color, label in curves:
        plt.plot(query_num, values, color, label=label)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, ncol=5)
    plt.show()
def test_ActiveLearningByLearning(self):
    """ALBL over UncertaintySampling and HintSVM reproduces the recorded order.

    Only the first ten labels are kept; the remaining entries start unlabeled.
    """
    n_hidden = len(self.y) - 10
    partial_labels = np.concatenate([self.y[:10], [None] * n_hidden])
    trn_ds = Dataset(self.X, partial_labels)

    candidates = [
        UncertaintySampling(trn_ds, model=LogisticRegression()),
        HintSVM(trn_ds, random_state=1126),
    ]
    qs = ActiveLearningByLearning(
        trn_ds,
        T=self.quota,
        query_strategies=candidates,
        model=LogisticRegression(),
        random_state=1126)

    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33])
    assert_array_equal(qseq, expected)
def main():
    """Plot learning curves of uncertainty sampling versus random sampling.

    Loads ``diabetes.txt`` from this file's directory, starts with
    ``n_labeled`` labeled samples and queries every remaining training sample
    with each strategy.  Both values returned by ``run`` (named E_in and
    E_out here) are plotted against the query number.
    """
    # Path to the binary classification dataset.
    here = os.path.dirname(os.path.realpath(__file__))
    dataset_filepath = os.path.join(here, 'diabetes.txt')
    # Passed to split_train_test as the fraction of samples assigned to the
    # test set.
    test_size = 0.33
    n_labeled = 10  # number of samples that are initially labeled

    # Load dataset.
    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = split_train_test(
        dataset_filepath, test_size, n_labeled)
    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    quota = len(y_train) - n_labeled  # number of samples to query

    # Uncertainty sampling first, then random sampling, each with its own
    # freshly constructed base learner.
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    E_in_1, E_out_1 = run(
        trn_ds, tst_ds, lbr, LogisticRegression(), qs, quota)

    qs2 = RandomSampling(trn_ds2)
    E_in_2, E_out_2 = run(
        trn_ds2, tst_ds, lbr, LogisticRegression(), qs2, quota)

    # The x-axis is the number of queries, the y-axis the corresponding
    # error rate.
    query_num = np.arange(1, quota + 1)
    curves = (
        (E_in_1, 'b', 'qs Ein'),
        (E_in_2, 'r', 'random Ein'),
        (E_out_1, 'g', 'qs Eout'),
        (E_out_2, 'k', 'random Eout'),
    )
    for values, color, label in curves:
        plt.plot(query_num, values, color, label=label)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
def test_ALBLTestCase(self):
    """ALBL over UncertaintySampling, QUIRE and RandomSampling matches the record.

    Only the first ten labels are kept; the remaining entries start unlabeled.
    """
    def linear_svm():
        # A fresh instance per use, with the shared configuration.
        return SVM(kernel="linear", decision_function_shape="ovr")

    n_hidden = len(self.y) - 10
    partial_labels = np.concatenate([self.y[:10], [None] * n_hidden])
    trn_ds = Dataset(self.X, partial_labels)

    candidates = [
        UncertaintySampling(trn_ds, model=linear_svm()),
        QUIRE(trn_ds),
        RandomSampling(trn_ds),
    ]
    qs = ActiveLearningByLearning(
        trn_ds,
        T=self.quota,
        query_strategies=candidates,
        model=linear_svm(),
        random_state=1126)

    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33])
    assert_array_equal(qseq, expected)
def test_density_weighted_meta_uncertainty_lc(self):
    """DensityWeightedMeta over least-confidence sampling matches the record.

    Uses the first 20 samples, of which the first 6 are labeled.
    """
    def make_model():
        return LogisticRegression(solver='liblinear', multi_class="ovr")

    partial_labels = np.concatenate([self.y[:6], [None] * 14])
    trn_ds = Dataset(self.X[:20], partial_labels)
    base_qs = UncertaintySampling(trn_ds, method='lc', model=make_model())

    qs = DensityWeightedMeta(
        dataset=trn_ds,
        base_query_strategy=base_qs,
        similarity_metric=cosine_similarity,
        clustering_method=KMeans(n_clusters=3, random_state=1126),
        beta=1.0,
        random_state=1126)

    # A model is still constructed at this point, as before, although it is
    # not handed to run_qs.
    model = make_model()

    qseq = run_qs(trn_ds, qs, self.y, self.quota)
    expected = np.array([13, 18, 9, 12, 8, 16, 10, 19, 15, 17])
    assert_array_equal(qseq, expected)
def initialQuerySetup(train_dataset, queryStrategyID, queryParams=None,
                      fixRandomState=False):
    """Create the query strategy selected by ``queryStrategyID``.

    Args:
        train_dataset: dataset handed to the strategy constructor.
        queryStrategyID: integer selector:
            0 RandomSampling, 1 UncertaintySampling ('sm'),
            2 QueryByCommittee ('vote'), 3 RandomBatchQuery,
            4 LeastCertainBatchQuery, 5 SemiSupervisedBatchQuery.
        queryParams: positional strategy parameters.  IDs 1-3 read
            ``queryParams[0]`` (model, committee models, or batch size
            respectively); IDs 4 and 5 read ``queryParams[0]`` as the model
            and ``queryParams[1]`` as the batch size.  Unused for ID 0.
        fixRandomState: when true, strategies that accept ``random_state``
            receive a fixed per-strategy seed; otherwise they receive ``None``.

    Returns:
        The constructed query strategy.

    Raises:
        ValueError: if ``queryStrategyID`` is not one of 0-5.
    """
    def seed(value):
        # Fixed seed only when reproducibility was requested.
        return value if fixRandomState else None

    if queryStrategyID == 0:
        return RandomSampling(train_dataset, random_state=seed(137))
    if queryStrategyID == 1:
        return UncertaintySampling(train_dataset, method='sm',
                                   model=queryParams[0])
    if queryStrategyID == 2:
        return QueryByCommittee(train_dataset, models=queryParams[0],
                                disagreement='vote', random_state=seed(23))
    if queryStrategyID == 3:
        return RandomBatchQuery(train_dataset, batch_size=queryParams[0],
                                random_state=seed(2311))
    if queryStrategyID == 4:
        return LeastCertainBatchQuery(train_dataset, model=queryParams[0],
                                      batch_size=queryParams[1],
                                      random_state=seed(2317))
    if queryStrategyID == 5:
        return SemiSupervisedBatchQuery(train_dataset, model=queryParams[0],
                                        batch_size=queryParams[1],
                                        random_state=seed(3112))

    # Fail with a clear message instead of falling through to an unbound
    # local variable.
    raise ValueError("Unknown queryStrategyID: %r" % (queryStrategyID,))
def main():
    """Average the learning curves of five query strategies over 20 experiments.

    Each experiment makes a fresh split of the ``australian`` dataset and runs
    uncertainty sampling, random sampling, QUIRE, HintSVM and ALBL on identical
    copies of the training set until every training sample has been queried.
    The second value returned by ``run`` is averaged per strategy and plotted.
    """
    # Path to the binary classification dataset.
    ds_name = 'australian'
    here = os.path.dirname(os.path.realpath(__file__))
    dataset_filepath = os.path.join(here, '%s.txt' % ds_name)
    # Passed to split_train_test as the fraction of samples assigned to the
    # test set.
    test_size = 0.33
    n_labeled = 10  # number of samples that are initially labeled

    def linear_svm():
        # Base learner: a fresh instance for every run.
        return SVM(kernel='linear', decision_function_shape='ovr')

    n_strategies = 5
    results = []  # flat list: five consecutive curves per experiment

    for experiment in range(20):  # repeat the experiment 20 times
        print("%dth experiment" % (experiment + 1))

        trn_ds, tst_ds, y_train, fully_labeled_trn_ds = split_train_test(
            dataset_filepath, test_size, n_labeled)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        trn_ds4 = copy.deepcopy(trn_ds)
        trn_ds5 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)

        quota = len(y_train) - n_labeled  # number of samples to query

        # Each strategy is created immediately before its own run.
        arms = (
            (trn_ds, lambda ds: UncertaintySampling(
                ds, model=SVM(decision_function_shape='ovr'))),
            (trn_ds2, lambda ds: RandomSampling(ds)),
            (trn_ds3, lambda ds: QUIRE(ds)),
            (trn_ds4, lambda ds: HintSVM(ds, cl=1.0, ch=1.0)),
            (trn_ds5, lambda ds: ActiveLearningByLearning(
                ds,
                query_strategies=[
                    UncertaintySampling(ds, model=linear_svm()),
                    QUIRE(ds),
                    HintSVM(ds, cl=1.0, ch=1.0),
                ],
                T=quota,
                uniform_sampler=True,
                model=linear_svm())),
        )
        for dataset, make_strategy in arms:
            strategy = make_strategy(dataset)
            model = linear_svm()
            _, e_out = run(dataset, tst_ds, lbr, model, strategy, quota)
            results.append(e_out.tolist())

    # Every fifth entry belongs to the same strategy; average those.
    result = [np.mean(results[offset::n_strategies], axis=0)
              for offset in range(n_strategies)]

    # The x-axis is the number of queries, the y-axis the corresponding
    # error rate.
    query_num = np.arange(1, quota + 1)
    styles = (
        ('g', 'uncertainty sampling'),
        ('k', 'random'),
        ('r', 'QUIRE'),
        ('b', 'HintSVM'),
        ('c', 'ALBL'),
    )
    for curve, (color, label) in zip(result, styles):
        plt.plot(query_num, curve, color, label=label)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
def main():
    """Compare six query strategies on the tweet corpus, logging every query.

    Sets the module-level path globals and vector lists, then for ``quota``
    rounds asks each strategy for one query, obtains a simulated human label,
    updates that strategy's own copy of the training set, retrains the shared
    SVM and records the test error (``1 - model.score``).  Progress is printed,
    mirrored into ``task_<timestamp>.txt`` and drawn live; the plot is saved to
    ``task_<timestamp>.png`` after the user confirms.
    """
    global pos_filepath, dataset_filepath, csv_filepath, vectors_list, ids_list
    dataset_filepath = "/Users/dndesign/Desktop/active_learning/vecteurs_et_infos/vectors_2015.txt"
    csv_filepath = "/Users/dndesign/Desktop/active_learning/donnees/corpus_2015_id-time-text.csv"
    pos_filepath = "/Users/dndesign/Desktop/active_learning/donnees/oriane_pos_id-time-text.csv"
    vectors_list, ids_list = get_vectors_list(dataset_filepath)

    timestr = time.strftime("%Y%m%d_%H%M%S")
    separator = "#################################################"

    # The context manager guarantees the log is closed even if a step fails.
    with codecs.open("task_" + str(timestr) + ".txt", "w", "utf-8") as text_file:
        print("Loading data...")
        text_file.write("Loading data...\n")

        t0 = time.time()
        dataset_file = openfile_txt(dataset_filepath)
        try:
            num_lines = sum(1 for _ in dataset_file)
        finally:
            # openfile_txt is defined elsewhere; release its handle when the
            # returned object exposes close().
            close = getattr(dataset_file, "close", None)
            if close is not None:
                close()

        print("Treating " + str(num_lines) + " entries...")
        text_file.write("Treating : %s entries...\n" % str(num_lines))

        # Number of queries to ask the (simulated) human to label.
        quota = 10

        trn_ds, tst_ds = split_train_test(csv_filepath)
        model = SVM(kernel='linear')

        def committee():
            # Fresh committee members for each QueryByCommittee instance.
            return [LogisticRegression(C=1.0),
                    LogisticRegression(C=0.01),
                    LogisticRegression(C=100)]

        # (log title, legend entry, line color, strategy factory), in the
        # order the strategies are created, queried and plotted.
        specs = [
            ("Uncertainty Sampling (Least confident)", 'Least Confident',
             'red',
             lambda ds: UncertaintySampling(
                 ds, method='lc', model=LogisticRegression(C=.01))),
            ("Uncertainty Sampling (Max Margin)", 'Max Margin', 'blue',
             lambda ds: USampling(ds, method='mm',
                                  model=SVM(kernel='linear'))),
            # CMB: combination of distance-based and diversity-based sampling.
            ("CMB Distance-Diversity Sampling", 'Distance Diversity CMB',
             'green',
             lambda ds: CMBSampling(ds, model=SVM(kernel='linear'))),
            ("Random Sampling", 'Random Sampling', 'orange',
             lambda ds: RandomSampling(ds, random_state=1126)),
            ("QueryByCommittee (Vote Entropy)", 'Vote Entropy', 'black',
             lambda ds: QueryByCommittee(
                 ds, disagreement='vote', models=committee(),
                 random_state=1126)),
            ("QueryByCommittee (KL Divergence)", 'KL Divergence', 'purple',
             lambda ds: QueryByCommittee(
                 ds, disagreement='kl_divergence', models=committee(),
                 random_state=1126)),
        ]

        # The first strategy works on trn_ds itself; every later one gets a
        # deep copy taken at the moment it is created.
        runs = []
        for position, (title, legend_name, color, build) in enumerate(specs):
            dataset = trn_ds if position == 0 else copy.deepcopy(trn_ds)
            strategy = build(dataset)
            model.train(dataset)
            runs.append({
                'title': title,
                'legend': legend_name,
                'color': color,
                'dataset': dataset,
                'strategy': strategy,
                'errors': np.append([], 1 - model.score(tst_ds)),
                'line': None,
            })

        with sns.axes_style("darkgrid"):
            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)

            query_num = np.arange(0, 1)
            for state in runs:
                (line,) = ax.plot(query_num, state['errors'], state['color'])
                state['line'] = line
            plt.legend(tuple(state['legend'] for state in runs), loc=1)
            # The plotted quantity is 1 - score, i.e. the error.
            plt.ylabel('Error')
            plt.xlabel('Number of Queries')
            plt.title('Active Learning - Query choice strategies')
            plt.ylim([0, 1])
            plt.show(block=False)

        for i in range(quota):
            print("\n" + separator)
            print("Query number " + str(i) + " : ")
            print(separator + "\n")
            text_file.write("\n" + separator + "\n")
            text_file.write("Query number %s : " % str(i))
            text_file.write("\n" + separator + "\n")

            for state in runs:
                ask_id = state['strategy'].make_query()
                # Look both values up once so the console, the log and the
                # dataset update all use the same tweet and label.
                tweet = define_tweet_by_id(ask_id)
                answer = simulate_human_decision(ask_id)

                print("\033[4mUsing " + state['title'] + " :\033[0m")
                print("Tweet :" + tweet, end='', flush=True)
                print("Simulating human response : " + str(answer) + " \n")
                text_file.write("Using %s :\n" % state['title'])
                text_file.write("Tweet : %s \n" % str(tweet))
                text_file.write(
                    "Simulating human response : %s \n\n" % str(answer))

                state['dataset'].update(ask_id, answer)
                model.train(state['dataset'])
                state['errors'] = np.append(
                    state['errors'], 1 - model.score(tst_ds))

            # Grow the axes and refresh every curve with the new point.
            ax.set_xlim((0, i + 1))
            ax.set_ylim(
                (0, max(max(state['errors']) for state in runs) + 0.2))
            query_num = np.arange(0, i + 2)
            for state in runs:
                state['line'].set_xdata(query_num)
                state['line'].set_ydata(state['errors'])
            plt.draw()

        t2 = time.time()
        time_total = t2 - t0

        print("\n\n\n" + separator + "\n")
        print("Execution time : %fs \n\n" % time_total)
        text_file.write("\n\n\n" + separator + "\n")
        text_file.write("Execution time : %fs \n" % time_total)

    input("Press any key to save the plot...")
    plt.savefig('task_' + str(timestr) + '.png')
    print("Done")
def main():
    """Interactively label digits and plot both strategies' error live.

    Uncertainty sampling and random sampling start from identical copies of
    the training set.  After every pair of queries the upper subplot is
    rescaled and redrawn with the test errors (``1 - model.score``) collected
    so far.
    """
    quota = 10  # ask human to label 10 samples
    n_classes = 5

    trn_ds, tst_ds, _ = split_train_test(n_classes)
    trn_ds2 = copy.deepcopy(trn_ds)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    qs2 = RandomSampling(trn_ds2)
    model = LogisticRegression()

    def test_error(dataset):
        # Retrain the shared model on `dataset` and return its test error.
        model.train(dataset)
        return 1 - model.score(tst_ds)

    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Error')

    errors_qs = np.append([], test_error(trn_ds))
    errors_random = np.append([], test_error(trn_ds2))

    query_num = np.arange(0, 1)
    p1, = ax.plot(query_num, errors_qs, 'g', label='qs Eout')
    p2, = ax.plot(query_num, errors_random, 'k', label='random Eout')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show(block=False)

    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    img_ax.set_position(
        [box.x0, box.y0 - box.height * 0.1, box.width, box.height * 0.9])

    # Give each label its name (labels are from 0 to n_classes-1).
    lbr = InteractiveLabeler(label_name=[str(lbl) for lbl in range(n_classes)])

    def ask_and_score(strategy, dataset, banner):
        # One query round: pick a sample, show it as an 8x8 image to the
        # labeler, store the answer, then retrain and score.
        ask_id = strategy.make_query()
        print(banner)
        label = lbr.label(dataset.data[ask_id][0].reshape(8, 8))
        dataset.update(ask_id, label)
        return test_error(dataset)

    for i in range(quota):
        errors_qs = np.append(errors_qs, ask_and_score(
            qs, trn_ds, "asking sample from Uncertainty Sampling"))
        errors_random = np.append(errors_random, ask_and_score(
            qs2, trn_ds2, "asking sample from Random Sample"))

        # Rescale the axes and push the extended curves to the plot.
        ax.set_xlim((0, i + 1))
        ax.set_ylim((0, max(max(errors_qs), max(errors_random)) + 0.2))
        query_num = np.arange(0, i + 2)
        for line, values in ((p1, errors_qs), (p2, errors_random)):
            line.set_xdata(query_num)
            line.set_ydata(values)
        plt.draw()

    input("Press any key to continue...")
# Message raised when a committee containing LinearSVC is combined with the
# 'kl_divergence' disagreement measure.  Built with implicit concatenation so
# that no source indentation or stray backslash leaks into the message.
_LSVC_KL_DIVERGENCE_ERROR = (
    'when using kl_divergence lsvc cannot be in the committee as linearSVC '
    'does not provide predict_proba(). '
    'Use svc instead or change disagreement to vote!'
)

# Committee strategies whose members include LinearSVC.
_LSVC_COMMITTEES = ('lr_lsvc_rf_dt', 'lr_lsvc_dt_gpc', 'lr_lsvc_dt_xgb')


def _build_committee(query_strategy):
    """Return new model instances for a named committee, or None if unknown."""

    def logistic():
        return la.LogisticRegression_(solver='liblinear', max_iter=1000)

    def svc():
        # SVC with probability=True is used where predict_proba is required.
        return la.SVC_(kernel='linear', probability=True)

    def xgb():
        return la.XGBClassifier_(objective="binary:logistic")

    # Lambdas keep construction lazy: only the requested committee is built.
    builders = {
        'lr_lsvc_rf_dt': lambda: [
            la.RandomForest_(), la.DecisionTree_(), logistic(),
            la.LinearSVC_()],
        'lr_svc_rf_dt': lambda: [
            la.RandomForest_(), la.DecisionTree_(), logistic(), svc()],
        'lr_svc_dt_xgb': lambda: [
            logistic(), svc(), la.DecisionTree_(), xgb()],
        # committee of five
        'lr_svc_dt_xgb_rf': lambda: [
            logistic(), svc(), la.DecisionTree_(), xgb(),
            la.RandomForest_()],
        'lr_lsvc_dt_gpc': lambda: [
            logistic(), la.LinearSVC_(), la.DecisionTree_(),
            la.GaussianProcess_()],
        'lr_lsvc_dt_xgb': lambda: [
            logistic(), la.LinearSVC_(), la.DecisionTree_(), xgb()],
    }
    build = builders.get(query_strategy)
    return None if build is None else build()


def getQueryStrategy(query_strategy, train_ds, disagreement, estimator_name=None):
    """Create the query strategy object selected by ``query_strategy``.

    Args:
        query_strategy: one of ``'uncertainty'``, ``'random'``,
            ``'lr_lsvc_rf_dt'``, ``'lr_svc_rf_dt'``, ``'lr_svc_dt_xgb'``,
            ``'lr_svc_dt_xgb_rf'``, ``'lr_lsvc_dt_gpc'``,
            ``'lr_lsvc_dt_xgb'`` or ``'homogeneous_committee'``.
        train_ds: dataset handed to the query strategy.
        disagreement: disagreement measure forwarded to ``QueryByCommittee``
            for the named heterogeneous committees.
        estimator_name: passed to ``CommitteeModels`` and only used for
            ``'homogeneous_committee'``.

    Returns:
        The constructed query strategy, or ``None`` (after printing a
        message) when ``query_strategy`` is not recognised.

    Raises:
        ValueError: if ``disagreement`` is ``'kl_divergence'`` and the chosen
            committee contains LinearSVC.
    """
    print('Initialize Query Strategy')

    # Baseline strategies that do not use a committee.
    if query_strategy == 'uncertainty':
        return UncertaintySampling(train_ds, method='lc',
                                   model=la.LogisticRegression_())
    if query_strategy == 'random':
        return RandomSampling(train_ds)

    if query_strategy == 'homogeneous_committee':
        committee = CommitteeModels(estimator_name)
        # NOTE(review): `disagreement` is not forwarded for this strategy, so
        # QueryByCommittee uses its own default here - confirm this is
        # intended.
        return QueryByCommittee(train_ds, models=committee.committee['models'])

    # Reject the unsupported combination before any model is constructed.
    if query_strategy in _LSVC_COMMITTEES and disagreement == 'kl_divergence':
        raise ValueError(_LSVC_KL_DIVERGENCE_ERROR)

    models = _build_committee(query_strategy)
    if models is None:
        print("Query strategy not defined!")
        return None
    return QueryByCommittee(train_ds, models=models, disagreement=disagreement)
def main(args):
    """Run pool-based active learning on the PTSD texts.

    The texts are turned into bag-of-words count vectors, a partially
    labeled pool is built from them, and ``UncertaintySampling`` (least
    confidence) repeatedly picks the next entry to label.  After each answer
    the evaluation model is retrained, its accuracy over the whole pool is
    recorded, and a confusion matrix and recall on the still-unlabeled
    entries are printed.

    Args:
        args: parsed options; this function reads ``args.model``
            (``'multinomialnb'``/``'nb'``, ``'svc'`` or
            ``'logisticregression'``, case-insensitive), ``args.interactive``
            and ``args.quota``.

    Raises:
        ValueError: if ``args.model`` is not one of the supported names.
    """
    # get the texts and their corresponding labels
    texts, labels = load_ptsd_data()

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from libact.models import SklearnProbaAdapter, SklearnAdapter
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.svm import SVC
    from sklearn.linear_model import LogisticRegression

    # Bag-of-words counts, materialised as plain nested lists.
    vectorizer = CountVectorizer(max_features=5000, stop_words='english')
    features = vectorizer.fit_transform(texts).todense().tolist()

    # Developer switch: tf-idf re-weighting is currently disabled.
    use_tfidf = False
    if use_tfidf:
        features = TfidfTransformer().fit_transform(features)

    pool, pool_ideal = make_pool(
        features, labels,
        prelabeled=[1, 2, 3, 4, 5, 218, 260, 466, 532, 564]
    )

    # Supported model names mapped to (sklearn class, constructor kwargs).
    supported_models = {
        'multinomialnb': (MultinomialNB, {}),
        'nb': (MultinomialNB, {}),
        'svc': (SVC, {'probability': True, 'class_weight': 'balanced'}),
        'logisticregression': (LogisticRegression, {}),
    }
    requested = args.model.lower()
    if requested not in supported_models:
        raise ValueError('Model not found.')
    sklearn_model, kwargs_model = supported_models[requested]

    # Evaluation model, wrapped in the libact adapter.
    model = SklearnProbaAdapter(sklearn_model(**kwargs_model))

    # Query strategy.  Least confidence ('lc') queries the instance whose
    # posterior probability of being positive is nearest 0.5 (for binary
    # classification).  The strategy gets its own, separate model instance.
    qs = UncertaintySampling(
        pool, method='lc',
        model=SklearnProbaAdapter(sklearn_model(**kwargs_model)))

    fig, ax = plt.subplots()
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Value')

    def pool_accuracy():
        """Score the wrapped sklearn model on every pool entry."""
        return model._model.score(
            [entry[0] for entry in pool.get_entries()], labels)

    def refresh_plot(n_answered, scores):
        """Rescale the axes and redraw the accuracy curve."""
        ax.set_xlim((0, n_answered))
        ax.set_ylim((0, max(scores) + 0.2))
        p2.set_xdata(np.arange(0, n_answered + 1))
        p2.set_ydata(scores)
        plt.draw()

    # Accuracy before any query has been answered.
    model.train(pool)
    acc_pool = np.append([], pool_accuracy())

    p2, = ax.plot(np.arange(0, 1), acc_pool, 'r', label='Accuracy')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show(block=False)

    # Either a human answers, or the fully labeled pool does.
    if args.interactive:
        lbr = InteractivePaperLabeler(label_name=["0", "1"])
    else:
        lbr = IdealLabeler(dataset=pool_ideal)

    query_i = 1
    while query_i <= args.quota:
        print("Asking sample from pool with Uncertainty Sampling")
        ask_id = qs.make_query()
        print("Index {} returned. True label is {}.".format(
            ask_id, pool_ideal.data[ask_id][1]))

        # Fetch the entry, obtain its label and store it in the pool.
        lb = lbr.label(pool.data[ask_id][0])
        pool.update(ask_id, lb)

        # Retrain and record the accuracy over the whole pool.
        model.train(pool)
        acc_pool = np.append(acc_pool, pool_accuracy())

        # Extra evaluation restricted to the entries that are still unlabeled.
        unlabeled = pool.get_unlabeled_entries()
        idx = [entry[0] for entry in unlabeled]
        pred = model.predict([entry[1] for entry in unlabeled])
        truth = labels[idx]
        print(confusion_matrix(truth, pred))
        print(recall_score(truth, pred))

        if args.interactive:
            refresh_plot(query_i, acc_pool)

        query_i += 1

    # In non-interactive mode the plot is only drawn once, at the end.
    if not args.interactive:
        refresh_plot(query_i - 1, acc_pool)

    print(acc_pool)

    input("Press any key to continue...")
def main(args):
    """Run one active-learning trial with an LSTM model and export the results.

    The texts of ``args.dataset`` are converted to sequences, a partially
    labeled pool is created, and ``UncertaintySampling`` (``'lc'``) selects
    ``args.quota`` entries to label one at a time.  After every answer the
    model is retrained and column 1 of its prediction for each still
    unlabeled entry is stored in a result table (``-1`` marks entries that
    are already labeled).  The table is written to
    ``ACTIVE_DIR/<dataset>/sr_lstm_active<T>.csv``.

    Args:
        args: parsed options; this function reads ``args.dataset``,
            ``args.T`` (scales the seed and names the output file),
            ``args.init_included_papers``, ``args.model`` (only ``'lstm'``,
            case-insensitive), ``args.interactive`` and ``args.quota``.

    Raises:
        ValueError: if ``args.model`` is not ``'lstm'``.
    """
    pickle_file_path = os.path.join(TEMP_DATA_DIR,
                                    args.dataset + '_pickle.pickle')
    seed = 2018 * args.T

    # get the texts and their corresponding labels
    if args.dataset == 'ptsd':
        texts, lbls = load_ptsd_data()
    else:
        texts, lbls = load_drug_data(args.dataset)

    text_manager = TextManager()
    data, labels, word_index = text_manager.sequence_maker(texts, lbls)
    max_num_words = text_manager.max_num_words
    max_sequence_length = text_manager.max_sequence_length

    prelabeled_index = select_prelabeled(labels, args.init_included_papers,
                                         seed)
    print('prelabeled_index', prelabeled_index)
    pool, pool_ideal = make_pool(data, labels, prelabeled=prelabeled_index)

    # The embedding layer is cached on disk per dataset.
    # NOTE(review): the cache is read through load_pickle; only point
    # TEMP_DATA_DIR at a trusted location.
    if os.path.isfile(pickle_file_path):
        embedding_layer = load_pickle(pickle_file_path)
    else:
        # exist_ok avoids the check-then-create race when several trials
        # start at the same time.
        os.makedirs(TEMP_DATA_DIR, exist_ok=True)
        embedding = Word2VecEmbedding(word_index, max_num_words,
                                      max_sequence_length)
        embedding.load_word2vec_data(GLOVE_PATH)
        embedding_layer = embedding.build_embedding()
        dump_pickle(embedding_layer, pickle_file_path)

    # get the model
    if args.model.lower() == 'lstm':
        deep_model = LSTM_Libact
        kwargs_model = {
            'backwards': True,
            'dropout': 0.4,
            'optimizer': 'rmsprop',
            'max_sequence_length': max_sequence_length,
            'embedding_layer': embedding_layer
        }
    else:
        raise ValueError('Model not found.')

    model = deep_model(**kwargs_model)

    # The query strategy gets its own model instance, separate from `model`.
    # TODO: check if 'lc' works correctly; add random sampling as well.
    qs = UncertaintySampling(pool, method='lc',
                             model=deep_model(**kwargs_model))

    # Either a human answers, or the fully labeled pool does.
    if args.interactive:
        lbr = InteractivePaperLabeler(label_name=["0", "1"])
    else:
        lbr = IdealLabeler(dataset=pool_ideal)

    # One row per pool entry; one extra column is added per query below.
    result_df = pd.DataFrame({'label': [x[1] for x in pool_ideal.data]})

    # TODO: add multiple papers to the labeled dataset per step (batch_size).
    query_i = 1
    while query_i <= args.quota:
        print("Asking sample from pool with Uncertainty Sampling")
        ask_id = qs.make_query()
        print("Index {} returned. True label is {}.".format(
            ask_id, pool_ideal.data[ask_id][1]))

        # Fetch the entry, obtain its label and store it in the pool.
        data_point = pool.data[ask_id][0]
        lb = lbr.label(data_point)
        pool.update(ask_id, lb)

        # train the model again
        model.train(pool)

        idx_features = pool.get_unlabeled_entries()
        idx = [x[0] for x in idx_features]
        features = [x[1] for x in idx_features]
        pred = model.predict(features)

        # Column named after the query number: -1 for labeled entries,
        # prediction column 1 for unlabeled ones.
        # NOTE(review): presumably column 1 is the class-1 probability;
        # confirm against LSTM_Libact.predict.
        c_name = str(query_i)
        result_df[c_name] = -1
        result_df.loc[idx, c_name] = pred[:, 1]

        query_i += 1

    # save the result to a file
    output_dir = os.path.join(ACTIVE_DIR, args.dataset)
    os.makedirs(output_dir, exist_ok=True)
    export_path = os.path.join(output_dir,
                               'sr_lstm_active{}.csv'.format(args.T))
    result_df.to_csv(export_path)

    input("Press any key to continue...")
sents = sent_tokenize(line, language='russian') tokenized_texts.append(sents) for s in sents: vocab[s] = line tfidf = TfidfVectorizer() # create the vectorizer and get the X x = tfidf.fit_transform(itertools.chain(*tokenized_texts)) # form the y by randomly fill the classes. # to not do this, the option of labeling dataset from scratch is needed # in Dataset class y = np.array([0, 1, 0, 1, 0, 1, 0, 1] + [None] * (x.shape[0] - 8)) # Create the handfull Dataset object from libact dataset = Dataset(x, y) # Create strategy qs = UncertaintySampling(dataset, method='lc', model=LogisticRegression()) # create list of sentences texts = list(itertools.chain(*tokenized_texts)) @app.route('/') def show_entries(): # Ask the sample have to be lableled # and ranger the page for the first time ask_id = qs.make_query() session['ask_id'] = int(ask_id) return render_template('show.html', \ text = vocab[texts[ask_id]],\ sentence= texts[ask_id])