def learn(self):
    """Teach a committee randomly drawn pool samples and score it on the rest.

    One seed row per distinct ``grades_round`` value in ``self.short_df``
    initialises every model in ``self.learners``. Afterwards
    ``self.percent`` percent of ``self.X`` (by row count) is drawn uniformly
    at random from the pool, one sample at a time, and taught to the
    committee.

    Returns:
        tuple: ``(accuracy_list, model_f1)``. ``accuracy_list`` holds the
        accuracy on the remaining pool after every draw; ``model_f1`` is the
        weighted F1 score on the pool that is left at the end.
    """
    # seeding: index label of the first row of every class
    grades = self.short_df['grades_round']
    seed_index = [grades[grades == label].index[0] for label in grades.unique()]

    accuracy_list = []

    # initialising
    # NOTE(review): the seed labels are used as row positions of self.X /
    # self.Y, which assumes short_df carries a default 0..n-1 index - confirm.
    train_idx = seed_index
    X_train = self.X[train_idx]
    y_train = self.Y[train_idx]

    # generating the pool
    X_pool = np.delete(self.X, train_idx, axis=0)
    y_pool = np.delete(self.Y, train_idx)

    initiated_committee = [
        ActiveLearner(estimator=model, X_training=X_train, y_training=y_train)
        for model in self.learners
    ]

    # Committee creation
    committee = Committee(
        learner_list=initiated_committee,
        # query_strategy=vote_entropy_sampling
    )
    # NOTE(review): the learners were already constructed with the seed
    # data; this call teaches the same samples a second time.
    committee.teach(X_train, y_train)

    # pool-based sampling (the size of the dataset held on the instance
    # decides the labelling budget)
    n_total = len(self.X)
    n_queries = int(n_total / (100 / self.percent))
    for _ in range(n_queries):
        query_idx = np.random.choice(range(len(X_pool)))
        committee.teach(
            X=X_pool[query_idx].reshape(1, -1),
            y=y_pool[query_idx].reshape(1, )
        )
        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)
        accuracy_list.append(accuracy_score(y_pool, committee.predict(X_pool)))

    model_pred = committee.predict(X_pool)
    print("By just labelling ", round(n_queries * 100.0 / n_total, 2),
          "% of total data accuracy of ",
          round(accuracy_score(y_pool, model_pred), 3),
          " % is achieved on the unseen data")
    model_f1 = f1_score(y_pool, model_pred, average='weighted')
    return accuracy_list, model_f1
def run(X, y, n_samples_for_intial, n_queries, n_comittee_members, estimator):
    """Query-by-committee loop that runs until the micro F1 on ``X`` reaches 0.65.

    Args:
        X: Feature matrix; the pool derived from it is reduced with
            ``delete_rows_csr``.
        y: Labels matching ``X``.
        n_samples_for_intial: Forwarded to
            ``create_random_pool_and_initial_sets`` to size the initial set.
        n_queries: Accepted for interface compatibility; not used here.
        n_comittee_members: Number of learners in the committee.
        estimator: Template model; every member receives its own copy.

    Returns:
        int: Number of queries that were taught to the committee.
    """
    from copy import deepcopy

    # start timer
    start_time = time.time()

    X_train, y_train, X_pool, y_pool = create_random_pool_and_initial_sets(
        X, y, n_samples_for_intial)

    # Give every member its own model object. Handing the same instance to
    # each ActiveLearner would make all members share one model, leaving the
    # disagreement-based query strategy nothing to measure.
    learners = [
        ActiveLearner(estimator=deepcopy(estimator),
                      X_training=X_train, y_training=y_train)
        for _ in range(n_comittee_members)
    ]

    # init committee
    committee = Committee(learner_list=learners,
                          query_strategy=max_disagreement_sampling)

    unqueried_score = committee.score(X, y)
    print('Score over unqueried samples {:0.4f}'.format(unqueried_score))

    performance_history = []
    f1_score = 0
    index = 0
    # Stop as well once the pool is exhausted; otherwise an unreachable
    # target would end in a query against an empty pool.
    while f1_score < 0.65 and X_pool.shape[0] > 0:
        index += 1

        # get sample from pool
        query_idx, query_instance = committee.query(X_pool)

        # retrain committee with new sample
        committee.teach(
            X=X_pool[query_idx].reshape(1, -1),
            y=y_pool[query_idx].reshape(1, )
        )

        # remove queried instance from pool
        X_pool = delete_rows_csr(X_pool, query_idx)
        y_pool = np.delete(y_pool, query_idx)

        y_pred = committee.predict(X)
        f1_score = metrics.f1_score(y, y_pred, average='micro')

        if index % 100 == 0:
            print('F1 score after {n} training samples: {f1:0.4f}'.format(
                n=index, f1=f1_score))

        # save accuracy score
        performance_history.append(f1_score)

    print("--- %s seconds ---" % (time.time() - start_time))
    print(performance_history)
    return index
def cmte_loop(estimator1,estimator2,X_0,Y_0,X_train,Y_train,X_test,Y_test,indexs,n=5):
    """Drain the pool through a committee, evaluating after every query.

    The pool is ``X_train`` / ``Y_train`` without the rows listed in
    ``indexs``. A committee with one learner per entry of ``estimator1`` is
    seeded with ``X_0`` / ``Y_0``; ``estimator2`` and ``n`` are accepted but
    not used.

    Returns:
        tuple: ``(committee, accuracies)`` where ``accuracies`` collects the
        result of ``evaluate`` after each taught sample.
    """
    pool_X = deepcopy(np.delete(X_train, indexs, axis=0))
    pool_Y = deepcopy(np.delete(Y_train, indexs))

    members = []
    for model in estimator1:
        members.append(ActiveLearner(model, X_training=X_0, y_training=Y_0))
    committee = Committee(members)

    seen_X, seen_Y = X_0, Y_0
    accuracies = []
    while len(pool_X) > 0:
        picked, _ = committee.query(pool_X)
        picked_X = pool_X[picked]
        picked_Y = pool_Y[picked]
        committee.teach(picked_X, picked_Y)

        # grow the labelled set with the sample that was just taught
        seen_X = np.append(seen_X, picked_X, axis=0)
        seen_Y = np.append(seen_Y, picked_Y[0])

        # and take it out of the pool
        pool_X = np.delete(pool_X, picked, axis=0)
        pool_Y = np.delete(pool_Y, picked, axis=0)

        accuracies.append(evaluate(committee, seen_X, seen_Y, X_test, Y_test))
    return (committee, accuracies)
def modAL_QBC(X, y, n_queries):
    """Run ``n_queries`` query/teach rounds of a two-member committee on ``X``.

    Both members are liblinear one-vs-rest logistic regressions seeded with
    rows 0, 50 and 100 of ``X`` / ``y``. Queries are made against the full
    ``X``; nothing is returned.
    """
    seed_rows = [0, 50, 100]

    def _member():
        # one freshly constructed model per committee member
        model = LogisticRegression(solver='liblinear', n_jobs=1, multi_class='ovr')
        return ActiveLearner(model, X_training=X[seed_rows], y_training=y[seed_rows])

    modAL_learner = Committee([_member(), _member()])

    for _ in range(n_queries):
        query_idx, query_inst = modAL_learner.query(X)
        modAL_learner.teach(X[query_idx], y[query_idx])
def cmte_loop(estimator, X_0, Y_0, X_train, Y_train, X_test, Y_test, indexs, n=5):
    """Drain the pool through a committee, evaluating after every query.

    The pool is ``X_train`` / ``Y_train`` without the rows listed in
    ``indexs``; the committee has one learner per entry of ``estimator`` and
    is seeded with ``X_0`` / ``Y_0``. ``n`` is accepted but not used.

    Returns:
        tuple: ``(committee, accuracies)`` where ``accuracies`` collects the
        result of ``evaluate`` after each taught sample.

    Raises:
        Exception: If a query yields more than one index.
    """
    print(len(indexs))
    pool_X = deepcopy(np.delete(X_train, indexs, axis=0))
    pool_Y = deepcopy(np.delete(Y_train, indexs))

    committee = Committee(learner_list=[
        ActiveLearner(estimator=estimator[pos], X_training=X_0, y_training=Y_0)
        for pos in range(len(estimator))
    ])

    seen_X, seen_Y = X_0, Y_0
    accuracies = []
    while len(pool_X) > 0:
        picked, _ = committee.query(pool_X)
        # exactly one sample per round is expected
        if len(picked) > 1:
            raise Exception("NOOOOOOOOOOOOOOO")

        picked_X = pool_X[picked]
        picked_Y = pool_Y[picked]
        committee.teach(X=picked_X, y=picked_Y)

        # grow the labelled set with the sample that was just taught
        seen_X = np.append(seen_X, picked_X, axis=0)
        seen_Y = np.append(seen_Y, picked_Y[0])

        # and take it out of the pool
        pool_X = np.delete(pool_X, picked, axis=0)
        pool_Y = np.delete(pool_Y, picked, axis=0)

        accuracies.append(evaluate(committee, seen_X, seen_Y, X_test, Y_test))
    return (committee, accuracies)
class Committee2Level:
    """Two-stage committee: label 2 versus the rest, then the rest among themselves.

    ``osCommittee`` is trained on every sample with labels collapsed to
    ``2`` / ``1`` (anything that is not 2 becomes 1). ``pnCommittee`` is
    trained only on the samples whose label is not 2, with their original
    labels. ``choice_indx`` alternates which committee answers ``query``.
    """

    def __init__(self, estimators_1, estimators_2, X, Y):
        """Build both committees from the initial data ``X`` / ``Y``.

        Args:
            estimators_1: Models for the first-stage (2 vs. rest) committee.
            estimators_2: Models for the second-stage committee.
            X: Initial training samples (indexable by a list of positions).
            Y: Initial labels matching ``X``.
        """
        non2 = self._non2_positions(Y)
        Y1 = self._collapse_labels(Y)
        X2 = X[non2]
        Y2 = Y[non2]
        learners1 = [ActiveLearner(e, X_training=X, y_training=Y1) for e in estimators_1]
        learners2 = [ActiveLearner(e, X_training=X2, y_training=Y2) for e in estimators_2]
        self.osCommittee = Committee(learners1)
        self.pnCommittee = Committee(learners2)
        self.choice_indx = 0

    @staticmethod
    def _non2_positions(Y):
        """Return the positions in ``Y`` whose label is not 2."""
        return [pos for pos in range(len(Y)) if Y[pos] != 2]

    @staticmethod
    def _collapse_labels(Y):
        """Return ``Y`` with every label other than 2 replaced by 1."""
        return np.array([label if label == 2 else 1 for label in Y])

    def teach(self, X, Y):
        """Teach new samples to both stages.

        The second stage only sees the samples whose label is not 2 and is
        skipped when there are none.
        """
        non2 = self._non2_positions(Y)
        X2 = X[non2]
        Y2 = Y[non2]
        self.osCommittee.teach(X, self._collapse_labels(Y))
        if len(Y2) > 0:
            self.pnCommittee.teach(X2, Y2)

    def predict(self, X):
        """Return a list with one label per row of ``X``.

        A first-stage prediction of 2 is kept; otherwise the second-stage
        prediction for that row is used.
        """
        predicts1 = self.osCommittee.predict(X)
        predicts2 = self.pnCommittee.predict(X)
        return [
            predicts1[i] if predicts1[i] == 2 else predicts2[i]
            for i in range(len(X))
        ]

    def query(self, X, Y):
        """Pick the next sample, alternating between the two committees.

        Args:
            X: Candidate pool.
            Y: Labels of the pool; used to restrict the second-stage
                committee to samples whose label is not 2.

        Returns:
            The query result of the committee whose turn it is. Indices are
            always positions in ``X``. If the selected result has no indices
            the other committee's result is returned instead.
        """
        non2 = self._non2_positions(Y)
        X2 = X[non2]

        q2 = ([], 2)  # placeholder with no indices when nothing is eligible
        if len(X2) > 0:
            sub_result = self.pnCommittee.query(X2)
            # The second-stage committee only saw X2, so its indices are
            # positions inside X2; translate them back to positions in X
            # because callers index the full pool with them.
            full_idx = np.asarray(non2)[sub_result[0]]
            q2 = (full_idx,) + tuple(sub_result[1:])

        alternatives = [self.osCommittee.query(X), q2]
        r = alternatives[self.choice_indx % 2]
        self.choice_indx += 1
        if len(r[0]) == 0:
            # choice_indx already advanced, so this selects the other one
            return alternatives[self.choice_indx % 2]
        return r
# initial training data n_initial = 5 train_idx = np.random.choice(range(X_pool.shape[0]), size=n_initial, replace=False) X_train = X_pool[train_idx] y_train = y_pool[train_idx] # creating a reduced copy of the data with the known instances removed X_pool = np.delete(X_pool, train_idx, axis=0) y_pool = np.delete(y_pool, train_idx) # initializing learner learner = ActiveLearner(estimator=RandomForestClassifier(n_estimators=10), X_training=X_train, y_training=y_train) learner_list.append(learner) # assembling the committee committee = Committee(learner_list=learner_list) # query by committee n_queries = 10 for idx in range(n_queries): query_idx, query_instance = committee.query(X_pool) committee.teach(X=X_pool[query_idx].reshape(1, -1), y=y_pool[query_idx].reshape(1, )) # remove queried instance from pool X_pool = np.delete(X_pool, query_idx, axis=0) y_pool = np.delete(y_pool, query_idx)
def active_learn(df1, first_item_index_of_each_category, n_queries=505, n_members=2):
    """Run query-by-committee on ``df1`` and report the accuracy progression.

    Args:
        df1: DataFrame with a ``label`` column; features are read from every
            column after the first (``df1.values[:, 1:]``).
        first_item_index_of_each_category: Row positions used as the initial
            training set of every committee member.
        n_queries: Maximum number of query/teach rounds (default 505).
        n_members: Number of random-forest learners in the committee
            (default 2).

    The committee accuracy on the full data is printed before the first
    query and after every round; every 100th score is collected and passed
    to ``percentage_increase``.
    """
    train_idx = first_item_index_of_each_category

    # item from second column as the first column is the label
    data = df1.values[:, 1:]
    target = df1['label'].values

    # initial training data
    X_train = data[train_idx]
    y_train = target[train_idx]

    # Remove the seed rows from the pool exactly once. Repeating the
    # deletion per committee member would apply the same positions to an
    # already reduced pool and drop unrelated samples.
    X_pool = np.delete(data, train_idx, axis=0)
    y_pool = np.delete(target, train_idx)

    # initializing Committee members
    learner_list = [
        ActiveLearner(estimator=RandomForestClassifier(),
                      X_training=X_train, y_training=y_train)
        for _ in range(n_members)
    ]

    # assembling the committee
    committee = Committee(learner_list=learner_list)
    print('%1.3f' % committee.score(data, target))

    performance_array = []
    # never ask for more samples than the pool holds
    for idx in range(min(n_queries, len(X_pool))):
        query_idx, query_instance = committee.query(X_pool)
        committee.teach(X=X_pool[query_idx].reshape(1, -1),
                        y=y_pool[query_idx].reshape(1, ))

        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)

        learner_score = committee.score(data, target)
        print('%1.3f' % (learner_score))
        if idx % 100 == 0:
            performance_array.append(learner_score)

    percentage_increase(performance_array)
def learn(self):
    """Teach a committee randomly drawn pool samples and track three metrics.

    One seed row per distinct ``grades_round`` value in ``self.short_df``
    initialises every model in ``self.learners``. Afterwards
    ``self.percent`` percent of ``self.X`` (by row count) is drawn uniformly
    at random from the pool, one sample at a time, and taught to the
    committee.

    Returns:
        tuple: ``(accuracy_list, f1_total_list, kappa_total_list)`` with one
        entry per draw, each computed on the pool remaining after that draw
        (accuracy, weighted F1 restricted to the predicted labels, and
        Cohen's kappa).
    """
    # seeding: index label of the first row of every class
    grades = self.short_df['grades_round']
    seed_index = [grades[grades == label].index[0] for label in grades.unique()]

    accuracy_list = []
    f1_total_list = []
    kappa_total_list = []

    # initialising
    # NOTE(review): the seed labels are used as row positions of self.X /
    # self.Y, which assumes short_df carries a default 0..n-1 index - confirm.
    train_idx = seed_index
    X_train = self.X[train_idx]
    y_train = self.Y[train_idx]

    # generating the pool
    X_pool = np.delete(self.X, train_idx, axis=0)
    y_pool = np.delete(self.Y, train_idx)

    initiated_committee = [
        ActiveLearner(estimator=model, X_training=X_train, y_training=y_train)
        for model in self.learners
    ]

    # Committee creation
    committee = Committee(
        learner_list=initiated_committee,
        # query_strategy=vote_entropy_sampling
    )
    # NOTE(review): the learners were already constructed with the seed
    # data; this call teaches the same samples a second time.
    committee.teach(X_train, y_train)

    # pool-based sampling (the size of the dataset held on the instance
    # decides the labelling budget)
    n_queries = int(len(self.X) / (100 / self.percent))
    for _ in range(n_queries):
        query_idx = np.random.choice(range(len(X_pool)))
        committee.teach(X=X_pool[query_idx].reshape(1, -1),
                        y=y_pool[query_idx].reshape(1, ))

        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)

        # NOTE(review): all three metrics are recorded once per draw; the
        # extent of this loop was reconstructed from the per-draw result
        # lists - confirm against the original layout.
        model_pred = committee.predict(X_pool)
        accuracy_list.append(accuracy_score(y_pool, model_pred))
        f1_total_list.append(
            f1_score(y_pool, model_pred, average="weighted",
                     labels=np.unique(model_pred)))
        kappa_total_list.append(cohen_kappa_score(y_pool, model_pred))

    return accuracy_list, f1_total_list, kappa_total_list
plt.show() # visualizing the Committee's predictions per learner with plt.style.context('seaborn-white'): plt.figure(figsize=(7, 7)) prediction = committee.predict(iris['data']) plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50) plt.title('Committee initial predictions') plt.show() # query by committee n_queries = 10 for idx in range(n_queries): query_idx, query_instance = committee.query(X_pool) committee.teach( X=X_pool[query_idx].reshape(1, -1), y=y_pool[query_idx].reshape(1, ) ) # remove queried instance from pool X_pool = np.delete(X_pool, query_idx, axis=0) y_pool = np.delete(y_pool, query_idx) # visualizing the final predictions per learner with plt.style.context('seaborn-white'): plt.figure(figsize=(n_members*7, 7)) for learner_idx, learner in enumerate(committee): plt.subplot(1, n_members, learner_idx + 1) plt.scatter(x=pca[:, 0], y=pca[:, 1], c=learner.predict(iris['data']), cmap='viridis', s=50) plt.title('Learner no. %d predictions after %d queries' % (learner_idx + 1, n_queries)) plt.show() # visualizing the Committee's predictions
for x in range(135): if (classifier_uncertainty(committee, X_unlab[iter].reshape(1, -1)) >= 0.8): break iter = (iter + 1) % (X_unlab.shape[0]) q_id = iter - 1 X_new = X_unlab[q_id].reshape(1, -1) Y_new = Y_unlab[q_id].reshape(1, ) X_unlab, Y_unlab = np.asarray(np.delete(X_unlab, q_id, axis=0)), np.delete(Y_unlab, q_id, axis=0) committee.teach(X_new, Y_new) x = committee.score(X, Y) accuracy_list.append(x) if ((i + 1) % 15 == 0): print("Accuracy after", i + 1, "iterations :", x) print("Number of unlabelled data points left are : ", X_unlab.shape[0]) # In[70]: plt.figure(figsize=(7, 5)) plt.plot(list(range(0, int(queries) + 1)), accuracy_list,
committee_random = Committee( learner_list=[member1r, member2r, member3r, member4r, member5r]) # In[10]: random_acc_list = [committee_random.score(X, Y)] print("Accuracy after", 0, "iterations :", committee_random.score(X, Y)) random.seed(99) for randompts in range(60): index = random.choice(unlabelled_idx) X_new = X[index].reshape(1, -1) Y_new = Y[index].reshape(1, ) committee_random.teach(X_new, Y_new) random_x = committee_random.score(X, Y) random_acc_list.append(random_x) if ((randompts + 1) % 15 == 0): print("Accuracy after", randompts + 1, "iterations :", random_x) # In[11]: import math unlab_length = X_unlab.shape[0] disagreement = np.zeros(unlab_length * 2).reshape(unlab_length, 2) for i in range(unlab_length): index = [i] predict = [-1, -1, -1, -1, -1]
X_initial, y_initial, X_pool, y_pool = init_data_sampling(xtrain, ytrain) # creating learners. if committee flag == True then create a committee with a subset with the returned learners. # else: use only the first learner learner1, learner2, learner3 = create_learners(qs, X_initial, y_initial) if committee_flag: learner = Committee( learner_list=[learner1, learner3], query_strategy=vote_entropy_sampling ) else: learner = learner1 # the active learning loop update_results(ytest, learner.predict(xtest), i) n_queries = 10 for idx in range(n_queries): query_idx, query_instance = learner.query(X_pool, n_instances=300) learner.teach( X=X_pool[query_idx], y=y_pool[query_idx] ) # remove queried instance from pool X_pool = np.delete(X_pool, query_idx, axis=0) y_pool = np.delete(y_pool, query_idx, axis=0) print(idx) update_results(ytest, learner.predict(xtest), i) plots(i) reset_tables() f1_plot()