def active_learning_procedure(query_strategy, test_X, test_y, pool_X, pool_y,
                              initial_X, initial_y, estimator, epochs=50,
                              batch_size=128, n_queries=100, n_instances=10,
                              verbose=0):
    """Run a pool-based active learning loop and record test accuracy.

    Args:
        query_strategy: Strategy handed to ``ActiveLearner``.
        test_X, test_y: Held-out data scored after every query round.
        pool_X, pool_y: Unlabeled pool and its labels; queried rows are
            removed from the local copies after each round.
        initial_X, initial_y: Data the learner is initialised with.
        estimator: Model wrapped by the learner.
        epochs, batch_size, verbose: Forwarded to ``learner.teach`` (and
            ``verbose`` to the learner / first ``score`` call) -- presumably
            fit arguments of a Keras-style estimator; confirm with caller.
        n_queries: Number of query rounds.
        n_instances: Number of pool rows requested per round.

    Returns:
        list: The score before any query followed by the score after each
        round (``n_queries + 1`` entries).
    """
    learner = ActiveLearner(estimator=estimator,
                            X_training=initial_X,
                            y_training=initial_y,
                            query_strategy=query_strategy,
                            verbose=verbose)
    perf_hist = [learner.score(test_X, test_y, verbose=verbose)]

    for index in range(n_queries):
        query_idx, _ = learner.query(pool_X, n_instances)
        learner.teach(pool_X[query_idx], pool_y[query_idx],
                      epochs=epochs, batch_size=batch_size, verbose=verbose)

        # Drop the rows that were just labelled so they cannot be queried again.
        pool_X = np.delete(pool_X, query_idx, axis=0)
        pool_y = np.delete(pool_y, query_idx, axis=0)

        model_accuracy = learner.score(test_X, test_y, verbose=0)
        # The replacement field needs its closing brace; without it
        # str.format raises ValueError on the very first round.
        print("accuracy after query {n}: {acc:0.4f}".format(
            n=index + 1, acc=model_accuracy))
        perf_hist.append(model_accuracy)

    return perf_hist
def learn(self):
    """Run pool-based active learning over ``self.X`` / ``self.Y``.

    The learner is seeded with the first row of every distinct value in
    ``self.short_df['grades_round']``; the remaining rows form the pool.
    After every query the learner is evaluated on the rows still left in
    the pool.

    Returns:
        tuple: ``(accuracy_list, f1_total_list, kappa_total_list)`` with one
        entry per query.
    """
    # seeding: one example per class
    grades = self.short_df['grades_round']
    seed_index = [grades[grades == cls].index[0] for cls in grades.unique()]

    act_data = self.short_df.copy()
    accuracy_list = []
    f1_total_list = []
    kappa_total_list = []

    # initialising
    train_idx = seed_index
    X_train = self.X[train_idx]
    y_train = self.Y[train_idx]

    # generating the pool
    X_pool = np.delete(self.X, train_idx, axis=0)
    y_pool = np.delete(self.Y, train_idx)
    # act_data mirrors the pool; it is re-indexed so that positional query
    # indices can be used as labels in later drops.
    act_data = act_data.drop(axis=0, index=train_idx)
    act_data.reset_index(drop=True, inplace=True)

    # initializing the active learner
    learner = ActiveLearner(estimator=self.model,
                            X_training=X_train,
                            y_training=y_train,
                            query_strategy=self.query_method)

    # pool-based sampling; the budget is ``self.percent`` percent of the
    # data set. The original read a bare ``X`` that this method never
    # defines -- ``self.X`` is the array used everywhere else here.
    n_queries = int(len(self.X) / (100 / self.percent))
    for _ in range(n_queries):
        query_idx, _ = learner.query(X_pool)
        learner.teach(X=X_pool[query_idx].reshape(1, -1),
                      y=y_pool[query_idx].reshape(1, ))

        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)
        act_data = act_data.drop(axis=0, index=query_idx)
        act_data.reset_index(drop=True, inplace=True)

        accuracy_list.append(learner.score(X_pool, y_pool))
        model_pred = learner.predict(X_pool)
        # Restrict the weighted F1 to labels that were actually predicted.
        f1_total_list.append(
            f1_score(y_pool, model_pred, average="weighted",
                     labels=np.unique(model_pred)))
        kappa_total_list.append(cohen_kappa_score(y_pool, model_pred))

    return accuracy_list, f1_total_list, kappa_total_list
def _train_using_dynamic_data_set(sampler, data_set, evaluation_steps):
    """Query one point from each entry of ``data_set`` and fit surrogate trees.

    Each entry of ``data_set`` is treated as a 2-D array whose last column
    holds the label. One instance per entry is picked by the strategy
    ``samplers[sampler]``; the picks are buffered and taught to the learner
    whenever the 1-based step number is in ``evaluation_steps``, at which
    point a ``DecisionTreeClassifier`` is fitted on the learner's training
    data.

    Returns:
        dict: ``{"models": [...]}`` with one fitted tree per evaluation step
        that was reached.
    """
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        query_strategy=samplers[sampler],
    )
    models = []
    pending_x, pending_y = [], []

    progress = trange(len(data_set), disable=disable_tqdm,
                      desc=f"{sampler}-{data_set_name}")
    for step in progress:
        chunk = data_set[step]
        features, labels = chunk[:, :-1], chunk[:, -1]

        picked_idx, picked_inst = learner.query(features, n_instances=1)
        pending_x.append(picked_inst)
        pending_y.append(labels[picked_idx])

        if (step + 1) not in evaluation_steps:
            continue

        # Flush everything gathered since the previous evaluation step.
        learner.teach(np.array(pending_x).reshape((len(pending_x), -1)),
                      np.array(pending_y).flatten())
        pending_x, pending_y = [], []

        surrogate = DecisionTreeClassifier().fit(learner.X_training,
                                                 learner.y_training)
        models.append(surrogate)

    return {"models": models}
def al_Loop(estimator, X_train, Y_train, X, Y, X_test, Y_test, indexs):
    """Query the whole pool one sample at a time and return the learner.

    Args:
        estimator: Model wrapped by the ``ActiveLearner``.
        X_train, Y_train: Initial labelled data.
        X, Y: Full data set; rows at ``indexs`` are removed to form the pool.
        X_test, Y_test: Held-out data used for the test error printed after
            every query.
        indexs: Positions in ``X`` / ``Y`` that are already labelled.

    Returns:
        The ``ActiveLearner`` after every pool sample has been taught.
    """
    learner = ActiveLearner(estimator=estimator,
                            X_training=X_train,
                            y_training=Y_train)
    X_pool = np.delete(X, indexs, axis=0)
    Y_pool = np.delete(Y, indexs, axis=0)

    index = 0
    while len(X_pool) > 0:
        query_index, _ = learner.query(X_pool)
        learner.teach(X=X_pool[query_index].reshape(1, -1),
                      y=Y_pool[query_index].reshape(1, ))
        X_pool = np.delete(X_pool, query_index, axis=0)
        Y_pool = np.delete(Y_pool, query_index)

        # After the final query the pool is empty; scoring it would make the
        # estimator reject the zero-sample input and abort the loop before
        # the learner is returned, so the pool error is only reported while
        # samples remain.
        if len(X_pool) > 0:
            pool_error = 1 - learner.score(X_pool, Y_pool)
            print('Error after query {n}: {acc:0.4f}'.format(
                n=index + 1, acc=pool_error))

        predicts = learner.predict(X_test)
        corrects = (predicts == Y_test)
        test_error = 1 - sum(1 for hit in corrects if hit) / len(predicts)
        print(test_error)
        index += 1

    return learner
def _train_using_static_data_set(sampler, data_set, evaluation_steps):
    """Grow a labelled set from one fixed pool and fit surrogate trees.

    ``data_set`` is a 2-D array whose last column holds the label. For each
    entry of ``evaluation_steps`` the learner queries as many pool rows as
    are needed to bring the cumulative number of queried points up to that
    entry, is taught those rows, and a ``DecisionTreeClassifier`` is fitted
    on the learner's training data.

    Returns:
        dict: ``{"models": [...]}`` with one fitted tree per evaluation step.
    """
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        query_strategy=samplers[sampler],
    )
    pool_x = data_set[:, :-1].copy()
    pool_y = data_set[:, -1].copy()

    already_queried = 0
    models = []
    for step in trange(len(evaluation_steps), disable=disable_tqdm,
                       desc=f"{sampler}-{data_set_name}"):
        target_total = evaluation_steps[step]
        picked_idx, picked_inst = learner.query(
            pool_x, n_instances=target_total - already_queried)

        # ...obtaining new labels from the pool...
        learner.teach(picked_inst, pool_y[picked_idx])
        already_queried += target_total - already_queried

        pool_x = np.delete(pool_x, picked_idx, axis=0)
        pool_y = np.delete(pool_y, picked_idx, axis=0)

        models.append(
            DecisionTreeClassifier().fit(learner.X_training,
                                         learner.y_training))

    return {"models": models}
def al_pool_margin(self, data, target, X_train, y_train, X_full, y_full,
                   train_idx):
    """Pool-based active learning with margin sampling on a random forest.

    Args:
        data, target: Full data set; rows at ``train_idx`` are removed to
            form the query pool.
        X_train, y_train: Initial labelled data for the learner.
        X_full, y_full: Data passed to ``self.performance_measure`` after
            every query.
        train_idx: Positions in ``data`` that are already labelled.

    Returns:
        list: The F-score reported by ``self.performance_measure`` after each
        of the ``self.query_number`` queries.
    """
    acc = []
    X_pool = np.delete(data, train_idx, axis=0)
    y_pool = np.delete(target, train_idx)
    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            query_strategy=margin_sampling,
                            X_training=X_train,
                            y_training=y_train)

    for _ in range(self.query_number):
        query_idx, _ = learner.query(X_pool)
        learner.teach(X=X_pool[query_idx].reshape(1, -1),
                      y=y_pool[query_idx].reshape(1, ))

        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)

        # Only the F-score is tracked. The accuracy that used to be computed
        # on (data, target) here was overwritten before use, so that extra
        # full-dataset prediction pass has been dropped.
        _precision, _recall, fscore, _support = self.performance_measure(
            learner, X_full, y_full)
        acc.append(fscore)
        print('%0.3f' % (fscore), end=",")

    return acc
def al_pool(self, data, target, X_train, y_train, X_full, y_full, train_idx):
    """Pool-based active learning with a random forest and default strategy.

    Rows at ``train_idx`` are removed from ``data`` / ``target`` to form the
    pool. For each of ``self.query_number`` queries the selected sample is
    taught to the learner and the learner's score on ``(data, target)`` is
    recorded and printed.

    Returns:
        list: Score on ``(data, target)`` after each query.
    """
    scores = []
    pool_X = np.delete(data, train_idx, axis=0)
    pool_y = np.delete(target, train_idx)
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        X_training=X_train,
        y_training=y_train
    )

    n_queries = self.query_number
    for _ in range(n_queries):
        picked, _instance = learner.query(pool_X)
        sample = pool_X[picked].reshape(1, -1)
        label = pool_y[picked].reshape(1, )
        learner.teach(X=sample, y=label)

        # remove queried instance from pool
        pool_X = np.delete(pool_X, picked, axis=0)
        pool_y = np.delete(pool_y, picked)

        learner_score = learner.score(data, target)

        # NOTE(review): the hold-out metrics below are computed but never
        # used; the split is kept because it still draws from the random
        # state.
        _, X_holdout, _, y_holdout = train_test_split(
            X_full, y_full, test_size=0.30)
        holdout_pred = learner.predict(X_holdout)
        _precision, _recall, _fscore, _support = score(y_holdout, holdout_pred)

        scores.append(learner_score)
        print('%0.3f' % (learner_score), end=",")

    return scores
def RandomLearner(X, y):
    """Run a modAL learner with the random query strategy over the data set.

    An SVM with default parameters is used as the estimator, and the learner
    starts from two fixed labelled points (one per class). Every point of
    ``X`` is then queried exactly once.

    Input: The data set X and the corresponding labels.

    Return: A one-dimensional numpy array with the accuracy on (X, y) after
    each query, and the number of data points whose label was queried from
    the oracle.
    """
    seed_points = np.array([[0.5, 4.0], [2.0, 1.0]])
    seed_labels = np.array([[0], [1]])
    random_learner = ActiveLearner(estimator=SVC(),
                                   query_strategy=RandomQuery,
                                   X_training=seed_points,
                                   y_training=seed_labels)

    # Work on copies so the caller's arrays are never shrunk.
    pool, pool_labels = copy.deepcopy(X), copy.deepcopy(y)
    history = []
    n_asked = 0
    while len(pool) != 0:
        picked, _ = random_learner.query(pool)
        n_asked += 1

        new_point = pool[picked].reshape(1, 2)
        new_label = pool_labels[picked].reshape(1, 1)
        random_learner._add_training_data(new_point, new_label)
        random_learner._fit_to_known()

        pool = np.delete(pool, picked, axis=0)
        pool_labels = np.delete(pool_labels, picked, axis=0)

        history.append(random_learner.score(X, y))

    return np.array(history), n_asked
def modAL_uncertainty(X, y, n_queries):
    """Run ``n_queries`` rounds of the learner's default query strategy.

    The learner wraps a liblinear one-vs-rest logistic regression seeded with
    rows 0, 50 and 100 of ``X``. Queried rows are taught back but are not
    removed from ``X``; nothing is returned.
    """
    base_model = LogisticRegression(solver='liblinear', n_jobs=1,
                                    multi_class='ovr')
    seed_X = X[[0, 50, 100]]
    seed_y = y[[0, 50, 100]]
    modAL_learner = ActiveLearner(base_model,
                                  X_training=seed_X,
                                  y_training=seed_y)

    for _round in range(n_queries):
        picked, _instance = modAL_learner.query(X)
        modAL_learner.teach(X[picked], y[picked])
def modAL_EER(X, y, n_queries):
    """Run ``n_queries`` rounds of expected-error-reduction querying.

    The learner wraps a liblinear one-vs-rest logistic regression seeded with
    rows 0, 50 and 100 of ``X``. Queried rows are taught back but are not
    removed from ``X``; nothing is returned.
    """
    base_model = LogisticRegression(solver='liblinear', n_jobs=1,
                                    multi_class='ovr')
    seed_X = X[[0, 50, 100]]
    seed_y = y[[0, 50, 100]]
    modAL_learner = ActiveLearner(base_model,
                                  query_strategy=expected_error_reduction,
                                  X_training=seed_X,
                                  y_training=seed_y)

    for _round in range(n_queries):
        picked, _instance = modAL_learner.query(X)
        modAL_learner.teach(X[picked], y[picked])
def al_pool_proba(self, data, target, X_train, y_train, X_full, y_full,
                  train_idx, classifier, sampling_strategy, proba):
    """Pool-based active learning with a noisy (label-flipping) oracle.

    For each of ``self.query_number`` queries one pool sample is selected.
    When ``randint(0, 100) <= proba`` the label taught to the learner is
    flipped (1 becomes 0, anything else becomes 1); otherwise the true label
    is used.

    Args:
        data, target: Full data set; rows at ``train_idx`` are removed to
            form the query pool.
        X_train, y_train: Initial labelled data for the learner.
        X_full, y_full: Data passed to ``self.performance_measure``.
        train_idx: Positions in ``data`` that are already labelled.
        classifier: Estimator wrapped by the learner.
        sampling_strategy: Query strategy for the learner.
        proba: Flip threshold compared against ``randint(0, 100)``.

    Returns:
        tuple: ``(acc, pre, rec, fs)`` lists with one entry per query.
    """
    acc, pre, rec, fs = [], [], [], []

    X_pool = np.delete(data, train_idx, axis=0)
    y_pool = np.delete(target, train_idx)

    learner = ActiveLearner(
        estimator=classifier,
        query_strategy=sampling_strategy,
        X_training=X_train,
        y_training=y_train
    )

    for _ in range(self.query_number):
        query_idx, _ = learner.query(X_pool)
        labeled_y = y_pool[query_idx].reshape(1, )

        if randint(0, 100) <= proba:
            # Simulated annotator mistake. The earlier in-place writes of the
            # form ``y_pool[query_idx][0] = ...`` only modified a temporary
            # copy produced by the array indexing (and the entry is removed
            # from the pool below anyway), so they have been dropped.
            flipped = 0 if labeled_y[0] == 1 else 1
            labeled_y = np.array(flipped).reshape(1, )

        learner.teach(
            X=X_pool[query_idx].reshape(1, -1),
            y=labeled_y
        )

        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)

        learner_score = learner.score(data, target)
        precision, recall, fscore, support, accuracy = \
            self.performance_measure(learner, X_full, y_full)

        acc.append(accuracy)
        pre.append(precision)
        rec.append(recall)
        fs.append(fscore)
        print('%0.3f' % (learner_score), end=",")

    return acc, pre, rec, fs
def active_learner(query_stra, N_query):
    """Run ``N_query`` single-sample queries with an 8-nearest-neighbour model.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``; the query pool is taken from the values of ``X_test`` /
    ``y_test``. After each query the score on ``(X_test, y_test)`` is printed
    and appended to the module-level ``performance_history``.

    Args:
        query_stra: Query strategy passed to ``ActiveLearner``.
        N_query: Number of queries to perform.
    """
    learner = ActiveLearner(estimator=KNeighborsClassifier(n_neighbors=8),
                            X_training=X_train,
                            y_training=y_train,
                            query_strategy=query_stra)
    # The return value is not used; the call is kept as-is.
    learner.predict(X_test)

    pool_X = X_test.values
    pool_y = y_test.values

    for _ in range(N_query):
        picked, _instance = learner.query(pool_X)
        sample = pool_X[picked].reshape(1, -1)
        label = pool_y[picked].reshape(1, )
        learner.teach(X=sample, y=label)

        pool_X = np.delete(pool_X, picked, axis=0)
        pool_y = np.delete(pool_y, picked)

        model_accuracy = learner.score(X_test, y_test)
        print('Accuracy: {acc:0.4f} \n'.format(acc=model_accuracy))
        performance_history.append(model_accuracy)
def al_pool(data, target, X_train, y_train, X_full, y_full, train_idx,
            n_queries=1500, n_initial=200):
    """Pool-based active learning with a random forest; prints accuracy.

    Args:
        data, target: Full data set; rows at ``train_idx`` are removed to
            form the query pool, and the learner is scored on the full set
            after every query.
        X_train, y_train: Labelled data; only the first ``n_initial`` rows
            seed the learner.
        X_full, y_full: Unused; kept for signature compatibility.
        train_idx: Positions in ``data`` that are already labelled.
        n_queries: Maximum number of queries (previously hard-coded to 1500).
        n_initial: Number of seed rows (previously hard-coded to 200).
    """
    X_pool = np.delete(data, train_idx, axis=0)
    y_pool = np.delete(target, train_idx)
    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            X_training=X_train[:n_initial],
                            y_training=y_train[:n_initial])

    for _ in range(n_queries):
        # Stop cleanly once every pool sample has been labelled instead of
        # querying an empty pool.
        if len(X_pool) == 0:
            break

        query_idx, _ = learner.query(X_pool)
        learner.teach(X=X_pool[query_idx].reshape(1, -1),
                      y=y_pool[query_idx].reshape(1, ))

        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)

        learner_score = learner.score(data, target)
        print('%0.3f' % (learner_score), end=",")
def run_model(X, y, test_size, rep_times, n_queries, estimator, fd):
    """Repeat an active-learning experiment and summarise the test error.

    For each of ``rep_times`` repetitions the data is split with
    stratification on ``y``. Column 0 of the held-out part of ``X`` is used
    as the test label (cast to int32) and is removed from the features of
    both parts. The learner starts from the set returned by
    ``get_init_train`` and performs ``n_queries`` single-sample queries,
    recording ``error_calculation`` on the test part after each one.

    NOTE(review): the training labels come from ``y`` while the test labels
    come from column 0 of ``X`` -- confirm this asymmetry is intended.

    Args:
        fd: Unused by this function.

    Returns:
        tuple: ``(avg_err, sd)`` -- per-query mean error and per-query
        standard deviation divided by ``sqrt(rep_times)``.
    """
    errors_by_query = [[] for _ in range(n_queries)]

    for _rep in range(rep_times):
        train_part, test_part, train_labels, _unused = train_test_split(
            X, y, test_size=test_size, stratify=y)

        train_features = train_part[:, 1:]
        test_labels = test_part[:, 0].astype('int32')
        test_features = test_part[:, 1:]

        seed_X, seed_y, pool_X, pool_y = get_init_train(train_features,
                                                        train_labels)

        learner = ActiveLearner(estimator=estimator,
                                X_training=seed_X,
                                y_training=seed_y)

        # prediction with no query (result is not recorded)
        error_calculation(learner.predict(test_features), test_labels)

        for query_no in range(n_queries):
            picked, _instance = learner.query(pool_X)
            learner.teach(X=pool_X[picked].reshape(1, -1),
                          y=pool_y[picked].reshape(1, ))
            pool_X = np.delete(pool_X, picked, axis=0)
            pool_y = np.delete(pool_y, picked)

            err = error_calculation(learner.predict(test_features),
                                    test_labels)
            errors_by_query[query_no].append(err)

    avg_err = [np.mean(errs) for errs in errors_by_query]
    sd = [np.std(errs) / np.sqrt(rep_times) for errs in errors_by_query]
    return avg_err, sd
def active_learn(df1, first_item_index_of_each_category):
    """Sweep seed-set sizes, then run 502 pool-based queries; prints scores.

    Column 0 of ``df1`` is skipped when building the feature matrix and the
    ``'label'`` column supplies the targets. Rows at
    ``first_item_index_of_each_category`` form the labelled set; the rest
    form the pool.

    NOTE(review): the source arrived with its indentation stripped; the
    "Initial prediction accuracy" print is assumed to sit inside the sweep
    loop and the separator prints after it -- confirm against the original.
    """
    train_idx = first_item_index_of_each_category

    # Features start at the second column; the first column is the label.
    features = df1.values[:, 1:]
    labels = df1['label'].values

    X_train = features[train_idx]
    y_train = labels[train_idx]
    X_pool = np.delete(features, train_idx, axis=0)
    y_pool = np.delete(labels, train_idx)

    for seed_size in range(1001, 1500):
        learner = ActiveLearner(estimator=RandomForestClassifier(),
                                X_training=X_train[:seed_size],
                                y_training=y_train[:seed_size])
        print('Initial prediction accuracy: %f' %
              learner.score(features, labels))

    for _ in range(5):
        print("================================")

    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            X_training=X_train[:1001],
                            y_training=y_train[:1001])

    n_queries = 502
    for _ in range(n_queries):
        picked, _instance = learner.query(X_pool)
        learner.teach(X=X_pool[picked].reshape(1, -1),
                      y=y_pool[picked].reshape(1, ))

        # remove queried instance from pool
        X_pool = np.delete(X_pool, picked, axis=0)
        y_pool = np.delete(y_pool, picked)

        learner_score = learner.score(features, labels)
        print('%f' % (learner_score))
def CALLearner(X, y):
    """Run a modAL learner with the CAL query strategy over the data set.

    An SVM with default parameters is the estimator. The labelled pool
    starts with two fixed points belonging to two classes; the unlabelled
    pool starts as a copy of ``X`` / ``y``.

    Input: The data set X and the corresponding labels.

    Return: A one-dimensional numpy array with the accuracy on (X, y)
    recorded each time the true label of a point was queried from the
    oracle, and the number of points queried from the oracle.
    """
    CAL_learner = ActiveLearner(estimator=SVC(),
                                query_strategy=CAL,
                                X_training=np.array([[0.5, 4.0], [2.0, 1.0]]),
                                y_training=np.array([[0], [1]]))

    # In the worst case every point of the unlabelled pool is queried.
    max_oracle_calls = len(y)
    oracle_calls = 0
    recorded = []

    labelled, labelled_y = (np.array([[0.5, 4.0], [2.0, 1.0]]),
                            np.array([[0], [1]]))
    unlabelled, unlabelled_y = copy.deepcopy(X), copy.deepcopy(y)

    while oracle_calls < max_oracle_calls and len(unlabelled) != 0:
        idx, y_idx, is_queried = CAL_learner.query(labelled, labelled_y,
                                                   unlabelled, unlabelled_y)

        CAL_learner._add_training_data(unlabelled[idx].reshape(1, 2),
                                       y_idx.reshape(1, 1))
        CAL_learner._fit_to_known()

        # Move the point from the unlabelled pool to the labelled pool.
        labelled = np.vstack((labelled, unlabelled[idx].reshape(1, 2)))
        labelled_y = np.vstack((labelled_y, y_idx.reshape(1, 1)))
        unlabelled = np.delete(unlabelled, idx, axis=0)
        unlabelled_y = np.delete(unlabelled_y, idx)

        current_acc = CAL_learner.score(X, y)
        if not is_queried:
            continue
        oracle_calls += 1
        recorded.append(current_acc)

    return np.array(recorded), oracle_calls
def run_exp_music(intup):
    """Run one entropy-sampling experiment and return its test-score history.

    Reads the module-level ``X_train``, ``X_test``, ``y_train``, ``y_test``,
    ``n_seed`` and ``query_budget``. The first ``n_seed`` training rows seed
    a 10-tree random forest; the remaining rows form the pool. Queried rows
    are taught to the learner but are not removed from the pool.

    Args:
        intup: A 3-tuple ``(rep, i, p)``; the values are unpacked but not
            otherwise used here.

    Returns:
        numpy.ndarray: Score on the test set after each of the
        ``query_budget - n_seed`` queries.
    """
    global X_train, X_test, y_train, y_test
    rep, i, p = intup

    seed_X, seed_y = X_train[:n_seed], y_train[:n_seed]
    pool_X, pool_y = X_train[n_seed:], y_train[n_seed:]

    # Initializing the learner
    learner = ActiveLearner(
        estimator=RandomForestClassifier(n_estimators=10),
        query_strategy=entropy_sampling,
        X_training=seed_X, y_training=seed_y
    )

    # Run active learning and record history of test accuracy
    n_rounds = query_budget - n_seed
    history = np.zeros(n_rounds)
    for round_no in range(n_rounds):
        picked, _instance = learner.query(pool_X)
        learner.teach(pool_X[picked], pool_y[picked])
        history[round_no] = learner.score(X_test, y_test)

    return history
def run_model(X, y, test_size, rep_times, n_queries, estimator, fd):
    """Repeat an active-learning experiment and average the test error.

    For each of ``rep_times`` repetitions the data is split with
    stratification on ``y``, the learner starts from the set returned by
    ``get_init_train`` and performs ``n_queries`` single-sample queries,
    recording ``error_calculation`` on the test split after each one.

    Args:
        fd: Unused by this function.

    Returns:
        list: Mean error over repetitions for each query position.
    """
    errors_by_query = [[] for _ in range(n_queries)]

    for rep in range(rep_times):
        print('exp:', rep)
        train_X, test_X, train_y, test_y = train_test_split(
            X, y, test_size=test_size, stratify=y)

        # get the initial training set and the remaining query pool
        seed_X, seed_y, pool_X, pool_y = get_init_train(train_X, train_y)

        learner = ActiveLearner(estimator=estimator,
                                X_training=seed_X,
                                y_training=seed_y)

        # prediction with no query (result is not recorded)
        error_calculation(learner.predict(test_X), test_y)

        for query_no in range(n_queries):
            picked, _instance = learner.query(pool_X)
            learner.teach(X=pool_X[picked].reshape(1, -1),
                          y=pool_y[picked].reshape(1, ))
            pool_X = np.delete(pool_X, picked, axis=0)
            pool_y = np.delete(pool_y, picked)

            err = error_calculation(learner.predict(test_X), test_y)
            errors_by_query[query_no].append(err)

    return [np.mean(errs) for errs in errors_by_query]
def RandomLearner(X, y):
    """Run a modAL learner with the random query strategy over the data set.

    An SVM (``gamma='scale'``) is the estimator, and the learner starts from
    two fixed labelled points. At most ``len(y)`` points are queried, one per
    iteration, each removed from the unlabelled pool before the next query.

    Input: The data set X and the corresponding labels.

    Return: A one-dimensional numpy array with the accuracy on (X, y) after
    each query, and the number of data points whose label was queried from
    the oracle.
    """
    random_learner = ActiveLearner(
        estimator=SVC(gamma='scale'),
        query_strategy=RandomQuery,
        X_training=np.array([[0.5, 4.0], [2.0, 1.0]]),
        y_training=np.array([[0], [1]]))

    history = []
    budget = len(y)
    n_asked = 0
    last_pick = None  # nothing has been queried before the first iteration

    while n_asked < budget:
        if last_pick is None:
            pool, pool_labels = X, y
        else:
            # Drop the point labelled in the previous iteration.
            pool = np.delete(pool, last_pick, axis=0)
            pool_labels = np.delete(pool_labels, last_pick)

        if not len(pool):
            break

        last_pick, _instance = random_learner.query(pool)

        # add to training data
        random_learner._add_training_data(
            pool[last_pick, :].reshape(-1, 2),
            pool_labels[last_pick].reshape(-1, 1))
        # fit on training data
        random_learner._fit_to_known()

        # accuracy of the learned estimator on the entire dataset
        history.append(random_learner.score(X, y))
        n_asked += 1

    return np.array(history), n_asked
def run_exp(intup):
    """Run one entropy-sampling experiment on noise-corrupted features.

    Reads the module-level ``X_train``, ``X_test``, ``y_train``, ``y_test``,
    ``n_seed`` and ``query_budget``. ``utils.add_gaussian_noise`` is applied
    to the training features with parameter ``p``; labels are left
    untouched. The first ``n_seed`` rows seed a 10-tree random forest and
    the rest form the pool. Queried rows are taught to the learner but are
    not removed from the pool.

    Args:
        intup: A 3-tuple ``(rep, i, p)``; only ``p`` is used.

    Returns:
        numpy.ndarray: Score on the test set after each of the
        ``query_budget - n_seed`` queries.
    """
    global X_train, X_test, y_train, y_test
    rep, i, p = intup

    # Make noisy data, simulate pool-based case
    noisy_X = utils.add_gaussian_noise(X_train, p)
    noisy_y = y_train

    seed_X, seed_y = noisy_X[:n_seed], noisy_y[:n_seed]
    pool_X, pool_y = noisy_X[n_seed:], noisy_y[n_seed:]

    # Initializing the learner
    learner = ActiveLearner(
        estimator=RandomForestClassifier(n_estimators=10),
        query_strategy=entropy_sampling,
        X_training=seed_X, y_training=seed_y
    )

    # Run active learning and record history of test accuracy
    n_rounds = query_budget - n_seed
    history = np.zeros(n_rounds)
    for round_no in range(n_rounds):
        picked, _instance = learner.query(pool_X)
        learner.teach(pool_X[picked], pool_y[picked])
        history[round_no] = learner.score(X_test, y_test)

    return history
y_training=y_train) # visualizing initial prediction with plt.style.context('seaborn-white'): plt.figure(figsize=(7, 7)) prediction = learner.predict(data) plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50) plt.title('Initial accuracy: %f' % learner.score(data, target)) plt.show() print('Accuracy before active learning: %f' % learner.score(data, target)) # pool-based sampling n_queries = 30 for idx in range(n_queries): query_idx, query_instance = learner.query(X_pool) learner.teach(X=X_pool[query_idx].reshape(1, -1), y=y_pool[query_idx].reshape(1, )) # remove queried instance from pool X_pool = np.delete(X_pool, query_idx, axis=0) y_pool = np.delete(y_pool, query_idx) print('Accuracy after query no. %d: %f' % (idx + 1, learner.score(data, target))) # plotting final prediction with plt.style.context('seaborn-white'): plt.figure(figsize=(7, 7)) prediction = learner.predict(data) plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50) plt.title('Classification accuracy after %i queries: %f' % (n_queries, learner.score(data, target)))
# Query loop in which every learner proposes a pool index and one proposal is
# drawn at random according to `weights`.
n_queries = 4000
loop = 0
strategy_count = np.zeros(len(learners))
x = []

# Baseline before any query: the weakest learner's score on the training data.
score = [learner.score(train_features, train_labels) for learner in learners]
unqueried_score = np.min(score)
performance_history = [unqueried_score]

for _ in range(n_queries):
    opinions = []
    learner_id = 0
    for learner in learners:
        # The learner at position 1 is asked explicitly for one instance;
        # all others use their strategy's default.
        if learner_id != 1:
            query_idx, query_instance = learner.query(X_pool)
        else:
            query_idx, query_instance = learner.query(X_pool, n_instances=1)
        opinions.append(query_idx)
        learner_id += 1

    # Pick whose proposal to follow, weighted by `weights`.
    opt_idx = np.random.choice(range(len(opinions)),
                               p=weights,
                               size=1,
                               replace=True)[0]
    strategy_count[opt_idx] += 1
    x.append(opt_idx)
    selected_idx = opinions[opt_idx]
    print("selected Index: ", selected_idx)
# Bagging-style ensemble: several bootstrapped learners combined in a
# Committee, which is then itself wrapped in an ActiveLearner.

# initial training data: 100 random pixels (drawn with replacement, so
# duplicates are possible)
initial_idx = np.random.choice(range(len(X_pool)), size=100)

# initializing the learners
# NOTE: `estimator`, `X_training` and `y_training` are the keyword names used
# by every other ActiveLearner call in this file; the former `predictor`,
# `X_initial` and `y_initial` spellings belong to an older modAL API.
n_learners = 3
learner_list = []
for _ in range(n_learners):
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        X_training=X_pool[initial_idx],
        y_training=y_pool[initial_idx],
        bootstrap_init=True
    )
    learner_list.append(learner)

# assembling the Committee
committee = Committee(learner_list)

# ensemble active learner from the Committee
ensemble_learner = ActiveLearner(
    estimator=committee
)

query_idx, query_instance = ensemble_learner.query(X_pool)

# ...
# ... obtain label from the Oracle ...
# ...

ensemble_learner.teach(X_pool[query_idx], y_pool[query_idx], bootstrap=True)
def activeLearning(method, X_train, Y_train, X_test, Y_test, K):
    """Run up to 101 active-learning iterations and tabulate the metrics.

    Args:
        method: One of "Entropy Sampling", "Margin Sampling",
            "Uncertainty Sampling", "Average Confidence", "RDS" or "MST-BE".
        X_train, Y_train: Training data from which the initial labelled set
            and the query pool are taken.
        X_test, Y_test: Evaluation data; ``X_test.values`` is passed to
            ``predict``.
        K: Number of samples labelled per iteration (and, for the non-graph
            methods, the size of the random initial set).

    Returns:
        pandas.DataFrame: One row per completed iteration with timing,
        accuracy, macro F1/precision/recall, the number of distinct predicted
        classes and the number of queried samples the model had mislabelled.

    Raises:
        ValueError: If ``method`` is not one of the supported names
            (previously this surfaced later as an unbound-local error).
    """
    interations = 101
    random.seed(0)
    uses_graph_strategy = method in ["RDS", "MST-BE"]

    # Define initial labelled indexes to train the classifier
    if uses_graph_strategy:
        idx, root_idx, X_initial, Y_initial, X_pool, Y_pool = \
            activeLearningLib_Object.get_samples(
                X_train, Y_train,
                n_clusters=int(len(np.unique(Y_train)) * 2),
                strategy=method)
        labeled_idx = np.empty(0, int)
    else:
        idx = np.asarray(random.sample(range(0, len(X_train)), k=K))
        X_initial, Y_initial = X_train[idx], Y_train[idx]
        X_pool = np.delete(X_train, idx, axis=0)
        Y_pool = np.delete(Y_train, idx, axis=0)

    # Initialize the active learner. The strategy is picked with an if/elif
    # chain so that only the name belonging to the requested method has to
    # be defined.
    t = time.time()
    if method == "Entropy Sampling":
        query_strategy = entropy_sampling
    elif method == "Margin Sampling":
        query_strategy = margin_sampling
    elif method == "Uncertainty Sampling":
        query_strategy = uncertainty_sampling
    elif method == "Average Confidence":
        query_strategy = avg_confidence
    elif method == "RDS":
        query_strategy = root_distance_based_selection_strategy
    elif method == "MST-BE":
        query_strategy = disagree_labels_edges_idx_query_strategy
    else:
        raise ValueError("Unknown active learning method: {!r}".format(method))
    learner = ActiveLearner(estimator=SVC(probability=True),
                            query_strategy=query_strategy,
                            X_training=X_initial,
                            y_training=Y_initial)
    timeToTrain = time.time() - t

    results = []
    for run in range(interations):
        if K > len(idx):
            break

        if uses_graph_strategy:
            t = time.time()
            query_idx, idx = learner.query(X_pool, n_instances=K, idx=idx,
                                           labeled_idx=labeled_idx,
                                           y_root=Y_initial)
            timeToSelect = time.time() - t
            if query_idx is None or len(query_idx) < K:
                break
            labeled_idx = np.append(labeled_idx, query_idx)
            delete_from_pool = False
        elif run == 0:
            # First iteration of the classic strategies: evaluate the
            # initial model only, without querying.
            query_idx = None
            counter = len(Y_initial)
            timeToSelect = 0
        else:
            try:
                t = time.time()
                query_idx, idx = learner.query(X_pool, n_instances=K)
                timeToSelect = time.time() - t
            except Exception:
                # Best effort: stop the experiment when the strategy fails
                # and return what has been collected ("deu erro" is
                # Portuguese for "an error occurred").
                print("deu erro")
                break
            delete_from_pool = True

        if query_idx is not None:
            # Count how many queried samples the current model mislabels.
            predsCorrecteds = learner.predict(X_pool[query_idx])
            counter = sum(
                1 for predicted, actual in zip(predsCorrecteds,
                                               Y_pool[query_idx].flatten())
                if predicted != actual)

            t = time.time()
            learner.teach(X=X_pool[query_idx], y=Y_pool[query_idx])
            timeToTrain = time.time() - t

            # Only the classic strategies remove queried rows from the pool;
            # the graph strategies receive ``labeled_idx`` instead.
            if delete_from_pool:
                X_pool = np.delete(X_pool, query_idx, axis=0)
                Y_pool = np.delete(Y_pool, query_idx, axis=0)

        t = time.time()
        preds = learner.predict(X_test.values)
        timeToTest = time.time() - t

        acc = accuracy_score(Y_test, preds)
        f1score = f1_score(Y_test, preds, average='macro')
        precision = precision_score(Y_test, preds, average='macro')
        recall = recall_score(Y_test, preds, average='macro')
        knowClasses = len(set(preds.tolist()))

        print("Run {}: Acc: {}".format(run + 1, acc))
        print("Know Classes: {}".format(knowClasses))
        print("Corrected Labels: {}".format(counter))
        print("Time to Select: {}".format(timeToSelect))

        results.append([
            run + 1, K,
            np.round(timeToTrain, 2),
            np.round(timeToTest, 2),
            np.round(timeToSelect, 2),
            np.round(acc * 100, 2),
            np.round(f1score * 100, 2),
            np.round(precision * 100, 2),
            np.round(recall * 100, 2), knowClasses, counter
        ])

    results_df = pd.DataFrame(results,
                              columns=[
                                  "iteration", "k-value", "time-to-train",
                                  "time-to-test", "time-to-select",
                                  "accuracy", "f1-score", "precision",
                                  "recall", "knowClasses", "correctedLabels"
                              ])
    return results_df
def al_rank(self, data, target, X_train, y_train, X_full, y_full, train_idx,
            N_RAW_SAMPLES=80, proba=5, proba_e=5, proba_n=20, e=1, n=4):
    """Batch active learning with two annotator noise levels per batch.

    Each query returns a batch of ``e + n`` samples. The first ``e`` labels
    are flipped when ``randint(0, 100) <= proba_e`` and the remaining ``n``
    when ``randint(0, 100) <= proba_n``; a flipped label becomes 0 if it was
    1 and 1 otherwise.

    Args:
        data, target, train_idx, proba: Unused; kept for signature
            compatibility.
        X_train, y_train: Initial labelled data for the learner.
        X_full, y_full: Data used for scoring and from which the query pool
            is drawn.
        N_RAW_SAMPLES: Total number of samples to label; the number of
            query rounds is ``N_RAW_SAMPLES // (e + n)``.
        proba_e, proba_n: Flip thresholds for the two label groups.
        e, n: Sizes of the two label groups within a batch.

    Returns:
        tuple: ``(acc, pre, rec, fs)`` lists with one entry per query round,
        as reported by ``self.performance_measure``.

    Raises:
        ValueError: If ``e + n`` is smaller than 1.
    """
    # The batch size has to match the number of labels produced per round;
    # it used to be a fixed 5, which only worked for e + n == 5.
    batch_size = e + n
    if batch_size < 1:
        raise ValueError("e + n must be at least 1")

    def _oracle_label(true_label, flip_percent):
        # Draw first, exactly once per sample, so the random stream is
        # consumed in sample order.
        if randint(0, 100) <= flip_percent:
            return 0 if true_label == 1 else 1
        return true_label

    acc, pre, rec, fs = [], [], [], []

    preset_batch = partial(uncertainty_batch_sampling, n_instances=batch_size)
    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            X_training=X_train,
                            y_training=y_train,
                            query_strategy=preset_batch)

    n_rounds = N_RAW_SAMPLES // batch_size

    # Hold five random rows out of the query pool. ``high`` is exclusive, so
    # it must be the row count itself; ``+ 1`` could yield an out-of-range
    # index.
    n_labeled_examples = X_full.shape[0]
    training_indices = np.random.randint(low=0,
                                         high=n_labeled_examples,
                                         size=5)
    X_pool = np.delete(X_full, training_indices, axis=0)
    y_pool = np.delete(y_full, training_indices, axis=0)

    flip_schedule = [proba_e] * e + [proba_n] * n

    for index in range(n_rounds):
        query_index, _ = learner.query(X_pool)
        X = X_pool[query_index]

        noisy = [
            _oracle_label(y_pool[query_index[pos]], flip_percent)
            for pos, flip_percent in enumerate(flip_schedule)
        ]
        labeled_y = np.append(np.array([]), noisy)

        # Teach our ActiveLearner model the records it has requested.
        learner.teach(X=X, y=labeled_y)

        # Remove the queried instances from the unlabeled pool.
        X_pool = np.delete(X_pool, query_index, axis=0)
        y_pool = np.delete(y_pool, query_index)

        # Calculate and report our model's accuracy.
        model_accuracy = learner.score(X_full, y_full)
        print('Accuracy after query {n}: {acc:0.4f}'.format(
            n=index + 1, acc=model_accuracy))

        precision, recall, fscore, support, accuracy = \
            self.performance_measure(learner, X_full, y_full)
        acc.append(accuracy)
        pre.append(precision)
        rec.append(recall)
        fs.append(fscore)

    return acc, pre, rec, fs
def al_pool_proba(self, data, target, X_train, y_train, X_full, y_full, train_idx,
                  classifier, sampling_strategy, proba=10, proba_e=5, proba_n=20,
                  e=1, n=4):
    """Pool-based active learning with two tiers of label noise.

    The pool is ``data`` / ``target`` with ``train_idx`` removed.  In every
    outer round the learner issues ``e`` single queries whose labels are
    flipped with threshold ``proba_e`` followed by ``n`` single queries
    flipped with threshold ``proba_n``.  After each single query the learner
    is taught, the instance leaves the pool and the metrics returned by
    ``self.performance_measure`` on ``X_full`` / ``y_full`` are recorded.

    Args:
        data: Full feature matrix the pool is cut from.
        target: Labels matching ``data``.
        X_train: Initial training features.
        y_train: Initial training labels.
        X_full: Features used for evaluation.
        y_full: Labels used for evaluation.
        train_idx: Indices of ``data`` / ``target`` excluded from the pool.
        classifier: Estimator wrapped by the ``ActiveLearner``.
        sampling_strategy: Query strategy passed to the ``ActiveLearner``.
        proba: Unused in this method.
        proba_e: Flip threshold for the first ``e`` queries of a round.
        proba_n: Flip threshold for the next ``n`` queries of a round.
        e: Number of ``proba_e`` queries per round.
        n: Number of ``proba_n`` queries per round.

    Returns:
        Four lists ``(acc, pre, rec, fs)`` with one entry per single query.
    """
    acc, pre, rec, fs = [], [], [], []

    X_pool = np.delete(data, train_idx, axis=0)
    y_pool = np.delete(target, train_idx)

    learner = ActiveLearner(estimator=classifier,
                            query_strategy=sampling_strategy,
                            X_training=X_train, y_training=y_train)

    # Flip threshold of every query within one round, in query order.
    thresholds = [proba_e] * e + [proba_n] * n

    # NOTE(review): the round counter advances by a fixed 5 regardless of
    # e + n - confirm this is intended for non-default e / n.
    for _ in range(0, self.query_number, 5):
        for threshold in thresholds:
            query_idx, _instance = learner.query(X_pool)
            pool_label = y_pool[query_idx]

            if randint(0, 100) <= threshold:
                # Corrupt the label: 1 becomes 0, anything else becomes 1.
                given_label = 0 if pool_label == 1 else 1
            else:
                given_label = pool_label
            labeled_y = np.append(np.array([]), given_label)

            learner.teach(X=X_pool[query_idx].reshape(1, -1), y=labeled_y)

            # Remove the queried instance from the pool.
            X_pool = np.delete(X_pool, query_idx, axis=0)
            y_pool = np.delete(y_pool, query_idx)

            precision, recall, fscore, support, accuracy = self.performance_measure(
                learner, X_full, y_full)
            acc.append(accuracy)
            pre.append(precision)
            rec.append(recall)
            fs.append(fscore)

    return acc, pre, rec, fs
) # visualizing initial prediction with plt.style.context('seaborn-white'): plt.figure(figsize=(7, 7)) prediction = learner.predict(iris['data']) plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50) plt.title('Initial accuracy: %f' % learner.score(iris['data'], iris['target'])) plt.show() print('Accuracy before active learning: %f' % learner.score(iris['data'], iris['target'])) # pool-based sampling n_queries = 20 for idx in range(n_queries): query_idx, query_instance = learner.query(X_pool) learner.teach( X=X_pool[query_idx].reshape(1, -1), y=y_pool[query_idx].reshape(1, ) ) # remove queried instance from pool X_pool = np.delete(X_pool, query_idx, axis=0) y_pool = np.delete(y_pool, query_idx) print('Accuracy after query no. %d: %f' % (idx+1, learner.score(iris['data'], iris['target']))) # plotting final prediction with plt.style.context('seaborn-white'): plt.figure(figsize=(7, 7)) prediction = learner.predict(iris['data']) plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50) plt.title('Classification accuracy after %i queries: %f' % (n_queries, learner.score(iris['data'], iris['target'])))
y_pool = deepcopy(y_full) # assembling initial training set initial_idx = [0, im_height-1, im_height*(im_height-1), -1, im_width//2 + im_height//2*im_height] X_train, y_train = X_pool[initial_idx], y_pool[initial_idx] # create an ActiveLearner instance learner = ActiveLearner( predictor=RandomForestClassifier(), X_initial=X_train, y_initial=y_train ) initial_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_width) n_queries = 100 for round_idx in range(n_queries): query_idx, query_inst = learner.query(X_pool) learner.teach(X_pool[query_idx].reshape(1, -1), y_pool[query_idx].reshape(-1, )) X_pool = np.delete(X_pool, query_idx, axis=0) y_pool = np.delete(y_pool, query_idx) final_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_width) # learning with randomly selected queries instead of active learning random_idx = initial_idx + list(np.random.choice(range(len(X_full)), n_queries, replace=False)) X_train, y_train = X_full[initial_idx], y_full[initial_idx] random_learner = ActiveLearner( predictor=RandomForestClassifier(), X_initial=X_train, y_initial=y_train ) with plt.style.context('seaborn-white'):
X_initial=X_initial.reshape(-1, 1), y_initial=y_initial.reshape(-1, 1) ) # plotting the initial estimation with plt.style.context('seaborn-white'): plt.figure(figsize=(14, 7)) x = np.linspace(0, 20, 1000) pred, std = regressor.predict(x.reshape(-1,1), return_std=True) plt.plot(x, pred) plt.fill_between(x, pred.reshape(-1, )-std, pred.reshape(-1, )+std, alpha=0.2) plt.scatter(X, y, c='k') plt.title('Initial estimation based on %d points' % n_initial) plt.show() # active learning n_queries = 10 for idx in range(n_queries): query_idx, query_instance = regressor.query(X) regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1)) # plotting after active learning with plt.style.context('seaborn-white'): plt.figure(figsize=(14, 7)) x = np.linspace(0, 20, 1000) pred, std = regressor.predict(x.reshape(-1,1), return_std=True) plt.plot(x, pred) plt.fill_between(x, pred.reshape(-1, )-std, pred.reshape(-1, )+std, alpha=0.2) plt.scatter(X, y, c='k') plt.title('Estimation after %d queries' % n_queries) plt.show()
# Draw the indices of 100 pool pixels used as initial training data.
initial_idx = np.random.choice(range(len(X_pool)), size=100)

# Build the individual learners; each one is created with bootstrap_init=True.
n_learners = 3
learner_list = []
for _ in range(n_learners):
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        X_training=X_pool[initial_idx],
        y_training=y_pool[initial_idx],
        bootstrap_init=True,
    )
    learner_list.append(learner)

# Combine the learners into a Committee ...
committee = Committee(learner_list)

# ... and wrap the Committee itself as the estimator of an ActiveLearner.
ensemble_learner = ActiveLearner(
    estimator=committee
)

query_idx, query_instance = ensemble_learner.query(X_pool)

# ...
# ... obtain label from the Oracle ...
# ...

ensemble_learner.teach(X_pool[query_idx], y_pool[query_idx], bootstrap=True)
class ActiveKNN:
    """A KNN machine learning model using active learning with modAL package

    Attributes:
        amine:          A string representing the amine that the KNN model is used for predictions.
        n_neighbors:    An integer representing the number of neighbors to classify using KNN model.
        model:          A KNeighborClassifier object as the classifier model given the number of neighbors to
                            classify with.
        metrics:        A dictionary to store the performance metrics locally. It has the format of
                            {'metric_name': [metric_value]}.
        verbose:        A boolean representing whether it will prints out additional information to the terminal
                            or not.
        pool_data:      A numpy array representing all the data from the dataset.
        pool_labels:    A numpy array representing all the labels from the dataset.
        x_t:            A numpy array representing the training data used for model training.
        y_t:            A numpy array representing the training labels used for model training.
        x_v:            A numpy array representing the testing data used for active learning.
        y_v:            A numpy array representing the testing labels used for active learning.
        learner:        An ActiveLearner to conduct active learning with. See modAL documentation for more details.
    """

    def __init__(self, amine=None, n_neighbors=2, verbose=True):
        """Initialize the ActiveKNN object."""
        self.amine = amine
        self.n_neighbors = n_neighbors
        self.model = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        self.metrics = {
            'accuracies': [],
            'precisions': [],
            'recalls': [],
            'bcrs': [],
            'confusion_matrices': []
        }
        self.verbose = verbose

    def load_dataset(self, x_t, y_t, x_v, y_v, all_data, all_labels):
        """Load the input training and validation data and labels into the model.

        Args:
            x_t:                A 2-D numpy array representing the training data.
            y_t:                A 2-D numpy array representing the training labels.
            x_v:                A 2-D numpy array representing the validation data.
            y_v:                A 2-D numpy array representing the validation labels.
            all_data:           A 2-D numpy array representing all the data in the active learning pool.
            all_labels:         A 2-D numpy array representing all the labels in the active learning pool.

        Returns:
            N/A
        """
        # Targets and values must be listed in the same order; the previous
        # target order (x_t, x_v, y_t, y_v) stored the training labels as
        # validation data and the validation data as training labels.
        self.x_t, self.y_t, self.x_v, self.y_v = x_t, y_t, x_v, y_v

        self.pool_data = all_data
        self.pool_labels = all_labels

        if self.verbose:
            print(f'The training data has dimension of {self.x_t.shape}.')
            print(f'The training labels has dimension of {self.y_t.shape}.')
            print(f'The testing data has dimension of {self.x_v.shape}.')
            print(f'The testing labels has dimension of {self.y_v.shape}.')

    def train(self):
        """Train the KNN model by setting up the ActiveLearner."""
        self.learner = ActiveLearner(estimator=self.model,
                                     X_training=self.x_t,
                                     y_training=self.y_t)
        # Evaluate zero-point performance
        self.evaluate()

    def active_learning(self, num_iter=None, to_params=True):
        """The active learning loop

        Repeatedly queries the learner for one point of the validation pool,
        teaches it that point with its label, evaluates the model and moves
        the point from the validation pool to the training set.

        Args:
            num_iter:   An integer that is the number of iterations.
                        Default = None, which uses the number of rows in x_v.
            to_params:  A boolean that decide if to store the metrics to the dictionary,
                        detail see "store_metrics_to_params" function.
                        Default = True

        Returns:
            N/A
        """
        num_iter = num_iter if num_iter else self.x_v.shape[0]

        for _ in range(num_iter):
            # Query the most uncertain point from the active learning pool
            query_index, query_instance = self.learner.query(self.x_v)

            # Teach our ActiveLearner model the record it has requested.
            uncertain_data = self.x_v[query_index].reshape(1, -1)
            uncertain_label = self.y_v[query_index].reshape(1, )
            self.learner.teach(X=uncertain_data, y=uncertain_label)

            self.evaluate()

            # Move the queried instance from the unlabeled pool to the
            # training set.
            self.x_t = np.append(self.x_t, uncertain_data).reshape(
                -1, self.pool_data.shape[1])
            self.y_t = np.append(self.y_t, uncertain_label)
            self.x_v = np.delete(self.x_v, query_index, axis=0)
            self.y_v = np.delete(self.y_v, query_index)

        if to_params:
            self.store_metrics_to_params()

    def evaluate(self, store=True):
        """Evaluation of the model

        Scores the learner on the whole pool and derives precision, recall
        and balanced classification rate from the confusion matrix.

        Args:
            store:  A boolean that decides if to store the metrics of the performance of the model.
                    Default = True

        Returns:
            N/A
        """
        # Calculate and report our model's accuracy.
        accuracy = self.learner.score(self.pool_data, self.pool_labels)

        preds = self.learner.predict(self.pool_data)
        cm = confusion_matrix(self.pool_labels, preds)

        # To prevent nan value for precision, we set it to 1 and send out a warning message
        if cm[1][1] + cm[0][1] != 0:
            precision = cm[1][1] / (cm[1][1] + cm[0][1])
        else:
            precision = 1.0
            print('WARNING: zero division during precision calculation')

        # NOTE(review): recall and true-negative rate are not guarded against
        # an empty row of the confusion matrix - confirm both classes are
        # always present in pool_labels.
        recall = cm[1][1] / (cm[1][1] + cm[1][0])
        true_negative = cm[0][0] / (cm[0][0] + cm[0][1])
        bcr = 0.5 * (recall + true_negative)

        if store:
            self.store_metrics_to_model(cm, accuracy, precision, recall, bcr)

    def store_metrics_to_model(self, cm, accuracy, precision, recall, bcr):
        """Store the performance metrics

        The metrics are specifically the confusion matrices, accuracies,
        precisions, recalls and balanced classification rates.

        Args:
            cm:             A numpy array representing the confusion matrix given our predicted labels and the actual
                                corresponding labels. It's a 2x2 matrix for the drp_chem model.
            accuracy:       A float representing the accuracy rate of the model: the rate of correctly predicted
                                reactions out of all reactions.
            precision:      A float representing the precision rate of the model: the rate of the number of actually
                                successful reactions out of all the reactions predicted to be successful.
            recall:         A float representing the recall rate of the model: the rate of the number of reactions
                                predicted to be successful out of all the actual successful reactions.
            bcr:            A float representing the balanced classification rate of the model. It's the average
                                value of recall rate and true negative rate.

        Returns:
            N/A
        """
        self.metrics['confusion_matrices'].append(cm)
        self.metrics['accuracies'].append(accuracy)
        self.metrics['precisions'].append(precision)
        self.metrics['recalls'].append(recall)
        self.metrics['bcrs'].append(bcr)

        if self.verbose:
            print(cm)
            print('accuracy for model is', accuracy)
            print('precision for model is', precision)
            print('recall for model is', recall)
            print('balanced classification rate for model is', bcr)

    def store_metrics_to_params(self):
        """Store the metrics results to the model's parameters dictionary

        Loads ./data/cv_statistics.pkl, appends this model's metric lists
        under the 'KNN' key and writes the dictionary back to the same file.
        """
        model = 'KNN'
        stats_file = os.path.join("./data", "cv_statistics.pkl")

        # NOTE: pickle.load executes arbitrary code from the file; only use
        # this with a statistics file produced by this project.
        with open(stats_file, "rb") as f:
            stats_dict = pickle.load(f)

        stats_dict[model]['accuracies'].append(self.metrics['accuracies'])
        stats_dict[model]['confusion_matrices'].append(
            self.metrics['confusion_matrices'])
        stats_dict[model]['precisions'].append(self.metrics['precisions'])
        stats_dict[model]['recalls'].append(self.metrics['recalls'])
        stats_dict[model]['bcrs'].append(self.metrics['bcrs'])

        # Save this dictionary in case we need it later
        with open(stats_file, "wb") as f:
            pickle.dump(stats_dict, f)

    def save_model(self, k_shot, n_way, meta):
        """Save the data used to train, validate and test the model to designated folder

        Args:
            k_shot:     An integer representing the number of training samples per class.
            n_way:      An integer representing the number of classes per task.
            meta:       A boolean representing if it will be trained under option 1 or option 2.
                            Option 1 is train with observations of other tasks and validate on the
                            task-specific observations.
                            Option 2 is to train and validate on the task-specific observations.

        Returns:
            N/A
        """
        # Indicate which option we used the data for
        option = 2 if meta else 1

        # Set up the main destination folder for the model
        dst_root = './KNN_few_shot/option_{0:d}'.format(option)
        if not os.path.exists(dst_root):
            os.makedirs(dst_root)
            print('No folder for KNN model storage found')
            print(f'Make folder to store KNN model at {dst_root}')

        # Set up the model specific folder.  The amine is interpolated
        # without the "s" format type so that the default amine=None works.
        model_folder = (
            f'{dst_root}/KNN_{k_shot:d}_shot_{n_way:d}_way'
            f'_option_{option:d}_{self.amine}'
        )
        if not os.path.exists(model_folder):
            os.makedirs(model_folder)
            print('No folder for KNN model storage found')
            print(f'Make folder to store KNN model of amine {self.amine} at')
        else:
            print(
                f'Found existing folder. Model of amine {self.amine} will be stored at'
            )
        print(model_folder)

        # Dump the model into the designated folder
        file_name = f"KNN_{self.amine}_option_{option:d}.pkl"
        with open(os.path.join(model_folder, file_name), "wb") as f:
            pickle.dump([self], f, -1)

    def __str__(self):
        # Plain interpolation of amine: "{:s}" raises TypeError for None,
        # which is the constructor default.
        return (
            f'A {self.n_neighbors:d}-neighbor KNN model for amine '
            f'{self.amine} using active learning'
        )
class ActiveLearningClassifier:
    """Base machine learning classifier using active learning with modAL package

    Subclasses are expected to provide ``self.model`` (the estimator wrapped
    by the ActiveLearner); this base class does not set it.

    Attributes:
        amine:          A string representing the amine that the model is used for predictions.
        config:         A dictionary representing the hyper-parameters of the model
        metrics:        A dictionary to store the performance metrics locally. It has the format of
                            {draw_id: {'metric_name': [metric_value]}}.
        verbose:        A boolean representing whether it will prints out additional information to the terminal
                            or not.
        stats_path:     A Path object representing the directory of the stats dictionary if we are not running
                            multi-processing.
        result_dict:    A dictionary representing the result dictionary used during multi-thread processing.
        classifier_name: A string representing the name of the generic classifier.
        model_name:     A string representing the name of the specific model for future plotting.
        all_data:       A numpy array representing all the data from the dataset.
        all_labels:     A numpy array representing all the labels from the dataset.
        x_t:            A numpy array representing the training data used for model training.
        y_t:            A numpy array representing the training labels used for model training.
        x_v:            A numpy array representing the testing data used for active learning.
        y_v:            A numpy array representing the testing labels used for active learning.
        learner:        An ActiveLearner to conduct active learning with. See modAL documentation for more details.
    """

    def __init__(self, amine=None, config=None, verbose=True, stats_path=None,
                 result_dict=None, classifier_name='Base Classifier',
                 model_name='Base Classifier'):
        """initialization of the class"""
        self.amine = amine
        self.config = config
        self.metrics = defaultdict(dict)
        self.verbose = verbose
        self.stats_path = stats_path
        self.result_dict = result_dict
        self.classifier_name = classifier_name
        self.model_name = model_name

    def load_dataset(self, set_id, x_t, y_t, x_v, y_v, all_data, all_labels):
        """Load the input training and validation data and labels into the model.

        Args:
            set_id:             An integer representing the id of the random draw that we are loading.
            x_t:                A 2-D numpy array representing the training data.
            y_t:                A 2-D numpy array representing the training labels.
            x_v:                A 2-D numpy array representing the validation data.
            y_v:                A 2-D numpy array representing the validation labels.
            all_data:           A 2-D numpy array representing all the data in the active learning pool.
            all_labels:         A 2-D numpy array representing all the labels in the active learning pool.
        """
        self.draw_id = set_id
        # Every draw gets a fresh metric store.
        self.metrics[self.draw_id] = defaultdict(list)

        self.x_t, self.y_t, self.x_v, self.y_v = x_t, y_t, x_v, y_v

        self.all_data = all_data
        self.all_labels = all_labels

        if self.verbose:
            print(f'The training data has dimension of {self.x_t.shape}.')
            print(f'The training labels has dimension of {self.y_t.shape}.')
            print(f'The testing data has dimension of {self.x_v.shape}.')
            print(f'The testing labels has dimension of {self.y_v.shape}.')

    def train(self, warning=True):
        """Train the model by setting up the ActiveLearner.

        Args:
            warning:    A boolean forwarded to ``evaluate``. Default = True.
        """
        self.learner = ActiveLearner(estimator=self.model,
                                     X_training=self.x_t,
                                     y_training=self.y_t)
        # Evaluate zero-point performance
        self.evaluate(warning=warning)

    def active_learning(self, num_iter=None, warning=True):
        """The active learning loop

        Repeatedly queries the learner for one point of the validation pool,
        teaches it that point with its label, evaluates the model and moves
        the point from the validation pool to the training set.

        Args:
            num_iter:   An integer that is the number of iterations.
                        Default = None, which uses the number of rows in x_v.
            warning:    A boolean that decide if to declare zero division warning or not.
                        Default = True.
        """
        num_iter = num_iter if num_iter else self.x_v.shape[0]

        for _ in range(num_iter):
            # Query the most uncertain point from the active learning pool
            query_index, query_instance = self.learner.query(self.x_v)

            # Teach our ActiveLearner model the record it has requested.
            uncertain_data = self.x_v[query_index].reshape(1, -1)
            uncertain_label = self.y_v[query_index].reshape(1, )
            self.learner.teach(X=uncertain_data, y=uncertain_label)

            self.evaluate(warning=warning)

            # Move the queried instance from the unlabeled pool to the
            # training set.
            self.x_t = np.append(self.x_t, uncertain_data).reshape(
                -1, self.all_data.shape[1])
            self.y_t = np.append(self.y_t, uncertain_label)
            self.x_v = np.delete(self.x_v, query_index, axis=0)
            self.y_v = np.delete(self.y_v, query_index)

    def evaluate(self, warning=True, store=True):
        """Evaluation of the model

        Args:
            warning:    A boolean that decides if to warn about the zero division issue or not.
                            Default = True
            store:      A boolean that decides if to store the metrics of the performance of the model.
                            Default = True
        """
        # Calculate and report our model's accuracy.
        accuracy = self.learner.score(self.all_data, self.all_labels)

        self.y_preds = self.learner.predict(self.all_data)
        cm = confusion_matrix(self.all_labels, self.y_preds)

        # To prevent nan value for precision, we set it to 1 and send out a warning message
        if cm[1][1] + cm[0][1] != 0:
            precision = cm[1][1] / (cm[1][1] + cm[0][1])
        else:
            precision = 1.0
            if warning:
                print('WARNING: zero division during precision calculation')

        # NOTE(review): recall and true-negative rate are not guarded against
        # an empty row of the confusion matrix - confirm both classes are
        # always present in all_labels.
        recall = cm[1][1] / (cm[1][1] + cm[1][0])
        true_negative = cm[0][0] / (cm[0][0] + cm[0][1])
        bcr = 0.5 * (recall + true_negative)

        if store:
            self.store_metrics_to_model(cm, accuracy, precision, recall, bcr)

    def store_metrics_to_model(self, cm, accuracy, precision, recall, bcr):
        """Store the performance metrics of the current random draw

        The metrics are specifically the confusion matrices, accuracies,
        precisions, recalls and balanced classification rates.

        Args:
            cm:             A numpy array representing the confusion matrix given our predicted labels and the actual
                                corresponding labels. It's a 2x2 matrix for the drp_chem model.
            accuracy:       A float representing the accuracy rate of the model: the rate of correctly predicted
                                reactions out of all reactions.
            precision:      A float representing the precision rate of the model: the rate of the number of actually
                                successful reactions out of all the reactions predicted to be successful.
            recall:         A float representing the recall rate of the model: the rate of the number of reactions
                                predicted to be successful out of all the actual successful reactions.
            bcr:            A float representing the balanced classification rate of the model. It's the average
                                value of recall rate and true negative rate.
        """
        draw_metrics = self.metrics[self.draw_id]
        draw_metrics['confusion_matrices'].append(cm)
        draw_metrics['accuracies'].append(accuracy)
        draw_metrics['precisions'].append(precision)
        draw_metrics['recalls'].append(recall)
        draw_metrics['bcrs'].append(bcr)

        if self.verbose:
            print(cm)
            print('accuracy for model is', accuracy)
            print('precision for model is', precision)
            print('recall for model is', recall)
            print('balanced classification rate for model is', bcr)

    def find_inner_avg(self):
        """Find the average across all random draws

        Writes the element-wise average of every scalar metric and the list
        of per-draw confusion matrices to ``self.metrics['average']``.
        """
        metric_names = ['accuracies', 'precisions', 'recalls', 'bcrs']
        # 'average' is the output slot, not a draw: without this filter a
        # second call would feed the previous result back in and nest the
        # confusion-matrix list one level deeper on every call.
        rand_draws = [key for key in self.metrics if key != 'average']

        for metric in metric_names:
            lst_of_metrics = [self.metrics[set_id][metric]
                              for set_id in rand_draws]
            self.metrics['average'][metric] = list(
                np.average(lst_of_metrics, axis=0))

        self.metrics['average']['confusion_matrices'] = [
            self.metrics[set_id]['confusion_matrices']
            for set_id in rand_draws
        ]

    def store_metrics_to_file(self):
        """Store the averaged metrics to the shared dictionary or the stats file

        When ``result_dict`` was supplied (even if it is still empty) the
        averages are appended to it.  Otherwise the dictionary is loaded from
        ``stats_path`` if that file exists, updated and pickled back to
        ``stats_path``.
        """
        self.find_inner_avg()

        model = self.model_name

        # Check if we are running multi-thread process or single-thread
        # process.  The test is "is not None": a supplied but still empty
        # dictionary is falsy and must not be mistaken for "no dictionary".
        use_shared_dict = self.result_dict is not None

        if use_shared_dict:
            # Store to the existing multi-processing dictionary
            stats_dict = self.result_dict
        elif self.stats_path and self.stats_path.exists():
            # NOTE: pickle.load executes arbitrary code from the file; only
            # use this with a statistics file produced by this project.
            with open(self.stats_path, "rb") as f:
                stats_dict = pickle.load(f)
        else:
            # Store to a simple dictionary
            stats_dict = {}

        if model not in stats_dict:
            stats_dict[model] = defaultdict(list)

        averages = self.metrics['average']
        stats_dict[model]['amine'].append(self.amine)
        stats_dict[model]['accuracies'].append(averages['accuracies'])
        stats_dict[model]['confusion_matrices'].append(
            averages['confusion_matrices'])
        stats_dict[model]['precisions'].append(averages['precisions'])
        stats_dict[model]['recalls'].append(averages['recalls'])
        stats_dict[model]['bcrs'].append(averages['bcrs'])

        # Save this dictionary in case we need it later
        if not use_shared_dict and self.stats_path:
            with open(self.stats_path, "wb") as f:
                pickle.dump(stats_dict, f)

    def save_model(self):
        """Save the data used to train, validate and test the model to designated folder"""
        # Set up the main destination folder for the model
        dst_root = './data/{}/{}'.format(self.classifier_name, self.model_name)
        if not os.path.exists(dst_root):
            os.makedirs(dst_root)
            print(
                f'No folder for {self.classifier_name} model {self.model_name} storage found'
            )
            print(f'Make folder to store model at {dst_root}')

        # Dump the model into the designated folder.  Plain interpolation is
        # used because "{:s}" raises TypeError for the default amine=None.
        file_name = f"{self.model_name}_{self.amine}.pkl"
        with open(os.path.join(dst_root, file_name), "wb") as f:
            pickle.dump(self, f)
from modAL.models import ActiveLearner
from modAL.disagreement import max_std_sampling

np.random.seed(0)

# Data: 200 distinct points on [0, 20] with a noisy sine as target.
X = np.random.choice(
    np.linspace(0, 20, 10000), size=200, replace=False
).reshape(-1, 1)
y = np.sin(X) + np.random.normal(scale=0.3, size=X.shape)

# Initial training set: n_initial distinct samples of the data.
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_initial, y_initial = X[initial_idx], y[initial_idx]

# Kernel of the Gaussian process: RBF plus a white-noise term.
kernel = (
    RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))
    + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))
)

# The active learner queries with max_std_sampling.
regressor = ActiveLearner(
    estimator=GaussianProcessRegressor(kernel=kernel),
    query_strategy=max_std_sampling,
    X_training=X_initial.reshape(-1, 1),
    y_training=y_initial.reshape(-1, 1),
)

# Active learning: the queried sample is taught but stays in X.
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))
# initial training set: four corner pixels plus one central pixel of the
# flattened image
initial_idx = [
    0, im_height - 1, im_height * (im_height - 1), -1,
    im_width // 2 + im_height // 2 * im_height
]
X_train, y_train = X_pool[initial_idx], y_pool[initial_idx]

# create an ActiveLearner instance
# (estimator / X_training / y_training are the keyword names used by the
# other ActiveLearner calls in this file and by current modAL releases)
learner = ActiveLearner(estimator=RandomForestClassifier(),
                        X_training=X_train,
                        y_training=y_train)
initial_prediction = learner.predict_proba(X_full)[:, 1].reshape(
    im_height, im_width)

# active learning: query one pixel per round and remove it from the pool
n_queries = 100
for round_idx in range(n_queries):
    query_idx, query_inst = learner.query(X_pool)
    learner.teach(X_pool[query_idx].reshape(1, -1),
                  y_pool[query_idx].reshape(-1, ))
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)

final_prediction = learner.predict_proba(X_full)[:, 1].reshape(
    im_height, im_width)

# learning with randomly selected queries instead of active learning
random_idx = initial_idx + list(
    np.random.choice(range(len(X_full)), n_queries, replace=False))
# The baseline must be trained on the random draw; indexing with initial_idx
# here would ignore random_idx and train on the five seed pixels only.
X_train, y_train = X_full[random_idx], y_full[random_idx]
random_learner = ActiveLearner(estimator=RandomForestClassifier(),
                               X_training=X_train,
                               y_training=y_train)
# generate the pool
# remove the initial data from the training dataset
X_pool = np.delete(X_train, initial_idx, axis=0)
y_pool = np.delete(y_train, initial_idx, axis=0)

"""
Training the ActiveLearner
"""

# initialize ActiveLearner
# (estimator / X_training / y_training are the keyword names used by the
# other ActiveLearner calls in this file and by current modAL releases;
# verbose is passed through unchanged)
learner = ActiveLearner(
    estimator=classifier,
    X_training=X_initial, y_training=y_initial,
    verbose=0
)

# the active learning loop: 200 instances are requested per query
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = learner.query(X_pool, n_instances=200, verbose=0)
    learner.teach(
        X=X_pool[query_idx], y=y_pool[query_idx],
        verbose=0
    )
    # remove queried instances from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx, axis=0)

# the final accuracy score
print(learner.score(X_test, y_test, verbose=0))