class Review(ABC):
    """Base class for Systematic Review.

    Subclasses provide the prior knowledge (``_prior_knowledge``) and the
    labelling step (``_classify``); ``review`` runs the train/query loop.
    """

    def __init__(self, X, y=None, model=None, query_strategy=None,
                 train_data_fn=full_sample, n_instances=1, n_queries=None,
                 prior_included=None, prior_excluded=None, log_file=None,
                 settings=None, verbose=1):
        """Store the review configuration.

        Arguments
        ---------
        X:
            Feature matrix; ``X.shape[0]`` is used as the number of samples.
        y:
            Label array that ``review`` reads and writes by index.
        model:
            Estimator handed to ``ActiveLearner``.
        query_strategy:
            Query strategy handed to ``ActiveLearner``.
        train_data_fn:
            Called as ``train_data_fn(X, y, train_idx, **balance_kwargs)``
            and expected to return ``(X_train, y_train)``.
        n_instances:
            Upper bound on the number of samples requested per query.
        n_queries:
            Query limit checked by ``_stop_iter``; ``None`` means no limit.
        prior_included, prior_excluded:
            Stored as-is for subclasses; default to a fresh empty list.
        log_file:
            If truthy, the logs are saved to it at the end of ``review``.
        settings:
            Dict that may hold ``'fit_kwargs'``, ``'balance_kwargs'`` and
            ``'query_kwargs'``; a missing key falls back to an empty dict.
        verbose:
            If truthy, report where the log file was written.
        """
        super(Review, self).__init__()

        # Fresh containers per instance: the previous ``[]``/``{}`` defaults
        # were shared between instances, and ``settings={}`` raised KeyError
        # on the lookups below.
        if settings is None:
            settings = {}

        self.X = X
        self.y = y
        self.model = model
        self.query_strategy = query_strategy
        self.train_data = train_data_fn
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.log_file = log_file
        self.verbose = verbose
        self.prior_included = [] if prior_included is None else prior_included
        self.prior_excluded = [] if prior_excluded is None else prior_excluded
        self.fit_kwargs = settings.get('fit_kwargs', {})
        self.balance_kwargs = settings.get('balance_kwargs', {})
        self.query_kwargs = settings.get('query_kwargs', {})
        self._logger = Logger()

    @abstractmethod
    def _prior_knowledge(self):
        """Return ``(indices, labels)`` of the initially known samples."""
        pass

    @abstractmethod
    def _classify(self, ind):
        """Classify the provided indices."""
        pass

    def _prior_teach(self):
        """Function called before training model."""
        pass

    def _stop_iter(self, query_i, pool):
        """Criteria for stopping iteration.

        Stop iterating if:
            - n_queries is reached
            - the pool is empty
        """
        # if the pool is empty, always stop
        if len(pool) == 0:
            return True
        # don't stop if there is no stopping criteria
        return self.n_queries is not None and query_i >= self.n_queries

    def review(self):
        """Run the review loop: train, query, classify, log, repeat."""
        # create the pool and training indices.
        n_samples = self.X.shape[0]
        pool_idx = np.arange(n_samples)

        # add prior knowledge
        # NOTE(review): self.y is written by index here, so it must already
        # be an array; the default y=None is not handled in this class.
        init_idx, init_labels = self._prior_knowledge()
        self.y[init_idx] = init_labels

        # remove the initial sample from the pool
        pool_idx = np.delete(pool_idx, init_idx)

        # Initialize learner, but don't start training yet.
        self.learner = ActiveLearner(estimator=self.model,
                                     query_strategy=self.query_strategy)

        query_i = 0
        train_idx = init_idx.copy()
        query_idx = train_idx
        self._logger.add_labels(self.y)

        # The stop check receives ``query_i - 1``.
        while not self._stop_iter(query_i - 1, pool_idx):
            self._logger.add_training_log(query_idx, self.y[query_idx])

            # Get the training data.
            X_train, y_train = self.train_data(self.X, self.y, train_idx,
                                               **self.balance_kwargs)

            # Train the model on the training data.
            self.learner.teach(X=X_train, y=y_train, only_new=True,
                               **self.fit_kwargs)

            # Make a query from the pool.
            query_idx, _ = self.learner.query(
                X=self.X,
                pool_idx=pool_idx,
                n_instances=min(self.n_instances, len(pool_idx)),
                query_kwargs=self.query_kwargs)

            # Log the probabilities of samples in the pool being included.
            # Reuse the ones left in query_kwargs, if any, before predicting.
            pred_proba = self.query_kwargs.get('pred_proba', [])
            if len(pred_proba) == 0:
                pred_proba = self.learner.predict_proba(self.X[pool_idx])
            self._logger.add_proba(pool_idx, pred_proba)

            # Log the probabilities of samples that were trained.
            pred_proba_train = self.learner.predict_proba(self.X[train_idx])
            self._logger.add_proba(train_idx, pred_proba_train,
                                   logname="train_proba")

            # Classify the queried papers.
            self.y[query_idx] = self._classify(query_idx)
            self._logger.add_labels(self.y)

            # Update training/pool indices
            train_idx = np.append(train_idx, query_idx)
            pool_idx = np.delete(np.arange(n_samples), train_idx, axis=0)

            # update the query counter
            query_i += 1

        # Save the result to a file
        if self.log_file:
            self.save_logs(self.log_file)
            if self.verbose:
                print(f"Saved results in log file: {self.log_file}")

    def save_logs(self, *args, **kwargs):
        """Save the logs to a file."""
        self._logger.save(*args, **kwargs)
from modAL.models import ActiveLearner

# Pool-based active learning on the iris dataset with a 3-NN classifier.
np.random.seed(0)

# loading the iris dataset
iris = load_iris()

# initial training data
train_idx = [0, 50, 100]
X_train = iris['data'][train_idx]
y_train = iris['target'][train_idx]

# generating the pool
X_pool = np.delete(iris['data'], train_idx, axis=0)
y_pool = np.delete(iris['target'], train_idx)

# initializing the active learner
# (keywords follow the estimator=/X_training=/y_training= form used by the
# other ActiveLearner calls in this file instead of the legacy
# predictor=/X_initial=/y_initial= names)
learner = ActiveLearner(
    estimator=KNeighborsClassifier(n_neighbors=3),
    X_training=X_train, y_training=y_train
)

# pool-based sampling
n_queries = 20
for idx in range(n_queries):
    query_idx, query_instance = learner.query(X_pool)
    learner.teach(
        X=X_pool[query_idx].reshape(1, -1),
        y=y_pool[query_idx].reshape(1, )
    )
    # remove queried instance from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)
# Indices of the samples carrying class 2 and class 3.
# NOTE(review): this fragment reads both `labels` (here) and `label`
# (below); confirm they are meant to be two different objects.
a3=np.array(np.where(labels==2))
a4=np.array(np.where(labels==3))
# Draw 20 indices per class (a1/a2 are defined above this fragment).
b1=np.random.choice(a1[0,:],20)
b2=np.random.choice(a2[0,:],20)
b3=np.random.choice(a3[0,:],20)
b4=np.random.choice(a4[0,:],20)
# Features of the drawn samples ...
c1=x[b1]
c2=x[b2]
c3=x[b3]
c4=x[b4]
# ... and their labels.
d1=label[b1]
d2=label[b2]
d3=label[b3]
d4=label[b4]
# Stack the four per-class draws into one training set and shuffle it.
train_data=np.concatenate((c1,c2,c3,c4),axis=0)
train_label=np.concatenate((d1,d2,d3,d4),axis=0)
index=np.arange(len(train_data))
np.random.shuffle(index)
train_data,train_label=train_data[index],train_label[index]
# Random forest learner fitted on the sampled training set.
learner = ActiveLearner(estimator=RandomForestClassifier(),X_training=train_data, y_training=train_label)
unqueried_score = learner.score(x,label)
performance_history = [unqueried_score]
# Keep teaching until the accuracy on (x, label) reaches 0.97.
while learner.score(x, label) < 0.97:
    stream_idx = np.random.choice(range(len(x)))
    idx = np.random.choice(range(len(train_data)))
    # NOTE(review): the uncertainty is measured on x[stream_idx], but the
    # sample taught (and reported) is train_data[idx] — confirm intended.
    if classifier_uncertainty(learner, x[stream_idx].reshape(1, -1)) >= 0.4:
        learner.teach(train_data[idx].reshape(1, -1), train_label[idx].reshape(-1, ))
        new_score = learner.score(x, label)
        performance_history.append(new_score)
        print('Data no. %d queried, new accuracy: %f' % (idx, new_score))
# remove the initial data from the training dataset X_pool = np.delete(X_train, initial_idx, axis=0) names_pool = np.delete(name, initial_idx, axis=0) y_pool = np.delete(y_train, initial_idx, axis=0) #print(np.shape(X_pool), 'X_pool') print(y_pool[:20], 'y_pool') #### Active Learner # QUERY strategy 1 # initialize ActiveLearner if args.query_strategy == "uncertainty": learner = ActiveLearner( estimator=net, query_strategy=modAL.uncertainty.uncertainty_sampling, X_training=X_initial, y_training=y_initial, ) # QUERY strategy 2 ####Yet another query strategy######################## elif args.query_strategy == "margin": learner = ActiveLearner( estimator=net, query_strategy=modAL.uncertainty.margin_sampling, X_training=X_initial, y_training=y_initial, ) ###################################################### # QUERY strategy 3
def al_rank(self, data, target, X_train, y_train, X_full, y_full, train_idx, N_RAW_SAMPLES=80):
    """Run batch-mode uncertainty sampling and return the accuracy per query.

    A random forest learner is fitted on ``(X_train, y_train)`` and then
    queried ``N_RAW_SAMPLES // 3`` times, three instances per query, from a
    pool derived from ``(X_full, y_full)``. After every query the learner is
    scored on ``(X_full, y_full)``.

    ``data``, ``target`` and ``train_idx`` are accepted for interface
    compatibility but are not used by this method.

    Returns
    -------
    list
        Accuracy on ``(X_full, y_full)`` after each completed query. It is
        shorter than ``N_RAW_SAMPLES // 3`` when the pool runs empty first.
    """
    BATCH_SIZE = 3
    preset_batch = partial(uncertainty_batch_sampling, n_instances=BATCH_SIZE)

    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            X_training=X_train,
                            y_training=y_train,
                            query_strategy=preset_batch)

    N_QUERIES = N_RAW_SAMPLES // BATCH_SIZE

    # Three random rows are withheld from the pool. ``high`` is exclusive,
    # so it must be the row count itself; the former ``+ 1`` could yield an
    # out-of-range index.
    n_labeled_examples = X_full.shape[0]
    training_indices = np.random.randint(low=0, high=n_labeled_examples, size=3)

    # Isolate the non-training examples we'll be querying.
    X_pool = np.delete(X_full, training_indices, axis=0)
    y_pool = np.delete(y_full, training_indices, axis=0)

    acc = []
    for index in range(N_QUERIES):
        # Nothing left to label: stop instead of querying an empty pool.
        if X_pool.shape[0] == 0:
            break

        query_index, query_instance = learner.query(X_pool)

        # Teach our ActiveLearner model the record it has requested.
        X, y = X_pool[query_index], y_pool[query_index]
        learner.teach(X=X, y=y)

        # Remove the queried instance from the unlabeled pool.
        X_pool = np.delete(X_pool, query_index, axis=0)
        y_pool = np.delete(y_pool, query_index)

        # Calculate and report our model's accuracy.
        model_accuracy = learner.score(X_full, y_full)
        print('Accuracy after query {n}: {acc:0.4f}'.format(
            n=index + 1, acc=model_accuracy))
        acc.append(model_accuracy)

    return acc
class CustomAcitveLearner(BaseModel):
    """Active-learning wrapper that trains a LeNet through modAL.

    An initial random subset of the training data seeds the learner; the
    rest forms the pool that ``train`` queries from while rendering each
    queried instance and the accuracy curve in the Streamlit page.
    """

    def __init__(self, X_train, y_train, X_test, y_test, epochs=10, batch_size=128, lr=1e-3,
                 n_initial=100, n_queries=100, query_strategy=uncertainty_sampling, estimator=None):
        """Split the data into initial set and pool and build the learner.

        ``n_initial`` rows are drawn without replacement as the initial
        training set. ``estimator`` is accepted but currently not used: the
        learner is always built around ``DL.LeNet(lr)``.
        """
        super().__init__(X_train, y_train, X_test, y_test, epochs, batch_size, lr)
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr
        self.n_initial = n_initial
        self.n_queries = n_queries
        self.query_strategy = query_strategy

        initial_idx = np.random.choice(range(len(self.X_train)), size=self.n_initial, replace=False)
        self.__X_initial = self.X_train[initial_idx]
        self.__y_initial = self.y_train[initial_idx]
        self.__X_pool = np.delete(self.X_train, initial_idx, axis=0)
        self.__y_pool = np.delete(self.y_train, initial_idx, axis=0)

        self.learner = ActiveLearner(
            estimator=DL.LeNet(self.lr),
            query_strategy=self.query_strategy,
            X_training=self.__X_initial,
            y_training=self.__y_initial,
            verbose=1
        )
        # Kept for code that reads the class attribute; note it is shared by
        # every BaseModel instance and overwritten by the latest one.
        BaseModel.estimator = self.learner

    def train(self):
        """Run up to ``n_queries`` query/teach rounds.

        Returns the list of accuracies on the test set: the initial one
        followed by one entry per completed round.
        """
        performances = [self.evaluate(self.X_test, self.y_test)]
        for idx in range(self.n_queries):
            # A failing query ends the loop. Only ordinary errors are caught,
            # so KeyboardInterrupt/SystemExit are no longer swallowed.
            try:
                query_idx, query_instance = self.learner.query(self.__X_pool, verbose=0)
            except Exception:
                break

            placeholder = st.empty()
            with plt.style.context('seaborn-white'):
                plt.figure(figsize=(10, 5))
                plt.subplot(1, 2, 1)
                plt.title('Digit to label')
                plt.imshow(query_instance.reshape(8, 8))
                plt.subplot(1, 2, 2)
                plt.title('Accuracy of your model')
                plt.plot(range(idx+1), performances)
                plt.scatter(range(idx+1), performances)
                plt.xlabel('number of queries')
                plt.ylabel('accuracy')
                # The figure is handed to Streamlit through a file on disk.
                plt.savefig('../buf.png', format='png')
            with placeholder.beta_container():
                st.image('../buf.png', use_column_width=True)
                time.sleep(0.5)
            plt.close()
            placeholder.empty()

            self.learner.teach(
                X=self.__X_pool[query_idx],
                y=self.__y_pool[query_idx],
                epochs=self.epochs,
                batch_size=self.batch_size,
                verbose=0
            )
            # remove the queried instances from the pool
            self.__X_pool = np.delete(self.__X_pool, query_idx, axis=0)
            self.__y_pool = np.delete(self.__y_pool, query_idx, axis=0)

            model_accuracy = self.evaluate(self.X_test, self.y_test)
            performances.append(model_accuracy)
        return performances

    def predict(self, X):
        """Return the arg-max class index for every row of ``X``."""
        # Use this instance's learner: the class-level BaseModel.estimator is
        # shared and would point at whichever instance was created last.
        y_prob = self.learner.predict(X)
        y_classes = y_prob.argmax(axis=-1)
        return y_classes

    def evaluate(self, X_test, y_test):
        """Return the accuracy of ``predict(X_test)`` against ``y_test``.

        ``y_test`` is reduced with ``argmax(axis=-1)`` before comparison.
        """
        from sklearn.metrics import accuracy_score
        y_classes = self.predict(X_test)
        return accuracy_score(y_test.argmax(axis=-1), y_classes)
X_initial = X_train[initial_idx]
y_initial = y_train[initial_idx]

# generate the pool
# remove the initial data from the training dataset
X_pool = np.delete(X_train, initial_idx, axis=0)
y_pool = np.delete(y_train, initial_idx, axis=0)

""" Training the ActiveLearner """

# initialize ActiveLearner
# (keywords follow the estimator=/X_training=/y_training= form used by the
# other ActiveLearner calls in this file instead of the legacy
# predictor=/X_initial=/y_initial= names)
learner = ActiveLearner(
    estimator=classifier,
    X_training=X_initial, y_training=y_initial,
    verbose=0
)

# the active learning loop: 10 rounds of 200 queried instances each
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = learner.query(X_pool, n_instances=200, verbose=0)
    learner.teach(
        X=X_pool[query_idx], y=y_pool[query_idx],
        verbose=0
    )
    # remove queried instance from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx, axis=0)
def main():
    """
    Run an active learning experiment.

    Sample command:
    ```
    python training/run_modAL_experiment.py --al_epochs_init=10 --al_epochs_incr=5 --al_n_iter=10 --al_samples_per_iter=100 --data_class=DroughtWatch --model_class=ResnetClassifier --batch_size=64 --n_train_images=1000 --n_validation_images=1000 --pretrained=True --wandb
    ```
    """

    # generic setup steps from run_experiment
    # ---------------------------------------

    args = _setup_parser().parse_args()

    # Resolve both classes before instantiating either of them.
    data_class = _import_class(f"active_learning.data.{args.data_class}")
    model_class = _import_class(f"active_learning.models.{args.model_class}")
    data = data_class(args)
    model = model_class(data_config=data.config(), args=args)

    # The lightning wrapper depends on the loss; anything other than
    # "ctc"/"transformer" gets the base wrapper.
    if args.loss == "ctc":
        lit_model_class = lit_models.CTCLitModel
    elif args.loss == "transformer":
        lit_model_class = lit_models.TransformerLitModel
    else:
        lit_model_class = lit_models.BaseLitModel

    checkpoint = args.load_checkpoint
    lit_model = (
        lit_model_class(args=args, model=model)
        if checkpoint is None
        else lit_model_class.load_from_checkpoint(checkpoint, args=args, model=model)
    )

    # modAL specific experiment setup
    # -------------------------------

    # initialize wandb with pytorch model
    if args.wandb:
        wandb.init(config=args)
        wandb.watch(model, log_freq=100)

    # evaluate query strategy from args parameter: the three modAL built-ins
    # come from modAL.uncertainty, everything else from the project package
    modal_strategies = ("uncertainty_sampling", "margin_sampling", "entropy_sampling")
    strategy_module = (
        "modAL.uncertainty" if args.al_query_strategy in modal_strategies else "active_learning.sampling"
    )
    query_strategy = _import_class(f"{strategy_module}.{args.al_query_strategy}")

    # cpu vs. gpu: ignore --gpu args param, instead just set gpu based on availability
    device = "cuda" if torch.cuda.is_available() else "cpu"

    def channels_first(images):
        # shape change: (i, channels, h, w) instead of (i, h, w, channels)
        return np.moveaxis(images, 3, 1)

    # initialize train, validation and pool datasets
    data.setup()

    X_initial = channels_first(data.data_train.data)
    y_initial = data.data_train.targets
    if args.reduced_develop_train_size:
        print("NOTE: Reduced initial train set size for development activated")
        X_initial = X_initial[:100, :, :, :]
        y_initial = y_initial[:100]

    X_val = channels_first(data.data_val.data)
    y_val = data.data_val.targets

    X_pool = channels_first(data.data_unlabelled.data)
    y_pool = data.data_unlabelled.targets

    # initialize skorch classifier
    validation_split = predefined_split(Dataset(X_val, y_val))
    classifier = NeuralNetClassifier(
        model,
        criterion=torch.nn.CrossEntropyLoss,
        optimizer=torch.optim.Adam,
        train_split=validation_split,
        verbose=1,
        device=device,
    )

    lit_model.summarize(mode="full")

    # initialize modal active learner
    print("Initializing model with base training set")
    learner = ActiveLearner(
        estimator=classifier,
        X_training=X_initial,
        y_training=y_initial,
        epochs=args.al_epochs_init,
        query_strategy=query_strategy,
    )

    def log_round(al_iter, epoch_start):
        # Report the skorch history plus accuracy/size of the training set.
        _log_skorch_history(
            history=learner.estimator.history,
            al_iter=al_iter,
            epoch_start=epoch_start,
            train_acc=learner.score(learner.X_training, learner.y_training),
            train_size=len(learner.y_training),
            wandb_logging=args.wandb,
        )

    log_round(0, 0)

    # active learning loop
    for al_round in range(args.al_n_iter):
        print("Active learning query no. %d" % (al_round + 1))

        picked, _ = learner.query(X_pool, n_instances=args.al_samples_per_iter)
        learner.teach(
            X=X_pool[picked],
            y=y_pool[picked],
            only_new=args.al_incr_onlynew,
            epochs=args.al_epochs_incr,
        )

        log_round(al_round + 1, args.al_epochs_init + al_round * args.al_epochs_incr)

        # remove queried instances from pool
        X_pool = np.delete(X_pool, picked, axis=0)
        y_pool = np.delete(y_pool, picked, axis=0)
# In[11]: y_initial # ## Initialize learner # In[12]: learner = ActiveLearner( estimator=svm.SVC(kernel='linear', gamma='scale', C=2, probability = True), query_strategy=uncertainty_sampling, X_training=X_initial, y_training=y_initial ) # In[17]: learner.estimator # In[18]: # import pickle # pickle.dump(learner.estimator, open('models/model0.sav','wb'))
# create the data to stream from X_full = np.transpose( [np.tile(np.asarray(range(im.shape[0])), im.shape[1]), np.repeat(np.asarray(range(im.shape[1])), im.shape[0])] ) # map the intensity values against the grid y_full = np.asarray([im[P[0], P[1]] for P in X_full]) # assembling initial training set n_initial = 5 initial_idx = np.random.choice(range(len(X_full)), size=n_initial, replace=False) X_train, y_train = X_full[initial_idx], y_full[initial_idx] # initialize the learner learner = ActiveLearner( predictor=RandomForestClassifier(), X_initial=X_train, y_initial=y_train ) print('Initial prediction accuracy: %f' % learner.score(X_full, y_full)) # visualizing initial prediciton with plt.style.context('seaborn-white'): plt.figure(figsize=(7, 7)) prediction = learner.predict_proba(X_full)[:, 1] plt.imshow(prediction.reshape(im_width, im_height)) plt.title('Initial prediction accuracy: %f' % learner.score(X_full, y_full)) plt.show() """ The instances are randomly selected one by one, if an instance's uncertainty is above a threshold, the label is requested and shown to the learner. The process is continued until the learner reaches a previously defined accuracy.
def __init__(self, X, y=None, model=None, query_strategy=max_sampling,
             train_data_fn=full_sample, n_instances=1, n_queries=1,
             prior_included=None, prior_excluded=None, log_file=None,
             fit_kwargs=None, balance_kwargs=None, query_kwargs=None,
             logger=None, verbose=1):
    """Set up the review state and the (still untrained) learner.

    Arguments
    ---------
    X:
        Feature matrix; ``X.shape[0]`` sizes the default label array.
    y:
        Labels. When ``None``, an array filled with ``NOT_AVAILABLE`` is
        created.
    model:
        Estimator for the learner. When ``None``, a naive Bayes model from
        ``asreview.models.create_nb_model`` is used and a warning is printed.
    query_strategy, train_data_fn, n_instances, n_queries, log_file, verbose:
        Stored on the instance unchanged.
    prior_included, prior_excluded:
        Stored on the instance; default to a fresh empty list.
    fit_kwargs, balance_kwargs, query_kwargs:
        Keyword dictionaries stored on the instance; default to a fresh
        empty dict. ``query_kwargs`` gets a ``"src_query_idx"`` entry added
        in place, so a dict passed by the caller is modified.
    logger:
        Existing logger to continue from. When given,
        ``_prepare_with_logger`` is called and ``start_from_logger`` is set.
    """
    super(BaseReview, self).__init__()

    self.X = X
    self.y = y
    if y is None:
        self.y = np.full(X.shape[0], NOT_AVAILABLE)

    # Default to Naive Bayes model
    if model is None:
        print("Warning: using naive Bayes model as default."
              "If you experience bad performance, read the documentation"
              " in order to implement a RNN based solution.")
        from asreview.models import create_nb_model
        model = create_nb_model()

    self.model = model
    self.query_strategy = query_strategy
    self.train_data = train_data_fn
    self.n_instances = n_instances
    self.n_queries = n_queries
    self.log_file = log_file
    self.verbose = verbose

    # Fresh containers per instance. The previous ``[]``/``{}`` defaults were
    # shared between instances, and the shared ``query_kwargs`` dict was
    # mutated below.
    self.prior_included = [] if prior_included is None else prior_included
    self.prior_excluded = [] if prior_excluded is None else prior_excluded
    self.fit_kwargs = {} if fit_kwargs is None else fit_kwargs
    self.balance_kwargs = {} if balance_kwargs is None else balance_kwargs
    self.query_kwargs = {} if query_kwargs is None else query_kwargs

    self.query_i = 0
    # ``np.int`` was only an alias of the builtin and has been removed from
    # NumPy; the builtin gives the same dtype.
    self.train_idx = np.array([], dtype=int)
    self.model_trained = False

    self.query_kwargs["src_query_idx"] = {}

    if logger is None:
        self._logger = Logger()
        self.start_from_logger = False
    else:
        self._logger = logger
        self._prepare_with_logger()
        self.start_from_logger = True

    # Initialize learner, but don't start training yet.
    self.learner = ActiveLearner(estimator=self.model,
                                 query_strategy=self.query_strategy)
# Every (row, column) coordinate pair of the `data` grid, one per row.
X_full = np.transpose(
    [np.tile(np.asarray(range(data.shape[0])), data.shape[1]),
     np.repeat(np.asarray(range(data.shape[1])), data.shape[0])]
)
# map the intensity values against the grid
y_full = np.asarray([data[P[0], P[1]] for P in X_full])

# The pool starts as a copy of the full data and shrinks while querying.
X_pool = deepcopy(X_full)
y_pool = deepcopy(y_full)

# assembling initial training set
initial_idx = [0, im_height-1, im_height*(im_height-1), -1, im_width//2 + im_height//2*im_height]
X_train, y_train = X_pool[initial_idx], y_pool[initial_idx]

# create an ActiveLearner instance
# (keywords follow the estimator=/X_training=/y_training= form used by the
# other ActiveLearner calls in this file instead of the legacy
# predictor=/X_initial=/y_initial= names)
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train, y_training=y_train
)
initial_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_width)

# 100 rounds: query one instance, teach it, drop it from the pool
n_queries = 100
for round_idx in range(n_queries):
    query_idx, query_inst = learner.query(X_pool)
    learner.teach(X_pool[query_idx].reshape(1, -1), y_pool[query_idx].reshape(-1, ))
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)

final_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_width)

# learning with randomly selected queries instead of active learning
random_idx = initial_idx + list(np.random.choice(range(len(X_full)), n_queries, replace=False))
X_train, y_train = X_full[initial_idx], y_full[initial_idx]
# initial training data: 100 random pixels
initial_idx = np.random.choice(range(len(X_pool)), size=100)

# initializing the learners
# (keywords follow the estimator=/X_training=/y_training= form used by the
# other ActiveLearner calls in this file instead of the legacy
# predictor=/X_initial=/y_initial= names)
n_learners = 3
learner_list = []
for _ in range(n_learners):
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        X_training=X_pool[initial_idx], y_training=y_pool[initial_idx],
        bootstrap_init=True
    )
    learner_list.append(learner)

# assembling the Committee
committee = Committee(learner_list)

# ensemble active learner from the Committee
ensemble_learner = ActiveLearner(
    estimator=committee
)

query_idx, query_instance = ensemble_learner.query(X_pool)

# ...
# ... obtain label from the Oracle ...
# ...

ensemble_learner.teach(X_pool[query_idx], y_pool[query_idx], bootstrap=True)
# Pool-based active learning on the iris dataset with a 3-NN classifier.

# loading the iris dataset
iris = load_iris()

# initial training data
train_idx = [0, 50, 100]
X_train = iris['data'][train_idx]
y_train = iris['target'][train_idx]

# generating the pool
X_pool = np.delete(iris['data'], train_idx, axis=0)
y_pool = np.delete(iris['target'], train_idx)

# initializing the active learner
# (keywords follow the estimator=/X_training=/y_training= form used by the
# other ActiveLearner calls in this file instead of the legacy
# predictor=/X_initial=/y_initial= names)
learner = ActiveLearner(
    estimator=KNeighborsClassifier(n_neighbors=3),
    X_training=X_train, y_training=y_train
)

# pool-based sampling
n_queries = 20
for idx in range(n_queries):
    query_idx, query_instance = learner.query(X_pool)
    learner.teach(
        X=X_pool[query_idx].reshape(1, -1),
        y=y_pool[query_idx].reshape(1, )
    )
    # remove queried instance from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)
def ProactiveLearning(train_data, test_data, budget, cost_non_unif, cost_ratio, class_num, mode):
    '''
    Create an active learner with proactive query strategy and run the
    active learner on the given data set

    train_data: the train data, which is used to do proactive learning
    test_data: the held-out test data, which is used to compute
        classification accuracy
    budget: total amount of prices that is allowed to pay
    cost_non_unif: the "price list" of the variable cost oracle
    cost_ratio: defined as mean(cost_non_unif) / cost_unif
    class_num: the number of classes of the data set, for this data, it is 10
        (not used inside this function)
    mode: 'proactive' for proactive learning query strategy,
        'uniform' for only querying from the uniform cost oracle
        'random' for randomly querying an instance from a randomly
        selected oracle

    Both data frames must have a 'Label' column; all other columns are
    used as features. The initial labeled sample is read from
    './initial_labeled_sample.npy', with the label in its last column.

    Return: a list of accuracies, a list of cumulative costs, a list of
    queried oracles, each element corresponds to each iteration.
    '''
    # cost of uniform cost oracle (a single scalar price)
    cost_unif = np.mean(cost_non_unif) / cost_ratio

    # use SVM classifier
    clf = svm.SVC(gamma='scale', decision_function_shape='ovo', probability=True)

    # load the initial, free labeled data
    initial_labeled_data = np.load('./initial_labeled_sample.npy')
    L_X = initial_labeled_data[:, :-1]
    L_y = initial_labeled_data[:, -1].reshape(-1, 1)

    # create an active learner with proactive learning strategy
    learner = ActiveLearner(estimator=clf,
                            query_strategy=ProactiveQuery,
                            X_training=L_X,
                            y_training=L_y)

    # Initially, the unlabeled pool of data is the entire train data.
    # The feature mask is taken from each frame's own columns rather than
    # from a `data` object that is not defined in this function.
    UL_X = train_data.iloc[:, train_data.columns != 'Label'].values
    UL_y = train_data['Label'].values
    test_X = test_data.iloc[:, test_data.columns != 'Label'].values
    test_y = test_data['Label'].values

    accuracy = []
    total_cost = [0]  # here a dummy cost of 0 is added for convenience
    oracle = []

    while total_cost[-1] < budget and UL_X.shape[0] != 0:
        # The query strategy returns the pool index to label and the oracle
        # to ask.
        x_star, k_star = learner.query(L_X, L_y, UL_X, UL_y, cost_non_unif,
                                       cost_unif, mode)
        learner.teach(UL_X[x_star:x_star + 1],
                      UL_y[x_star:x_star + 1].reshape(-1, 1))

        score = learner.score(test_X, test_y)
        accuracy.append(score)
        oracle.append(k_star)

        if (k_star == 1):
            total_cost.append(total_cost[-1] + cost_unif)
        else:
            # query from non_uninf oracle
            total_cost.append(total_cost[-1] + cost_non_unif[x_star])

        # Add x_star to L_X, L_Y
        L_X = np.append(L_X, UL_X[x_star:x_star + 1], axis=0)
        L_y = np.append(L_y, UL_y[x_star].reshape(-1, 1), axis=0)

        # Delete x_star from UL_X, UL_y and the per-instance price list.
        # cost_unif is a scalar, so there is nothing to delete from it
        # (np.delete along axis 0 of a 0-d value is an error in current
        # NumPy).
        UL_X = np.delete(UL_X, x_star, 0)
        UL_y = np.delete(UL_y, x_star, 0)
        cost_non_unif = np.delete(cost_non_unif, x_star, 0)

    return accuracy, total_cost[1:], oracle
initial_idx, axis=0) with plt.style.context('seaborn-white'): plt.figure(figsize=(10, 10)) plt.scatter(X[:, 0], X[:, 1], c='k', s=20) plt.scatter(X[y[:, 0] == 1, 0], X[y[:, 0] == 1, 1], facecolors='none', edgecolors='b', s=50, linewidths=2, label='class 1') plt.scatter(X[y[:, 1] == 1, 0], X[y[:, 1] == 1, 1], facecolors='none', edgecolors='r', s=100, linewidths=2, label='class 2') plt.legend() plt.show() learner = ActiveLearner(estimator=OneVsRestClassifier( SVC(probability=True, gamma='auto')), query_strategy=avg_score, X_training=X_initial, y_training=y_initial) query_idx, query_inst = learner.query(X_pool) learner.teach(X_pool[query_idx], y_pool[query_idx])
def prepare_learner():
    """Return an ActiveLearner with no training data yet.

    The learner wraps a default RandomForestClassifier and queries with
    uncertainty_batch_sampling preset to BATCH_SIZE instances per query.
    """
    return ActiveLearner(
        estimator=RandomForestClassifier(),
        query_strategy=partial(uncertainty_batch_sampling, n_instances=BATCH_SIZE),
    )
# Initial training set; x_new/y_new start out as the same selection.
x_train = x[training_indices]
y_train = y[training_indices]
x_new = x[training_indices]
y_new = y[training_indices]

# Isolate the non-training examples we'll be querying.
x_pool = np.delete(x, training_indices, axis=0)
y_pool = np.delete(y, training_indices, axis=0)

#'''
# Two candidate estimators are built; only classifier1 is passed to the
# learner in this fragment.
classifier1 = RandomForestClassifier(n_estimators=50, n_jobs=-1, max_depth=50)
classifier2 = KNeighborsClassifier(n_neighbors=3)

learner = ActiveLearner(estimator=classifier1, X_training=x_train, y_training=y_train)

# Baseline: predictions and accuracy on the full data before any query.
predictions = learner.predict(x)
is_correct = (predictions == y)

unqueried_score = learner.score(x, y)
print('Accuracy after first 1000 random rows: {acc:0.4f}%'.format(
    acc=unqueried_score * 100))
performance_history = [unqueried_score]
count = 1

# Query in batches of 1000 until the last recorded accuracy reaches 90%.
while (float(performance_history[-1] * 100) < 90):
    queryList = []
    query_index, query_instance = learner.query(x_pool, n_instances=1000)
    # NOTE(review): query_index indexes x_pool, while training_indices
    # indexes x — confirm that mixing the two index spaces is intended.
    training_indices = np.concatenate([training_indices, query_index])
    x_temp, y_temp = x_pool[query_index], y_pool[query_index]
[np.tile(np.asarray(range(data.shape[0])), data.shape[1]), np.repeat(np.asarray(range(data.shape[1])), data.shape[0])] ) # map the intensity values against the grid y_pool = np.asarray([data[P[0], P[1]] for P in X_pool]) # initial training data: 1000 random pixels initial_idx = np.random.choice(range(len(X_pool)), size=1000) # initializing the learners n_learners = 3 learner_list = [] for _ in range(n_learners): learner = ActiveLearner( predictor=KNeighborsClassifier(n_neighbors=10), X_initial=X_pool[initial_idx], y_initial=y_pool[initial_idx], bootstrap_init=True ) learner_list.append(learner) # assembling the Committee committee = Committee(learner_list) # visualizing every learner in the Committee with plt.style.context('seaborn-white'): plt.figure(figsize=(7*n_learners, 7)) for learner_idx, learner in enumerate(committee): plt.subplot(1, n_learners, learner_idx+1) plt.imshow(learner.predict(X_pool).reshape(im_height, im_width)) plt.title('Learner no. %d' % (learner_idx + 1)) plt.show()
X_pool = np.transpose([ np.tile(np.asarray(range(data.shape[0])), data.shape[1]), np.repeat(np.asarray(range(data.shape[1])), data.shape[0]) ]) # map the intensity values against the grid y_pool = np.asarray([data[P[0], P[1]] for P in X_pool]) # initial training data: 1000 random pixels initial_idx = np.random.choice(range(len(X_pool)), size=1000) # initializing the learners n_learners = 3 learner_list = [] for _ in range(n_learners): learner = ActiveLearner(estimator=KNeighborsClassifier(n_neighbors=10), X_training=X_pool[initial_idx], y_training=y_pool[initial_idx], bootstrap_init=True) learner_list.append(learner) # assembling the Committee committee = Committee(learner_list) # visualizing every learner in the Committee with plt.style.context('seaborn-white'): plt.figure(figsize=(7 * n_learners, 7)) for learner_idx, learner in enumerate(committee): plt.subplot(1, n_learners, learner_idx + 1) plt.imshow(learner.predict(X_pool).reshape(im_height, im_width)) plt.title('Learner no. %d' % (learner_idx + 1)) plt.show()
size=n_initial, replace=False) X_initial = X_train[initial_idx] y_initial = y_train[initial_idx] # generate the pool # remove the initial data from the training dataset X_pool = np.delete(X_train, initial_idx, axis=0) y_pool = np.delete(y_train, initial_idx, axis=0) """ Training the ActiveLearner """ # initialize ActiveLearner learner = ActiveLearner(estimator=classifier, X_training=X_initial, y_training=y_initial, verbose=1) # the active learning loop n_queries = 10 for idx in range(n_queries): query_idx, query_instance = learner.query(X_pool, n_instances=100, verbose=0) print(query_idx) learner.teach(X=X_pool[query_idx], y=y_pool[query_idx], only_new=True, verbose=1) # remove queried instance from pool X_pool = np.delete(X_pool, query_idx, axis=0)
def get_AL_predict(test_feature, choose_feature, unlabel_feature, test_query, choose_query, choose_answer,
                   unlabel_query, unlabel_answer, rec_api_test, rec_api_choose, rec_api_unlabel, w2v, idf):
    """Grow the feedback set by uncertainty sampling, then score test data.

    A logistic-regression learner is fitted on the current feedback data
    (the ``choose_*`` arguments). While unlabeled queries remain, up to 100
    rounds pick the most uncertain unlabeled row, teach it, and move the
    whole query it belongs to (10 consecutive rows per query) from the
    ``unlabel_*`` lists to the ``choose_*`` lists. The learner is then
    rebuilt on the enlarged feedback set and used to score the test rows.

    Side effects: ``choose_query``, ``choose_answer``, ``rec_api_choose``
    and ``choose_feature`` are extended in place; ``unlabel_query``,
    ``unlabel_answer``, ``rec_api_unlabel`` and ``unlabel_feature`` have the
    moved entries deleted in place.

    Returns
    -------
    tuple
        ``(predict, X, new_X_feedback, new_y_feedback)``: the positive-class
        probability of the first 400 test rows, the test feature matrix, and
        the feedback training data the final learner was fitted on. When
        ``unlabel_query`` is empty the last two are the initial feedback
        data.
    """
    rows_per_query = 10

    unlabel_feedback_info = feedback.get_feedback_inf(unlabel_query, choose_query, choose_answer,
                                                      rec_api_unlabel, w2v, idf)
    label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer,
                                                    rec_api_choose, w2v, idf)
    X_train, y_train = get_active_data(unlabel_feedback_info, unlabel_feature)
    X_feedback, y_feedback = get_active_data(label_feedback_info, choose_feature)

    # initializing the active learner
    learner = ActiveLearner(
        estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=X_feedback, y_training=y_feedback
    )

    # With nothing to query, the feedback data stays as it is. Without these
    # defaults the return statement failed on unbound names in that case.
    new_X_feedback, new_y_feedback = X_feedback, y_feedback
    predict = []

    if len(unlabel_query) > 0:
        # pool-based sampling
        n_queries = 100
        for _ in range(n_queries):
            query_idx, query_instance = uncertainty_sampling(classifier=learner, X=X_train)
            # Position of the query that owns the selected row.
            idx = int(query_idx/10)
            start = idx * rows_per_query
            stop = start + rows_per_query

            learner.teach(
                X=X_train[query_idx].reshape(1, -1),
                y=y_train[query_idx].reshape(1, )
            )

            # add queried instance into FR
            choose_query.append(unlabel_query[idx])
            choose_answer.append(unlabel_answer[idx])
            rec_api_choose.extend(rec_api_unlabel[start:stop])
            choose_feature.extend(unlabel_feature[start:stop])

            # remove queried instance from pool (all rows of that query,
            # in one copy instead of ten single-row deletions)
            X_train = np.delete(X_train, slice(start, stop), axis=0)
            y_train = np.delete(y_train, slice(start, stop))
            del unlabel_query[idx]
            del unlabel_answer[idx]
            del rec_api_unlabel[start:stop]
            del unlabel_feature[start:stop]

            if len(X_train) == 0:
                break

        # Rebuild the learner on the enlarged feedback set.
        add_label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer,
                                                            rec_api_choose, w2v, idf)
        new_X_feedback, new_y_feedback = get_active_data(add_label_feedback_info, choose_feature)
        learner = ActiveLearner(
            estimator=LogisticRegression(penalty='l1', solver='liblinear'),
            X_training=new_X_feedback, y_training=new_y_feedback
        )

    feedback_info = feedback.get_feedback_inf(test_query, choose_query, choose_answer, rec_api_test, w2v, idf)
    X = split_data.get_test_feature_matrix(feedback_info, test_feature)
    X_test = np.array(X)

    # Use the model trained on the feedback data to predict the test data.
    # The row count is fixed at 400, so X_test needs at least 400 rows.
    for row in range(400):
        y_pre = learner.predict_proba(X=X_test[row].reshape(1, -1))
        predict.append(float(y_pre[0, 1]))

    return predict, X, new_X_feedback, new_y_feedback
def al_qbc_proba(self, data, target, X_train, y_train, X_full, y_full, train_idx, committee_strategy, proba):
    """Run query-by-committee active learning with randomly flipped labels.

    Two random-forest learners are trained on ``(X_train, y_train)`` and
    combined into a ``Committee`` using ``committee_strategy``.  For
    ``self.query_number`` rounds the committee queries one pool sample; the
    label handed to the committee is flipped (1 -> 0, anything else -> 1)
    whenever ``randint(0, 100) <= proba``.  After every round
    ``self.performance_measure`` is evaluated on ``(X_full, y_full)``.

    ``data`` and ``target`` are accepted but not used.

    Returns:
        tuple: four lists ``(acc, pre, rec, fs)`` with one entry per query
        round (accuracy, precision, recall, f-score as returned by
        ``self.performance_measure``).
    """
    acc = []
    pre = []
    rec = []
    fs = []

    # Fix: the original removed ``train_idx`` from the pool inside the
    # member loop, i.e. once per committee member.  On the second pass the
    # same positions pointed at different (unlabeled) rows, which were
    # silently dropped from the pool or triggered an IndexError.
    X_pool = np.delete(deepcopy(X_full), train_idx, axis=0)
    y_pool = np.delete(deepcopy(y_full), train_idx)

    # initializing Committee members
    n_members = 2
    learner_list = []
    for _ in range(n_members):
        learner = ActiveLearner(
            estimator=RandomForestClassifier(),
            X_training=X_train, y_training=y_train)
        learner_list.append(learner)

    # assembling the committee
    committee = Committee(learner_list=learner_list, query_strategy=committee_strategy)

    n_queries = self.query_number
    for _ in range(n_queries):
        query_idx, _query_instance = committee.query(X_pool)
        labeled_y = y_pool[query_idx].reshape(1, )

        # Label noise.  NOTE(review): whether this is exactly ``proba``
        # percent depends on which ``randint`` is imported (inclusive or
        # exclusive upper bound) -- confirm.
        if randint(0, 100) <= proba:
            flipped = 0 if y_pool[query_idx][0] == 1 else 1
            # The original also wrote the flipped value through
            # ``y_pool[query_idx][0]``; that targets a temporary copy and
            # never changed ``y_pool``, so it is dropped.
            labeled_y = np.array((flipped)).reshape(1, )

        committee.teach(X=X_pool[query_idx].reshape(1, -1), y=labeled_y)

        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)

        # NOTE(review): as in the original, the metrics are computed for the
        # last committee member (``learner``), not for ``committee``.
        precision, recall, fscore, support, accuracy = self.performance_measure(
            learner, X_full, y_full)
        learner_score = accuracy
        acc.append(learner_score)
        pre.append(precision)
        rec.append(recall)
        fs.append(fscore)
        print('%0.3f' % (learner_score), end=",")

    return acc, pre, rec, fs
# NOTE(review): this excerpt starts in the middle of a call; the line below
# closes an expression whose beginning is not visible here.
size=int(n_labeled_examples * percent))
# Initial labeled set: the rows selected by ``training_indices``.
X_train = X_raw[training_indices]
y_train = y_raw[training_indices]
# Isolate the non-training examples we'll be querying.
X_pool = np.delete(X_raw, training_indices, axis=0)
y_pool = np.delete(y_raw, training_indices, axis=0)
from sklearn.neighbors import KNeighborsClassifier
from modAL.models import ActiveLearner
# Specify our core estimator along with its active learning model.
# NOTE(review): ``knn`` is created but the learner below is built around a
# RandomForestClassifier instead.
knn = KNeighborsClassifier(n_neighbors=3)
learner = ActiveLearner(estimator=RandomForestClassifier(), query_strategy=uncertainty_sampling, X_training=X_train, y_training=y_train)
# Isolate the data we'll need for plotting.
predictions = learner.predict(X_raw)
is_correct = (predictions == y_raw)
# Record our learner's score on the raw data.
unqueried_score = learner.score(X_raw, y_raw)
# Plot our classification results.
# NOTE(review): the triple-quoted string opened below is not closed within
# this excerpt; it appears to hold disabled plotting code.
''' fig, ax = plt.subplots(figsize=(8.5, 6), dpi=130) ax.scatter(x=x_component[is_correct], y=y_component[is_correct], c='g', marker='+', label='Correct', alpha=8/10) ax.scatter(x=x_component[~is_correct], y=y_component[~is_correct], c='r', marker='x', label='Incorrect', alpha=8/10) ax.legend(loc='lower right')
def _evaluate_checkpoint(model_path, x_test, y_test, poll_delay, max_polls=10):
    """Load the checkpoint at *model_path*, evaluate it, delete the file.

    The file is polled up to ``max_polls + 1`` times, sleeping ``poll_delay``
    seconds between polls.  Returns the value of ``model.evaluate`` or
    ``None`` when the file never appeared.
    """
    for attempt in range(max_polls + 1):
        if os.path.exists(model_path):
            saved_model = load_model(model_path)
            score = saved_model.evaluate(x_test, y_test, verbose=0)
            os.remove(model_path)
            return score
        if attempt == 0:
            print('Waiting h5 model file...')
        if attempt < max_polls:
            time.sleep(poll_delay)
    return None


def mnist_cnn(nr_of_labeled_examples=60000, verbose=0):
    """Compare random sampling with three active-learning strategies.

    First a model from ``create_model()`` is trained on
    ``nr_of_labeled_examples`` samples returned by ``load_proc_data`` and the
    best checkpoint is evaluated.  Then, for uncertainty, entropy and margin
    sampling, an ``ActiveLearner`` starts from a random tenth of that budget
    and queries another tenth in each of 9 rounds, evaluating the best
    checkpoint after every round.  All results are appended to
    ``result_<nr_of_labeled_examples>.txt``.

    Args:
        nr_of_labeled_examples: labeling budget; must be in [100, 60000] and
            divisible by 10 (checked with ``assert``).
        verbose: ``2`` prints dataset shapes, ``>= 1`` prints the test
            loss/accuracy of the random-sampling baseline.

    Returns:
        None.
    """
    assert (nr_of_labeled_examples >= 100 and nr_of_labeled_examples <= 60000 and nr_of_labeled_examples % 10 == 0), \
        "Number of labeled example should be between 100 and 60000 and be dividible by 10"

    batch_size = 128
    epochs = 100
    model_path = 'best_model.h5'
    result_path = f'result_{nr_of_labeled_examples}.txt'
    msg = 'error, check in logs'

    (X_train, y_train), (x_test, y_test) = load_proc_data(nr_of_labeled_examples)
    if verbose == 2:
        print('X_train shape:', X_train.shape)
        print(X_train.shape[0], 'train samples')
        print(x_test.shape[0], 'test samples')

    model = create_model()
    # added early stopping to avoid training when it's not progressing
    es = EarlyStopping(monitor='val_accuracy', mode='max', min_delta=0.0001, patience=20, verbose=1,
                       restore_best_weights=True)
    mc = ModelCheckpoint(model_path, monitor='val_accuracy', mode='max', save_best_only=True)
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1,
              validation_data=(x_test, y_test), callbacks=[mc, es])

    score = _evaluate_checkpoint(model_path, x_test, y_test, poll_delay=0.0001)
    with open(result_path, 'a') as f:
        if score is None:
            print(f'Random sampling - Training on {nr_of_labeled_examples} samples\n\tVal. accuracy: {msg}', file=f)
        else:
            print(f'Random sampling - Training on {nr_of_labeled_examples} samples\n\tVal. accuracy: ',
                  '%.5f' % score[1], file=f)

    # Fix: ``score`` was unbound here when the checkpoint never appeared.
    if verbose >= 1 and score is not None:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])

    # Active learning
    sampling_methods = [uncertainty_sampling, entropy_sampling, margin_sampling]
    for method in sampling_methods:
        with open(result_path, 'a') as f:
            print(f'{method.__name__} - # trained samples - Val. accuracy', file=f)

        (X_train, y_train), (x_test, y_test) = load_proc_data()
        segment = int(nr_of_labeled_examples / 10)

        # create the classifier
        classifier = KerasClassifier(create_model)

        # assemble initial data
        initial_idx = np.random.choice(range(len(X_train)), size=segment, replace=False)
        X_initial = X_train[initial_idx]
        y_initial = y_train[initial_idx]

        # Fix: the initial samples stayed in the query pool, so they could
        # be queried and trained on again while the log counted them as new.
        X_train = np.delete(X_train, initial_idx, axis=0)
        y_train = np.delete(y_train, initial_idx, axis=0)

        # initialize ActiveLearner
        learner = ActiveLearner(
            estimator=classifier,
            query_strategy=method,
            X_training=X_initial, y_training=y_initial,
            verbose=1
        )

        # the active learning loop
        n_queries = 9
        only_new = False  # TODO: maybe learn on all data from the beginning, test!
        for idx in range(n_queries):
            n_trained = (idx + 2) * segment
            model_path_al = f'best_model_al_{method.__name__}_{n_trained}.h5'
            mc_al = ModelCheckpoint(model_path_al, monitor='val_accuracy', mode='max', save_best_only=True)
            print('Query no. %d' % (idx + 1))

            # TODO: n_instances param, get it here somehow, or do the process for n times
            query_idx, _ = learner.query(X_train, n_instances=segment, verbose=0)
            learner.teach(
                X=X_train[query_idx], y=y_train[query_idx], only_new=only_new,
                batch_size=batch_size, epochs=epochs, verbose=1,
                validation_data=(x_test, y_test), callbacks=[mc_al, es]
            )

            score_al = _evaluate_checkpoint(model_path_al, x_test, y_test, poll_delay=0.01)
            with open(result_path, 'a') as f:
                if score_al is None:
                    print(f' {n_trained} \t\t{msg}', file=f)
                else:
                    print(f' {n_trained} \t\t', '%.5f' % score_al[1], file=f)

            # remove queried instance from pool
            X_train = np.delete(X_train, query_idx, axis=0)
            y_train = np.delete(y_train, query_idx, axis=0)

        with open(result_path, 'a') as f:
            print('\n', file=f)
# NOTE(review): this excerpt starts by closing an expression (presumably the
# construction of ``X_full``) whose beginning is not visible here.
])
# map the intensity values against the grid
y_full = np.asarray([data[P[0], P[1]] for P in X_full])
# Work on copies so the full data set stays available for the predictions.
X_pool = deepcopy(X_full)
y_pool = deepcopy(y_full)
# assembling initial training set
# (five hand-picked positions computed from im_height / im_width)
initial_idx = [
    0, im_height - 1, im_height * (im_height - 1), -1,
    im_width // 2 + im_height // 2 * im_height
]
X_train, y_train = X_pool[initial_idx], y_pool[initial_idx]
# create an ActiveLearner instance
learner = ActiveLearner(estimator=RandomForestClassifier(), X_training=X_train, y_training=y_train)
# Probability of class 1 for every grid point, reshaped to the image shape.
initial_prediction = learner.predict_proba(X_full)[:, 1].reshape(
    im_height, im_width)
n_queries = 100
# Pool-based loop: query one sample, teach it, remove it from the pool.
for round_idx in range(n_queries):
    query_idx, query_inst = learner.query(X_pool)
    learner.teach(X_pool[query_idx].reshape(1, -1), y_pool[query_idx].reshape(-1, ))
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)
# Same probability map as above, after the queries.
final_prediction = learner.predict_proba(X_full)[:, 1].reshape(
    im_height, im_width)
def active_learn(df1, first_item_index_of_each_category):
    """Stream-based active learning on *df1* with a random forest.

    The learner starts from the rows listed in
    ``first_item_index_of_each_category``.  Random rows are then drawn from
    the full data; a row is taught to the learner only when
    ``classifier_uncertainty`` for it is at least 0.4.  The loop ends when
    accuracy on the full data reaches 0.90 or after the query counter hits
    505.  The accuracy is recorded at every 100th query and the recorded
    values are passed to ``percentage_increase``.

    Args:
        df1: DataFrame with a ``'label'`` column; per the original author's
            note the first column is the label and the remaining columns are
            the features.
        first_item_index_of_each_category: row positions of the initial
            training samples.
    """
    train_idx = first_item_index_of_each_category

    X_full = df1.values[:, 1:]
    y_full = df1['label'].values

    # initializing the active learner
    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            X_training=X_full[train_idx],
                            y_training=y_full[train_idx])

    # The score only changes when the learner is taught, so it is measured
    # once here and once after every teach() instead of on every draw.
    learner_score = learner.score(X_full, y_full)
    print('Initial prediction accuracy: %f' % learner_score)

    index = 0
    performance_array = []
    # learning until the accuracy reaches a given threshold
    # NOTE(review): nothing bounds the number of draws when no row is
    # uncertain enough -- same as the original.
    while learner_score < 0.90:
        stream_idx = np.random.choice(range(len(X_full)))
        sample = X_full[stream_idx].reshape(1, -1)
        if classifier_uncertainty(learner, sample) < 0.4:
            continue

        learner.teach(sample, y_full[stream_idx].reshape(-1, ))
        learner_score = learner.score(X_full, y_full)
        print('Item no. %d queried, new accuracy: %f' % (stream_idx, learner_score))

        # NOTE(review): indentation was lost in this copy of the file; the
        # counter is assumed to count queries, not draws.
        if index == 505:
            break
        if index % 100 == 0:
            performance_array.append(learner_score)
        index = index + 1

    percentage_increase(performance_array)
# Drop the animal name before clustering.
train_k = train_k.drop(['animal_name'], axis=1)
print(train_k)

# Plot the pairwise correlations of the remaining columns as a heatmap.
f, ax = plt.subplots(figsize=(12, 8))
corr = train_k.corr()
hm = sns.heatmap(round(corr, 2), annot=True, ax=ax, cmap="summer", fmt='.2f')
f.subplots_adjust(top=.94)
t = f.suptitle('Zoo animals Heatmap', fontsize=16)

kmeans = KMeans(n_clusters=7, max_iter=10000)

# Fix: ``drop(labels, 1)`` passed ``axis`` positionally, which pandas >= 2.0
# rejects; the keyword form works on every pandas version.
X = np.array(train_k.drop(["class_type"], axis=1).astype(float))
Y = np.array(train_k["class_type"])

learner = ActiveLearner(estimator=kmeans, X_training=X, y_training=Y)
predictions = learner.predict(X_test)

X_pool = np.array(test_k.drop(["class_type"], axis=1).astype(float))
# NOTE(review): the pool labels are shifted by -1 while ``Y`` above is not --
# confirm which label range is intended.
y_pool = np.array(test_k["class_type"]) - 1

# Teach randomly chosen pool samples one at a time.
for index in range(N_Queries[0]):
    query_index = random.randrange(0, len(X_pool))
    x, y = X_pool[query_index].reshape(1, -1), y_pool[query_index].reshape(1, )
    learner.teach(X=x, y=y)
    X_pool, y_pool = np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index)
    # NOTE(review): indentation was lost in this copy of the file; the score
    # is assumed to be reported after every query.
    model_accuracy = learner.score(X, Y)
    print('Accuracy: {acc:0.4f} \n'.format(acc=model_accuracy))
def review(self):
    """Run the active-learning review loop.

    Starting from the prior knowledge returned by ``_prior_knowledge``, the
    loop repeatedly trains ``self.learner`` on the labeled indices, queries
    new indices from the pool, labels them through ``_classify`` and logs
    labels and probabilities, until ``_stop_iter`` says to stop.
    """
    # create the pool and training indices.
    n_samples = self.X.shape[0]
    pool_idx = np.arange(n_samples)

    # add prior knowledge
    init_idx, init_labels = self._prior_knowledge()
    self.y[init_idx] = init_labels

    # remove the initial sample from the pool
    pool_idx = np.delete(pool_idx, init_idx)

    # Initialize learner, but don't start training yet.
    self.learner = ActiveLearner(estimator=self.model,
                                 query_strategy=self.query_strategy)

    query_i = 0
    train_idx = init_idx.copy()
    # The first "query" that gets logged is the prior knowledge itself.
    query_idx = train_idx

    self._logger.add_labels(self.y)

    # ``_stop_iter`` receives ``query_i - 1``, i.e. the index of the query
    # round that has just finished (-1 before the first round).
    while not self._stop_iter(query_i - 1, pool_idx):
        self._logger.add_training_log(query_idx, self.y[query_idx])

        # Get the training data.
        X_train, y_train = self.train_data(self.X, self.y, train_idx,
                                           **self.balance_kwargs)

        # validation_data(self.X[pool_idx], self.y[pool_idx],
        #                 self.fit_kwargs, ratio=1)

        # Train the model on the training data.
        self.learner.teach(X=X_train, y=y_train, only_new=True,
                           **self.fit_kwargs)

        # Make a query from the pool.
        query_idx, _ = self.learner.query(X=self.X,
                                          pool_idx=pool_idx,
                                          n_instances=min(
                                              self.n_instances,
                                              len(pool_idx)),
                                          query_kwargs=self.query_kwargs)

        # Log the probabilities of samples in the pool being included.
        # Probabilities found under 'pred_proba' in query_kwargs are reused;
        # otherwise they are computed here.
        pred_proba = self.query_kwargs.get('pred_proba', [])
        if len(pred_proba) == 0:
            pred_proba = self.learner.predict_proba(self.X[pool_idx])
        self._logger.add_proba(pool_idx, pred_proba)

        # Log the probabilities of samples that were trained.
        pred_proba_train = self.learner.predict_proba(self.X[train_idx])
        self._logger.add_proba(train_idx, pred_proba_train,
                               logname="train_proba")

        # Classify the queried papers.
        self.y[query_idx] = self._classify(query_idx)
        self._logger.add_labels(self.y)

        # Update training/pool indices
        train_idx = np.append(train_idx, query_idx)
        pool_idx = np.delete(np.arange(n_samples), train_idx, axis=0)

        # update the query counter
        query_i += 1

    # Save the result to a file
    # NOTE(review): indentation was lost in this copy of the file; saving is
    # assumed to happen once, after the loop -- confirm.
    if self.log_file:
        self.save_logs(self.log_file)
        if self.verbose:
            print(f"Saved results in log file: {self.log_file}")
# NOTE(review): this excerpt starts in the middle of a call; the line below
# closes an expression whose beginning is not visible here.
s=50)
plt.title('The iris dataset')
plt.show()
# initial training data
train_idx = [0, 50, 100]  # indices of the initial training-set elements
X_train = iris['data'][train_idx]
y_train = iris['target'][train_idx]
# generating the pool
# (everything except the three initial training rows)
X_pool = np.delete(iris['data'], train_idx, axis=0)
y_pool = np.delete(iris['target'], train_idx)
# initializing the active learner
learner = ActiveLearner(estimator=KNeighborsClassifier(n_neighbors=3),
                        X_training=X_train,
                        y_training=y_train)
# visualizing initial prediction
# (points are placed at the ``pca`` coordinates and coloured by prediction)
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict(iris['data'])
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    plt.title('Initial accuracy: %f' % learner.score(iris['data'], iris['target']))
    plt.show()
print('Accuracy before active learning: %f' % learner.score(iris['data'], iris['target']))
# pool-based sampling
n_labeled_examples = X_raw.shape[0] training_indices = np.random.randint(low=0, high=n_labeled_examples + 1, size=3) X_train = X_raw[training_indices] y_train = y_raw[training_indices] # Isolate the non-training examples we'll be querying. X_pool = np.delete(X_raw, training_indices, axis=0) y_pool = np.delete(y_raw, training_indices, axis=0) # Specify our core estimator along with it's active learning model. knn = KNeighborsClassifier(n_neighbors=3) learner = ActiveLearner(estimator=knn, X_training=X_train, y_training=y_train) predictions = learner.predict(X_raw) is_correct = (predictions == y_raw) unqueried_score = learner.score(X_raw, y_raw) # Plot our classification results. fig, ax = plt.subplots(figsize=(8.5, 6), dpi=130) ax.scatter(x=x_component[is_correct], y=y_component[is_correct], c='g', marker='+', label='Correct', alpha=8 / 10) ax.scatter(x=x_component[~is_correct],
# Noisy sine data: 200 distinct x positions drawn from a grid on [0, 20].
X = np.random.choice(
    np.linspace(0, 20, 10000), size=200, replace=False
).reshape(-1, 1)
y = np.sin(X) + np.random.normal(scale=0.3, size=X.shape)

# Draw the initial training points without replacement.
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_initial = X[initial_idx]
y_initial = y[initial_idx]

# Gaussian-process kernel: smooth RBF component plus a white-noise term.
kernel = (
    RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))
    + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))
)

# Active learner wrapping the Gaussian-process regressor.
regressor = ActiveLearner(
    predictor=GaussianProcessRegressor(kernel=kernel),
    query_strategy=GP_regression_std,
    X_initial=X_initial.reshape(-1, 1),
    y_initial=y_initial.reshape(-1, 1),
)

# Plot the initial estimate: mean curve, +/- one std band and the raw data.
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(14, 7))
    x = np.linspace(0, 20, 1000)
    pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
    plt.plot(x, pred)
    plt.fill_between(
        x,
        pred.reshape(-1, ) - std,
        pred.reshape(-1, ) + std,
        alpha=0.2,
    )
    plt.scatter(X, y, c='k')
    plt.title('Initial estimation based on %d points' % n_initial)
    plt.show()

# active learning
# Build the stream: one (row, column) coordinate pair per pixel of ``im``.
X_full = np.transpose([
    np.tile(np.asarray(range(im.shape[0])), im.shape[1]),
    np.repeat(np.asarray(range(im.shape[1])), im.shape[0]),
])

# Label of every coordinate pair is the image intensity at that position.
y_full = np.asarray([im[P[0], P[1]] for P in X_full])

# Random initial training set.
n_initial = 5
initial_idx = np.random.choice(range(len(X_full)), size=n_initial, replace=False)
X_train = X_full[initial_idx]
y_train = y_full[initial_idx]

# Learner that will be taught from the stream.
learner = ActiveLearner(
    predictor=RandomForestClassifier(),
    X_initial=X_train,
    y_initial=y_train,
)

""" The instances are randomly selected one by one, if an instance's uncertainty is above a threshold, the label is requested and shown to the learner. The process is continued until the learner reaches a previously defined accuracy. """

# Keep streaming until the accuracy threshold is reached.
while learner.score(X_full, y_full) < 0.7:
    stream_idx = np.random.choice(range(len(X_full)))
    if classifier_uncertainty(learner, X_full[stream_idx].reshape(1, -1)) >= 0.4:
        learner.teach(
            X_full[stream_idx].reshape(1, -1),
            y_full[stream_idx].reshape(-1, ),
        )
def learn(self):
    """Teach a committee randomly chosen pool samples and track its metrics.

    The first row of every distinct ``grades_round`` value in
    ``self.short_df`` seeds one ``ActiveLearner`` per model in
    ``self.learners``.  The learners form a ``Committee`` which is then taught
    ``self.percent`` percent of the data, one randomly drawn pool sample at a
    time.  After every sample the committee is scored on the remaining pool.

    Returns:
        tuple: ``(accuracy_list, f1_total_list, kappa_total_list)`` with one
        entry per taught sample.
    """
    # seeding: first occurrence of every class
    # NOTE(review): the index labels of short_df are used as row positions of
    # self.X / self.Y -- assumes a default 0..n-1 index; confirm.
    grades = self.short_df['grades_round']
    seed_index = [grades[grades == grade].index[0] for grade in grades.unique()]

    accuracy_list = []
    f1_total_list = []
    kappa_total_list = []

    # initialising
    X_train = self.X[seed_index]
    y_train = self.Y[seed_index]

    # generating the pool
    X_pool = np.delete(self.X, seed_index, axis=0)
    y_pool = np.delete(self.Y, seed_index)

    initiated_committee = [
        ActiveLearner(estimator=model, X_training=X_train, y_training=y_train)
        for model in self.learners
    ]

    # Committee creation
    committee = Committee(learner_list=initiated_committee)
    # NOTE(review): the seeds were already given to every learner above, so
    # this adds them a second time -- kept from the original.
    committee.teach(X_train, y_train)

    # pool-based sampling
    # Fix: the original used a bare ``X`` here, which is not defined in this
    # method.
    n_queries = int(len(self.X) / (100 / self.percent))
    for _ in range(n_queries):
        query_idx = np.random.choice(range(len(X_pool)))
        committee.teach(X=X_pool[query_idx].reshape(1, -1),
                        y=y_pool[query_idx].reshape(1, ))

        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)

        # One prediction pass feeds all three metrics.
        model_pred = committee.predict(X_pool)
        accuracy_list.append(accuracy_score(y_pool, model_pred))
        f1_total_list.append(
            f1_score(y_pool, model_pred, average="weighted",
                     labels=np.unique(model_pred)))
        kappa_total_list.append(cohen_kappa_score(y_pool, model_pred))

    return accuracy_list, f1_total_list, kappa_total_list
def active_learn(df1, first_item_index_of_each_category):
    """Query-by-committee active learning on *df1* with two random forests.

    Each committee member is initialised with 5 rows drawn at random from the
    pool (the drawn rows are removed from the pool).  The committee then
    performs 505 queries; after each one the committee accuracy on the full
    data is printed, and every 100th value is collected and passed to
    ``percentage_increase``.

    Args:
        df1: DataFrame with a ``'label'`` column; per the original author's
            note the first column is the label and the remaining columns are
            the features.
        first_item_index_of_each_category: kept for interface compatibility.
            The original built an initial training set from it but overwrote
            that set before it was used, so it has no effect.
    """
    X_full = df1.values[:, 1:]
    y_full = df1['label'].values

    X_pool = deepcopy(X_full)
    y_pool = deepcopy(y_full)

    # initializing Committee members
    n_members = 2
    n_initial = 5
    learner_list = []
    for _ in range(n_members):
        # initial training data
        train_idx = np.random.choice(range(X_pool.shape[0]), size=n_initial, replace=False)
        X_train = X_pool[train_idx]
        y_train = y_pool[train_idx]

        # creating a reduced copy of the data with the known instances removed
        X_pool = np.delete(X_pool, train_idx, axis=0)
        y_pool = np.delete(y_pool, train_idx)

        # initializing learner
        learner_list.append(ActiveLearner(
            estimator=RandomForestClassifier(),
            X_training=X_train, y_training=y_train
        ))

    # assembling the committee
    committee = Committee(learner_list=learner_list)
    print('Committee initial predictions, accuracy = %1.3f' % committee.score(X_full, y_full))

    performance_array = []
    n_queries = 505
    for idx in range(n_queries):
        query_idx, _query_instance = committee.query(X_pool)
        committee.teach(
            X=X_pool[query_idx].reshape(1, -1),
            y=y_pool[query_idx].reshape(1, )
        )

        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)

        learner_score = committee.score(X_full, y_full)
        print('Committee %d th query predictions, accuracy = %1.3f' % (idx, learner_score))
        if idx % 100 == 0:
            performance_array.append(learner_score)

    percentage_increase(performance_array)
# Build one learner per committee member; every member gets its own random
# initial rows, which are taken out of the shared pool.
for member_idx in range(n_members):
    n_initial = 5
    train_idx = np.random.choice(
        range(X_pool.shape[0]), size=n_initial, replace=False
    )
    X_train = X_pool[train_idx]
    y_train = y_pool[train_idx]

    # Shrink the pool by the rows this member now knows.
    X_pool = np.delete(X_pool, train_idx, axis=0)
    y_pool = np.delete(y_pool, train_idx)

    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        X_training=X_train,
        y_training=y_train,
    )
    learner_list.append(learner)

# Combine the members into the committee.
committee = Committee(learner_list=learner_list)

# visualizing the initial predictions
# with plt.style.context('seaborn-white'):
#     plt.figure(figsize=(n_members*7, 7))
#     for learner_idx, learner in enumerate(committee):
#         plt.subplot(1, n_members, learner_idx + 1)
#         plt.scatter(x=pca[:, 0], y=pca[:, 1], c=learner.predict(iris['data']), cmap='viridis', s=5)
#         plt.title('Learner no. %d initial predictions' % (learner_idx + 1))
#     plt.show()
# Build one learner per committee member; every member gets its own random
# initial rows, which are taken out of the shared pool.
learner_list = list()
for member_idx in range(n_members):
    n_initial = 5
    train_idx = np.random.choice(
        range(X_pool.shape[0]), size=n_initial, replace=False
    )
    X_train = X_pool[train_idx]
    y_train = y_pool[train_idx]

    # Shrink the pool by the rows this member now knows.
    X_pool = np.delete(X_pool, train_idx, axis=0)
    y_pool = np.delete(y_pool, train_idx)

    learner = ActiveLearner(
        predictor=RandomForestClassifier(),
        X_initial=X_train,
        y_initial=y_train,
    )
    learner_list.append(learner)

# Combine the members into the committee.
committee = Committee(learner_list=learner_list)

# One subplot per member: iris points at their ``pca`` coordinates, coloured
# by that member's prediction.
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(n_members*7, 7))
    for learner_idx, learner in enumerate(committee):
        plt.subplot(1, n_members, learner_idx + 1)
        plt.scatter(
            x=pca[:, 0],
            y=pca[:, 1],
            c=learner.predict(iris['data']),
            cmap='viridis',
            s=50,
        )
        plt.title('Learner no. %d initial predictions' % (learner_idx + 1))
    plt.show()
def al_rank(self, data, target, X_train, y_train, X_full, y_full, train_idx, N_RAW_SAMPLES=120, proba = 5):
    """Run batch-mode uncertainty sampling with randomly flipped labels.

    A random-forest ``ActiveLearner`` is trained on ``(X_train, y_train)`` and
    queries batches of 3 samples (``uncertainty_batch_sampling``) for
    ``N_RAW_SAMPLES // 3`` rounds.  In a round where
    ``randint(0, 100) <= proba`` every label of the batch is flipped
    (1 -> 0, anything else -> 1) before it is taught.  After every round the
    accuracy on the full data is printed and ``self.performance_measure`` is
    evaluated on ``(X_full, y_full)``.

    ``data`` and ``target`` are accepted but not used.

    Returns:
        tuple: four lists ``(acc, pre, rec, fs)`` with one entry per round
        (accuracy, precision, recall, f-score as returned by
        ``self.performance_measure``).
    """
    acc = []
    pre = []
    rec = []
    fs = []

    BATCH_SIZE = 3
    preset_batch = partial(uncertainty_batch_sampling, n_instances=BATCH_SIZE)
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        X_training=X_train, y_training=y_train,
        query_strategy=preset_batch
    )

    N_QUERIES = N_RAW_SAMPLES // BATCH_SIZE

    # Isolate the non-training examples we'll be querying.
    # Fix: the original removed three *random* rows from the pool (drawn
    # with an exclusive upper bound of n + 1, so the draw could also raise
    # IndexError) while the rows the learner was actually trained on stayed
    # queryable.  The pool now excludes ``train_idx``, consistent with
    # ``al_qbc_proba``.
    X_pool = np.delete(X_full, train_idx, axis=0)
    y_pool = np.delete(y_full, train_idx, axis=0)

    for index in range(N_QUERIES):
        query_index, _query_instance = learner.query(X_pool)

        # Teach our ActiveLearner model the records it has requested.
        X, y = X_pool[query_index], y_pool[query_index]
        labeled_y = y

        # Label noise.  NOTE(review): whether this is exactly ``proba``
        # percent depends on which ``randint`` is imported (inclusive or
        # exclusive upper bound) -- confirm.
        if randint(0, 100) <= proba:
            # Flip the whole batch; np.where keeps an integer dtype, whereas
            # the original appended to an empty float array.
            labeled_y = np.where(y == 1, 0, 1)

        learner.teach(X=X, y=labeled_y)

        # Remove the queried instances from the unlabeled pool.
        X_pool = np.delete(X_pool, query_index, axis=0)
        y_pool = np.delete(y_pool, query_index)

        # Calculate and report our model's accuracy.
        model_accuracy = learner.score(X_full, y_full)
        print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1, acc=model_accuracy))

        precision, recall, fscore, support, accuracy = self.performance_measure(learner, X_full, y_full)
        acc.append(accuracy)
        pre.append(precision)
        rec.append(recall)
        fs.append(fscore)

    return acc, pre, rec, fs
# Scatter the iris samples at their ``pca`` coordinates, coloured by target.
plt.scatter(x=pca[:, 0], y=pca[:, 1], c=iris['target'], cmap='viridis', s=50)
plt.title('The iris dataset')
plt.show()
# initial training data
train_idx = [0, 50, 100]
X_train = iris['data'][train_idx]
y_train = iris['target'][train_idx]
# generating the pool
# (everything except the three initial training rows)
X_pool = np.delete(iris['data'], train_idx, axis=0)
y_pool = np.delete(iris['target'], train_idx)
# initializing the active learner
learner = ActiveLearner(
    predictor=KNeighborsClassifier(n_neighbors=3),
    X_initial=X_train, y_initial=y_train
)
# visualizing initial prediction
# (same scatter as above, coloured by the learner's prediction)
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict(iris['data'])
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    plt.title('Initial accuracy: %f' % learner.score(iris['data'], iris['target']))
    plt.show()
print('Accuracy before active learning: %f' % learner.score(iris['data'], iris['target']))
# pool-based sampling
n_queries = 20
# NOTE(review): the loop body is cut off at the end of this excerpt.
for idx in range(n_queries):
def GP_regression_std(regressor, X):
    """Query strategy: select the sample with the largest predicted std.

    Returns the index of that sample in ``X`` together with the sample itself.
    """
    std = regressor.predict(X, return_std=True)[1]
    query_idx = np.argmax(std)
    return query_idx, X[query_idx]


# Noisy sine data: 200 distinct x positions drawn from a grid on [0, 20].
X = np.random.choice(
    np.linspace(0, 20, 10000), size=200, replace=False
).reshape(-1, 1)
y = np.sin(X) + np.random.normal(scale=0.3, size=X.shape)

# Draw the initial training points without replacement.
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_initial = X[initial_idx]
y_initial = y[initial_idx]

# Gaussian-process kernel: smooth RBF component plus a white-noise term.
kernel = (
    RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))
    + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))
)

# Active learner wrapping the Gaussian-process regressor.
regressor = ActiveLearner(
    predictor=GaussianProcessRegressor(kernel=kernel),
    query_strategy=GP_regression_std,
    X_initial=X_initial.reshape(-1, 1),
    y_initial=y_initial.reshape(-1, 1),
)

# Active learning: in every round teach the sample the strategy selects.
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(
        X[query_idx].reshape(1, -1),
        y[query_idx].reshape(1, -1),
    )