def main_loop(alibox, strategy, round):
    """Run one fold of the active-learning experiment with ``strategy``.

    Five warm-up queries are issued with ``QueryRandom`` first, to balance
    the comparison against QueryMeta, which needs the selections of the
    first five rounds. The accuracy reached after the warm-up is stored as
    the initial point; afterwards ``strategy`` is queried one instance at a
    time until the stopping criterion fires.

    The function reads the module-level ``X``, ``y``, ``model`` and
    ``stopping_criterion`` objects, and resets ``stopping_criterion`` before
    returning.

    Parameters
    ----------
    alibox: object
        Toolbox object; ``get_split``, ``get_stateio``,
        ``calc_performance_metric`` and ``State`` are used.
    strategy: object
        Query strategy; ``select`` is called with the labeled and unlabeled
        index collections, ``model=model`` and ``batch_size=1``.
    round: int
        Index of the fold to run.

    Returns
    -------
    saver: object
        The intermediate-results saver of this fold, holding the initial
        point and one state per query.
    """
    # Data split and intermediate-results saver of this fold.
    _, test_idx, labeled, unlabeled = alibox.get_split(round)
    saver = alibox.get_stateio(round)

    def refit():
        # Train the shared model on everything labeled so far.
        rows = labeled.index
        model.fit(X=X[rows, :], y=y[rows])

    # Warm-up: five random queries, refitting after each one.
    warmup = QueryRandom(X, y)
    refit()
    for _ in range(5):
        picked = warmup.select(labeled, unlabeled)
        labeled.update(picked)
        unlabeled.difference_update(picked)
        refit()

    # Accuracy after the warm-up is the starting point of the curve.
    predictions = model.predict(X[test_idx, :])
    saver.set_initial_point(sum(predictions == y[test_idx]) / len(test_idx))

    while not stopping_criterion.is_stop():
        # Ask the strategy under test for one instance of the unlabeled pool.
        queried = strategy.select(labeled, unlabeled, model=model, batch_size=1)
        labeled.update(queried)
        unlabeled.difference_update(queried)

        # Update the model and measure its performance on the test split.
        refit()
        predictions = model.predict(X[test_idx, :])
        score = alibox.calc_performance_metric(
            y_true=y[test_idx],
            y_pred=predictions,
            performance_metric='accuracy_score')

        # Record the query, then report the progress to the stopping criterion.
        saver.add_state(alibox.State(select_index=queried, performance=score))
        stopping_criterion.update_information(saver)

    # Reset the progress so the criterion can be reused by the next run.
    stopping_criterion.reset()
    return saver
y_true=y[test_idx], y_pred=pred, performance_metric='accuracy_score') # Save intermediate results to file st = alibox.State(select_index=select_ind, performance=accuracy) saver.add_state(st) saver.save() # Passing the current progress to stopping criterion object stopping_criterion.update_information(saver) # Reset the progress in stopping criterion object stopping_criterion.reset() meta_result.append(copy.deepcopy(saver)) random = QueryRandom(X, y) random_result = [] for round in range(5): # Get the data split of one fold experiment train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round) # Get intermediate results saver for one fold experiment saver = alibox.get_stateio(round) # calc the initial point model.fit(X=X[label_ind.index, :], y=y[label_ind.index]) pred = model.predict(X[test_idx, :]) accuracy = sum(pred == y[test_idx]) / len(test_idx) saver.set_initial_point(accuracy) while not stopping_criterion.is_stop(): # Select a subset of Uind according to the query strategy
# The cost budget is 50 times querying stopping_criterion = alibox.get_stopping_criterion('num_of_queries', query_num) # generate the first five rounds data(label_index unlabel_index model_output) label_index_round = [] unlabel_index_round = [] model_output_round = [] for round in range(splitcount): label_inds_5 = [] unlabel_inds_5 = [] model_output_5 = [] train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round) temp_rand = QueryRandom(X, y) model.fit(X=X[label_ind.index, :], y=y[label_ind.index]) for i in range(5): rand_select_ind = temp_rand.select(label_ind, unlab_ind) label_ind.update(rand_select_ind) unlab_ind.difference_update(rand_select_ind) label_inds_5.append(copy.deepcopy(label_ind)) unlabel_inds_5.append(copy.deepcopy(unlab_ind)) model.fit(X=X[label_ind.index, :], y=y[label_ind.index]) if hasattr(model, 'predict_proba'): output = (model.predict_proba(X)[:, 1] - 0.5) * 2 else: output = model.predict(X) model_output_5.append(output) label_index_round.append(label_inds_5)
saver.add_state(st) # Passing the current progress to stopping criterion object stopping_criterion.update_information(saver) # Reset the progress in stopping criterion object stopping_criterion.reset() return saver random_result = [] unc_result = [] qbc_result = [] for round in range(splitcount): train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round) # Use pre-defined strategy random = QueryRandom(X, y) unc = QueryInstanceUncertainty(X, y) qbc = QueryInstanceQBC(X, y) random_result.append(copy.deepcopy(main_loop(alibox, random, round))) unc_result.append(copy.deepcopy(main_loop(alibox, unc, round))) qbc_result.append(copy.deepcopy(main_loop(alibox, qbc, round))) analyser = alibox.get_experiment_analyser(x_axis='num_of_queries') analyser.add_method(method_name='QBC', method_results=qbc_result) analyser.add_method(method_name='Unc', method_results=unc_result) analyser.add_method(method_name='random', method_results=random_result) plt = analyser.plot_learning_curves(title=testdataset, std_area=False, saving_path=savefloder_path +
def select(self, label_index, unlabel_index, model=None, xb_way='uncertainty'):
    """Select one index from the unlabeled pool for querying.

    On the first call (``self.flag is False``) the five-round history is
    built with ``get_5_rouds``. Every call then continues from the newest
    stored round (``self.label_inds_5[4]`` / ``self.unlabel_inds_5[4]``):
    a reference instance x^ is chosen according to ``xb_way``, its
    meta-data row is paired with the meta-data row of every candidate, and
    the candidate with the highest ``self.cb_classifier.predict_proba``
    value in column 1 is selected. The oldest stored round is dropped and
    the new round is appended, so the history always holds five rounds.

    Parameters
    ----------
    label_index: {list, np.ndarray, IndexCollection}
        The indexes of labeled samples. Only used to build the history on
        the first call.

    unlabel_index: {list, np.ndarray, IndexCollection}
        The indexes of unlabeled samples. Only used to build the history on
        the first call.

    model: object, optional (default=None)
        Current classification model, should have the 'predict_proba' method
        for probabilistic output. If not provided, LogisticRegression with
        default parameters implemented by sklearn will be used.
        The model is refit in place on the updated labeled set.

    xb_way: {'uncertainty', 'random'}, optional (default='uncertainty')
        How the reference instance x^ is chosen: with
        QueryInstanceUncertainty or with QueryRandom.

    Returns
    -------
    select_ind: object
        The selected index, taken from the unlabeled collection.

    label_ind: object
        Deep copy of the labeled collection after adding ``select_ind``.

    unlabel_ind: object
        Deep copy of the unlabeled collection after removing ``select_ind``.

    Raises
    ------
    ValueError
        If ``xb_way`` is neither 'uncertainty' nor 'random'.
    """
    if model is None:
        model = LogisticRegression()
    if self.flag is False:
        self.get_5_rouds(label_index, unlabel_index, model)

    # Work on copies of the newest round; the stored history is only
    # changed at the end, once the new round is complete.
    label_ind = copy.deepcopy(self.label_inds_5[4])
    unlabel_ind = copy.deepcopy(self.unlabel_inds_5[4])

    # Select x^ for combining the [x*, x^] data.
    # Strings are compared with ==; an identity test against a literal only
    # works by accident of interning.
    if xb_way == 'uncertainty':
        un = QueryInstanceUncertainty(self.X, self.y)
        # NOTE(review): this branch keeps the whole return value of select
        # while the random branch takes element [0] - confirm both produce a
        # value that compares correctly against unlabel_ind below.
        selectedind = un.select(label_ind, unlabel_ind, model)
    elif xb_way == 'random':
        rand = QueryRandom(self.X, self.y)
        selectedind = rand.select(label_ind, unlabel_ind)[0]
    else:
        raise ValueError(
            'calculating the xb at least one of [uncertrainty, random]')

    metadata = self.cal_mate_data_Z(
        self.label_inds_5, self.unlabel_inds_5, self.modelOutput_5, model)

    # Meta-data row of x^, repeated once per candidate so it can be placed
    # next to every candidate row.
    metadata_unind = np.where(unlabel_ind == selectedind)[0][0]
    cd_second = np.tile(metadata[metadata_unind], [len(metadata), 1])
    combination_data = np.c_[metadata, cd_second]

    # The candidate with the highest column-1 score wins.
    predict_proba = self.cb_classifier.predict_proba(combination_data)
    select = np.argmax(predict_proba[:, 1])
    select_ind = unlabel_ind[select]

    label_ind.update(select_ind)
    unlabel_ind.difference_update(select_ind)
    # Refit on the UPDATED labeled set, as get_5_rouds does for every stored
    # round, so the stored model output matches the stored indexes. (The
    # caller-supplied label_index does not contain select_ind.)
    model.fit(X=self.X[label_ind.index, :], y=self.y[label_ind.index])

    # Slide the five-round window: drop the oldest round, append the new one.
    del self.label_inds_5[0]
    del self.unlabel_inds_5[0]
    del self.modelOutput_5[0]
    self.label_inds_5.append(label_ind)
    self.unlabel_inds_5.append(unlabel_ind)
    if hasattr(model, 'predict_proba'):
        # Column 1 of predict_proba, shifted by 0.5 and doubled.
        output = (model.predict_proba(self.X)[:, 1] - 0.5) * 2
    else:
        output = model.predict(self.X)
    self.modelOutput_5.append(output)

    return (select_ind,
            copy.deepcopy(self.label_inds_5[4]),
            copy.deepcopy(self.unlabel_inds_5[4]))
def get_5_rouds(self, label_ind, unlabel_ind, Model, querystategy='random'):
    """Build the history of the first five active-learning rounds.

    Five queries are simulated on deep copies of the inputs. After every
    query the labeled collection, the unlabeled collection and the model
    output on ``self.X`` are appended to ``self.label_inds_5``,
    ``self.unlabel_inds_5`` and ``self.modelOutput_5``. Finally
    ``self.flag`` is set to True.

    Parameters
    ----------
    label_ind: IndexCollection
        The indexes of labeled samples. Not modified.

    unlabel_ind: IndexCollection
        The indexes of unlabeled samples. Not modified.

    Model: object
        Current classification model. A deep copy is fitted, the passed
        object is left untouched. If it has a 'predict_proba' method, the
        stored output is column 1 of ``predict_proba`` shifted by 0.5 and
        doubled; otherwise the output of ``predict`` is stored.

    querystategy: {'random', 'uncertainty', None}, optional (default='random')
        The query strategy used in the first five rounds. With None no
        query is made and zero-filled placeholders are stored instead.

    Raises
    ------
    AssertionError
        If ``label_ind`` or ``unlabel_ind`` is not an IndexCollection.

    ValueError
        If ``querystategy`` is not one of the supported values.
    """
    assert (isinstance(label_ind, IndexCollection))
    assert (isinstance(unlabel_ind, IndexCollection))

    # Simulate on copies so neither the caller's collections nor the
    # caller's model are changed.
    label_index = copy.deepcopy(label_ind)
    unlabel_index = copy.deepcopy(unlabel_ind)
    model = copy.deepcopy(Model)

    def model_output():
        # Same representation for every strategy, so all stored rounds are
        # comparable with each other.
        if hasattr(model, 'predict_proba'):
            return (model.predict_proba(self.X)[:, 1] - 0.5) * 2
        return model.predict(self.X)

    if querystategy is None:
        # No querying at all: store zero placeholders of matching length.
        num_label = len(label_index.index)
        num_unlabel = len(unlabel_index.index)
        n_samples = np.shape(self.X)[0]
        for _ in range(5):
            self.label_inds_5.append(np.zeros(num_label))
            self.unlabel_inds_5.append(np.zeros(num_unlabel))
            self.modelOutput_5.append(np.zeros(n_samples))
        self.flag = True
        return

    if querystategy == 'uncertainty':
        uncertainty = QueryInstanceUncertainty(self.X, self.y)

        def pick():
            return uncertainty.select(label_index, unlabel_index, model=model)
    elif querystategy == 'random':
        random_query = QueryRandom(self.X, self.y)

        def pick():
            return random_query.select(label_index, unlabel_index)
    else:
        # Fail here instead of leaving an empty history marked as built.
        raise ValueError(
            "querystategy must be 'uncertainty', 'random' or None, "
            "got {!r}".format(querystategy))

    for _ in range(5):
        select_ind = pick()
        label_index.update(select_ind)
        unlabel_index.difference_update(select_ind)
        # Deep copies: the working collections keep changing in later rounds.
        self.label_inds_5.append(copy.deepcopy(label_index))
        self.unlabel_inds_5.append(copy.deepcopy(unlabel_index))
        model.fit(X=self.X[label_index.index, :],
                  y=self.y[label_index.index])
        self.modelOutput_5.append(model_output())

    self.flag = True
def set_query_strategy(self, strategy="QueryInstanceUncertainty", **kwargs):
    """Set the query strategy of the experiment.

    Parameters
    ----------
    strategy: {str, callable}, optional (default='QueryInstanceUncertainty')
        The query strategy function.
        Giving str to use a pre-defined strategy.
        Giving callable to use a user-defined strategy; it is called as
        ``strategy(X, y, **kwargs)``.

    kwargs: dict, optional
        The args used in strategy.
        For a pre-defined strategy, the options this method knows are
        popped and passed positionally with their defaults; for
        QueryInstanceBMDR, QueryInstanceSPAL and QueryInstanceLAL the
        remaining items are forwarded to the strategy constructor.
        For a user-defined strategy, the optional 'strategyname' item is
        used as the display name and every other item is forwarded to the
        callable.
        For QueryInstanceBMDR and QueryInstanceSPAL, 'qp_solver'
        (default 'ECOS') is stored on ``self.qp_solver`` and is not
        forwarded; the historical spelling 'qp_sover' is still accepted.
        For QueryInstanceGraphDensity and QueryInstanceQUIRE no strategy
        object is created here: the 'metric' option (default 'manhattan')
        and the remaining kwargs are stored for later use.

    Raises
    ------
    Exception
        If ``self._existed_query_strategy`` is already truthy.

    NotImplementedError
        If ``strategy`` is a str that is not a pre-defined strategy name.

    ValueError
        If QueryInstanceGraphDensity or QueryInstanceQUIRE is requested
        while ``self._train_idx`` is None.
    """
    # check
    if self._existed_query_strategy:
        raise Exception(
            "You already has set the query strategy,don`t has to set it again."
        )

    # user-defined strategy
    if callable(strategy):
        self.__custom_strategy_flag = True
        strategyname = kwargs.pop('strategyname', None)
        if strategyname is not None:
            self._query_function_name = strategyname
        else:
            self._query_function_name = 'user-defined strategy'
        self.__custom_func_arg = kwargs
        self._query_function = strategy(self._X, self._y, **kwargs)
        return

    # a pre-defined strategy in ALiPy
    predefined = (
        'QueryInstanceQBC', 'QueryInstanceUncertainty', 'QueryRandom',
        'QureyExpectedErrorReduction', 'QueryInstanceGraphDensity',
        'QueryInstanceQUIRE', 'QueryInstanceBMDR', 'QueryInstanceSPAL',
        'QueryInstanceLAL',
    )
    if strategy not in predefined:
        raise NotImplementedError(
            'Strategy {} is not implemented. Specify a valid '
            'method name or privide a callable object.'.format(
                str(strategy)))

    def pop_qp_solver():
        # Must run BEFORE the remaining kwargs are forwarded to the strategy
        # constructor, otherwise the option leaks into the constructor and
        # is never seen here. 'qp_sover' is the old misspelled key.
        legacy = kwargs.pop('qp_sover', 'ECOS')
        return kwargs.pop('qp_solver', legacy)

    self._query_function_name = strategy
    if strategy == 'QueryInstanceQBC':
        method = kwargs.pop('method', 'query_by_bagging')
        disagreement = kwargs.pop('disagreement', 'vote_entropy')
        self._query_function = QueryInstanceQBC(
            self._X, self._y, method, disagreement)
    elif strategy == 'QueryInstanceUncertainty':
        measure = kwargs.pop('measure', 'entropy')
        self._query_function = QueryInstanceUncertainty(
            self._X, self._y, measure)
    elif strategy == 'QueryRandom':
        self._query_function = QueryRandom(self._X, self._y)
    elif strategy == 'QureyExpectedErrorReduction':
        self._query_function = QureyExpectedErrorReduction(
            self._X, self._y)
    elif strategy in ('QueryInstanceGraphDensity', 'QueryInstanceQUIRE'):
        if self._train_idx is None:
            raise ValueError(
                'train_idx is None.Please split data firstly.You can call set_data_split or split_AL to split data.'
            )
        # Only the configuration is recorded; no strategy object is built
        # in this method for these two strategies.
        self._query_function_need_train_ind = True
        self._query_function_metric = kwargs.pop('metric', 'manhattan')
        self._query_function_kwargs = kwargs
    elif strategy == 'QueryInstanceBMDR':
        beta = kwargs.pop('beta', 1000)
        gamma = kwargs.pop('gamma', 0.1)
        rho = kwargs.pop('rho', 1)
        self.qp_solver = pop_qp_solver()
        self._query_function = QueryInstanceBMDR(
            self._X, self._y, beta, gamma, rho, **kwargs)
    elif strategy == 'QueryInstanceSPAL':
        mu = kwargs.pop('mu', 0.1)
        gamma = kwargs.pop('gamma', 0.1)
        rho = kwargs.pop('rho', 1)
        lambda_init = kwargs.pop('lambda_init', 0.1)
        lambda_pace = kwargs.pop('lambda_pace', 0.01)
        self.qp_solver = pop_qp_solver()
        self._query_function = QueryInstanceSPAL(
            self._X, self._y, mu, gamma, rho, lambda_init, lambda_pace,
            **kwargs)
    elif strategy == 'QueryInstanceLAL':
        mode = kwargs.pop('mode', 'LAL_iterative')
        data_path = kwargs.pop('data_path', '.')
        cls_est = kwargs.pop('cls_est', 50)
        train_slt = kwargs.pop('train_slt', True)
        self._query_function = QueryInstanceLAL(
            self._X, self._y, mode, data_path, cls_est, train_slt, **kwargs)