from alipy.query_strategy import QueryRandom


def main_loop(alibox, strategy, round):
    # Get the data split of one fold experiment
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get the intermediate-results saver for one fold experiment
    saver = alibox.get_stateio(round)

    # QueryMeta needs the history of the first five rounds, so every strategy
    # is warmed up with five random queries to keep the comparison fair.
    temp_rand = QueryRandom(X, y)
    model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
    for i in range(5):
        rand_select_ind = temp_rand.select(label_ind, unlab_ind)
        label_ind.update(rand_select_ind)
        unlab_ind.difference_update(rand_select_ind)
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])

    # Calculate the initial point after the warm-up
    pred = model.predict(X[test_idx, :])
    accuracy = sum(pred == y[test_idx]) / len(test_idx)
    saver.set_initial_point(accuracy)

    while not stopping_criterion.is_stop():
        # Select a subset of unlab_ind according to the query strategy;
        # model=None would fall back to the strategy's default model.
        select_ind = strategy.select(label_ind, unlab_ind, model=model, batch_size=1)
        label_ind.update(select_ind)
        unlab_ind.difference_update(select_ind)

        # Update the model and calculate performance with the model you are using
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        pred = model.predict(X[test_idx, :])
        accuracy = alibox.calc_performance_metric(y_true=y[test_idx],
                                                  y_pred=pred,
                                                  performance_metric='accuracy_score')

        # Save intermediate results to file
        st = alibox.State(select_index=select_ind, performance=accuracy)
        saver.add_state(st)

        # Pass the current progress to the stopping-criterion object
        stopping_criterion.update_information(saver)
    # Reset the progress in the stopping-criterion object for the next fold
    stopping_criterion.reset()
    return saver
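`main_loop` relies on module-level `X`, `y`, `model`, and `stopping_criterion`. A minimal setup sketch using ALiPy's `ToolBox`; the dataset, split parameters, and query budget here are illustrative assumptions, not values from the original experiments:

import copy
import numpy as np
from sklearn.datasets import load_breast_cancer
from alipy import ToolBox
from alipy.query_strategy import QueryInstanceUncertainty

X, y = load_breast_cancer(return_X_y=True)   # any binary dataset works
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.05, split_count=5)

model = alibox.get_default_model()           # LogisticRegression by default
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50)

# Run each fold with a standard strategy; the meta strategy is passed the same way.
uncertainty_result = []
for round in range(5):
    strategy = QueryInstanceUncertainty(X, y)
    saver = main_loop(alibox, strategy, round)
    uncertainty_result.append(copy.deepcopy(saver))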
random = QueryRandom(X, y)
random_result = []
for round in range(5):
    # Get the data split of one fold experiment
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get the intermediate-results saver for one fold experiment
    saver = alibox.get_stateio(round)

    # Calculate the initial point
    model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
    pred = model.predict(X[test_idx, :])
    accuracy = sum(pred == y[test_idx]) / len(test_idx)
    saver.set_initial_point(accuracy)

    while not stopping_criterion.is_stop():
        # Select a subset of unlab_ind at random as the baseline strategy
        select_ind = random.select(label_ind, unlab_ind)
        label_ind.update(select_ind)
        unlab_ind.difference_update(select_ind)

        # Update the model and calculate performance with the model you are using
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        pred = model.predict(X[test_idx, :])
        accuracy = alibox.calc_performance_metric(y_true=y[test_idx],
                                                  y_pred=pred,
                                                  performance_metric='accuracy_score')

        # Save intermediate results to file
        st = alibox.State(select_index=select_ind, performance=accuracy)
        saver.add_state(st)
        saver.save()

        # Pass the current progress to the stopping-criterion object;
        # without this the loop would never terminate.
        stopping_criterion.update_information(saver)
    stopping_criterion.reset()
    random_result.append(copy.deepcopy(saver))
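With the per-fold `StateIO` savers collected into lists (`random_result` and `uncertainty_result` as above), ALiPy's experiment analyser can aggregate them into learning curves. The method names and plot title below are illustrative:

analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')
analyser.add_method(method_name='random', method_results=random_result)
analyser.add_method(method_name='uncertainty', method_results=uncertainty_result)
print(analyser)
analyser.plot_learning_curves(title='Example learning curves', std_area=True)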
# Imports required by the file that defines this method
import copy

import numpy as np
from sklearn.linear_model import LogisticRegression
from alipy.index import IndexCollection
from alipy.query_strategy import QueryInstanceUncertainty, QueryRandom


def select(self, label_index, unlabel_index, model=None, xb_way='uncertainty'):
    """Select indexes from the unlabel_index for querying.

    Parameters
    ----------
    label_index: {list, np.ndarray, IndexCollection}
        The indexes of labeled samples.

    unlabel_index: {list, np.ndarray, IndexCollection}
        The indexes of unlabeled samples.

    model: object, optional (default=None)
        Current classification model, should have the 'predict_proba'
        method for probabilistic output. If not provided,
        LogisticRegression with default parameters implemented by
        sklearn will be used.

    xb_way: str, optional (default='uncertainty')
        How to pick the reference instance x^ that is paired with every
        candidate; one of ['uncertainty', 'random'].

    Returns
    -------
    select_ind: int
        The selected index.

    label_ind: IndexCollection
        A copy of the labeled indexes after this round.

    unlabel_ind: IndexCollection
        A copy of the unlabeled indexes after this round.
    """
    if model is None:
        model = LogisticRegression()
    if not self.flag:
        self.get_5_rouds(label_index, unlabel_index, model)

    label_ind = copy.deepcopy(self.label_inds_5[4])
    unlabel_ind = copy.deepcopy(self.unlabel_inds_5[4])

    # Select x^ for building the combined meta-example [x*, x^]
    if xb_way == 'uncertainty':
        un = QueryInstanceUncertainty(self.X, self.y)
        selectedind = un.select(label_ind, unlabel_ind, model)
    elif xb_way == 'random':
        rand = QueryRandom(self.X, self.y)
        selectedind = rand.select(label_ind, unlabel_ind)[0]
    else:
        raise ValueError("xb_way must be one of ['uncertainty', 'random']")

    # Meta-features Z of every unlabeled instance
    metadata = self.cal_mate_data_Z(self.label_inds_5, self.unlabel_inds_5,
                                    self.modelOutput_5, model)

    # Pair each candidate x* with the reference x^: [Z(x*), Z(x^)]
    metadata_unind = np.where(unlabel_ind == selectedind)[0][0]
    cd_second = metadata[metadata_unind]
    num_unlabeled = len(metadata)
    cd_second = np.tile(cd_second, [num_unlabeled, 1])
    combination_data = np.c_[metadata, cd_second]

    # The meta-classifier scores each pair; query the candidate with the
    # highest probability of improving performance.
    predict_proba = self.cb_classifier.predict_proba(combination_data)
    select = np.argmax(predict_proba[:, 1])
    select_ind = unlabel_ind[select]

    label_ind.update(select_ind)
    unlabel_ind.difference_update(select_ind)
    # Refit on the labeled set that includes the new query
    model.fit(X=self.X[label_ind.index, :], y=self.y[label_ind.index])

    # Slide the five-round window: drop the oldest round, append the newest
    del self.label_inds_5[0]
    del self.unlabel_inds_5[0]
    del self.modelOutput_5[0]
    self.label_inds_5.append(label_ind)
    self.unlabel_inds_5.append(unlabel_ind)
    if hasattr(model, 'predict_proba'):
        output = (model.predict_proba(self.X)[:, 1] - 0.5) * 2
    else:
        output = model.predict(self.X)
    self.modelOutput_5.append(output)

    return select_ind, copy.deepcopy(self.label_inds_5[4]), \
        copy.deepcopy(self.unlabel_inds_5[4])
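The pairing step is the core of `select`: every candidate's meta-feature row is concatenated with the reference instance's row, and the binary meta-classifier scores each pair. A self-contained sketch of just this mechanism; the shapes, the random stand-in for `cal_mate_data_Z`'s output, and the `RandomForestClassifier` substitute for the real `cb_classifier` are all illustrative assumptions:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
n_unlabeled, n_meta = 20, 8
metadata = rng.normal(size=(n_unlabeled, n_meta))   # stand-in meta-features Z

# Stand-in meta-classifier trained on random pair labels
cb_classifier = RandomForestClassifier(random_state=0)
pairs = rng.normal(size=(100, 2 * n_meta))
labels = rng.integers(0, 2, size=100)
cb_classifier.fit(pairs, labels)

# Pair every candidate with a fixed reference row (index 3 here)
reference = np.tile(metadata[3], [n_unlabeled, 1])
combination_data = np.c_[metadata, reference]       # shape: (n_unlabeled, 2 * n_meta)
best = np.argmax(cb_classifier.predict_proba(combination_data)[:, 1])
print('candidate with highest predicted gain:', best)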
# Generate the data of the first five rounds
# (label_index, unlabel_index, model_output) for every split
label_index_round = []
unlabel_index_round = []
model_output_round = []
for round in range(splitcount):
    label_inds_5 = []
    unlabel_inds_5 = []
    model_output_5 = []
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    temp_rand = QueryRandom(X, y)
    model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
    for i in range(5):
        rand_select_ind = temp_rand.select(label_ind, unlab_ind)
        label_ind.update(rand_select_ind)
        unlab_ind.difference_update(rand_select_ind)
        label_inds_5.append(copy.deepcopy(label_ind))
        unlabel_inds_5.append(copy.deepcopy(unlab_ind))
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        # Map probabilistic output to [-1, 1]; fall back to hard predictions
        if hasattr(model, 'predict_proba'):
            output = (model.predict_proba(X)[:, 1] - 0.5) * 2
        else:
            output = model.predict(X)
        model_output_5.append(output)
    label_index_round.append(label_inds_5)
    unlabel_index_round.append(unlabel_inds_5)
    model_output_round.append(model_output_5)
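Because the warm-up queries are random, precomputing them once per fold lets every compared strategy start from an identical five-round state. A sketch of restoring that state inside `main_loop` in place of the five random queries, assuming the `*_round` lists above are in scope:

import copy

# Inside main_loop, replacing the five random warm-up queries:
label_ind = copy.deepcopy(label_index_round[round][4])    # indexes after the 5th query
unlab_ind = copy.deepcopy(unlabel_index_round[round][4])
model.fit(X=X[label_ind.index, :], y=y[label_ind.index])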
def get_5_rouds(self, label_ind, unlabel_ind, Model, querystategy='random'):
    """Run the first five rounds of active learning and record their states.

    Parameters
    ----------
    label_ind: {list, np.ndarray, IndexCollection}
        The indexes of labeled samples.

    unlabel_ind: {list, np.ndarray, IndexCollection}
        The indexes of unlabeled samples.

    Model: object
        Current classification model; should have the 'predict_proba'
        method for probabilistic output.

    querystategy: str, optional (default='random')
        The query strategy used in the first five rounds of active
        learning. Currently only 'uncertainty' and 'random' are supported.
    """
    assert isinstance(label_ind, IndexCollection)
    assert isinstance(unlabel_ind, IndexCollection)
    label_index = copy.deepcopy(label_ind)
    unlabel_index = copy.deepcopy(unlabel_ind)
    model = copy.deepcopy(Model)

    if querystategy == 'uncertainty':
        un = QueryInstanceUncertainty(self.X, self.y)
        for _ in range(5):
            select_ind = un.select(label_index, unlabel_index, model=model)
            label_index.update(select_ind)
            unlabel_index.difference_update(select_ind)
            self.label_inds_5.append(copy.deepcopy(label_index))
            self.unlabel_inds_5.append(copy.deepcopy(unlabel_index))
            model.fit(X=self.X[label_index.index, :],
                      y=self.y[label_index.index])
            # Record the model output in [-1, 1] (hard labels as a fallback)
            if hasattr(model, 'predict_proba'):
                output = (model.predict_proba(self.X)[:, 1] - 0.5) * 2
            else:
                output = model.predict(self.X)
            self.modelOutput_5.append(output)
    elif querystategy == 'random':
        random = QueryRandom(self.X, self.y)
        for _ in range(5):
            select_ind = random.select(label_index, unlabel_index)
            label_index.update(select_ind)
            unlabel_index.difference_update(select_ind)
            self.label_inds_5.append(copy.deepcopy(label_index))
            self.unlabel_inds_5.append(copy.deepcopy(unlabel_index))
            model.fit(X=self.X[label_index.index, :],
                      y=self.y[label_index.index])
            if hasattr(model, 'predict_proba'):
                output = (model.predict_proba(self.X)[:, 1] - 0.5) * 2
            else:
                output = model.predict(self.X)
            self.modelOutput_5.append(output)
    elif querystategy is None:
        # Placeholder states when no warm-up strategy is used
        for _ in range(5):
            num_label = len(label_index.index)
            num_unlabel = len(unlabel_index.index)
            n_samples = np.shape(self.X)[0]
            self.label_inds_5.append(np.zeros(num_label))
            self.unlabel_inds_5.append(np.zeros(num_unlabel))
            self.modelOutput_5.append(np.zeros(n_samples))
    self.flag = True
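For reference, a sketch of how this warm-up is expected to be driven. It assumes the enclosing strategy class is the `QueryMeta` mentioned earlier and that its constructor takes `X` and `y`; the actual constructor is not shown in this section:

strategy = QueryMeta(X, y)   # constructor signature is an assumption
train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(0)
model.fit(X=X[label_ind.index, :], y=y[label_ind.index])

# The first call records five rounds of index sets and model outputs,
# then sets self.flag so that select() will not re-run the warm-up.
strategy.get_5_rouds(label_ind, unlab_ind, model, querystategy='random')
assert strategy.flag is True
assert len(strategy.label_inds_5) == 5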