# Passing model=None to use the default model for evaluating the committees' disagreement select_ind = QBCStrategy.select(label_ind, unlab_ind, model=None, batch_size=1) label_ind.update(select_ind) unlab_ind.difference_update(select_ind) # Update model and calc performance according to the model you are using model.fit(X=X[label_ind.index, :], y=y[label_ind.index]) pred = model.predict(X[test_idx, :]) accuracy = alibox.calc_performance_metric( y_true=y[test_idx], y_pred=pred, performance_metric='accuracy_score') # Save intermediate results to file st = alibox.State(select_index=select_ind, performance=accuracy) saver.add_state(st) saver.save() # Passing the current progress to stopping criterion object stopping_criterion.update_information(saver) # Reset the progress in stopping criterion object stopping_criterion.reset() QBC_result.append(copy.deepcopy(saver)) analyser = alibox.get_experiment_analyser(x_axis='num_of_queries') analyser.add_method(method_name='QBC', method_results=QBC_result) print(analyser) analyser.plot_learning_curves(title='Example of AL', std_area=True)
def create_and_implement_strategy(strategy_name, data, labels, queries):
    """Run one active-learning experiment with an ALiPy query strategy.

    The ToolBox default model is fitted on the initially labeled examples of a
    pre-computed split (read from the ``dataset_al`` pickle file), then
    refitted after every single-example query until the ``'num_of_queries'``
    stopping criterion reports a stop.

    Args:
        strategy_name: Name forwarded to ``ToolBox.get_query_strategy``.
        data: Feature table; only ``data.values`` is used.
        labels: Targets, converted with ``np.asarray``.
        queries: Value given to the ``'num_of_queries'`` stopping criterion.

    Returns:
        A one-element list holding a deep copy of the ``StateIO`` saver that
        recorded the initial accuracy and one state per query.
    """
    # Keep only the raw values; the global split below is expressed as indexes.
    X = data.values
    y = np.asarray(labels)

    toolbox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')
    model = toolbox.get_default_model()
    uncertainty_strategy = toolbox.get_query_strategy(strategy_name=strategy_name)
    examples = []
    # The experiment stops after `queries` queries.
    stopping_criterion = toolbox.get_stopping_criterion('num_of_queries', queries)

    # NOTE(security): pickle.load can execute arbitrary code; "dataset_al"
    # must be a trusted, locally produced file.
    with open("dataset_al", "rb") as f:
        train_idx, test_idx, labeled_idx, unlabeled_idx = pickle.load(f)

    saver = StateIO(round=0, train_idx=train_idx, test_idx=test_idx,
                    init_L=labeled_idx, init_U=unlabeled_idx, saving_path='.')

    def fit_and_score():
        """Refit on the currently labeled rows and return the test accuracy."""
        model.fit(X=X[labeled_idx.index, :], y=y[labeled_idx.index])
        y_pred = model.predict(X[test_idx, :])
        return toolbox.calc_performance_metric(
            y_true=y[test_idx], y_pred=y_pred,
            performance_metric='accuracy_score')

    # Baseline: performance with only the initially labeled examples.
    saver.set_initial_point(fit_and_score())

    while not stopping_criterion.is_stop():
        # Query one example and move it from the unlabeled to the labeled pool.
        example = uncertainty_strategy.select(labeled_idx, unlabeled_idx,
                                              model=model, batch_size=1)
        labeled_idx.update(example)
        unlabeled_idx.difference_update(example)

        accuracy = fit_and_score()

        # Persist this step so the stopping criterion can read the progress.
        state = toolbox.State(select_index=example, performance=accuracy)
        saver.add_state(state)
        saver.save()
        stopping_criterion.update_information(saver)

    stopping_criterion.reset()
    examples.append(copy.deepcopy(saver))

    # The final labeled set can be exported with X[labeled_idx.index] and
    # y[labeled_idx.index] if the selected dataset itself is needed.
    return examples
query_y[select_ins, select_y1] = 0.5 query_y[select_ins, select_y2] = 1 # record results label_ind.update([(select_ins, select_y1), (select_ins, select_y2)]) unlab_ind.difference_update([(select_ins, select_y1), (select_ins, select_y2)]) if iter % 5 == 0: # train/test X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=query_y) model.fit(X=X_tr, y=y_tr) pres, pred = model.predict(X[test_idx]) perf = alibox.calc_performance_metric( y_true=mult_y[test_idx], y_pred=pred, performance_metric='hamming_loss') # save st = alibox.State(select_index=[(select_ins, select_y1), (select_ins, select_y2)], performance=perf) saver.add_state(st) AURO_results.append(copy.copy(saver)) analyser = alibox.get_experiment_analyser() analyser.add_method(method_name='AURO', method_results=AURO_results) analyser.plot_learning_curves()
from sklearn.datasets import load_iris
from alipy import ToolBox

# Wrap the iris data in a ToolBox that stores its files in the current directory.
X, y = load_iris(return_X_y=True)
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# ---- Tools handed out by the ToolBox ----
# Data split, plus the same split fetched again for fold 0.
tr, te, lab, unlab = alibox.split_AL()
tr0, te0, lab0, unlab0 = alibox.get_split(round=0)

# Oracle, result saver and repository.
oracle = alibox.get_clean_oracle()
saver = alibox.get_stateio(round=0)
repo = alibox.get_repository(round=0)

# Query strategy, metric helper, model, stopping criterion, analyser, threading.
rand_strategy = alibox.get_query_strategy(strategy_name="QueryInstanceRandom")
perf = alibox.calc_performance_metric(
    y_true=[1],
    y_pred=[1],
    performance_metric='accuracy_score',
)
model = alibox.get_default_model()
sc = alibox.get_stopping_criterion(stopping_criteria='num_of_queries', value=50)
analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')
acethread = alibox.get_ace_threading()

# ---- Data structures defined in alipy ----
ind = alibox.IndexCollection([1, 2, 3])
m_ind = alibox.MultiLabelIndexCollection([(1, 0), (2, )])
st = alibox.State(select_index=[1], performance=perf)

# ---- I/O ----
# al_settings.pkl is the default file name. To use another one, pass a
# specific file name to the 'saving_path' parameter when creating the ToolBox
# (e.g., saving_path='./my_file.pkl').
alibox.save()
alibox = ToolBox.load(path='./al_settings.pkl')
class TorchFold:
    """One-fold active/passive learning experiment for a binary classifier.

    A ``NN.NeuralNetwork`` wrapper around ``model`` is initialised on the
    initially labeled pool, then retrained after every query batch. After each
    evaluation on the fixed test set one value is appended to each metric list
    (``acc``, ``gmeans``, ``recall``, ``precision``, ``specificity``, ``auc``,
    ``f1``, ``pos``, ``neg``, ``ratio``, ``mcc``); ``loss`` gets one value per
    retraining. The labels are treated as 0/1.
    """

    def __init__(self, dataset, labels, testset, testlab, model, phase, path,
                 stopping):
        """Create the ALiPy tool box, the data split and the query strategies.

        Args:
            dataset: Pool features, indexable by a list of row indexes.
            labels: Pool labels, indexable by a list of row indexes.
            testset: Test features used for every evaluation.
            testlab: Test labels used for every evaluation.
            model: Model handed to ``NN.NeuralNetwork`` and saved by ``train``.
            phase: ``'active'`` or ``'passive'``; selects the query policy.
            path: Directory (relative to ``.``) where results are written.
            stopping: Pair ``(criterion_name, value)`` for the stopping
                criterion.
        """
        self.dataset = dataset
        self.labels = labels
        self.testset = testset
        self.testlab = testlab
        self.model = model
        self.phase = phase
        self.classes = int(max(labels))
        self.alibox = ToolBox(X=dataset, y=labels, query_type='AllLabels',
                              saving_path='./%s' % path)
        self.alibox.split_AL(test_ratio=0, initial_label_rate=0.05,
                             split_count=1)
        self.stopping_criterion = self.alibox.get_stopping_criterion(
            stopping[0], value=stopping[1])
        self.query_strategy = QueryInstanceUncertainty(
            X=dataset, y=labels, measure='least_confident')
        self.random = QueryRandom()
        self.unc_result = []
        self.title = ''
        self.acc = []
        self.gmeans = []
        self.recall = []
        self.precision = []
        self.specificity = []
        self.auc = []
        self.f1 = []
        self.pos = []
        self.neg = []
        self.ratio = []
        self.loss = []
        self.mcc = []
        self.path = path

    @staticmethod
    def _safe_div(numerator, denominator):
        """Return ``numerator / denominator`` as a float, NaN if undefined."""
        denominator = float(denominator)
        if denominator == 0.0:
            return float('nan')
        return float(numerator) / denominator

    @staticmethod
    def _binary_metrics(conf_mat):
        """Derive precision, recall, specificity, G-mean and MCC.

        Args:
            conf_mat: 2x2 confusion matrix indexed as ``[true, predicted]``.

        Returns:
            Dict with keys ``precision``, ``recall``, ``specificity``,
            ``gmeans`` and ``mcc``; undefined ratios are NaN.
        """
        # Work in floats so the four-factor MCC denominator cannot overflow
        # the integer type of the confusion matrix.
        tn, fp = float(conf_mat[0, 0]), float(conf_mat[0, 1])
        fn, tp = float(conf_mat[1, 0]), float(conf_mat[1, 1])
        div = TorchFold._safe_div
        recall = div(tp, tp + fn)
        specificity = div(tn, tn + fp)
        return {
            'precision': div(tp, tp + fp),
            'recall': recall,
            'specificity': specificity,
            'gmeans': sqrt(recall * specificity),
            'mcc': div(tn * tp - fn * fp,
                       sqrt((tn + fp) * (tn + fn) * (tp + fp) * (tp + fn))),
        }

    def _record_evaluation(self, pred, label_ind):
        """Score ``pred`` against the test labels and append to every list.

        Args:
            pred: Predictions for ``self.testset``.
            label_ind: Current labeled index collection (for class counts).

        Returns:
            Tuple ``(gmeans, conf_mat, n_neg, n_pos)``.
        """
        conf_mat = confusion_matrix(y_true=self.testlab, y_pred=pred)
        scores = self._binary_metrics(conf_mat)
        f1 = metrics.f1_score(y_true=self.testlab, y_pred=pred)
        auc = metrics.roc_auc_score(y_true=self.testlab, y_score=pred)
        accuracy = self.alibox.calc_performance_metric(
            y_true=self.testlab,
            y_pred=pred.reshape(list(self.testlab.shape)),
            performance_metric='accuracy_score')

        self.auc.append(auc)
        self.acc.append(accuracy)
        self.f1.append(f1)
        self.gmeans.append(scores['gmeans'])
        self.recall.append(scores['recall'])
        self.precision.append(scores['precision'])
        self.specificity.append(scores['specificity'])
        self.mcc.append(scores['mcc'])

        # Class balance of the labeled pool; NaN when no positive is labeled.
        lab = list(self.labels[label_ind.index])
        n_pos, n_neg = lab.count(1), lab.count(0)
        self.pos.append(n_pos)
        self.neg.append(n_neg)
        self.ratio.append(self._safe_div(n_neg, n_pos))
        return scores['gmeans'], conf_mat, n_neg, n_pos

    def _select_batch(self, net, label_ind, unlab_ind, iteration, total,
                      lab_init):
        """Pick the next batch of unlabeled indexes for the current phase.

        While fewer than 30% of all examples are labeled the batch size is
        40% of the initial labeled count; afterwards it is 40% of the current
        labeled count. In the active phase every 10th early iteration queries
        at random instead of by uncertainty.

        Raises:
            ValueError: If ``self.phase`` is neither 'active' nor 'passive'.
        """
        below_threshold = len(label_ind) < total * 0.3
        if below_threshold:
            batch_size = int(lab_init * 0.4)
        else:
            batch_size = int(len(label_ind) * 0.4)

        if self.phase == 'active':
            net.predict(self.dataset[unlab_ind.index])
            prob_pred = net.probablistic_matrix()
            if below_threshold and iteration % 10 == 0:
                return self.random.select(label_ind, unlab_ind,
                                          batch_size=batch_size)
            return self.query_strategy.select_by_prediction_mat(
                unlabel_index=unlab_ind, predict=prob_pred,
                batch_size=batch_size)
        if self.phase == 'passive':
            return self.random.select(label_ind, unlab_ind,
                                      batch_size=batch_size)
        raise ValueError(
            "phase must be 'active' or 'passive', got %r" % (self.phase,))

    def train(self):
        """Run the experiment, dump every metric list and build the analyser.

        Per-iteration models and the metric lists are written below
        ``./<path>/<fold>/``; the analyser is stored in ``self.analyser``.
        """
        for fold in range(1):
            os.makedirs('%s/%d' % (self.path, fold), exist_ok=True)

            # Data split and intermediate-results saver of this fold.
            _, _, label_ind, unlab_ind = self.alibox.get_split(fold)
            saver = self.alibox.get_stateio(fold)

            net = NN.NeuralNetwork(model=self.model, num_classes=2,
                                   batch_size=500, device_ids=[0], epochs=50)
            net.lr_fc = 0.0001
            net.initiate(self.dataset[label_ind.index],
                         self.labels[label_ind.index])

            # Initial performance point, before any query.
            net.predict(self.testset)
            gmeans, _, _, _ = self._record_evaluation(net.preds, label_ind)
            total = len(label_ind) + len(unlab_ind)
            lab_init = len(label_ind)
            saver.set_initial_point(gmeans)

            iteration = 0
            while not self.stopping_criterion.is_stop():
                iteration += 1
                select_ind = self._select_batch(
                    net, label_ind, unlab_ind, iteration, total, lab_init)
                label_ind.update(select_ind)
                unlab_ind.difference_update(select_ind)

                # Retrain on the enlarged labeled pool and re-evaluate.
                loss = net.train(self.dataset[label_ind.index],
                                 self.labels[label_ind.index])
                net.predict(self.testset)
                gmeans, conf_mat, n_neg, n_pos = self._record_evaluation(
                    net.preds, label_ind)
                self.loss.append(loss)

                # Save the results of this iteration.
                st = self.alibox.State(select_ind, gmeans)
                saver.add_state(st)
                saver.save()
                self.stopping_criterion.update_information(saver)

                print('\n class \n0 and 1\n', n_neg, n_pos)
                print('\n', conf_mat)
                torch.save(self.model, './%s/%d/model%d'
                           % (self.path, fold, iteration))

            self.stopping_criterion.reset()
            self.unc_result.append(copy.deepcopy(saver))

            dumps = (
                ('auc', self.auc),
                ('acc', self.acc),
                ('f1', self.f1),
                ('gmeans', self.gmeans),
                ('recall', self.recall),
                ('precision', self.precision),
                ('specificity', self.specificity),
                ('pos', self.pos),
                ('neg', self.neg),
                ('ratio', self.ratio),
                ('mcc', self.mcc),
            )
            for name, values in dumps:
                joblib.dump(values, './%s/%d/%s' % (self.path, fold, name))

        self.analyser = self.alibox.get_experiment_analyser(
            x_axis='num_of_queries')
        self.analyser.add_method(method_name='QueryInstanceUncertaity-lc',
                                 method_results=self.unc_result)
        print(self.analyser)
class TorchRegressionFold:
    """One-fold active/passive learning experiment for a torch regressor.

    A ``NN.NeuralNetworkRegressor`` wrapper around ``model`` is initialised on
    the initially labeled pool and retrained after every query batch. After
    each evaluation on the fixed test set one value is appended to ``mse``,
    ``mae``, ``max``, ``evs``, ``r2`` and ``sample``; ``loss`` gets one value
    per retraining. ``one`` .. ``six`` hold up to six earlier prediction
    vectors over the unlabeled pool (``six`` is the newest).
    """

    def __init__(self, dataset, labels, testset, testlab, model, phase, path,
                 stopping, measure='nearest_neighbor', distance='linear'):
        """Create the ALiPy tool box, the data split and the query strategies.

        Args:
            dataset: Pool features, indexable by a list of row indexes.
            labels: Pool targets, indexable by a list of row indexes.
            testset: Test features used for every evaluation.
            testlab: Test targets used for every evaluation.
            model: Model handed to ``NN.NeuralNetworkRegressor`` and saved by
                ``train``.
            phase: ``'active'`` or ``'passive'``; selects the query policy.
            path: Directory (relative to ``.``) where results are written.
            stopping: Pair ``(criterion_name, value)`` for the stopping
                criterion.
            measure: ``'residue'`` selects ``QueryInstanceResidueRegressor``;
                any other value is passed to ``QueryInstanceDistribution``.
            distance: Distance option for the residue strategy.
        """
        self.dataset = dataset
        self.labels = labels
        self.testset = testset
        self.testlab = testlab
        self.model = model
        self.phase = phase
        self.classes = int(max(labels))
        # The ToolBox is given all-zero integer placeholder labels instead of
        # the regression targets. `int` replaces the alias `np.int`, which was
        # removed from NumPy.
        self.alibox = ToolBox(X=dataset,
                              y=np.asarray([0] * len(labels), dtype=int),
                              query_type='AllLabels',
                              saving_path='./%s' % path)
        self.alibox.split_AL(test_ratio=0, initial_label_rate=0.05,
                             split_count=1)
        self.stopping_criterion = self.alibox.get_stopping_criterion(
            stopping[0], value=stopping[1])
        self.measure = measure
        if measure == 'residue':
            self.query_strategy = QueryInstanceResidueRegressor(
                X=self.dataset, y=self.labels, distance=distance)
        else:
            self.query_strategy = QueryInstanceDistribution(measure=measure)
        self.random = QueryRandom()
        self.unc_result = []
        self.title = ''
        self.loss = []
        self.path = path
        self.one = self.two = self.three = None
        self.four = self.five = self.six = None
        self.max, self.mae, self.mse, self.evs, self.r2 = [], [], [], [], []
        self.sample = []

    def _record_scores(self, pred, label_ind):
        """Append the regression scores of ``pred`` and return its MSE.

        Args:
            pred: Predictions for ``self.testset``.
            label_ind: Current labeled index collection (its size is logged).
        """
        from sklearn.metrics import (explained_variance_score, max_error,
                                     mean_absolute_error, mean_squared_error,
                                     r2_score)

        mse_value = mean_squared_error(self.testlab, pred)
        self.mse.append(mse_value)
        self.mae.append(mean_absolute_error(self.testlab, pred))
        self.max.append(max_error(self.testlab, pred))
        self.evs.append(explained_variance_score(self.testlab, pred))
        self.r2.append(r2_score(self.testlab, pred))
        self.sample.append(len(label_ind.index))
        return mse_value

    def _select_batch(self, net, label_ind, unlab_ind, iteration, total,
                      lab_init, last_selected, last_unlab):
        """Pick the next batch of unlabeled indexes for the current phase.

        While fewer than 60% of all examples are labeled the batch size equals
        the initial labeled count; afterwards it is 30% of the current labeled
        count. In the active phase every 10th early iteration queries at
        random instead of using the query strategy.

        Args:
            last_selected: Indexes queried in the previous iteration.
            last_unlab: Unlabeled indexes as they were before that query.

        Raises:
            ValueError: If ``self.phase`` is neither 'active' nor 'passive'.
        """
        below_threshold = len(label_ind) < total * 0.6
        if below_threshold:
            batch_size = int(lab_init * 1)
        else:
            batch_size = int(len(label_ind) * 0.3)

        if self.phase == 'passive':
            return self.random.select(label_ind, unlab_ind,
                                      batch_size=batch_size)
        if self.phase != 'active':
            raise ValueError(
                "phase must be 'active' or 'passive', got %r" % (self.phase,))

        # The residue measure predicts on the labeled pool, every other
        # measure on the unlabeled pool.
        if self.measure != 'residue':
            net.predict(self.dataset[unlab_ind.index])
        else:
            net.predict(self.dataset[label_ind.index])
        pred = net.preds

        if self.measure == 'distance':
            if iteration == 1:
                self._update_previous_prediction(pred)
            else:
                self._update_previous_prediction(pred, last_selected,
                                                 last_unlab)
            previous = self._get_previous_prediction()
        else:
            previous = None

        if below_threshold and iteration % 10 == 0:
            return self.random.select(label_ind, unlab_ind,
                                      batch_size=batch_size)
        return self.query_strategy.select_by_prediction(
            unlabel_index=unlab_ind, predict=pred,
            labels=self.labels[label_ind.index], batch_size=batch_size,
            X_lab=self.dataset[label_ind.index],
            X_unlab=self.dataset[unlab_ind.index], previous=previous)

    def train(self):
        """Run the experiment, dump every score list and build the analyser.

        Per-iteration models, the score lists, the test targets and the last
        test predictions are written below ``./<path>/<fold>/``; the analyser
        is stored in ``self.analyser``.
        """
        for fold in range(1):
            os.makedirs('%s/%d' % (self.path, fold), exist_ok=True)

            # Data split and intermediate-results saver of this fold.
            _, _, label_ind, unlab_ind = self.alibox.get_split(fold)
            saver = self.alibox.get_stateio(fold)

            net = NN.NeuralNetworkRegressor(model=self.model, batch_size=1,
                                            device_ids=[0], epochs=50)
            net.lr_fc = 0.01
            net.initiate(self.dataset[label_ind.index],
                         self.labels[label_ind.index])

            # Initial performance point, before any query.
            net.predict(self.testset)
            pred = net.preds
            total = len(label_ind) + len(unlab_ind)
            lab_init = len(label_ind)
            saver.set_initial_point(self._record_scores(pred, label_ind))

            select_ind = None
            unlab_ind_save = None
            iteration = 0
            while not self.stopping_criterion.is_stop():
                iteration += 1

                # Shrink the learning rate as the labeled share grows.
                lr_fc = net.lr_fc * (1 - len(label_ind.index) / (total * 1.001))
                for group in net.optimizer.param_groups:
                    group['lr'] = lr_fc
                print('learning rate is',
                      net.optimizer.state_dict()['param_groups'][0]['lr'])

                select_ind = self._select_batch(
                    net, label_ind, unlab_ind, iteration, total, lab_init,
                    select_ind, unlab_ind_save)

                # Remember the pool before the update; the next iteration
                # needs it to align the stored predictions.
                unlab_ind_save = unlab_ind.index
                label_ind.update(select_ind)
                unlab_ind.difference_update(select_ind)

                # Retrain on the enlarged labeled pool and re-evaluate.
                loss = net.train(self.dataset[label_ind.index],
                                 self.labels[label_ind.index])
                net.predict(self.testset)
                pred = net.preds
                mse_value = self._record_scores(pred, label_ind)
                self.loss.append(loss)

                # Save the results of this iteration.
                st = self.alibox.State(select_ind, mse_value)
                saver.add_state(st)
                saver.save()
                self.stopping_criterion.update_information(saver)
                torch.save(self.model, './%s/%d/model%d'
                           % (self.path, fold, iteration))

            self.stopping_criterion.reset()
            self.unc_result.append(copy.deepcopy(saver))

            dumps = (
                ('mse', self.mse),
                ('mae', self.mae),
                ('max', self.max),
                ('evs', self.evs),
                ('r2', self.r2),
                ('sample', self.sample),
                ('loss', self.loss),
                ('testlab', self.testlab),
                ('pred', pred),
            )
            for name, values in dumps:
                joblib.dump(values, './%s/%d/%s' % (self.path, fold, name))

        self.analyser = self.alibox.get_experiment_analyser(
            x_axis='num_of_queries')
        self.analyser.add_method(
            method_name='QueryInstanceDistribution-distance',
            method_results=self.unc_result)
        print(self.analyser)

    def _update_previous_prediction(self, new, selected=None, unlab=None):
        """Shift the stored prediction history and store ``new`` as newest.

        Every stored vector moves one slot towards ``one``; on the way the
        entries of the examples in ``selected`` are deleted so the history
        stays aligned with the shrunken unlabeled pool.

        Args:
            new: Newest prediction vector; becomes ``self.six``.
            selected: Indexes queried since the previous call. Required once
                a prediction has been stored.
            unlab: Unlabeled indexes the stored predictions refer to, in
                order. Required once a prediction has been stored.
        """
        if self.six is not None:
            # Map each index to its first position once instead of scanning
            # the pool for every selected example.
            positions = {}
            for position, index in enumerate(unlab):
                positions.setdefault(index, position)
            del_ind = [positions[i] for i in selected]

            if self.two is not None:
                self.one = np.delete(self.two, del_ind)
            if self.three is not None:
                self.two = np.delete(self.three, del_ind)
            if self.four is not None:
                self.three = np.delete(self.four, del_ind)
            if self.five is not None:
                self.four = np.delete(self.five, del_ind)
            self.five = np.delete(self.six, del_ind)
        self.six = new

    def _get_previous_prediction(self):
        """Return the stored predictions stacked row-wise, oldest first.

        Returns:
            A stacked array with 6, 5 or 4 rows, depending on the oldest
            filled slot, or ``None`` while fewer than four vectors are stored.
        """
        history = (self.one, self.two, self.three,
                   self.four, self.five, self.six)
        for start in range(3):
            if history[start] is not None:
                return np.vstack(history[start:])
        return None