class AdaptorLibAct:
    """Adaptor that drives a libact query strategy through a
    start / make_iteration / choose_samples_for_annotation protocol."""

    def __init__(self, X_full_dataset, y_full_dataset,
                 libact_query_alg_ctor, max_samples_number=40):
        """Wrap the full pool in a libact ``Dataset``.

        X_full_dataset, y_full_dataset -- the full sample pool and its
            (possibly partially ``None``) labels.
        libact_query_alg_ctor -- callable building a libact query
            strategy from a ``Dataset``.
        max_samples_number -- how many samples to request per query.
        """
        self._train_dataset = Dataset(X_full_dataset, y_full_dataset)
        self._ctor = libact_query_alg_ctor
        self._max_samples_number = max_samples_number
        # Clear libact's internal update callbacks: updates are driven
        # manually via make_iteration, so no strategy should auto-react.
        self._train_dataset._update_callback = set()

    def start(self):
        """(Re)create the wrapped strategy around the current dataset."""
        self._libact_query_alg = MultipleQueryStrategy(
            impl=self._ctor(self._train_dataset),
            query_n=self._max_samples_number)
        # Constructing the strategy may have re-registered callbacks;
        # drop them again for the same reason as in __init__.
        self._train_dataset._update_callback = set()

    def make_iteration(self, indexes, y):
        """Record newly labeled samples in the dataset, then notify
        the query strategy once with the whole batch."""
        n_updates = indexes.shape[0]
        for pos in range(n_updates):
            self._train_dataset.update(indexes[pos], y[pos])
        self._libact_query_alg.update(indexes, y)

    def choose_samples_for_annotation(self):
        """Return the pool indexes the strategy wants annotated next."""
        return np.array(list(self._libact_query_alg.make_query()))
def libact_QBC(X, y, n_queries):
    """Run a Query-by-Committee active-learning loop for ``n_queries``
    rounds, starting from three labeled seed samples (indexes 0, 50, 100
    seeded with classes 0, 1, 2) and labeling via an ideal oracle."""
    # Almost fully unlabeled pool: only three seed points carry labels.
    y_seed = np.array([None] * len(y))
    for seed_idx, seed_label in ((0, 0), (50, 1), (100, 2)):
        y_seed[seed_idx] = seed_label
    libact_train_dataset = Dataset(X, y_seed)
    # Two-member committee of identical logistic-regression learners.
    committee = [
        LogisticRegressionLibact(solver='liblinear', n_jobs=1,
                                 multi_class='ovr')
        for _ in range(2)
    ]
    strategy = QueryByCommittee(libact_train_dataset, models=committee,
                                method='lc')
    # The oracle answers queries from the fully labeled dataset.
    oracle = IdealLabeler(Dataset(X, y))
    for member in committee:
        member.train(libact_train_dataset)
    for _ in range(n_queries):
        pick = strategy.make_query()
        answer = oracle.label(X[pick])
        libact_train_dataset.update(pick, answer)
        # Retrain every committee member on the grown labeled set.
        for member in committee:
            member.train(libact_train_dataset)
def libact_EER(X, y, n_queries):
    """Run an Expected-Error-Reduction active-learning loop for
    ``n_queries`` rounds, starting from three labeled seed samples
    (indexes 0, 50, 100 seeded with classes 0, 1, 2)."""
    # Almost fully unlabeled pool: only three seed points carry labels.
    y_seed = np.array([None] * len(y))
    y_seed[0] = 0
    y_seed[50] = 1
    y_seed[100] = 2
    pool = Dataset(X, y_seed)
    # The oracle answers queries from the fully labeled dataset.
    oracle = IdealLabeler(Dataset(X, y))
    learner = LogisticRegressionLibact(
        solver='liblinear', n_jobs=1, multi_class='ovr')
    strategy = EER(pool, model=learner, loss='01')
    learner.train(pool)
    for _ in range(n_queries):
        pick = strategy.make_query()
        pool.update(pick, oracle.label(X[pick]))
        # Retrain on the grown labeled set after each answered query.
        learner.train(pool)
class UncertaintySampler(object):
    """Pool-based uncertainty sampler wrapping libact's
    ``UncertaintySampling`` (least-confidence, LinearSVC model).

    Fixed for Python 3: the original used the Python 2-only
    ``print >> sys.stderr`` statement and subscripted ``zip(...)``
    directly, both of which fail on Python 3.
    """

    def __init__(self, X, y, labs, n=2):
        """Build the pool.

        X -- feature matrix for the whole pool.
        y -- labels; values < 0 are treated as unlabeled (stored as None).
        labs -- label metadata, persisted as-is by save().
        n -- batch size returned by get_next().
        """
        y = [yy if yy >= 0 else None for yy in y]
        self.dataset = Dataset(X, y)
        self.labs = labs
        self.uc = UncertaintySampling(self.dataset, method='lc',
                                      model=LinearSVC())
        self.n = n

    def get_next(self):
        """Return the indexes of the next ``n`` samples to label."""
        print('get_next: start', file=sys.stderr)
        out = self.uc.make_query(n=self.n)
        print('get_next: done', file=sys.stderr)
        return out

    def set_label(self, idx, label):
        """Record ``label`` for pool entry ``idx``."""
        print('set_label: start', file=sys.stderr)
        out = self.dataset.update(idx, label)
        print('set_label: done', file=sys.stderr)
        return out

    def get_data(self):
        """Return (X, y) with unlabeled entries encoded as -1."""
        X, y = zip(*self.dataset.get_entries())
        X, y = np.vstack(X), np.array(
            [yy if yy is not None else -1 for yy in y])
        return X, y

    def n_hits(self):
        """Count entries labeled 1 (positives)."""
        # zip() is an iterator in Python 3; materialize before indexing.
        labels = np.array(list(zip(*self.dataset.get_entries()))[1])
        return (labels == 1).sum()

    def n_labeled(self):
        """Number of labeled entries in the pool."""
        return self.dataset.len_labeled()

    def is_labeled(self, idx):
        """True if pool entry ``idx`` carries a (truthy) label."""
        # None labels are falsy, so np.where keeps only labeled indexes.
        return idx in np.where(list(zip(*self.dataset.get_entries()))[1])[0]

    def save(self, outpath):
        """ !! This should be updated to save in same format as simple_las """
        X, y = self.get_data()
        f = h5py.File(
            '%s-%s-%s.h5' % (outpath, 'uncertainty',
                             datetime.now().strftime('%Y%m%d_%H%M%S')))
        f['X'] = X
        f['y'] = y
        f['labs'] = self.labs
        f.close()
# NOTE(review): this chunk appears to be the interior of an outer loop
# (`i += 1`, `bounds_old = bounds_new` suggest a while/for header outside
# this view); names D, L, qs, center0, BD, expand_pool, get_label come
# from the enclosing scope — confirm against the full file.
# Generate a pool and expand dataset
pool, bounds_new = expand_pool(D, bounds_old, expansion_rate)
for entry in pool:
    dataset.append(entry)
# Query a new sample
ask_id, clf = qs.make_query(center)
# ask_id, clf = qs1.make_query()
# ask_id = qs2.make_query()
# clf.train(dataset)
new = dataset.data[ask_id][0].reshape(1, -1)
# Update model and dataset
l = get_label(new, landmark, threshold)
dataset.update(ask_id, l)  # update dataset
# Keep parallel records of the queried points (D) and their labels (L).
D = np.vstack((D, new))
L = np.append(L, l)
# Recenter the query only when the last 10 labels contain BOTH classes;
# otherwise fall back to the initial center.
if np.any(np.array(L[-10:]) == 1) and np.any(np.array(L[-10:]) == -1):
    center = D[np.array(L) == 1][-1]  # the last positive sample
else:
    center = center0
i += 1
bounds_old = bounds_new
# Create a mesh grid (200x200 over the BD bounding box, presumably for
# plotting the decision surface — TODO confirm downstream use)
xx, yy = np.meshgrid(np.linspace(BD[0][0], BD[1][0], 200),
                     np.linspace(BD[0][1], BD[1][1], 200))
def run_featureselection(trn_dss, tst_ds, y_train, model, method_, qs,
                         X_test, y_test, all_cols, save_name, save, type_,
                         part=20):
    """Batch active learning algorithm with per-iteration feature selection.

    Each iteration: select features on the currently labeled data, rebuild
    the training Dataset on those features, query up to ``part`` samples
    with the chosen strategy (``type_`` in {'random','unc','qbc','dens'}),
    retrain and score on the (feature-filtered) test set.

    Returns (q, iter_, f1score, tn, fp, fn, tp, k, trn_dss.data,
    label_holder, asked_id, features_ls); optionally saves a CSV when
    ``save`` is truthy.

    Fix vs. original: confusion_matrix/f1_score were recomputed up to
    four/two times per iteration; they are now computed once and reused.
    """
    E_in, E_out = [], []  # kept for the commented-out score tracking below
    f1score = []
    features_ls = []
    label_holder, asked_id = [], []
    tn, fp, fn, tp = [], [], [], []
    k = trn_dss.len_labeled()
    k_beg = trn_dss.len_labeled()
    quota = len(trn_dss.data)  # stop once everything is labeled
    iter_ = 0
    while (k < quota):
        clear_output(wait=True)
        # Standard usage of libact objects.
        lbls, asks = [], []
        # Never ask for more samples than remain unlabeled.
        if (part < trn_dss.len_unlabeled()):
            part1 = part
        else:
            part1 = trn_dss.len_unlabeled()
        # -------------------> Feature Selection
        # Select features using only the currently labeled entries.
        X_train_feature = [i[0] for i in trn_dss.get_labeled_entries()]
        y_train_feature = [i[1] for i in trn_dss.get_labeled_entries()]
        col_index, features_f = feature_selection(
            X_train_feature, y_train_feature, all_cols, f_class=True)
        features_ls.append(features_f)
        # Rebuild train/test views restricted to the selected columns.
        X_train_updated = [i[0][col_index] for i in trn_dss.data]
        y_train_updated = [i[1] for i in trn_dss.data]
        trn_dss_updated = Dataset(X_train_updated, y_train_updated)
        X_test_feature = [i[col_index] for i in X_test]
        # Rebuild the query strategy on the feature-filtered dataset.
        if (type_ == 'random'):
            qs = RandomSampling(trn_dss_updated, method=method_, model=model)
            model1 = model
        elif (type_ == 'unc'):
            qs = UncertaintySampling(trn_dss_updated, method=method_,
                                     model=model)
            model1 = model
        elif (type_ == 'qbc'):
            qs = QueryByCommittee(trn_dss_updated, models=model)
            # NOTE(review): for 'qbc' the trained/predicting model is taken
            # from method_, not model — looks intentional but verify.
            model1 = method_
        elif (type_ == 'dens'):
            qs = DWUS(trn_dss_updated, model=model)
            model1 = model
        for i in range(0, part1):
            # make_query returns the index of the sample the strategy
            # would like labeled next.
            ask_id = qs.make_query()
            asks.append(ask_id)
            # Look up the true label of the queried sample.
            lb = y_train[ask_id]
            lbls.append(lb)
            # Update both views (full-feature and filtered) of the pool.
            trn_dss.update(ask_id, lb)
            trn_dss_updated.update(ask_id, lb)
        label_holder.append(lbls)
        asked_id.append(asks)
        # Train only on the labeled examples with the chosen features.
        model1.train(trn_dss_updated)
        pred_y = model1.predict(X_test_feature)
        # Compute metrics once per iteration and reuse them.
        current_f1 = f1_score(y_test, pred_y)
        cm = confusion_matrix(y_test, pred_y)
        f1score.append(current_f1)
        tn.append(cm[0][0])
        fp.append(cm[0][1])
        fn.append(cm[1][0])
        tp.append(cm[1][1])
        # score returns the mean accuracy of the results
        #E_in = np.append(E_in, 1 - model.score(trn_dss)) #train
        #E_out = np.append(E_out, 1 - model.score(tst_ds)) #test
        k = trn_dss_updated.len_labeled()
        print(k)
        print(quota)
        print('iteration:', iter_)
        print(len(f1score))
        print('train dataset labeled:', trn_dss.len_labeled())
        print('train dataset shape:', trn_dss.format_sklearn()[0].shape)
        print('train dataset sum:', trn_dss.format_sklearn()[1].sum())
        print('Current f1 score:', current_f1)
        print('Current progress:', np.round(k / quota * 100, 2), '%')
        print('Chosen_features:', features_f)
        # number of iterations
        iter_ = iter_ + 1
    q = [i for i in range(k_beg, quota, part)]
    iter_ = [i for i in range(0, len(f1score))]
    if save:
        saved_file = pd.DataFrame({
            'iter': iter_,
            'quota': q,
            'f1_score': f1score,
            'tn': tn,
            'fp': fp,
            'fn': fn,
            'tp': tp,
            'id_index': asked_id,
            'label': label_holder,
            'features': features_ls
        })
        saved_file.to_csv(save_name)
    return (q, iter_, f1score, tn, fp, fn, tp, k, trn_dss.data,
            label_holder, asked_id, features_ls)