class UncertaintySampler(object):
    """Active-learning helper wrapping libact's least-confidence
    UncertaintySampling with a LinearSVC model.

    Unlabeled examples are passed in with a negative label and stored as
    ``None`` inside the libact ``Dataset`` (its unlabeled sentinel).
    """

    def __init__(self, X, y, labs, n=2):
        # Negative labels mark unlabeled points; libact expects None there.
        y = [yy if yy >= 0 else None for yy in y]
        self.dataset = Dataset(X, y)
        self.labs = labs
        self.uc = UncertaintySampling(
            self.dataset, method='lc', model=LinearSVC())
        self.n = n

    def get_next(self):
        """Return the indices of the next `n` most uncertain samples."""
        # NOTE: fixed Python-2-only `print >> sys.stderr` syntax; the rest
        # of this file already uses the Python 3 print function.
        print('get_next: start', file=sys.stderr)
        out = self.uc.make_query(n=self.n)
        print('get_next: done', file=sys.stderr)
        return out

    def set_label(self, idx, label):
        """Record `label` for sample `idx` in the underlying dataset."""
        print('set_label: start', file=sys.stderr)
        out = self.dataset.update(idx, label)
        print('set_label: done', file=sys.stderr)
        return out

    def get_data(self):
        """Return (X, y) with unlabeled entries encoded as -1."""
        X, y = zip(*self.dataset.get_entries())
        X = np.vstack(X)
        y = np.array([yy if yy is not None else -1 for yy in y])
        return X, y

    def n_hits(self):
        """Count samples currently labeled positive (== 1)."""
        # `zip(...)[1]` is not subscriptable in Python 3; collect explicitly.
        labels = np.array([entry[1] for entry in self.dataset.get_entries()])
        return (labels == 1).sum()

    def n_labeled(self):
        """Number of labeled samples."""
        return self.dataset.len_labeled()

    def is_labeled(self, idx):
        """True iff sample `idx` has been labeled.

        BUG FIX: the original tested truthiness via ``np.where``, which
        treated a legitimate label of 0 as "unlabeled". Unlabeled entries
        are ``None``, so test identity explicitly.
        """
        labels = [entry[1] for entry in self.dataset.get_entries()]
        return labels[idx] is not None

    def save(self, outpath):
        """ !! This should be updated to save in same format as simple_las """
        X, y = self.get_data()
        f = h5py.File(
            '%s-%s-%s.h5' % (outpath, 'uncertainty',
                             datetime.now().strftime('%Y%m%d_%H%M%S')))
        f['X'] = X
        f['y'] = y
        f['labs'] = self.labs
        f.close()
def run_featureselection(trn_dss, tst_ds, y_train, model, method_, qs, X_test,
                         y_test, all_cols, save_name, save, type_, part=20):
    """Batch active learning with per-batch feature selection.

    Each iteration: (1) run feature selection on the currently labeled
    training data, (2) rebuild the train/test matrices on the selected
    columns, (3) query `part` samples with the chosen strategy, (4) train
    and score on the held-out test set.

    Parameters
    ----------
    trn_dss : libact Dataset with partially labeled training data.
    tst_ds : test Dataset (currently unused; kept for interface parity).
    y_train : full oracle labels, indexed by query id.
    model : base learner (or list of learners for 'qbc').
    method_ : sub-method for the sampler ('qbc' uses it as the scorer —
        presumably a trained committee surrogate; verify against callers).
    qs : query strategy placeholder (rebuilt every iteration).
    X_test, y_test : held-out evaluation data.
    all_cols : column names fed to feature_selection.
    save_name, save : CSV output path and on/off flag.
    type_ : one of 'random', 'unc', 'qbc', 'dens'.
    part : batch size per iteration (default 20).

    Returns
    -------
    (quota grid, iteration index list, f1 scores, tn, fp, fn, tp,
     final labeled count, training data, labels asked, ids asked,
     selected features per iteration)
    """
    f1score = []
    features_ls = []
    label_holder, asked_id = [], []
    tn, fp, fn, tp = [], [], [], []
    k = trn_dss.len_labeled()
    k_beg = trn_dss.len_labeled()
    quota = len(trn_dss.data)
    iter_ = 0
    while k < quota:
        clear_output(wait=True)
        lbls, asks = [], []
        # Never query more than what is still unlabeled.
        part1 = min(part, trn_dss.len_unlabeled())

        # -------------------> Feature Selection
        # Select features using only the currently labeled entries.
        X_train_feature = [i[0] for i in trn_dss.get_labeled_entries()]
        y_train_feature = [i[1] for i in trn_dss.get_labeled_entries()]
        col_index, features_f = feature_selection(
            X_train_feature, y_train_feature, all_cols, f_class=True)
        features_ls.append(features_f)

        # Rebuild train/test matrices restricted to the selected columns.
        X_train_updated = [i[0][col_index] for i in trn_dss.data]
        y_train_updated = [i[1] for i in trn_dss.data]
        trn_dss_updated = Dataset(X_train_updated, y_train_updated)
        X_test_feature = [i[col_index] for i in X_test]

        # The sampler must be rebuilt each round on the reduced dataset.
        if type_ == 'random':
            qs = RandomSampling(trn_dss_updated, method=method_, model=model)
            model1 = model
        elif type_ == 'unc':
            qs = UncertaintySampling(
                trn_dss_updated, method=method_, model=model)
            model1 = model
        elif type_ == 'qbc':
            qs = QueryByCommittee(trn_dss_updated, models=model)
            # NOTE(review): using method_ as the scoring model here looks
            # intentional for 'qbc' but is asymmetric with other branches —
            # confirm against callers.
            model1 = method_
        elif type_ == 'dens':
            qs = DWUS(trn_dss_updated, model=model)
            model1 = model

        for _ in range(part1):
            # make_query returns the index the strategy wants labeled next.
            ask_id = qs.make_query()
            asks.append(ask_id)
            # Look up the oracle label and reveal it to both datasets.
            lb = y_train[ask_id]
            lbls.append(lb)
            trn_dss.update(ask_id, lb)
            trn_dss_updated.update(ask_id, lb)
        label_holder.append(lbls)
        asked_id.append(asks)

        # Train on the labeled, feature-reduced data and evaluate.
        model1.train(trn_dss_updated)
        pred_y = model1.predict(X_test_feature)
        f1score.append(f1_score(y_test, pred_y))
        # Compute the confusion matrix once (original recomputed it 4x).
        cm = confusion_matrix(y_test, pred_y)
        tn.append(cm[0][0])
        fp.append(cm[0][1])
        fn.append(cm[1][0])
        tp.append(cm[1][1])

        k = trn_dss_updated.len_labeled()
        print(k)
        print(quota)
        print('iteration:', iter_)
        print(len(f1score))
        print('train dataset labeled:', trn_dss.len_labeled())
        print('train dataset shape:', trn_dss.format_sklearn()[0].shape)
        print('train dataset sum:', trn_dss.format_sklearn()[1].sum())
        print('Current f1 score:', f1_score(y_test, pred_y))
        print('Current progress:', np.round(k / quota * 100, 2), '%')
        print('Chosen_features:', features_f)
        iter_ = iter_ + 1

    # Rebind iter_ (loop counter) to the per-iteration index list that is
    # returned / saved — preserved from the original interface.
    q = [i for i in range(k_beg, quota, part)]
    iter_ = [i for i in range(0, len(f1score))]
    if save:
        saved_file = pd.DataFrame({
            'iter': iter_,
            'quota': q,
            'f1_score': f1score,
            'tn': tn,
            'fp': fp,
            'fn': fn,
            'tp': tp,
            'id_index': asked_id,
            'label': label_holder,
            'features': features_ls
        })
        saved_file.to_csv(save_name)
    return (q, iter_, f1score, tn, fp, fn, tp, k, trn_dss.data,
            label_holder, asked_id, features_ls)