def test_mlc_label(self):
    """Multi-label case: label() returns the full label vector of the match."""
    oracle = IdealLabeler(self.setup_mlc_dataset())
    cases = [
        (np.array([12., 5., 2., 11., 14.]), [0, 1, 0, 0, 1]),
        (np.array([6., 2., 21., 20., 5.]), [0, 0, 1, 0, 1]),
    ]
    for feature, expected in cases:
        np.testing.assert_array_equal(oracle.label(feature), expected)
def test_label(self):
    """Single-label case: label() looks up the true label by feature vector."""
    oracle = IdealLabeler(self.setup_dataset())
    cases = [
        (np.array([0, 1, 2]), 1),
        (np.array([6, 7, 8]), 3),
        ([12, 13, 14], 4),  # a plain list is accepted as well
    ]
    for feature, expected in cases:
        self.assertEqual(oracle.label(feature), expected)
def libact_QBC(X, y, n_queries):
    """Run Query-by-Committee active learning with libact for n_queries rounds.

    Starts from three labelled seed points (indices 0, 50, 100 with classes
    0, 1, 2) and iteratively queries the ideal labeler, retraining the
    committee after every update.
    """
    # All labels hidden except the three seeds.
    partial_labels = np.array([None] * len(y))
    partial_labels[0], partial_labels[50], partial_labels[100] = 0, 1, 2
    train_ds = Dataset(X, partial_labels)
    oracle = IdealLabeler(Dataset(X, y))

    # Two identical logistic-regression committee members; disagreement
    # between them drives the query selection.
    committee = [
        LogisticRegressionLibact(solver='liblinear', n_jobs=1,
                                 multi_class='ovr')
        for _ in range(2)
    ]
    qs = QueryByCommittee(train_ds, models=committee, method='lc')

    for member in committee:
        member.train(train_ds)

    for _ in range(n_queries):
        idx = qs.make_query()
        train_ds.update(idx, oracle.label(X[idx]))
        for member in committee:
            member.train(train_ds)
def libact_EER(X, y, n_queries):
    """Run Expected Error Reduction active learning with libact.

    Same seeding scheme as libact_QBC: indices 0, 50, 100 start labelled
    with classes 0, 1, 2; every query is answered by the ideal labeler and
    the learner is retrained.
    """
    partial_labels = np.array([None] * len(y))
    partial_labels[0], partial_labels[50], partial_labels[100] = 0, 1, 2
    train_ds = Dataset(X, partial_labels)
    oracle = IdealLabeler(Dataset(X, y))

    learner = LogisticRegressionLibact(
        solver='liblinear', n_jobs=1,
        multi_class='ovr')  #SVM(gamma='auto', probability=True)
    qs = EER(train_ds, model=learner, loss='01')

    learner.train(train_ds)
    for _ in range(n_queries):
        idx = qs.make_query()
        train_ds.update(idx, oracle.label(X[idx]))
        learner.train(train_ds)
def main():
    """Interactive demo comparing UncertaintySampling against RandomSampling.

    Queries `quota` samples with each strategy, answers them with an
    IdealLabeler built from the full dataset, and live-plots the test error
    of both strategies after every query.
    """
    quota = 10  # ask human to label 10 samples
    n_classes = 5

    trn_ds, tst_ds, ds = split_train_test(n_classes)
    trn_ds2 = copy.deepcopy(trn_ds)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    qs2 = RandomSampling(trn_ds2)
    model = LogisticRegression()

    # Top subplot: error curves for the two strategies.
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Error')

    # Error after training on the initial labelled pools only.
    err_qs, err_rand = [], []
    model.train(trn_ds)
    err_qs = np.append(err_qs, 1 - model.score(tst_ds))
    model.train(trn_ds2)
    err_rand = np.append(err_rand, 1 - model.score(tst_ds))

    query_num = np.arange(0, 1)
    p1, = ax.plot(query_num, err_qs, 'g', label='qs Eout')
    p2, = ax.plot(query_num, err_rand, 'k', label='random Eout')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show(block=False)

    # Bottom subplot: reserved area for displaying the queried sample.
    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    img_ax.set_position(
        [box.x0, box.y0 - box.height * 0.1, box.width, box.height * 0.9])
    plt.show()

    # Give each label its name (labels are from 0 to n_classes-1)
    # lbr = InteractiveLabeler(label_name=[str(lbl) for lbl in range(n_classes)])
    x_ds = ds.data
    print(x_ds.shape)
    y_ds = ds.target
    print(y_ds.shape)
    lbr_ds = Dataset(x_ds, y_ds)
    x, _ = zip(*trn_ds.data)
    print(x)
    lbr = IdealLabeler(lbr_ds)

    for i in range(quota):
        # --- uncertainty-sampling query ---
        ask_id = qs.make_query()
        print("asking sample from Uncertainty Sampling")
        feats, _ = zip(*trn_ds.data)
        trn_ds.update(ask_id, lbr.label(feats[ask_id]))
        model.train(trn_ds)
        err_qs = np.append(err_qs, 1 - model.score(tst_ds))

        # --- random-sampling query ---
        ask_id = qs2.make_query()
        print("asking sample from Random Sample")
        feats, _ = zip(*trn_ds2.data)
        trn_ds2.update(ask_id, lbr.label(feats[ask_id]))
        model.train(trn_ds2)
        err_rand = np.append(err_rand, 1 - model.score(tst_ds))

        # Refresh both error curves with the new point.
        ax.set_xlim((0, i + 1))
        ax.set_ylim((0, max(max(err_qs), max(err_rand)) + 0.2))
        query_num = np.arange(0, i + 2)
        p1.set_xdata(query_num)
        p1.set_ydata(err_qs)
        p2.set_xdata(query_num)
        p2.set_ydata(err_rand)
        plt.show()

    input("Press any key to continue...")
def visualize(dataset_file, word2vec_file, word_freq_file, output_file):
    """Project the unlabeled pool into 2-D with t-SNE and dump it to CSV.

    Trains the CNN readability classifier on the initially labelled portion
    of the training split, computes class-probability vectors for every
    unlabeled sample, embeds those vectors with t-SNE, appends each sample's
    true label as a third column, and writes the rows to `output_file`.

    Fixes vs. previous revision: removed the leftover `import ipdb` debug
    import and dead locals (`result = []`, `test_len`, `X_dev`, `y_dev`,
    `quota`, `batchsize`); index matrix is sliced directly instead of being
    rebuilt via list comprehensions.
    """
    nfolds = 3
    # Only the first (random_state, fold) pair is visualized here; restore
    # itertools.product(get_set_random_states(), range(nfolds)) to sweep all.
    for i, foldid in zip([0], [0]):
        data = read_and_split_data2(dataset_file, random_state=i,
                                    nfold=nfolds, foldid=foldid)

        # Convert to a libact dataset. Each sample is represented only by its
        # index into `docs`; the CNN wrapper resolves indices back to text.
        docs = data['train_x'] + data['dev_x'] + data['test_x']
        trn_len = len(data['train_x'])
        dev_len = len(data['dev_x'])

        X = np.asarray([[j] for j in range(len(docs))])
        X_train = X[:trn_len]
        X_test = X[trn_len + dev_len:]

        y = data['train_y'] + data['dev_y'] + data['test_y']
        y_train = data['train_y']
        y_test = data['test_y']

        n_labeled = 50
        trn_ds, tst_ds, fully_labeled_trn_ds, n_labeled = prepare_data(
            X_train, y_train, X_test, y_test, n_labeled, random_state=3)
        lbr = IdealLabeler(fully_labeled_trn_ds)

        # train CNN Readability Classifier
        print('>>>>>>>>>>>UnCertainty ....')
        preprocessor = rlc.Preprocessor(word2vec_file, word_freq_file,
                                        topwords_as_vocab=True)
        cls = rlc.ReadlevelClassifier(preprocessor, useGPU=True)
        cls.cuda()
        wrapper = CNNWrapperClassifier((docs, y), cls,
                                       (data['dev_x'], data['dev_y']))
        wrapper.train(trn_ds)

        # Probability vectors for every still-unlabeled sample feed t-SNE.
        unlabeled_entry_ids, X_pool = zip(*trn_ds.get_unlabeled_entries())
        dvalue = wrapper.predict_proba(X_pool)

        from sklearn.manifold import TSNE
        X_embedded = TSNE(n_components=2).fit_transform(dvalue)

        # Column vector of true labels, shape (n_unlabeled, 1), appended to
        # the 2-D embedding so each CSV row is (x, y, label).
        labels = np.asarray([[
            lbr.label(X_pool[k]) for k in range(len(unlabeled_entry_ids))
        ]]).transpose()
        print(labels.shape, X_embedded.shape)
        X_embedded = np.concatenate((X_embedded, labels), axis=1)

        result = pd.DataFrame(data=X_embedded)
        with open(output_file, 'w') as f:
            result.to_csv(f, header=False)