Example #1
0
 def test_mlc_label(self):
     """Multi-label case: labeling a feature vector yields the expected label vector."""
     labeler = IdealLabeler(self.setup_mlc_dataset())
     # Each case pairs a queried feature vector with the label vector that
     # the labeler is expected to hand back for it.
     cases = (
         (np.array([12., 5., 2., 11., 14.]), [0, 1, 0, 0, 1]),
         (np.array([6., 2., 21., 20., 5.]), [0, 0, 1, 0, 1]),
     )
     for features, expected in cases:
         np.testing.assert_array_equal(labeler.label(features), expected)
Example #2
0
 def test_label(self):
     """Single-label case: each queried feature vector maps to its expected label."""
     labeler = IdealLabeler(self.setup_dataset())
     cases = (
         (np.array([0, 1, 2]), 1),
         (np.array([6, 7, 8]), 3),
         # The last query is deliberately a plain list, not an ndarray.
         ([12, 13, 14], 4),
     )
     for features, expected in cases:
         self.assertEqual(labeler.label(features), expected)
Example #3
0
 def test_label(self):
     """Check the label returned for three known feature vectors."""
     oracle = IdealLabeler(self.setup_dataset())
     # (query, expected label); the third query is a list instead of an
     # ndarray, so both input kinds are exercised.
     expectations = [
         (np.array([0, 1, 2]), 1),
         (np.array([6, 7, 8]), 3),
         ([12, 13, 14], 4),
     ]
     for query, wanted in expectations:
         got = oracle.label(query)
         self.assertEqual(got, wanted)
def libact_QBC(X, y, n_queries):
    """Run a query-by-committee active-learning loop with libact.

    The pool starts with three labeled seed samples (indices 0, 50 and 100,
    given labels 0, 1 and 2); every other entry is unlabeled (``None``).
    Each of the ``n_queries`` rounds asks the committee strategy for a sample,
    obtains its label from an ``IdealLabeler`` built on ``(X, y)``, updates the
    pool and retrains both committee members. Nothing is returned.
    """
    # Unlabeled everywhere except for the three hard-coded seeds.
    partial_labels = np.array([None] * len(y))
    for seed_idx, seed_label in ((0, 0), (50, 1), (100, 2)):
        partial_labels[seed_idx] = seed_label

    pool = Dataset(X, partial_labels)
    fully_labeled = Dataset(X, y)

    # Two identically configured logistic-regression committee members.
    committee = [
        LogisticRegressionLibact(solver='liblinear',
                                 n_jobs=1,
                                 multi_class='ovr')
        for _ in range(2)
    ]
    strategy = QueryByCommittee(pool, models=committee, method='lc')
    oracle = IdealLabeler(fully_labeled)

    def retrain_committee():
        # Refit every member on the current state of the pool.
        for member in committee:
            member.train(pool)

    retrain_committee()

    for _ in range(n_queries):
        picked = strategy.make_query()
        pool.update(picked, oracle.label(X[picked]))
        retrain_committee()
def libact_EER(X, y, n_queries):
    """Run an EER-driven active-learning loop with libact.

    The pool starts with three labeled seed samples (indices 0, 50 and 100,
    given labels 0, 1 and 2); every other entry is unlabeled (``None``).
    Each of the ``n_queries`` rounds asks the ``EER`` strategy (``loss='01'``)
    for a sample, obtains its label from an ``IdealLabeler`` built on
    ``(X, y)``, updates the pool and retrains the learner. Nothing is returned.
    """
    # Unlabeled everywhere except for the three hard-coded seeds.
    partial_labels = np.array([None] * len(y))
    for seed_idx, seed_label in ((0, 0), (50, 1), (100, 2)):
        partial_labels[seed_idx] = seed_label

    pool = Dataset(X, partial_labels)
    fully_labeled = Dataset(X, y)

    # NOTE: the original carried SVM(gamma='auto', probability=True) as a
    # commented-out alternative to this model.
    learner = LogisticRegressionLibact(solver='liblinear',
                                       n_jobs=1,
                                       multi_class='ovr')
    strategy = EER(pool, model=learner, loss='01')
    oracle = IdealLabeler(fully_labeled)
    learner.train(pool)

    for _ in range(n_queries):
        picked = strategy.make_query()
        answer = oracle.label(X[picked])
        pool.update(picked, answer)
        learner.train(pool)
Example #6
0
def main():
    """Plot test error of uncertainty sampling against random sampling.

    Two copies of the same training pool are queried in lockstep, one by
    ``UncertaintySampling`` and one by ``RandomSampling``. Queried samples are
    labeled by an ``IdealLabeler`` built from ``ds.data`` / ``ds.target``.
    After every query a single shared ``LogisticRegression`` is retrained on
    the relevant pool and its error on the test set is appended to that
    strategy's curve, which is then redrawn.
    """
    n_rounds = 10  # number of queries issued to each strategy
    n_classes = 5

    us_pool, tst_ds, ds = split_train_test(n_classes)
    rand_pool = copy.deepcopy(us_pool)

    us_strategy = UncertaintySampling(us_pool,
                                      method='lc',
                                      model=LogisticRegression())
    rand_strategy = RandomSampling(rand_pool)

    model = LogisticRegression()

    fig = plt.figure()
    err_ax = fig.add_subplot(2, 1, 1)
    err_ax.set_xlabel('Number of Queries')
    err_ax.set_ylabel('Error')

    def test_error(pool):
        # Refit the shared model on `pool` and return its test-set error.
        model.train(pool)
        return 1 - model.score(tst_ds)

    # Error before any query has been made.
    us_errors = np.append([], test_error(us_pool))
    rand_errors = np.append([], test_error(rand_pool))

    steps = np.arange(0, 1)
    us_line, = err_ax.plot(steps, us_errors, 'g', label='qs Eout')
    rand_line, = err_ax.plot(steps, rand_errors, 'k', label='random Eout')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.show(block=False)

    # Second subplot, shrunk slightly and shifted down.
    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    shifted = [box.x0, box.y0 - box.height * 0.1, box.width, box.height * 0.9]
    img_ax.set_position(shifted)
    plt.show()

    # Labels come from an IdealLabeler over ds.data / ds.target rather than
    # from an interactive prompt.
    all_features = ds.data
    print(all_features.shape)
    all_targets = ds.target
    print(all_targets.shape)
    oracle_ds = Dataset(all_features, all_targets)
    pool_features, _ = zip(*us_pool.data)

    print(pool_features)
    oracle = IdealLabeler(oracle_ds)

    def query_round(strategy, pool, banner):
        # One query/label/retrain cycle; returns the resulting test error.
        picked = strategy.make_query()
        print(banner)
        feats, _ = zip(*pool.data)
        answer = oracle.label(feats[picked])
        pool.update(picked, answer)
        return test_error(pool)

    for step in range(n_rounds):
        us_errors = np.append(
            us_errors,
            query_round(us_strategy, us_pool,
                        "asking sample from Uncertainty Sampling"))
        rand_errors = np.append(
            rand_errors,
            query_round(rand_strategy, rand_pool,
                        "asking sample from Random Sample"))

        # Rescale the axes and push the extended curves to the plot.
        err_ax.set_xlim((0, step + 1))
        err_ax.set_ylim((0, max(max(us_errors), max(rand_errors)) + 0.2))
        steps = np.arange(0, step + 2)
        for line, errors in ((us_line, us_errors), (rand_line, rand_errors)):
            line.set_xdata(steps)
            line.set_ydata(errors)

        plt.show()

    input("Press any key to continue...")
def visualize(dataset_file, word2vec_file, word_freq_file, output_file):
    """Embed classifier outputs for the unlabeled pool in 2-D and save as CSV.

    For the single configured (random state, fold) pair this function:

    1. reads and splits the dataset with ``read_and_split_data2``;
    2. builds libact datasets via ``prepare_data``, representing each
       document by its index into the concatenated train/dev/test list;
    3. trains a ``CNNWrapperClassifier`` around ``rlc.ReadlevelClassifier``
       on the training dataset returned by ``prepare_data``;
    4. runs ``predict_proba`` on the unlabeled pool entries and reduces the
       result to two dimensions with t-SNE;
    5. appends the label that ``IdealLabeler`` returns for each pool entry
       and writes the table to ``output_file``.

    Each CSV row holds the row index, the two t-SNE coordinates and the
    label; no header row is written.

    Args:
        dataset_file: Forwarded to ``read_and_split_data2``.
        word2vec_file: Forwarded to ``rlc.Preprocessor``.
        word_freq_file: Forwarded to ``rlc.Preprocessor``.
        output_file: Path of the CSV file to (over)write.
    """
    from sklearn.manifold import TSNE

    nfolds = 3
    n_labeled = 50

    # Only the first (random state, fold) pair is visualised. The full sweep
    # would be itertools.product(get_set_random_states(), range(nfolds)).
    for random_state, foldid in zip([0], [0]):
        data = read_and_split_data2(dataset_file,
                                    random_state=random_state,
                                    nfold=nfolds,
                                    foldid=foldid)

        # Convert to libact datasets. Documents are concatenated in
        # train/dev/test order and each one is represented by a single
        # feature: its index into `docs`.
        docs = data['train_x'] + data['dev_x'] + data['test_x']
        y = data['train_y'] + data['dev_y'] + data['test_y']
        trn_len = len(data['train_x'])
        dev_len = len(data['dev_x'])

        X = np.arange(len(docs)).reshape(-1, 1)
        X_train = X[:trn_len]
        # The dev rows sit between train and test and are skipped here.
        X_test = X[trn_len + dev_len:]

        trn_ds, _tst_ds, fully_labeled_trn_ds, _ = prepare_data(
            X_train,
            data['train_y'],
            X_test,
            data['test_y'],
            n_labeled,
            random_state=3)
        lbr = IdealLabeler(fully_labeled_trn_ds)

        # Train the CNN read-level classifier (GPU is requested explicitly).
        print('>>>>>>>>>>>UnCertainty ....')
        preprocessor = rlc.Preprocessor(word2vec_file,
                                        word_freq_file,
                                        topwords_as_vocab=True)
        classifier = rlc.ReadlevelClassifier(preprocessor, useGPU=True)
        classifier.cuda()

        wrapper = CNNWrapperClassifier((docs, y), classifier,
                                       (data['dev_x'], data['dev_y']))
        wrapper.train(trn_ds)

        # Predict on the still-unlabeled pool and project to two dimensions.
        _, X_pool = zip(*trn_ds.get_unlabeled_entries())
        dvalue = wrapper.predict_proba(X_pool)
        X_embedded = TSNE(n_components=2).fit_transform(dvalue)

        # Column vector with one label per pool entry, aligned with X_pool.
        labels = np.asarray([[lbr.label(x) for x in X_pool]]).transpose()
        print(labels.shape, X_embedded.shape)
        X_embedded = np.concatenate((X_embedded, labels), axis=1)

        # The file is reopened in 'w' mode on every pass, so only the last
        # fold's table would survive if more folds were enabled.
        with open(output_file, 'w') as f:
            pd.DataFrame(data=X_embedded).to_csv(f, header=False)