Example #1
import numpy as np

from libact.base.dataset import Dataset


class AdaptorLibAct:
    """Adaptor for libact query strategies.

    `MultipleQueryStrategy` (used in `start`) is assumed to be defined
    elsewhere in this project: it wraps a single-query libact strategy so
    that one `make_query` call returns a batch of samples.
    """

    def __init__(self,
                 X_full_dataset,
                 y_full_dataset,
                 libact_query_alg_ctor,
                 max_samples_number=40):
        self._train_dataset = Dataset(X_full_dataset, y_full_dataset)
        self._ctor = libact_query_alg_ctor
        self._max_samples_number = max_samples_number
        # Clear libact's internal update callbacks so that labeling updates
        # do not trigger strategy-side retraining.
        self._train_dataset._update_callback = set()

    def start(self):
        self._libact_query_alg = MultipleQueryStrategy(
            impl=self._ctor(self._train_dataset),
            query_n=self._max_samples_number)
        # The strategy constructor registers an update callback on the
        # dataset; reset it so updates stay manual (see make_iteration).
        self._train_dataset._update_callback = set()

    def make_iteration(self, indexes, y):
        for i in range(indexes.shape[0]):
            self._train_dataset.update(indexes[i], y[i])
        self._libact_query_alg.update(indexes, y)

    def choose_samples_for_annotation(self):
        res = np.array(list(self._libact_query_alg.make_query()))
        return res
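
A hypothetical driver for this adaptor, as a sketch only: it assumes `MultipleQueryStrategy` is importable in your project and uses libact's UncertaintySampling with its LogisticRegression wrapper; the toy data and labeling rule are illustrative, not part of the original code.

import numpy as np

from libact.models import LogisticRegression
from libact.query_strategies import UncertaintySampling

# Toy pool: 100 samples, only the first 10 labeled (None marks unlabeled).
rng = np.random.RandomState(0)
X = rng.rand(100, 5)
y_partial = [int(x[0] > 0.5) if i < 10 else None for i, x in enumerate(X)]

adaptor = AdaptorLibAct(
    X, y_partial,
    libact_query_alg_ctor=lambda ds: UncertaintySampling(
        ds, model=LogisticRegression()),
    max_samples_number=5)
adaptor.start()  # requires MultipleQueryStrategy to be defined
indexes = adaptor.choose_samples_for_annotation()
# A synthetic rule stands in for the human annotator here.
adaptor.make_iteration(indexes,
                       np.array([int(X[i][0] > 0.5) for i in indexes]))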
Example #2
import numpy as np

from libact.base.dataset import Dataset
from libact.labelers import IdealLabeler
from libact.models import LogisticRegression as LogisticRegressionLibact
from libact.query_strategies import QueryByCommittee


def libact_QBC(X, y, n_queries):
    # Seed the pool with one labeled sample per class; indices 0/50/100
    # assume an iris-like layout with contiguous class blocks.
    y_train = np.array([None for _ in range(len(y))])
    y_train[0], y_train[50], y_train[100] = 0, 1, 2
    libact_train_dataset = Dataset(X, y_train)
    libact_full_dataset = Dataset(X, y)
    libact_learner_list = [
        LogisticRegressionLibact(solver='liblinear',
                                 n_jobs=1,
                                 multi_class='ovr'),
        LogisticRegressionLibact(solver='liblinear',
                                 n_jobs=1,
                                 multi_class='ovr')
    ]
    # QueryByCommittee selects its disagreement measure via `disagreement`
    # ('vote' or 'kl_divergence'); the original `method='lc'` kwarg is not a
    # QueryByCommittee parameter and was silently ignored.
    libact_qs = QueryByCommittee(libact_train_dataset,
                                 models=libact_learner_list,
                                 disagreement='vote')
    libact_labeler = IdealLabeler(libact_full_dataset)
    for libact_learner in libact_learner_list:
        libact_learner.train(libact_train_dataset)

    for _ in range(n_queries):
        query_idx = libact_qs.make_query()
        query_label = libact_labeler.label(X[query_idx])
        libact_train_dataset.update(query_idx, query_label)
        for libact_learner in libact_learner_list:
            libact_learner.train(libact_train_dataset)
Example #3
# EER ships in libact's multiclass strategies; remaining imports (Dataset,
# IdealLabeler, LogisticRegressionLibact) are as in Example #2.
from libact.query_strategies.multiclass import EER


def libact_EER(X, y, n_queries):
    # Same per-class seeding as the QBC example above.
    y_train = np.array([None for _ in range(len(y))])
    y_train[0], y_train[50], y_train[100] = 0, 1, 2
    libact_train_dataset = Dataset(X, y_train)
    libact_full_dataset = Dataset(X, y)
    # Alternative model: SVM(gamma='auto', probability=True)
    libact_learner = LogisticRegressionLibact(solver='liblinear',
                                              n_jobs=1,
                                              multi_class='ovr')
    libact_qs = EER(libact_train_dataset, model=libact_learner, loss='01')
    libact_labeler = IdealLabeler(libact_full_dataset)
    libact_learner.train(libact_train_dataset)

    for _ in range(n_queries):
        query_idx = libact_qs.make_query()
        query_label = libact_labeler.label(X[query_idx])
        libact_train_dataset.update(query_idx, query_label)
        libact_learner.train(libact_train_dataset)
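
Both functions assume an iris-like class layout, hence the seed indices 0, 50, and 100. A minimal invocation on scikit-learn's iris data:

from sklearn.datasets import load_iris

# Iris stores its 150 samples in three contiguous 50-sample class blocks,
# matching the hardcoded seed indices above.
X, y = load_iris(return_X_y=True)
libact_QBC(X, y, n_queries=20)
libact_EER(X, y, n_queries=20)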
Example #4
import sys
from datetime import datetime

import h5py
import numpy as np

from libact.base.dataset import Dataset
from libact.query_strategies import UncertaintySampling


class UncertaintySampler(object):
    def __init__(self, X, y, labs, n=2):
        # Negative values mark unlabeled entries; libact expects None.
        y = [yy if yy >= 0 else None for yy in y]

        self.dataset = Dataset(X, y)
        self.labs = labs

        # `LinearSVC` is assumed to be a libact-compatible model wrapper
        # (libact models expose train()/predict(); scikit-learn's LinearSVC
        # alone does not satisfy that interface).
        self.uc = UncertaintySampling(self.dataset,
                                      method='lc',
                                      model=LinearSVC())
        self.n = n

    def get_next(self):
        print('get_next: start', file=sys.stderr)
        # Note: make_query(n=...) requires a libact variant with batch
        # queries; stock UncertaintySampling.make_query takes no `n`.
        out = self.uc.make_query(n=self.n)
        print('get_next: done', file=sys.stderr)
        return out

    def set_label(self, idx, label):
        print('set_label: start', file=sys.stderr)
        out = self.dataset.update(idx, label)
        print('set_label: done', file=sys.stderr)
        return out

    def get_data(self):
        X, y = zip(*self.dataset.get_entries())
        X, y = np.vstack(X), np.array(
            [yy if yy is not None else -1 for yy in y])
        return X, y

    def n_hits(self):
        labels = np.array([entry[1] for entry in self.dataset.get_entries()])
        return (labels == 1).sum()

    def n_labeled(self):
        return self.dataset.len_labeled()

    def is_labeled(self, idx):
        # Compare against None explicitly: the original truthiness test
        # (np.where over the labels) would also treat label 0 as unlabeled.
        labels = [entry[1] for entry in self.dataset.get_entries()]
        return labels[idx] is not None

    def save(self, outpath):
        """ !! This should be updated to save in same format as simple_las """
        X, y = self.get_data()

        f = h5py.File(
            '%s-%s-%s.h5' %
            (outpath, 'uncertainty', datetime.now().strftime('%Y%m%d_%H%M%S')),
            'w')  # h5py >= 3 requires an explicit mode
        f['X'] = X
        f['y'] = y
        f['labs'] = self.labs
        f.close()
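
A minimal driver sketch for this sampler, assuming a libact variant whose make_query accepts `n` (as get_next relies on) and a libact-compatible LinearSVC wrapper on the import path; the synthetic data and labeling rule are illustrative only:

import numpy as np

# Hypothetical usage of UncertaintySampler: -1 marks unlabeled rows, which
# __init__ converts to None for libact.
rng = np.random.RandomState(0)
X = rng.rand(200, 10)
y = np.full(200, -1)
y[:5], y[5:10] = 1, 0  # a few seed labels of each class

sampler = UncertaintySampler(X, y, labs=['neg', 'pos'], n=2)
for idx in sampler.get_next():  # assumes make_query(n=...) returns indices
    sampler.set_label(idx, 1 if X[idx, 0] > 0.5 else 0)  # stand-in oracle
print(sampler.n_labeled(), sampler.n_hits())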
Example #5
        # Generate a pool and expand dataset
        pool, bounds_new = expand_pool(D, bounds_old, expansion_rate)
        for entry in pool:
            dataset.append(entry)

        # Query a new sample
        ask_id, clf = qs.make_query(center)
        #        ask_id, clf = qs1.make_query()
        #        ask_id = qs2.make_query()
        #        clf.train(dataset)
        new = dataset.data[ask_id][0].reshape(1, -1)

        # Update model and dataset
        lbl = get_label(new, landmark, threshold)
        dataset.update(ask_id, lbl)  # update dataset
        D = np.vstack((D, new))
        L = np.append(L, lbl)

        if np.any(np.array(L[-10:]) == 1) and np.any(np.array(L[-10:]) == -1):
            center = D[np.array(L) == 1][-1]  # the last positive sample
        else:
            center = center0

        i += 1
        bounds_old = bounds_new

    # Create a mesh grid
    xx, yy = np.meshgrid(np.linspace(BD[0][0], BD[1][0], 200),
                         np.linspace(BD[0][1], BD[1][1], 200))
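
The fragment above relies on a `get_label` oracle that is not shown. A minimal sketch, under the assumption that it labels a sample +1 when it lies within `threshold` of the landmark and -1 otherwise (inferred from the +1/-1 values appended to L; not the original implementation):

import numpy as np

# Hypothetical stand-in for the get_label oracle used in Example #5.
def get_label(new, landmark, threshold):
    distance = np.linalg.norm(np.asarray(new) - np.asarray(landmark))
    return 1 if distance <= threshold else -1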
Example #6
import numpy as np
import pandas as pd
from IPython.display import clear_output
from sklearn.metrics import confusion_matrix, f1_score

from libact.base.dataset import Dataset
from libact.query_strategies import (DWUS, QueryByCommittee, RandomSampling,
                                     UncertaintySampling)

# `feature_selection` is a project-specific helper; a sketch is given after
# this example.


def run_featureselection(trn_dss,
                         tst_ds,
                         y_train,
                         model,
                         method_,
                         qs,
                         X_test,
                         y_test,
                         all_cols,
                         save_name,
                         save,
                         type_,
                         part=20):
    """
    Batch active learning algorithm with feature selection
    """
    E_in, E_out = [], []
    f1score = []
    features_ls = []
    label_holder, asked_id = [], []
    tn, fp, fn, tp = [], [], [], []

    k = trn_dss.len_labeled()
    k_beg = trn_dss.len_labeled()
    quota = len(trn_dss.data)
    iter_ = 0

    while (k < quota):
        clear_output(wait=True)

        # Standard usage of libact objects
        # make_query returns the index of the sample that the active learning algorithm would like to query
        lbls, asks = [], []

        # Query at most `part` samples, capped by what is still unlabeled.
        part1 = min(part, trn_dss.len_unlabeled())

        # -------------------> Feature Selection
        # select features with feature selection
        X_train_feature = [i[0] for i in trn_dss.get_labeled_entries()]
        y_train_feature = [i[1] for i in trn_dss.get_labeled_entries()]
        col_index, features_f = feature_selection(X_train_feature,
                                                  y_train_feature,
                                                  all_cols,
                                                  f_class=True)

        features_ls.append(features_f)

        # update the X_train dataset and y_train with the current selection of variables
        X_train_updated = [i[0][col_index] for i in trn_dss.data]
        y_train_updated = [i[1] for i in trn_dss.data]
        trn_dss_updated = Dataset(X_train_updated, y_train_updated)

        # update X_test
        X_test_feature = [i[col_index] for i in X_test]

        if (type_ == 'random'):
            qs = RandomSampling(trn_dss_updated, method=method_, model=model)
            model1 = model
        elif (type_ == 'unc'):
            qs = UncertaintySampling(trn_dss_updated,
                                     method=method_,
                                     model=model)
            model1 = model
        elif (type_ == 'qbc'):
            qs = QueryByCommittee(trn_dss_updated, models=model)
            # For QBC, `model` is the committee list, so the single model
            # that gets trained and evaluated below arrives via `method_`.
            model1 = method_
        elif (type_ == 'dens'):
            qs = DWUS(trn_dss_updated, model=model)
            model1 = model

        for i in range(0, part1):
            # make_query returns the index of one pool sample per call
            ask_id = qs.make_query()
            asks.append(ask_id)
            # look up the true label of the queried sample
            lb = y_train[ask_id]
            lbls.append(lb)
            # mark the sample as labeled in both copies of the pool
            trn_dss.update(ask_id, lb)
            trn_dss_updated.update(ask_id, lb)

        label_holder.append(lbls)
        asked_id.append(asks)

        # trains only on the labeled examples and chosen values
        model1.train(trn_dss_updated)
        # predict it
        pred_y = model1.predict(X_test_feature)

        # save the results (compute the confusion matrix once)
        cm = confusion_matrix(y_test, pred_y)
        f1score.append(f1_score(y_test, pred_y))
        tn.append(cm[0][0])
        fp.append(cm[0][1])
        fn.append(cm[1][0])
        tp.append(cm[1][1])

        # score returns the mean accuracy of the results
        #E_in = np.append(E_in, 1 - model.score(trn_dss)) #train
        #E_out = np.append(E_out, 1 - model.score(tst_ds)) #test

        k = trn_dss_updated.len_labeled()
        print(k)
        print(quota)
        print('iteration:', iter_)
        print(len(f1score))
        print('train dataset labeled:', trn_dss.len_labeled())
        print('train dataset shape:', trn_dss.format_sklearn()[0].shape)
        print('train dataset sum:', trn_dss.format_sklearn()[1].sum())
        print('Current f1 score:', f1_score(y_test, pred_y))
        print('Current progress:', np.round(k / quota * 100, 2), '%')
        print('Chosen_features:', features_f)

        # number of iterations
        iter_ = iter_ + 1

    q = [i for i in range(k_beg, quota, part)]
    iter_ = [i for i in range(0, len(f1score))]

    if save:
        saved_file = pd.DataFrame({
            'iter': iter_,
            'quota': q,
            'f1_score': f1score,
            'tn': tn,
            'fp': fp,
            'fn': fn,
            'tp': tp,
            'id_index': asked_id,
            'label': label_holder,
            'features': features_ls
        })
        saved_file.to_csv(save_name)

    return q, iter_, f1score, tn, fp, fn, tp, k, trn_dss.data, label_holder, asked_id, features_ls
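
run_featureselection depends on a `feature_selection` helper that is not shown. A minimal sketch, assuming univariate selection with scikit-learn's SelectKBest; the k=10 cutoff and the `f_class` switch between f_classif and mutual_info_classif are assumptions, not the original implementation:

import numpy as np
from sklearn.feature_selection import (SelectKBest, f_classif,
                                       mutual_info_classif)

def feature_selection(X, y, all_cols, f_class=True, k=10):
    # Hypothetical sketch: score each column, keep the top k, and return
    # both the selected indices and the corresponding column names.
    score_func = f_classif if f_class else mutual_info_classif
    selector = SelectKBest(score_func=score_func,
                           k=min(k, np.asarray(X).shape[1]))
    selector.fit(X, y)
    col_index = selector.get_support(indices=True)
    return col_index, [all_cols[i] for i in col_index]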