Exemple #1
0
class UncertaintySampler(object):
    """Thin wrapper around an ``UncertaintySampling`` query strategy.

    Labels are stored in ``self.dataset`` with ``None`` meaning "unlabeled";
    at the public boundary (constructor input and ``get_data`` output) a
    negative value (``-1`` on output) plays that role instead.
    """

    def __init__(self, X, y, labs, n=2):
        """Build the dataset and the least-confidence query strategy.

        Args:
            X: Feature rows, passed through to ``Dataset``.
            y: Labels aligned with ``X``. Negative values (and ``None``)
                are treated as "unlabeled".
            labs: Opaque payload stored as-is and written out by ``save``.
            n: Value forwarded as ``n`` to ``make_query`` in ``get_next``.
        """
        # Check for None explicitly: ``None >= 0`` raises on Python 3.
        y = [yy if yy is not None and yy >= 0 else None for yy in y]

        self.dataset = Dataset(X, y)
        self.labs = labs

        self.uc = UncertaintySampling(self.dataset,
                                      method='lc',
                                      model=LinearSVC())
        self.n = n

    def get_next(self):
        """Return the result of ``make_query(n=self.n)`` on the strategy."""
        # ``sys.stderr.write`` behaves the same on Python 2 and 3, unlike the
        # ``print >> sys.stderr`` statement form.
        sys.stderr.write('get_next: start\n')
        out = self.uc.make_query(n=self.n)
        sys.stderr.write('get_next: done\n')
        return out

    def set_label(self, idx, label):
        """Record ``label`` for entry ``idx``; returns ``dataset.update``'s result."""
        sys.stderr.write('set_label: start\n')
        out = self.dataset.update(idx, label)
        sys.stderr.write('set_label: done\n')
        return out

    def get_data(self):
        """Return ``(X, y)`` arrays; unlabeled entries get ``-1`` in ``y``."""
        entries = list(self.dataset.get_entries())
        X = np.vstack([features for features, _ in entries])
        y = np.array([-1 if label is None else label for _, label in entries])
        return X, y

    def n_hits(self):
        """Return how many entries currently carry the label ``1`` (an int)."""
        # Iterate the entries directly: subscripting ``zip(...)`` only works
        # on Python 2, and unpacking fails outright on an empty dataset.
        return sum(1 for _, label in self.dataset.get_entries() if label == 1)

    def n_labeled(self):
        """Return the number of labeled entries reported by the dataset."""
        return self.dataset.len_labeled()

    def is_labeled(self, idx):
        """Return True when entry ``idx`` has a label (including label ``0``).

        Out-of-range and negative indices yield False rather than raising.
        """
        entries = list(self.dataset.get_entries())
        if not 0 <= idx < len(entries):
            return False
        # Compare against None explicitly: a truthiness test would report
        # samples labeled 0 as unlabeled.
        return entries[idx][1] is not None

    def save(self, outpath):
        """Write ``X``, ``y`` and ``labs`` to a timestamped HDF5 file.

        The file is named ``<outpath>-uncertainty-<YYYYmmdd_HHMMSS>.h5``.

        TODO: this should be updated to save in the same format as simple_las.
        """
        X, y = self.get_data()

        path = '%s-%s-%s.h5' % (
            outpath, 'uncertainty', datetime.now().strftime('%Y%m%d_%H%M%S'))
        # Explicit append/create mode (the default differs across h5py
        # versions); the context manager closes the file even if a write fails.
        with h5py.File(path, 'a') as f:
            f['X'] = X
            f['y'] = y
            f['labs'] = self.labs
Exemple #2
0
def run_featureselection(trn_dss,
                         tst_ds,
                         y_train,
                         model,
                         method_,
                         qs,
                         X_test,
                         y_test,
                         all_cols,
                         save_name,
                         save,
                         type_,
                         part=20):
    """Run batch active learning with feature selection before every batch.

    Each iteration selects features from the currently labeled entries,
    projects the training pool and ``X_test`` onto those columns, builds a
    fresh query strategy on the projected pool, labels up to ``part``
    samples using ``y_train`` as the oracle, retrains and scores on the
    test set. The loop ends once every entry of ``trn_dss`` is labeled.

    Args:
        trn_dss: Training pool; labels are written into it in place.
        tst_ds: Unused; kept for interface compatibility.
        y_train: Ground-truth labels indexed by the queried sample id.
        model: Model given to the query strategy (for ``'qbc'`` it is
            passed as ``models=``).
        method_: Forwarded as ``method=`` for ``'random'`` and ``'unc'``;
            for ``'qbc'`` it is the model that gets trained and evaluated.
        qs: Unused; the strategy is rebuilt every batch. Kept for
            interface compatibility.
        X_test: Test rows; each row is indexed with the selected columns.
        y_test: Test labels used for the F1 score and confusion matrix.
        all_cols: Column names handed to ``feature_selection``.
        save_name: CSV path used when ``save`` is truthy.
        save: When truthy, write the per-iteration results to ``save_name``.
        type_: One of ``'random'``, ``'unc'``, ``'qbc'``, ``'dens'``.
        part: Batch size, i.e. queries issued per iteration (>= 1).

    Returns:
        Tuple ``(q, iter_, f1score, tn, fp, fn, tp, k, trn_dss.data,
        label_holder, asked_id, features_ls)``.

    Raises:
        ValueError: If ``part`` is smaller than 1 or ``type_`` is unknown.
    """
    # Fail fast: a non-positive batch size would never label anything and
    # the loop below would not terminate; an unknown strategy would only
    # surface later as a NameError after queries were already issued.
    if part < 1:
        raise ValueError('part must be >= 1, got %r' % (part,))
    if type_ not in ('random', 'unc', 'qbc', 'dens'):
        raise ValueError('unknown type_: %r' % (type_,))

    f1score = []
    features_ls = []
    label_holder, asked_id = [], []
    tn, fp, fn, tp = [], [], [], []

    k = trn_dss.len_labeled()
    k_beg = k
    quota = len(trn_dss.data)
    iter_ = 0

    while k < quota:
        clear_output(wait=True)

        lbls, asks = [], []

        # Never ask for more samples than are still unlabeled.
        part1 = min(part, trn_dss.len_unlabeled())

        # -------------------> Feature Selection
        # Select features using only the entries labeled so far.
        labeled_entries = list(trn_dss.get_labeled_entries())
        X_train_feature = [entry[0] for entry in labeled_entries]
        y_train_feature = [entry[1] for entry in labeled_entries]
        col_index, features_f = feature_selection(X_train_feature,
                                                  y_train_feature,
                                                  all_cols,
                                                  f_class=True)

        features_ls.append(features_f)

        # Rebuild the training pool restricted to the selected columns.
        X_train_updated = [entry[0][col_index] for entry in trn_dss.data]
        y_train_updated = [entry[1] for entry in trn_dss.data]
        trn_dss_updated = Dataset(X_train_updated, y_train_updated)

        # Apply the same column selection to the test rows.
        X_test_feature = [row[col_index] for row in X_test]

        if type_ == 'random':
            qs = RandomSampling(trn_dss_updated, method=method_, model=model)
            model1 = model
        elif type_ == 'unc':
            qs = UncertaintySampling(trn_dss_updated,
                                     method=method_,
                                     model=model)
            model1 = model
        elif type_ == 'qbc':
            qs = QueryByCommittee(trn_dss_updated, models=model)
            # For query-by-committee the evaluated model arrives in method_.
            model1 = method_
        else:  # 'dens'
            qs = DWUS(trn_dss_updated, model=model)
            model1 = model

        for _ in range(part1):
            # Ask the strategy which sample to label next.
            ask_id = qs.make_query()
            asks.append(ask_id)
            # y_train acts as the labeling oracle.
            lb = y_train[ask_id]
            lbls.append(lb)
            # Keep the original pool and the projected pool in sync.
            trn_dss.update(ask_id, lb)
            trn_dss_updated.update(ask_id, lb)

        label_holder.append(lbls)
        asked_id.append(asks)

        # Train on the projected pool, then evaluate on the projected test set.
        model1.train(trn_dss_updated)
        pred_y = model1.predict(X_test_feature)

        # Compute each metric once per iteration and reuse it.
        current_f1 = f1_score(y_test, pred_y)
        cm = confusion_matrix(y_test, pred_y)
        f1score.append(current_f1)
        tn.append(cm[0][0])
        fp.append(cm[0][1])
        fn.append(cm[1][0])
        tp.append(cm[1][1])

        k = trn_dss_updated.len_labeled()
        X_labeled, y_labeled = trn_dss.format_sklearn()
        print(k)
        print(quota)
        print('iteration:', iter_)
        print(len(f1score))
        print('train dataset labeled:', trn_dss.len_labeled())
        print('train dataset shape:', X_labeled.shape)
        print('train dataset sum:', y_labeled.sum())
        print('Current f1 score:', current_f1)
        print('Current progress:', np.round(k / quota * 100, 2), '%')
        print('Chosen_features:', features_f)

        iter_ = iter_ + 1

    # Labeled-count at the start of each batch, and the iteration indices.
    q = list(range(k_beg, quota, part))
    iter_ = list(range(len(f1score)))

    if save:
        saved_file = pd.DataFrame({
            'iter': iter_,
            'quota': q,
            'f1_score': f1score,
            'tn': tn,
            'fp': fp,
            'fn': fn,
            'tp': tp,
            'id_index': asked_id,
            'label': label_holder,
            'features': features_ls
        })
        saved_file.to_csv(save_name)

    return q, iter_, f1score, tn, fp, fn, tp, k, trn_dss.data, label_holder, asked_id, features_ls