Example #1
    def make_query(self):
        dataset = self.dataset
        unlabeled_entry_ids, unlabeled_pool = zip(
            *dataset.get_unlabeled_entries())
        labeled_pool, y = zip(*dataset.get_labeled_entries())
        if len(np.unique(y)) > 2:
            raise ValueError("HintSVM query strategy support binary class "
                "active learning only. Found %s classes" % len(np.unique(y)))

        hint_pool_idx = self.random_state_.choice(
            len(unlabeled_pool), int(len(unlabeled_pool) * self.p))
        hint_pool = np.array(unlabeled_pool)[hint_pool_idx]

        weight = [1.0 for _ in range(len(labeled_pool))] +\
                 [(self.ch / self.cl) for _ in range(len(hint_pool))]
        y = list(y) + [0 for _ in range(len(hint_pool))]
        X = [x for x in labeled_pool] +\
            [x for x in hint_pool]

        p_val = hintsvm_query(
            np.array(X, dtype=np.float64),
            np.array(y, dtype=np.float64),
            np.array(weight, dtype=np.float64),
            np.array(unlabeled_pool, dtype=np.float64),
            self.svm_params)

        p_val = [abs(float(val[0])) for val in p_val]
        idx = int(np.argmax(p_val))
        return unlabeled_entry_ids[idx]
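For context: the strategy above trains a hint-aware SVM in which labeled points keep weight 1.0, a random fraction p of the unlabeled pool is appended as hints with label 0 and weight ch/cl, and the unlabeled point with the largest absolute decision value is queried. A minimal driving loop might look like the sketch below; it assumes libact's usual Dataset/Labeler interfaces, and the names train_dataset, labeler and quota are illustrative, not taken from the original code.

qs = HintSVM(train_dataset, cl=1.0, ch=1.0, p=0.5, random_state=1126)
for _ in range(quota):
    ask_id = qs.make_query()           # index of the sample to label next
    X, _ = zip(*train_dataset.data)    # entries are (feature, label) pairs
    train_dataset.update(ask_id, labeler.label(X[ask_id]))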
Example #2
    def retrieve_score_list(self):
        dataset = self.dataset
        unlabeled_entry_ids, unlabeled_pool = zip(
            *dataset.get_unlabeled_entries())
        labeled_pool, y = zip(*dataset.get_labeled_entries())
        if len(np.unique(y)) > 2:
            raise ValueError("HintSVM query strategy support binary class "
                             "active learning only. Found %s classes" %
                             len(np.unique(y)))

        hint_pool_idx = self.random_state_.choice(
            len(unlabeled_pool), int(len(unlabeled_pool) * self.p))
        hint_pool = np.array(unlabeled_pool)[hint_pool_idx]

        weight = [1.0 for _ in range(len(labeled_pool))] + \
                 [(self.ch / self.cl) for _ in range(len(hint_pool))]
        if 0 in y:
            assert -1 not in y, "we need to switch 0 to -1 in this case"
            y = map(lambda v: v if v != 0 else -1, y)
        # hint samples take label 0, so the real labels must not be 0
        y = list(y) + [0 for _ in range(len(hint_pool))]
        X = [x for x in labeled_pool] + \
            [x for x in hint_pool]

        p_val = hintsvm_query(np.array(X, dtype=np.float64),
                              np.array(y, dtype=np.float64),
                              np.array(weight, dtype=np.float64),
                              np.array(unlabeled_pool, dtype=np.float64),
                              self.svm_params)

        p_val = [abs(float(val[0])) for val in p_val]
        return dict(zip(unlabeled_entry_ids, p_val))
Example #3
    def make_query(self, return_score=False):
        """Return the index of the sample to be queried and labeled and
        selection score of each sample. Read-only.

        No modification to the internal states.

        Returns
        -------
        ask_id : int
            The index of the next unlabeled sample to be queried and labeled.

        score : list of (index, score) tuple
            Selection score of unlabeled entries, the larger the better.

        """
        unlabeled_entry_ids, scores = zip(*self._get_scores())
        ask_id = np.argmax(scores)

        if return_score:
            return unlabeled_entry_ids[ask_id], \
                   list(zip(unlabeled_entry_ids, scores))
        else:
            return unlabeled_entry_ids[ask_id]
Example #4
    def retrieve_score_list(self):
        unlabeled_entry_ids, _ = zip(*self.dataset.get_unlabeled_entries())
        labeled_entry_ids = np.array([eid
                                      for eid, x in enumerate(self.dataset.data)
                                      if x[1] is not None])
        labels = np.array([x[1]
                           for eid, x in enumerate(self.dataset.data)
                           if x[1] is not None]).reshape(-1, 1)
        centers = self.kmeans_.cluster_centers_
        P_k_x = self.P_k_x
        p_x = self.p_x[list(unlabeled_entry_ids)]

        clf = DensityWeightedLogisticRegression(P_k_x[labeled_entry_ids, :],
                                                centers,
                                                self.C)
        clf.train(labeled_entry_ids, labels)
        P_y_k = clf.predict()

        P_y_x = np.zeros(len(unlabeled_entry_ids))
        for k, center in enumerate(centers):
            P_y_x += P_y_k[k] * P_k_x[unlabeled_entry_ids, k]

        # binary case
        expected_error = P_y_x
        expected_error[P_y_x >= 0.5] = 1. - P_y_x[P_y_x >= 0.5]

        scores = expected_error * p_x
        return dict(zip(unlabeled_entry_ids, scores))
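The binary-case block above turns the positive-class probability P(y|x) into an expected 0/1 error, min(P, 1 - P), and weights it by the input density p_x, so queries concentrate on uncertain points in dense regions. A tiny self-contained check of the transform (the probabilities are made up):

import numpy as np

P_y_x = np.array([0.2, 0.5, 0.9])
expected_error = P_y_x.copy()
expected_error[P_y_x >= 0.5] = 1. - P_y_x[P_y_x >= 0.5]
print(expected_error)  # [0.2 0.5 0.1], i.e. min(p, 1 - p) per point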
Example #5
    def make_query(self, return_score=False):

        dataset = self.dataset
        self.model.train(dataset)

        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

        if isinstance(self.model, ProbabilisticModel):
            dvalue = self.model.predict_proba(X_pool)
        elif isinstance(self.model, ContinuousModel):
            dvalue = self.model.predict_real(X_pool)

        if np.shape(dvalue)[1] > 2:
            dvalue = -(np.partition(-dvalue, 2, axis=1)[:, :2])

        dist = np.abs(dvalue[:, 0] - dvalue[:, 1])
        arr1, arr2 = [], []
        arr1.append(np.array(dvalue[:, 0]).tolist())
        arr2.append(np.array(dvalue[:, 1]).tolist())

        div = -np.max(cosine_similarity(arr1, arr2), axis=1)
        score = (self.lmbda * dist) + ((1 - self.lmbda) * div)

        ask_id = np.argmin(score)

        if return_score:
            return unlabeled_entry_ids[ask_id], \
                   list(zip(unlabeled_entry_ids, score))
        else:
            return unlabeled_entry_ids[ask_id]
Example #6
    def make_query(self, return_score=False):

        dataset = self.dataset
        self.model.train(dataset)

        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

        if isinstance(self.model, ProbabilisticModel):
            dvalue = self.model.predict_proba(X_pool)
        elif isinstance(self.model, ContinuousModel):
            dvalue = self.model.predict_real(X_pool)
        # disabled variants, kept as comments:
        # if self.method == 'sm':  # smallest margin
        #     if np.shape(dvalue)[1] > 2:
        #         dvalue = -(np.partition(-dvalue, 2, axis=1)[:, :2])
        #     score = -np.abs(dvalue[:, 0] - dvalue[:, 1])

        if self.method == 'mm':  # max margin
            margin = np.partition(-dvalue, 1, axis=1)
            score = -np.abs(margin[:, 0] - margin[:, 1])
        # elif self.method == 'entropy':
        #     score = np.sum(-dvalue * np.log(dvalue), axis=1)

        ask_id = np.argmax(score)

        if return_score:
            return unlabeled_entry_ids[ask_id], \
                   list(zip(unlabeled_entry_ids, score))
        else:
            return unlabeled_entry_ids[ask_id]
Example #7
    def make_query(self):
        dataset = self.dataset
        unlabeled_entry_ids, unlabeled_pool = zip(
            *dataset.get_unlabeled_entries())
        labeled_pool, y = zip(*dataset.get_labeled_entries())
        if len(np.unique(y)) > 2:
            raise ValueError("HintSVM query strategy support binary class "
                             "active learning only. Found %s classes" %
                             len(np.unique(y)))

        hint_pool_idx = self.random_state_.choice(
            len(unlabeled_pool), int(len(unlabeled_pool) * self.p))
        hint_pool = np.array(unlabeled_pool)[hint_pool_idx]

        weight = [1.0 for _ in range(len(labeled_pool))] +\
                 [(self.ch / self.cl) for _ in range(len(hint_pool))]
        y = list(y) + [0 for _ in range(len(hint_pool))]
        X = [x.tolist() for x in labeled_pool] +\
            [x.tolist() for x in hint_pool]

        p_val = hintsvm_query(np.array(X), np.array(y), np.array(weight),
                              np.array([x.tolist() for x in unlabeled_pool]),
                              self.svm_params)

        p_val = [abs(float(val[0])) for val in p_val]
        idx = int(np.argmax(p_val))
        return unlabeled_entry_ids[idx]
Example #8
    def make_query(self):
        dataset = self.dataset
        labeled_pool, Y = zip(*dataset.get_labeled_entries())
        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())
        labeled_pool = np.array(labeled_pool)
        Y = np.array(Y)
        X_pool = np.array(X_pool)

        br = BinaryRelevance(self.br_base)
        br.train(Dataset(labeled_pool, Y))

        trnf = br.predict_proba(labeled_pool)
        poolf = br.predict_proba(X_pool)
        f = poolf * 2 - 1

        trnf = np.sort(trnf, axis=1)[:, ::-1]
        trnf /= np.tile(trnf.sum(axis=1).reshape(-1, 1), (1, trnf.shape[1]))
        if len(np.unique(Y.sum(axis=1))) == 1:
            lr = DummyClf()
        else:
            lr = self.logistic_regression_
        lr.train(Dataset(trnf, Y.sum(axis=1)))

        idx_poolf = np.argsort(poolf, axis=1)[:, ::-1]
        poolf = np.sort(poolf, axis=1)[:, ::-1]
        poolf /= np.tile(poolf.sum(axis=1).reshape(-1, 1), (1, poolf.shape[1]))
        pred_num_lbl = lr.predict(poolf).astype(int)

        yhat = -1 * np.ones((len(X_pool), self.n_labels), dtype=int)
        for i, p in enumerate(pred_num_lbl):
            yhat[i, idx_poolf[i, :p]] = 1

        score = ((1 - yhat * f) / 2).sum(axis=1)
        ask_id = self.random_state_.choice(np.where(score == np.max(score))[0])
        return unlabeled_entry_ids[ask_id]
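For reference: f above is a soft prediction rescaled to [-1, 1] and yhat a thresholded prediction in {-1, +1}, so each term (1 - yhat * f) / 2 lies in [0, 1] and grows with the disagreement between the two; summing over labels estimates a per-sample label loss, and the pool point maximizing it is queried. A made-up illustration:

import numpy as np

yhat = np.array([1, -1, 1])
f = np.array([0.8, 0.6, -0.2])  # invented soft predictions in [-1, 1]
print((1 - yhat * f) / 2)       # [0.1 0.8 0.6]: larger = more disagreement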
Example #9
    def make_query(self):
        dataset = self.dataset
        unlabeled_entry_ids, unlabeled_pool = zip(
            *dataset.get_unlabeled_entries())
        labeled_pool, y = zip(*dataset.get_labeled_entries())

        cl = self.cl
        ch = self.ch
        p = self.p
        hint_pool_idx = self.random_state_.choice(
            len(unlabeled_pool), int(len(unlabeled_pool)*p))
        hint_pool = np.array(unlabeled_pool)[hint_pool_idx]

        weight = [1.0 for _ in range(len(labeled_pool))] +\
                 [(ch/cl) for i in range(len(hint_pool))]
        y = list(y) + [0 for i in range(len(hint_pool))]
        X = [x.tolist() for x in labeled_pool] +\
            [x.tolist() for x in hint_pool]

        p_val = hintsvm_query(
            np.array(X), np.array(y), np.array(weight),
            np.array([x.tolist() for x in unlabeled_pool]), self.svm_params)

        p_val = [abs(float(val[0])) for val in p_val]
        idx = int(np.argmax(p_val))
        return unlabeled_entry_ids[idx]
Example #10
    def __init__(self, dataset, classes, active_selecting=True,
            subsample_qs=None, random_state=None):
        super(HierarchicalSampling, self).__init__(dataset)
        X = np.array(next(zip(*self.dataset.get_entries())))
        cluster = AgglomerativeClustering()
        cluster.fit(X)
        childrens = cluster.children_

        if subsample_qs is not None:
            if not isinstance(subsample_qs, QueryStrategy):
                raise TypeError("subsample_qs has to be a QueryStrategy")
            self.sub_qs = subsample_qs
        else:
            self.sub_qs = None

        self.active_selecting = active_selecting
        self.random_state_ = seed_random_state(random_state)
        self.n = len(childrens) + 1
        self.m = self.n * 2 - 1
        self.num_class = len(classes)
        self.classes = list(classes)
        self.class_id = dict(zip(self.classes, range(self.num_class)))

        self.parent = np.full(self.m, NO_NODE, dtype=int)
        self.size = np.zeros(self.m, dtype=int)
        self.depth = np.zeros(self.m, dtype=int)
        for i, (left_child, right_child) in enumerate(childrens):
            parent = i + self.n
            self.parent[left_child] = parent
            self.parent[right_child] = parent
        self.left_child = np.concatenate([np.full(self.n, NO_NODE), childrens[:,0]]).astype(int)
        self.right_child = np.concatenate([np.full(self.n, NO_NODE), childrens[:,1]]).astype(int)

        for i in range(self.n):
            node = i
            cur_depth = 0
            while node != NO_NODE:
                assert node >= 0 and node < self.m
                self.size[node] += 1
                self.depth[node] = max(self.depth[node], cur_depth)
                cur_depth += 1
                node = self.parent[node]

        self.count = np.zeros((self.m, self.num_class), dtype=int)
        self.total = np.zeros(self.m, dtype=int)
        self.upper_bound = np.ones((self.m, self.num_class), dtype=float)
        self.lower_bound = np.zeros((self.m, self.num_class), dtype=float)
        self.admissible = np.zeros((self.m, self.num_class), dtype=bool)
        self.best_label = np.full(self.m, NO_LABEL, dtype=int)
        self.split = np.zeros(self.m, dtype=bool)
        self.cost = self.size.copy()

        self.prunings = [self.m-1]

        for i, entry in enumerate(self.dataset.data):
            if entry[1] is not None:
                self.update(i, entry[1])
Example #11
def import_scipy_mat(filename):
    from scipy.io import loadmat
    data = loadmat(filename)
    X = data['X']
    y = data['y']
    zipper = list(zip(X, y))
    np.random.shuffle(zipper)
    X, y = zip(*zipper)
    X, y = np.array(X), np.array(y).reshape(-1)
    return Dataset(X, y)
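Hypothetical usage of the loader above (the filename is made up); the .mat file must contain arrays named 'X' and 'y':

ds = import_scipy_mat('some_dataset.mat')  # rows are shuffled before wrapping in a Dataset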
Example #13
    def _get_scores(self):
        dataset = self.dataset

        _, y = dataset.get_entries()

        unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries()
        dvalue, score = (None, [random()])

        if y.any():
            self.model.train(dataset)

            if isinstance(self.model, ProbabilisticModel):
                dvalue = self.model.predict_proba(X_pool)
            elif isinstance(self.model, ContinuousModel):
                dvalue = self.model.predict_real(X_pool)

            if self.method == 'lc':  # least confident
                score = -np.max(dvalue, axis=1)

            elif self.method == 'sm':  # smallest margin
                if np.shape(dvalue)[1] > 2:
                    # Find 2 largest decision values
                    dvalue = -(np.partition(-dvalue, 2, axis=1)[:, :2])
                score = -np.abs(dvalue[:, 0] - dvalue[:, 1])

            elif self.method == 'entropy':
                score = np.sum(-dvalue * np.log(dvalue), axis=1)
        return zip(unlabeled_entry_ids, score)
Example #14
    def make_query(self):
        dataset = self.dataset
        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

        if self.disagreement == 'vote':
            # Let the trained students vote for unlabeled data
            votes = np.zeros((len(X_pool), len(self.students)))
            for i, student in enumerate(self.students):
                votes[:, i] = student.predict(X_pool)

            vote_entropy = self._vote_disagreement(votes)
            ask_idx = self.random_state_.choice(
                    np.where(np.isclose(vote_entropy, np.max(vote_entropy)))[0])

        elif self.disagreement == 'kl_divergence':
            proba = []
            for student in self.students:
                proba.append(student.predict_proba(X_pool))
            proba = np.array(proba).transpose(1, 0, 2).astype(float)

            avg_kl = self._kl_divergence_disagreement(proba)
            ask_idx = self.random_state_.choice(
                    np.where(np.isclose(avg_kl, np.max(avg_kl)))[0])

        return unlabeled_entry_ids[ask_idx]
Example #15
    def _labeled_uniform_sample(self, sample_size):
        """sample labeled entries uniformly"""
        labeled_entries = self.dataset.get_labeled_entries()
        samples = [labeled_entries[
            self.random_state_.randint(0, len(labeled_entries))
        ] for _ in range(sample_size)]
        return Dataset(*zip(*samples))
Example #16
    def make_query(self):
        dataset = self.dataset
        self.model.train(dataset)

        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

        if isinstance(self.model, ProbabilisticModel):
            dvalue = self.model.predict_proba(X_pool)
        elif isinstance(self.model, ContinuousModel):
            dvalue = self.model.predict_real(X_pool)

        if self.method == 'lc':  # least confident
            ask_id = np.argmin(np.max(dvalue, axis=1))

        elif self.method == 'sm':  # smallest margin
            if np.shape(dvalue)[1] > 2:
                # Find 2 largest decision values
                dvalue = -(np.partition(-dvalue, 2, axis=1)[:, :2])
            margin = np.abs(dvalue[:, 0] - dvalue[:, 1])
            ask_id = np.argmin(margin)

        elif self.method == 'entropy':
            entropy = np.sum(-dvalue * np.log(dvalue), axis=1)
            ask_id = np.argmax(entropy)

        return unlabeled_entry_ids[ask_id]
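To see why the three branches rank samples the way they do, here is a toy check on an invented probability matrix (rows sum to 1, as predict_proba would return); all three criteria select the most ambiguous row:

import numpy as np

proba = np.array([[0.9, 0.1],
                  [0.6, 0.4],
                  [0.5, 0.5]])
print(np.argmin(np.max(proba, axis=1)))                   # least confident -> 2
print(np.argmin(np.abs(proba[:, 0] - proba[:, 1])))       # smallest margin -> 2
print(np.argmax(np.sum(-proba * np.log(proba), axis=1)))  # entropy         -> 2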
Example #17
    def make_query(self):
        dataset = self.dataset

        unlabeled_entry_ids, scores = zip(*self._get_scores())
        ask_id = self.random_state_.choice(np.where(scores == np.max(scores))[0])

        return unlabeled_entry_ids[ask_id]
Example #19
def run(trn_ds, tst_ds, lbr, model, qs, quota, j):

    E_in, E_out, l = [], [], []
    model.train(trn_ds)
    E_out.append(1 - model.score(tst_ds))
    counter = 0
    steps = 15
    k = int(j / steps)

    if k == 0:
        k = 1

    for _ in range(0, steps):
        counter = counter + 1
        l = []

        for i in range(0, k):
            ask_id = qs.make_query()
            l.append(ask_id)
            X, _ = zip(*trn_ds.data)
            lb = lbr.label(X[ask_id])
            trn_ds.update(ask_id, lb)

        model.train(trn_ds)
        E_out = np.append(E_out, 1 - model.score(tst_ds))
        # print('-->', len(E_out))
    return E_out, len(E_out), trn_ds, k, j
Example #20
    def make_query(self, n=1, return_score=False):
        """Return the index of the sample to be queried and labeled and
        selection score of each sample. Read-only.

        No modification to the internal states.

        Returns
        -------
        ask_ids : list
            The batch of indexes of the next unlabeled samples to be queried and labeled.

        score : list of (index, score) tuple
            Selection score of unlabeled entries, the larger the better.

        """
        unlabeled_entry_ids, scores = zip(*self._get_scores())
        # take the n highest-scoring entries, best first
        ask_ids = np.argsort(scores)[-n:][::-1]

        res = [unlabeled_entry_ids[i] for i in ask_ids]
        if return_score:
            return res, list(zip(unlabeled_entry_ids, scores))
        return res
Example #21
    def make_query(self,
                   return_score=False,
                   return_label=False,
                   n_instances=1):
        """Return the index of the sample to be queried and labeled and
        selection score of each sample. Read-only.

        No modification to the internal states.

        Returns
        -------
        ask_id : int
            The index of the next unlabeled sample to be queried and labeled.

        score : list of (index, score) tuple
            Selection score of unlabeled entries, the larger the better.

        """
        dataset = self.dataset
        # self.model.train(dataset)

        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())
        X_pool = np.array(X_pool)
        unlabeled_entry_ids = np.array(unlabeled_entry_ids)

        if isinstance(self.model, ProbabilisticModel):
            dvalue = self.model.predict_proba(X_pool)
        elif isinstance(self.model, ContinuousModel):
            dvalue = self.model.predict_real(X_pool)

        if self.method == 'lc':  # least confident
            score = -np.max(dvalue, axis=1)

        elif self.method == 'sm':  # smallest margin
            if np.shape(dvalue)[1] > 2:
                # Find 2 largest decision values
                dvalue = -(np.partition(-dvalue, 2, axis=1)[:, :2])
            score = -np.abs(dvalue[:, 0] - dvalue[:, 1])

        elif self.method == 'entropy':
            score = np.sum(-dvalue * np.log(dvalue), axis=1)

        # select the n_instances highest-scoring entries
        ask_ids = multi_argmax(score, n_instances=n_instances)

        if return_score and return_label:
            pre_label = np.argmax(dvalue, axis=1)
            return unlabeled_entry_ids[ask_ids], score[ask_ids], pre_label[
                ask_ids]
        elif return_score:
            return unlabeled_entry_ids[ask_ids], score[ask_ids]
        elif return_label:
            pre_label = np.argmax(dvalue, axis=1)
            return unlabeled_entry_ids[ask_ids], pre_label[ask_ids]
        else:
            return unlabeled_entry_ids[ask_ids]
Example #22
    def retrieve_score_list(self):
        dataset = self.dataset
        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())
        if self.disagreement == 'vote':
            # Let the trained students vote for unlabeled data
            votes = np.zeros((len(X_pool), len(self.students)))
            for i, student in enumerate(self.students):
                votes[:, i] = student.predict(X_pool)
            score_list = self._vote_disagreement(votes)
        elif self.disagreement == 'kl_divergence':
            proba = []
            for student in self.students:
                proba.append(student.predict_proba(X_pool))
            proba = np.array(proba).transpose(1, 0, 2).astype(float)
            score_list = self._kl_divergence_disagreement(proba)

        return dict(zip(unlabeled_entry_ids, score_list))
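_kl_divergence_disagreement is not shown in these listings; a standard implementation, consistent with the (n_samples, n_students, n_classes) proba array built above, would be roughly the following sketch (an assumption, not the library's exact code):

import numpy as np

def kl_divergence_disagreement(proba):
    # average, over students, of KL(student || committee consensus)
    consensus = proba.mean(axis=1, keepdims=True)
    kl = np.sum(proba * np.log(proba / consensus), axis=2)
    return kl.mean(axis=1)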
Example #23
    def make_query(self):
        dataset = self.dataset
        unlabeled_entry_ids, pool_X = zip(*dataset.get_unlabeled_entries())
        # class labels are assumed to be integers in [0, n_classes),
        # used to index class_embed
        X, y = zip(*dataset.get_labeled_entries())

        pred_embed = np.zeros((len(pool_X), self.embed_dim))
        for i in range(self.embed_dim):
            self.regressors[i].fit(X, self.class_embed[y, i])
            pred_embed[:, i] = self.regressors[i].predict(pool_X)

        dist, _ = self.nn_.kneighbors(pred_embed)
        dist = dist[:, 0]

        ask_idx = self.random_state_.choice(
            np.where(np.isclose(dist, np.max(dist)))[0])
        return unlabeled_entry_ids[ask_idx]
Example #24
    def make_query(self, n=1):

        dataset = self.dataset
        unlabeled_entry_ids, _ = zip(*dataset.get_unlabeled_entries())
        entry_id = self.random_state_.choice(unlabeled_entry_ids,
                                             size=n,
                                             replace=False)
        return list(entry_id) if n > 1 else entry_id[0]
Example #26
    def make_query(self):
        dataset = self.dataset
        unlabeled_entry_ids, _ = zip(*dataset.get_unlabeled_entries())
        unlabeled_entry_ids = shuffle(unlabeled_entry_ids)

        return_len = min(len(unlabeled_entry_ids), self.batch_size)
        entry_id = unlabeled_entry_ids[:return_len]
        return entry_id
Example #27
    def __init__(self, *args, **kwargs):
        super(ActiveLearningByLearning, self).__init__(*args, **kwargs)
        self.query_strategies_ = kwargs.pop("query_strategies", None)
        if self.query_strategies_ is None:
            raise TypeError("__init__() missing required keyword-only argument: " "'query_strategies'")
        elif not self.query_strategies_:
            raise ValueError("query_strategies list is empty")

        # check if query_strategies share the same dataset with albl
        for qs in self.query_strategies_:
            if qs.dataset != self.dataset:
                raise ValueError("query_strategies should share the same "
                                 "dataset instance with albl")

        # parameters for Exp4.p
        self.delta = kwargs.pop("delta", 0.1)

        # query budget
        self.T = kwargs.pop("T", None)
        if self.T is None:
            raise TypeError("__init__() missing required keyword-only argument: 'T'")

        self.unlabeled_entry_ids, _ = zip(*self.dataset.get_unlabeled_entries())
        self.unlabeled_invert_id_idx = {}
        for i, entry in enumerate(self.dataset.get_unlabeled_entries()):
            self.unlabeled_invert_id_idx[entry[0]] = i

        self.uniform_sampler = kwargs.pop("uniform_sampler", True)
        if not isinstance(self.uniform_sampler, bool):
            raise ValueError("'uniform_sampler' should be {True, False}")

        self.pmin = kwargs.pop("pmin", None)
        n_algorithms = len(self.query_strategies_) + self.uniform_sampler
        if self.pmin and (self.pmin > (1.0 / n_algorithms) or self.pmin < 0):
            raise ValueError("'pmin' should be 0 < pmin < " "1/len(n_active_algorithm)")

        self.exp4p_ = Exp4P(
            query_strategies=self.query_strategies_,
            T=self.T,
            delta=self.delta,
            pmin=self.pmin,
            unlabeled_invert_id_idx=self.unlabeled_invert_id_idx,
            uniform_sampler=self.uniform_sampler,
        )
        self.budget_used = 0

        # classifier instance
        self.model = kwargs.pop("model", None)
        if self.model is None:
            raise TypeError("__init__() missing required keyword-only argument: 'model'")

        random_state = kwargs.pop("random_state", None)
        self.random_state_ = seed_random_state(random_state)

        self.query_dist = None

        self.W = []
        self.queried_hist_ = []
Example #28
    def make_query(self):
        dataset = self.dataset
        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

        # Let the trained students vote for unlabeled data
        votes = np.zeros((len(X_pool), len(self.students)))
        for i, student in enumerate(self.students):
            votes[:, i] = student.predict(X_pool)

        id_disagreement = [(i, dis) for i, dis in
                zip(unlabeled_entry_ids, self.disagreement(votes))]

        disagreement = sorted(id_disagreement, key=lambda id_dis: id_dis[1],
                reverse=True)
        ask_id = self.random_state_.choice(
            [e[0] for e in disagreement if e[1] == disagreement[0][1]])

        return ask_id
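_vote_disagreement is likewise not shown; the standard query-by-committee vote entropy, which matches how the votes matrix is built above, looks roughly like this sketch (an assumption, not the library's exact code):

import numpy as np

def vote_entropy(votes):
    # votes: shape (n_samples, n_students), one predicted label per student
    n_students = votes.shape[1]
    scores = []
    for row in votes:
        _, counts = np.unique(row, return_counts=True)
        p = counts / float(n_students)
        scores.append(-np.sum(p * np.log(p)))
    return np.asarray(scores)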
Example #29
    def make_query(self, return_score=False):
        """Return the index of the sample to be queried and labeled and
        selection score of each sample. Read-only.

        No modification to the internal states.

        Returns
        -------
        ask_id : int
            The index of the next unlabeled sample to be queried and labeled.

        score : list of (index, score) tuple
            Selection score of unlabeled entries, the larger the better.

        """
        dataset = self.dataset
        self.model.train(dataset)

        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

        if isinstance(self.model, ProbabilisticModel):
            dvalue = self.model.predict_proba(X_pool)
        elif isinstance(self.model, ContinuousModel):
            dvalue = self.model.predict_real(X_pool)

        if self.method == 'lc':  # least confident
            score = -np.max(dvalue, axis=1)

        elif self.method == 'sm':  # smallest margin
            if np.shape(dvalue)[1] > 2:
                # Find 2 largest decision values
                dvalue = -(np.partition(-dvalue, 2, axis=1)[:, :2])
            score = -np.abs(dvalue[:, 0] - dvalue[:, 1])

        elif self.method == 'entropy':
            score = np.sum(-dvalue * np.log(dvalue), axis=1)

        ask_id = np.argmax(score)

        if return_score:
            return unlabeled_entry_ids[ask_id], \
                   list(zip(unlabeled_entry_ids, score))
        else:
            return unlabeled_entry_ids[ask_id]
Example #30
    def make_query(self):
        dataset = self.dataset
        labeled_pool, Y = zip(*dataset.get_labeled_entries())
        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

        major_clf = copy.deepcopy(self.major_learner)
        major_clf.train(dataset)
        aux_clf = copy.deepcopy(self.auxiliary_learner)
        aux_clf.train(dataset)

        if self.criterion == 'hlr':
            major_pred = major_clf.predict(X_pool)
            aux_pred = aux_clf.predict(X_pool)
            score = np.abs(major_pred - aux_pred).mean(axis=1)
        elif self.criterion in ['mmr', 'shlr']:
            major_pred = major_clf.predict(X_pool) * 2 - 1

            if 'predict_real' in dir(aux_clf):
                aux_pred = aux_clf.predict_real(X_pool)
            elif 'predict_proba' in dir(aux_clf):
                aux_pred = aux_clf.predict_proba(X_pool) * 2 - 1
            else:
                raise AttributeError("aux_learner did not support either"
                                     "'predict_real' or 'predict_proba'"
                                     "method")

            loss = (major_pred * aux_pred).mean(axis=1)
            if self.criterion == 'mmr':
                score = (1. - major_pred * aux_pred) / 2.
                score = np.sum(score, axis=1)
            elif self.criterion == 'shlr':
                b = self.b
                score = (b - np.clip(major_pred * aux_pred, -b, b)) / 2. / b
                score = np.sum(score, axis=1)
            else:
                raise TypeError(
                    "supported criterion are ['hlr', 'shlr', 'mmr'], the given "
                    "one is: " + self.criterion
                )

        ask_id = self.random_state_.choice(np.where(score == np.max(score))[0])

        return unlabeled_entry_ids[ask_id]
Example #31
    def make_query(self):
        labeled_entries = self.dataset.get_labeled_entries()
        Xlabeled, y = zip(*labeled_entries)
        Xlabeled = np.array(Xlabeled)
        y = list(y)

        unlabeled_entries = self.dataset.get_unlabeled_entries()
        unlabeled_entry_ids, X_pool = zip(*unlabeled_entries)

        label_count = self.dataset.get_num_of_labels()

        clf = copy.copy(self.model)
        clf.train(Dataset(Xlabeled, y))

        p = Pool(self.n_jobs)
        errors = p.map(_E, [(Xlabeled, y, x, clf, label_count, self.sigma, self.model) for x in X_pool])
        p.terminate()

        return unlabeled_entry_ids[errors.index(min(errors))]
Example #32
    def make_query(self):
        dataset = self.dataset
        labeled_pool, Y = zip(*dataset.get_labeled_entries())
        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

        major_clf = copy.deepcopy(self.major_learner)
        major_clf.train(dataset)
        aux_clf = copy.deepcopy(self.auxiliary_learner)
        aux_clf.train(dataset)

        if self.criterion == 'hlr':
            major_pred = major_clf.predict(X_pool)
            aux_pred = aux_clf.predict(X_pool)
            score = np.abs(major_pred - aux_pred).mean(axis=1)
        elif self.criterion in ['mmr', 'shlr']:
            major_pred = major_clf.predict(X_pool) * 2 - 1

            if 'predict_real' in dir(aux_clf):
                aux_pred = aux_clf.predict_real(X_pool)
            elif 'predict_proba' in dir(aux_clf):
                aux_pred = aux_clf.predict_proba(X_pool) * 2 - 1
            else:
                raise AttributeError("aux_learner did not support either"
                                     "'predict_real' or 'predict_proba'"
                                     "method")

            loss = (major_pred * aux_pred).mean(axis=1)
            if self.criterion == 'mmr':
                score = (1. - major_pred * aux_pred) / 2.
                score = np.sum(score, axis=1)
            elif self.criterion == 'shlr':
                b = self.b
                score = (b - np.clip(major_pred * aux_pred, -b, b)) / 2. / b
                score = np.sum(score, axis=1)
            else:
                raise TypeError(
                    "supported criterion are ['hlr', 'shlr', 'mmr'], the given "
                    "one is: " + self.criterion)

        ask_id = self.random_state_.choice(np.where(score == np.max(score))[0])

        return unlabeled_entry_ids[ask_id]
Example #33
    def make_query(self, n_instances=1):
        dataset = self.dataset
        unlabeled_entry_ids, _ = zip(*dataset.get_unlabeled_entries())
        unlabeled_entry_ids = np.array(unlabeled_entry_ids)

        # sample without replacement, using the seeded random state for
        # reproducibility
        rs = self.random_state_.choice(len(unlabeled_entry_ids),
                                       size=n_instances, replace=False)
        entry_ids = unlabeled_entry_ids[rs]

        return entry_ids
Example #34
    def _get_scores(self):
        dataset = self.dataset
        X, _ = zip(*dataset.data)
        scores = self.base_query_strategy._get_scores()
        _, X_pool = zip(*dataset.get_unlabeled_entries())
        unlabeled_entry_ids, base_scores = zip(*scores)

        self.clustering_method.fit(X)
        pool_cluster = self.clustering_method.predict(X_pool)
        cluster_center = self.clustering_method.cluster_centers_
        similarity = []
        for i in range(len(X_pool)):
            similarity.append(
                self.similarity_metric(
                    X_pool[i].reshape(1, -1),
                    cluster_center[pool_cluster[i]].reshape(1, -1))[0][0])
        similarity = np.asarray(similarity)

        scores = base_scores * similarity**self.beta
        return zip(unlabeled_entry_ids, scores)
Example #35
    def make_query(self):
        dataset = self.dataset
        self.model.fit(*(dataset.format_sklearn()))

        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

        mean, var = self.model.predict_mean_var(X_pool)
        score = 1.96 * var**.5 - np.abs(mean)
        ask_id = np.argmax(score)

        return unlabeled_entry_ids[ask_id]
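The score above, 1.96 * sqrt(var) - |mean|, is positive exactly when 0 lies inside the 95% confidence interval of the regression output, so the strategy queries the points whose predicted sign is least certain. With invented numbers:

import numpy as np

mean, var = np.array([0.1, 1.5]), np.array([1.0, 0.04])
print(1.96 * var ** .5 - np.abs(mean))  # [ 1.86  -1.108]: only the first point straddles 0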
Example #36
    def make_query(self):
        labeled_entries = self.dataset.get_labeled_entries()
        Xlabeled, y = zip(*labeled_entries)
        Xlabeled = np.array(Xlabeled)
        y = list(y)

        unlabeled_entries = self.dataset.get_unlabeled_entries()
        unlabeled_entry_ids, X_pool = zip(*unlabeled_entries)

        label_count = self.dataset.get_num_of_labels()

        clf = copy.copy(self.model)
        clf.train(Dataset(Xlabeled, y))

        p = Pool(self.n_jobs)
        errors = p.map(
            _E, [(Xlabeled, y, x, clf, label_count, self.sigma, self.model)
                 for x in X_pool])
        p.terminate()

        return unlabeled_entry_ids[errors.index(min(errors))]
Example #37
    def make_query(self):
        dataset = self.dataset
        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

        # Let the trained students vote for unlabeled data
        votes = np.zeros((len(X_pool), len(self.students)))
        for i, student in enumerate(self.students):
            votes[:, i] = student.predict(X_pool)

        id_disagreement = [
            (i, dis)
            for i, dis in zip(unlabeled_entry_ids, self.disagreement(votes))
        ]

        disagreement = sorted(id_disagreement,
                              key=lambda id_dis: id_dis[1],
                              reverse=True)
        ask_id = self.random_state_.choice(
            [e[0] for e in disagreement if e[1] == disagreement[0][1]])

        return ask_id
Example #38
    def make_query(self, return_score=False, n_instances=2):
        """Return the index of the sample to be queried and labeled and
        selection score of each sample. Read-only.

        No modification to the internal states.

        Returns
        -------
        ask_ids : numpy array of int
            The indexes of the next unlabeled samples to be queried and labeled.

        score : list of (index, score) tuple
            Selection score of unlabeled entries, the larger the better.

        """
        unlabeled_entry_ids, scores = zip(*self._get_scores())

        unlabeled_entry_ids = np.array(unlabeled_entry_ids)
        scores = np.array(scores)
        # select the n_instances highest-scoring entries
        ask_ids = multi_argmax(scores, n_instances=n_instances)

        if return_score:
            return unlabeled_entry_ids[ask_ids], \
                   list(zip(unlabeled_entry_ids, scores))
        else:
            return unlabeled_entry_ids[ask_ids]
Example #39
    def make_query(self):
        dataset = self.dataset
        X, Y = zip(*dataset.get_labeled_entries())
        Y = np.array(Y)
        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())
        X_pool = np.array(X_pool)

        clfs = []
        boundaries = []
        for i in range(self.n_labels):
            if len(np.unique(Y[:, i])) == 1:
                clf = DummyClf()
            else:
                clf = copy.deepcopy(self.base_clf)
            clf.train(Dataset(X, Y[:, i]))
            boundaries.append(np.abs(clf.predict_real(X_pool)[:, 1]))
            clfs.append(clf)

        choices = np.where(np.array(boundaries) == np.min(boundaries))[1]
        ask_id = self.random_state_.choice(choices)

        return unlabeled_entry_ids[ask_id]
Example #41
    def format_sklearn(self):
        """
        Returns dataset in (X, y) format for use in scikit-learn.
        Unlabeled entries are ignored.

        Returns
        -------
        X : numpy array, shape = (n_samples, n_features)
            Sample feature set.

        y : numpy array, shape = (n_samples,)
            Sample labels.
        """
        X, y = zip(*self.get_labeled_entries())
        return np.array(X), np.array(y)
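Typical use is to feed the labeled slice straight into a scikit-learn estimator; the estimator below is illustrative, and dataset is assumed to be an instance of this Dataset class:

from sklearn.linear_model import LogisticRegression

X, y = dataset.format_sklearn()  # labeled entries only
clf = LogisticRegression().fit(X, y)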
Example #42
    def labeled_uniform_sample(self, sample_size, replace=True):
        """Returns a Dataset object with labeled data only, which is
        resampled uniformly with given sample size.
        Parameter `replace` decides whether sampling with replacement or not.

        Parameters
        ----------
        sample_size : int
            Number of samples to draw.
        replace : bool, optional (default=True)
            Whether to sample with or without replacement.
        """
        if replace:
            samples = [
                random.choice(self.get_labeled_entries())
                for _ in range(sample_size)
                ]
        else:
            samples = random.sample(self.get_labeled_entries(), sample_size)
        return Dataset(*zip(*samples))
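A hedged usage sketch: bootstrap resampling of the labeled pool like this is commonly used to train committee members for query-by-committee (student below is an illustrative committee model):

boot_ds = dataset.labeled_uniform_sample(sample_size=100, replace=True)
student.train(boot_ds)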
Example #43
    def make_query(self):
        dataset = self.dataset
        self.model.train(dataset)

        unlabeled_entry_ids, X_pool = zip(*dataset.get_unlabeled_entries())

        if self.method == "lc":  # least confident
            ask_id = np.argmin(np.max(self.model.predict_real(X_pool), axis=1))

        elif self.method == "sm":  # smallest margin
            dvalue = self.model.predict_real(X_pool)

            if np.shape(dvalue)[1] > 2:
                # Find 2 largest decision values
                dvalue = -(np.partition(-dvalue, 2, axis=1)[:, :2])

            margin = np.abs(dvalue[:, 0] - dvalue[:, 1])
            ask_id = np.argmin(margin)

        return unlabeled_entry_ids[ask_id]
Example #44
    def make_query(self):
        dataset = self.dataset
        try:
            unlabeled_entry_ids, _ = zip(*dataset.get_unlabeled_entries())
        except ValueError:
            # might be no more unlabeled data left
            return

        while self.budget_used < self.T:
            self.calc_query()
            ask_idx = self.random_state_.choice(
                np.arange(len(self.unlabeled_invert_id_idx)), size=1, p=self.query_dist
            )[0]
            ask_id = self.unlabeled_entry_ids[ask_idx]

            if ask_id in unlabeled_entry_ids:
                self.budget_used += 1
                return ask_id
            else:
                self.update(ask_id, dataset.data[ask_id][1])

        raise ValueError("Out of query budget")
Example #45
    def __init__(self, X=None, y=None):
        if X is None: X = []
        if y is None: y = []
        self.data = list(zip(X, y))
        self.modified = True
        self._update_callback = set()
Example #46
    def __init__(self, X=[], y=[]):
        # note: mutable default arguments are shared across calls; the
        # variant in Example #45 avoids this with None sentinels
        self.data = list(zip(X, y))
        self.modified = True
        self._update_callback = set()
Example #47
    def make_query(self):
        dataset = self.dataset
        unlabeled_entry_ids, _ = zip(*dataset.get_unlabeled_entries())
        entry_id = unlabeled_entry_ids[
            self.random_state_.randint(0, len(unlabeled_entry_ids))]
        return entry_id