Example #1
import numpy as np
# ---------IndexCollection
a = [1, 2, 3]
# If you already have an alipy ToolBox object, the collection can be created from it:
# a_ind = alibox.IndexCollection(a)
# Or create it directly by importing the class:
from alipy.index import IndexCollection

a_ind = IndexCollection(a)
# add a single index; warns if the element is already in the collection.
a_ind.add(4)
# discard a single index; warns if it does not exist.
a_ind.discard(4)
# add a batch of indexes.
a_ind.update([4, 5])
# discard a batch of indexes.
a_ind.difference_update([1, 2])
print(a_ind)

# ---------MultiLabelIndexCollection-------------
from alipy.index import MultiLabelIndexCollection
multi_lab_ind1 = MultiLabelIndexCollection([(0, 1), (0, 2), (0, (3, 4)),
                                            (1, (0, 1))],
                                           label_size=5)
multi_lab_ind1.update((0, 0))
multi_lab_ind1.update([(1, 2), (1, (3, 4))])
multi_lab_ind1.update([(2, )])
multi_lab_ind1.difference_update([(0, )])
print(multi_lab_ind1)

# MATLAB-style 1d index support
b = [1, 4, 11]
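# A possible continuation of this snippet (not shown above): ALiPy's documentation
# describes a MultiLabelIndexCollection.construct_by_1d_array helper for such
# MATLAB-style flat indexes; the exact call below is an assumption based on it.
multi_lab_ind2 = MultiLabelIndexCollection.construct_by_1d_array(b, label_mat_shape=(3, 4))
print(multi_lab_ind2)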
Example #2
import numpy as np
from sklearn.linear_model import LogisticRegression
from alipy.query_strategy import QueryInstanceUncertainty
from alipy.index import IndexCollection
from alipy.oracle import MatrixRepository

# Your labeled set
X_lab = np.random.randn(100, 10)
y_lab = np.random.randint(low=0, high=2, size=100)
# The unlabeled pool. The labels of the unlabeled data can be anything; the algorithm will not use them.
X_unlab = np.random.rand(100, 10)
y_place_holder = np.random.randint(low=0, high=2, size=100)

# Initialize a query strategy.
unc = QueryInstanceUncertainty(X=np.vstack((X_unlab, X_lab)),
                               y=np.hstack((y_place_holder, y_lab)))
unlab_ind = IndexCollection(
    np.arange(100))  # Indexes of the unlabeled pool (rows 0-99 of the stacked data)
label_ind = IndexCollection(np.arange(start=100,
                                      stop=200))  # Indexes of the labeled set (rows 100-199)
labeled_repo = MatrixRepository(
    examples=X_lab, labels=y_lab,
    indexes=label_ind)  # Create a repository to store the labeled instances

# Initialize your model
model = LogisticRegression()
model.fit(X_lab, y_lab)

# Set the stopping criterion: query 50 times.
for i in range(50):
    # Use a sklearn model to select instances.
    select_ind = unc.select(label_index=label_ind,
                            unlabel_index=unlab_ind,
                            model=model, batch_size=1)
    # Move the queried instance from the unlabeled pool to the labeled set.
    label_ind.update(select_ind)
    unlab_ind.difference_update(select_ind)
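    # Sketch of the rest of a query round (not part of the original excerpt): refit
    # the model on the enlarged labeled set. Only objects defined above are reused;
    # in a real task the new label would come from an oracle, and labeled_repo could
    # be updated here as well (its update API is not shown in this snippet).
    X_all = np.vstack((X_unlab, X_lab))
    y_all = np.hstack((y_place_holder, y_lab))
    model.fit(X_all[list(label_ind)], y_all[list(label_ind)])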
Example #3
File: multi_label.py  Project: yyht/ALiPy
    def select(self, label_index, unlabel_index, batch_size=1):
        """
            Select unlabeled data in batch mode.
        Parameters
        ----------
        label_index: {list, np.ndarray, IndexCollection, MultiLabelIndexCollection}
            The indexes of labeled samples. Either a 1d array of instance indexes
            (column major, starting from 0), a MultiLabelIndexCollection, or a list of
            2-element tuples in which the 1st element is the instance index and the
            2nd element is the label index.

        unlabel_index: {list, np.ndarray, IndexCollection, MultiLabelIndexCollection}
            The indexes of unlabeled samples. Either a 1d array of instance indexes
            (column major, starting from 0), a MultiLabelIndexCollection, or a list of
            2-element tuples in which the 1st element is the instance index and the
            2nd element is the label index.

        batch_size: int, optional (default=1)
            Selection batch size.

        Returns
        -------
        selected_ind: list
            The selected indexes.        
        """
        if isinstance(label_index, (list, np.ndarray)):
            label_index = IndexCollection(label_index)
        elif isinstance(label_index, MultiLabelIndexCollection):
            label_index = IndexCollection(label_index.get_unbroken_instances())
        elif not isinstance(label_index, IndexCollection):
            raise TypeError("index type error")
        if isinstance(unlabel_index, (list, np.ndarray)):
            unlabel_index = IndexCollection(unlabel_index)
        elif isinstance(unlabel_index, MultiLabelIndexCollection):
            unlabel_index = IndexCollection(
                unlabel_index.get_unbroken_instances())
        elif not isinstance(unlabel_index, IndexCollection):
            raise TypeError("index type error")

        if len(unlabel_index) <= batch_size:
            return list(unlabel_index)

        # Greedily query one instance per iteration, moving it from the unlabeled
        # to the labeled side so the next sequential_select sees the updated split.
        select_index = []
        for i in range(batch_size):
            selected = self.sequential_select(label_index, unlabel_index)
            label_index.update(selected)
            unlabel_index.difference_update(selected)
            select_index.append((selected, ))  # (instance_index,) means all labels of the instance

        return select_index
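A note on the return format: each element of the returned list is a one-element tuple (instance_index,), which is the "all labels of this instance" form that MultiLabelIndexCollection accepts (see Example #1). Below is a small self-contained sketch of feeding such a result back into the index collections; the selected list is a hand-written stand-in for the strategy's output, and it assumes the MultiLabelIndexCollection constructor accepts the same (instance_index,) tuples that update does.

from alipy.index import MultiLabelIndexCollection

selected = [(4, ), (6, )]  # stand-in for a select(...) return value with batch_size=2

labeled_multi = MultiLabelIndexCollection([(0, ), (1, )], label_size=3)
unlabeled_multi = MultiLabelIndexCollection([(2, ), (3, ), (4, ), (5, ), (6, )], label_size=3)

# Move the queried instances between the collections with the same
# update/difference_update calls used in Example #1.
labeled_multi.update(selected)
unlabeled_multi.difference_update(selected)
print(labeled_multi)
print(unlabeled_multi)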
Example #4
File: multi_label.py  Project: yyht/ALiPy
    def sequential_select(self, label_index, unlabel_index):
        """
            Select one unlabeled instance at a time.
        Parameters
        ----------
        label_index: {list, np.ndarray, IndexCollection, MultiLabelIndexCollection}
            The indexes of labeled samples. Either a 1d array of instance indexes
            (column major, starting from 0), a MultiLabelIndexCollection, or a list of
            2-element tuples in which the 1st element is the instance index and the
            2nd element is the label index.

        unlabel_index: {list, np.ndarray, IndexCollection, MultiLabelIndexCollection}
            The indexes of unlabeled samples. Either a 1d array of instance indexes
            (column major, starting from 0), a MultiLabelIndexCollection, or a list of
            2-element tuples in which the 1st element is the instance index and the
            2nd element is the label index.
        Returns
        -------
        selected_ind: int
            The selected index.
        """

        if isinstance(label_index, (list, np.ndarray)):
            label_index = IndexCollection(label_index)
        elif isinstance(label_index, MultiLabelIndexCollection):
            label_index = IndexCollection(label_index.get_unbroken_instances())
        elif not isinstance(label_index, IndexCollection):
            raise TypeError("index type error")
        if isinstance(unlabel_index, (list, np.ndarray)):
            unlabel_index = IndexCollection(unlabel_index)
        elif isinstance(unlabel_index, MultiLabelIndexCollection):
            unlabel_index = IndexCollection(
                unlabel_index.get_unbroken_instances())
        elif not isinstance(unlabel_index, IndexCollection):
            raise TypeError("index type error")

        if len(unlabel_index) <= 1:
            return list(unlabel_index)

        X_pool = self.X[unlabel_index]

        clf = _BinaryRelevance(self.base_clf)
        clf.train(self.X[label_index], self.y[label_index])
        real = clf.predict_real(X_pool)
        pred = clf.predict(X_pool)

        # Separation Margin
        pos = np.copy(real)
        pos[real <= 0] = np.inf
        neg = np.copy(real)
        neg[real >= 0] = -np.inf
        separation_margin = pos.min(axis=1) - neg.max(axis=1)
        uncertainty = 1. / separation_margin

        # Label Cardinality Inconsistency
        average_pos_lbl = self.y[label_index].mean(axis=0).sum()
        label_cardinality = np.sqrt((pred.sum(axis=1) - average_pos_lbl)**2)

        # For every trade-off value beta, keep the pool instances that maximize
        # uncertainty**beta * label_cardinality**(1 - beta) as query candidates.
        candidate_idx_set = set()
        for b in self.betas:
            # score shape = (len(X_pool), )
            score = uncertainty**b * label_cardinality**(1. - b)
            for idx in np.where(score == np.max(score))[0]:
                candidate_idx_set.add(idx)

        candidates = list(candidate_idx_set)

        # For each candidate, tentatively add it to the labeled set with its predicted
        # labels, retrain, and estimate the resulting error over the pool.
        approx_err = []
        for idx in candidates:
            br = _BinaryRelevance(self.base_clf)
            br.train(np.vstack((self.X[label_index], X_pool[idx])),
                     np.vstack((self.y[label_index], pred[idx])))
            br_real = br.predict_real(X_pool)

            pos = np.copy(br_real)
            pos[br_real < 0] = 1
            pos = np.max((1. - pos), axis=1)

            neg = np.copy(br_real)
            neg[br_real > 0] = -1
            neg = np.max((1. + neg), axis=1)

            err = neg + pos

            approx_err.append(np.sum(err))

        # Query the candidate whose tentative addition gives the smallest estimated error.
        choices = np.where(np.array(approx_err) == np.min(approx_err))[0]
        ask_idx = candidates[self.random_state_.choice(choices)]

        return unlabel_index[ask_idx]
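To make the scoring terms above concrete, here is a standalone numpy sketch that reproduces just the separation-margin and label-cardinality arithmetic on hand-made decision values; the numbers are arbitrary and no ALiPy objects are involved.

import numpy as np

# Hand-made decision values for 2 pool instances x 3 labels.
real = np.array([[0.8, -0.2, 0.1],
                 [1.5, -1.2, -0.3]])
pred = np.sign(real)  # stand-in for the binary-relevance hard predictions in {-1, +1}

# Separation margin: smallest positive value minus largest negative value per row;
# a small margin means the labels are hard to separate, so uncertainty is high.
pos = np.copy(real)
pos[real <= 0] = np.inf
neg = np.copy(real)
neg[real >= 0] = -np.inf
separation_margin = pos.min(axis=1) - neg.max(axis=1)
uncertainty = 1. / separation_margin

# Label cardinality inconsistency: how far the predicted number of positive labels
# is from the labeled set's average number of positive labels (a stand-in value here).
average_pos_lbl = 1.4
label_cardinality = np.sqrt((pred.sum(axis=1) - average_pos_lbl) ** 2)

beta = 0.5
score = uncertainty ** beta * label_cardinality ** (1. - beta)
print(separation_margin, uncertainty, label_cardinality, score)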
Example #5
File: multi_label.py  Project: yyht/ALiPy
    def sequential_select(self, label_index, unlabel_index):
        """
            Select one unlabeled sample at a time.
        Parameters
        ----------
        label_index: {list, np.ndarray, IndexCollection, MultiLabelIndexCollection}
            The indexes of labeled samples. Either a 1d array of instance indexes
            (column major, starting from 0), a MultiLabelIndexCollection, or a list of
            2-element tuples in which the 1st element is the instance index and the
            2nd element is the label index.

        unlabel_index: {list, np.ndarray, IndexCollection, MultiLabelIndexCollection}
            The indexes of unlabeled samples. Either a 1d array of instance indexes
            (column major, starting from 0), a MultiLabelIndexCollection, or a list of
            2-element tuples in which the 1st element is the instance index and the
            2nd element is the label index.
        Returns
        -------
        selected_ind: int
            The selected index.
        """
        if isinstance(label_index, (list, np.ndarray)):
            label_index = IndexCollection(label_index)
        elif isinstance(label_index, MultiLabelIndexCollection):
            label_index = IndexCollection(label_index.get_unbroken_instances())
        elif not isinstance(label_index, IndexCollection):
            raise TypeError("index type error")
        if isinstance(unlabel_index, (list, np.ndarray)):
            unlabel_index = IndexCollection(unlabel_index)
        elif isinstance(unlabel_index, MultiLabelIndexCollection):
            unlabel_index = IndexCollection(
                unlabel_index.get_unbroken_instances())
        elif not isinstance(unlabel_index, IndexCollection):
            raise TypeError("index type error")

        if len(unlabel_index) <= 1:
            return list(unlabel_index)

        labeled_pool = self.X[label_index]
        X_pool = self.X[unlabel_index]

        br = _BinaryRelevance(self.br_base)
        br.train(self.X[label_index], self.y[label_index])

        trnf = br.predict_proba(labeled_pool)  # per-label probabilities on the labeled pool
        poolf = br.predict_proba(X_pool)       # per-label probabilities on the unlabeled pool
        f = poolf * 2 - 1                      # rescale probabilities to signed confidences in [-1, 1]

        # Sort each row's probabilities in descending order and normalize them; a
        # logistic regression is then fit on these profiles to predict how many
        # positive labels an instance has (its label cardinality).
        trnf = np.sort(trnf, axis=1)[:, ::-1]
        trnf /= np.tile(trnf.sum(axis=1).reshape(-1, 1), (1, trnf.shape[1]))
        if len(np.unique(self.y.sum(axis=1))) == 1:
            lr = DummyClf()  # every instance has the same label count; use a constant predictor
        else:
            lr = self.logistic_regression_
        lr.fit(trnf, self.y[label_index].sum(axis=1))

        # Predict the number of positive labels for each pool instance and tentatively
        # assign +1 to that many top-ranked labels, -1 to the rest.
        idx_poolf = np.argsort(poolf, axis=1)[:, ::-1]
        poolf = np.sort(poolf, axis=1)[:, ::-1]
        poolf /= np.tile(poolf.sum(axis=1).reshape(-1, 1), (1, poolf.shape[1]))
        pred_num_lbl = lr.predict(poolf).astype(int)

        yhat = -1 * np.ones((len(X_pool), self.n_labels), dtype=int)
        for i, p in enumerate(pred_num_lbl):
            yhat[i, idx_poolf[i, :p]] = 1

        # Score each instance by the disagreement between its tentative assignment and
        # the signed confidences; query the instance with the largest score.
        score = ((1 - yhat * f) / 2).sum(axis=1)
        ask_id = self.random_state_.choice(np.where(score == np.max(score))[0])

        return unlabel_index[ask_id]
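The final score line can be checked on its own: f holds signed confidences in [-1, 1] and yhat the tentative +1/-1 assignments, so (1 - yhat * f) / 2 is a per-label disagreement in [0, 1] and the row sum is the instance score. A standalone numpy sketch with hand-made values:

import numpy as np

poolf = np.array([[0.9, 0.6, 0.2],   # hand-made per-label probabilities, 2 instances x 3 labels
                  [0.5, 0.5, 0.5]])
f = poolf * 2 - 1                    # signed confidences in [-1, 1]

yhat = np.array([[ 1,  1, -1],       # tentative assignment: top-2 labels positive
                 [ 1, -1, -1]])      # tentative assignment: top-1 label positive

# Per-label disagreement in [0, 1]: confident agreement -> near 0, conflict -> near 1.
score = ((1 - yhat * f) / 2).sum(axis=1)
print(score)  # the second, maximally uncertain instance gets the larger score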