import numpy as np

# ---------IndexCollection---------
a = [1, 2, 3]
# a_ind = alibox.IndexCollection(a)
# Or create by importing the module
from alipy.index import IndexCollection
a_ind = IndexCollection(a)
# add a single index, warn if it is a repeated element.
a_ind.add(4)
# discard a single index, warn if it does not exist.
a_ind.discard(4)
# add a batch of indexes.
a_ind.update([4, 5])
# discard a batch of indexes.
a_ind.difference_update([1, 2])
print(a_ind)

# ---------MultiLabelIndexCollection-------------
from alipy.index import MultiLabelIndexCollection
multi_lab_ind1 = MultiLabelIndexCollection([(0, 1), (0, 2), (0, (3, 4)), (1, (0, 1))], label_size=5)
multi_lab_ind1.update((0, 0))
multi_lab_ind1.update([(1, 2), (1, (3, 4))])
multi_lab_ind1.update([(2, )])
multi_lab_ind1.difference_update([(0, )])
print(multi_lab_ind1)

# matlab-style 1d indexes are also supported
b = [1, 4, 11]
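# How `b` would typically be turned into a MultiLabelIndexCollection. This is only
# a sketch based on the alipy.index documentation: the classmethod name
# `construct_by_1d_array`, the (3, 5) label-matrix shape, and the flattening order
# are assumptions, so check the installed version before relying on them.
b_ind = MultiLabelIndexCollection.construct_by_1d_array(b, label_mat_shape=(3, 5))
print(b_ind)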
from sklearn.linear_model import LogisticRegression
from alipy.query_strategy import QueryInstanceUncertainty
from alipy.index import IndexCollection
from alipy.oracle import MatrixRepository

# Your labeled set
X_lab = np.random.randn(100, 10)
y_lab = np.random.randint(low=0, high=2, size=100)
# The unlabeled pool. The labels of the unlabeled data can be anything; the algorithm will not use them.
X_unlab = np.random.rand(100, 10)
y_place_holder = np.random.randint(low=0, high=2, size=100)

# Initialize a query strategy.
unc = QueryInstanceUncertainty(X=np.vstack((X_unlab, X_lab)),
                               y=np.hstack((y_place_holder, y_lab)))
unlab_ind = IndexCollection(np.arange(100))                  # Indexes of the unlabeled pool for querying
label_ind = IndexCollection(np.arange(start=100, stop=200))  # Indexes of the initially labeled set
# Create a repository to store the labeled instances
labeled_repo = MatrixRepository(examples=X_lab, labels=y_lab, indexes=label_ind)

# Initialize your model
model = LogisticRegression()
model.fit(X_lab, y_lab)

# Set the stopping criterion
for i in range(50):
    # Use a sklearn model to select instances.
    select_ind = unc.select(label_index=label_ind, unlabel_index=unlab_ind,
                            model=model, batch_size=1)
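    # --- The rest of the loop body is a sketch of the usual active-learning update
    # --- cycle and is not part of the original snippet: move the queried indexes
    # --- from the unlabeled pool to the labeled set and retrain the model.
    label_ind.update(select_ind)
    unlab_ind.difference_update(select_ind)
    # In a real task the queried labels would come from an oracle (and could also be
    # stored in labeled_repo, see alipy.oracle); the placeholder labels stand in here.
    X_all = np.vstack((X_unlab, X_lab))
    y_all = np.hstack((y_place_holder, y_lab))
    # `.index` is assumed to return the plain list of indexes held by an
    # IndexCollection, as used in the alipy examples.
    model.fit(X_all[label_ind.index], y_all[label_ind.index])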
def select(self, label_index, unlabel_index, batch_size=1):
    """Select unlabeled data in batch mode.

    Parameters
    ----------
    label_index: {list, np.ndarray, IndexCollection, MultiLabelIndexCollection}
        The indexes of labeled samples. It should be a 1d array of indexes
        (column major, start from 0), or a MultiLabelIndexCollection, or a list
        of tuples with 2 elements, in which the 1st element is the index of the
        instance and the 2nd element is the index of the label.

    unlabel_index: {list, np.ndarray, IndexCollection, MultiLabelIndexCollection}
        The indexes of unlabeled samples. It should be a 1d array of indexes
        (column major, start from 0), or a MultiLabelIndexCollection, or a list
        of tuples with 2 elements, in which the 1st element is the index of the
        instance and the 2nd element is the index of the label.

    batch_size: int, optional (default=1)
        Selection batch size.

    Returns
    -------
    selected_ind: list
        The selected indexes.
    """
    if isinstance(label_index, (list, np.ndarray)):
        label_index = IndexCollection(label_index)
    elif isinstance(label_index, MultiLabelIndexCollection):
        label_index = IndexCollection(label_index.get_unbroken_instances())
    elif not isinstance(label_index, IndexCollection):
        raise TypeError("index type error")
    if isinstance(unlabel_index, (list, np.ndarray)):
        unlabel_index = IndexCollection(unlabel_index)
    elif isinstance(unlabel_index, MultiLabelIndexCollection):
        unlabel_index = IndexCollection(unlabel_index.get_unbroken_instances())
    elif not isinstance(unlabel_index, IndexCollection):
        raise TypeError("index type error")
    if len(unlabel_index) <= batch_size:
        return list(unlabel_index)

    select_index = []
    for i in range(batch_size):
        # Query one instance at a time and remove it from the unlabeled pool
        # so that it is not selected again within the same batch.
        selected = self.sequential_select(label_index, unlabel_index)
        label_index.update(selected)
        unlabel_index.difference_update(selected)
        select_index.append((selected, ))
    return select_index
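# The batch loop above just calls sequential_select() repeatedly and moves each
# chosen index from the unlabeled to the labeled collection. A small, runnable
# illustration of that bookkeeping with plain IndexCollection objects (toy indexes;
# the stand-in pick replaces the strategy's real selection rule):
_lab = IndexCollection([0, 1])
_unlab = IndexCollection([2, 3, 4, 5])
_batch = []
for _ in range(2):                       # batch_size = 2
    _picked = _unlab[0]                  # stand-in for sequential_select()
    _lab.update([_picked])
    _unlab.difference_update([_picked])
    _batch.append((_picked, ))
print(_batch)                            # typically [(2,), (3,)]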
def sequential_select(self, label_index, unlabel_index):
    """Select one unlabeled instance at a time.

    Parameters
    ----------
    label_index: {list, np.ndarray, IndexCollection, MultiLabelIndexCollection}
        The indexes of labeled samples. It should be a 1d array of indexes
        (column major, start from 0), or a MultiLabelIndexCollection, or a list
        of tuples with 2 elements, in which the 1st element is the index of the
        instance and the 2nd element is the index of the label.

    unlabel_index: {list, np.ndarray, IndexCollection, MultiLabelIndexCollection}
        The indexes of unlabeled samples. It should be a 1d array of indexes
        (column major, start from 0), or a MultiLabelIndexCollection, or a list
        of tuples with 2 elements, in which the 1st element is the index of the
        instance and the 2nd element is the index of the label.

    Returns
    -------
    selected_ind: int
        The selected index.
    """
    if isinstance(label_index, (list, np.ndarray)):
        label_index = IndexCollection(label_index)
    elif isinstance(label_index, MultiLabelIndexCollection):
        label_index = IndexCollection(label_index.get_unbroken_instances())
    elif not isinstance(label_index, IndexCollection):
        raise TypeError("index type error")
    if isinstance(unlabel_index, (list, np.ndarray)):
        unlabel_index = IndexCollection(unlabel_index)
    elif isinstance(unlabel_index, MultiLabelIndexCollection):
        unlabel_index = IndexCollection(unlabel_index.get_unbroken_instances())
    elif not isinstance(unlabel_index, IndexCollection):
        raise TypeError("index type error")
    if len(unlabel_index) <= 1:
        return list(unlabel_index)

    X_pool = self.X[unlabel_index]
    clf = _BinaryRelevance(self.base_clf)
    clf.train(self.X[label_index], self.y[label_index])
    real = clf.predict_real(X_pool)
    pred = clf.predict(X_pool)

    # Separation Margin
    pos = np.copy(real)
    pos[real <= 0] = np.inf
    neg = np.copy(real)
    neg[real >= 0] = -np.inf
    separation_margin = pos.min(axis=1) - neg.max(axis=1)
    uncertainty = 1. / separation_margin

    # Label Cardinality Inconsistency
    average_pos_lbl = self.y[label_index].mean(axis=0).sum()
    label_cardinality = np.sqrt((pred.sum(axis=1) - average_pos_lbl) ** 2)

    # For each trade-off value beta, keep the instances maximizing the combined score.
    candidate_idx_set = set()
    for b in self.betas:
        # score shape = (len(X_pool), )
        score = uncertainty ** b * label_cardinality ** (1. - b)
        for idx in np.where(score == np.max(score))[0]:
            candidate_idx_set.add(idx)
    candidates = list(candidate_idx_set)

    # Among the candidates, approximate the pool error after adding each one
    # (with its predicted labels) to the training set, and pick a minimizer.
    approx_err = []
    for idx in candidates:
        br = _BinaryRelevance(self.base_clf)
        br.train(np.vstack((self.X[label_index], X_pool[idx])),
                 np.vstack((self.y[label_index], pred[idx])))
        br_real = br.predict_real(X_pool)
        pos = np.copy(br_real)
        pos[br_real < 0] = 1
        pos = np.max((1. - pos), axis=1)
        neg = np.copy(br_real)
        neg[br_real > 0] = -1
        neg = np.max((1. + neg), axis=1)
        err = neg + pos
        approx_err.append(np.sum(err))

    choices = np.where(np.array(approx_err) == np.min(approx_err))[0]
    ask_idx = candidates[self.random_state_.choice(choices)]
    return unlabel_index[ask_idx]
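# A small, self-contained illustration (toy numbers, not from the strategy) of the
# two quantities combined above: the separation margin is the gap between the
# least-confident positive and the least-confident negative decision value per
# instance, and the label-cardinality inconsistency is the distance between the
# predicted number of relevant labels and the labeled-set average. The average of
# 1.5 and the trade-off value beta are assumptions for the example.
import numpy as np

real_toy = np.array([[ 0.9, -0.8,  0.7, -0.6],
                     [ 0.1, -0.1,  0.2, -0.3],
                     [ 0.5,  0.4, -0.9, -0.7]])   # decision values: 3 instances, 4 labels
pred_toy = (real_toy > 0).astype(int)             # predicted relevance in {0, 1}

pos_toy = np.copy(real_toy)
pos_toy[real_toy <= 0] = np.inf
neg_toy = np.copy(real_toy)
neg_toy[real_toy >= 0] = -np.inf
separation_margin_toy = pos_toy.min(axis=1) - neg_toy.max(axis=1)
uncertainty_toy = 1. / separation_margin_toy      # large when the margin is narrow

average_pos_lbl_toy = 1.5                         # assumed mean number of positive labels
label_cardinality_toy = np.abs(pred_toy.sum(axis=1) - average_pos_lbl_toy)

beta = 0.5                                        # one value of the beta trade-off
score_toy = uncertainty_toy ** beta * label_cardinality_toy ** (1. - beta)
print(score_toy)                                  # the argmax would become a query candidate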
def sequential_select(self, label_index, unlabel_index):
    """Select one unlabeled sample at a time.

    Parameters
    ----------
    label_index: {list, np.ndarray, IndexCollection, MultiLabelIndexCollection}
        The indexes of labeled samples. It should be a 1d array of indexes
        (column major, start from 0), or a MultiLabelIndexCollection, or a list
        of tuples with 2 elements, in which the 1st element is the index of the
        instance and the 2nd element is the index of the label.

    unlabel_index: {list, np.ndarray, IndexCollection, MultiLabelIndexCollection}
        The indexes of unlabeled samples. It should be a 1d array of indexes
        (column major, start from 0), or a MultiLabelIndexCollection, or a list
        of tuples with 2 elements, in which the 1st element is the index of the
        instance and the 2nd element is the index of the label.

    Returns
    -------
    selected_ind: int
        The selected index.
    """
    if isinstance(label_index, (list, np.ndarray)):
        label_index = IndexCollection(label_index)
    elif isinstance(label_index, MultiLabelIndexCollection):
        label_index = IndexCollection(label_index.get_unbroken_instances())
    elif not isinstance(label_index, IndexCollection):
        raise TypeError("index type error")
    if isinstance(unlabel_index, (list, np.ndarray)):
        unlabel_index = IndexCollection(unlabel_index)
    elif isinstance(unlabel_index, MultiLabelIndexCollection):
        unlabel_index = IndexCollection(unlabel_index.get_unbroken_instances())
    elif not isinstance(unlabel_index, IndexCollection):
        raise TypeError("index type error")
    if len(unlabel_index) <= 1:
        return list(unlabel_index)

    labeled_pool = self.X[label_index]
    X_pool = self.X[unlabel_index]

    # Train a binary-relevance model and get per-label probabilities for both pools.
    br = _BinaryRelevance(self.br_base)
    br.train(self.X[label_index], self.y[label_index])
    trnf = br.predict_proba(labeled_pool)
    poolf = br.predict_proba(X_pool)
    f = poolf * 2 - 1

    # Sort each labeled instance's probabilities and normalize them; these ranked
    # profiles are the features for predicting how many labels are relevant.
    trnf = np.sort(trnf, axis=1)[:, ::-1]
    trnf /= np.tile(trnf.sum(axis=1).reshape(-1, 1), (1, trnf.shape[1]))
    if len(np.unique(self.y.sum(axis=1))) == 1:
        lr = DummyClf()
    else:
        lr = self.logistic_regression_
    lr.fit(trnf, self.y[label_index].sum(axis=1))

    # Predict the number of relevant labels for each pool instance from its
    # ranked, normalized probability profile.
    idx_poolf = np.argsort(poolf, axis=1)[:, ::-1]
    poolf = np.sort(poolf, axis=1)[:, ::-1]
    poolf /= np.tile(poolf.sum(axis=1).reshape(-1, 1), (1, poolf.shape[1]))
    pred_num_lbl = lr.predict(poolf).astype(int)

    # Guess a full label vector: the top-ranked labels are set to +1, the rest to -1.
    yhat = -1 * np.ones((len(X_pool), self.n_labels), dtype=int)
    for i, p in enumerate(pred_num_lbl):
        yhat[i, idx_poolf[i, :p]] = 1

    # Score each instance by the disagreement between the guessed labels and the
    # current decision values, and query one instance with the maximum score.
    score = ((1 - yhat * f) / 2).sum(axis=1)
    ask_id = self.random_state_.choice(np.where(score == np.max(score))[0])
    return unlabel_index[ask_id]
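# A self-contained toy version of the scoring step above (made-up numbers): given
# per-label probabilities for 2 pool instances over 4 labels and a predicted number
# of relevant labels per instance, build the {-1, +1} guess `yhat` from the
# top-ranked labels and score each instance by (1 - yhat * f) / 2 summed over
# labels, i.e. how strongly the guess disagrees with the current decision values.
# The probabilities and the predicted label counts are assumptions for the example.
import numpy as np

poolf_toy = np.array([[0.9, 0.2, 0.6, 0.1],
                      [0.5, 0.5, 0.5, 0.5]])
f_toy = poolf_toy * 2 - 1                          # map probabilities to [-1, 1]
pred_num_lbl_toy = np.array([2, 1])                # assumed output of the label-count model

rank_toy = np.argsort(poolf_toy, axis=1)[:, ::-1]  # labels ranked by probability
yhat_toy = -1 * np.ones(poolf_toy.shape, dtype=int)
for i, p in enumerate(pred_num_lbl_toy):
    yhat_toy[i, rank_toy[i, :p]] = 1               # top-p labels guessed positive

score_toy = ((1 - yhat_toy * f_toy) / 2).sum(axis=1)
print(score_toy)                                   # the instance with the max score would be queried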