def avg_score(classifier: OneVsRestClassifier,
              X_pool: modALinput,
              n_instances: int = 1,
              random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    AvgScore query strategy for multilabel classification.

    Reference: Esuli and Sebastiani, Active Learning Strategies for Multi-Label
    Text Classification (http://dx.doi.org/10.1007/978-3-642-00958-7_12)

    Args:
        classifier: The multilabel classifier for which the labels are to be queried.
            Its ``predict_proba`` and ``predict`` methods are both called on ``X_pool``.
        X_pool: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, the selection is delegated to ``shuffled_argmax``
            instead of ``multi_argmax`` so that ties between equal scores are broken
            randomly.

    Returns:
        The indices of the instances from X_pool chosen to be labelled;
        the instances from X_pool chosen to be labelled.
    """
    probabilities = classifier.predict_proba(X_pool)
    predictions = classifier.predict(X_pool)

    # Shifting the predictions by 1/2 turns a 0/1 label into -1/2 / +1/2, which is
    # then weighted by the per-class confidence and averaged over the classes.
    mean_scores = np.mean(probabilities * (predictions - 1 / 2), axis=1)

    select = shuffled_argmax if random_tie_break else multi_argmax
    query_idx = select(mean_scores, n_instances)

    return query_idx, X_pool[query_idx]
def SVM_binary_minimum(classifier: ActiveLearner, X_pool: modALinput,
                       random_tie_break: bool = False) -> np.ndarray:
    """
    SVM binary minimum multilabel active learning strategy.

    For details see the paper Klaus Brinker, On Active Learning in Multi-label
    Classification (https://link.springer.com/chapter/10.1007%2F3-540-31314-1_24)

    The sample whose smallest absolute decision-function value (over all binary
    estimators) is the lowest is selected.

    Args:
        classifier: The multilabel classifier for which the labels are to be queried.
            ``classifier.estimator.estimators_`` must be an iterable of fitted models
            exposing ``decision_function`` (e.g. SVMs from sklearn.svm).
        X_pool: The pool of samples to query from.
        random_tie_break: If True, the selection is delegated to ``shuffled_argmax``
            so that ties between equally small distances are broken randomly.

    Returns:
        The index of the instance from X_pool chosen to be labelled. Without tie
        breaking this is the result of ``np.argmin``; with tie breaking it is
        whatever ``shuffled_argmax`` returns for a single instance.
    """
    # One row per sample, one column per binary estimator.
    decision_function = np.array([svm.decision_function(X_pool)
                                  for svm in classifier.estimator.estimators_]).T
    min_abs_dist = np.min(np.abs(decision_function), axis=1)

    if not random_tie_break:
        return np.argmin(min_abs_dist)

    # shuffled_argmax picks the LARGEST value, while this strategy wants the
    # smallest distance; negate so both branches select the same kind of sample.
    return shuffled_argmax(-min_abs_dist)
def mean_max_loss(classifier: OneVsRestClassifier,
                  X_pool: modALinput,
                  n_instances: int = 1,
                  random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    Mean Max Loss query strategy for SVM multilabel classification.

    Reference: Li et al., Multilabel SVM active learning for image classification
    (http://dx.doi.org/10.1109/ICIP.2004.1421535)

    Args:
        classifier: The multilabel classifier for which the labels are to be queried.
            Should be an SVM model such as the ones from sklearn.svm. Although the
            function will execute for other models as well, the mathematical
            calculations in Li et al. work only for SVM-s.
        X_pool: The pool of samples to query from.
        n_instances: Number of samples to be queried. Must not exceed ``len(X_pool)``.
        random_tie_break: If True, the selection is delegated to ``shuffled_argmax``
            instead of ``multi_argmax`` so that ties between equal scores are broken
            randomly.

    Returns:
        The indices of the instances from X_pool chosen to be labelled;
        the instances from X_pool chosen to be labelled.

    Raises:
        AssertionError: If ``n_instances`` is larger than ``len(X_pool)``.
    """
    assert n_instances <= len(X_pool), 'n_instances cannot be larger than len(X_pool)'

    svm_loss = _SVM_loss(classifier, X_pool)

    if random_tie_break:
        query_idx = shuffled_argmax(svm_loss, n_instances)
    else:
        query_idx = multi_argmax(svm_loss, n_instances)

    return query_idx, X_pool[query_idx]
def max_disagreement_sampling(committee: BaseCommittee,
                              X: modALinput,
                              n_instances: int = 1,
                              random_tie_break=False,
                              **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Maximum disagreement sampling strategy.

    Scores every sample with ``KL_max_disagreement`` and picks the highest-scoring ones.

    Args:
        committee: The committee for which the labels are to be queried.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, the selection is delegated to ``shuffled_argmax``
            instead of ``multi_argmax`` so that ties between equal scores are broken
            randomly.
        **disagreement_measure_kwargs: Keyword arguments forwarded to
            ``KL_max_disagreement``.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    scores = KL_max_disagreement(committee, X, **disagreement_measure_kwargs)

    if random_tie_break:
        query_idx = shuffled_argmax(scores, n_instances=n_instances)
    else:
        query_idx = multi_argmax(scores, n_instances=n_instances)

    return query_idx, X[query_idx]
def max_std_sampling(regressor: BaseEstimator,
                     X: modALinput,
                     n_instances: int = 1,
                     random_tie_break=False,
                     **predict_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Regressor standard deviation sampling strategy.

    Args:
        regressor: The regressor for which the labels are to be queried. Its
            ``predict`` must accept ``return_std=True`` and return a
            ``(prediction, std)`` pair.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, the selection is delegated to ``shuffled_argmax``
            instead of ``multi_argmax`` so that ties between equal scores are broken
            randomly.
        **predict_kwargs: Keyword arguments forwarded to ``regressor.predict``.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    # Only the standard deviation is needed; the prediction itself is discarded.
    prediction_and_std = regressor.predict(X, return_std=True, **predict_kwargs)
    _, std = prediction_and_std

    # Flatten to one value per sample so it can be ranked directly.
    flat_std = std.reshape(X.shape[0])

    if random_tie_break:
        query_idx = shuffled_argmax(flat_std, n_instances=n_instances)
    else:
        query_idx = multi_argmax(flat_std, n_instances=n_instances)

    return query_idx, X[query_idx]
def entropy_sampling(classifier: BaseEstimator,
                     X: modALinput,
                     n_instances: int = 1,
                     random_tie_break: bool = False,
                     **uncertainty_measure_kwargs) -> np.ndarray:
    """
    Entropy sampling query strategy. Selects the instances where the class
    probabilities have the largest entropy.

    Args:
        classifier: The classifier for which the labels are to be queried.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, the selection is delegated to ``shuffled_argmax``
            instead of ``multi_argmax`` so that ties between equal scores are broken
            randomly.
        **uncertainty_measure_kwargs: Keyword arguments forwarded to
            ``classifier_entropy``.

    Returns:
        The result of the selection helper for the entropy scores, i.e. the indices
        of the instances from X chosen to be labelled. Unlike some sibling
        strategies, the instances themselves are not returned.
    """
    entropy_scores = classifier_entropy(classifier, X, **uncertainty_measure_kwargs)

    if random_tie_break:
        return shuffled_argmax(entropy_scores, n_instances=n_instances)

    return multi_argmax(entropy_scores, n_instances=n_instances)
def margin_sampling(classifier: BaseEstimator,
                    X: modALinput,
                    n_instances: int = 1,
                    random_tie_break: bool = False,
                    **uncertainty_measure_kwargs) -> np.ndarray:
    """
    Margin sampling query strategy. Selects the instances where the difference
    between the first most likely and second most likely classes are the smallest.

    Args:
        classifier: The classifier for which the labels are to be queried.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, the selection is delegated to ``shuffled_argmax``
            instead of ``multi_argmax`` so that ties between equal scores are broken
            randomly.
        **uncertainty_measure_kwargs: Keyword arguments forwarded to
            ``classifier_margin``.

    Returns:
        The result of the selection helper for the negated margins, i.e. the indices
        of the instances from X chosen to be labelled. Unlike some sibling
        strategies, the instances themselves are not returned.
    """
    # The helpers pick the largest values, so the margin is negated to make the
    # smallest margins rank highest.
    negated_margin = -classifier_margin(classifier, X, **uncertainty_measure_kwargs)

    if random_tie_break:
        return shuffled_argmax(negated_margin, n_instances=n_instances)

    return multi_argmax(negated_margin, n_instances=n_instances)
def uncertainty_sampling(classifier: BaseEstimator,
                         X: modALinput,
                         n_instances: int = 1,
                         random_tie_break: bool = False,
                         **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Uncertainty sampling query strategy. Selects the least sure instances for labelling.

    Args:
        classifier: The classifier for which the labels are to be queried.
        X: The pool of samples to query from. May be a pandas object (rows are then
            selected positionally through ``.iloc``) or any container that supports
            indexing with the selected indices, such as a numpy array.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, the selection is delegated to ``shuffled_argmax``
            instead of ``multi_argmax`` so that ties between equal scores are broken
            randomly.
        **uncertainty_measure_kwargs: Keyword arguments forwarded to
            ``classifier_uncertainty``.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    uncertainty = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs)

    if not random_tie_break:
        query_idx = multi_argmax(uncertainty, n_instances=n_instances)
    else:
        query_idx = shuffled_argmax(uncertainty, n_instances=n_instances)

    # The selected indices are positions, so pandas inputs must go through .iloc;
    # plain arrays have no .iloc and are indexed directly, as the other query
    # strategies in this file do.
    if hasattr(X, 'iloc'):
        query_instances = X.iloc[query_idx]
    else:
        query_instances = X[query_idx]

    return query_idx, query_instances
def _query(self, X, pool_idx, n_instances=1, proba=None):
    """
    Pick the pool samples with the highest value in column 1 of ``proba``.

    Args:
        X: The full set of samples; indexed with the selected pool indices.
        pool_idx: Indices into ``X`` / ``proba`` that are eligible for querying.
        n_instances: Number of samples to be queried.
        proba: Per-sample probabilities with at least two columns. Although the
            default is None, a value must be supplied since it is indexed
            unconditionally.

    Returns:
        The selected entries of ``pool_idx``; the corresponding rows of ``X``.
    """
    # presumably column 1 is the positive class of a binary problem - confirm
    column_one = proba[pool_idx][:, 1]

    if self.random_tie_break:
        query_idx = shuffled_argmax(column_one, n_instances=n_instances)
    else:
        query_idx = multi_argmax(column_one, n_instances=n_instances)

    # query_idx is relative to the pool subset; translate back to indices into X.
    return pool_idx[query_idx], X[pool_idx[query_idx]]
def _query(self, X, pool_idx, n_instances=1, proba=None):
    """
    Pick the pool samples whose largest class probability is the lowest.

    Args:
        X: The full set of samples; indexed with the selected pool indices.
        pool_idx: Indices into ``X`` / ``proba`` that are eligible for querying.
        n_instances: Number of samples to be queried.
        proba: Per-sample class probabilities. Although the default is None, a
            value must be supplied since it is indexed unconditionally.

    Returns:
        The selected entries of ``pool_idx``; the corresponding rows of ``X``.
    """
    pool_proba = proba[pool_idx]
    # Uncertainty is one minus the highest class probability of each sample.
    scores = 1 - np.max(pool_proba, axis=1)

    if self.random_tie_break:
        query_idx = shuffled_argmax(scores, n_instances=n_instances)
    else:
        query_idx = multi_argmax(scores, n_instances=n_instances)

    # query_idx is relative to the pool subset; translate back to indices into X.
    return pool_idx[query_idx], X[pool_idx[query_idx]]
def max_sampling(classifier: BaseEstimator,
                 X: modALinput,
                 n_instances: int = 1,
                 random_tie_break: bool = False,
                 pool_idx=None,
                 query_kwargs=None,
                 **kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Maximum sampling query strategy.
    Selects the samples with the highest prediction probability.

    Parameters
    ----------
    classifier: BaseEstimator
        The classifier for which the labels are to be queried.
    X: modALinput
        The pool of samples to query from.
    n_instances: int
        Number of samples to be queried.
    random_tie_break: bool
        If True, shuffles utility scores to randomize the order.
        This can be used to break the tie when the highest
        utility score is not unique.
    pool_idx:
        Indices into ``X`` that are eligible for querying. Defaults to all rows.
    query_kwargs: dict, optional
        Shared state between query strategies. ``'pred_proba'`` is used as a cache
        for the predicted probabilities of ``X`` (and is written back when it has
        to be recomputed). If ``'current_queries'`` is present, the entry of every
        selected sample is set to ``"max"``. A fresh dictionary is used when
        omitted, so no state leaks between calls.
    **kwargs:
        Keyword arguments to be passed to ``classifier.predict_proba``.

    Returns
    -------
    np.ndarray, modALinput
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    # A mutable default would be shared (and mutated) across calls, so stale
    # probabilities from a previous call could be reused.
    if query_kwargs is None:
        query_kwargs = {}

    n_samples = X.shape[0]
    if pool_idx is None:
        pool_idx = np.arange(n_samples)

    # First attempt to get the probabilities from the dictionary.
    proba = query_kwargs.get('pred_proba', [])
    if len(proba) != n_samples:
        try:
            proba = classifier.predict_proba(X, **kwargs)
        except NotFittedError:
            proba = np.ones(shape=(n_samples, ))
        query_kwargs['pred_proba'] = proba

    pool_proba = proba[pool_idx]
    # The not-fitted fallback is one-dimensional and has no column 1; use it as is
    # so that an unfitted classifier yields a query instead of an IndexError.
    scores = pool_proba[:, 1] if pool_proba.ndim > 1 else pool_proba

    if not random_tie_break:
        query_idx = multi_argmax(scores, n_instances=n_instances)
    else:
        query_idx = shuffled_argmax(scores, n_instances=n_instances)

    # Bookkeeping is optional: only record the strategy when the caller tracks it.
    current_queries = query_kwargs.get('current_queries')
    if current_queries is not None:
        for idx in query_idx:
            current_queries[pool_idx[idx]] = "max"

    # query_idx is relative to the pool subset; translate back to indices into X.
    return pool_idx[query_idx], X[pool_idx[query_idx]]
def custom_sampling(classifier, X_pool):
    """
    Uncertainty sampling restricted to samples with above-median popularity.

    Only the rows of ``X_pool`` whose value in column 3 is strictly greater than
    the median of that column are considered. Among those, the sample with the
    lowest maximum class probability is selected, with ties broken randomly by
    ``shuffled_argmax``.

    Args:
        classifier: The classifier for which the labels are to be queried.
        X_pool: Two-dimensional array of samples to query from; column 3 is
            read as the popularity feature.

    Returns:
        The index (relative to the ``X_pool`` passed in) of the instance chosen to
        be labelled; the chosen instance. When the classifier is not fitted yet,
        every candidate is treated as equally uncertain and one is picked at random.
    """
    popularity_colidx = 3
    popularity = X_pool[:, popularity_colidx]
    keep_mask = popularity > np.median(popularity)

    # Remember where the candidates sit in the caller's pool so the returned
    # index can be used against the original X_pool.
    kept_idx = np.flatnonzero(keep_mask)
    candidates = X_pool[keep_mask]

    try:
        classwise_proba = classifier.predict_proba(candidates)
    except NotFittedError:
        # No model yet: fall back to uniform uncertainty, but still return an
        # (index, instance) pair like the fitted path does.
        uncertainty = np.ones(shape=(candidates.shape[0], ))
    else:
        uncertainty = 1 - np.max(classwise_proba, axis=1)

    query_idx = shuffled_argmax(uncertainty, n_instances=1)

    return kept_idx[query_idx], candidates[query_idx]
def uncertainty_sampling(classifier: BaseEstimator,
                         X: modALinput,
                         n_instances: int = 1,
                         random_tie_break: bool = False,
                         pool_idx=None,
                         query_kwargs=None,
                         **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Uncertainty sampling query strategy.
    Selects the least sure instances for labelling.

    Parameters
    ----------
    classifier: BaseEstimator
        The classifier for which the labels are to be queried.
    X: modALinput
        The pool of samples to query from.
    n_instances: int
        Number of samples to be queried.
    random_tie_break: bool
        If True, shuffles utility scores to randomize the order.
        This can be used to break the tie when the highest
        utility score is not unique.
    pool_idx:
        Indices into ``X`` that are eligible for querying. Defaults to all rows.
    query_kwargs: dict, optional
        Shared state between query strategies; its ``'pred_proba'`` entry is reset
        to an empty list before it is handed to ``classifier_uncertainty``. A fresh
        dictionary is used when omitted, so no state leaks between calls.
    **uncertainty_measure_kwargs:
        Keyword arguments to be passed for the uncertainty
        measure function.

    Returns
    -------
    np.ndarray, modALinput
        The indices (into ``X``) of the instances chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    # A mutable default would be shared (and mutated) across calls.
    if query_kwargs is None:
        query_kwargs = {}

    n_samples = X.shape[0]
    if pool_idx is None:
        pool_idx = np.arange(n_samples)
    else:
        # Array form is needed to index pool_idx with the selected positions.
        pool_idx = np.asarray(pool_idx)

    query_kwargs['pred_proba'] = []
    uncertainty = classifier_uncertainty(classifier, X[pool_idx], query_kwargs,
                                         **uncertainty_measure_kwargs)

    if not random_tie_break:
        query_idx = multi_argmax(uncertainty, n_instances=n_instances)
    else:
        query_idx = shuffled_argmax(uncertainty, n_instances=n_instances)

    # The scores were computed on X[pool_idx], so query_idx is relative to the
    # pool subset; translate back to indices into X before returning.
    selected = pool_idx[query_idx]
    return selected, X[selected]
def vote_entropy_sampling(committee: BaseCommittee,
                          X: modALinput,
                          n_instances: int = 1,
                          random_tie_break=False,
                          **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Vote entropy sampling strategy that also records a version-size estimate.

    Every sample is scored with ``vote_entropy``. The number of samples with a
    non-zero score is appended to ``committee.version_sizes`` before the
    highest-scoring samples are selected.

    Args:
        committee: The committee for which the labels are to be queried. Must
            expose a ``version_sizes`` list, which is appended to on every call.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, the selection is delegated to ``shuffled_argmax``
            instead of ``multi_argmax`` so that ties between equal scores are broken
            randomly.
        **disagreement_measure_kwargs: Keyword arguments forwarded to ``vote_entropy``.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    scores = vote_entropy(committee, X, **disagreement_measure_kwargs)

    # Count the samples on which the committee members do not fully agree.
    contested = sum(1 for score in scores if score != 0)
    committee.version_sizes.append(contested)

    if random_tie_break:
        query_idx = shuffled_argmax(scores, n_instances=n_instances)
    else:
        query_idx = multi_argmax(scores, n_instances=n_instances)

    return query_idx, X[query_idx]
def expected_error_reduction(learner: ActiveLearner,
                             X: modALinput,
                             loss: str = 'binary',
                             p_subsample: float = 1.0,
                             n_instances: int = 1,
                             random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    Expected error reduction query strategy.

    References:
        Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf)

    Args:
        learner: The ActiveLearner object for which the expected error
            is to be estimated.
        X: The samples.
        loss: The loss function to be used. Can be 'binary' or 'log'.
        p_subsample: Probability of keeping a sample from the pool when
            calculating expected error. Significantly improves runtime
            for large sample pools.
        n_instances: The number of instances to be sampled.
        random_tie_break: If True, shuffles utility scores to randomize the order.
            This can be used to break the tie when the highest utility score
            is not unique.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled. If the learner is not fitted
        yet, ``0, X[0]`` is returned.

    Raises:
        AssertionError: If ``p_subsample`` is outside [0, 1] or ``loss`` is neither
            'binary' nor 'log'.
    """
    assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'
    assert loss in ['binary', 'log'], 'loss must be \'binary\' or \'log\''

    expected_error = np.zeros(shape=(len(X), ))
    possible_labels = np.unique(learner.y_training)

    try:
        X_proba = learner.predict_proba(X)
    except NotFittedError:
        # TODO: implement a proper cold-start
        return 0, X[0]

    cloned_estimator = clone(learner.estimator)

    # Resolve the loss once, by value. The `loss` argument must stay a string for
    # the whole loop, so the per-iteration result gets its own name below.
    loss_function = _proba_uncertainty if loss == 'binary' else _proba_entropy

    for x_idx, x in enumerate(X):
        # subsample the data if needed
        if np.random.rand() <= p_subsample:
            # The augmented feature matrix does not depend on the candidate label.
            X_new = data_vstack((learner.X_training, x.reshape(1, -1)))

            # estimate the expected error
            for y_idx, y in enumerate(possible_labels):
                y_new = data_vstack((learner.y_training, np.array(y).reshape(1, )))

                cloned_estimator.fit(X_new, y_new)
                refitted_proba = cloned_estimator.predict_proba(X)
                refitted_loss = loss_function(refitted_proba)

                # Weight the pool-wide loss by the probability of this label.
                expected_error[x_idx] += np.sum(refitted_loss) * X_proba[x_idx, y_idx]
        else:
            # Skipped samples get the worst possible error so they are chosen last.
            expected_error[x_idx] = np.inf

    # The goal is the LOWEST expected error, while the selection helpers pick the
    # largest values; negate so that skipped (inf) samples rank last.
    if not random_tie_break:
        query_idx = multi_argmax(-expected_error, n_instances)
    else:
        query_idx = shuffled_argmax(-expected_error, n_instances)

    return query_idx, X[query_idx]