Ejemplo n.º 1
0
def avg_score(classifier: OneVsRestClassifier,
              X_pool: modALinput,
              n_instances: int = 1,
              random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    AvgScore query strategy for multilabel classification.

    Each sample is scored by the mean, over classes, of
    ``predict_proba * (predict - 1/2)``; the scores are then handed to the
    selection helper.

    Reference: Esuli and Sebastiani, Active Learning Strategies for
    Multi-Label Text Classification
    (http://dx.doi.org/10.1007/978-3-642-00958-7_12)

    Args:
        classifier: The multilabel classifier for which the labels are to be queried.
        X_pool: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, ``shuffled_argmax`` is used instead of
            ``multi_argmax`` so that ties between equal scores are broken randomly.

    Returns:
        The indices of the instances from X_pool chosen to be labelled;
        the instances from X_pool chosen to be labelled.
    """
    probabilities = classifier.predict_proba(X_pool)
    # Shift the predictions by one half before weighting the probabilities.
    centred_predictions = classifier.predict(X_pool) - 1 / 2
    utility = np.mean(probabilities * centred_predictions, axis=1)

    select = shuffled_argmax if random_tie_break else multi_argmax
    query_idx = select(utility, n_instances)

    return query_idx, X_pool[query_idx]
Ejemplo n.º 2
0
def SVM_binary_minimum(classifier: ActiveLearner, X_pool: modALinput,
                       random_tie_break: bool = False) -> np.ndarray:
    """
    SVM binary minimum multilabel active learning strategy. For details see the paper
    Klaus Brinker, On Active Learning in Multi-label Classification
    (https://link.springer.com/chapter/10.1007%2F3-540-31314-1_24)

    For every sample the smallest absolute decision-function value over all
    per-label estimators is computed; the sample for which this value is
    lowest is selected.

    Args:
        classifier: The multilabel classifier for which the labels are to be queried. Must be an SVM model
            such as the ones from sklearn.svm. Its ``estimator.estimators_`` are iterated and each
            must provide ``decision_function``.
        X_pool: The pool of samples to query from.
        random_tie_break: If True, the selection is delegated to ``shuffled_argmax`` so that
            ties between equally small distances are broken randomly.

    Returns:
        The index of the instance from X_pool chosen to be labelled. Only the
        index is returned, not the instance itself.
    """

    decision_function = np.array([svm.decision_function(X_pool)
                                  for svm in classifier.estimator.estimators_]).T

    min_abs_dist = np.min(np.abs(decision_function), axis=1)

    if not random_tie_break:
        return np.argmin(min_abs_dist)

    # shuffled_argmax maximises its input, so negate the distances to keep
    # selecting the *smallest* distance, matching the np.argmin branch above.
    return shuffled_argmax(-min_abs_dist)
Ejemplo n.º 3
0
def mean_max_loss(
        classifier: OneVsRestClassifier,
        X_pool: modALinput,
        n_instances: int = 1,
        random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    Mean Max Loss query strategy for SVM multilabel classification.

    Reference: Li et al., Multilabel SVM active learning for image classification
    (http://dx.doi.org/10.1109/ICIP.2004.1421535)

    Args:
        classifier: The multilabel classifier for which the labels are to be queried. Should be an SVM
            model such as the ones from sklearn.svm. The function executes for other models too, but
            the calculations in Li et al. are only valid for SVMs.
        X_pool: The pool of samples to query from.
        n_instances: Number of samples to be queried; must not exceed ``len(X_pool)``.
        random_tie_break: If True, ``shuffled_argmax`` is used instead of
            ``multi_argmax`` so that ties between equal losses are broken randomly.

    Returns:
        The indices of the instances from X_pool chosen to be labelled;
        the instances from X_pool chosen to be labelled.

    Raises:
        AssertionError: If ``n_instances`` is larger than ``len(X_pool)``.
    """
    assert n_instances <= len(X_pool), 'n_instances cannot be larger than len(X_pool)'

    svm_loss = _SVM_loss(classifier, X_pool)

    select = shuffled_argmax if random_tie_break else multi_argmax
    query_idx = select(svm_loss, n_instances)

    return query_idx, X_pool[query_idx]
Ejemplo n.º 4
0
def max_disagreement_sampling(
        committee: BaseCommittee,
        X: modALinput,
        n_instances: int = 1,
        random_tie_break=False,
        **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Maximum disagreement sampling strategy.

    The disagreement of every sample is obtained from ``KL_max_disagreement``
    and the highest-scoring samples are selected.

    Args:
        committee: The committee for which the labels are to be queried.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, ``shuffled_argmax`` is used instead of
            ``multi_argmax`` so that ties between equal scores are broken randomly.
        **disagreement_measure_kwargs: Keyword arguments forwarded to the
            disagreement measure function.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    scores = KL_max_disagreement(committee, X, **disagreement_measure_kwargs)

    select = shuffled_argmax if random_tie_break else multi_argmax
    query_idx = select(scores, n_instances=n_instances)

    return query_idx, X[query_idx]
Ejemplo n.º 5
0
def max_std_sampling(regressor: BaseEstimator,
                     X: modALinput,
                     n_instances: int = 1,
                     random_tie_break=False,
                     **predict_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Regressor standard deviation sampling strategy.

    Queries the samples for which the regressor reports the largest
    predictive standard deviation.

    Args:
        regressor: The regressor for which the labels are to be queried. Its
            ``predict`` must accept ``return_std=True``.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, ``shuffled_argmax`` is used instead of
            ``multi_argmax`` so that ties between equal scores are broken randomly.
        **predict_kwargs: Keyword arguments forwarded to ``regressor.predict``.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    # Only the standard deviation is needed; the mean prediction is discarded.
    prediction = regressor.predict(X, return_std=True, **predict_kwargs)
    deviation = prediction[1]
    # Flatten to one value per sample.
    deviation = deviation.reshape(X.shape[0])

    select = shuffled_argmax if random_tie_break else multi_argmax
    query_idx = select(deviation, n_instances=n_instances)

    return query_idx, X[query_idx]
Ejemplo n.º 6
0
def entropy_sampling(classifier: BaseEstimator,
                     X: modALinput,
                     n_instances: int = 1,
                     random_tie_break: bool = False,
                     **uncertainty_measure_kwargs) -> np.ndarray:
    """
    Entropy sampling query strategy. Selects the instances where the class
    probabilities have the largest entropy.

    Args:
        classifier: The classifier for which the labels are to be queried.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, ``shuffled_argmax`` is used instead of
            ``multi_argmax`` so that ties between equal scores are broken randomly.
        **uncertainty_measure_kwargs: Keyword arguments forwarded to the
            uncertainty measure function.

    Returns:
        The result of the selection helper applied to the entropy scores
        (annotated as the indices of the chosen instances). The instances
        themselves are not returned by this function.
    """
    scores = classifier_entropy(classifier, X, **uncertainty_measure_kwargs)

    select = shuffled_argmax if random_tie_break else multi_argmax
    return select(scores, n_instances=n_instances)
Ejemplo n.º 7
0
def margin_sampling(classifier: BaseEstimator,
                    X: modALinput,
                    n_instances: int = 1,
                    random_tie_break: bool = False,
                    **uncertainty_measure_kwargs) -> np.ndarray:
    """
    Margin sampling query strategy. Selects the instances where the difference
    between the first most likely and second most likely classes is the smallest.

    Args:
        classifier: The classifier for which the labels are to be queried.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, ``shuffled_argmax`` is used instead of
            ``multi_argmax`` so that ties between equal scores are broken randomly.
        **uncertainty_measure_kwargs: Keyword arguments forwarded to the
            uncertainty measure function.

    Returns:
        The result of the selection helper applied to the negated margins
        (annotated as the indices of the chosen instances). The instances
        themselves are not returned by this function.
    """
    # The helpers pick the largest values, so negate to prefer small margins.
    negated_margin = -classifier_margin(classifier, X, **uncertainty_measure_kwargs)

    select = shuffled_argmax if random_tie_break else multi_argmax
    return select(negated_margin, n_instances=n_instances)
Ejemplo n.º 8
0
def uncertainty_sampling(
        classifier: BaseEstimator,
        X: modALinput,
        n_instances: int = 1,
        random_tie_break: bool = False,
        **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Uncertainty sampling query strategy. Selects the least sure instances for labelling.

    Args:
        classifier: The classifier for which the labels are to be queried.
        X: The pool of samples to query from. Objects exposing ``.iloc``
            (e.g. pandas DataFrames) are indexed positionally through it;
            anything else is indexed directly with ``X[query_idx]``.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, shuffles utility scores to randomize the order. This
            can be used to break the tie when the highest utility score is not unique.
        **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty
            measure function.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    uncertainty = classifier_uncertainty(classifier, X,
                                         **uncertainty_measure_kwargs)

    if not random_tie_break:
        query_idx = multi_argmax(uncertainty, n_instances=n_instances)
    else:
        query_idx = shuffled_argmax(uncertainty, n_instances=n_instances)

    # Positional indexing for pandas-like pools; plain indexing otherwise so
    # that pools without an ``iloc`` attribute no longer raise AttributeError.
    if hasattr(X, 'iloc'):
        return query_idx, X.iloc[query_idx]
    return query_idx, X[query_idx]
Ejemplo n.º 9
0
    def _query(self, X, pool_idx, n_instances=1, proba=None):
        """Pick the pool samples with the largest value in column 1 of ``proba``.

        ``proba`` is restricted to the rows listed in ``pool_idx`` before
        selection. Returns the chosen indices (as entries of ``pool_idx``)
        and the matching rows of ``X``.
        """
        pool_proba = proba[pool_idx]
        select = shuffled_argmax if self.random_tie_break else multi_argmax
        # Selection is relative to the pool, so translate back through pool_idx.
        chosen = pool_idx[select(pool_proba[:, 1], n_instances=n_instances)]

        return chosen, X[chosen]
    def _query(self, X, pool_idx, n_instances=1, proba=None):
        """Pick the pool samples whose highest class probability is lowest.

        The score of a sample is ``1 - max(proba)`` over its row, computed
        only for the rows listed in ``pool_idx``. Returns the chosen indices
        (as entries of ``pool_idx``) and the matching rows of ``X``.
        """
        pool_proba = proba[pool_idx]
        scores = 1 - np.max(pool_proba, axis=1)
        select = shuffled_argmax if self.random_tie_break else multi_argmax
        # Selection is relative to the pool, so translate back through pool_idx.
        chosen = pool_idx[select(scores, n_instances=n_instances)]

        return chosen, X[chosen]
def max_sampling(classifier: BaseEstimator,
                 X: modALinput,
                 n_instances: int = 1,
                 random_tie_break: bool = False,
                 pool_idx=None,
                 query_kwargs=None,
                 **kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Maximum sampling query strategy.
    Selects the samples with the highest prediction probability.

    Parameters
    ----------
    classifier: BaseEstimator
        The classifier for which the labels are to be queried.
    X: modALinput
        The pool of samples to query from.
    n_instances: int
        Number of samples to be queried.
    random_tie_break: bool
        If True, shuffles utility scores to randomize the order.
        This can be used to break the tie when the highest
        utility score is not unique.
    pool_idx: array-like, optional
        Indices into X that may be queried. Defaults to every row of X.
    query_kwargs: dict, optional
        Shared state between query strategies. ``'pred_proba'`` is read as a
        cache (and refreshed when its length does not match X), and the
        selected indices are recorded under ``'current_queries'`` with the
        value ``"max"``. A fresh dictionary is used when omitted.
    **kwargs:
        Keyword arguments to be passed for
        the prediction measure function.

    Returns
    -------
    np.ndarray, modALinput
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    # A mutable default would leak the cached probabilities of one call into
    # every later call that omits query_kwargs.
    if query_kwargs is None:
        query_kwargs = {}

    n_samples = X.shape[0]
    if pool_idx is None:
        pool_idx = np.arange(n_samples)
    else:
        pool_idx = np.asarray(pool_idx)

    # First attempt to get the probabilities from the dictionary.
    proba = query_kwargs.get('pred_proba', [])
    if len(proba) != n_samples:
        try:
            proba = classifier.predict_proba(X, **kwargs)
        except NotFittedError:
            proba = np.ones(shape=(n_samples, ))
        query_kwargs['pred_proba'] = proba

    proba = np.asarray(proba)[pool_idx]
    # The cold-start fallback above is one-dimensional; use it directly as the
    # score instead of failing on the column lookup.
    scores = proba if proba.ndim == 1 else proba[:, 1]

    if not random_tie_break:
        query_idx = multi_argmax(scores, n_instances=n_instances)
    else:
        query_idx = shuffled_argmax(scores, n_instances=n_instances)

    # Selection is relative to the pool; translate back to indices into X.
    selected = pool_idx[query_idx]

    current_queries = query_kwargs.setdefault('current_queries', {})
    for idx in selected:
        current_queries[idx] = "max"

    return selected, X[selected]
Ejemplo n.º 12
0
def custom_sampling(classifier, X_pool):
    """
    Uncertainty sampling restricted to the more "popular" half of the pool.

    Only rows whose value in column 3 is strictly above that column's median
    are considered. Among those, one row with the highest uncertainty
    (``1 - max(predict_proba)``) is selected with random tie breaking.

    Args:
        classifier: Estimator providing ``predict_proba``. If it raises
            ``NotFittedError`` every candidate is given uncertainty 1.
        X_pool: Two-dimensional array of samples with at least four columns.

    Returns:
        The index (into the ``X_pool`` that was passed in) of the selected
        row; the selected row itself.
    """
    popularity_colidx = 3
    popularity = X_pool[:, popularity_colidx]
    # Keep the positions of the candidates so the result can be reported in
    # terms of the caller's pool rather than the filtered one.
    candidate_idx = np.flatnonzero(popularity > np.median(popularity))
    candidates = X_pool[candidate_idx]

    try:
        classwise_proba = classifier.predict_proba(candidates)
    except NotFittedError:
        # Cold start: treat every candidate as maximally uncertain and still
        # return an (index, instance) pair like the fitted path does.
        uncertainty = np.ones(shape=(candidates.shape[0], ))
    else:
        uncertainty = 1 - np.max(classwise_proba, axis=1)

    query_idx = candidate_idx[shuffled_argmax(uncertainty, n_instances=1)]
    return query_idx, X_pool[query_idx]
Ejemplo n.º 13
0
def uncertainty_sampling(
        classifier: BaseEstimator,
        X: modALinput,
        n_instances: int = 1,
        random_tie_break: bool = False,
        pool_idx=None,
        query_kwargs=None,
        **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Uncertainty sampling query strategy.
    Selects the least sure instances for labelling.

    Parameters
    ----------
    classifier: BaseEstimator
        The classifier for which the labels are to be queried.
    X: modALinput
        The pool of samples to query from.
    n_instances: int
        Number of samples to be queried.
    random_tie_break: bool
        If True, shuffles utility scores to randomize the order.
        This can be used to break the tie when the highest
        utility score is not unique.
    pool_idx: array-like, optional
        Indices into X that may be queried. Defaults to every row of X.
    query_kwargs: dict, optional
        Shared state passed on to ``classifier_uncertainty``; its
        ``'pred_proba'`` entry is reset to an empty list before the call.
        A fresh dictionary is used when omitted.
    **uncertainty_measure_kwargs:
        Keyword arguments to be passed for
        the uncertainty measure function.

    Returns
    -------
    np.ndarray, modALinput
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    # Avoid a shared mutable default that would be mutated on every call.
    if query_kwargs is None:
        query_kwargs = {}

    n_samples = X.shape[0]
    if pool_idx is None:
        pool_idx = np.arange(n_samples)
    else:
        pool_idx = np.asarray(pool_idx)
    query_kwargs['pred_proba'] = []

    uncertainty = classifier_uncertainty(classifier, X[pool_idx], query_kwargs,
                                         **uncertainty_measure_kwargs)

    if not random_tie_break:
        query_idx = multi_argmax(uncertainty, n_instances=n_instances)
    else:
        query_idx = shuffled_argmax(uncertainty, n_instances=n_instances)

    # The scores were computed on X[pool_idx], so the selection is relative to
    # the pool; translate it back to indices into X before returning.
    query_idx = pool_idx[query_idx]

    return query_idx, X[query_idx]
Ejemplo n.º 14
0
def vote_entropy_sampling(
        committee: BaseCommittee,
        X: modALinput,
        n_instances: int = 1,
        random_tie_break=False,
        **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Vote entropy sampling strategy that also records a version-size count.

    The number of samples with non-zero vote entropy is appended to
    ``committee.version_sizes`` on every call, then the samples with the
    highest vote entropy are selected.

    Args:
        committee: The committee for which the labels are to be queried. Must
            expose a ``version_sizes`` list.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, ``shuffled_argmax`` is used instead of
            ``multi_argmax`` so that ties between equal scores are broken randomly.
        **disagreement_measure_kwargs: Keyword arguments forwarded to ``vote_entropy``.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    entropy = vote_entropy(committee, X, **disagreement_measure_kwargs)

    # Count the samples on which the committee members do not fully agree.
    n_disputed = sum(1 for value in entropy if value != 0)
    committee.version_sizes.append(n_disputed)

    select = shuffled_argmax if random_tie_break else multi_argmax
    query_idx = select(entropy, n_instances=n_instances)

    return query_idx, X[query_idx]
def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = 'binary',
                             p_subsample: float = 1.0, n_instances: int = 1,
                             random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    Expected error reduction query strategy.

    For every candidate sample and every label seen in the training data, the
    estimator is refitted with the candidate added under that label, and the
    resulting loss over X is weighted by the current predicted probability of
    that label. The samples with the lowest expected error are selected.

    References:
        Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf)

    Args:
        learner: The ActiveLearner object for which the expected error
            is to be estimated.
        X: The samples.
        loss: The loss function to be used. Can be 'binary' or 'log'.
        p_subsample: Probability of keeping a sample from the pool when
            calculating expected error. Significantly improves runtime
            for large sample pools.
        n_instances: The number of instances to be sampled.
        random_tie_break: If True, shuffles utility scores to randomize the order. This
            can be used to break the tie when the highest utility score is not unique.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled. If the learner is not
        fitted yet, ``(0, X[0])`` is returned instead.

    Raises:
        AssertionError: If ``p_subsample`` is outside [0.0, 1.0] or ``loss``
            is not one of the supported names.
    """

    assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'
    assert loss in ['binary', 'log'], 'loss must be \'binary\' or \'log\''

    expected_error = np.zeros(shape=(len(X), ))
    possible_labels = np.unique(learner.y_training)

    try:
        X_proba = learner.predict_proba(X)
    except NotFittedError:
        # TODO: implement a proper cold-start
        return 0, X[0]

    cloned_estimator = clone(learner.estimator)

    # Resolve the loss once. Comparing with == (not identity) and keeping the
    # `loss` name untouched ensures the same measure is applied on every
    # refit instead of reusing the values computed on the first one.
    loss_measure = _proba_uncertainty if loss == 'binary' else _proba_entropy

    for x_idx, x in enumerate(X):
        # subsample the data if needed
        if np.random.rand() <= p_subsample:
            # The augmented feature matrix does not depend on the label.
            X_new = data_vstack((learner.X_training, x.reshape(1, -1)))

            # estimate the expected error
            # NOTE(review): assumes the columns of X_proba follow the order of
            # np.unique(learner.y_training) -- confirm against the estimator.
            for y_idx, y in enumerate(possible_labels):
                y_new = data_vstack((learner.y_training, np.array(y).reshape(1, )))

                cloned_estimator.fit(X_new, y_new)
                refitted_proba = cloned_estimator.predict_proba(X)
                refitted_loss = loss_measure(refitted_proba)

                expected_error[x_idx] += np.sum(refitted_loss)*X_proba[x_idx, y_idx]

        else:
            # Skipped samples get an infinite error so they are never preferred.
            expected_error[x_idx] = np.inf

    # The selection helpers maximise, so negate to pick the lowest expected
    # error (and to push the skipped, infinite-error samples to the end).
    if not random_tie_break:
        query_idx = multi_argmax(-expected_error, n_instances)
    else:
        query_idx = shuffled_argmax(-expected_error, n_instances)

    return query_idx, X[query_idx]