Example #1
def classifier_margin(classifier: BaseEstimator, X: modALinput, **predict_proba_kwargs) -> np.ndarray:
    """
    Classification margin uncertainty of the classifier for the provided samples. This uncertainty measure takes the
    first and second most likely predictions and computes the difference of their probabilities, which is the margin.

    Args:
        classifier: The classifier for which the prediction margin is to be measured.
        X: The samples for which the prediction margin of classification is to be measured.
        **predict_proba_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier.

    Returns:
        Margin uncertainty, which is the difference of the probabilities of first and second most likely predictions.
    """
    try:
        classwise_uncertainty = classifier.predict_proba(X, **predict_proba_kwargs)
    except NotFittedError:
        return np.zeros(shape=(X.shape[0], ))

    if classwise_uncertainty.shape[1] == 1:
        return np.zeros(shape=(classwise_uncertainty.shape[0],))

    part = np.partition(-classwise_uncertainty, 1, axis=1)
    margin = - part[:, 0] + part[:, 1]

    return margin
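
For illustration, a minimal standalone sketch (with made-up probabilities, not taken from any of the examples) of how the np.partition trick in Example #1 recovers the gap between the two most likely classes:

import numpy as np

# Hypothetical class probabilities for three samples (each row sums to 1)
proba = np.array([[0.10, 0.70, 0.20],
                  [0.40, 0.40, 0.20],
                  [0.05, 0.05, 0.90]])

# After partitioning the negated probabilities, the first two columns hold
# the negated top-two probabilities of each row
part = np.partition(-proba, 1, axis=1)
margin = -part[:, 0] + part[:, 1]  # largest minus second-largest probability

# Sanity check against an explicit sort
expected = np.sort(proba, axis=1)[:, -1] - np.sort(proba, axis=1)[:, -2]
assert np.allclose(margin, expected)
print(margin)  # approximately [0.5, 0.0, 0.85]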
Example #2
    def _predict_proba(
        self,
        X: np.ndarray,
        model: BaseEstimator,
        task_type: int,
        Y_train: Optional[np.ndarray] = None,
    ) -> np.ndarray:
        def send_warnings_to_log(
            message: Union[Warning, str],
            category: Type[Warning],
            filename: str,
            lineno: int,
            file: Optional[TextIO] = None,
            line: Optional[str] = None,
        ) -> None:
            self.logger.debug('%s:%s: %s:%s' %
                              (filename, lineno, str(category), message))
            return

        with warnings.catch_warnings():
            warnings.showwarning = send_warnings_to_log
            Y_pred = model.predict_proba(X, batch_size=1000)

        if Y_train is None:
            raise ValueError("Y_train is required for classification problems")

        Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train)
        return Y_pred
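
Examples #2 and #15 redirect warnings raised during predict_proba into a logger. A self-contained sketch of that pattern (the logger name and warning message here are illustrative):

import logging
import warnings

logger = logging.getLogger("predict_proba_demo")  # illustrative name
logging.basicConfig(level=logging.DEBUG)

def send_warnings_to_log(message, category, filename, lineno, file=None, line=None):
    # Same signature as warnings.showwarning; forward the warning to the logger
    logger.debug('%s:%s: %s:%s', filename, lineno, category.__name__, message)

with warnings.catch_warnings():
    warnings.showwarning = send_warnings_to_log
    warnings.warn("raised inside the block, so it goes to the log, not stderr")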
Example #3
def get_preds_probas(
    est: BaseEstimator, X_test: DataFrame, y_test: Series, mapper_dict: Dict
) -> DataFrame:
    """
    Get prediction probabilities (if available) or return true and predicted
    labels
    """
    df_preds = DataFrame(est.predict(X_test), index=X_test.index)
    if hasattr(est.named_steps["clf"], "predict_proba"):
        # Get prediction probabilities (if available)
        df_probas = DataFrame(est.predict_proba(X_test), index=X_test.index)

        # Append prediction and prediction probabilities
        df_summ = concat([df_preds, df_probas], axis=1)
        df_summ.columns = ["predicted_label"] + [
            f"probability_of_{i}" for i in range(0, len(np.unique(y_test)))
        ]

        # Get label (class) with maximum prediction probability for each row
        df_summ["max_class_number_manually"] = df_probas.idxmax(axis=1)
        df_summ["probability_of_max_class"] = df_probas.max(axis=1)

        # Compare .predict_proba() and manually extracted prediction
        # probability
        lhs = df_summ["max_class_number_manually"]
        rhs = df_summ["predicted_label"].replace(mapper_dict)
        assert (lhs == rhs).eq(True).all()
    else:
        df_summ = df_preds.copy()
    # Get true label
    df_summ.insert(0, "true_label", y_test)
    return df_summ
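
A small pandas-only sketch (toy probabilities, hypothetical values) of the idxmax/max step used above to pull out the most likely class and its probability per row:

import numpy as np
from pandas import DataFrame

# Hypothetical predict_proba output for three samples and two classes
df_probas = DataFrame(np.array([[0.80, 0.20],
                                [0.30, 0.70],
                                [0.55, 0.45]]))

max_class = df_probas.idxmax(axis=1)  # column index of the most likely class
max_proba = df_probas.max(axis=1)     # probability of that class
print(max_class.tolist())  # [0, 1, 0]
print(max_proba.tolist())  # [0.8, 0.7, 0.55]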
Example #4
def _expected_future_utility(model: BaseEstimator, test_set: np.ndarray,
                             budget: int, target_label: int):
    """
    The expected future utility of all remaining points is the sum of the top
    `budget` probabilities that the model predicts on the test set. This assumes
    that the utility function is the number of targets found and that we can
    only make `budget` queries.

    Args:
        model (BaseEstimator): Model trained on training set + potential new point
        test_set (ndarray): Test set for the model
        budget (int): number of points that we will be able to query
        target_label (int): Index of target label

    Returns:
        (float) Expected utility
    """

    # Predict the probability of each entry in the test set
    probs = model.predict_proba(test_set)
    positives = probs[:, target_label]

    # sum only the top `budget` probabilities!  Even if there are more, we can
    # only possibly gain `budget` more targets.
    klargest = positives.argpartition(-budget)[-budget:]
    u = np.sum(positives[klargest])

    return u
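
A short numeric sketch (made-up probabilities) of the argpartition step: only the `budget` largest positive-class probabilities contribute to the expected utility.

import numpy as np

# Hypothetical positive-class probabilities for the remaining pool
positives = np.array([0.9, 0.1, 0.6, 0.8, 0.3])
budget = 2

# Indices of the `budget` largest probabilities (unordered within the top set)
klargest = positives.argpartition(-budget)[-budget:]
expected_utility = positives[klargest].sum()
assert np.isclose(expected_utility, 0.9 + 0.8)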
Example #5
def proba_hist(model: BaseEstimator, X_train: np.ndarray) -> plt.Figure:
    probs = model.predict_proba(X_train)[:, 1]  # Prob(+ve diagnosis)
    fig, ax = plt.subplots()
    ax.hist(probs)
    ax.set_xlabel("Predicted probability of heart disease")
    ax.set_ylabel("Count")
    return fig
Example #6
def max_sampling(classifier: BaseEstimator,
                 X: modALinput,
                 n_instances: int = 1,
                 random_tie_break: bool = False,
                 pool_idx=None,
                 query_kwargs={},
                 **kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Maximum sampling query strategy.
    Selects the samples with the highest prediction probability.

    Parameters
    ----------
    classifier: BaseEstimator
        The classifier for which the labels are to be queried.
    X: modALinput
        The pool of samples to query from.
    n_instances: int
        Number of samples to be queried.
    random_tie_break: bool
        If True, shuffles utility scores to randomize the order.
        This can be used to break the tie when the highest
        utility score is not unique.
    **kwargs:
        Keyword arguments to be passed for
        the prediction measure function.

    Returns
    -------
    np.ndarray, modALinput
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    n_samples = X.shape[0]
    if pool_idx is None:
        pool_idx = np.arange(n_samples)

    # First attempt to get the probabilities from the dictionary.
    proba = query_kwargs.get('pred_proba', [])
    if len(proba) != n_samples:
        try:
            proba = classifier.predict_proba(X, **kwargs)
        except NotFittedError:
            proba = np.ones(shape=(n_samples, ))
        query_kwargs['pred_proba'] = proba

    proba = proba[pool_idx]
    if not random_tie_break:
        query_idx = multi_argmax(proba[:, 1], n_instances=n_instances)
    else:
        query_idx = shuffled_argmax(proba[:, 1], n_instances=n_instances)

    for idx in query_idx:
        query_kwargs['current_queries'][pool_idx[idx]] = "max"

    return pool_idx[query_idx], X[pool_idx[query_idx]]
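
The selection step relies on modAL's multi_argmax and shuffled_argmax helpers. A rough numpy equivalent of the non-randomized case (a hypothetical helper for illustration, not modAL's actual implementation):

import numpy as np

def top_n_indices(values: np.ndarray, n_instances: int = 1) -> np.ndarray:
    # Hypothetical stand-in: indices of the n_instances largest values
    # (order within the selected set is arbitrary)
    return np.argpartition(values, -n_instances)[-n_instances:]

utilities = np.array([0.2, 0.9, 0.4, 0.7])
print(top_n_indices(utilities, n_instances=2))  # indices of 0.9 and 0.7, e.g. [3 1]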
Example #7
    def _model_predict(self, model: BaseEstimator, data: pd.DataFrame) -> np.array:

        if self._task._task_type == BINARY_CLASSIFICATION:
            predictions = model.predict_proba(data)

        elif self._task._task_type == MULTI_CLASS_CLASSIFICATION:
            predictions = model.predict(data)

        elif self._task._task_type == REGRESSION:
            predictions = model.predict(data)

        return predictions
Example #8
    def evaluate(self, model: BaseEstimator, num_repetitions: int,
                 *corruptions: DataCorruption):

        schema = self.schema_from_train_data()

        baseline_predictions = model.predict_proba(self._task.test_data)
        baseline_score = self._task.score_on_test_data(baseline_predictions)

        results = []

        # Repeatedly corrupt the test data
        for corruption in corruptions:
            corrupted_scores = []
            anomalies = []
            for _ in range(0, num_repetitions):
                test_data_copy = self._task.test_data.copy(deep=True)
                corrupted_data = corruption.transform(test_data_copy)

                # Determine whether tfdv finds anomalies in the data
                corrupted_data_stats = tfdv.generate_statistics_from_dataframe(
                    corrupted_data)
                tfdv_anomalies = tfdv.validate_statistics(
                    statistics=corrupted_data_stats, schema=schema)

                schema_anomalies = tfdv_anomalies.anomaly_info

                # Compute the prediction score on the test data
                corrupted_predictions = model.predict_proba(corrupted_data)
                corrupted_score = self._task.score_on_test_data(
                    corrupted_predictions)

                anomalies.append(schema_anomalies)
                corrupted_scores.append(corrupted_score)

            results.append(
                SchemaValidationResult(corruption, anomalies, baseline_score,
                                       corrupted_scores))

        return results
Example #9
def classifier_uncertainty(classifier: BaseEstimator,
                           X: modALinput,
                           query_kwargs: dict = None,
                           **predict_proba_kwargs) -> np.ndarray:
    # calculate uncertainty for each point provided
    try:
        classwise_uncertainty = classifier.predict_proba(
            X, **predict_proba_kwargs)
    except NotFittedError:
        return np.ones(shape=(X.shape[0], ))
    if query_kwargs is not None:
        query_kwargs['pred_proba'] = classwise_uncertainty
    # for each point, select the maximum uncertainty
    uncertainty = 1 - np.max(classwise_uncertainty, axis=1)
    return uncertainty
Example #10
def classifier_entropy(classifier: BaseEstimator, X: modALinput, **predict_proba_kwargs) -> np.ndarray:
    """
    Entropy of the classifier's predictions for the provided samples.

    Args:
        classifier: The classifier for which the prediction entropy is to be measured.
        X: The samples for which the prediction entropy is to be measured.
        **predict_proba_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier.

    Returns:
        Entropy of the class probabilities.
    """
    try:
        classwise_uncertainty = classifier.predict_proba(X, **predict_proba_kwargs)
    except NotFittedError:
        return np.zeros(shape=(X.shape[0], ))

    return np.transpose(entropy(np.transpose(classwise_uncertainty)))
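
A quick standalone check (toy probabilities) of the transposed scipy entropy call: a uniform row has maximal entropy, a near-certain row has entropy close to zero.

import numpy as np
from scipy.stats import entropy

# Hypothetical class probabilities for two samples
proba = np.array([[0.50, 0.50],    # maximally uncertain
                  [0.99, 0.01]])   # nearly certain

# scipy's entropy works column-wise by default, hence the transposes
ent = np.transpose(entropy(np.transpose(proba)))
print(ent)  # approximately [0.693, 0.056]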
Example #11
def train_ss_ensemble(
    clf: BaseEstimator,
    params: SSEnsembleParams,
    X: np.ndarray,
    y: np.ndarray,
    lb_mask: np.ndarray,
):
    rng = np.random.RandomState(params.random_state)

    # We want to return out of bag predictions
    ulb_mask = ~lb_mask

    # TODO: not really necessary but we could set them all to zero from the outside as well
    y = y.copy()

    y[ulb_mask] = 0
    ulb_indices = ulb_mask.nonzero()[0]

    y_oob_sum = np.zeros(len(y))
    y_oob_hit = np.zeros(len(y))

    for i in range(params.n_estimators):
        bag_ulb_indices = rng.choice(ulb_indices, size=params.n_samples)
        bag_lb_indices = lb_mask.nonzero()[0]

        bag_indices = np.concatenate([bag_ulb_indices, bag_lb_indices])

        X_bag = X[bag_indices]
        y_bag = y[bag_indices]

        oob_mask = np.ones(len(y), dtype="bool")
        oob_mask[bag_indices] = False

        X_oob = X[oob_mask]

        clf = clone(clf)
        clf.fit(X_bag, y_bag)

        y_oob = clf.predict_proba(X_oob)
        y_oob_sum[oob_mask] += y_oob[:, 1]
        y_oob_hit[oob_mask] += 1

    return y_oob_sum / y_oob_hit
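
Note that a sample drawn into every bag never receives an out-of-bag prediction, so y_oob_hit can be zero. A guarded variant of the final averaging (an assumption, not part of the original function) with toy values:

import numpy as np

y_oob_sum = np.array([1.4, 0.0, 2.1, 0.9])
y_oob_hit = np.array([2.0, 0.0, 3.0, 1.0])  # the second sample was never out of bag

# Average the OOB probabilities, returning NaN where no OOB prediction exists
y_oob_mean = np.divide(y_oob_sum, y_oob_hit,
                       out=np.full_like(y_oob_sum, np.nan),
                       where=y_oob_hit > 0)
print(y_oob_mean)  # approximately [0.7, nan, 0.7, 0.9]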
Example #12
 def _predict(model: BaseEstimator, x: pd.DataFrame, _type_of_target: str):
     if _type_of_target in ('binary', 'multiclass'):
         if hasattr(model, "predict_proba"):
             proba = model.predict_proba(x)
         elif hasattr(model, "decision_function"):
             warnings.warn(
                 'Since {} does not have predict_proba method, '
                 'decision_function is used for the prediction instead.'.
                 format(type(model)))
             proba = model.decision_function(x)
         else:
             raise RuntimeError(
                 'Estimator in classification problem should have '
                 'either predict_proba or decision_function')
         if proba.ndim == 1:
             return proba
         else:
             return proba[:, 1] if proba.shape[1] == 2 else proba
     else:
         return model.predict(x)
Example #13
def apply_model_to_array(
    model: BaseEstimator,
    array: np.ndarray,
    nodata: float,
    nodata_idx: int,
    count: int = 1,
    dtype: str = "float32",
    predict_proba: bool = False,
    **kwargs,
) -> np.ndarray:
    """Applies a model to an array of covariates.

    Covariate array should be of shape (nbands, nrows, ncols).

    Args:
        model: object with a `model.predict()` function
        array: array of shape (nbands, nrows, ncols) with pixel values
        nodata: numeric nodata value to apply to the output array
        nodata_idx: array of bools with shape (nbands, nrows, ncols) containing nodata locations
        count: number of bands in the prediction output
        dtype: prediction array dtype
        predict_proba: use model.predict_proba() instead of model.predict()
        **kwargs: additional keywords to pass to model.predict().
            For MaxentModels, this would include transform="logistic"

    Returns:
        ypred_window: Array of shape (nrows, ncols) with model predictions
    """
    # only apply to valid pixels
    valid = ~nodata_idx.any(axis=0)
    covariates = array[:, valid].transpose()
    ypred = model.predict(
        covariates, **kwargs) if not predict_proba else model.predict_proba(
            covariates, **kwargs)

    # reshape to the original window size
    rows, cols = valid.shape
    ypred_window = np.zeros((count, rows, cols), dtype=dtype) + nodata
    ypred_window[:, valid] = ypred.transpose()

    return ypred_window
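
A compact sketch of the masking-and-reshape pattern above, with a toy raster and a simple stand-in for the model call (all values hypothetical):

import numpy as np

# Hypothetical 2-band, 3x4 covariate raster with one nodata pixel
array = np.random.rand(2, 3, 4)
nodata_idx = np.zeros_like(array, dtype=bool)
nodata_idx[:, 0, 0] = True
nodata = -9999.0

valid = ~nodata_idx.any(axis=0)           # (nrows, ncols) mask of usable pixels
covariates = array[:, valid].transpose()  # (n_valid_pixels, nbands) for the model

ypred = covariates.sum(axis=1)            # stand-in for model.predict(covariates)

# Write predictions back into a full-size raster, filling nodata elsewhere
ypred_window = np.zeros((1, 3, 4), dtype="float32") + nodata
ypred_window[:, valid] = ypred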
Example #14
def classifier_uncertainty(classifier: BaseEstimator, X: modALinput, **predict_proba_kwargs) -> np.ndarray:
    """
    Classification uncertainty of the classifier for the provided samples.

    Args:
        classifier: The classifier for which the uncertainty is to be measured.
        X: The samples for which the uncertainty of classification is to be measured.
        **predict_proba_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier.

    Returns:
        Classifier uncertainty, which is 1 - P(prediction is correct).
    """
    # calculate uncertainty for each point provided
    try:
        classwise_uncertainty = classifier.predict_proba(X, **predict_proba_kwargs)
    except NotFittedError:
        return np.ones(shape=(X.shape[0], ))

    # for each point, select the maximum uncertainty
    uncertainty = 1 - np.max(classwise_uncertainty, axis=1)
    return uncertainty
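
A minimal numeric illustration (made-up probabilities) of the least-confidence measure, i.e. one minus the top class probability:

import numpy as np

# Hypothetical class probabilities for three samples
proba = np.array([[0.9, 0.1],
                  [0.5, 0.5],
                  [0.3, 0.7]])

uncertainty = 1 - proba.max(axis=1)
print(uncertainty)  # approximately [0.1, 0.5, 0.3]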
Example #15
    def _predict_proba(self,
                       X: np.ndarray,
                       pipeline: BaseEstimator,
                       Y_train: Optional[np.ndarray] = None) -> np.ndarray:
        """
        A wrapper function to handle the prediction of classification tasks.
        It also makes sure that the predictions have the same dimensionality
        as the expected labels

        Args:
            X (np.ndarray):
                A set of features to feed to the pipeline
            pipeline (BaseEstimator):
                A model that will take the features X and return a prediction y.
                This pipeline must be a classification estimator that supports
                the predict_proba method.
            Y_train (Optional[np.ndarray]):
        Returns:
            (np.ndarray):
                The predictions of pipeline for the given features X
        """
        @no_type_check
        def send_warnings_to_log(message,
                                 category,
                                 filename,
                                 lineno,
                                 file=None,
                                 line=None):
            self.logger.debug('%s:%s: %s:%s' %
                              (filename, lineno, category.__name__, message))
            return

        with warnings.catch_warnings():
            warnings.showwarning = send_warnings_to_log
            Y_pred = pipeline.predict_proba(X, batch_size=1000)

        Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train)
        return Y_pred
Example #16
 def label(self, model: BaseEstimator, x: np.array, target: int):
     probs = model.predict_proba([x])
     probs = probs.reshape(2)
     return np.random.binomial(1, probs[1])
Example #17
 def label(self, model: BaseEstimator, x: np.array, target: int):
     probs = model.predict_proba([x])
     probs = probs.reshape(2)
     return np.argmax(probs)
Example #18
    def __calc_rnsb(self, target_embeddings_dict: List[Dict[str, np.ndarray]],
                    classifier: BaseEstimator) -> Tuple[np.float_, dict]:
        """Calculate the RNSB metric.

        Parameters
        ----------
        target_embeddings_dict : List[Dict[str, np.ndarray]]
            list of dicts with the target words and their embeddings.
        classifier : BaseEstimator
            Scikit-learn classifier trained in the previous step.

        Returns
        -------
        Tuple[np.float_, dict]
            return the calculated kl_divergence and
            negative_sentiment_probabilities in that order.
        """

        # join the embeddings and the word sets in their respective arrays
        target_embeddings_sets = [
            list(target_dict.values())
            for target_dict in target_embeddings_dict
        ]
        target_words_sets = [
            list(target_dict.keys()) for target_dict in target_embeddings_dict
        ]

        # get the probabilities associated with each target word vector
        probabilities = np.array([
            classifier.predict_proba(target_embeddings)
            for target_embeddings in target_embeddings_sets
        ])

        # extract only the negative sentiment probability for each word
        negative_probabilities = np.array(
            [probability[:, 1] for probability in probabilities])

        # flatten the array
        negative_probabilities = np.concatenate([
            negative_probabilities_arr.flatten()
            for negative_probabilities_arr in negative_probabilities
        ])

        # normalization of the probabilities
        sum_of_negative_probabilities = np.sum(negative_probabilities)
        normalized_negative_probabilities = np.array(
            negative_probabilities / sum_of_negative_probabilities)

        # get the uniform dist
        uniform_dist = np.ones(
            normalized_negative_probabilities.shape[0]
        ) * 1 / normalized_negative_probabilities.shape[0]

        # calc the kl divergence
        kl_divergence = entropy(normalized_negative_probabilities,
                                uniform_dist)

        flatten_target_words = [
            item for sublist in target_words_sets for item in sublist
        ]

        # set the probabilities for each word in a dict.
        negative_sentiment_probabilities = {
            word: prob
            for word, prob in zip(flatten_target_words, negative_probabilities)
        }

        return kl_divergence, negative_sentiment_probabilities
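
A condensed sketch (with made-up probabilities) of the final normalization and KL-divergence step against the uniform distribution:

import numpy as np
from scipy.stats import entropy

# Hypothetical negative-sentiment probabilities for five target words
negative_probabilities = np.array([0.10, 0.30, 0.20, 0.25, 0.15])

# Normalize to a distribution and compare it with the uniform distribution
p = negative_probabilities / negative_probabilities.sum()
uniform = np.full(p.shape[0], 1 / p.shape[0])

kl_divergence = entropy(p, uniform)  # zero only if every word is equally "negative"
print(kl_divergence)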
Example #19
 def _predict(model: BaseEstimator, x: pd.DataFrame, _predict_proba: bool):
     if _predict_proba:
         proba = model.predict_proba(x)
         return proba[:, 1] if proba.shape[1] == 2 else proba
     else:
         return model.predict(x)