def classifier_margin(classifier: BaseEstimator, X: modALinput, **predict_proba_kwargs) -> np.ndarray:
    """
    Margin-based uncertainty of the classifier for the provided samples.

    For every sample the two largest class probabilities are located and the
    gap between them is reported as the margin.

    Args:
        classifier: The classifier whose prediction margin is measured.
        X: The samples for which the prediction margin is measured.
        **predict_proba_kwargs: Keyword arguments forwarded to the
            :meth:`predict_proba` of the classifier.

    Returns:
        One margin per sample: probability of the most likely class minus the
        probability of the second most likely class. All zeros when the
        classifier is not fitted or when only a single class is predicted.
    """
    try:
        probabilities = classifier.predict_proba(X, **predict_proba_kwargs)
    except NotFittedError:
        return np.zeros(shape=(X.shape[0], ))

    # With a single class column there is no runner-up to compare against.
    if probabilities.shape[1] == 1:
        return np.zeros(shape=(probabilities.shape[0],))

    # Partitioning the negated matrix around index 1 puts the (negated) largest
    # probability in column 0 and the (negated) second largest in column 1.
    negated = np.partition(-probabilities, 1, axis=1)
    best = -negated[:, 0]
    runner_up = -negated[:, 1]
    return best - runner_up
def _predict_proba( self, X: np.ndarray, model: BaseEstimator, task_type: int, Y_train: Optional[np.ndarray] = None, ) -> np.ndarray: def send_warnings_to_log( message: Union[Warning, str], category: Type[Warning], filename: str, lineno: int, file: Optional[TextIO] = None, line: Optional[str] = None, ) -> None: self.logger.debug('%s:%s: %s:%s' % (filename, lineno, str(category), message)) return with warnings.catch_warnings(): warnings.showwarning = send_warnings_to_log Y_pred = model.predict_proba(X, batch_size=1000) if Y_train is None: raise ValueError("Y_train is required for classification problems") Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train) return Y_pred
def get_preds_probas(
    est: BaseEstimator, X_test: DataFrame, y_test: Series, mapper_dict: Dict
) -> DataFrame:
    """
    Summarise true labels, predicted labels and (if available) probabilities.

    Args:
        est: Fitted pipeline exposing ``predict`` and a ``named_steps["clf"]``
            entry; ``predict_proba`` is used when that step provides it.
        X_test: Features to predict on; its index is reused for the output.
        y_test: True labels, inserted as the first column ``true_label``.
        mapper_dict: Mapping applied to the predicted labels (via
            ``Series.replace``) before they are compared with the column
            number of the highest probability.

    Returns:
        DataFrame with ``true_label`` first. When probabilities are available
        it also holds ``predicted_label``, one ``probability_of_<i>`` column
        per probability column, ``max_class_number_manually`` and
        ``probability_of_max_class``. Otherwise it holds the raw prediction
        frame (whose single column keeps its default name).

    Raises:
        AssertionError: If the class with the highest probability disagrees
            with the mapped predicted label for any row.
    """
    df_preds = DataFrame(est.predict(X_test), index=X_test.index)
    if hasattr(est.named_steps["clf"], "predict_proba"):
        # Get prediction probabilities (if available)
        df_probas = DataFrame(est.predict_proba(X_test), index=X_test.index)
        # Append prediction and prediction probabilities
        df_summ = concat([df_preds, df_probas], axis=1)
        # Size the column names from the probability matrix itself: y_test may
        # not contain every class, in which case counting its unique values
        # yields too few names and the assignment below fails.
        df_summ.columns = ["predicted_label"] + [
            f"probability_of_{i}" for i in range(df_probas.shape[1])
        ]
        # Get label (class) with maximum prediction probability for each row
        df_summ["max_class_number_manually"] = df_probas.idxmax(axis=1)
        df_summ["probability_of_max_class"] = df_probas.max(axis=1)
        # Compare .predict_proba() and manually extracted prediction
        # probability
        lhs = df_summ["max_class_number_manually"]
        rhs = df_summ["predicted_label"].replace(mapper_dict)
        assert (lhs == rhs).eq(True).all(), (
            "predicted labels disagree with the arg-max of predict_proba"
        )
    else:
        df_summ = df_preds.copy()
    # Get true label
    df_summ.insert(0, "true_label", y_test)
    return df_summ
def _expected_future_utility(model: BaseEstimator, test_set: np.ndarray, budget: int, target_label: int): """ The expected future utility of all remaining points is the sum top `budget` number of probabilities that the model predicts on the test set. This is assuming that the utility function is the number of targets found, and that we can only make `budget` queries. Args: model (BaseEstimator): Model trained on training set + potential new point test_set (ndarray): Test set for the model budget (int): number of points that we will be able to query target_label (int): Index of target label Returns: (float) Expected utility """ # Predict the probability of each entry in the test set probs = model.predict_proba(test_set) positives = probs[:, target_label] # sum only the top `budget` probabilities! Even if there are more, we can # only possibly gain `budget` more targets. klargest = positives.argpartition(-budget)[-budget:] u = np.sum(positives[klargest]) return u
def proba_hist(model: BaseEstimator, X_train: np.ndarray) -> plt.Figure:
    """Plot a histogram of the model's predicted probabilities for column 1.

    Args:
        model: Estimator exposing ``predict_proba``.
        X_train: Samples passed to ``model.predict_proba``.

    Returns:
        The figure holding the histogram.
    """
    # Column 1 of predict_proba: Prob(+ve diagnosis)
    positive_probabilities = model.predict_proba(X_train)[:, 1]

    figure, axes = plt.subplots()
    axes.hist(positive_probabilities)
    axes.set_xlabel("Predicted probability of heart disease")
    axes.set_ylabel("Count")
    return figure
def max_sampling(classifier: BaseEstimator, X: modALinput,
                 n_instances: int = 1, random_tie_break: bool = False,
                 pool_idx=None, query_kwargs=None,
                 **kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Maximum sampling query strategy.
    Selects the samples with the highest prediction probability
    (column 1 of ``predict_proba``).

    Parameters
    ----------
    classifier: BaseEstimator
        The classifier for which the labels are to be queried.
    X: modALinput
        The pool of samples to query from.
    n_instances: int
        Number of samples to be queried.
    random_tie_break: bool
        If True, shuffles utility scores to randomize the order. This
        can be used to break the tie when the highest utility score is not
        unique.
    pool_idx:
        Indices into ``X`` that are eligible for querying. Defaults to all
        rows of ``X``.
    query_kwargs: dict, optional
        Shared bookkeeping dictionary. ``'pred_proba'`` is read as a cache of
        probabilities for ``X`` (used when its length equals the number of
        samples) and is written after a fresh prediction. If
        ``'current_queries'`` is present, every selected pool index is marked
        in it with ``"max"``. A new dictionary is created per call when
        omitted.
    **kwargs:
        Keyword arguments to be passed for
        the prediction measure function.

    Returns
    -------
    np.ndarray, modALinput
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    # A mutable default would be shared (and mutated) across calls.
    if query_kwargs is None:
        query_kwargs = {}

    n_samples = X.shape[0]
    if pool_idx is None:
        pool_idx = np.arange(n_samples)
    else:
        # Fancy indexing below requires an array, not a plain list.
        pool_idx = np.asarray(pool_idx)

    # First attempt to get the probabilities from the dictionary.
    proba = query_kwargs.get('pred_proba', [])
    if len(proba) != n_samples:
        try:
            proba = classifier.predict_proba(X, **kwargs)
        except NotFittedError:
            # Two columns so that the ``[:, 1]`` lookup below stays valid;
            # every sample gets the same score of 1.
            proba = np.ones(shape=(n_samples, 2))
        query_kwargs['pred_proba'] = proba

    positive_proba = proba[pool_idx][:, 1]

    if not random_tie_break:
        query_idx = multi_argmax(positive_proba, n_instances=n_instances)
    else:
        query_idx = shuffled_argmax(positive_proba, n_instances=n_instances)

    # Record the origin of each query only when the caller tracks it.
    current_queries = query_kwargs.get('current_queries')
    if current_queries is not None:
        for idx in query_idx:
            current_queries[pool_idx[idx]] = "max"

    return pool_idx[query_idx], X[pool_idx[query_idx]]
def _model_predict(self, model: BaseEstimator, data: pd.DataFrame) -> np.ndarray:
    """Predict on ``data`` using the method that matches the task type.

    Binary classification uses ``model.predict_proba``; multi-class
    classification and regression use ``model.predict``.

    Args:
        model: Estimator to predict with.
        data: Samples to predict on.

    Returns:
        The predictions returned by the selected model method.

    Raises:
        ValueError: If ``self._task._task_type`` is none of the known task
            types (previously this surfaced as an ``UnboundLocalError``).
    """
    task_type = self._task._task_type

    if task_type == BINARY_CLASSIFICATION:
        return model.predict_proba(data)
    if task_type == MULTI_CLASS_CLASSIFICATION or task_type == REGRESSION:
        return model.predict(data)

    raise ValueError(f"Unsupported task type: {task_type!r}")
def evaluate(self, model: BaseEstimator, num_repetitions: int, *corruptions: DataCorruption):
    """Score the model on repeatedly corrupted test data and validate each copy.

    For every corruption, the test data is deep-copied and corrupted
    ``num_repetitions`` times. Each corrupted copy is checked by tfdv against
    the schema from ``self.schema_from_train_data()`` and scored with
    ``self._task.score_on_test_data``.

    Args:
        model: Estimator exposing ``predict_proba``.
        num_repetitions: How many times each corruption is applied.
        *corruptions: Corruptions exposing ``transform(dataframe)``.

    Returns:
        One ``SchemaValidationResult`` per corruption, built from the
        corruption, the per-repetition anomaly info, the baseline score and
        the per-repetition corrupted scores.
    """
    schema = self.schema_from_train_data()

    # Reference score on the untouched test data.
    baseline_score = self._task.score_on_test_data(
        model.predict_proba(self._task.test_data))

    def run_once(corruption):
        """Corrupt a fresh copy of the test data; return (anomaly info, score)."""
        corrupted_data = corruption.transform(
            self._task.test_data.copy(deep=True))

        # Determine whether tfdv finds anomalies in the data
        stats = tfdv.generate_statistics_from_dataframe(corrupted_data)
        validation = tfdv.validate_statistics(statistics=stats, schema=schema)
        anomaly_info = validation.anomaly_info

        # Compute the prediction score on the corrupted data
        score = self._task.score_on_test_data(
            model.predict_proba(corrupted_data))
        return anomaly_info, score

    results = []
    for corruption in corruptions:
        runs = [run_once(corruption) for _ in range(num_repetitions)]
        anomalies = [anomaly_info for anomaly_info, _ in runs]
        corrupted_scores = [score for _, score in runs]
        results.append(
            SchemaValidationResult(corruption, anomalies, baseline_score,
                                   corrupted_scores))
    return results
def classifier_uncertainty(classifier: BaseEstimator, X: modALinput,
                           query_kwargs: dict = None,
                           **predict_proba_kwargs) -> np.ndarray:
    """
    Classification uncertainty of the classifier for the provided samples.

    Args:
        classifier: The classifier for which the uncertainty is measured.
        X: The samples for which the uncertainty is measured.
        query_kwargs: Optional dictionary; when given, the predicted class
            probabilities are stored in it under ``'pred_proba'``.
        **predict_proba_kwargs: Keyword arguments forwarded to the
            :meth:`predict_proba` of the classifier.

    Returns:
        One value per sample: 1 minus the largest class probability. All ones
        when the classifier is not fitted.
    """
    try:
        probabilities = classifier.predict_proba(X, **predict_proba_kwargs)
    except NotFittedError:
        return np.ones(shape=(X.shape[0], ))

    # Share the raw probabilities with the caller when a dictionary was given.
    if query_kwargs is not None:
        query_kwargs['pred_proba'] = probabilities

    top_probability = np.max(probabilities, axis=1)
    return 1 - top_probability
def classifier_entropy(classifier: BaseEstimator, X: modALinput, **predict_proba_kwargs) -> np.ndarray:
    """
    Entropy of the classifier's predictions for the provided samples.

    Args:
        classifier: The classifier for which the prediction entropy is measured.
        X: The samples for which the prediction entropy is measured.
        **predict_proba_kwargs: Keyword arguments forwarded to the
            :meth:`predict_proba` of the classifier.

    Returns:
        Entropy of the class probabilities, one value per sample. All zeros
        when the classifier is not fitted.
    """
    try:
        probabilities = classifier.predict_proba(X, **predict_proba_kwargs)
    except NotFittedError:
        return np.zeros(shape=(X.shape[0], ))

    # entropy() is applied to the transposed matrix so that each sample's
    # class probabilities form one column.
    classes_by_samples = np.transpose(probabilities)
    per_sample_entropy = entropy(classes_by_samples)
    return np.transpose(per_sample_entropy)
def train_ss_ensemble(
    clf: BaseEstimator,
    params: SSEnsembleParams,
    X: np.ndarray,
    y: np.ndarray,
    lb_mask: np.ndarray,
):
    """Train a bagged ensemble and return averaged out-of-bag probabilities.

    Each of the ``params.n_estimators`` bags contains every labelled sample
    plus ``params.n_samples`` unlabelled samples drawn with
    ``RandomState.choice``. A fresh clone of ``clf`` is fitted on the bag
    (unlabelled targets are set to 0) and its ``predict_proba`` column 1 is
    accumulated for the samples left out of the bag.

    Args:
        clf: Estimator prototype; it is cloned for every bag.
        params: Provides ``random_state``, ``n_estimators`` and ``n_samples``.
        X: Feature matrix, indexed by row.
        y: Targets; a copy is modified, the argument is left untouched.
        lb_mask: Boolean mask, True for labelled samples.

    Returns:
        Array with one value per sample: the mean out-of-bag probability, or
        NaN for samples that were never out of bag. Labelled samples are in
        every bag, so their entries are always NaN.
    """
    rng = np.random.RandomState(params.random_state)
    # We want to return out of bag predictions
    ulb_mask = ~lb_mask
    # TODO: not really necessary but we could set them all to zero from the outside as well
    y = y.copy()
    y[ulb_mask] = 0

    n_total = len(y)
    ulb_indices = ulb_mask.nonzero()[0]
    # Loop-invariant: the labelled part of every bag is identical.
    lb_indices = lb_mask.nonzero()[0]

    y_oob_sum = np.zeros(n_total)
    y_oob_hit = np.zeros(n_total)
    for _ in range(params.n_estimators):
        bag_ulb_indices = rng.choice(ulb_indices, size=params.n_samples)
        bag_indices = np.concatenate([bag_ulb_indices, lb_indices])

        oob_mask = np.ones(n_total, dtype="bool")
        oob_mask[bag_indices] = False
        # Nothing left out of the bag: there is nothing to predict, and the
        # fitted model is not kept, so skip fitting altogether.
        if not oob_mask.any():
            continue

        estimator = clone(clf)
        estimator.fit(X[bag_indices], y[bag_indices])
        y_oob = estimator.predict_proba(X[oob_mask])
        y_oob_sum[oob_mask] += y_oob[:, 1]
        y_oob_hit[oob_mask] += 1

    # Same values as ``y_oob_sum / y_oob_hit`` (NaN where the hit count is 0),
    # but without the 0/0 RuntimeWarning.
    oob_mean = np.full(n_total, np.nan)
    np.divide(y_oob_sum, y_oob_hit, out=oob_mean, where=y_oob_hit > 0)
    return oob_mean
def _predict(model: BaseEstimator, x: pd.DataFrame, _type_of_target: str): if _type_of_target in ('binary', 'multiclass'): if hasattr(model, "predict_proba"): proba = model.predict_proba(x) elif hasattr(model, "decision_function"): warnings.warn( 'Since {} does not have predict_proba method, ' 'decision_function is used for the prediction instead.'. format(type(model))) proba = model.decision_function(x) else: raise RuntimeError( 'Estimator in classification problem should have ' 'either predict_proba or decision_function') if proba.ndim == 1: return proba else: return proba[:, 1] if proba.shape[1] == 2 else proba else: return model.predict(x)
def apply_model_to_array(
    model: BaseEstimator,
    array: np.ndarray,
    nodata: float,
    nodata_idx: np.ndarray,
    count: int = 1,
    dtype: str = "float32",
    predict_proba: bool = False,
    **kwargs,
) -> np.ndarray:
    """Applies a model to an array of covariates.

    Covariate array should be of shape (nbands, nrows, ncols).

    Args:
        model: object with a `model.predict()` function (and
            `model.predict_proba()` when `predict_proba` is True)
        array: array of shape (nbands, nrows, ncols) with pixel values
        nodata: numeric nodata value to apply to the output array
        nodata_idx: array of bools with shape (nbands, nrows, ncols) containing nodata locations
        count: number of bands in the prediction output
        dtype: prediction array dtype
        predict_proba: use model.predict_proba() instead of model.predict()
        **kwargs: additional keywords to pass to the model's prediction
            method. For MaxentModels, this would include transform="logistic"

    Returns:
        ypred_window: Array of shape (count, nrows, ncols) with model
            predictions; pixels flagged as nodata in any band hold `nodata`.
    """
    # only apply to valid pixels: a pixel is valid when no band flags it
    valid = ~nodata_idx.any(axis=0)

    # start from a window filled with nodata, at the original window size
    rows, cols = valid.shape
    ypred_window = np.zeros((count, rows, cols), dtype=dtype) + nodata

    # An all-nodata window has nothing to predict; many estimators raise on
    # empty input, so return the nodata-filled window directly.
    if not valid.any():
        return ypred_window

    covariates = array[:, valid].transpose()
    predict = model.predict_proba if predict_proba else model.predict
    ypred = predict(covariates, **kwargs)

    ypred_window[:, valid] = ypred.transpose()
    return ypred_window
def classifier_uncertainty(classifier: BaseEstimator, X: modALinput, **predict_proba_kwargs) -> np.ndarray:
    """
    Uncertainty of the classifier's predictions for the provided samples.

    Args:
        classifier: The classifier whose uncertainty is measured.
        X: The samples for which the classification uncertainty is measured.
        **predict_proba_kwargs: Keyword arguments forwarded to the
            :meth:`predict_proba` of the classifier.

    Returns:
        Classifier uncertainty, which is 1 - P(prediction is correct), i.e.
        1 minus the largest class probability of each sample. All ones when
        the classifier is not fitted.
    """
    try:
        probabilities = classifier.predict_proba(X, **predict_proba_kwargs)
    except NotFittedError:
        return np.ones(shape=(X.shape[0], ))

    # The most likely class determines how confident the prediction is.
    top_probability = np.max(probabilities, axis=1)
    return 1 - top_probability
def _predict_proba(self, X: np.ndarray, pipeline: BaseEstimator, Y_train: Optional[np.ndarray] = None) -> np.ndarray: """ A wrapper function to handle the prediction of classification tasks. It also makes sure that the predictions has the same dimensionality as the expected labels Args: X (np.ndarray): A set of features to feed to the pipeline pipeline (BaseEstimator): A model that will take the features X return a prediction y This pipeline must be a classification estimator that supports the predict_proba method. Y_train (Optional[np.ndarray]): Returns: (np.ndarray): The predictions of pipeline for the given features X """ @no_type_check def send_warnings_to_log(message, category, filename, lineno, file=None, line=None): self.logger.debug('%s:%s: %s:%s' % (filename, lineno, category.__name__, message)) return with warnings.catch_warnings(): warnings.showwarning = send_warnings_to_log Y_pred = pipeline.predict_proba(X, batch_size=1000) Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train) return Y_pred
def label(self, model: BaseEstimator, x: np.array, target: int):
    """Draw a random 0/1 label for ``x`` from the model's predicted probability.

    The model is queried for the single sample ``x``; its output is reshaped
    to two entries and entry 1 is used as the success probability of one
    Bernoulli draw (``np.random.binomial`` with a single trial).

    Args:
        model: Estimator exposing ``predict_proba``; its output for one sample
            must hold exactly two values.
        x: The sample to label.
        target: Not used by this implementation.

    Returns:
        The sampled label, 0 or 1.
    """
    class_probabilities = model.predict_proba([x]).reshape(2)
    positive_probability = class_probabilities[1]
    return np.random.binomial(1, positive_probability)
def label(self, model: BaseEstimator, x: np.ndarray, target: int):
    """Return the index of the most probable of the two classes for ``x``.

    Args:
        model: Estimator exposing ``predict_proba``; its output for one sample
            must hold exactly two values.
        x: The sample to label.
        target: Not used by this implementation.

    Returns:
        0 or 1, the position of the larger predicted probability.
    """
    probs = model.predict_proba([x])
    probs = probs.reshape(2)
    # The label is the result of the call; ``raise`` on an integer would fail
    # with ``TypeError: exceptions must derive from BaseException``.
    return np.argmax(probs)
def __calc_rnsb(self, target_embeddings_dict: List[Dict[str, np.ndarray]],
                classifier: BaseEstimator) -> Tuple[float, dict]:
    """Calculate the RNSB metric.

    Parameters
    ----------
    target_embeddings_dict : List[Dict[str, np.ndarray]]
        one dict per target set, mapping each target word to its embedding.
        The sets may have different sizes.
    classifier : BaseEstimator
        Trained scikit-learn classifier in the previous step.

    Returns
    -------
    Tuple[float, dict]
        return the calculated kl_divergence and
        negative_sentiment_probabilities in that order.
    """
    # join the embeddings and the word sets in their respective lists
    target_embeddings_sets = [
        list(target_dict.values()) for target_dict in target_embeddings_dict
    ]
    flatten_target_words = [
        word for target_dict in target_embeddings_dict for word in target_dict
    ]

    # Column 1 of predict_proba is taken as the negative sentiment
    # probability of each word. The per-set results are concatenated directly
    # instead of being stacked with np.array first: target sets of different
    # sizes give ragged results, which np.array rejects.
    negative_probabilities = np.concatenate([
        np.asarray(classifier.predict_proba(target_embeddings))[:, 1].flatten()
        for target_embeddings in target_embeddings_sets
    ])

    # normalization of the probabilities
    sum_of_negative_probabilities = np.sum(negative_probabilities)
    normalized_negative_probabilities = np.array(
        negative_probabilities / sum_of_negative_probabilities)

    # get the uniform dist
    n_words = normalized_negative_probabilities.shape[0]
    uniform_dist = np.ones(n_words) * 1 / n_words

    # calc the kl divergence
    kl_divergence = entropy(normalized_negative_probabilities, uniform_dist)

    # set the probabilities for each word in a dict.
    negative_sentiment_probabilities = dict(
        zip(flatten_target_words, negative_probabilities))

    return kl_divergence, negative_sentiment_probabilities
def _predict(model: BaseEstimator, x: pd.DataFrame, _predict_proba: bool): if _predict_proba: proba = model.predict_proba(x) return proba[:, 1] if proba.shape[1] == 2 else proba else: return model.predict(x)