Exemple #1
0
    def _calculate_uncertainty_batch(self, batch: InstanceBatch, progress_bar: Tqdm = None) -> dict:
        """Build per-word, per-model uncertainty records for one batch.

        Args:
            batch: A triple ``(ids, predictions, labels)`` of parallel sequences.
                Each prediction is indexed by ``'words'`` and, for every key in
                ``self.predictor._model.all_model_keys``, by
                ``'<key>_class_probabilities'`` and ``'<key>_class_prob_std'``.
                Each label sequence is indexed by word position.
            progress_bar: Optional progress bar; advanced by one step per
                instance when provided.

        Returns:
            A ``defaultdict(list)`` of equal-length columns with one entry per
            (instance, word, model key) combination.
        """
        # Resolve the model-level lookups once instead of on every inner iteration.
        model = self.predictor._model
        vocab = model.vocab
        label_namespace = model.label_namespace

        uncertainty_df = defaultdict(list)
        ids, predictions, labels = batch
        for idx, prediction, label in zip(ids, predictions, labels):
            for w, word in enumerate(prediction['words']):
                for model_key in model.all_model_keys:
                    # Presumably the mean / std of the class probabilities over
                    # several stochastic passes -- inferred from the key names.
                    tag_mean_probability = prediction[f'{model_key}_class_probabilities'][w]
                    tag_std_probability = prediction[f'{model_key}_class_prob_std'][w]
                    actual_label_idx = label[w]
                    # The predicted tag is the class with the highest mean probability.
                    predicted_label_idx = np.argmax(tag_mean_probability)

                    uncertainty_df['instance_id'].append(idx)
                    uncertainty_df['word_id'].append(w)
                    uncertainty_df['model'].append(model_key)
                    uncertainty_df['word'].append(word)

                    uncertainty_df['actual_tag'].append(
                        vocab.get_token_from_index(actual_label_idx, namespace=label_namespace)
                    )
                    uncertainty_df['predicted_tag'].append(
                        vocab.get_token_from_index(predicted_label_idx, namespace=label_namespace)
                    )

                    uncertainty_df['actual_confidence_mean'].append(tag_mean_probability[actual_label_idx])
                    uncertainty_df['actual_confidence_std'].append(tag_std_probability[actual_label_idx])
                    uncertainty_df['predicted_confidence_mean'].append(tag_mean_probability[predicted_label_idx])
                    uncertainty_df['predicted_confidence_std'].append(tag_std_probability[predicted_label_idx])

                    uncertainty_df['mean_probability_distribution'].append(tag_mean_probability)

            # The progress bar is optional (defaults to None), so guard the update.
            if progress_bar is not None:
                progress_bar.update(1)
        return uncertainty_df
Exemple #2
0
    def _calculate_feature_importance_batch(self, batch: InstanceBatch, progress_bar: Tqdm = None) -> dict:
        """Run every configured interpreter over one batch and collect the scores.

        Args:
            batch: A triple ``(ids, labeled_batch, actual_labels)``. Each labeled
                instance is indexed by the names in ``self.field_names`` (whose
                values expose ``.tokens``) and by ``'label'`` (whose value
                exposes ``.label``).
            progress_bar: Optional progress bar; its description is updated and
                it is advanced by one step per interpreter when provided.

        Returns:
            A ``defaultdict(list)`` of columns. For each interpreter in
            ``self.feature_importance_interpreters + self.attention_interpreters``
            the instance-level columns are extended by one entry per instance.
            The ``'scores'`` column is extended by whatever the interpreter
            returns, presumably also one entry per instance.
        """
        feature_importance_df = defaultdict(list)

        ids, labeled_batch, actual_labels = batch
        batch_size = len(labeled_batch)
        batch_text = [[li[fn].tokens for fn in self.field_names] for li in labeled_batch]
        # A separate list per instance, so rows do not share one mutable object.
        fields = [list(self.field_names) for _ in labeled_batch]
        predicted_labels = [li['label'].label for li in labeled_batch]
        seed = [self.seed] * batch_size

        for interpreter in self.feature_importance_interpreters + self.attention_interpreters:

            # Test against None rather than truthiness: a tqdm-style bar may be
            # falsy (e.g. total of 0) or refuse bool() while still being valid.
            if progress_bar is not None:
                progress_bar.set_description(f"{interpreter.id}: interpreting {batch_size} instances")

            # Some feature importance measures are too memory-intensive to run with larger batch sizes
            # These numbers are based on empirical tests with a standard 16GB gpu
            if 'shap' in interpreter.id or 'deep' in interpreter.id or 'intgrad' in interpreter.id:
                batch_scores = []
                for sub_batch in utils.batch(labeled_batch, 2):
                    batch_scores.extend(interpreter.saliency_interpret_instances(sub_batch).values())
            else:
                batch_scores = interpreter.saliency_interpret_instances(labeled_batch).values()

            # There can be more than one array of scores for an instance (e.g. in the pair sequence case)
            scores = [[np.asarray(scoreset) for scoreset in v.values()] for v in batch_scores]

            feature_importance_df['scores'].extend(scores)
            feature_importance_df['seed'].extend(seed)
            feature_importance_df['instance_id'].extend(ids)
            feature_importance_df['instance_text'].extend(batch_text)
            feature_importance_df['instance_fields'].extend(fields)
            feature_importance_df['feature_importance_measure'].extend([interpreter.id] * batch_size)
            feature_importance_df['predicted'].extend(predicted_labels)
            feature_importance_df['actual'].extend(actual_labels)

            if progress_bar is not None:
                progress_bar.update(1)

        return feature_importance_df