def predictions_to_labeled_instances(
    self, instance: Instance, outputs: Dict[str, numpy.ndarray]
) -> List[Instance]:
    """
    Takes each predicted cluster and makes it into a labeled `Instance` with only that
    cluster labeled, so we can compute gradients of the loss `on the model's prediction of that
    cluster`.  This lets us run interpretation methods using those gradients.  See superclass
    docstring for more info.

    # Parameters

    instance : `Instance`
        The unlabeled instance; it must provide a `"spans"` field and a `"metadata"` field.
        It is never modified, only duplicated.
    outputs : `Dict[str, numpy.ndarray]`
        Model outputs; only `outputs["clusters"]` is read.  Each cluster is iterated as a
        collection of `(start, end)` pairs.

    # Returns

    `List[Instance]`
        One duplicate per predicted cluster, each with a `"span_labels"` field (`0` for spans
        that belong to the cluster, `-1` for all other spans) and with
        `metadata["clusters"]` set to just that cluster.  When nothing was predicted, a single
        duplicate whose spans are all labeled `-1` and whose `metadata["clusters"]` is empty.
    """
    # Digging into an Instance makes mypy go crazy, because we have all kinds of things where
    # the type has been lost.  So there are lots of `type: ignore`s here...
    predicted_clusters = outputs["clusters"]
    span_field: ListField = instance["spans"]  # type: ignore

    def labeled_copy(span_labels, clusters):
        # Duplicate `instance`, attach the span labels and record which clusters they encode.
        new_instance = instance.duplicate()
        new_instance.add_field(
            "span_labels", SequenceLabelField(span_labels, span_field), self._model.vocab
        )
        new_instance["metadata"].metadata["clusters"] = clusters  # type: ignore
        return new_instance

    instances = []
    for cluster in predicted_clusters:
        # Normalise the cluster once: a set gives O(1) membership tests, and converting every
        # span to a tuple makes the lookup work whether spans arrive as tuples or as lists.
        cluster_spans = {tuple(span) for span in cluster}
        span_labels = [
            0 if (span.span_start, span.span_end) in cluster_spans else -1  # type: ignore
            for span in span_field
        ]  # type: ignore
        instances.append(labeled_copy(span_labels, [cluster]))

    if not instances:
        # No predicted clusters; we just give an empty coref prediction.
        instances.append(labeled_copy([-1] * len(span_field), []))  # type: ignore
    return instances
def labeled_json_to_labeled_instances(self, json_dict: JsonDict) -> Dict[int, Instance]:
    """
    Build one labeled `Instance` per entry of `json_dict`.

    Entries are visited in ascending order of their keys converted to `int`.  Consecutive
    entries are grouped into sequences: the first entry of a sequence fixes the sequence
    length (the number of its `'words'`) and provides the shared token field; once that many
    entries have been consumed, the next entry starts a new sequence.

    NOTE(review): this presumes every entry of one sequence carries the same `'words'` list,
    since only the first entry's words are turned into tokens -- confirm against the producer
    of `json_dict`.

    # Parameters

    json_dict : `JsonDict`
        Maps stringified integer indices to objects with a `'words'` list (items having a
        `'text'` key) and a `'tags'` list.

    # Returns

    `Dict[int, Instance]`
        Maps each integer index to a duplicate of its sequence's base instance, extended with
        a `'tags'` field and indexed with a vocabulary built up during this call.
    """
    vocabulary = Vocabulary()
    labeled = {}
    sequence_start = 0
    # -1 is the sentinel for "the next entry opens a new sequence".
    sequence_length = -1

    ordered_keys = sorted((int(raw_key), raw_key) for raw_key in json_dict)
    for index, raw_key in ordered_keys:
        entry = json_dict[raw_key]

        if sequence_length == -1:
            words = entry['words']
            sequence_length = len(words)
            text_field = TextField([Token(word['text']) for word in words], {})
            instance = Instance({'tokens': text_field})

        # The third argument is the position of this entry within its sequence.
        categories = [json_to_cat(tag) for tag in entry['tags']]
        tags_field = ConstructiveSupertagField(
            categories, text_field, [index - sequence_start])
        vocabulary.add_tokens_to_namespace(tags_field.labels, 'labels')

        labeled_instance = instance.duplicate()
        labeled_instance.add_field('tags', tags_field)
        labeled_instance.index_fields(vocabulary)
        labeled[index] = labeled_instance

        # Last entry of the current sequence: move the offset and re-arm the sentinel.
        if index + 1 - sequence_start == sequence_length:
            sequence_start += sequence_length
            sequence_length = -1

    return labeled
def predictions_to_labeled_instances(
    self, instance: Instance, outputs: Dict[str, numpy.ndarray]
) -> List[Instance]:
    """
    Label a copy of `instance` with the model's most probable class.

    # Parameters

    instance : `Instance`
        The unlabeled instance; it is duplicated, not modified.
    outputs : `Dict[str, numpy.ndarray]`
        Model outputs; only `outputs["probs"]` is read.

    # Returns

    `List[Instance]`
        A single-element list holding the duplicate with an added `"label"` field.
    """
    labeled = instance.duplicate()
    predicted_index = int(numpy.argmax(outputs["probs"]))
    # The label is already an integer index, so the field must not index it again.
    label_field = LabelField(predicted_index, skip_indexing=True)
    labeled.add_field("label", label_field)
    return [labeled]
def predictions_to_labeled_instances(
    self, instance: Instance, outputs: Dict[str, numpy.ndarray]
) -> List[Instance]:
    """
    Label a copy of `instance` with the class that has the highest predicted probability.

    # Parameters

    instance : `Instance`
        The unlabeled instance; it is duplicated, not modified.
    outputs : `Dict[str, numpy.ndarray]`
        Model outputs; only `outputs["probs"]` is read.

    # Returns

    `List[Instance]`
        A single-element list holding the duplicate with an added `"label"` field.
    """
    labeled = instance.duplicate()
    predicted_index = int(numpy.argmax(outputs["probs"]))
    # Skip indexing: the label is already the integer representation of a string such as
    # "entailment", so the field must not look it up in the vocabulary again.
    label_field = LabelField(predicted_index, skip_indexing=True)
    labeled.add_field("label", label_field)
    return [labeled]
def predictions_to_labeled_instances(
    self, instance: Instance, outputs: Dict[str, np.ndarray]
) -> "List[Instance]":
    """
    Label a copy of `instance` with the model's most probable class.

    The return annotation previously claimed a bare `Instance` although the method returns a
    one-element list.  The corrected annotation is quoted so it is not evaluated when the
    method is defined; `List` still has to be importable from `typing` for type checkers.

    # Parameters

    instance : `Instance`
        The unlabeled instance; it is duplicated, not modified.
    outputs : `Dict[str, np.ndarray]`
        Model outputs; only `outputs["class_probabilities"]` is read.

    # Returns

    `List[Instance]`
        A single-element list holding the duplicate with an added `"label"` field.
    """
    new_instance = instance.duplicate()
    label = np.argmax(outputs["class_probabilities"])
    # The label is already an integer class index, so the field must not index it again.
    new_instance.add_field("label", LabelField(int(label), skip_indexing=True))
    return [new_instance]
def predictions_to_labeled_instances(
    self, instance: Instance, outputs: Dict[str, numpy.ndarray]
):
    """
    Attach predicted target tokens to a copy of `instance`.

    For every entry of `outputs["top_tokens"][0]` the first element is wrapped in a `Token`
    (presumably the highest-ranked candidate for that position -- confirm against the model's
    output format).  The resulting tokens are stored in a `"target_ids"` text field that
    reuses the token indexers of the instance's `"tokens"` field.

    # Parameters

    instance : `Instance`
        The unlabeled instance; it is duplicated, not modified.
    outputs : `Dict[str, numpy.ndarray]`
        Model outputs; only `outputs["top_tokens"]` is read.

    # Returns

    A single-element list holding the labeled duplicate.
    """
    labeled = instance.duplicate()
    source_tokens: TextField = instance["tokens"]  # type: ignore

    predicted_targets = []
    for candidates in outputs["top_tokens"][0]:
        predicted_targets.append(Token(candidates[0]))

    target_field = TextField(predicted_targets, source_tokens._token_indexers)
    labeled.add_field("target_ids", target_field, vocab=self._model.vocab)
    return [labeled]
def predictions_to_labeled_instances(
    self, instance: Instance, outputs: Dict[str, np.ndarray]
) -> List[Instance]:
    """
    Label a copy of `instance` with the predicted tag sequence of every model key.

    For each `name` in `self._model.all_model_keys`, the arg-max over the last axis of
    `outputs[f"{name}_class_probabilities"]` is truncated to the length of the instance's
    `"tokens"` field and stored as the sequence-label field `f"{name}_tags"`.

    # Parameters

    instance : `Instance`
        The unlabeled instance; it is duplicated, not modified.
    outputs : `Dict[str, np.ndarray]`
        Model outputs holding one `"<name>_class_probabilities"` entry per model key.

    # Returns

    `List[Instance]`
        A single-element list holding the duplicate with all tag fields added.
    """
    labeled = instance.duplicate()
    tokens: TextField = instance["tokens"]

    for task in self._model.all_model_keys:
        probabilities = outputs[f"{task}_class_probabilities"]
        best_classes = np.argmax(probabilities, axis=-1)
        # Keep only as many predictions as the token field has entries.
        tag_ids = best_classes[:len(tokens)].tolist()
        tag_field = SequenceLabelField(tag_ids, tokens)
        labeled.add_field(f"{task}_tags", tag_field, self._model.vocab)

    return [labeled]
def test_duplicate(self):
    """`Instance.duplicate()` must yield an equal but independent copy."""
    # Regression check that `duplicate()` copes with a `TextField` backed by a
    # `PretrainedTransformerIndexer`.  See https://github.com/allenai/allennlp/issues/4270.
    indexer = PretrainedTransformerIndexer("bert-base-uncased")
    words_field = TextField([Token("hello")], {"tokens": indexer})
    original = Instance({"words": words_field})

    clone = original.duplicate()
    assert clone == original

    # A field added to the original afterwards should not affect the duplicate.
    original.add_field("labels", LabelField("some_label"))
    assert "labels" not in clone.fields
    # Sanity check on the `__eq__` method: the two instances now differ.
    assert clone != original
def predictions_to_labeled_instances(
    self, instance: Instance, outputs: Dict[str, numpy.ndarray]
) -> List[Instance]:
    """
    Label a copy of `instance` with the predicted answer span.

    `outputs["best_span"]` supplies the start and end of the span.  Both are shifted by the
    combined length of the tokenizer's sequence-pair start tokens, the question tokens stored
    in the instance metadata, and the tokenizer's sequence-pair mid tokens, and then attached
    as a `SpanField` over the `"question_with_context"` field.

    # Parameters

    instance : `Instance`
        The unlabeled instance; it is duplicated, not modified.
    outputs : `Dict[str, numpy.ndarray]`
        Model outputs; only `outputs["best_span"]` is read.

    # Returns

    `List[Instance]`
        A single-element list holding the duplicate with an added `"answer_span"` field.
    """
    labeled = instance.duplicate()

    best_span = outputs["best_span"]
    relative_start = int(best_span[0])
    relative_end = int(best_span[1])

    # Number of positions that precede the context inside `question_with_context`.
    tokenizer = self._dataset_reader._tokenizer
    context_offset = len(tokenizer.sequence_pair_start_tokens)
    context_offset += len(instance["metadata"]["question_tokens"])
    context_offset += len(tokenizer.sequence_pair_mid_tokens)

    span_field = SpanField(
        context_offset + relative_start,
        context_offset + relative_end,
        instance["question_with_context"],
    )
    labeled.add_field("answer_span", span_field)
    return [labeled]
def predictions_to_labeled_instances(
    self, instance: Instance, outputs: Dict[str, numpy.ndarray]
) -> List[Instance]:
    """
    Create one labeled `Instance` per predicted entity span.

    `outputs["tags"]` is scanned for `U...` tags (single-token spans) and for `B...` tags
    (multi-token spans that run up to the next `L...` tag).  For every span a duplicate of
    `instance` is produced whose `"tags"` field keeps the predicted tags inside the span and
    is `"O"` everywhere else, together with an `"ignore_loss_on_o_tags"` flag set to `True`.

    A `B...` tag that is never closed by an `L...` tag no longer raises `IndexError`; the
    span is taken to run to the end of the sequence instead.

    # Parameters

    instance : `Instance`
        The unlabeled instance; it must provide a `"tokens"` field.  It is duplicated, not
        modified.
    outputs : `Dict[str, numpy.ndarray]`
        Model outputs; only `outputs["tags"]` (a sequence of tag strings) is read.

    # Returns

    `List[Instance]`
        One labeled duplicate per predicted span, in order of appearance; empty when no
        `U...` or `B...` tag was predicted.
    """
    predicted_tags = outputs["tags"]
    num_tags = len(predicted_tags)
    predicted_spans = []

    i = 0
    while i < num_tags:
        tag = predicted_tags[i]
        # if its a U, add it to the list
        if tag[0] == "U":
            current_tags = [t if idx == i else "O" for idx, t in enumerate(predicted_tags)]
            predicted_spans.append(current_tags)
        # if its a B, keep going until you hit an L.
        elif tag[0] == "B":
            begin_idx = i
            # Bounded scan: predictions are not guaranteed to be well-formed, so stop at the
            # last tag instead of indexing past the end when no closing L is found.
            while tag[0] != "L" and i + 1 < num_tags:
                i += 1
                tag = predicted_tags[i]
            end_idx = i
            current_tags = [
                t if begin_idx <= idx <= end_idx else "O"
                for idx, t in enumerate(predicted_tags)
            ]
            predicted_spans.append(current_tags)
        i += 1

    if not predicted_spans:
        return []

    # Creates a new instance for each contiguous tag
    text_field: TextField = instance["tokens"]
    instances = []
    for labels in predicted_spans:
        new_instance = instance.duplicate()
        new_instance.add_field(
            "tags", SequenceLabelField(labels, text_field), self._model.vocab
        )
        new_instance.add_field("ignore_loss_on_o_tags", FlagField(True))
        instances.append(new_instance)
    return instances
def predictions_to_labeled_instances(
    self, instance: Instance, outputs: Dict[str, numpy.ndarray]
) -> List[Instance]:
    """
    Label a copy of `instance` with the model's predicted answer.

    Two output layouts are handled:

    * `"best_span"` present (BiDAF): adds `"span_start"` and `"span_end"` index fields over
      the `"passage"` field.
    * `"answer"` present (NAQANet): depending on `outputs["answer"]["answer_type"]`, adds
      exactly one of `"answer_as_counts"`, `"answer_as_passage_spans"`,
      `"answer_as_add_sub_expressions"` or `"answer_as_question_spans"`.

    If neither key is present, or the answer type is none of the four above, the duplicate
    is returned without any added field.

    # Parameters

    instance : `Instance`
        The unlabeled instance; it is duplicated, not modified.
    outputs : `Dict[str, numpy.ndarray]`
        Model outputs as described above.

    # Returns

    `List[Instance]`
        A single-element list holding the (possibly labeled) duplicate.
    """

    def to_token_span(char_span, token_offsets):
        # Convert character span indices into word span indices.  A boundary without any
        # matching token offset stays `None`; if several tokens match, the last one wins.
        first_token = None
        last_token = None
        for token_index, token_offset in enumerate(token_offsets):
            if token_offset[0] == char_span[0]:
                first_token = token_index
            if token_offset[1] == char_span[1]:
                last_token = token_index
        return first_token, last_token

    labeled = instance.duplicate()

    # For BiDAF
    if "best_span" in outputs:
        best_start = outputs["best_span"][0]
        best_end = outputs["best_span"][1]
        passage: SequenceField = labeled["passage"]  # type: ignore
        labeled.add_field(
            "span_start", IndexField(int(best_start), passage), self._model.vocab
        )
        labeled.add_field(
            "span_end", IndexField(int(best_end), passage), self._model.vocab
        )
        return [labeled]

    if "answer" not in outputs:
        return [labeled]

    # For NAQANet model.  It has the fields: answer_as_passage_spans, answer_as_question_spans,
    # answer_as_add_sub_expressions, answer_as_counts.  We need labels for all.
    kind = outputs["answer"]["answer_type"]

    if kind == "count":
        # The problem is a counting problem.
        count_label = LabelField(int(outputs["answer"]["count"]), skip_indexing=True)
        labeled.add_field("answer_as_counts", ListField([count_label]), self._model.vocab)

    elif kind == "passage_span":
        # The answer is in the passage.
        # TODO(mattg): Currently we only handle one predicted span.
        char_span = outputs["answer"]["spans"][0]
        passage_offsets = labeled["metadata"].metadata["passage_token_offsets"]  # type: ignore
        first_token, last_token = to_token_span(char_span, passage_offsets)
        passage_field: SequenceField = labeled["passage"]  # type: ignore
        spans = ListField([SpanField(first_token, last_token, passage_field)])
        labeled.add_field("answer_as_passage_spans", spans, self._model.vocab)

    elif kind == "arithmetic":
        # The answer is an arithmetic calculation over the numbers the model encountered in
        # the passage.
        predicted_numbers = outputs["answer"]["numbers"]
        numbers_field: ListField = instance["number_indices"]  # type: ignore
        # Each number is labeled with its sign.  Negative signs get the class label 2 (for 0
        # and 1, the sign matches the label).
        sign_labels = [
            2 if number["sign"] == -1 else number["sign"] for number in predicted_numbers
        ]
        # There's a dummy number added in the dataset reader to handle passages with no
        # numbers; it has a label of 0 (not included).
        sign_labels.append(0)
        expressions = ListField([SequenceLabelField(sign_labels, numbers_field)])
        labeled.add_field("answer_as_add_sub_expressions", expressions, self._model.vocab)

    elif kind == "question_span":
        # The answer is in the question.
        char_span = outputs["answer"]["spans"][0]
        question_offsets = labeled["metadata"].metadata[  # type: ignore
            "question_token_offsets"
        ]
        first_token, last_token = to_token_span(char_span, question_offsets)
        question_field: SequenceField = labeled["question"]  # type: ignore
        spans = ListField([SpanField(first_token, last_token, question_field)])
        labeled.add_field("answer_as_question_spans", spans, self._model.vocab)

    return [labeled]
def predictions_to_labeled_instances(
    self, instance: Instance, outputs: Dict[str, numpy.ndarray]
) -> List[Instance]:
    """
    This function currently only handles BIOUL tags.

    Imagine an NER model predicts three named entities (each one with potentially
    multiple tokens). For each individual entity, we create a new Instance that has
    the label set to only that entity and the rest of the tokens are labeled as outside.
    We then return a list of those Instances.

    For example:

    ```text
    Mary  went to Seattle to visit Microsoft Research
    U-Per  O    O   U-Loc  O   O     B-Org     L-Org
    ```

    We create three instances.

    ```text
    Mary  went to Seattle to visit Microsoft Research
    U-Per  O    O    O     O   O       O         O

    Mary  went to Seattle to visit Microsoft Research
    O      O    O   U-Loc  O   O       O         O

    Mary  went to Seattle to visit Microsoft Research
    O      O    O    O     O   O     B-Org     L-Org
    ```

    We additionally add a flag to these instances to tell the model to only compute loss on
    non-O tags, so that we get gradients that are specific to the particular span prediction
    that each instance represents.

    A `B` tag that is never closed by an `L` tag does not raise `IndexError`; the span is
    taken to run to the end of the sequence instead.

    # Parameters

    instance : `Instance`
        The unlabeled instance; it must provide a `"tokens"` field.  It is duplicated, not
        modified.
    outputs : `Dict[str, numpy.ndarray]`
        Model outputs; only `outputs["tags"]` (a sequence of tag strings) is read.

    # Returns

    `List[Instance]`
        One labeled duplicate per predicted span, in order of appearance; empty when no
        `U` or `B` tag was predicted.
    """
    predicted_tags = outputs["tags"]
    num_tags = len(predicted_tags)
    predicted_spans = []

    i = 0
    while i < num_tags:
        tag = predicted_tags[i]
        # if its a U, add it to the list
        if tag[0] == "U":
            current_tags = [t if idx == i else "O" for idx, t in enumerate(predicted_tags)]
            predicted_spans.append(current_tags)
        # if its a B, keep going until you hit an L.
        elif tag[0] == "B":
            begin_idx = i
            # Bounded scan: predictions are not guaranteed to be well-formed, so stop at the
            # last tag instead of indexing past the end when no closing L is found.
            while tag[0] != "L" and i + 1 < num_tags:
                i += 1
                tag = predicted_tags[i]
            end_idx = i
            current_tags = [
                t if begin_idx <= idx <= end_idx else "O"
                for idx, t in enumerate(predicted_tags)
            ]
            predicted_spans.append(current_tags)
        i += 1

    if not predicted_spans:
        return []

    # Creates a new instance for each contiguous tag
    text_field: TextField = instance["tokens"]  # type: ignore
    instances = []
    for labels in predicted_spans:
        new_instance = instance.duplicate()
        new_instance.add_field(
            "tags", SequenceLabelField(labels, text_field), self._model.vocab
        )
        new_instance.add_field("ignore_loss_on_o_tags", FlagField(True))
        instances.append(new_instance)
    return instances