def test_bioul_tags_to_spans(self):
    """Check span decoding of labelled BIOUL tags and rejection of a bad sequence."""
    well_formed = ["B-PER", "I-PER", "L-PER", "U-PER", "U-LOC", "O"]
    expected = [("PER", (0, 2)), ("PER", (3, 3)), ("LOC", (4, 4))]
    # Span indices in the expectation are inclusive on both ends.
    assert span_utils.bioul_tags_to_spans(well_formed) == expected

    # A B/I run followed directly by "O" (no "L" tag) must be rejected.
    malformed = ["B-PER", "I-PER", "O"]
    with pytest.raises(span_utils.InvalidTagSequence):
        span_utils.bioul_tags_to_spans(malformed)
def test_bioul_tags_to_spans_without_labels(self):
    """Check span decoding of label-free BIOUL tags and rejection of a bad sequence."""
    well_formed = ["B", "I", "L", "U", "U", "O"]
    # With no "-LABEL" suffix on the tags, every span is expected to carry "".
    expected = [("", (0, 2)), ("", (3, 3)), ("", (4, 4))]
    assert span_utils.bioul_tags_to_spans(well_formed) == expected

    # A B/I run followed directly by "O" (no "L" tag) must be rejected.
    malformed = ["B", "I", "O"]
    with pytest.raises(span_utils.InvalidTagSequence):
        span_utils.bioul_tags_to_spans(malformed)
def test_bioul_tags_to_spans_without_labels(self):
    """Label-free BIOUL tags decode to empty-label spans; a bad sequence raises."""
    valid_tags = ['B', 'I', 'L', 'U', 'U', 'O']
    decoded = span_utils.bioul_tags_to_spans(valid_tags)
    # One three-token span followed by two single-token spans, all labelled ''.
    assert decoded == [('', (0, 2)), ('', (3, 3)), ('', (4, 4))]

    # The second sequence has no 'L' tag before 'O' and is expected to fail.
    invalid_tags = ['B', 'I', 'O']
    with self.assertRaises(span_utils.InvalidTagSequence):
        span_utils.bioul_tags_to_spans(invalid_tags)
def test_bioul_tags_to_spans(self):
    """Labelled BIOUL tags decode to (label, (start, end)) spans; a bad sequence raises."""
    valid_tags = ['B-PER', 'I-PER', 'L-PER', 'U-PER', 'U-LOC', 'O']
    decoded = span_utils.bioul_tags_to_spans(valid_tags)
    # One three-token PER span, then single-token PER and LOC spans.
    assert decoded == [('PER', (0, 2)), ('PER', (3, 3)), ('LOC', (4, 4))]

    # The second sequence has no 'L-PER' tag before 'O' and is expected to fail.
    invalid_tags = ['B-PER', 'I-PER', 'O']
    with self.assertRaises(span_utils.InvalidTagSequence):
        span_utils.bioul_tags_to_spans(invalid_tags)
def test_bioul_tags_to_spans_without_labels(self):
    """Exercise ``bioul_tags_to_spans`` on tags that carry no entity label."""
    decode = span_utils.bioul_tags_to_spans

    # Happy path: the expected spans all have the empty string as label.
    assert decode(['B', 'I', 'L', 'U', 'U', 'O']) == [
        ('', (0, 2)),
        ('', (3, 3)),
        ('', (4, 4)),
    ]

    # Error path: 'B', 'I' followed by 'O' must raise InvalidTagSequence.
    with self.assertRaises(span_utils.InvalidTagSequence):
        decode(['B', 'I', 'O'])
def test_bioul_tags_to_spans(self):
    """Exercise ``bioul_tags_to_spans`` on tags that carry entity labels."""
    decode = span_utils.bioul_tags_to_spans

    # Happy path: spans come back as (label, (start, end)) with inclusive ends.
    assert decode(['B-PER', 'I-PER', 'L-PER', 'U-PER', 'U-LOC', 'O']) == [
        ('PER', (0, 2)),
        ('PER', (3, 3)),
        ('LOC', (4, 4)),
    ]

    # Error path: 'B-PER', 'I-PER' followed by 'O' must raise InvalidTagSequence.
    with self.assertRaises(span_utils.InvalidTagSequence):
        decode(['B-PER', 'I-PER', 'O'])
def get_spans(taglist, wordlist):
    """Group the spans decoded from ``taglist`` by entity type.

    Parameters
    ----------
    taglist :
        Tag sequence handed to ``bioul_tags_to_spans``.
    wordlist :
        Words of the same sentence; sliced to recover each span's text.

    Returns
    -------
    dict
        One key per entry of ``available_entity_types_sciERC``, each mapping
        to a list of ``[start, end_exclusive, text]`` triples where ``text``
        is the span's words joined by single spaces.

    Raises
    ------
    KeyError
        If a decoded span has a type that is not in
        ``available_entity_types_sciERC``.
    """
    grouped = {}
    for entity_type in available_entity_types_sciERC:
        grouped[entity_type] = []

    for label, (first, last) in bioul_tags_to_spans(taglist):
        # The decoder's end index is inclusive; store an exclusive end instead.
        stop = last + 1
        grouped[label].append([first, stop, " ".join(wordlist[first:stop])])
    return grouped
def predict_contextual(self, sentence):
    """Run ``self.contextual_ner`` on ``sentence`` and return tokens plus spans.

    Returns
    -------
    tuple
        ``(tokens, spans)`` where ``tokens`` is the predictor's ``'words'``
        entry and ``spans`` is a list of ``(start, end_exclusive)`` pairs
        decoded from its ``'tags'`` entry. Span labels are dropped.
    """
    output = self.contextual_ner.predict(sentence)
    tokens = output['words']

    half_open_spans = []
    for _label, (first, last) in bioul_tags_to_spans(output['tags']):
        # Make the end index exclusive (original note: "consistent with em").
        half_open_spans.append((first, last + 1))
    return tokens, half_open_spans
def __call__(self,
             predictions: torch.Tensor,
             gold_labels: torch.Tensor,
             mask: Optional[torch.Tensor] = None,
             prediction_map: Optional[torch.Tensor] = None):
    """
    Accumulate per-label true positive / false positive / false negative
    span counts for one batch.

    Parameters
    ----------
    predictions : ``torch.Tensor``, required.
        A tensor of predictions of shape (batch_size, sequence_length, num_classes).
    gold_labels : ``torch.Tensor``, required.
        A tensor of integer class label of shape (batch_size, sequence_length). It must be the same
        shape as the ``predictions`` tensor without the ``num_classes`` dimension.
    mask: ``torch.Tensor``, optional (default = None).
        A masking tensor the same size as ``gold_labels``.
    prediction_map: ``torch.Tensor``, optional (default = None).
        A tensor of size (batch_size, num_classes) which provides a mapping from the index of predictions
        to the indices of the label vocabulary. If provided, the output label at each timestep will be
        ``vocabulary.get_index_to_token_vocabulary(prediction_map[batch, argmax(predictions[batch, t])])``,
        rather than simply ``vocabulary.get_index_to_token_vocabulary(argmax(predictions[batch, t]))``.
        This is useful in cases where each Instance in the dataset is associated with a different possible
        subset of labels from a large label-space (IE FrameNet, where each frame has a different set of
        possible roles associated with it).

    Raises
    ------
    ConfigurationError
        If any gold label id is >= ``num_classes``, or if
        ``self._label_encoding`` is not one of "BIO", "IOB1" or "BIOUL"
        when a non-empty sequence has to be decoded.
    """
    if mask is None:
        mask = torch.ones_like(gold_labels)

    predictions, gold_labels, mask, prediction_map = self.unwrap_to_tensors(
        predictions, gold_labels, mask, prediction_map)

    num_classes = predictions.size(-1)
    if (gold_labels >= num_classes).any():
        raise ConfigurationError(
            "A gold label passed to SpanBasedF1Measure contains an "
            "id >= {}, the number of classes.".format(num_classes))

    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    argmax_predictions = predictions.max(-1)[1]

    if prediction_map is not None:
        # Translate per-instance class indices into label-vocabulary indices.
        argmax_predictions = torch.gather(prediction_map, 1, argmax_predictions)
        gold_labels = torch.gather(prediction_map, 1, gold_labels.long())

    argmax_predictions = argmax_predictions.float()

    # Iterate over timesteps in batch.
    batch_size = gold_labels.size(0)
    for i in range(batch_size):
        sequence_prediction = argmax_predictions[i, :]
        sequence_gold_label = gold_labels[i, :]
        length = sequence_lengths[i]

        if length == 0:
            # It is possible to call this metric with sequences which are
            # completely padded. These contribute nothing, so we skip these rows.
            continue

        predicted_string_labels = [
            self._label_vocabulary[label_id]
            for label_id in sequence_prediction[:length].tolist()
        ]
        gold_string_labels = [
            self._label_vocabulary[label_id]
            for label_id in sequence_gold_label[:length].tolist()
        ]

        if self._label_encoding == "BIO":
            predicted_spans = bio_tags_to_spans(predicted_string_labels, self._ignore_classes)
            gold_spans = bio_tags_to_spans(gold_string_labels, self._ignore_classes)
        elif self._label_encoding == "IOB1":
            predicted_spans = iob1_tags_to_spans(predicted_string_labels, self._ignore_classes)
            gold_spans = iob1_tags_to_spans(gold_string_labels, self._ignore_classes)
        elif self._label_encoding == "BIOUL":
            predicted_spans = bioul_tags_to_spans(predicted_string_labels, self._ignore_classes)
            gold_spans = bioul_tags_to_spans(gold_string_labels, self._ignore_classes)
        else:
            # Without this branch an unsupported encoding would surface as an
            # opaque UnboundLocalError on ``predicted_spans`` below.
            raise ConfigurationError(
                "Unsupported label encoding {!r} passed to SpanBasedF1Measure; "
                "expected one of 'BIO', 'IOB1' or 'BIOUL'.".format(self._label_encoding))

        predicted_spans = self._handle_continued_spans(predicted_spans)
        gold_spans = self._handle_continued_spans(gold_spans)

        for span in predicted_spans:
            if span in gold_spans:
                self._true_positives[span[0]] += 1
                # Remove the match so a gold span can only be credited once.
                gold_spans.remove(span)
            else:
                self._false_positives[span[0]] += 1
        # These spans weren't predicted.
        for span in gold_spans:
            self._false_negatives[span[0]] += 1
def text_to_instance(
        self,  # type: ignore
        tokens: List[Token],
        verb_label: List[int],
        parseTree: Tree,
        tags: List[str] = None,
        fout=None) -> Instance:
    """
    We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
    one-hot binary vector, the same length as the tokens, indicating the position of the verb
    to find arguments for.

    When ``self.label_encoding`` is "BIOUL" and ``tags`` are given, the tags
    are converted with ``to_bioul(..., encoding="BIO")`` and the instance
    also receives a ``span_matrix`` field built from ``parseTree``. If
    ``fout`` is given, the parse tree and span matrix are pickled to it.
    """
    # pylint: disable=arguments-differ
    uses_bioul = self.label_encoding == "BIOUL"

    if uses_bioul and tags is not None:
        old_tags = deepcopy(tags)
        tags = to_bioul(tags, encoding="BIO")
        try:
            # Decoded only to validate the converted sequence; result unused.
            bioul_tags_to_spans(tags)
        except InvalidTagSequence:
            print(f"Old tags: {old_tags}")
            print(f"New tags: {tags}\n")

    # Create span matrix from parse tree.
    # NOTE(review): built regardless of label encoding because the ``fout``
    # dump below uses it unconditionally - confirm against the original layout.
    left_tree = leftMost(parseTree)
    right_tree = rightMost(parseTree)
    left_indices = []
    right_indices = []
    addToList(left_tree, left_indices)
    addToList(right_tree, right_indices)
    if len(left_indices) != len(right_indices):
        raise Exception(
            f"For tree {parseTree}, leftList and rightList lengths do not match"
        )

    num_tokens = len(tokens)
    span_matrix = np.zeros([num_tokens, num_tokens])
    for left, right in zip(left_indices, right_indices):
        # Pairs whose two labels are equal are not marked.
        if left == right:
            continue
        span_matrix[left, right] = 1

    text_field = TextField(tokens, token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {}
    fields['tokens'] = text_field
    fields['verb_indicator'] = SequenceLabelField(verb_label, text_field)
    if uses_bioul:
        fields['span_matrix'] = ArrayField(span_matrix)

    # No verb is recorded when the indicator vector is all zeros.
    if all(flag == 0 for flag in verb_label):
        verb = None
    else:
        verb = tokens[verb_label.index(1)].text

    metadata_dict = {"words": [token.text for token in tokens], "verb": verb}
    if tags:
        fields['tags'] = SequenceLabelField(tags, text_field)
        metadata_dict["gold_tags"] = tags
    fields["metadata"] = MetadataField(metadata_dict)

    if fout is not None:
        pickle.dump({"parse_tree": parseTree, "span_matrix": span_matrix}, fout)
    return Instance(fields)
def format(self, predictions, sent_char_offset, input_text):
    """Turn raw multi-task predictions for one sentence into plain dicts.

    Parameters
    ----------
    predictions : dict
        Must contain ``"tokenized_text"``. May contain any of ``"ner"``,
        ``"emd"``, ``"relation"`` and ``"coref"``; each of those is indexed
        with ``[0]`` and asserted to have length 1.
    sent_char_offset : sequence of int
        Indexed by token position to obtain that token's start offset in
        ``input_text``.
    input_text : str
        The text that character offsets refer to.

    Returns
    -------
    dict
        Always holds ``"tokenized_text"``. Depending on the tasks present it
        also holds ``"ner"`` / ``"emd"`` (lists of entity dicts),
        ``"relation_arcs"``, and ``"coref_arcs"`` plus ``"coref_clusters"``.

    Notes
    -----
    Relation arguments are reset for every relation. A relation in which no
    ``ARG1`` or no ``ARG2`` tag is found is skipped; previously it either
    raised ``UnboundLocalError`` or silently reused the arguments of the
    preceding relation.
    """
    tokenized_text = predictions["tokenized_text"]

    def char_bounds(begin, end):
        # Character range covering tokens begin..end (token indices inclusive).
        return (sent_char_offset[begin],
                sent_char_offset[end] + len(tokenized_text[end]))

    formatted_predictions = {"tokenized_text": tokenized_text}

    ### Format NER and EMD ###
    for task_name in ("ner", "emd"):
        if task_name not in predictions:
            continue
        assert len(predictions[task_name]) == 1
        decoded_bioul = []
        for tag, (begin, end) in bioul_tags_to_spans(predictions[task_name][0]):
            begin_char, end_char = char_bounds(begin, end)
            decoded_bioul.append({
                "type": tag,
                "begin_token": begin,
                "end_token": end,
                "begin_char": begin_char,
                "end_char": end_char,
                "tokenized_text": tokenized_text[begin:(end + 1)],
                "text": input_text[begin_char:end_char],
            })
        formatted_predictions[task_name] = decoded_bioul

    ### Format Relation ###
    if "relation" in predictions:
        assert len(predictions["relation"]) == 1
        decoded_relation_arcs = []
        for relation in predictions["relation"][0]:
            # Reset per relation so nothing leaks in from the previous one.
            arg1_index = arg2_index = None
            tag = None
            for ind in find_indices(relation, lambda x: x != "*"):
                tag = relation[ind]
                if tag[:4] == "ARG1":
                    arg1_index = ind
                if tag[:4] == "ARG2":
                    arg2_index = ind
            if arg1_index is None or arg2_index is None:
                # Incomplete relation: no arc can be built from it.
                continue

            arg1_text = tokenized_text[arg1_index]
            arg2_text = tokenized_text[arg2_index]
            decoded_relation_arcs.append({
                # Relation type is the suffix of the last non-"*" tag seen.
                "type": tag[5:],
                "arg1_index": arg1_index,
                "arg1_text": arg1_text,
                "arg1_begin_char": sent_char_offset[arg1_index],
                "arg1_end_char": sent_char_offset[arg1_index] + len(arg1_text),
                "arg2_index": arg2_index,
                "arg2_text": arg2_text,
                "arg2_begin_char": sent_char_offset[arg2_index],
                "arg2_end_char": sent_char_offset[arg2_index] + len(arg2_text),
            })
        formatted_predictions["relation_arcs"] = decoded_relation_arcs

    ### Format Coreference ###
    if "coref" in predictions:
        assert len(predictions["coref"]) == 1
        decoded_coref_arcs = []
        decoded_coref_clusters = []
        for cluster in predictions["coref"][0]:
            ## Format the clusters
            decoded_cluster = []
            for begin, end in cluster:
                begin_char, end_char = char_bounds(begin, end)
                decoded_cluster.append({
                    "begin": begin,
                    "end": end,
                    "begin_char": begin_char,
                    "end_char": end_char,
                    "tokenized_text": tokenized_text[begin:(end + 1)],
                    "text": input_text[begin_char:end_char],
                })
            decoded_coref_clusters.append(decoded_cluster)

            ## Format the arcs: one arc per pair of consecutive mentions
            for (begin1, end1), (begin2, end2) in zip(cluster, cluster[1:]):
                begin1_char, end1_char = char_bounds(begin1, end1)
                begin2_char, end2_char = char_bounds(begin2, end2)
                decoded_coref_arcs.append({
                    "mention1_begin": begin1,
                    "mention1_end": end1,
                    "mention1_begin_char": begin1_char,
                    "mention1_end_char": end1_char,
                    "tokenized_text1": tokenized_text[begin1:(end1 + 1)],
                    "text1": input_text[begin1_char:end1_char],
                    "mention2_begin": begin2,
                    "mention2_end": end2,
                    "mention2_begin_char": begin2_char,
                    "mention2_end_char": end2_char,
                    "tokenized_text2": tokenized_text[begin2:(end2 + 1)],
                    "text2": input_text[begin2_char:end2_char],
                })
        formatted_predictions["coref_arcs"] = decoded_coref_arcs
        formatted_predictions["coref_clusters"] = decoded_coref_clusters

    return formatted_predictions