Example #1
    def test_bioul_tags_to_spans(self):
        tag_sequence = ["B-PER", "I-PER", "L-PER", "U-PER", "U-LOC", "O"]
        spans = span_utils.bioul_tags_to_spans(tag_sequence)
        assert spans == [("PER", (0, 2)), ("PER", (3, 3)), ("LOC", (4, 4))]

        tag_sequence = ["B-PER", "I-PER", "O"]
        with pytest.raises(span_utils.InvalidTagSequence):
            spans = span_utils.bioul_tags_to_spans(tag_sequence)
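For reference, a minimal standalone sketch of the same call (assuming AllenNLP's module layout for span_utils; spans use inclusive token indices):

    from allennlp.data.dataset_readers.dataset_utils import span_utils

    # BIOUL: B = begin, I = inside, L = last, U = unit-length span, O = outside.
    spans = span_utils.bioul_tags_to_spans(["B-PER", "I-PER", "L-PER", "U-LOC"])
    assert spans == [("PER", (0, 2)), ("LOC", (3, 3))]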
Example #2
    def test_bioul_tags_to_spans_without_labels(self):
        tag_sequence = ["B", "I", "L", "U", "U", "O"]
        spans = span_utils.bioul_tags_to_spans(tag_sequence)
        assert spans == [("", (0, 2)), ("", (3, 3)), ("", (4, 4))]

        tag_sequence = ["B", "I", "O"]
        with pytest.raises(span_utils.InvalidTagSequence):
            spans = span_utils.bioul_tags_to_spans(tag_sequence)
Example #3
def get_spans(taglist, wordlist):
    # `available_entity_types_sciERC` is assumed to be defined at module scope.
    entities = {k: [] for k in available_entity_types_sciERC}
    spans = bioul_tags_to_spans(taglist)
    for enttype, (start, end) in spans:
        # Spans come back with inclusive ends; store [start, exclusive_end, text].
        entities[enttype].append([start, end + 1, " ".join(wordlist[start : end + 1])])

    return entities
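A hypothetical call to the helper above (assuming the SciERC entity-type list includes "Task" and "Method"):

    taglist = ["U-Task", "O", "B-Method", "L-Method"]
    wordlist = ["NER", "with", "neural", "networks"]
    entities = get_spans(taglist, wordlist)
    # entities["Task"]   == [[0, 1, "NER"]]
    # entities["Method"] == [[2, 4, "neural networks"]]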
Example #4
    def predict_contextual(self, sentence):
        # Run the contextual NER model and decode its BIOUL tags into token spans.
        cx_results = self.contextual_ner.predict(sentence)
        tokens = cx_results['words']

        cx_spans = bioul_tags_to_spans(cx_results['tags'])
        cx_spans = [(s, e + 1) for l, (s, e) in cx_spans]  # exclusive end indices, consistent with em

        return tokens, cx_spans
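The decoding step in isolation, with hypothetical tags (assuming bioul_tags_to_spans is imported as in the snippet; labels are discarded and the inclusive ends are shifted to exclusive):

    tags = ["B-PER", "L-PER", "O", "U-LOC"]
    spans = bioul_tags_to_spans(tags)            # [('PER', (0, 1)), ('LOC', (3, 3))]
    spans = [(s, e + 1) for l, (s, e) in spans]  # [(0, 2), (3, 4)]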
Example #5
    def __call__(self,
                 predictions: torch.Tensor,
                 gold_labels: torch.Tensor,
                 mask: Optional[torch.Tensor] = None,
                 prediction_map: Optional[torch.Tensor] = None):
        """
        Parameters
        ----------
        predictions : ``torch.Tensor``, required.
            A tensor of predictions of shape (batch_size, sequence_length, num_classes).
        gold_labels : ``torch.Tensor``, required.
            A tensor of integer class labels of shape (batch_size, sequence_length). It must be the same
            shape as the ``predictions`` tensor without the ``num_classes`` dimension.
        mask : ``torch.Tensor``, optional (default = None).
            A masking tensor the same size as ``gold_labels``.
        prediction_map : ``torch.Tensor``, optional (default = None).
            A tensor of size (batch_size, num_classes) which provides a mapping from the index of predictions
            to the indices of the label vocabulary. If provided, the output label at each timestep will be
            ``vocabulary.get_index_to_token_vocabulary(prediction_map[batch, argmax(predictions[batch, t])])``,
            rather than simply ``vocabulary.get_index_to_token_vocabulary(argmax(predictions[batch, t]))``.
            This is useful in cases where each Instance in the dataset is associated with a different possible
            subset of labels from a large label-space (e.g. FrameNet, where each frame has a different set of
            possible roles associated with it).
        """
        if mask is None:
            mask = torch.ones_like(gold_labels)

        predictions, gold_labels, mask, prediction_map = self.unwrap_to_tensors(
            predictions, gold_labels, mask, prediction_map)

        num_classes = predictions.size(-1)
        if (gold_labels >= num_classes).any():
            raise ConfigurationError(
                "A gold label passed to SpanBasedF1Measure contains an "
                "id >= {}, the number of classes.".format(num_classes))

        sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
        argmax_predictions = predictions.max(-1)[1]

        if prediction_map is not None:
            argmax_predictions = torch.gather(prediction_map, 1,
                                              argmax_predictions)
            gold_labels = torch.gather(prediction_map, 1, gold_labels.long())

        argmax_predictions = argmax_predictions.float()

        # Iterate over timesteps in batch.
        batch_size = gold_labels.size(0)
        for i in range(batch_size):
            sequence_prediction = argmax_predictions[i, :]
            sequence_gold_label = gold_labels[i, :]
            length = sequence_lengths[i]

            if length == 0:
                # It is possible to call this metric with sequences which are
                # completely padded. These contribute nothing, so we skip these rows.
                continue

            predicted_string_labels = [
                self._label_vocabulary[label_id]
                for label_id in sequence_prediction[:length].tolist()
            ]
            gold_string_labels = [
                self._label_vocabulary[label_id]
                for label_id in sequence_gold_label[:length].tolist()
            ]

            if self._label_encoding == "BIO":
                predicted_spans = bio_tags_to_spans(predicted_string_labels,
                                                    self._ignore_classes)
                gold_spans = bio_tags_to_spans(gold_string_labels,
                                               self._ignore_classes)
            elif self._label_encoding == "IOB1":
                predicted_spans = iob1_tags_to_spans(predicted_string_labels,
                                                     self._ignore_classes)
                gold_spans = iob1_tags_to_spans(gold_string_labels,
                                                self._ignore_classes)
            elif self._label_encoding == "BIOUL":
                predicted_spans = bioul_tags_to_spans(predicted_string_labels,
                                                      self._ignore_classes)
                gold_spans = bioul_tags_to_spans(gold_string_labels,
                                                 self._ignore_classes)

            predicted_spans = self._handle_continued_spans(predicted_spans)
            gold_spans = self._handle_continued_spans(gold_spans)

            for span in predicted_spans:
                if span in gold_spans:
                    self._true_positives[span[0]] += 1
                    gold_spans.remove(span)
                else:
                    self._false_positives[span[0]] += 1
            # These spans weren't predicted.
            for span in gold_spans:
                self._false_negatives[span[0]] += 1
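The tallying loop at the end is the heart of the metric; a standalone sketch with hypothetical spans, using plain Counters in place of the metric's internal state:

    from collections import Counter

    predicted = [("PER", (0, 2)), ("LOC", (4, 4))]
    gold = [("PER", (0, 2)), ("ORG", (6, 7))]

    true_positives, false_positives, false_negatives = Counter(), Counter(), Counter()
    unmatched_gold = list(gold)
    for span in predicted:
        if span in unmatched_gold:
            true_positives[span[0]] += 1   # exact (label, offsets) match
            unmatched_gold.remove(span)    # each gold span can match at most once
        else:
            false_positives[span[0]] += 1
    for span in unmatched_gold:            # gold spans never predicted
        false_negatives[span[0]] += 1
    # true_positives: {'PER': 1}; false_positives: {'LOC': 1}; false_negatives: {'ORG': 1}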
Example #6
    def text_to_instance(
            self,  # type: ignore
            tokens: List[Token],
            verb_label: List[int],
            parseTree: Tree,
            tags: List[str] = None,
            fout=None) -> Instance:
        """
        We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
        one-hot binary vector, the same length as the tokens, indicating the position of the verb
        to find arguments for.
        """
        # pylint: disable=arguments-differ

        # Convert the incoming BIO tags to BIOUL.

        if self.label_encoding == "BIOUL":
            if tags is not None:
                old_tags = deepcopy(tags)
                tags = to_bioul(tags, encoding="BIO")
                try:
                    # Validate the converted sequence; the spans themselves are unused here.
                    spans = bioul_tags_to_spans(tags)
                except InvalidTagSequence:
                    print(f"Old tags: {old_tags}")
                    print(f"New tags: {tags}\n")

            # Create span matrix from parse tree
            leftLabelsTree = leftMost(parseTree)
            rightLabelsTree = rightMost(parseTree)

            leftList = []
            rightList = []

            addToList(leftLabelsTree, leftList)
            addToList(rightLabelsTree, rightList)

            if len(leftList) != len(rightList):
                raise Exception(
                    f"For tree {parseTree}, leftList and rightList lengths do not match"
                )

            span_matrix = np.zeros([len(tokens), len(tokens)])

            for idx in range(len(leftList)):
                leftLabel, rightLabel = leftList[idx], rightList[idx]
                if leftLabel == rightLabel:
                    # Skip spans that start and end on the same token.
                    continue
                span_matrix[leftLabel, rightLabel] = 1

        # print(f"Tags after: {tags}\n")

        # print(tokens)
        # print(verb_label)
        # print(tags)

        fields: Dict[str, Field] = {}
        text_field = TextField(tokens, token_indexers=self._token_indexers)
        fields['tokens'] = text_field
        fields['verb_indicator'] = SequenceLabelField(verb_label, text_field)
        if self.label_encoding == "BIOUL":
            fields['span_matrix'] = ArrayField(span_matrix)

        if all([x == 0 for x in verb_label]):
            verb = None
        else:
            verb = tokens[verb_label.index(1)].text
        metadata_dict = {"words": [x.text for x in tokens], "verb": verb}
        if tags:
            fields['tags'] = SequenceLabelField(tags, text_field)
            metadata_dict["gold_tags"] = tags
        fields["metadata"] = MetadataField(metadata_dict)

        if fout is not None:
            # Dump the parse tree and span matrix (only defined for BIOUL encoding).
            srl_dict = {"parse_tree": parseTree, "span_matrix": span_matrix}
            pickle.dump(srl_dict, fout)

        return Instance(fields)
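For reference, a small illustration of the BIO-to-BIOUL conversion performed above (module path per AllenNLP's span_utils; the SRL-style tags are hypothetical):

    from allennlp.data.dataset_readers.dataset_utils.span_utils import to_bioul

    bio_tags = ["B-ARG0", "I-ARG0", "B-V", "O"]
    print(to_bioul(bio_tags, encoding="BIO"))
    # ['B-ARG0', 'L-ARG0', 'U-V', 'O']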
Example #7
    def format(self, predictions, sent_char_offset, input_text):
        tokenized_text = predictions["tokenized_text"]
        predicted_tasks = predictions.keys()

        formatted_predictions = {}
        formatted_predictions["tokenized_text"] = tokenized_text

        ### Format NER and EMD ###
        for task_name in ["ner", "emd"]:
            if task_name in predicted_tasks:
                decoded_bioul = []
                assert len(predictions[task_name]) == 1

                spans = bioul_tags_to_spans(predictions[task_name][0])
                for tag, (begin, end) in spans:
                    entity = {
                        "type": tag,
                        "begin_token": begin,
                        "end_token": end,
                        "begin_char": sent_char_offset[begin],
                        "end_char": sent_char_offset[end] + len(tokenized_text[end]),
                        "tokenized_text": tokenized_text[begin:(end + 1)],
                        "text": input_text[sent_char_offset[begin]:
                                           sent_char_offset[end] + len(tokenized_text[end])],
                    }
                    decoded_bioul.append(entity)

                formatted_predictions[task_name] = decoded_bioul

        ### Format Relation ###
        if "relation" in predicted_tasks:
            decoded_relation_arcs = []
            assert len(predictions["relation"]) == 1

            for i, relation in enumerate(predictions["relation"][0]):
                # Each relation row is expected to tag exactly one ARG1 and
                # one ARG2 position; every other position is "*".
                indices = find_indices(relation, lambda x: x != "*")
                for ind in indices:
                    tag = relation[ind]
                    if tag[:4] == "ARG1":
                        arg1_index, arg1_text = ind, tokenized_text[ind]
                    if tag[:4] == "ARG2":
                        arg2_index, arg2_text = ind, tokenized_text[ind]
                rel = {
                    "type": tag[5:],  # strip the leading "ARGn-" prefix
                    "arg1_index": arg1_index,
                    "arg1_text": arg1_text,
                    "arg1_begin_char": sent_char_offset[arg1_index],
                    "arg1_end_char": sent_char_offset[arg1_index] + len(arg1_text),
                    "arg2_index": arg2_index,
                    "arg2_text": arg2_text,
                    "arg2_begin_char": sent_char_offset[arg2_index],
                    "arg2_end_char": sent_char_offset[arg2_index] + len(arg2_text),
                }
                decoded_relation_arcs.append(rel)

            formatted_predictions["relation_arcs"] = decoded_relation_arcs

        ### Format Coreference ###
        if "coref" in predicted_tasks:
            decoded_coref_arcs = []
            decoded_coref_clusters = []
            assert len(predictions["coref"]) == 1

            for cluster in predictions["coref"][0]:
                ## Format the clusters
                decoded_cluster = []
                for mention in cluster:
                    begin, end = mention
                    m = {
                        "begin": begin,
                        "end": end,
                        "begin_char": sent_char_offset[begin],
                        "end_char": sent_char_offset[end] + len(tokenized_text[end]),
                        "tokenized_text": tokenized_text[begin:(end + 1)],
                        "text": input_text[sent_char_offset[begin]:
                                           sent_char_offset[end] + len(tokenized_text[end])],
                    }
                    decoded_cluster.append(m)
                decoded_coref_clusters.append(decoded_cluster)

                ## Format the arcs
                for i in range(len(cluster) - 1):
                    mention1_begin, mention1_end = cluster[i]
                    mention2_begin, mention2_end = cluster[i + 1]
                    coref_arc = {
                        "mention1_begin": mention1_begin,
                        "mention1_end": mention1_end,
                        "mention1_begin_char": sent_char_offset[mention1_begin],
                        "mention1_end_char": sent_char_offset[mention1_end]
                        + len(tokenized_text[mention1_end]),
                        "tokenized_text1": tokenized_text[mention1_begin:(mention1_end + 1)],
                        "text1": input_text[sent_char_offset[mention1_begin]:
                                            sent_char_offset[mention1_end]
                                            + len(tokenized_text[mention1_end])],
                        "mention2_begin": mention2_begin,
                        "mention2_end": mention2_end,
                        "mention2_begin_char": sent_char_offset[mention2_begin],
                        "mention2_end_char": sent_char_offset[mention2_end]
                        + len(tokenized_text[mention2_end]),
                        "tokenized_text2": tokenized_text[mention2_begin:(mention2_end + 1)],
                        "text2": input_text[sent_char_offset[mention2_begin]:
                                            sent_char_offset[mention2_end]
                                            + len(tokenized_text[mention2_end])],
                    }
                    decoded_coref_arcs.append(coref_arc)

            formatted_predictions["coref_arcs"] = decoded_coref_arcs
            formatted_predictions["coref_clusters"] = decoded_coref_clusters

        return formatted_predictions
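The character-offset arithmetic used throughout format assumes sent_char_offset maps each token index to that token's starting character in input_text, so sent_char_offset[end] + len(tokenized_text[end]) is an exclusive character end. A toy check with hypothetical inputs:

    input_text = "Alice visited Paris"
    tokenized_text = ["Alice", "visited", "Paris"]
    sent_char_offset = [0, 6, 14]  # starting character of each token

    begin, end = 2, 2  # token span covering "Paris"
    begin_char = sent_char_offset[begin]
    end_char = sent_char_offset[end] + len(tokenized_text[end])
    assert input_text[begin_char:end_char] == "Paris"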