Exemple #1
0
    def __get_span_to_id(column):
        """Map coreference spans in a CoNLL coreference column to set ids.

        The column uses bracket notation: ``(5`` opens set 5, ``5)`` closes
        it, ``(5)`` is a single-token mention, and ``-`` means no annotation.
        Parallel annotations on one token are separated by ``|``.

        Args:
            column (list(str)): The coreference column of a CoNLL document.

        Returns:
            dict(Span, int): A mapping of mention spans to coreference
            set ids.
        """
        span_to_id = {}

        # Per set id, the positions of brackets opened but not yet closed.
        open_brackets = defaultdict(list)

        for position, entry in enumerate(column):
            if entry == "-":
                continue

            for annotation in entry.split("|"):
                opens = annotation.startswith("(")
                closes = annotation.endswith(")")

                if opens and closes:
                    span_to_id[spans.Span(position, position)] = \
                        int(annotation[1:-1])
                elif opens:
                    open_brackets[annotation[1:]].append(position)
                elif closes:
                    set_id = annotation[:-1]
                    begin = open_brackets[set_id].pop()
                    span_to_id[spans.Span(begin, position)] = int(set_id)

        return span_to_id
Exemple #2
0
    def __extract_sentence_spans(in_sentence_ids):
        """Compute sentence spans from per-token in-sentence ids.

        A new sentence starts wherever the in-sentence id does not increase
        (ids restart from 0 at each sentence boundary).

        Args:
            in_sentence_ids (list(int)): For each token, its position within
                its sentence.

        Returns:
            list(Span): The document-level span of every sentence.
        """
        sentence_spans = []
        start = 0

        # Compare each id with its predecessor; a non-increase marks the
        # first token of a new sentence.
        for position, (previous, current) in enumerate(
                zip(in_sentence_ids, in_sentence_ids[1:]), 1):
            if current <= previous:
                sentence_spans.append(spans.Span(start, position - 1))
                start = position

        # Close the final (possibly only) sentence.
        sentence_spans.append(spans.Span(start, len(in_sentence_ids) - 1))

        return sentence_spans
Exemple #3
0
def get_relevant_subtree(span, document):
    """ Get the fragment of the parse tree and the input span.

    Args:
        span (Span): A span in a document.
        document (CoNLLDocument): A document.

    Returns:
        nltk.ParentedTree: The fragment of the parse tree at the span in the
        document.
    """
    # Positions of the span's tokens within their sentence.
    token_sentence_ids = document.in_sentence_ids[span.begin:span.end + 1]
    in_sentence_span = spans.Span(token_sentence_ids[0],
                                  token_sentence_ids[-1])

    sentence_id, _ = document.get_sentence_id_and_span(span)
    sentence_tree = document.parse[sentence_id]

    tree_position = sentence_tree.treeposition_spanning_leaves(
        in_sentence_span.begin, in_sentence_span.end + 1)
    subtree = sentence_tree[tree_position]

    # A bare leaf (token string) is not a tree: back off two levels to the
    # parent of the preterminal so a proper subtree is returned.
    if subtree in sentence_tree.leaves():
        subtree = sentence_tree[tree_position[:-2]]

    return subtree
Exemple #4
0
    def __init__(self, identifier, sentences, coref):
        """ Construct a document from sentence and coreference information.

        Args:
            identifier (str): A unique identifier for the document.
            sentences(list): A list of sentence information. The ith item
                contains information about the ith sentence. We assume that
                each ``sentences[i]`` is a 6-tuple
                ``tokens, pos, ner, speakers, parse, dep``, where

                * tokens (list(str)): All tokens in the sentence.
                * pos (list(str)): All part-of-speech tags in the sentence.
                * ner (list(str)): All named entity tags in the sentence (if a
                  token does not have a tag, the tag is set to NONE).
                * speakers (list(str)): All speaker ids in the sentence.
                * parse (str): A string representation of the sentence's parse
                  tree (should be readable by nltk)
                * dep (list(StanfordDependencies.CoNLL.Token): All dependencies
                  in the sentence represented as lists of tokens with label
                  information and pointers to heads.
            coref (dict(span, int)): A mapping of mention spans to their
            coreference set id.
        """
        self.identifier = identifier
        self.coref = coref

        # Flat, token-aligned document-level attributes.
        self.in_sentence_ids = []
        self.tokens = []
        self.pos = []
        self.ner = []
        self.speakers = []

        # Sentence-level attributes (one entry per sentence).
        self.sentence_spans = []
        self.parse = []
        self.dep = []

        for tokens, pos, ner, speakers, parse, dep in sentences:
            first = len(self.tokens)
            last = first + len(tokens) - 1

            self.in_sentence_ids.extend(range(len(tokens)))
            self.sentence_spans.append(spans.Span(first, last))

            self.tokens.extend(tokens)
            self.pos.extend(pos)
            self.ner.extend(ner)
            self.speakers.extend(speakers)

            self.parse.append(nltk.ParentedTree.fromstring(parse))
            self.dep.append(dep)

        self.annotated_mentions = self.__get_annotated_mentions()
        self.system_mentions = []
Exemple #5
0
    def test_adjust_head_for_nam(self):
        """Check head-adjustment heuristics for named entity mentions."""
        # (tokens, pos tags, NER type, expected head span, expected head).
        cases = [
            (["Khan", "Younes", "in", "the", "southern", "Ghaza", "Strip"],
             ["NNP", "NNS", "IN", "DT", "JJ", "NNP", "NNP"],
             "GPE", spans.Span(0, 1), ["Khan", "Younes"]),
            (["Walter", "Sisulu"],
             ["NNP", "NNP"],
             "PERSON", spans.Span(0, 1), ["Walter", "Sisulu"]),
            (["former", "vice", "president", "Robert", "W.", "Reedy"],
             ["JJ", "NN", "NN", "NNP", "NNP", "NNP"],
             "PERSON", spans.Span(1, 5),
             ["vice", "president", "Robert", "W.", "Reedy"]),
            (["Michael", "Wolf", ",", "a", "contributing", "editor"],
             ["NNP", "NNP", ",", "DT", "VBG", "NN"],
             "PERSON", spans.Span(0, 1), ["Michael", "Wolf"]),
            (["Mr.", "Clinton"],
             ["NNP", "NNP"],
             "NONE", spans.Span(0, 1), ["Mr.", "Clinton"]),
            (["Taiwan", "'s"],
             ["NNP", "POS"],
             "GPE", spans.Span(0, 0), ["Taiwan"]),
            (["Jim", "'s", "Tools"],
             ["NNP", "POS", "NNP"],
             "ORG", spans.Span(0, 2), ["Jim", "'s", "Tools"]),
            (["Taiwan", "'s", "False", "Cypresses"],
             ["NNP", "POS", "JJ", "NNP"],
             "NONE", spans.Span(0, 3),
             ["Taiwan", "'s", "False", "Cypresses"]),
        ]

        for tokens, pos, ner_type, head_span, head_tokens in cases:
            self.assertEqual(
                (head_span, head_tokens),
                head_finders.HeadFinder.adjust_head_for_nam(
                    tokens, pos, ner_type))
Exemple #6
0
def __extract_system_mention_spans(document):
    """Extract candidate mention spans from every sentence of a document.

    Args:
        document (CoNLLDocument): The document to extract mentions from.

    Returns:
        list(Span): All candidate mention spans (document-level), sorted.
    """
    mention_spans = []

    for sentence_id, sentence_span in enumerate(document.sentence_spans):
        tree = document.parse[sentence_id]
        ner_tags = document.ner[sentence_span.begin:sentence_span.end + 1]

        # Spans returned by the per-sentence extractor are sentence-local;
        # shift them by the sentence's document-level offset.
        offset = sentence_span.begin
        mention_spans.extend(
            spans.Span(offset + span.begin, offset + span.end)
            for span in __extract_mention_spans_for_sentence(tree, ner_tags))

    return sorted(mention_spans)
Exemple #7
0
def __get_in_tree_span(parented_tree):
    """Compute the leaf span of a subtree within its root tree.

    Args:
        parented_tree (nltk.ParentedTree): A subtree of some parse tree.

    Returns:
        Span: The span of the subtree's leaves, in leaf positions of the
        root tree.
    """
    begin = 0
    node = parented_tree

    # Walk up to the root, accumulating the number of leaves of all
    # left siblings at every level.
    while node.parent() is not None:
        parent = node.parent()

        for sibling in parent:
            if sibling == node:
                break
            begin += len(sibling.leaves())

        node = parent

    return spans.Span(begin, begin + len(parented_tree.leaves()) - 1)
Exemple #8
0
def get_modifier(mention):
    """Compute the modifiers of a mention.

    A token of the mention counts as a modifier if it lies outside the
    mention's head, is not a determiner, and its part-of-speech tag is
    neither POS nor IN.

    Args:
        mention (Mention): A mention with span information and "head_span",
            "tokens" and "pos" attributes.

    Returns:
        set(str): The lowercased modifier tokens of the mention.
    """
    # Translate the document-level head span to in-mention token positions.
    # Plain ints suffice here; only begin/end are ever used.
    head_begin = mention.attributes["head_span"].begin - mention.span.begin
    head_end = mention.attributes["head_span"].end - mention.span.begin

    # Hoisted out of the loop (the original rebuilt these lists per token);
    # set membership is also O(1).
    determiners = {"the", "this", "that", "those", "these", "a", "an"}
    excluded_pos = {"POS", "IN"}

    modifiers = set()

    for index, (token, pos) in enumerate(
            zip(mention.attributes["tokens"], mention.attributes["pos"])):
        lowered = token.lower()
        if (lowered not in determiners
                and pos not in excluded_pos
                and (index < head_begin or index > head_end)):
            modifiers.add(lowered)

    return modifiers
Exemple #9
0
def __get_span_from_ner(pos, ner):
    """Extract mention spans from named entity tags.

    Maximal runs of the same non-"NONE" NER tag become spans; a trailing
    possessive clitic (POS tag) is folded into the preceding name.

    Args:
        pos (list(str)): Part-of-speech tags, token-aligned with ``ner``.
        ner (list(str)): Named entity tags ("NONE" for untagged tokens).

    Returns:
        list(Span): The extracted spans, sorted.
    """
    spans_from_ner = []
    position = 0
    total = len(ner)

    while position < total:
        if ner[position] != "NONE":
            start = position

            # Extend over consecutive tokens carrying the same NER tag.
            while (position + 1 < total
                   and ner[position + 1] != "NONE"
                   and ner[position] == ner[position + 1]):
                position += 1

            # A trailing possessive clitic belongs to the name
            # ("Taiwan 's" rather than just "Taiwan").
            if position + 1 < len(pos) and pos[position + 1] == "POS":
                position += 1

            spans_from_ner.append(spans.Span(start, position))

        position += 1

    return sorted(spans_from_ner)
Exemple #10
0
    def __generate_html_for_errors(self, document, mentions):
        """Render a document as an HTML list of sentences with mention
        <span> markup, and fill the gold/system navigation panels.

        Args:
            document (CoNLLDocument): The document to render.
            mentions (list(Mention)): Mentions to mark up, sorted by span.

        Returns:
            str: The HTML for the document body.
        """
        document_html = "\n\t\t\t<ol class=\"text\">\n" \
                        "\t\t\t\t<li class=\"sentence\">"

        self.navi["gold"] = "\n\t\t\t\t<div class=\"goldNavi\">" \
                            "<h3>Reference Entities</h3>" \
                            "<span class=\"tease\">show all</span>" \
                            "\n\t\t\t\t\t<ul>"

        self.navi["system"] = "\n\t\t\t\t<div class=\"systemNavi\">" \
                              "<h3>System Entities</h3>" \
                              "<span class=\"tease\">show all</span>" \
                              "\n\t\t\t\t\t<ul>"

        chains = set()

        index = 0

        sentence_id, sentence_span = document.get_sentence_id_and_span(
            spans.Span(0, 0))

        annotated_mentions = set(document.annotated_mentions)

        for token in document.tokens:
            token = html_escape(token, True)

            mention_id = 0

            mention_text = ""

            processed_gold_mentions = set()

            for mention in mentions:
                # Mentions are sorted by span: once a mention starts after
                # the current token, no later one can cover it.
                if mention.span.begin > index:
                    break

                if mention.span.end < index:
                    mention_id += 1
                    continue

                mention_tokens = html_escape(
                    " ".join(mention.attributes['tokens']), True)

                mention_head = html_escape(
                    " ".join(mention.attributes['head']), True)

                mention_type = html_escape("".join(mention.attributes['type']),
                                           True)

                mention_span = str(mention.span)

                # The first occurrence of an annotated mention is rendered as
                # gold; duplicates count as system output.
                if mention in annotated_mentions and \
                   mention not in processed_gold_mentions:
                    system = "gold"

                    processed_gold_mentions.add(mention)
                else:
                    system = "system"

                chain_id = system + str(mention.attributes['annotated_set_id'])

                if chain_id not in chains:
                    self.navi[system] += "\n\t\t\t\t\t\t<li class=\"" + \
                                         chain_id +\
                                         "\">" + mention_tokens + "</li>"
                    chains.add(chain_id)

                if chain_id not in self.chain_to_colour:
                    # Cycle deterministically through the fixed colour panel
                    # (the original wrapped this in a degenerate one-pass
                    # `while True` loop).
                    colour = self.colours_panel[self.colour_id]
                    self.colour_id += 1
                    if self.colour_id == len(self.colours_panel):
                        self.colour_id = 0

                    self.chain_to_colour[chain_id] = colour

                span_id = document.get_html_friendly_identifier() + "_" + \
                          str(mention_id)

                temp_text = "<span " \
                            "id=\"" + span_id + "\" " \
                            "class=\"" + chain_id + " mention\" " \
                            "data-mentiontype=\"" + mention_type + "\" " \
                            "data-mentionhead=\"" + mention_head + "\" " \
                            "data-span=\"" + mention_span + "\">"

                # Open and/or close the mention's <span> around this token.
                if mention.span.begin == index and mention.span.end == index:
                    if mention_text.endswith("</span> "):
                        mention_text = temp_text + mention_text.strip() + \
                            "</span> "
                    elif mention_text == "":
                        mention_text = temp_text + token + "</span> "
                elif mention.span.begin == index:
                    if mention_text == "":
                        mention_text = temp_text + token + " "
                    else:
                        mention_text = temp_text + mention_text
                elif mention.span.end == index:
                    if mention_text == "":
                        mention_text = token + "</span> "
                    else:
                        mention_text = mention_text.strip() + "</span> "

                mention_id += 1

            if mention_text == "":
                mention_text = token + " "

            token_span = spans.Span(index, index)

            # Close the previous sentence and open a new <li> whenever the
            # token belongs to a different sentence than its predecessor.
            if document.get_sentence_id_and_span(token_span) is None or \
                    sentence_span != document.get_sentence_id_and_span(
                            token_span)[1]:
                mention_text = "</li>\n" \
                               "\t\t\t\t<li class=\"sentence\">" + mention_text

                sentence_id, sentence_span = document.get_sentence_id_and_span(
                    token_span)

            document_html += mention_text

            index += 1

        # (A former bare `document_html.strip()` was a no-op -- its result
        # was discarded -- and has been removed.)
        return document_html + "</li>\n\t\t\t</ol>"
Exemple #11
0
    def from_document(span, document, first_in_gold_entity=False):
        """
        Create a mention from a span in a document.

        All attributes of the mention are computed from the linguistic
        information found in the document. For information about the
        attributes, see the class documentation.

        Args:
            document (CoNLLDocument): The document the mention belongs to.
            span (Span): The span of the mention in the document.
            first_in_gold_entity (bool): Whether this mention is the first
                mention in its gold (annotated) entity. Defaults to False.

        Returns:
            Mention: A mention extracted from the input span in the input
            document.
        """

        # Sentence containing the span, and that sentence's document-level
        # span (needed later to index into the sentence's dependency tree).
        i, sentence_span = document.get_sentence_id_and_span(span)

        # Surface-level attributes read directly off the document.
        attributes = {
            "tokens": document.tokens[span.begin:span.end + 1],
            "pos": document.pos[span.begin:span.end + 1],
            "ner": document.ner[span.begin:span.end + 1],
            "sentence_id": i,
            "parse_tree": mention_property_computer.get_relevant_subtree(
                span, document),
            "speaker": document.speakers[span.begin],
            "antecedent": None,
            "set_id": None,
            "first_in_gold_entity": first_in_gold_entity
        }

        # Gold coreference set id, if this span is annotated.
        if span in document.coref:
            attributes["annotated_set_id"] = document.coref[span]
        else:
            attributes["annotated_set_id"] = None

        # Derived attributes; is_apposition must precede head computation,
        # which handles appositions specially.
        attributes["is_apposition"] = \
            mention_property_computer.is_apposition(attributes)

        attributes["grammatical_function"] = \
            mention_property_computer.get_grammatical_function(attributes)

        (head, in_mention_span, head_index) = \
            mention_property_computer.compute_head_information(attributes)

        attributes["head"] = head
        # Translate the in-mention head span to document-level positions.
        attributes["head_span"] = spans.Span(
            span.begin + in_mention_span.begin,
            span.begin + in_mention_span.end
        )
        attributes["head_index"] = head_index

        attributes["type"] = mention_property_computer.get_type(attributes)
        attributes["fine_type"] = mention_property_computer.get_fine_type(
            attributes)

        # Citation form (e.g. "he" for "him") only applies to pronouns.
        if attributes["type"] == "PRO":
            attributes["citation_form"] = \
                mention_property_computer.get_citation_form(
                    attributes)

        attributes["number"] = \
            mention_property_computer.compute_number(attributes)
        attributes["gender"] = \
            mention_property_computer.compute_gender(attributes)

        attributes["semantic_class"] = \
            mention_property_computer.compute_semantic_class(attributes)

        attributes["head_as_lowercase_string"] = " ".join(attributes[
            "head"]).lower()

        attributes["tokens_as_lowercase_string"] = " ".join(attributes[
            "tokens"]).lower()

        # Dependency information for the head token. `index` is the head's
        # position within its sentence's dependency tree.
        dep_tree = document.dep[i]

        index = span.begin + head_index - sentence_span.begin

        # CoNLL dependency heads are 1-indexed; head == 0 (-1 after the
        # shift) marks the root, i.e. no governor.
        governor_id = dep_tree[index].head - 1

        if governor_id == -1:
            attributes["governor"] = "NONE"
        else:
            attributes["governor"] = dep_tree[governor_id].form.lower()

        attributes["ancestry"] = Mention._get_ancestry(dep_tree, index)

        attributes["deprel"] = dep_tree[index].deprel

        return Mention(document, span, attributes)
Exemple #12
0
    def __generate_html_for_raw(self, document, mentions):
        """Render a document as an HTML list of sentences with mention
        <span> markup for system output on raw text.

        Unlike error rendering, every mention is treated as system output,
        colours are drawn randomly, and singleton chains are neither
        coloured nor listed in the navigation.

        Args:
            document (CoNLLDocument): The document to render.
            mentions (list(Mention)): Mentions to mark up, sorted by span.

        Returns:
            str: The HTML for the document body.
        """
        document_html = "\n\t\t\t<ol class=\"text\">\n" \
                        "\t\t\t\t<li class=\"sentence\">"

        self.navi["gold"] = "\n\t\t\t\t<div class=\"goldNavi\">" \
                            "<h3>Reference Entities</h3>" \
                            "<span class=\"tease\">show all</span>" \
                            "\n\t\t\t\t\t<ul>"

        self.navi["system"] = "\n\t\t\t\t<div class=\"systemNavi\">" \
                              "<h3>System Entities</h3>" \
                              "<span class=\"tease\">show all</span>" \
                              "\n\t\t\t\t\t<ul>"

        chains = set()

        index = 0

        sentence_id, sentence_span = document.get_sentence_id_and_span(
            spans.Span(0, 0))

        annotated_mentions = set(document.annotated_mentions)

        # Navigation entries are buffered here and only flushed for
        # non-singleton chains at the end.
        temp_navi = {"gold": {}, "system": {}}

        # Chain sizes, used to skip singleton chains below.
        chain_counter = {
            "gold": collections.Counter(),
            "system": collections.Counter()
        }

        for system in ["gold", "system"]:
            for mention in annotated_mentions:
                chain_counter[system].update(
                    [system + str(mention.attributes["annotated_set_id"])])

        for token in document.tokens:
            token = html_escape(token, True)

            mention_id = 0

            mention_text = ""

            processed_gold_mentions = set()

            for mention in mentions:
                # Mentions are sorted by span: once a mention starts after
                # the current token, no later one can cover it.
                if mention.span.begin > index:
                    break

                if mention.span.end < index:
                    mention_id += 1
                    continue

                mention_tokens = html_escape(
                    " ".join(mention.attributes['tokens']), True)

                mention_head = html_escape(
                    " ".join(mention.attributes['head']), True)

                mention_type = html_escape("".join(mention.attributes['type']),
                                           True)

                mention_span = str(mention.span)

                # Raw-text rendering treats everything as system output.
                system = "system"

                chain_id = system + str(mention.attributes['annotated_set_id'])

                if chain_id not in chains:
                    temp_navi[system][chain_id] = "\n\t\t\t\t\t\t<li " \
                                                  "class=\"" + \
                                         chain_id +\
                                         "\">" + mention_tokens + "</li>"
                    chains.add(chain_id)

                if chain_id not in self.chain_to_colour:
                    # Draw random light colours until an unused one is found.
                    while True:
                        r = lambda: randint(170, 255)
                        colour = '#%02X%02X%02X' % (r(), r(), r())
                        if colour not in self.colours:
                            self.colours.append(colour)
                            break

                    self.chain_to_colour[chain_id] = colour

                span_id = document.get_html_friendly_identifier() + "_" + \
                          str(mention_id)

                style = ""

                # Only colour mentions belonging to non-singleton chains.
                if chain_counter[system][chain_id] > 1:
                    style = "style=\"background-color:" + self.chain_to_colour[
                        chain_id] + "\" "

                temp_text = "<span " \
                            "id=\"" + span_id + "\" " \
                            "class=\"" + chain_id + " mention\" "  + \
                            style + \
                            "data-mentiontype=\"" + mention_type + "\" " \
                            "data-mentionhead=\"" + mention_head + "\" " \
                            "data-span=\"" + mention_span + "\">"

                # Open and/or close the mention's <span> around this token.
                if mention.span.begin == index and mention.span.end == index:
                    if mention_text.endswith("</span> "):
                        mention_text = temp_text + mention_text.strip() + \
                            "</span> "
                    elif mention_text == "":
                        mention_text = temp_text + token + "</span> "
                elif mention.span.begin == index:
                    if mention_text == "":
                        mention_text = temp_text + token + " "
                    else:
                        mention_text = temp_text + mention_text
                elif mention.span.end == index:
                    if mention_text == "":
                        mention_text = token + "</span> "
                    else:
                        mention_text = mention_text.strip() + "</span> "

                mention_id += 1

            if mention_text == "":
                mention_text = token + " "

            token_span = spans.Span(index, index)

            # Close the previous sentence and open a new <li> whenever the
            # token belongs to a different sentence than its predecessor.
            if document.get_sentence_id_and_span(token_span) is None or \
                    sentence_span != document.get_sentence_id_and_span(
                            token_span)[1]:
                mention_text = "</li>\n" \
                               "\t\t\t\t<li class=\"sentence\">" + mention_text

                sentence_id, sentence_span = document.get_sentence_id_and_span(
                    token_span)

            document_html += mention_text

            index += 1

        # (A former bare `document_html.strip()` was a no-op -- its result
        # was discarded -- and has been removed.)

        # Only list non-singleton system chains in the navigation.
        for system in ["system"]:
            for key, val in chain_counter[system].items():
                if val > 1:
                    self.navi[system] += temp_navi[system][key]

        return document_html + "</li>\n\t\t\t</ol>"
Exemple #13
0
    def adjust_head_for_nam(tokens, pos, ner_type,
                            in_mention_span_old_head=None, old_head=None):
        """
        Adjust head for proper names via heuristics.

        Based on heuristics depending on the named entity type (person,
        organization, ...) and part-of-speech tags, adjust the head of a
        named entity mention to a meaningful extent useful for coreference
        resolution.

        For example, for the mention "Khan Younes in Southern Gaza Strip",
        this function will compute "Khan Younes" as the head.

        Args:
            tokens (list(str)): The tokens of the mention.
            pos (list(str)): The part-of-speech tags of the mention.
            ner_type (str): The named entity type of the mention. Should be
                one of PERSON, ORG, GPE, FAC, NORP, PRODUCT, EVENT, MONEY,
                WORK_OF_ART, LOC, LAW, LANGUAGE, DATE, TIME, ORDINAL,
                CARDINAL, QUANTITY, PERCENT or NONE.
            in_mention_span_old_head (spans.Span): The in-mention span of the
                old head. Defaults to None. Only returned as a fallback when
                no adjustment rule exists for ``ner_type``, so callers with a
                known NER class may omit it.
            old_head (list(str)): The tokens of the old head. Defaults to
                None; same fallback role as ``in_mention_span_old_head``.

        Returns:
            (Span, list(str)): The in-mention span of the adjusted head and
                the tokens of the adjusted head. (For empty ``pos`` input the
                string "NOHEAD" is returned instead of a token list.)
        """
        # TODO: get rid of this ugly hack
        if len(pos) == 0:
            return spans.Span(0, 0), "NOHEAD"

        # Default stop condition, overridden per NER class below. Raw
        # strings avoid the invalid "\." escape in plain string literals.
        stop_regex = re.compile(r"CC|,|\.|:|;|V.*|IN|W.*|ADVP|NN$")

        if re.match(
                "ORG.*|GPE.*|FAC.*|NORP.*|PRODUCT|EVENT|MONEY|" +
                "WORK_OF_ART|LOC.*|LAW|LANGUAGE", ner_type):
            start_regex = re.compile("NN(S)?|NNP(S)?")
            stop_regex = re.compile("V.*|IN|W.*|ADVP|,|-LRB-")
        elif ner_type == "PERSON":
            start_regex = re.compile("NN(S)?|NNP(S)?")
            stop_regex = re.compile(r"IN|CC|,|\.|:|;|V.*|W.*|-LRB-")
        elif re.match("DATE|TIME", ner_type):
            start_regex = re.compile("NN(S)?|NNP(S)?|CD")
        elif re.match("ORDINAL", ner_type):
            start_regex = re.compile("NN|JJ|RB")
        elif re.match("CARDINAL", ner_type):
            start_regex = re.compile("CD")
        elif re.match("QUANTITY|PERCENT", ner_type):
            start_regex = re.compile("CD|JJ|NN")
        elif ner_type == "NONE":
            start_regex = re.compile("NN(S)?|NNP(S)?|CD")
        else:
            # Unknown NER class: fall back to the unadjusted head (None if
            # the caller did not supply one).
            logger.warning("No head adjustment rule defined for NER class " +
                           ner_type + ".")
            return in_mention_span_old_head, old_head

        head_start = -1

        position = 0

        # Scan left-to-right: the head begins at the first start-tag match
        # and ends just before the first subsequent stop-tag match.
        for i in range(0, len(tokens)):
            position = i
            if head_start == -1 and start_regex.match(pos[i]):
                head_start = i
            elif head_start >= 0 and stop_regex.match(pos[i]):
                return spans.Span(head_start, i - 1), tokens[head_start:i]

        if head_start == -1:
            head_start = 0

        # Drop a trailing possessive clitic ("Taiwan 's" -> "Taiwan").
        if pos[position] == "POS" and position == len(pos) - 1:
            position -= 1

        return spans.Span(head_start,
                          position), tokens[head_start:position + 1]
    def test_post_process_same_head_largest_span(self):
        """When several mentions share the same head span, post-processing
        should keep only the mention with the largest span."""
        # Scenario 1: NOM mentions (0, 3) and (0, 6) share head (3, 3);
        # only the larger span (0, 6) should survive.
        all_mentions = {
            mentions.Mention(
                None, spans.Span(0, 3), {
                    "tokens": [],
                    "type": "NOM",
                    "head_index": 0,
                    "head_span": spans.Span(3, 3)
                }),
            mentions.Mention(
                None, spans.Span(0, 6), {
                    "tokens": [],
                    "type": "NOM",
                    "head_index": 0,
                    "head_span": spans.Span(3, 3)
                }),
            mentions.Mention(
                None, spans.Span(0, 2), {
                    "tokens": [],
                    "type": "NOM",
                    "head_index": 0,
                    "head_span": spans.Span(1, 1)
                }),
            mentions.Mention(
                None, spans.Span(5, 6), {
                    "tokens": [],
                    "type": "NOM",
                    "head_index": 0,
                    "head_span": spans.Span(5, 6)
                }),
            mentions.Mention(
                None, spans.Span(0, 0), {
                    "tokens": [],
                    "type": "NOM",
                    "head_index": 0,
                    "head_span": spans.Span(0, 0)
                })
        }

        # Same mentions minus the dropped (0, 3).
        expected_mentions = sorted([
            mentions.Mention(
                None, spans.Span(0, 6), {
                    "tokens": [],
                    "type": "NOM",
                    "head_index": 0,
                    "head_span": spans.Span(3, 3)
                }),
            mentions.Mention(
                None, spans.Span(0, 2), {
                    "tokens": [],
                    "type": "NOM",
                    "head_index": 0,
                    "head_span": spans.Span(1, 1)
                }),
            mentions.Mention(
                None, spans.Span(5, 6), {
                    "tokens": [],
                    "type": "NOM",
                    "head_index": 0,
                    "head_span": spans.Span(5, 6)
                }),
            mentions.Mention(
                None, spans.Span(0, 0), {
                    "tokens": [],
                    "type": "NOM",
                    "head_index": 0,
                    "head_span": spans.Span(0, 0)
                })
        ])

        self.assertEqual(
            expected_mentions,
            mention_extractor.post_process_same_head_largest_span(
                all_mentions))

        # Scenario 2: NAM mentions. "Taiwan" (0, 0) and "Taiwan 's" (0, 1)
        # share head (0, 0); "CCP" (3, 3) and "the CCP" (2, 3) share head
        # (3, 3). In both cases the larger span should win.
        all_mentions_2 = {
            mentions.Mention(
                None, spans.Span(0, 1), {
                    "tokens": ["Taiwan", "'s"],
                    "type": "NAM",
                    "head_index": 0,
                    "head_span": spans.Span(0, 0)
                }),
            mentions.Mention(
                None, spans.Span(0, 0), {
                    "tokens": ["Taiwan"],
                    "type": "NAM",
                    "head_index": 0,
                    "head_span": spans.Span(0, 0)
                }),
            mentions.Mention(
                None, spans.Span(2, 3), {
                    "tokens": ["the", "CCP"],
                    "type": "NAM",
                    "head_index": 1,
                    "head_span": spans.Span(3, 3)
                }),
            mentions.Mention(
                None, spans.Span(3, 3), {
                    "tokens": ["CCP"],
                    "type": "NAM",
                    "head_index": 0,
                    "head_span": spans.Span(3, 3)
                })
        }

        expected_mentions_2 = sorted([
            mentions.Mention(
                None, spans.Span(0, 1), {
                    "tokens": ["Taiwan", "'s"],
                    "type": "NAM",
                    "head_index": 0,
                    "head_span": spans.Span(0, 0)
                }),
            mentions.Mention(
                None, spans.Span(2, 3), {
                    "tokens": ["the", "CCP"],
                    "type": "NAM",
                    "head_index": 1,
                    "head_span": spans.Span(3, 3)
                }),
        ])

        self.assertEqual(
            expected_mentions_2,
            mention_extractor.post_process_same_head_largest_span(
                all_mentions_2))
Exemple #15
0
def compute_head_information(attributes):
    """ Compute the head of the mention.

    Args:
        attributes (dict(str, object)): Attributes of the mention, must contain
            values for "tokens", "parse_tree", "pos", "ner", "is_apposition"

    Returns:
        (list(str), Span, int): The head, the head span (in the document) and
        the starting index of the head (in the mention).
    """
    mention_subtree = attributes["parse_tree"]

    head_finder = head_finders.HeadFinder()

    # Default: take the first token as head. This default is kept when the
    # parse tree does not cover exactly the mention's tokens.
    head_index = 0
    head = [attributes["tokens"][0]]

    if len(mention_subtree.leaves()) == len(attributes["tokens"]):
        head_tree = head_finder.get_head(mention_subtree)
        head_index = get_head_index(head_tree, mention_subtree.pos())
        head = [head_tree[0]]

    in_mention_span = spans.Span(head_index, head_index)

    # Proper-noun heads are adjusted via the NER tag of the head token
    # (see HeadFinder.adjust_head_for_nam); this may widen the head span.
    if attributes["pos"][head_index].startswith("NNP"):
        in_mention_span, head = \
            head_finders.HeadFinder.adjust_head_for_nam(
                attributes["tokens"],
                attributes["pos"],
                attributes["ner"][head_index])

    # proper name mention: head index last word of head
    # (e.g. "Obama" in "Barack Obama")
    head_index = in_mention_span.end

    # special handling for appositions
    if attributes["is_apposition"]:
        # "Secretary of State Madeleine Albright"
        # => take "Madeleine Albright" as head
        if len(mention_subtree) == 2:
            head_tree = mention_subtree[1]
            head = head_tree.leaves()
            in_mention_span = spans.Span(len(mention_subtree[0].leaves()),
                                         len(attributes["tokens"]) - 1)
            head_index = in_mention_span.end
        else:
            # More than two children: scan for the first child whose head
            # POS starts with "NNP" and adjust its head for named entities.
            start = 0
            for child in mention_subtree:
                if __head_pos_starts_with(child, "NNP"):
                    # Clamp the end so slicing never exceeds the mention.
                    end = min([
                        start + len(child.leaves()),
                        len(attributes["tokens"])
                    ])
                    head_index = end - 1
                    in_mention_span, head = \
                        head_finders.HeadFinder.adjust_head_for_nam(
                            attributes["tokens"][start:end],
                            attributes["pos"][start:end],
                            attributes["ner"][head_index])
                    break
                start += len(child.leaves())

    return head, in_mention_span, head_index
    def setUp(self):
        """Build gold and system mention clusters used by the tests."""
        def make(begin, end, attrs):
            # Shorthand for constructing a mention over a token span.
            return mentions.Mention(None, spans.Span(begin, end), attrs)

        self.gold_first_cluster = [
            make(begin, end, {
                "tokens": toks,
                "type": mention_type,
                "annotated_set_id": 0
            })
            for begin, end, toks, mention_type in [
                (0, 0, ["a"], "NOM"),
                (1, 1, ["US"], "NAM"),
                (2, 3, ["angry", "salesman"], "PRO"),
                (4, 5, ["the", "rainbow"], "NAM"),
                (5, 6, ["and", "far"], "NOM"),
                (7, 7, ["neypmd"], "NOM"),
            ]
        ]

        self.gold_second_cluster = [
            make(begin, end, {
                "type": mention_type,
                "annotated_set_id": 1
            })
            for begin, end, mention_type in [
                (7, 8, "NOM"),
                (9, 9, "NAM"),
                (10, 10, "PRO"),
            ]
        ]

        self.system1_mentions = [
            make(begin, end, {"set_id": set_id})
            for begin, end, set_id in [
                (0, 0, 0),
                (2, 3, 0),
                (4, 5, 2),
                (5, 6, 2),
                (3, 4, 1),
                (7, 8, 1),
            ]
        ]

        self.system2_cluster = [
            make(begin, end, {"tokens": toks, "set_id": 0})
            for begin, end, toks in [
                (0, 0, ["a"]),
                (2, 3, ["angry", "salesman"]),
                (7, 8, ["snafu", "foo"]),
                (9, 9, ["bar"]),
            ]
        ]

        # Wire antecedent decisions within the second system cluster.
        for anaphor, antecedent in [(1, 0), (2, 0), (3, 2)]:
            self.system2_cluster[anaphor].attributes["antecedent"] = \
                self.system2_cluster[antecedent]

        self.maxDiff = None
Exemple #17
0
    def run_on_doc(self, doc_file, name=None):
        """Preprocess one raw document and convert it into a Document.

        Args:
            doc_file (file): An open file-like object with the document
                text. When ``self.with_coref`` is set, the text is parsed
                as markup and mention/coreference annotation is read from
                ``<mention>`` tags.
            name (str): A name for the document. Defaults to
                ``doc_file.name``.

        Returns:
            Document: The preprocessed document. When ``self.with_coref``
            is set, annotated mentions carry set ids and antecedent
            decisions read from the markup.
        """
        if self.with_coref:
            # Parse markup; the plain text is fed to the preprocessor,
            # mention annotation is read from the soup further below.
            soup = bs4.BeautifulSoup(doc_file.read())
            preprocessed = self.proc.parse_doc(soup.text)
        else:
            data = doc_file.read()
            preprocessed = self.proc.parse_doc(data)

        sentences = []

        for sentence in preprocessed["sentences"]:
            # Normalize NER labels: map "O" and "MISC" to "NONE".
            processed_ner = []
            for ner in sentence["ner"]:
                if ner == "O" or ner == "MISC":
                    processed_ner.append("NONE")
                else:
                    processed_ner.append(ner)

            processed_dep = []

            # Index basic dependencies by in-sentence token position.
            index_to_dep_info = {}
            for dep_info in sentence["deps_basic"]:
                label, head, in_sent_index = dep_info
                index_to_dep_info[in_sent_index] = label, head

            # Build one CoNLL token per word; indices are 1-based in
            # CoNLL, hence the +1 offsets.
            for i in range(0, len(sentence["tokens"])):
                if i in index_to_dep_info.keys():
                    label, head = index_to_dep_info[i]
                    processed_dep.append(
                        CoNLL.Token(
                            form=sentence["tokens"][i],
                            lemma=sentence["lemmas"][i],
                            pos=sentence["pos"][i],
                            index=i+1,
                            head=head+1,
                            deprel=label,
                            cpos=None,
                            feats=None,
                            phead=None,
                            pdeprel=None,
                            extra=None
                        )
                    )
                else:
                    # Tokens without dependency info are attached to the
                    # root with a "punc" relation.
                    processed_dep.append(
                        CoNLL.Token(
                            form=sentence["tokens"][i],
                            lemma=sentence["lemmas"][i],
                            pos=sentence["pos"][i],
                            index=i+1,
                            head=0,
                            deprel="punc",
                            cpos=None,
                            feats=None,
                            phead=None,
                            pdeprel=None,
                            extra=None
                        )
                    )

            # The "-" column is a placeholder (no per-token coreference
            # annotation in this representation).
            sentences.append(
                (sentence["tokens"],
                 sentence["pos"],
                 processed_ner,
                 ["-"]*len(sentence["tokens"]),
                 sentence["parse"],
                 processed_dep,
                )
            )

        if not name:
            name = doc_file.name

        if self.with_coref:
            antecedent_decisions = {}
            coref = {}

            mention_id_to_spans = {}

            # Mentions without an explicit entity id get fresh singleton
            # ids above the largest annotated entity id.
            max_entity = 0

            for mention in soup.findAll("mention"):
                if mention.get("entity"):
                    max_entity = max(max_entity, int(mention.get("entity")))

            for mention in soup.findAll("mention"):
                mention_id = int(mention.get("id"))

                span = spans.Span(int(mention.get("span_start")),
                                  int(mention.get("span_end")))

                mention_id_to_spans[mention_id] = span

                if mention.get("entity"):
                    annotated_set_id = int(mention.get("entity"))
                else:
                    annotated_set_id = max_entity + 1 + mention_id

                coref[span] = annotated_set_id

                # NOTE(review): assumes antecedents always precede their
                # anaphors in document order; otherwise this KeyErrors.
                if mention.get("antecedent"):
                    antecedent_decisions[span] = mention_id_to_spans[
                        int(mention.get("antecedent"))
                    ]

            doc = documents.Document(
                name,
                sentences,
                coref)

            # Transfer antecedent decisions onto the document's
            # annotated mentions (matched by span).
            spans_to_annotated_mentions = {}

            for mention in doc.annotated_mentions:
                spans_to_annotated_mentions[mention.span] = mention

            for span in antecedent_decisions:
                ante_span = antecedent_decisions[span]
                ana = spans_to_annotated_mentions[span]
                ante = spans_to_annotated_mentions[ante_span]
                ana.attributes["antecedent"] = ante
        else:
            doc = documents.Document(
                name,
                sentences,
                {})

        return doc
    def test_post_process_embedded_head_largest_span(self):
        """Post-processing keeps only the mention with the largest span
        among mentions with embedded heads."""
        def nom_mention(span_pair, head_pair):
            # All fixtures share type/tokens; only spans vary.
            return mentions.Mention(
                None, spans.Span(*span_pair), {
                    "tokens": [],
                    "type": "NOM",
                    "head_index": 0,
                    "head_span": spans.Span(*head_pair)
                })

        all_mentions_1 = {
            nom_mention(span_pair, head_pair)
            for span_pair, head_pair in [
                ((0, 3), (3, 3)),
                ((0, 6), (2, 3)),
                ((0, 2), (1, 1)),
                ((5, 6), (5, 6)),
            ]
        }

        expected_mentions_1 = sorted([
            nom_mention(span_pair, head_pair)
            for span_pair, head_pair in [
                ((0, 6), (2, 3)),
                ((0, 2), (1, 1)),
                ((5, 6), (5, 6)),
            ]
        ])

        self.assertEqual(
            expected_mentions_1,
            mention_extractor.post_process_embedded_head_largest_span(
                all_mentions_1))
Exemple #19
0
    def setUp(self):
        """Create two gold clusters and one system cluster of mentions."""
        def build(entries):
            # Each entry: (span begin, span end, tokens, set id).
            return [
                mentions.Mention(
                    None,
                    spans.Span(begin, end),
                    {"tokens": toks, "annotated_set_id": set_id})
                for begin, end, toks, set_id in entries
            ]

        self.first_cluster = build([
            (0, 0, ["a"], 0),
            (1, 1, ["b"], 0),
            (2, 3, ["c", "d"], 0),
            (4, 5, ["e", "f"], 0),
            (5, 6, ["f", "g"], 0),
            (7, 7, ["h"], 0),
        ])

        self.second_cluster = build([
            (3, 4, ["d", "e"], 1),
            (7, 8, ["h", "i"], 1),
            (10, 10, ["k"], 1),
        ])

        self.system_cluster = build([
            (0, 0, ["a"], 0),
            (2, 3, ["c", "d"], 0),
            (4, 5, ["e", "f"], 2),
            (5, 6, ["f", "g"], 2),
            (7, 7, ["h"], 1),
            (10, 10, ["k"], 1),
        ])

        self.maxDiff = None
    def test_extract_system_mentions(self):
        """Check the spans of extracted system mentions, with and without
        mention filtering."""
        def extracted_spans(document, filter_mentions):
            # The first extracted mention is excluded from comparison.
            extracted = mention_extractor.extract_system_mentions(
                document, filter_mentions=filter_mentions)
            return [mention.span for mention in extracted[1:]]

        def as_sorted_spans(pairs):
            return sorted(spans.Span(begin, end) for begin, end in pairs)

        self.assertEqual(
            as_sorted_spans([
                (0, 1), (0, 5), (3, 5), (5, 5), (8, 10), (8, 11),
                (13, 16), (13, 20), (14, 14), (18, 20), (22, 23),
                (25, 25), (33, 34),
            ]),
            extracted_spans(self.real_document, False))

        self.assertEqual(
            as_sorted_spans([
                (2, 2), (4, 4), (6, 7), (6, 11), (9, 10), (9, 11),
            ]),
            extracted_spans(self.another_real_document, False))

        self.assertEqual(
            as_sorted_spans([
                (2, 2), (4, 4), (6, 11), (9, 10), (9, 11),
            ]),
            extracted_spans(self.another_real_document, True))
    def test_post_process_appositions(self):
        """Apposition post-processing: child mentions of an apposition are
        removed, keeping the apposition itself (and, for three-child
        appositions, mentions not spanning a full apposition child)."""
        # Three-child apposition: "The ROC 's ambassador to Nicaragua ,
        # Antonio Tsai ,".
        three_children_tree = nltk.ParentedTree.fromstring(
            "(NP (NP (NP (NP (DT The) (NNP ROC) (POS 's)) (NN ambassador)) "
            "(PP (IN to) (NP (NNP Nicaragua)))) (, ,) (NP (NNP Antonio) "
            "(NNP Tsai)) (, ,))")

        # NOTE(review): several fixture spans look inconsistent with the
        # token lists (e.g. Span(0, 6) for nine tokens below) — presumably
        # intentional for these tests, but worth confirming.
        three_children_all_mentions = {
            mentions.Mention(
                None, spans.Span(0, 6), {
                    "tokens": [
                        "The", "ROC", "'s", "ambassador", "to", "Nicaragua",
                        ",", "Antonio", "Tsai"
                    ],
                    "is_apposition":
                    True,
                    "type":
                    "NAM",
                    "parse_tree":
                    three_children_tree
                }),
            mentions.Mention(
                None, spans.Span(0, 4), {
                    "tokens":
                    ["The", "ROC", "'s", "ambassador", "to", "Nicaragua"],
                    "is_apposition": False,
                    "type": "NOM",
                    "parse_tree": three_children_tree[0]
                }),
            mentions.Mention(
                None, spans.Span(0, 3), {
                    "tokens": ["The", "ROC", "'s", "ambassador"],
                    "is_apposition": False,
                    "type": "NOM",
                    "parse_tree": three_children_tree[0][0]
                }),
            mentions.Mention(
                None, spans.Span(0, 2), {
                    "tokens": ["The", "ROC", "'s"],
                    "is_apposition": False,
                    "type": "NAM",
                    "parse_tree": three_children_tree[0][0][0]
                }),
            mentions.Mention(
                None, spans.Span(4, 4), {
                    "tokens": ["Nicaragua"],
                    "is_apposition": False,
                    "type": "NAM",
                    "parse_tree": three_children_tree[0][1][1]
                }),
            mentions.Mention(
                None, spans.Span(5, 6), {
                    "tokens": ["Antonio", "Tsai"],
                    "is_apposition": False,
                    "type": "NAM",
                    "parse_tree": three_children_tree[2]
                })
        }

        # Expected: the apposition is kept; the mentions spanning its
        # direct children (Span(0, 4) and Span(5, 6)) are removed.
        three_children_expected = sorted([
            mentions.Mention(
                None, spans.Span(0, 6), {
                    "tokens": [
                        "The", "ROC", "'s", "ambassador", "to", "Nicaragua",
                        ",", "Antonio", "Tsai"
                    ],
                    "is_apposition":
                    True,
                    "type":
                    "NAM",
                    "parse_tree":
                    three_children_tree
                }),
            mentions.Mention(
                None, spans.Span(0, 3), {
                    "tokens": ["The", "ROC", "'s", "ambassador"],
                    "is_apposition": False,
                    "type": "NOM",
                    "parse_tree": three_children_tree[0][0]
                }),
            mentions.Mention(
                None, spans.Span(0, 2), {
                    "tokens": ["The", "ROC", "'s"],
                    "is_apposition": False,
                    "type": "NAM",
                    "parse_tree": three_children_tree[0][0][0]
                }),
            mentions.Mention(
                None, spans.Span(4, 4), {
                    "tokens": ["Nicaragua"],
                    "is_apposition": False,
                    "type": "NAM",
                    "parse_tree": three_children_tree[0][1][1]
                }),
        ])

        self.assertEqual(
            three_children_expected,
            mention_extractor.post_process_appositions(
                three_children_all_mentions))

        # Two-child apposition: "Secretary of State Madeleine Albright".
        two_children_tree = nltk.ParentedTree.fromstring(
            "(NP (NP (NP (NNP Secretary)) (PP (IN of) (NP (NNP State)))) "
            "(NP (NNP Madeleine) (NNP Albright)))")

        # NOTE(review): "Sate" below looks like a typo for "State" in the
        # fixture tokens, and two mentions share Span(2, 2) inside a set —
        # confirm both are intended.
        two_children_all_mentions = {
            mentions.Mention(
                None, spans.Span(0, 4), {
                    "tokens":
                    ["Secretary", "of", "Sate", "Madeleine", "Albright"],
                    "is_apposition": True,
                    "type": "NAM",
                    "parse_tree": two_children_tree
                }),
            mentions.Mention(
                None, spans.Span(0, 0), {
                    "tokens": ["Secretary"],
                    "is_apposition": False,
                    "type": "NAM",
                    "parse_tree": two_children_tree[0][0]
                }),
            mentions.Mention(
                None, spans.Span(0, 2), {
                    "tokens": ["Secretary", "of", "State"],
                    "is_apposition": False,
                    "type": "NAM",
                    "parse_tree": two_children_tree[0]
                }),
            mentions.Mention(
                None, spans.Span(2, 2), {
                    "tokens": ["State"],
                    "is_apposition": False,
                    "type": "NAM",
                    "parse_tree": two_children_tree[0][1][1]
                }),
            mentions.Mention(
                None, spans.Span(2, 2), {
                    "tokens": ["Madeleine", "Albright"],
                    "is_apposition": False,
                    "type": "NAM",
                    "parse_tree": two_children_tree[1]
                })
        }

        # Expected: only the apposition itself survives.
        two_children_expected = sorted([
            mentions.Mention(
                None, spans.Span(0, 4), {
                    "tokens":
                    ["Secretary", "of", "Sate", "Madeleine", "Albright"],
                    "is_apposition": True,
                    "type": "NAM",
                    "parse_tree": two_children_tree
                })
        ])

        self.assertEqual(
            two_children_expected,
            mention_extractor.post_process_appositions(
                two_children_all_mentions))