Example #1
0
    def build_next_sent(self, doc, c_parse):
        """Append one parsed sentence to the running text and record spans.

        For every token node in ``c_parse['tokens']`` the HTML-unescaped
        ``word`` attribute is appended to ``self.text``, followed by a space
        (or a newline after the last token of the sentence).  ``self.offset``
        advances by the word length plus one for that separator.  A ``Span``
        over the word is passed to ``doc.add_token_span`` and stored in
        ``self.id2span`` under the token's ``node_id``.

        Afterwards every entry of ``c_parse['id2node']`` whose id has no span
        yet receives one running from the begin of its first leaf's span to
        the end of its last leaf's span; leaves are looked up in
        ``self.id2span`` by their ``attributes['id']``.

        Args:
            doc: Object exposing ``add_token_span(span)``.
            c_parse: Mapping with the keys ``'tokens'`` (token nodes in
                sentence order) and ``'id2node'`` (node id -> node).
        """
        tokens = c_parse['tokens']
        nodes_by_id = c_parse['id2node']
        last_position = len(tokens) - 1

        # Token spans.
        for position, token in enumerate(tokens):
            surface = html.unescape(token.attributes['word'])
            # Tokens are space separated; the sentence ends with a newline.
            delimiter = '\n' if position == last_position else ' '

            self.text += surface
            self.text += delimiter

            begin = self.offset
            end = begin + len(surface)
            # Skip over the single separator character as well.
            self.offset = end + 1

            span = Span(begin, end)
            doc.add_token_span(span)
            self.id2span[token.node_id] = span

        # Spans for the remaining nodes, derived from their leaves.
        for node_id, node in nodes_by_id.items():
            if node_id in self.id2span:
                continue

            leaves = node.get_leaves()
            first_span = self.id2span[leaves[0].attributes['id']]
            last_span = self.id2span[leaves[-1].attributes['id']]

            self.id2span[node_id] = Span(first_span.begin, last_span.end)
Example #2
0
def get_nltk_span(token_spans, sent_num, indice_groups):
    """Merge the token spans of each index group into one ``Span``.

    Args:
        token_spans: Indexable by sentence number, then by token index; each
            entry is either falsy (token has no span) or a pair-like value
            whose items ``[0]`` and ``[1]`` are the start and end.
        sent_num: Sentence number used to index ``token_spans``.
        indice_groups: Iterable of iterables of token indices.

    Returns:
        A list with one ``Span(start, end)`` per group that yielded a
        non-negative start and end.  The start comes from the first usable
        token of the group and the end from the last token that has a span;
        groups without such values contribute nothing.
    """
    merged = []
    for group in indice_groups:
        # -1 marks "nothing found yet" for both boundaries.
        first_start, last_end = -1, -1

        for token_index in group:
            token_span = token_spans[sent_num][token_index]
            if not token_span:
                continue
            if first_start < 0:
                first_start = token_span[0]
            last_end = token_span[1]

        if first_start >= 0 and last_end >= 0:
            merged.append(Span(first_start, last_end))

    return merged
Example #3
0
    def parse_conll_data(self, corpus, conll_in):
        """Build a ``DEDocument`` from whitespace-separated CoNLL-style lines.

        Every line with at least eight fields is one token.  The fourth field
        is the token text and the eighth is the predicate sense (``"-"``
        means "not a predicate"); all fields after the eighth are treated as
        argument-annotation columns, one column per predicate of the
        sentence.  Any line with fewer than eight fields ends the current
        sentence and contributes a newline to the document text.

        The document text is the tokens joined by single spaces.  For each
        predicate a hopper and a predicate mention are added; for each
        bracketed argument in the predicate's column a filler and an argument
        mention are added.  A single-token ``(V*)`` annotation is skipped.

        Args:
            corpus: Passed through to ``DEDocument``.
            conll_in: Iterable of lines (e.g. an open text file).

        Returns:
            The populated ``DEDocument``.
        """
        text_pieces = []
        offset = 0

        # Text collected so far for the argument currently open in each
        # annotation column (None / "" when no argument is open).
        open_arg_text = []
        sent_predicates = []
        sent_args = defaultdict(list)
        doc = DEDocument(corpus)

        props = []

        def flush_sentence():
            # The i-th predicate of a sentence owns the i-th annotation column.
            for column, predicate in enumerate(sent_predicates):
                props.append((predicate, sent_args[column]))

            sent_predicates.clear()
            sent_args.clear()
            open_arg_text.clear()

        for line in conll_in:
            parts = line.strip().split()
            if len(parts) < 8:
                # Sentence boundary.
                text_pieces.append("\n")
                offset += 1
                flush_sentence()
                continue

            _fname, _, _index, token, _pos, _parse, _lemma, sense = parts[:8]
            pb_annos = parts[8:]

            if not open_arg_text:
                open_arg_text.extend([None] * len(pb_annos))

            start = offset
            end = start + len(token)

            text_pieces.append(token + " ")
            offset = end + 1

            # Extend every argument that is still open with this token.
            for column, running_text in enumerate(open_arg_text):
                if running_text:
                    open_arg_text[column] = running_text + " " + token

            if sense != "-":
                sent_predicates.append((start, end, token))

            for column, anno in enumerate(pb_annos):
                if anno == "(V*)":
                    continue

                if anno.startswith("("):
                    role = anno.strip("(").strip(")").strip("*")
                    sent_args[column].append([role, start])
                    open_arg_text[column] = token
                if anno.endswith(")"):
                    # Complete the record: [role, start, end, text].
                    sent_args[column][-1].append(end)
                    sent_args[column][-1].append(open_arg_text[column])
                    open_arg_text[column] = ""

        # Input that does not end with a separator line would otherwise lose
        # the predicates of its final sentence.
        if sent_predicates:
            flush_sentence()

        doc.set_text("".join(text_pieces))

        for (p_start, p_end, p_token), args in props:
            hopper = doc.add_hopper()

            pred = doc.add_predicate(hopper, Span(p_start, p_end), p_token)

            if pred is not None:
                for role, arg_start, arg_end, filler_text in args:
                    filler = doc.add_filler(
                        Span(arg_start, arg_end), filler_text)
                    doc.add_argument_mention(pred, filler.aid, role)

        return doc
Example #4
0
    def parse_full_text(self, full_text_file, doc):
        """Read frame annotations from a full-text XML file into ``doc``.

        The text of every ``icsi:sentence`` is concatenated, each followed by
        a newline, and set as the document text.  For every annotation set
        that carries a ``frameName`` the first label of each ``Target`` layer
        and all labels of each ``FE`` layer are collected; FE labels with an
        ``itype`` attribute (null instantiations) are ignored.  Label offsets
        are sentence relative with an inclusive ``end`` and are shifted to
        document offsets with an exclusive end.

        Annotation sets without any target are dropped.  Otherwise the
        longest target becomes a predicate of the frame (the first one wins
        ties) and every collected frame element becomes a filler attached to
        that predicate under the label's name.

        Args:
            full_text_file: Path or file object accepted by ``ET.parse``.
            doc: Document object that receives text, hoppers, predicates,
                fillers and argument mentions.

        Returns:
            The same ``doc`` object.
        """
        root = ET.parse(full_text_file).getroot()
        ns = self.ns

        def char_range(label, sentence_text):
            # "end" is inclusive in the XML; make it exclusive for slicing.
            begin = int(label.attrib["start"])
            stop = int(label.attrib["end"]) + 1
            return begin, stop, sentence_text[begin:stop]

        document_text = ""
        base = 0  # document offset at which the current sentence starts
        frames = []

        for sentence in root.findall("icsi:sentence", ns):
            sentence_text = sentence.find("icsi:text", ns).text
            document_text += sentence_text + "\n"

            for anno_set in sentence.findall("icsi:annotationSet", ns):
                if "frameName" not in anno_set.attrib:
                    continue

                frame = anno_set.attrib["frameName"]
                target_candidates = []
                frame_elements = []

                for layer in anno_set.findall("icsi:layer", ns):
                    kind = layer.attrib["name"]

                    if kind == "Target":
                        first_label = layer.find("icsi:label", ns)
                        if first_label is not None:
                            begin, stop, covered = char_range(
                                first_label, sentence_text)
                            target_candidates.append(
                                (begin + base, stop + base, covered))
                    elif kind == "FE":
                        for fe_label in layer.findall("icsi:label", ns):
                            fe_name = fe_label.attrib["name"]

                            if "itype" in fe_label.attrib:
                                # Null instantiation.
                                continue

                            begin, stop, covered = char_range(
                                fe_label, sentence_text)
                            frame_elements.append(
                                (begin + base, stop + base, covered, fe_name))

                if not target_candidates:
                    continue

                # max() returns the first of several equally long targets.
                widest = max(target_candidates, key=lambda t: t[1] - t[0])
                # No target is chosen unless one has positive length.
                target = widest if widest[1] - widest[0] > 0 else None

                frames.append((frame, target, frame_elements))

            base = len(document_text)

        doc.set_text(document_text)

        for frame, target, frame_elements in frames:
            hopper = doc.add_hopper()
            t_begin, t_end, t_text = target
            predicate = doc.add_predicate(hopper,
                                          Span(t_begin, t_end),
                                          text=t_text,
                                          frame_type=frame)

            for fe_begin, fe_end, fe_text, fe_role in frame_elements:
                filler = doc.add_filler(Span(fe_begin, fe_end), fe_text)
                doc.add_argument_mention(predicate, filler.aid, fe_role)

        return doc
Example #5
0
    def parse_ace_data(self, corpus, source_file, anno_file):
        """Create a ``DEDocument`` from a source text and its XML annotation.

        The document text comes from ``self.get_source_text`` applied to the
        opened ``source_file``.  Within the first ``document`` element of
        ``anno_file``:

        * every ``entity`` becomes an entity typed ``TYPE_SUBTYPE``, and each
          ``charseq`` under a ``head`` of its children becomes an entity
          mention;
        * every ``event`` becomes a hopper, and each ``charseq`` under an
          ``anchor`` of its children becomes a predicate with frame type
          ``TYPE_SUBTYPE``;
        * every ``event_argument`` whose ``REFID`` entity has mentions is
          attached, via ``find_close_mention``, to the chosen predicate and
          entity mention under the argument's ``ROLE``.

        ``START``/``END`` are read as inclusive character offsets, so spans
        are built with ``END + 1``.

        Args:
            corpus: Passed through to ``DEDocument``.
            source_file: Path of the source text file.
            anno_file: Path or file object accepted by ``ET.parse``.

        Returns:
            The document, returned right after the first ``document``
            element has been processed; ``None`` (implicitly) if the
            annotation contains no such element.
        """
        with open(source_file) as source_in:
            doc = DEDocument(corpus)
            doc.set_text(self.get_source_text(source_in))

            annotation_root = ET.parse(anno_file).getroot()

            for xml_doc in annotation_root.iter("document"):
                doc.set_id(xml_doc.attrib["DOCID"])

                # Entities: entity id -> mentions created for that entity.
                mentions_by_entity = defaultdict(list)

                for entity in xml_doc.iter("entity"):
                    full_type = "{}_{}".format(entity.attrib["TYPE"],
                                               entity.attrib["SUBTYPE"])
                    ent = doc.add_entity(full_type, entity.attrib["ID"])

                    for mention_node in entity:
                        head_seqs = (
                            seq
                            for head in mention_node.iter("head")
                            for seq in head.iter("charseq")
                        )
                        for seq in head_seqs:
                            first = int(seq.attrib["START"])
                            last = int(seq.attrib["END"])

                            ent_mention = doc.add_entity_mention(
                                ent,
                                Span(first, last + 1),
                                seq.text,
                                mention_node.attrib["ID"],
                                entity_type=full_type,
                                validate=False,
                            )
                            mentions_by_entity[entity.attrib["ID"]].append(
                                ent_mention)

                # Events.
                for event_node in xml_doc.iter("event"):
                    frame_type = "{}_{}".format(event_node.attrib["TYPE"],
                                                event_node.attrib["SUBTYPE"])
                    hopper = doc.add_hopper(event_node.attrib["ID"])

                    event_mentions = []
                    for evm_node in event_node:
                        anchor_seqs = (
                            seq
                            for anchor in evm_node.iter("anchor")
                            for seq in anchor.iter("charseq")
                        )
                        for seq in anchor_seqs:
                            first = int(seq.attrib["START"])
                            last = int(seq.attrib["END"])

                            event_mentions.append(
                                doc.add_predicate(
                                    hopper,
                                    Span(first, last + 1),
                                    seq.text,
                                    eid=evm_node.attrib["ID"],
                                    frame_type=frame_type,
                                    validate=False,
                                ))

                    for em_arg in event_node.iter("event_argument"):
                        role = em_arg.attrib["ROLE"]
                        candidates = mentions_by_entity[em_arg.attrib["REFID"]]

                        if not candidates:
                            continue

                        closest_ent, closest_evm, _ = find_close_mention(
                            event_mentions, candidates)
                        doc.add_argument_mention(closest_evm,
                                                 closest_ent.aid, role)

                # Only the first document element is processed.
                return doc
Example #6
0
    def parse_ere(self, ere_file, doc):
        """Load the annotations of an ERE XML file into ``doc``.

        The root element's ``doc_id`` and ``source_type`` attributes are
        passed to ``doc.set_id`` and ``doc.set_doc_type``.  Then, from the
        children of the ``entities``, ``fillers``, ``hoppers`` and
        ``relations`` sections:

        * each entity and its ``entity_mention`` children are added;
        * each filler is added;
        * each hopper is added, with one predicate per ``event_mention``
          (span taken from its ``trigger``) and one argument mention per
          ``em_arg`` (``filler_id`` wins over ``entity_mention_id`` when
          both are present);
        * each relation is added, with one ``RelationMention`` per
          ``relation_mention`` holding its ``rel_arg*`` children by role.

        All ``offset``/``length`` attributes are converted to integers and
        turned into ``Span(offset, offset + length)``.  A relation mention
        without a trigger gets ``Span(None, None)`` and an empty text.
        A section that is absent from the file is skipped.

        Args:
            ere_file: Path or file object accepted by ``ET.parse``.
            doc: Document object that receives the annotations.
        """
        root = ET.parse(ere_file).getroot()

        doc_info = root.attrib

        # Call the setters; assigning to these names would replace the
        # methods on ``doc`` instead of storing the values.
        doc.set_id(doc_info['doc_id'])
        doc.set_doc_type(doc_info['source_type'])

        def span_of(attrib):
            # XML attributes are strings: convert before doing arithmetic,
            # otherwise ``offset + length`` concatenates the digits.
            begin = int(attrib['offset'])
            return Span(begin, begin + int(attrib['length']))

        # ``section/*`` yields the children of the section and is simply
        # empty when the section is missing.
        for entity_node in root.findall('entities/*'):
            ent = doc.add_entity(entity_node.attrib['type'],
                                 entity_node.attrib['id'])

            for entity_mention in entity_node.findall('entity_mention'):
                ent_info = entity_mention.attrib

                entity_text = entity_mention.find('mention_text').text

                doc.add_entity_mention(
                    ent,
                    span_of(ent_info),
                    entity_text,
                    ent_info['id'],
                    noun_type=ent_info['noun_type'],
                    entity_type=ent_info.get('type', None),
                )

        for filler in root.findall('fillers/*'):
            filler_info = filler.attrib
            doc.add_filler(span_of(filler_info),
                           filler.text,
                           eid=filler_info['id'],
                           filler_type=filler_info['type'])

        for event_node in root.findall('hoppers/*'):
            event = doc.add_hopper(event_node.attrib['id'])

            for event_mention in event_node.findall('event_mention'):
                evm_info = event_mention.attrib

                trigger = event_mention.find('trigger')

                evm = doc.add_predicate(event,
                                        span_of(trigger.attrib),
                                        trigger.text,
                                        eid=evm_info['id'],
                                        frame_type=evm_info['type'] + '_' +
                                        evm_info['subtype'],
                                        realis=evm_info['realis'])

                for em_arg in event_mention.findall('em_arg'):
                    arg_info = em_arg.attrib

                    # A filler id takes precedence over an entity mention id.
                    arg_ent_mention = None
                    if 'entity_mention_id' in arg_info:
                        arg_ent_mention = arg_info['entity_mention_id']
                    if 'filler_id' in arg_info:
                        arg_ent_mention = arg_info['filler_id']

                    role = arg_info['role']

                    doc.add_argument_mention(evm, arg_ent_mention, role)

        for relation_node in root.findall('relations/*'):
            relation_info = relation_node.attrib
            relation = doc.add_relation(relation_info['id'],
                                        relation_type=relation_info['type'] +
                                        '_' + relation_info['subtype'])

            for rel_mention_node in relation_node.findall('relation_mention'):
                rel_mention_id = rel_mention_node.attrib['id']
                rel_realis = rel_mention_node.attrib['realis']

                args = {}
                for mention_part in rel_mention_node:
                    if mention_part.tag.startswith('rel_arg'):
                        if 'entity_mention_id' in mention_part.attrib:
                            ent_id = mention_part.attrib['entity_mention_id']
                        else:
                            ent_id = mention_part.attrib['filler_id']

                        role = mention_part.attrib['role']
                        args[role] = ent_id

                trigger = rel_mention_node.find('trigger')
                if trigger is not None:
                    trigger_text = trigger.text
                    trigger_span = span_of(trigger.attrib)
                else:
                    # Relation mentions may have no trigger at all.
                    trigger_text = ''
                    trigger_span = Span(None, None)

                rel_mention = RelationMention(rel_mention_id,
                                              trigger_span,
                                              trigger_text, rel_realis)

                for role, ent in args.items():
                    rel_mention.add_arg(role, ent)

                relation.add_mention(rel_mention)
Example #7
0
    def parse_ere(self, ere_file, doc):
        """Load the annotations of an ERE XML file into ``doc``.

        The root element's ``doc_id`` and ``source_type`` attributes are
        passed to ``doc.set_id`` and ``doc.set_doc_type``.  Then, from the
        children of the ``entities``, ``fillers``, ``hoppers`` and
        ``relations`` sections:

        * each entity and its ``entity_mention`` children are added;
        * each filler is added;
        * each hopper is added, with one predicate per ``event_mention``
          (span taken from its ``trigger``) and one argument mention per
          ``em_arg`` (``filler_id`` wins over ``entity_mention_id`` when
          both are present);
        * each relation is added, with one ``RelationMention`` per
          ``relation_mention`` holding its ``rel_arg*`` children by role.

        All ``offset``/``length`` attributes are converted to integers and
        turned into ``Span(offset, offset + length)``.  A relation mention
        without a trigger gets ``Span(None, None)`` and an empty text.
        A section that is absent from the file is skipped.

        Args:
            ere_file: Path or file object accepted by ``ET.parse``.
            doc: Document object that receives the annotations.
        """
        root = ET.parse(ere_file).getroot()

        doc_info = root.attrib

        # Call the setters; assigning to these names would replace the
        # methods on ``doc`` instead of storing the values.
        doc.set_id(doc_info["doc_id"])
        doc.set_doc_type(doc_info["source_type"])

        def span_of(attrib):
            # XML attributes are strings: convert before doing arithmetic,
            # otherwise ``offset + length`` concatenates the digits.
            begin = int(attrib["offset"])
            return Span(begin, begin + int(attrib["length"]))

        # ``section/*`` yields the children of the section and is simply
        # empty when the section is missing.
        for entity_node in root.findall("entities/*"):
            ent = doc.add_entity(entity_node.attrib["type"],
                                 entity_node.attrib["id"])

            for entity_mention in entity_node.findall("entity_mention"):
                ent_info = entity_mention.attrib

                entity_text = entity_mention.find("mention_text").text

                doc.add_entity_mention(
                    ent,
                    span_of(ent_info),
                    entity_text,
                    ent_info["id"],
                    noun_type=ent_info["noun_type"],
                    entity_type=ent_info.get("type", None),
                )

        for filler in root.findall("fillers/*"):
            filler_info = filler.attrib
            doc.add_filler(
                span_of(filler_info),
                filler.text,
                eid=filler_info["id"],
                filler_type=filler_info["type"],
            )

        for event_node in root.findall("hoppers/*"):
            event = doc.add_hopper(event_node.attrib["id"])

            for event_mention in event_node.findall("event_mention"):
                evm_info = event_mention.attrib

                trigger = event_mention.find("trigger")

                evm = doc.add_predicate(
                    event,
                    span_of(trigger.attrib),
                    trigger.text,
                    eid=evm_info["id"],
                    frame_type=evm_info["type"] + "_" + evm_info["subtype"],
                    realis=evm_info["realis"],
                )

                for em_arg in event_mention.findall("em_arg"):
                    arg_info = em_arg.attrib

                    # A filler id takes precedence over an entity mention id.
                    arg_ent_mention = None
                    if "entity_mention_id" in arg_info:
                        arg_ent_mention = arg_info["entity_mention_id"]
                    if "filler_id" in arg_info:
                        arg_ent_mention = arg_info["filler_id"]

                    role = arg_info["role"]

                    doc.add_argument_mention(evm, arg_ent_mention, role)

        for relation_node in root.findall("relations/*"):
            relation_info = relation_node.attrib
            relation = doc.add_relation(
                relation_info["id"],
                relation_type=relation_info["type"] + "_" +
                relation_info["subtype"],
            )

            for rel_mention_node in relation_node.findall("relation_mention"):
                rel_mention_id = rel_mention_node.attrib["id"]
                rel_realis = rel_mention_node.attrib["realis"]

                args = {}
                for mention_part in rel_mention_node:
                    if mention_part.tag.startswith("rel_arg"):
                        if "entity_mention_id" in mention_part.attrib:
                            ent_id = mention_part.attrib["entity_mention_id"]
                        else:
                            ent_id = mention_part.attrib["filler_id"]

                        role = mention_part.attrib["role"]
                        args[role] = ent_id

                trigger = rel_mention_node.find("trigger")
                if trigger is not None:
                    trigger_text = trigger.text
                    trigger_span = span_of(trigger.attrib)
                else:
                    # Relation mentions may have no trigger at all.
                    trigger_text = ""
                    trigger_span = Span(None, None)

                rel_mention = RelationMention(
                    rel_mention_id,
                    trigger_span,
                    trigger_text,
                    rel_realis,
                )

                for role, ent in args.items():
                    rel_mention.add_arg(role, ent)

                relation.add_mention(rel_mention)