Beispiel #1
0
    def get_doc(self):
        """Yield one document for every XML file in the full-text directory.

        The directory is ``<self.params.fn_path>/fulltext``. Each yielded
        document gets the file's base name with ``.xml`` removed as its id
        and is populated by ``self.parse_full_text``.
        """
        super().get_doc()

        xml_pattern = os.path.join(self.params.fn_path, "fulltext") + "/*.xml"

        for xml_file in glob.glob(xml_pattern):
            document = DEDocument(self.corpus)
            file_name = os.path.basename(xml_file)
            document.set_id(file_name.replace(".xml", ""))
            self.parse_full_text(xml_file, document)
            yield document
Beispiel #2
0
 def read_rich_ere(self, corpus, source_path, l_ere_path, ranges):
     """Build one document from a source file and its ERE annotation files.

     Args:
         corpus: Corpus object handed to ``DEDocument``.
         source_path: Path of the source text file.
         l_ere_path: Paths of the ERE annotation files for this source.
         ranges: Passed to ``DEDocument`` together with the source text.

     Returns:
         The ``DEDocument`` after every file in ``l_ere_path`` has been
         handed to ``self.parse_ere``. With an empty ``l_ere_path`` the
         document is returned without annotations.
     """
     # Only the text is needed from the source, so close it right away.
     with open(source_path) as source:
         text = source.read()

     doc = DEDocument(corpus, text, ranges, self.params.ignore_quote)

     for ere_path in l_ere_path:
         with open(ere_path) as ere:
             logging.info("Processing: " + os.path.basename(ere_path))
             self.parse_ere(ere, doc)

     # Return after the loop: returning from inside it stopped after the
     # first annotation file and silently ignored the remaining ones.
     return doc
Beispiel #3
0
    def get_doc(self):
        """Yield one annotated document per entry of ``self.nombank_annos``.

        When ``self.params.gc_only`` is set, document ids missing from
        ``self.gc_annos`` are skipped. Every yielded document carries its
        id, the treebank parses of its file, the token spans returned by
        ``self.set_wsj_text`` and whatever ``self.add_all_annotations``
        attaches.
        """
        for docid, _instances in self.nombank_annos.items():
            wanted = not self.params.gc_only or docid in self.gc_annos
            if not wanted:
                continue

            document = DEDocument(self.corpus)
            document.set_id(docid)

            # File ids have the form "<xx>/<docid>", where <xx> is the first
            # two characters of the last "_"-separated piece of the id.
            section = docid.split('_')[-1][:2]
            fileid = section + '/' + docid

            trees = self.wsj_treebank.parsed_sents(fileids=fileid)
            document.set_parsed_sents(trees)

            document.set_token_spans(self.set_wsj_text(document, fileid))

            self.add_all_annotations(document, trees)

            yield document
Beispiel #4
0
    def get_doc(self):
        """Yield one document for each path in ``self.xml_paths``.

        The corpus name is taken from the ``corpusname`` attribute of the
        XML root and is also used as the document id. The children of the
        ``body`` element are walked twice: first to build the sentences
        from their constituent parses, then, once the document text has
        been set from ``self.text``, to read the frame annotations.
        """
        for xml_path in self.xml_paths:
            logging.info("Parsing: " + xml_path)

            tree_root = ET.parse(xml_path).getroot()
            self.corpus.set_corpus_name(tree_root.attrib['corpusname'])

            document = DEDocument(self.corpus)
            document.set_id(self.corpus.corpus_name)

            sentences = tree_root.find("body")

            # First pass: constituent parses, one sentence at a time.
            for sentence in sentences:
                graph_node = sentence.find("graph")
                constituents = self.read_constituent_parse(graph_node)
                self.build_next_sent(document, constituents)

            # NOTE(review): self.text is presumably filled in by
            # build_next_sent during the first pass - confirm.
            document.set_text(self.text)

            # Second pass: frame annotations on top of the finished text.
            for sentence in sentences:
                self.read_frame_parse(document, sentence)

            yield document
Beispiel #5
0
    def parse_conll_data(self, corpus, conll_in):
        """Read CoNLL-style predicate/argument lines into a document.

        A data line has at least 8 whitespace-separated columns. Column 4
        is the token and column 8 the predicate sense ("-" when the token
        is not a predicate). Every further column holds bracketed argument
        annotations; the i-th such column is paired with the i-th predicate
        of the sentence. Any line with fewer than 8 columns ends the
        current sentence.

        The document text is every token followed by a single space, with a
        newline appended at each sentence break. All spans are character
        offsets into that text.

        Args:
            corpus: Corpus object handed to ``DEDocument``.
            conll_in: Iterable of input lines (for example an open file).

        Returns:
            The ``DEDocument`` holding the text, one hopper and predicate
            per predicate token, and the argument mentions of each
            predicate.
        """
        # Collect the pieces and join once instead of growing a string.
        text_parts = []
        offset = 0

        # Text collected so far for the argument open in each column.
        arg_text = []
        sent_predicates = []
        sent_args = defaultdict(list)
        doc = DEDocument(corpus)

        props = []

        for line in conll_in:
            parts = line.strip().split()
            if len(parts) < 8:
                # Sentence break: keep offsets in step with the text and
                # hand the finished sentence over to ``props``.
                text_parts.append("\n")
                offset += 1

                for pred_index, predicate in enumerate(sent_predicates):
                    props.append((predicate, sent_args[pred_index]))

                sent_predicates.clear()
                sent_args.clear()
                arg_text.clear()

                continue

            token = parts[3]
            sense = parts[7]
            pb_annos = parts[8:]

            if not arg_text:
                arg_text = [None] * len(pb_annos)

            start = offset
            end = start + len(token)

            text_parts.append(token + " ")
            offset += len(token) + 1

            # Extend every argument that is still open with this token.
            for column, open_text in enumerate(arg_text):
                if open_text:
                    arg_text[column] += " " + token

            if sense != "-":
                sent_predicates.append((start, end, token))

            for column, anno in enumerate(pb_annos):
                # The predicate's own single-token marker is not an argument.
                if anno == "(V*)":
                    continue

                if anno.startswith("("):
                    role = anno.strip("(").strip(")").strip("*")
                    sent_args[column].append([role, start])
                    arg_text[column] = token
                if anno.endswith(")"):
                    # Complete the open entry to [role, start, end, text].
                    sent_args[column][-1].append(end)
                    sent_args[column][-1].append(arg_text[column])
                    arg_text[column] = ""

        # Input that does not end with a separator line would otherwise lose
        # the predicates of its last sentence. Arguments still open at this
        # point never received an end offset and are left out.
        for pred_index, predicate in enumerate(sent_predicates):
            closed_args = [
                arg for arg in sent_args[pred_index] if len(arg) == 4
            ]
            props.append((predicate, closed_args))

        doc.set_text("".join(text_parts))

        for (p_start, p_end, p_token), args in props:
            hopper = doc.add_hopper()

            pred = doc.add_predicate(hopper, Span(p_start, p_end), p_token)

            if pred is not None:
                for role, arg_start, arg_end, arg_string in args:
                    filler = doc.add_filler(
                        Span(arg_start, arg_end), arg_string)
                    doc.add_argument_mention(pred, filler.aid, role)

        return doc
Beispiel #6
0
    def parse_ace_data(self, corpus, source_file, anno_file):
        """Create a document from a source file and its XML annotation file.

        The document text comes from ``self.get_source_text``. From the
        first ``document`` element of the annotation file this reads the
        entities with their mentions (``head``/``charseq`` offsets), the
        events with their mentions (``anchor``/``charseq`` offsets) and the
        event arguments. Each argument is attached to the event
        mention/entity mention pair chosen by ``find_close_mention``.

        Args:
            corpus: Corpus object handed to ``DEDocument``.
            source_file: Path of the source text file.
            anno_file: Path of the XML annotation file.

        Returns:
            The populated ``DEDocument`` for the first ``document``
            element, or ``None`` when the file contains no such element.
        """
        with open(source_file) as source_in:
            doc = DEDocument(corpus)
            doc.set_text(self.get_source_text(source_in))

            root = ET.parse(anno_file).getroot()

            for xml_doc in root.iter("document"):
                doc.set_id(xml_doc.attrib["DOCID"])

                # Entities: remember the mentions of each entity id so that
                # event arguments can be resolved against them below.
                entity2mention = defaultdict(list)

                for entity_node in xml_doc.iter("entity"):
                    main_type = entity_node.attrib["TYPE"]
                    sub_type = entity_node.attrib["SUBTYPE"]
                    full_type = "_".join((main_type, sub_type))
                    entity_id = entity_node.attrib["ID"]

                    ent = doc.add_entity(full_type, entity_id)

                    for mention_node in entity_node:
                        head_seqs = (
                            seq
                            for head in mention_node.iter("head")
                            for seq in head.iter("charseq")
                        )
                        for seq in head_seqs:
                            first = int(seq.attrib["START"])
                            last = int(seq.attrib["END"])

                            # The span end is one past the END attribute.
                            mention = doc.add_entity_mention(
                                ent,
                                Span(first, last + 1),
                                seq.text,
                                mention_node.attrib["ID"],
                                entity_type=full_type,
                                validate=False,
                            )
                            entity2mention[entity_id].append(mention)

                # Events: one hopper per event, one predicate per anchor.
                for event_node in xml_doc.iter("event"):
                    event_type = event_node.attrib["TYPE"]
                    event_subtype = event_node.attrib["SUBTYPE"]
                    frame_type = event_type + "_" + event_subtype

                    hopper = doc.add_hopper(event_node.attrib["ID"])

                    event_mentions = []

                    for evm_node in event_node:
                        anchor_seqs = (
                            seq
                            for anchor in evm_node.iter("anchor")
                            for seq in anchor.iter("charseq")
                        )
                        for seq in anchor_seqs:
                            first = int(seq.attrib["START"])
                            last = int(seq.attrib["END"])

                            event_mentions.append(
                                doc.add_predicate(
                                    hopper,
                                    Span(first, last + 1),
                                    seq.text,
                                    eid=evm_node.attrib["ID"],
                                    frame_type=frame_type,
                                    validate=False,
                                )
                            )

                    for arg_node in event_node.iter("event_argument"):
                        role = arg_node.attrib["ROLE"]
                        candidates = entity2mention[arg_node.attrib["REFID"]]

                        # Arguments whose REFID has no recorded entity
                        # mention are skipped.
                        if not candidates:
                            continue

                        closest_ent, closest_evm, _ = find_close_mention(
                            event_mentions, candidates)
                        doc.add_argument_mention(
                            closest_evm, closest_ent.aid, role)

                # Only the first document element is processed.
                return doc