def build_next_sent(self, doc, c_parse):
    """Append one parsed sentence to the running text and record node spans.

    Every token in ``c_parse['tokens']`` has its HTML-unescaped ``word``
    attribute appended to ``self.text``, followed by a single space (or a
    newline after the last token of the sentence).  A ``Span`` covering the
    word is registered on ``doc`` and stored in ``self.id2span`` under the
    token's ``node_id``.  Afterwards every node in ``c_parse['id2node']``
    that has no span yet receives one stretching from its first leaf to its
    last leaf.

    Args:
        doc: Object exposing ``add_token_span(span)``.
        c_parse: Mapping with the keys ``'tokens'`` (token nodes of the
            sentence) and ``'id2node'`` (node id -> node).
    """
    tokens = c_parse['tokens']
    nodes_by_id = c_parse['id2node']
    last_position = len(tokens) - 1

    # Build token spans.
    for position, token in enumerate(tokens):
        surface = html.unescape(token.attributes['word'])
        # Words are space separated; the sentence is closed by a newline.
        self.text += surface + ('\n' if position == last_position else ' ')

        begin = self.offset
        end = begin + len(surface)
        # Step over the one-character separator written after the word.
        self.offset = end + 1

        span = Span(begin, end)
        doc.add_token_span(span)
        self.id2span[token.node_id] = span

    # Nodes without a span of their own cover their first through last leaf.
    for node_id, node in nodes_by_id.items():
        if node_id in self.id2span:
            continue
        leaves = node.get_leaves()
        first_span = self.id2span[leaves[0].attributes['id']]
        last_span = self.id2span[leaves[-1].attributes['id']]
        self.id2span[node_id] = Span(first_span.begin, last_span.end)
def get_nltk_span(token_spans, sent_num, indice_groups):
    """Merge the token spans of each index group into one ``Span``.

    For every group of token indices, the begin of the first non-empty token
    span and the end of the last non-empty token span in sentence
    ``sent_num`` are combined.  Groups that yield no non-negative begin/end
    pair are skipped.

    Args:
        token_spans: Per-sentence sequences of ``(begin, end)`` pairs; falsy
            entries (e.g. ``None``) are ignored.
        sent_num: Index of the sentence to look tokens up in.
        indice_groups: Iterable of iterables of token indices.

    Returns:
        A list with one ``Span`` per group that produced a valid range.
    """
    merged = []
    for group in indice_groups:
        first_begin = last_end = -1
        for token_index in group:
            token_span = token_spans[sent_num][token_index]
            if not token_span:
                continue
            if first_begin < 0:
                first_begin = token_span[0]
            last_end = token_span[1]

        if first_begin < 0 or last_end < 0:
            continue
        merged.append(Span(first_begin, last_end))
    return merged
def parse_conll_data(self, corpus, conll_in):
    """Build a document from CoNLL-style column data with predicate arguments.

    Each data line is whitespace separated and must have at least eight
    fields: the fourth field is the token and the eighth is the predicate
    sense (``"-"`` means the token is not a predicate).  Every field after
    the eighth is one bracketed argument-annotation column; the i-th column
    is paired with the i-th predicate of the sentence.  Any line with fewer
    than eight fields ends the current sentence.

    The document text is the tokens joined by single spaces, with a newline
    added for each sentence-ending line.

    Args:
        corpus: Passed through to ``DEDocument``.
        conll_in: Iterable of text lines.

    Returns:
        The populated ``DEDocument``.
    """
    text_parts = []
    offset = 0
    arg_text = []
    sent_predicates = []
    sent_args = defaultdict(list)
    doc = DEDocument(corpus)
    props = []

    def flush_sentence():
        # Pair the i-th predicate with the arguments of the i-th column.
        for pred_index, predicate in enumerate(sent_predicates):
            props.append((predicate, sent_args[pred_index]))
        sent_predicates.clear()
        sent_args.clear()
        arg_text.clear()

    for line in conll_in:
        parts = line.strip().split()
        if len(parts) < 8:
            # Sentence boundary (blank or otherwise short line).
            text_parts.append("\n")
            offset += 1
            flush_sentence()
            continue

        token = parts[3]
        sense = parts[7]
        pb_annos = parts[8:]

        if not arg_text:
            # First token of a sentence: one text slot per annotation column.
            arg_text = [None] * len(pb_annos)

        start = offset
        end = start + len(token)
        text_parts.append(token + " ")
        offset = end + 1

        # Extend the text of every argument that is currently open.
        for column, pending in enumerate(arg_text):
            if pending:
                arg_text[column] = pending + " " + token

        if sense != "-":
            sent_predicates.append((start, end, token))

        for column, anno in enumerate(pb_annos):
            if anno == "(V*)":
                # The predicate itself is not recorded as an argument.
                continue

            if anno.startswith("("):
                role = anno.strip("(").strip(")").strip("*")
                sent_args[column].append([role, start])
                arg_text[column] = token

            if anno.endswith(")"):
                # Close the most recently opened argument of this column.
                sent_args[column][-1].append(end)
                sent_args[column][-1].append(arg_text[column])
                arg_text[column] = ""

    # Input that does not end with a separator line would otherwise lose the
    # propositions of its final sentence.
    flush_sentence()

    doc.set_text("".join(text_parts))

    for (p_start, p_end, p_token), args in props:
        hopper = doc.add_hopper()
        pred = doc.add_predicate(hopper, Span(p_start, p_end), p_token)
        if pred is None:
            continue
        for role, arg_start, arg_end, filler_text in args:
            filler = doc.add_filler(Span(arg_start, arg_end), filler_text)
            doc.add_argument_mention(pred, filler.aid, role)

    return doc
def parse_full_text(self, full_text_file, doc):
    """Read sentence-level frame annotations from an XML file into *doc*.

    The text of all ``icsi:sentence`` elements is concatenated, one sentence
    per line, and set as the document text.  For every ``annotationSet``
    carrying a ``frameName``, the longest label found in its ``Target``
    layers becomes a predicate and every ``FE`` label without an ``itype``
    attribute becomes a filler attached to that predicate.  Label offsets
    are sentence relative with an inclusive ``end``; they are shifted to
    document offsets.

    Args:
        full_text_file: File name or file object accepted by ``ET.parse``.
        doc: Document object that receives the text and annotations.

    Returns:
        The same ``doc`` that was passed in.
    """

    def locate(label, sentence_text, base):
        # Turn an inclusive, sentence-relative label into a document-level
        # (begin, end, covered text) triple with an exclusive end.
        begin = int(label.attrib["start"])
        end = int(label.attrib["end"]) + 1
        return begin + base, end + base, sentence_text[begin:end]

    root = ET.parse(full_text_file).getroot()
    ns = self.ns

    full_text = ""
    collected = []

    for sentence in root.findall("icsi:sentence", ns):
        sentence_text = sentence.find("icsi:text", ns).text
        # Document offset at which this sentence starts.
        base = len(full_text)
        full_text += sentence_text + "\n"

        for anno_set in sentence.findall("icsi:annotationSet", ns):
            if "frameName" not in anno_set.attrib:
                continue
            frame_name = anno_set.attrib["frameName"]

            targets = []
            frame_elements = []
            for layer in anno_set.findall("icsi:layer", ns):
                layer_name = layer.attrib["name"]
                if layer_name == "Target":
                    label = layer.find("icsi:label", ns)
                    if label is not None:
                        targets.append(locate(label, sentence_text, base))
                elif layer_name == "FE":
                    for label in layer.findall("icsi:label", ns):
                        role = label.attrib["name"]
                        # Null instantiation.
                        if "itype" in label.attrib:
                            continue
                        frame_elements.append(
                            locate(label, sentence_text, base) + (role,)
                        )

            if not targets:
                continue

            # Keep the first target that is strictly longer than all before.
            longest = None
            longest_len = 0
            for candidate in targets:
                candidate_len = candidate[1] - candidate[0]
                if candidate_len > longest_len:
                    longest_len = candidate_len
                    longest = candidate
            collected.append((frame_name, longest, frame_elements))

    doc.set_text(full_text)

    for frame_name, target, frame_elements in collected:
        hopper = doc.add_hopper()
        target_begin, target_end, target_text = target
        predicate = doc.add_predicate(
            hopper,
            Span(target_begin, target_end),
            text=target_text,
            frame_type=frame_name,
        )
        for fe_begin, fe_end, fe_text, role in frame_elements:
            filler = doc.add_filler(Span(fe_begin, fe_end), fe_text)
            doc.add_argument_mention(predicate, filler.aid, role)

    return doc
def parse_ace_data(self, corpus, source_file, anno_file):
    """Create a document from a source text file and its XML annotation file.

    The document text comes from ``self.get_source_text`` applied to the
    opened source file.  From the annotation XML, every ``entity`` is added
    with the type ``TYPE_SUBTYPE`` and each ``charseq`` under a mention's
    ``head`` becomes an entity mention.  Every ``event`` becomes a hopper
    whose ``anchor`` char sequences become predicates.  Each
    ``event_argument`` whose ``REFID`` names an entity with mentions is
    attached, using the event/entity mention pair chosen by
    ``find_close_mention``.  ``END`` offsets in the XML are inclusive, so
    spans use ``END + 1``.

    Args:
        corpus: Passed through to ``DEDocument``.
        source_file: Path of the raw source text.
        anno_file: Path (or file object) of the annotation XML.

    Returns:
        The populated ``DEDocument``.
    """

    def to_span(charseq):
        # Convert inclusive START/END attributes to an end-exclusive span.
        begin = int(charseq.attrib["START"])
        last = int(charseq.attrib["END"])
        return Span(begin, last + 1)

    with open(source_file) as source_in:
        doc = DEDocument(corpus)
        doc.set_text(self.get_source_text(source_in))

        root = ET.parse(anno_file).getroot()

        for xml_doc in root.iter("document"):
            doc.set_id(xml_doc.attrib["DOCID"])

            # Parse entity.
            entity2mention = defaultdict(list)
            for entity in xml_doc.iter("entity"):
                full_type = f'{entity.attrib["TYPE"]}_{entity.attrib["SUBTYPE"]}'
                entity_id = entity.attrib["ID"]
                ent = doc.add_entity(full_type, entity_id)

                for mention_node in entity:
                    for head in mention_node.iter("head"):
                        for charseq in head.iter("charseq"):
                            mention = doc.add_entity_mention(
                                ent,
                                to_span(charseq),
                                charseq.text,
                                mention_node.attrib["ID"],
                                entity_type=full_type,
                                validate=False,
                            )
                            entity2mention[entity_id].append(mention)

            # Parse event.
            for event_node in xml_doc.iter("event"):
                frame_type = (
                    f'{event_node.attrib["TYPE"]}_{event_node.attrib["SUBTYPE"]}'
                )
                hopper = doc.add_hopper(event_node.attrib["ID"])

                event_mentions = []
                for evm_node in event_node:
                    for anchor in evm_node.iter("anchor"):
                        for charseq in anchor.iter("charseq"):
                            predicate = doc.add_predicate(
                                hopper,
                                to_span(charseq),
                                charseq.text,
                                eid=evm_node.attrib["ID"],
                                frame_type=frame_type,
                                validate=False,
                            )
                            event_mentions.append(predicate)

                for arg_node in event_node.iter("event_argument"):
                    role = arg_node.attrib["ROLE"]
                    candidates = entity2mention[arg_node.attrib["REFID"]]
                    if not candidates:
                        continue
                    closest_ent, closest_evm, _ = find_close_mention(
                        event_mentions, candidates)
                    doc.add_argument_mention(closest_evm, closest_ent.aid,
                                             role)

    return doc
def parse_ere(self, ere_file, doc):
    """Load ERE-style XML annotations from *ere_file* into *doc*.

    The root element supplies ``doc_id`` and ``source_type``.  Its
    ``entities``, ``fillers``, ``hoppers`` and ``relations`` children are
    read in that order; a section that is absent is treated as empty.
    ``offset``/``length`` attributes are converted to integers and turned
    into spans of the form ``Span(offset, offset + length)``.

    Args:
        ere_file: File name or file object accepted by ``ET.parse``.
        doc: Document object that receives ids, mentions and relations.
    """
    root = ET.parse(ere_file).getroot()
    doc_info = root.attrib

    # Call the setters; assigning to them would replace the bound methods
    # on this instance and leave the document without an id.
    # NOTE(review): ``set_id`` is invoked as a method elsewhere in this file;
    # ``set_doc_type`` is assumed to be a setter of the same kind - confirm.
    doc.set_id(doc_info['doc_id'])
    doc.set_doc_type(doc_info['source_type'])

    def section(tag):
        # Children of the named top-level section, or nothing if missing.
        node = root.find(tag)
        return [] if node is None else list(node)

    def span_of(attrib):
        # XML attributes are strings: convert before doing arithmetic.
        begin = int(attrib['offset'])
        return Span(begin, begin + int(attrib['length']))

    for entity_node in section('entities'):
        ent = doc.add_entity(entity_node.attrib['type'],
                             entity_node.attrib['id'])

        for entity_mention in entity_node.findall('entity_mention'):
            ent_info = entity_mention.attrib
            entity_text = entity_mention.find('mention_text').text
            doc.add_entity_mention(
                ent, span_of(ent_info), entity_text,
                ent_info['id'],
                noun_type=ent_info['noun_type'],
                entity_type=ent_info.get('type', None),
            )

    for filler in section('fillers'):
        filler_info = filler.attrib
        doc.add_filler(span_of(filler_info), filler.text,
                       eid=filler_info['id'],
                       filler_type=filler_info['type'])

    for event_node in section('hoppers'):
        event = doc.add_hopper(event_node.attrib['id'])

        for event_mention in event_node.findall('event_mention'):
            evm_info = event_mention.attrib
            trigger = event_mention.find('trigger')

            evm = doc.add_predicate(
                event, span_of(trigger.attrib), trigger.text,
                eid=evm_info['id'],
                frame_type=evm_info['type'] + '_' + evm_info['subtype'],
                realis=evm_info['realis'])

            for em_arg in event_mention.findall('em_arg'):
                arg_info = em_arg.attrib

                # A filler id takes precedence when both ids are present;
                # the target stays None when neither is given.
                arg_ent_mention = None
                if 'entity_mention_id' in arg_info:
                    arg_ent_mention = arg_info['entity_mention_id']
                if 'filler_id' in arg_info:
                    arg_ent_mention = arg_info['filler_id']

                doc.add_argument_mention(evm, arg_ent_mention,
                                         arg_info['role'])

    for relation_node in section('relations'):
        relation_info = relation_node.attrib
        relation = doc.add_relation(
            relation_info['id'],
            relation_type=relation_info['type'] + '_' +
                          relation_info['subtype'])

        for rel_mention_node in relation_node.findall('relation_mention'):
            rel_mention_id = rel_mention_node.attrib['id']
            rel_realis = rel_mention_node.attrib['realis']

            args = {}
            for mention_part in rel_mention_node:
                if mention_part.tag.startswith('rel_arg'):
                    if 'entity_mention_id' in mention_part.attrib:
                        ent_id = mention_part.attrib['entity_mention_id']
                    else:
                        ent_id = mention_part.attrib['filler_id']
                    args[mention_part.attrib['role']] = ent_id

            trigger = rel_mention_node.find('trigger')
            if trigger is not None:
                trigger_text = trigger.text
                trigger_span = span_of(trigger.attrib)
            else:
                # Relation mentions without a trigger keep an empty span.
                trigger_text = ''
                trigger_span = Span(None, None)

            rel_mention = RelationMention(rel_mention_id, trigger_span,
                                          trigger_text, rel_realis)

            for role, ent in args.items():
                rel_mention.add_arg(role, ent)

            relation.add_mention(rel_mention)
def parse_ere(self, ere_file, doc):
    """Load ERE-style XML annotations from *ere_file* into *doc*.

    The root element supplies ``doc_id`` and ``source_type``.  Its
    ``entities``, ``fillers``, ``hoppers`` and ``relations`` children are
    read in that order; a section that is absent is treated as empty.
    ``offset``/``length`` attributes are converted to integers and turned
    into spans of the form ``Span(offset, offset + length)``.

    Args:
        ere_file: File name or file object accepted by ``ET.parse``.
        doc: Document object that receives ids, mentions and relations.
    """
    root = ET.parse(ere_file).getroot()
    doc_info = root.attrib

    # Call the setters; assigning to them would replace the bound methods
    # on this instance and leave the document without an id.
    # NOTE(review): ``set_id`` is invoked as a method elsewhere in this file;
    # ``set_doc_type`` is assumed to be a setter of the same kind - confirm.
    doc.set_id(doc_info["doc_id"])
    doc.set_doc_type(doc_info["source_type"])

    def section(tag):
        # Children of the named top-level section, or nothing if missing.
        node = root.find(tag)
        return [] if node is None else list(node)

    def span_of(attrib):
        # XML attributes are strings: convert before doing arithmetic.
        begin = int(attrib["offset"])
        return Span(begin, begin + int(attrib["length"]))

    for entity_node in section("entities"):
        ent = doc.add_entity(entity_node.attrib["type"], entity_node.attrib["id"])

        for entity_mention in entity_node.findall("entity_mention"):
            ent_info = entity_mention.attrib
            entity_text = entity_mention.find("mention_text").text
            doc.add_entity_mention(
                ent,
                span_of(ent_info),
                entity_text,
                ent_info["id"],
                noun_type=ent_info["noun_type"],
                entity_type=ent_info.get("type", None),
            )

    for filler in section("fillers"):
        filler_info = filler.attrib
        doc.add_filler(
            span_of(filler_info),
            filler.text,
            eid=filler_info["id"],
            filler_type=filler_info["type"],
        )

    for event_node in section("hoppers"):
        event = doc.add_hopper(event_node.attrib["id"])

        for event_mention in event_node.findall("event_mention"):
            evm_info = event_mention.attrib
            trigger = event_mention.find("trigger")

            evm = doc.add_predicate(
                event,
                span_of(trigger.attrib),
                trigger.text,
                eid=evm_info["id"],
                frame_type=evm_info["type"] + "_" + evm_info["subtype"],
                realis=evm_info["realis"],
            )

            for em_arg in event_mention.findall("em_arg"):
                arg_info = em_arg.attrib

                # A filler id takes precedence when both ids are present;
                # the target stays None when neither is given.
                arg_ent_mention = None
                if "entity_mention_id" in arg_info:
                    arg_ent_mention = arg_info["entity_mention_id"]
                if "filler_id" in arg_info:
                    arg_ent_mention = arg_info["filler_id"]

                doc.add_argument_mention(evm, arg_ent_mention, arg_info["role"])

    for relation_node in section("relations"):
        relation_info = relation_node.attrib
        relation = doc.add_relation(
            relation_info["id"],
            relation_type=relation_info["type"] + "_" + relation_info["subtype"],
        )

        for rel_mention_node in relation_node.findall("relation_mention"):
            rel_mention_id = rel_mention_node.attrib["id"]
            rel_realis = rel_mention_node.attrib["realis"]

            args = {}
            for mention_part in rel_mention_node:
                if mention_part.tag.startswith("rel_arg"):
                    if "entity_mention_id" in mention_part.attrib:
                        ent_id = mention_part.attrib["entity_mention_id"]
                    else:
                        ent_id = mention_part.attrib["filler_id"]
                    args[mention_part.attrib["role"]] = ent_id

            trigger = rel_mention_node.find("trigger")
            if trigger is not None:
                trigger_text = trigger.text
                trigger_span = span_of(trigger.attrib)
            else:
                # Relation mentions without a trigger keep an empty span.
                trigger_text = ""
                trigger_span = Span(None, None)

            rel_mention = RelationMention(
                rel_mention_id,
                trigger_span,
                trigger_text,
                rel_realis,
            )

            for role, ent in args.items():
                rel_mention.add_arg(role, ent)

            relation.add_mention(rel_mention)