def stat_docs(docs: List[Doc], stat: StatRecorder):
    """Record per-document entries into ``stat``, then the stats of each doc's sentences.

    For every document, in order:
      * ``"doc"`` is recorded with value 1 via ``stat.record_kv``;
      * ``"doc_nsent_d10"`` is recorded via ``stat.srecord_kv`` with the number
        of sentences floor-divided by 10;
      * the document's sentences are passed on to ``stat_sents``.

    Args:
        docs: documents to walk; each must expose a ``sents`` sequence.
        stat: recorder object receiving the ``record_kv`` / ``srecord_kv`` calls.
    """
    for one_doc in docs:
        stat.record_kv("doc", 1)
        doc_sents = one_doc.sents
        # sentence count in buckets of ten (0-9 -> 0, 10-19 -> 1, ...)
        stat.srecord_kv("doc_nsent_d10", len(doc_sents) // 10)
        stat_sents(doc_sents, stat)
def stat_sents(sents: List[Sent], stat: StatRecorder):
    """Record sentence-, frame- and argument-level entries into ``stat``.

    Keys recorded (through ``stat.record_kv`` / ``stat.srecord_kv``):
      * per sentence: ``sent``, ``tok``, ``sent_ntok_d10``, ``sent_nframe``;
      * per frame (each item of ``sent.events``): ``frame``, ``frame_wlen``,
        ``frame_trigger_pos``, ``frame_overlapped``, ``frame_type``,
        ``frame_type0``, ``frame_narg``;
      * per argument link (each item of ``frame.args``): ``arg``,
        ``arg_R{rank}``, ``arg_wlen_m30``, ``arg_overlapped``,
        ``arg_overlapped_R{rank}``, ``arg_role``;
      * per distinct role inside a frame: ``arg_repeat`` and, when the role
        occurs more than once, ``arg_repeatR``.

    Args:
        sents: sentences to walk; each must support ``len()`` and expose
            ``events`` and ``seq_upos`` (which may be ``None``).
        stat: recorder object receiving the calls.
    """

    def _spans_intersect(item_a, item_b):
        # Compare the (widx, wridx) ranges of the two mentions. Spans that only
        # touch (one's widx equals the other's wridx) do not count as overlapping.
        a_lo, a_hi = item_a.mention.widx, item_a.mention.wridx
        b_lo, b_hi = item_b.mention.widx, item_b.mention.wridx
        return a_lo < b_hi and b_lo < a_hi

    for cur_sent in sents:
        # ---- sentence-level entries
        stat.record_kv("sent", 1)
        stat.record_kv("tok", len(cur_sent))
        stat.srecord_kv("sent_ntok_d10", len(cur_sent) // 10)
        sent_frames = cur_sent.events
        stat.srecord_kv("sent_nframe", len(sent_frames))
        upos_seq = cur_sent.seq_upos
        upos_tags = None if upos_seq is None else upos_seq.vals

        # ---- frame-level entries
        for evt in sent_frames:
            start, width = evt.mention.get_span()
            stat.record_kv("frame", 1)
            # length of the frame's mention span
            stat.srecord_kv("frame_wlen", width)
            # UPOS tags covering the span, comma-joined (empty string without tags)
            if upos_tags is None:
                span_tags = []
            else:
                span_tags = upos_tags[start:start + width]
            stat.srecord_kv("frame_trigger_pos", ",".join(span_tags))
            # does this frame's span intersect any other frame of the sentence?
            evt_overlaps = any(
                _spans_intersect(evt, other)
                for other in sent_frames
                if other is not evt
            )
            stat.record_kv("frame_overlapped", int(evt_overlaps))
            # full type, plus the part before the first "." (the original comment
            # says "in case of PB" -- presumably PropBank-style types; confirm)
            evt_type = evt.type
            stat.srecord_kv("frame_type", evt_type)
            stat.srecord_kv("frame_type0", evt_type.split(".")[0])

            # ---- argument-level entries
            role_counts = Counter()
            arg_links = evt.args
            stat.srecord_kv("frame_narg", len(arg_links))
            for link in arg_links:
                link_rank = link.info.get("rank", 1)  # a missing rank counts as 1
                stat.record_kv("arg", 1)
                stat.record_kv(f"arg_R{link_rank}", 1)
                # argument span length, capped at 30
                stat.srecord_kv("arg_wlen_m30", min(30, link.mention.wlen))
                # overlap with any other argument of the same frame ...
                siblings = [cand for cand in arg_links if cand is not link]
                overlaps_any = any(_spans_intersect(link, cand) for cand in siblings)
                stat.record_kv("arg_overlapped", int(overlaps_any))
                # ... and with other arguments sharing the same rank only
                overlaps_same_rank = any(
                    _spans_intersect(link, cand)
                    for cand in siblings
                    if cand.info.get("rank", 1) == link_rank
                )
                stat.record_kv(f"arg_overlapped_R{link_rank}", int(overlaps_same_rank))
                # role label, also tallied for the repeat check below
                link_role = link.role
                stat.srecord_kv("arg_role", link_role)
                role_counts[link_role] += 1

            # ---- repeated roles inside this frame
            for role_name, n_times in role_counts.items():
                stat.srecord_kv("arg_repeat", n_times, c=n_times)
                if n_times <= 1:
                    continue
                stat.srecord_kv("arg_repeatR", f"{n_times}*{role_name}")