Beispiel #1
0
def stat_docs(docs: List[Doc], stat: StatRecorder):
    """Record document-level statistics, then descend into each document's sentences.

    For every document this records, in order:
      * ``"doc"`` with value 1 via ``stat.record_kv``;
      * ``"doc_nsent_d10"`` with the sentence count floor-divided by 10
        via ``stat.srecord_kv``;
    and then hands ``doc.sents`` to ``stat_sents`` with the same recorder.

    Args:
        docs: documents to scan; each must expose a sized ``sents`` attribute.
        stat: recorder receiving the ``record_kv`` / ``srecord_kv`` calls.
    """
    for one_doc in docs:
        stat.record_kv("doc", 1)
        # Bucket the sentence count by tens (0-9 -> 0, 10-19 -> 1, ...).
        nsent_bucket, _ = divmod(len(one_doc.sents), 10)
        stat.srecord_kv("doc_nsent_d10", nsent_bucket)
        stat_sents(one_doc.sents, stat)
Beispiel #2
0
def stat_sents(sents: List[Sent], stat: StatRecorder):
    """Record sentence-, frame- and argument-level statistics into ``stat``.

    Keys written through ``stat.record_kv``:
        ``sent``, ``tok``, ``frame``, ``frame_overlapped``, ``arg``,
        ``arg_R{rank}``, ``arg_overlapped``, ``arg_overlapped_R{rank}``.
    Keys written through ``stat.srecord_kv``:
        ``sent_ntok_d10``, ``sent_nframe``, ``frame_wlen``,
        ``frame_trigger_pos``, ``frame_type``, ``frame_type0``,
        ``frame_narg``, ``arg_wlen_m30``, ``arg_role``, ``arg_repeat``,
        ``arg_repeatR``.

    NOTE(review): the difference between ``record_kv`` and ``srecord_kv`` is
    defined by ``StatRecorder`` and is not visible here -- confirm there.

    Args:
        sents: sentences to scan. Each must support ``len()`` and expose
            ``events`` (frames) and ``seq_upos`` (may be ``None``).
        stat: recorder receiving all of the calls listed above.
    """
    # --
    def _spans_intersect(item_a, item_b):
        # Compare the (widx, wridx) ranges of the two mentions. Spans that
        # merely touch (one starts exactly where the other ends) do not count.
        mention_a, mention_b = item_a.mention, item_b.mention
        a_start, a_end = mention_a.widx, mention_a.wridx
        b_start, b_end = mention_b.widx, mention_b.wridx
        return a_start < b_end and b_start < a_end

    def _overlap_flag(item, pool, keep=None):
        # 1 if ``item`` intersects any *other* member of ``pool`` (restricted
        # to members accepted by ``keep`` when given), otherwise 0.
        for other in pool:
            if other is item:
                continue
            if keep is not None and not keep(other):
                continue
            if _spans_intersect(item, other):
                return 1
        return 0
    # --
    for sent in sents:
        stat.record_kv("sent", 1)
        n_tok = len(sent)
        stat.record_kv("tok", n_tok)
        stat.srecord_kv("sent_ntok_d10", n_tok // 10)
        frames = sent.events
        stat.srecord_kv("sent_nframe", len(frames))
        # UPOS values for the sentence, when the sequence is available.
        if sent.seq_upos is None:
            upos_vals = None
        else:
            upos_vals = sent.seq_upos.vals
        # frames
        for frame in frames:
            widx, wlen = frame.mention.get_span()
            # --
            stat.record_kv("frame", 1)
            # length of the frame target
            stat.srecord_kv("frame_wlen", wlen)
            # UPOS tags under the trigger span (empty string without UPOS)
            trigger_tags = [] if upos_vals is None else upos_vals[widx:widx + wlen]
            stat.srecord_kv("frame_trigger_pos", ",".join(trigger_tags))
            # does the target overlap another frame of this sentence?
            stat.record_kv("frame_overlapped", _overlap_flag(frame, frames))
            # frame type, plus the part before the first "."
            frame_type = frame.type
            stat.srecord_kv("frame_type", frame_type)
            stat.srecord_kv("frame_type0", frame_type.split(".")[0])  # in case of PB
            # arguments
            role_counts = Counter()
            arg_links = frame.args
            stat.srecord_kv("frame_narg", len(arg_links))
            for alink in arg_links:
                rank = alink.info.get("rank", 1)
                # --
                stat.record_kv("arg", 1)
                stat.record_kv(f"arg_R{rank}", 1)
                # argument span length, capped at 30
                stat.srecord_kv("arg_wlen_m30", min(30, alink.mention.wlen))
                # overlap with any sibling argument ...
                stat.record_kv("arg_overlapped", _overlap_flag(alink, arg_links))
                # ... and with sibling arguments of the same rank only
                stat.record_kv(
                    f"arg_overlapped_R{rank}",
                    _overlap_flag(
                        alink,
                        arg_links,
                        keep=lambda sibling: sibling.info.get("rank", 1) == rank,
                    ),
                )
                # argument role
                role = alink.role
                stat.srecord_kv("arg_role", role)
                # --
                role_counts[role] += 1
            # how often each role occurs within this frame
            for rr, cc in role_counts.items():
                stat.srecord_kv("arg_repeat", cc, c=cc)
                if cc > 1:
                    stat.srecord_kv("arg_repeatR", f"{cc}*{rr}")