from collections import Counter
from typing import List

# NOTE: SimpleVocab, yield_sents, zlog, WordVectors, default_json_serializer,
# ZsfpVocabPackage, DConf and ZTaskUdepConf come from this project's own
# modules; their exact import paths are not shown here.

def do_stat_srl(insts):
    cc = Counter()
    cc_narg = Counter()
    voc = SimpleVocab.build_empty()
    # set_ee_heads(insts)
    voc_pred, voc_arg = SimpleVocab.build_empty(), SimpleVocab.build_empty()
    voc_deplab = SimpleVocab.build_empty()
    for sent in yield_sents(insts):
        cc["sent"] += 1
        cc["tok"] += len(sent)
        cc["frame"] += len(sent.events)
        # --
        _tree = sent.tree_dep
        if _tree is not None:
            voc_deplab.feed_iter(_tree.seq_label.vals)
        for evt in sent.events:
            voc_pred.feed_one(evt.label)
            evt_widx = evt.mention.shead_widx  # head word index of the predicate (currently unused)
            cc_narg[f"NARG={len(evt.args)}"] += 1
            for arg in evt.args:
                voc_arg.feed_one(arg.label)
                cc["arg"] += 1
                # check arg overlap; note each overlapping pair is counted twice (once per direction)
                for a2 in evt.args:
                    if a2 is arg:
                        continue  # not self
                    if not (arg.mention.widx >= a2.mention.wridx or a2.mention.widx >= arg.mention.wridx):
                        cc["arg_overlap"] += 1
                    else:
                        cc["arg_overlap"] += 0  # no-op, but makes the key show up in the counter
    # --
    voc.build_sort()
    voc_pred.build_sort()
    voc_arg.build_sort()
    voc_deplab.build_sort()
    # --
    # get more stats: tokens/sent, frames/sent, args/frame
    cc2 = dict(cc)
    cc2.update({"t/s": f"{cc['tok']/cc['sent']:.2f}",
                "f/s": f"{cc['frame']/cc['sent']:.2f}",
                "a/f": f"{cc['arg']/cc['frame']:.2f}"})
    zlog(f"CC: {cc2}")
    zlog(cc_narg)
    zlog(voc_arg.counts)
    # --
    MAX_PRINT_ITEMS = 20
    d_pred = voc_pred.get_info_table()
    print(d_pred[:MAX_PRINT_ITEMS].to_string())
    d_arg = voc_arg.get_info_table()
    print(d_arg[:MAX_PRINT_ITEMS].to_string())
    d_deplab = voc_deplab.get_info_table()
    print(d_deplab[:MAX_PRINT_ITEMS].to_string())
    d = voc.get_info_table()
    print(d[:MAX_PRINT_ITEMS].to_string())
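
# A standalone sanity check of the overlap test in do_stat_srl above. This is
# a toy sketch, assuming mention.widx / mention.wridx are a span's start index
# and exclusive end index (the data below is made up): two half-open spans
# [s1, e1) and [s2, e2) overlap exactly when neither one starts at or after
# the other's end.
def _spans_overlap(s1, e1, s2, e2):
    return not (s1 >= e2 or s2 >= e1)

assert _spans_overlap(0, 3, 2, 5)      # [0,3) and [2,5) share token 2
assert not _spans_overlap(0, 3, 3, 5)  # adjacent spans do not overlap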

def do_stat(insts):
    cc = Counter()
    voc = SimpleVocab.build_empty()
    for sent in yield_sents(insts):
        cc["sent"] += 1
        cc["tok"] += len(sent)
        cc["tok_pair"] += len(sent) ** 2
        _tree = sent.tree_dep
        _deplabs = _tree.seq_label.vals
        _slen = len(sent)
        for i0 in range(_slen):
            for i1 in range(_slen):
                if abs(i0 - i1) > 5:  # only look at nearby token pairs
                    continue
                path1, path2 = _tree.get_path(i0, i1)
                # coarse labels (UD subtypes stripped) along both halves of the
                # tree path; sorting the two halves canonicalizes direction
                labs1, labs2 = sorted([[_deplabs[z].split(":")[0] for z in path1],
                                       [_deplabs[z].split(":")[0] for z in path2]])
                _len = len(labs1) + len(labs2)
                # if _len<=0 or _len>2 or "punct" in labs1 or "punct" in labs2: continue
                if _len != 2 or "punct" in labs1 or "punct" in labs2:
                    continue
                _k = (tuple(labs1), tuple(labs2))
                voc.feed_one(_k)
    # --
    zlog(cc)
    voc.build_sort()
    d = voc.get_info_table()
    print(d[:100].to_string())
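
# A small sketch of the path decomposition assumed by do_stat above. This is
# NOT the project's tree API: it is a hypothetical helper illustrating one
# plausible reading of _tree.get_path(i0, i1), namely that it returns the two
# halves of the tree path, as token indices, from i0 and from i1 up to (but
# excluding) their lowest common ancestor; each token's incoming dependency
# label is then looked up in _deplabs.
def _path_to_lca(heads, i0, i1):
    def chain(i):  # i plus all its ancestors, bottom-up (-1 marks the root's head)
        out = []
        while i != -1:
            out.append(i)
            i = heads[i]
        return out
    c0, c1 = chain(i0), chain(i1)
    while c0 and c1 and c0[-1] == c1[-1]:  # drop shared ancestors, incl. the LCA
        c0.pop(); c1.pop()
    return c0, c1

# e.g. heads = [-1, 0, 0] (token 0 is the root): the path between the two
# siblings 1 and 2 is ([1], [2]), i.e. one incoming label per side, _len == 2.
assert _path_to_lca([-1, 0, 0], 1, 2) == ([1], [2])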

def build_vocab(self, datasets: List):
    voc_upos = SimpleVocab.build_empty(self.name)
    for dataset in datasets:
        for sent in yield_sents(dataset.insts):
            voc_upos.feed_iter(sent.seq_upos.vals)
    # finished
    voc_upos.build_sort()
    return (voc_upos, )
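
# A minimal stand-in for the feed/build pattern used by build_vocab above.
# SimpleVocab is project-internal; this toy class only illustrates the assumed
# contract: feed_* accumulates counts, build_sort then freezes an index order
# (here: descending frequency, ties broken alphabetically).
from collections import Counter

class _ToyVocab:
    def __init__(self):
        self.counts = Counter()
        self.i2w = []
    def feed_one(self, w, c=1):
        self.counts[w] += c
    def feed_iter(self, ws):
        for w in ws:
            self.feed_one(w)
    def build_sort(self):
        self.i2w = [w for w, _ in sorted(self.counts.items(), key=lambda kv: (-kv[1], kv[0]))]

_v = _ToyVocab()
_v.feed_iter(["NOUN", "VERB", "NOUN"])
_v.build_sort()
assert _v.i2w == ["NOUN", "VERB"]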

def build_vocab(self, datasets: List):
    conf: ZTaskUdepConf = self.conf
    # --
    voc_udep = SimpleVocab.build_empty(self.name)
    for dataset in datasets:
        for sent in yield_sents(dataset.insts):
            _vals = sent.tree_dep.seq_label.vals
            if conf.use_l1:  # keep only the coarse (layer-1) part of each label
                _vals = [z.split(":")[0] for z in _vals]
            voc_udep.feed_iter(_vals)
    voc_udep.build_sort()
    _, udep_direct_range = voc_udep.non_special_range()  # range of direct labels
    zlog(f"Finish building voc_udep: {voc_udep}")
    return (voc_udep, udep_direct_range)
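
# The use_l1 branch above relies on the Universal Dependencies convention that
# a label may carry a subtype after a colon (e.g. "nsubj:pass"); splitting on
# ":" and keeping the first part yields the coarse label. A quick check:
assert "nsubj:pass".split(":")[0] == "nsubj"
assert "obl".split(":")[0] == "obl"  # labels without a subtype pass through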

def build_from_stream(dconf: DConf, stream, extra_stream):
    zlog("Build vocabs from streams.")
    # here, collect them all
    # -- basic inputs
    voc_word = SimpleVocab.build_empty("word")
    voc_lemma = SimpleVocab.build_empty("lemma")
    voc_upos = SimpleVocab.build_empty("upos")
    voc_char = SimpleVocab.build_empty("char")
    voc_deplab = SimpleVocab.build_empty("deplab")
    # -- frame ones
    voc_evt, voc_ef, voc_arg = SimpleVocab.build_empty("evt"), SimpleVocab.build_empty("ef"), SimpleVocab.build_empty("arg")
    voc_collections = {"word": voc_word, "lemma": voc_lemma, "upos": voc_upos, "char": voc_char,
                       "deplab": voc_deplab, "evt": voc_evt, "ef": voc_ef, "arg": voc_arg}
    # read all and build
    for sent in yield_sents(stream):
        # -- basic inputs
        if sent.seq_word is not None:
            voc_word.feed_iter(sent.seq_word.vals)
            for w in sent.seq_word.vals:
                voc_char.feed_iter(w)
        if sent.seq_lemma is not None:
            voc_lemma.feed_iter(sent.seq_lemma.vals)
        if sent.seq_upos is not None:
            voc_upos.feed_iter(sent.seq_upos.vals)
        if sent.tree_dep is not None and sent.tree_dep.seq_label is not None:
            voc_deplab.feed_iter(sent.tree_dep.seq_label.vals)
        # -- frames
        if sent.entity_fillers is not None:
            voc_ef.feed_iter((ef.type for ef in sent.entity_fillers))
        if sent.events is not None:
            voc_evt.feed_iter((evt.type for evt in sent.events))
            for evt in sent.events:
                if evt.args is not None:
                    voc_arg.feed_iter((arg.role for arg in evt.args))
    # sort everyone!
    for voc in voc_collections.values():
        voc.build_sort()
    # extra for evt/arg
    if dconf.dict_frame_file:
        frames = default_json_serializer.from_file(dconf.dict_frame_file)
        for one_f in frames.values():  # no count, simply feed!!
            if len(one_f["lexUnit"]) > 0:  # todo(+W): currently ignore non-lex frames
                voc_evt.feed_one(one_f["name"], c=0)
                for one_fe in one_f["FE"]:
                    voc_arg.feed_one(one_fe["name"], c=0)
        zlog(f"After adding frames from {dconf.dict_frame_file}, evt={voc_evt}, arg={voc_arg}")
    # -----
    # deal with pre-trained word embeddings
    w2vec = None
    if dconf.pretrain_wv_file:
        # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
        # collect extra words and lemmas
        extra_word_counts = {}
        extra_lemma_counts = {}
        for sent in yield_sents(extra_stream):
            if sent.seq_word is not None:
                for w in sent.seq_word.vals:
                    extra_word_counts[w] = extra_word_counts.get(w, 0) + 1
            if sent.seq_lemma is not None:
                for w in sent.seq_lemma.vals:
                    extra_lemma_counts[w] = extra_lemma_counts.get(w, 0) + 1
        # load the pre-trained vectors (dconf.pretrain_wv_file must be provided)
        w2vec = WordVectors.load(dconf.pretrain_wv_file)
        # first filter according to thresholds: keep a word if it is frequent
        # and highly ranked enough, or if the pre-trained table covers it
        _filter_f = lambda ww, rank, val: (val >= dconf.word_fthres and rank <= dconf.word_rthres) or \
            w2vec.find_key(ww) is not None
        voc_word.build_filter(_filter_f)
        voc_lemma.build_filter(_filter_f)
        # then add extra ones (most frequent first, ties broken alphabetically)
        for w in sorted(extra_word_counts.keys(), key=lambda z: (-extra_word_counts[z], z)):
            if w2vec.find_key(w) is not None and (w not in voc_word):
                voc_word.feed_one(w)
        for w in sorted(extra_lemma_counts.keys(), key=lambda z: (-extra_lemma_counts[z], z)):
            if w2vec.find_key(w) is not None and (w not in voc_lemma):
                voc_lemma.feed_one(w)
        # by-product of filtered output pre-trained embeddings for later faster processing
        if dconf.pretrain_hits_outf:
            # find all keys again!!
            w2vec.clear_hits()
            for vv in [voc_word, voc_lemma]:
                for _idx in range(*(vv.non_special_range())):
                    w2vec.find_key(vv.idx2word(_idx))
            w2vec.save_hits(dconf.pretrain_hits_outf)
        # embeds
        word_embed1 = voc_word.filter_embed(w2vec, scale=dconf.pretrain_scale)
        lemma_embed1 = voc_lemma.filter_embed(w2vec, scale=dconf.pretrain_scale)
    else:
        voc_word.build_filter_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
        voc_lemma.build_filter_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
        word_embed1 = lemma_embed1 = None
    # return
    ret = ZsfpVocabPackage(voc_collections, {"word": word_embed1, "lemma": lemma_embed1}, dconf)
    return ret
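
# A toy restatement of the _filter_f predicate in build_from_stream above:
# a word survives the vocabulary filter if it is both frequent enough and
# highly ranked enough, OR if the pre-trained table covers it anyway. The
# thresholds and counts below are made up for illustration.
def _keep(freq, rank, in_pretrain, fthres=2, rthres=50000):
    return (freq >= fthres and rank <= rthres) or in_pretrain

assert _keep(freq=1000, rank=1, in_pretrain=False)       # frequent and high-ranked
assert _keep(freq=1, rank=99999, in_pretrain=True)       # rare, but pre-trained
assert not _keep(freq=1, rank=99999, in_pretrain=False)  # rare and uncovered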