def main(*args):
    conf = MainConf()
    conf.update_from_args(args)
    # --
    if conf.R.input_path:  # stat mode
        # --
        reader = conf.R.get_reader()
        insts = list(reader)
        stat = StatRecorder()
        if len(insts) > 0:
            if isinstance(insts[0], Doc):
                stat_docs(insts, stat)
            else:
                stat_sents(insts, stat)
        # --
        key = conf.R.input_path
        res = {}
        for ss in [stat.plain_values, stat.special_values]:
            res.update(ss)
        show_res = [f"{kk}: {str(res[kk])}\n" for kk in sorted(res.keys())]
        zlog(f"# -- Stat Mode, Read from {key} and updating {conf.result_center}:\n{''.join(show_res)}")
        if conf.result_center:
            if os.path.isfile(conf.result_center):
                d0 = default_json_serializer.from_file(conf.result_center)
            else:
                d0 = {}
            d0[key] = res
            default_json_serializer.to_file(d0, conf.result_center)
        # breakpoint()
    else:  # query mode: query across datasets (key)
        data = default_json_serializer.from_file(conf.result_center)
        pattern = re.compile(conf.key_re_pattern)
        hit_keys = sorted(k for k in data.keys() if re.fullmatch(pattern, k))
        zlog(f"Query for {hit_keys}")
        # breakpoint()
        # --
        while True:
            try:
                code = input(">> ")
            except EOFError:
                break
            except KeyboardInterrupt:
                continue
            code = code.strip()
            if len(code) == 0:
                continue
            # --
            zlog(f"Eval `{code}`:")
            for k in hit_keys:
                d = data[k]  # `d` is visible to the eval'ed expression below
                try:
                    one_res = eval(code)
                except Exception:
                    one_res = traceback.format_exc()
                zlog(f"#--{k}:\n{one_res}")
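# Usage sketch for query mode: each input line is eval'ed as a Python expression with `d`
# bound to the per-dataset result dict loaded above. The stat key names here are hypothetical.
#   >> len(d)
#   >> d.get('sent')
#   >> [kk for kk in d if 'arg' in kk]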
def _read_coreness_from_file(file: str):
    frame_map = default_json_serializer.from_file(file)
    cmap = {}  # FrameName -> {RoleName -> CoreType}
    for f, v in frame_map.items():
        assert f not in cmap, f"Err: repeated frame {f}"
        new_map = {}
        for fe in v["FE"]:
            role, core_type = fe["name"], fe["coreType"]
            # assert role not in new_map, f"Err: repeated frame-role {f}:{role}"
            if role in new_map:  # skip this one!
                zwarn(f"repeated frame-role {f}:{role}")
            else:
                new_map[role] = core_type
        cmap[f] = new_map
    return cmap
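# A minimal sketch of the frame-JSON shape `_read_coreness_from_file` expects; field names
# are taken from the parsing code above, while the frame/role names are only illustrative.
_EXAMPLE_FRAME_MAP = {
    "Arriving": {
        "FE": [
            {"name": "Theme", "coreType": "Core"},
            {"name": "Time", "coreType": "Peripheral"},
        ],
    },
}
# cmap = _read_coreness_from_file("frames.json")
# -> {"Arriving": {"Theme": "Core", "Time": "Peripheral"}}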
def __init__(self, conf: RuleTargetExtractorConf):
    super().__init__(conf)
    conf: RuleTargetExtractorConf = self.conf
    # whitelist: (items are sorted by matching order)
    self.whitelist = {}  # key-feat => {'items': [{'left': [], 'right': [], 'count': int}], 'count': int}
    # blacklist: (check each rule for exclusion)
    self.blacklist: List[BlacklistRule] = []
    if conf.brule_semafor:
        self.blacklist.append(BlacklistRule_semafor())
    # preload
    if conf.preload_file:
        self.from_json(default_json_serializer.from_file(conf.preload_file))
        zlog(f"Load RuleTargetExtractor from {conf.preload_file}")
    # --
    # compile
    self.feat_tok_f = eval(conf.feature_tok_f)  # lambda Token: features
    self.wl_alter_f = eval(conf.wl_alter_f)
def load_progress(self, file: str, forward_stream=False):
    old_uidx = self.tp.uidx
    d = default_json_serializer.from_file(file)
    self.tp.from_json(d)
    if forward_stream:
        if old_uidx > self.tp.uidx:
            zwarn(f"Cannot go to the past: {old_uidx} -> {self.tp.uidx}, skip this!")
        else:
            _s = self.train_stream
            for _ in range(self.tp.uidx - old_uidx):
                _, _eos = _s.next_and_check()
                if _eos:  # restart and get one
                    _s.restart()
                    _s.next()
            zlog(f"Forward to the future: {old_uidx} -> {self.tp.uidx}!", func="io")
    zlog(f"Load training progress from {file}", func="io")
    self.adjust_scheduled_values()  # also adjust values!
def build_from_stream(dconf: DConf, stream, extra_stream):
    zlog("Build vocabs from streams.")
    # here, collect them all
    # -- basic inputs
    voc_word = SimpleVocab.build_empty("word")
    voc_lemma = SimpleVocab.build_empty("lemma")
    voc_upos = SimpleVocab.build_empty("upos")
    voc_char = SimpleVocab.build_empty("char")
    voc_deplab = SimpleVocab.build_empty("deplab")
    # -- frame ones
    voc_evt, voc_ef, voc_arg = SimpleVocab.build_empty("evt"), SimpleVocab.build_empty("ef"), SimpleVocab.build_empty("arg")
    voc_collections = {"word": voc_word, "lemma": voc_lemma, "upos": voc_upos, "char": voc_char, "deplab": voc_deplab,
                       "evt": voc_evt, "ef": voc_ef, "arg": voc_arg}
    # read all and build
    for sent in yield_sents(stream):
        # -- basic inputs
        if sent.seq_word is not None:
            voc_word.feed_iter(sent.seq_word.vals)
            for w in sent.seq_word.vals:
                voc_char.feed_iter(w)
        if sent.seq_lemma is not None:
            voc_lemma.feed_iter(sent.seq_lemma.vals)
        if sent.seq_upos is not None:
            voc_upos.feed_iter(sent.seq_upos.vals)
        if sent.tree_dep is not None and sent.tree_dep.seq_label is not None:
            voc_deplab.feed_iter(sent.tree_dep.seq_label.vals)
        # -- frames
        if sent.entity_fillers is not None:
            voc_ef.feed_iter((ef.type for ef in sent.entity_fillers))
        if sent.events is not None:
            voc_evt.feed_iter((evt.type for evt in sent.events))
            for evt in sent.events:
                if evt.args is not None:
                    voc_arg.feed_iter((arg.role for arg in evt.args))
    # sort everyone!
    for voc in voc_collections.values():
        voc.build_sort()
    # extra for evt/arg
    if dconf.dict_frame_file:
        frames = default_json_serializer.from_file(dconf.dict_frame_file)
        for one_f in frames.values():
            # no count, simply feed!!
            if len(one_f["lexUnit"]) > 0:  # todo(+W): currently ignore non-lex frames
                voc_evt.feed_one(one_f["name"], c=0)
                for one_fe in one_f["FE"]:
                    voc_arg.feed_one(one_fe["name"], c=0)
        zlog(f"After adding frames from {dconf.dict_frame_file}, evt={voc_evt}, arg={voc_arg}")
    # -----
    # deal with pre-trained word embeddings
    w2vec = None
    if dconf.pretrain_wv_file:
        # todo(warn): for convenience, extra vocab (usually dev&test) is only used for collecting pre-train vecs
        # collect extra words and lemmas
        extra_word_counts = {}
        extra_lemma_counts = {}
        for sent in yield_sents(extra_stream):
            if sent.seq_word is not None:
                for w in sent.seq_word.vals:
                    extra_word_counts[w] = extra_word_counts.get(w, 0) + 1
            if sent.seq_lemma is not None:
                for w in sent.seq_lemma.vals:
                    extra_lemma_counts[w] = extra_lemma_counts.get(w, 0) + 1
        # must provide dconf.pretrain_wv_file
        w2vec = WordVectors.load(dconf.pretrain_wv_file)
        # first filter according to thresholds
        _filter_f = lambda ww, rank, val: (val >= dconf.word_fthres and rank <= dconf.word_rthres) or \
            w2vec.find_key(ww) is not None
        voc_word.build_filter(_filter_f)
        voc_lemma.build_filter(_filter_f)
        # then add extra ones
        for w in sorted(extra_word_counts.keys(), key=lambda z: (-extra_word_counts[z], z)):
            if w2vec.find_key(w) is not None and (w not in voc_word):
                voc_word.feed_one(w)
        for w in sorted(extra_lemma_counts.keys(), key=lambda z: (-extra_lemma_counts[z], z)):
            if w2vec.find_key(w) is not None and (w not in voc_lemma):
                voc_lemma.feed_one(w)
        # by-product of filtered output pre-trained embeddings for later faster processing
        if dconf.pretrain_hits_outf:
            # find all keys again!!
            w2vec.clear_hits()
            for vv in [voc_word, voc_lemma]:
                for _idx in range(*(vv.non_special_range())):
                    w2vec.find_key(vv.idx2word(_idx))
            w2vec.save_hits(dconf.pretrain_hits_outf)
        # embeds
        word_embed1 = voc_word.filter_embed(w2vec, scale=dconf.pretrain_scale)
        lemma_embed1 = voc_lemma.filter_embed(w2vec, scale=dconf.pretrain_scale)
    else:
        voc_word.build_filter_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
        voc_lemma.build_filter_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
        word_embed1 = lemma_embed1 = None
    # return
    ret = ZsfpVocabPackage(voc_collections, {"word": word_embed1, "lemma": lemma_embed1}, dconf)
    return ret
def read_from_file(cls, fname: str):
    d = default_json_serializer.from_file(fname)
    x = cls.cls_from_json(d)
    return x
def load_progress(self, file: str, forward_stream=False):
    d = default_json_serializer.from_file(file)
    self.tp.from_json(d)
    assert not forward_stream, "Error: 'forward_stream' not supported in this mode!!"
    zlog(f"Load training progress from {file}", func="io")
    self.adjust_scheduled_values()  # also adjust values!
def load(self, f):
    conf, cmap = default_json_serializer.from_file(f)
    self.conf.from_json(conf)
    self.cmap = cmap
    self.reinit()
    zlog(f"Load {self.__class__.__name__} from {f}")
def main(*args):
    conf: MainConf = init_everything(MainConf(), args)
    # --
    cons_lex = LexConstrainer(conf.lex_conf)
    cons_fe = FEConstrainer(conf.fe_conf)
    # --
    # some confs
    lex_use_fn_style = conf.lex_conf.use_fn_style
    # --
    # first try to read frame file
    if conf.frame_file:
        assert lex_use_fn_style, "Otherwise do not provide 'frame_file'!!"
        external_frames = default_json_serializer.from_file(conf.frame_file)
        for fname, fv in external_frames.items():
            # LU
            for lu in fv["lexUnit"]:
                lu_name = lu["name"]
                cons_lex.add(cons_lex.lu2feat(lu_name), fname, c=0)  # no count now, only add entry
                lu_name2 = LexConstrainer.norm_lu(lu_name)
                if lu_name2 != lu_name:  # also add normed name!
                    cons_lex.add(cons_lex.lu2feat(lu_name2), fname, c=0)
            # FE
            for fe in fv["FE"]:
                fe_name = fe["name"]
                cons_fe.add(fname, fe_name, c=0)  # again no count here!
        zlog(f"Read from {conf.frame_file}: LU={cons_lex.summary()}, FE={cons_fe.summary()}")
    # --
    # then read data!
    if conf.train.input_path:
        reader = conf.train.get_reader()
        for sent in yield_sents(reader):
            for frame in sent.get_frames(conf.lex_conf.cons_ftag):
                frame_name = frame.type
                # LU
                feats = []
                if lex_use_fn_style:  # then directly use the stored one!!
                    lu_name = frame.info.get("luName")
                    feats.append(cons_lex.lu2feat(lu_name))
                    lu_name2 = LexConstrainer.norm_lu(lu_name)
                    if lu_name2 != lu_name:
                        feats.append(cons_lex.lu2feat(lu_name2))
                # also add the plain one!!
                widx, wlen = frame.mention.get_span()
                feat = cons_lex.span2feat(frame.sent, widx, wlen)
                feats.append(feat)
                # --
                for feat in feats:
                    cons_lex.add(feat, frame_name, c=1)
                # FE
                for alink in frame.args:
                    cons_fe.add(frame_name, alink.role, c=1)
        zlog(f"Read from {conf.train.input_path}: LU={cons_lex.summary()}, FE={cons_fe.summary()}")
    # --
    # summary and save
    cons_lex.save(conf.lex_save_name)
    cons_fe.save(conf.fe_save_name)