Example #1
import os
import re
import traceback
def main(*args):
    conf = MainConf()
    conf.update_from_args(args)
    # --
    if conf.R.input_path:
        # stat mode
        # --
        reader = conf.R.get_reader()
        insts = list(reader)
        stat = StatRecorder()
        if len(insts) > 0:
            if isinstance(insts[0], Doc):
                stat_docs(insts, stat)
            else:
                stat_sents(insts, stat)
        # --
        key = conf.R.input_path
        res = {}
        for ss in [stat.plain_values, stat.special_values]:
            res.update(ss)
        show_res = [f"{kk}: {str(res[kk])}\n" for kk in sorted(res.keys())]
        zlog(f"# -- Stat Mode, Read from {key} and updating {conf.result_center}:\n{''.join(show_res)}")
        if conf.result_center:
            if os.path.isfile(conf.result_center):
                d0 = default_json_serializer.from_file(conf.result_center)
            else:
                d0 = {}
            d0[key] = res
            default_json_serializer.to_file(d0, conf.result_center)
    else:
        # query mode: query across datasets (key)
        data = default_json_serializer.from_file(conf.result_center)
        pattern = re.compile(conf.key_re_pattern)
        hit_keys = sorted(k for k in data.keys() if pattern.fullmatch(k))
        zlog(f"Query for {hit_keys}")
        # --
        while True:
            try:
                code = input(">> ")
            except EOFError:
                break
            except KeyboardInterrupt:
                continue
            code = code.strip()
            if len(code) == 0: continue
            # --
            zlog(f"Eval `{code}':")
            for k in hit_keys:
                d = data[k]  # bound here so the eval'd expression can refer to `d` (and `k`)
                try:
                    one_res = eval(code)
                except Exception:  # show the traceback instead of crashing the loop
                    one_res = traceback.format_exc()
                zlog(f"#--{k}:\n{one_res}")
Example #2
def _read_coreness_from_file(file: str):
    frame_map = default_json_serializer.from_file(file)
    cmap = {}  # FrameName -> {RoleName -> CoreType}
    for f, v in frame_map.items():
        assert f not in cmap, f"Err: repeated frame {f}"
        new_map = {}
        for fe in v["FE"]:
            role, core_type = fe["name"], fe["coreType"]
            if role in new_map:  # keep the first entry and only warn on duplicates
                zwarn(f"repeated frame-role {f}:{role}")
            else:
                new_map[role] = core_type
        cmap[f] = new_map
    return cmap
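A minimal sketch of the input this function expects, inferred from the field accesses above (the frame and role names are illustrative):
# frames.json:
#   {"Arriving": {"FE": [{"name": "Theme", "coreType": "Core"},
#                        {"name": "Time", "coreType": "Peripheral"}]}}
cmap = _read_coreness_from_file("frames.json")
# -> {"Arriving": {"Theme": "Core", "Time": "Peripheral"}}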
Example #3
def __init__(self, conf: RuleTargetExtractorConf):
    super().__init__(conf)
    conf: RuleTargetExtractorConf = self.conf
    # whitelist: (items are sorted by matching order)
    self.whitelist = {}  # key-feat => {'items': [{'left': [], 'right': [], 'count': int}], 'count': int}
    # blacklist: (check each rule for exclusion)
    self.blacklist: List[BlacklistRule] = []
    if conf.brule_semafor:
        self.blacklist.append(BlacklistRule_semafor())
    # preload
    if conf.preload_file:
        self.from_json(default_json_serializer.from_file(conf.preload_file))
        zlog(f"Load RuleTargetExtractor from {conf.preload_file}")
    # --
    # compile the callables from their config strings
    self.feat_tok_f = eval(conf.feature_tok_f)  # lambda Token: features
    self.wl_alter_f = eval(conf.wl_alter_f)  # alternative whitelist-lookup function, also from config
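Both callables are eval'ed from plain config strings; a hypothetical config might set them like this (the Token attributes here are assumptions about the API, for illustration only):
# feature_tok_f: "lambda t: t.lemma.lower()"   # map a Token to its matching feature
# wl_alter_f: "lambda t: [t.word.lower()]"     # alternative candidates for whitelist lookup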
Example #4
def load_progress(self, file: str, forward_stream=False):
    old_uidx = self.tp.uidx
    d = default_json_serializer.from_file(file)
    self.tp.from_json(d)
    if forward_stream:
        if old_uidx > self.tp.uidx:
            zwarn(f"Cannot go to the past: {old_uidx} -> {self.tp.uidx}, skip this!")
        else:
            # fast-forward the training stream so the data order stays aligned with the restored step count
            _s = self.train_stream
            for _ in range(self.tp.uidx - old_uidx):
                _, _eos = _s.next_and_check()
                if _eos:  # end of an epoch: restart and take one item
                    _s.restart()
                    _s.next()
            zlog(f"Forward to the future: {old_uidx} -> {self.tp.uidx}!", func="io")
    zlog(f"Load training progress from {file}", func="io")
    self.adjust_scheduled_values()  # also adjust the scheduled values!
Example #5
def build_from_stream(dconf: DConf, stream, extra_stream):
    zlog("Build vocabs from streams.")
    # collect them all here
    # -- basic inputs
    voc_word = SimpleVocab.build_empty("word")
    voc_lemma = SimpleVocab.build_empty("lemma")
    voc_upos = SimpleVocab.build_empty("upos")
    voc_char = SimpleVocab.build_empty("char")
    voc_deplab = SimpleVocab.build_empty("deplab")
    # -- frame ones
    voc_evt = SimpleVocab.build_empty("evt")
    voc_ef = SimpleVocab.build_empty("ef")
    voc_arg = SimpleVocab.build_empty("arg")
    voc_collections = {"word": voc_word, "lemma": voc_lemma, "upos": voc_upos, "char": voc_char,
                       "deplab": voc_deplab, "evt": voc_evt, "ef": voc_ef, "arg": voc_arg}
    # read all and build
    for sent in yield_sents(stream):
        # -- basic inputs
        if sent.seq_word is not None:
            voc_word.feed_iter(sent.seq_word.vals)
            for w in sent.seq_word.vals:
                voc_char.feed_iter(w)  # feed the chars of each word
        if sent.seq_lemma is not None:
            voc_lemma.feed_iter(sent.seq_lemma.vals)
        if sent.seq_upos is not None:
            voc_upos.feed_iter(sent.seq_upos.vals)
        if sent.tree_dep is not None and sent.tree_dep.seq_label is not None:
            voc_deplab.feed_iter(sent.tree_dep.seq_label.vals)
        # -- frames
        if sent.entity_fillers is not None:
            voc_ef.feed_iter((ef.type for ef in sent.entity_fillers))
        if sent.events is not None:
            voc_evt.feed_iter((evt.type for evt in sent.events))
            for evt in sent.events:
                if evt.args is not None:
                    voc_arg.feed_iter((arg.role for arg in evt.args))
    # sort everyone!
    for voc in voc_collections.values():
        voc.build_sort()
    # extra entries for evt/arg
    if dconf.dict_frame_file:
        frames = default_json_serializer.from_file(dconf.dict_frame_file)
        for one_f in frames.values():  # no count, simply feed!!
            if len(one_f["lexUnit"]) > 0:  # todo(+W): currently ignore non-lex frames
                voc_evt.feed_one(one_f["name"], c=0)
                for one_fe in one_f["FE"]:
                    voc_arg.feed_one(one_fe["name"], c=0)
        zlog(f"After adding frames from {dconf.dict_frame_file}, evt={voc_evt}, arg={voc_arg}")
    # -----
    # deal with pre-trained word embeddings
    w2vec = None
    if dconf.pretrain_wv_file:
        # todo(warn): for convenience, the extra vocab (usually dev&test) is only used for collecting pre-trained vecs
        # collect extra words and lemmas
        extra_word_counts = {}
        extra_lemma_counts = {}
        for sent in yield_sents(extra_stream):
            if sent.seq_word is not None:
                for w in sent.seq_word.vals:
                    extra_word_counts[w] = extra_word_counts.get(w, 0) + 1
            if sent.seq_lemma is not None:
                for w in sent.seq_lemma.vals:
                    extra_lemma_counts[w] = extra_lemma_counts.get(w, 0) + 1
        # load the pre-trained vectors (dconf.pretrain_wv_file must be provided)
        w2vec = WordVectors.load(dconf.pretrain_wv_file)
        # first filter by thresholds: keep an entry if it passes both count/rank thresholds or has a pre-trained vector
        _filter_f = lambda ww, rank, val: (val >= dconf.word_fthres and rank <= dconf.word_rthres) or \
                                          w2vec.find_key(ww) is not None
        voc_word.build_filter(_filter_f)
        voc_lemma.build_filter(_filter_f)
        # then add extra ones (most frequent first) that have pre-trained vectors
        for w in sorted(extra_word_counts.keys(), key=lambda z: (-extra_word_counts[z], z)):
            if w2vec.find_key(w) is not None and (w not in voc_word):
                voc_word.feed_one(w)
        for w in sorted(extra_lemma_counts.keys(), key=lambda z: (-extra_lemma_counts[z], z)):
            if w2vec.find_key(w) is not None and (w not in voc_lemma):
                voc_lemma.feed_one(w)
        # by-product: save the filtered pre-trained embeddings for faster processing later
        if dconf.pretrain_hits_outf:
            # find all keys again!!
            w2vec.clear_hits()
            for vv in [voc_word, voc_lemma]:
                for _idx in range(*(vv.non_special_range())):
                    w2vec.find_key(vv.idx2word(_idx))
            w2vec.save_hits(dconf.pretrain_hits_outf)
        # embeds
        word_embed1 = voc_word.filter_embed(w2vec, scale=dconf.pretrain_scale)
        lemma_embed1 = voc_lemma.filter_embed(w2vec, scale=dconf.pretrain_scale)
    else:
        voc_word.build_filter_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
        voc_lemma.build_filter_thresh(rthres=dconf.word_rthres, fthres=dconf.word_fthres)
        word_embed1 = lemma_embed1 = None
    # return
    ret = ZsfpVocabPackage(voc_collections, {"word": word_embed1, "lemma": lemma_embed1}, dconf)
    return ret
Example #6
@classmethod
def read_from_file(cls, fname: str):
    d = default_json_serializer.from_file(fname)
    return cls.cls_from_json(d)
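Usage is then a one-liner on any class that implements cls_from_json (the class name here is hypothetical):
obj = SomeJsonSerializable.read_from_file("model.json")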
Example #7
def load_progress(self, file: str, forward_stream=False):
    d = default_json_serializer.from_file(file)
    self.tp.from_json(d)
    assert not forward_stream, "Error: 'forward_stream' not supported in this mode!!"
    zlog(f"Load training progress from {file}", func="io")
    self.adjust_scheduled_values()  # also adjust values!
Example #8
def load(self, f):
    conf, cmap = default_json_serializer.from_file(f)  # the file holds a two-element array: [conf, cmap]
    self.conf.from_json(conf)
    self.cmap = cmap
    self.reinit()  # rebuild internal state from the newly loaded conf/cmap
    zlog(f"Load {self.__class__.__name__} from {f}")
Example #9
def main(*args):
    conf: MainConf = init_everything(MainConf(), args)
    # --
    cons_lex = LexConstrainer(conf.lex_conf)
    cons_fe = FEConstrainer(conf.fe_conf)
    # --
    # some confs
    lex_use_fn_style = conf.lex_conf.use_fn_style
    # --
    # first try to read frame file
    if conf.frame_file:
        assert lex_use_fn_style, "Otherwise do not provide 'frame_file'!!"
        external_frames = default_json_serializer.from_file(conf.frame_file)
        for fname, fv in external_frames.items():
            # LU
            for lu in fv["lexUnit"]:
                lu_name = lu["name"]
                cons_lex.add(cons_lex.lu2feat(lu_name), fname, c=0)  # no count now, only add the entry
                lu_name2 = LexConstrainer.norm_lu(lu_name)
                if lu_name2 != lu_name:  # also add normed name!
                    cons_lex.add(cons_lex.lu2feat(lu_name2), fname, c=0)
            # FE
            for fe in fv["FE"]:
                fe_name = fe["name"]
                cons_fe.add(fname, fe_name, c=0)  # again no count here!
        zlog(f"Read from {conf.frame_file}: LU={cons_lex.summary()}, FE={cons_fe.summary()}")
    # --
    # then read data!
    if conf.train.input_path:
        reader = conf.train.get_reader()
        for sent in yield_sents(reader):
            for frame in sent.get_frames(conf.lex_conf.cons_ftag):
                frame_name = frame.type
                # LU
                feats = []
                if lex_use_fn_style:  # then directly use the stored one!!
                    lu_name = frame.info.get("luName")
                    feats.append(cons_lex.lu2feat(lu_name))
                    lu_name2 = LexConstrainer.norm_lu(lu_name)
                    if lu_name2 != lu_name:
                        feats.append(cons_lex.lu2feat(lu_name2))
                # also add the plain one!!
                widx, wlen = frame.mention.get_span()
                feat = cons_lex.span2feat(frame.sent, widx, wlen)
                feats.append(feat)
                # --
                for feat in feats:
                    cons_lex.add(feat, frame_name, c=1)
                # FE
                for alink in frame.args:
                    cons_fe.add(frame_name, alink.role, c=1)
        zlog(f"Read from {conf.train.input_path}: LU={cons_lex.summary()}, FE={cons_fe.summary()}")
    # --
    # summary and save
    cons_lex.save(conf.lex_save_name)
    cons_fe.save(conf.fe_save_name)