Example #1
 def aug_word2_vocab(self, stream, extra_stream, extra_embed_file: str):
     zlog(
         f"Aug another word vocab from streams and extra_embed_file={extra_embed_file}"
     )
     word_builder = VocabBuilder("word2")
     for inst in stream:
         word_builder.feed_stream(inst.word_seq.vals)
     # embeddings
     if len(extra_embed_file) > 0:
         extra_word_set = set(w for inst in extra_stream
                              for w in inst.word_seq.vals)
         w2vec = WordVectors.load(extra_embed_file)
         for w in extra_word_set:
             if w2vec.has_key(w) and (
                     not word_builder.has_key_currently(w)):
                 word_builder.feed_one(w)
         word_vocab = word_builder.finish()  # no filtering!!
         word_embed1 = word_vocab.filter_embed(w2vec,
                                               init_nohit=1.0,
                                               scale=1.0)
     else:
         zwarn("WARNING: No pretrain file for aug node!!")
         word_vocab = word_builder.finish()  # no filtering!!
         word_embed1 = None
     self.put_voc("word2", word_vocab)
     self.put_emb("word2", word_embed1)
Example #2
 def adjust_at_ckp(self, sname, cur_idxes):
     the_idx = cur_idxes[self.sv_conf.which_idx]
     old_val, new_val = self._set(the_idx)
     if self.cur_val != old_val:
         utils.zlog("Change scheduled value %s at %s: %s => %s." % (self.name, sname, old_val, self.cur_val))
     else:
         utils.zlog("Keep scheduled value %s at %s as %s." % (self.name, sname, self.cur_val))
Example #3
 def inference_on_batch(self, insts: List[DocInstance], **kwargs):
     self.refresh_batch(False)
     # -----
     if len(insts) == 0:
         return {}
     # -----
     # todo(note): first do shallow copy!
     for one_doc in insts:
         for one_sent in one_doc.sents:
             one_sent.pred_entity_fillers = [
                 z for z in one_sent.entity_fillers
             ]
             one_sent.pred_events = [
                 shallow_copy(z) for z in one_sent.events
             ]
     # -----
     ndoc, nsent = len(insts), 0
     iconf = self.conf.iconf
     with BK.no_grad_env():
         # splitting into buckets
         all_packs = self.bter.run(insts, training=False)
         for one_pack in all_packs:
             ms_items, bert_expr, basic_expr = one_pack
             nsent += len(ms_items)
             self.predictor.predict(ms_items, bert_expr, basic_expr)
     info = {
         "doc": ndoc,
         "sent": nsent,
         "num_evt": sum(len(z.pred_events) for z in insts)
     }
     if iconf.decode_verbose:
         zlog(f"Decode one mini-batch: {info}")
     return info
Example #4
 def do_filter(self, insts_target: str, fcode: str) -> List:
     vs = self.vars
     _ff = compile(fcode, "", "eval")
     insts = self.get_and_check_type(insts_target, list)
     ret = [d for d in insts if eval(_ff)]
     zlog(f"Filter by {fcode}: from {len(insts)} to {len(ret)}, {len(ret)/(len(insts)+1e-7)}")
     return ret
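A usage note for the pattern above (the analyzer object and the stored variable name below are hypothetical; only the do_filter signature comes from the code): fcode is a plain Python expression that is compiled once and evaluated per element, with the current instance visible as the comprehension variable d.

    # hypothetical call: keep only instances whose word sequence is longer than 10 tokens
    long_insts = analyzer.do_filter("insts", "len(d.word_seq.vals) > 10")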
Example #5
 def end(self):
     # sorting by idx of reading
     self.insts.sort(key=lambda x: x.inst_idx)
     # todo(+1): write other output file
     # self._set_type(self.insts)  # todo(note): no need for this step
     if self.outf is not None:
         with zopen(self.outf, "w") as fd:
             data_writer = get_data_writer(fd, self.out_format)
             data_writer.write(self.insts)
     # evaluation
     evaler = MyIEEvaler(self.eval_conf)
     res = evaler.eval(self.insts, self.insts)
     # the criterion will be average of U/L-evt/arg; now using only labeled results
     # all_results = [res["event"][0], res["event"][1], res["argument"][0], res["argument"][1]]
     # all_results = [res["event"][1], res["argument"][1]]
     all_results = [res[z][1] for z in self.eval_conf.res_list]
     res["res"] = float(np.average([float(z) for z in all_results]))
     # make it loadable by json
     for k in ["event", "argument", "argument2", "entity_filler"]:
         res[k] = str(res.get(k))
     zlog("zzzzzevent: %s" % res["res"], func="result")
     # =====
     # clear pred ones for possible reusing
     for one_doc in self.insts:
         for one_sent in one_doc.sents:
             one_sent.pred_events.clear()
             one_sent.pred_entity_fillers.clear()
     return res
Example #6
 def __init__(self, pc: BK.ParamCollection, conf: MaskLMNodeConf,
              vpack: VocabPackage):
     super().__init__(pc, None, None)
     self.conf = conf
     # vocab and padder
     self.word_vocab = vpack.get_voc("word")
     self.padder = DataPadder(
         2, pad_vals=self.word_vocab.pad,
         mask_range=2)  # todo(note): <pad>-id is very large
     # models
     self.hid_layer = self.add_sub_node(
         "hid", Affine(pc, conf._input_dim, conf.hid_dim, act=conf.hid_act))
     self.pred_layer = self.add_sub_node(
         "pred",
         Affine(pc,
                conf.hid_dim,
                conf.max_pred_rank + 1,
                init_rop=NoDropRop()))
     if conf.init_pred_from_pretrain:
         npvec = vpack.get_emb("word")
         if npvec is None:
             zwarn(
                 "Pretrained vector not provided, skip init pred embeddings!!"
             )
         else:
             with BK.no_grad_env():
                 self.pred_layer.ws[0].copy_(
                     BK.input_real(npvec[:conf.max_pred_rank + 1].T))
             zlog(
                 f"Init pred embeddings from pretrained vectors (size={conf.max_pred_rank+1})."
             )
Example #7
 def disable_final_dropout(self):
     if len(self.layers) < 1:
         zwarn(
             "Cannot disable final dropout since this Enc layer is empty!!")
     else:
         # get the final one from sequential
         final_layer = self.layers[-1]
         while isinstance(final_layer, Sequential):
             final_layer = final_layer.ns_[-1] if len(
                 final_layer.ns_) else None
         # get final dropout node
         final_drop_node: Dropout = None
         if isinstance(final_layer, RnnLayerBatchFirstWrapper):
             final_drop_nodes = final_layer.rnn_node.drop_nodes
             if final_drop_nodes is not None and len(final_drop_nodes) > 0:
                 final_drop_node = final_drop_nodes[-1]
         elif isinstance(final_layer, CnnLayer):
             final_drop_node = final_layer.drop_node
         elif isinstance(final_layer, TransformerEncoder):
             pass  # todo(note): final is LayerNorm?
         if final_drop_node is None:
             zwarn(
                 f"Failed at disabling final enc-layer dropout: type={type(final_layer)}: {final_layer}"
             )
         else:
             final_drop_node.rop.add_fixed_value("hdrop", 0.)
             zlog(
                 f"Ok at disabling final enc-layer dropout: type={type(final_layer)}: {final_layer}"
             )
Example #8
def init_everything(args, ConfType=None):
    # search for basic confs
    all_argv, basic_argv = Conf.search_args(args, ["model_type", "conf_output", "log_file", "msp_seed"],
                                            [str, str, str, int], [None, None, Logger.MAGIC_CODE, None])
    # for basic argvs
    model_type = basic_argv["model_type"]
    conf_output = basic_argv["conf_output"]
    log_file = basic_argv["log_file"]
    msp_seed = basic_argv["msp_seed"]
    if conf_output:
        with zopen(conf_output, "w") as fd:
            for k,v in all_argv.items():
                # todo(note): do not save this one
                if k != "conf_output":
                    fd.write(f"{k}:{v}\n")
    utils.init(log_file, msp_seed)
    # real init of the conf
    if model_type is None:
        utils.zlog("Using the default model type = simple!")
        model_type = "simple"
    if ConfType is None:
        conf = OverallConf(model_type, args)
    else:
        conf = ConfType(model_type, args)
    nn.init(conf.niconf)
    return conf
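The conf_output file written above is plain text with one key:value pair per line (the conf_output key itself is skipped). Below is a minimal sketch of reading such a file back into a dict; load_conf_output is a hypothetical helper, not part of the original code.

    def load_conf_output(path):
        # parse the "k:v" lines written by init_everything above
        argv = {}
        with open(path) as fd:
            for line in fd:
                line = line.rstrip("\n")
                if line:
                    k, v = line.split(":", 1)
                    argv[k] = v
        return argv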
Example #9
 def _next(self):
     if not self.is_active():
         return None  # make sure requiring restart!
     # first go to the one with budget!!
     cur_ptr = self.current_ptr
     cur_budget = self.current_budget
     while cur_budget <= 0. or next(self.random_sampler) >= cur_budget:
         cur_ptr = (cur_ptr + 1) % len(self.base_streams)
         cur_budget = float(self.budgets[cur_ptr])
     # find it
     self.current_ptr = cur_ptr
     self.current_budget = cur_budget - 1  # spend one for the current step (or keep decreasing for an empty streamer)
     # get one
     cur_streamer = self.base_streams[cur_ptr]
     read_times = 0
     while read_times <= 2:  # avoid loop with empty streamer
         one = cur_streamer.next()
         read_times += 1
         if cur_streamer.is_eos(one):
             if cur_ptr == self.stop_sidx:  # actually restart
                 zlog(
                     f"From the multi-streamer, this epoch stats: {self.stats}"
                 )
                 self.stats = [0 for _ in self.stats]
                 return None  # stop here (need restart)
             else:
                 cur_streamer.restart()  # restart right now!
         else:
             self.stats[cur_ptr] += 1
             return one
     # getting here means we looped over empty streamers, so tail-call the function again
     return self._next()
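One way to read the budget loop above: the walk continues while cur_budget <= 0. or the random draw is at least cur_budget, so a stream with one or more units of remaining budget is always accepted, while a fractional remainder acts as an acceptance probability. A minimal self-contained sketch of just that selection rule (names are hypothetical, random.random() stands in for self.random_sampler, and at least one positive budget is assumed):

    import random

    def pick_stream(budgets, cur_ptr, cur_budget):
        # advance the pointer until some stream passes the budget check (same rule as in _next)
        while cur_budget <= 0. or random.random() >= cur_budget:
            cur_ptr = (cur_ptr + 1) % len(budgets)
            cur_budget = float(budgets[cur_ptr])
        return cur_ptr, cur_budget - 1  # one unit of budget is spent for this draw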
Example #10
 def write_txt(self, fname):
     with zopen(fname, "w") as fd:
         for pack in self.yield_infos():
             i, w, count, perc, accu_count, accu_perc = pack
             ss = f"{i} {w} {count}({perc:.3f}) {accu_count}({accu_perc:.3f})\n"
             fd.write(ss)
     zlog("Write (txt) to %s: %s" % (fname, str(self)))
Example #11
 def check_budgets(self):
     POINTS = [0., 0.5, 0.75, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]
     for k in sorted(self.argbu_counts.keys()):
         sorted_counts = self.argbu_counts[k]
         sorted_counts_size = len(sorted_counts)
         piece_counts = [(p, sorted_counts[min(sorted_counts_size-1, int(sorted_counts_size*p))]) for p in POINTS]
         zlog(f"k={k}: {piece_counts}")
Example #12
 def do_cal_pd(self, inst_pd: str, scode: str):
     vs = self.vars
     _ff = compile(scode, "", "eval")
     d = self.get_and_check_type(inst_pd, pd.DataFrame)
     #
     ret = eval(_ff)
     zlog(f"Calculation on pd.DataFrame by {scode}, and get another one as: {str(ret)}")
     return ret
Example #13
 def process(self, args: List[str]):
     assert len(args) > 0, "Empty command"
     cmd_name, real_args = args[0], args[1:]
     # similar to the cmd package
     method_name = "do_" + cmd_name
     assert hasattr(self, method_name), f"Unknown command {cmd_name}"
     zlog(f"Performing command: {args}")
     return getattr(self, method_name)(*real_args)
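For reference, the dispatch above simply maps the first token to a do_* method, so a call like the following (the object and the argument strings are hypothetical) ends up invoking self.do_sort("insts", "len(d.word_seq.vals)"):

    analyzer.process(["sort", "insts", "len(d.word_seq.vals)"])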
Example #14
 def do_join(self, insts_target: str, jcode: str) -> List:
     vs = self.vars
     _ff = compile(jcode, "", "eval")
     insts = self.get_and_check_type(insts_target, list)
     ret = [eval(_ff) for d in insts]
     ret = Helper.join_list(ret)
     zlog(f"Join-list by {jcode}: from {len(insts)} to {len(ret)}")
     return ret
Example #15
 def build_by_reading(prefix):
     zlog("Load vocabs from files.")
     possible_vocabs = ["word", "char", "pos", "deplabel", "ner", "word2"]
     one = MLMVocabPackage({n: None for n in possible_vocabs},
                           {n: None for n in possible_vocabs})
     one.load(prefix=prefix)
     return one
Example #16
 def set_var(self, target, v, explanation=None, history_idx=None):
     if hasattr(self.vars, target):
         zlog(f"Overwriting the existing var `{target}`")
     if target not in self.traces:
         self.traces[target] = []
     # (explanation, history-idx)
     history_idx = len(self.history) if history_idx is None else history_idx
     self.traces[target].append((explanation, history_idx))  # the current cmd will be recorded into history
     setattr(self.vars, target, v)
Example #17
 def build_by_reading(dconf):
     zlog("Load vocabs from files.")
     possible_vocabs = ["word", "char", "pos", "label"]
     one = ParserVocabPackage({n: None for n in possible_vocabs},
                              {n: None for n in possible_vocabs}, dconf)
     one.load(dconf.dict_dir)
     return one
Example #18
 def do_sort(self, insts_target: str, kcode: str) -> List:
     vs = self.vars
     _ff = compile(kcode, "", "eval")
     insts = self.get_and_check_type(insts_target, list)
     tmp_tuples = [(d, eval(_ff)) for d in insts]
     tmp_tuples.sort(key=lambda x: x[1])
     ret = [x[0] for x in tmp_tuples]
     zlog(f"Sort by key={kcode}: len = {len(ret)}")
     return ret
Example #19
 def do_get_pd(self, insts_target: str, gcode: str) -> pd.DataFrame:
     vs = self.vars
     _ff = compile(gcode, "", "eval")
     insts = self.get_and_check_type(insts_target, list)
     #
     fields = [eval(_ff) for d in insts]
     ret = pd.DataFrame(fields)
     zlog(f"Group {len(insts)} instances by {gcode} to pd.DataFrame shape={ret.shape}.")
     return ret
Example #20
 def __init__(self, conf: KeyWordConf, num_doc=0, w2f=None, w2d=None):
     self.conf = conf
     # -----
     self.num_doc = num_doc
     self.w2f = {} if w2f is None else w2f  # word -> freq
     self.w2d = {} if w2d is None else w2d  # word -> num-doc (dfs)
     # -----
     self._calc()
     zlog(f"Create (or Load) {self}")
Example #21
 def do_ann_start(self, insts_target: str) -> AnnotationTask:
     assert self.cur_cmd_target is not None, "Should assign this to a var to avoid accidental loss!"
     vs = self.vars
     insts = self.get_and_check_type(insts_target, list)
     new_task = PerrAnnotationTask(insts)
     # todo(note): do we need auto-save here?
     if self.cur_ann_task is not None and self.cur_ann_task.remaining>0:
         zlog("Warn: the previous annotation task has not been finished yet!")
     self.cur_ann_task = new_task
     return new_task
Example #22
 def set_focus(self, idx=None, offset=0):
     if idx is not None:
         new_focus = idx
     else:
         new_focus = self.focus + offset
     if 0 <= new_focus < self.length:
         self.focus = new_focus
     else:
         zlog(f"Focus setting failed: at the boundary: {self.focus}, {self.length}")
     return self.focus
Example #23
 def __init__(self, expand_span_ext_file, **kwargs):
     self.tables = {}
     with open(expand_span_ext_file) as fd:
         for line in fd:
             data = json.loads(line)
             key, head, start, end = data["key"], data["head"], data["start"], data["end"]
             assert key not in self.tables
             self.tables[key] = (head, start, end)
     zlog(f"Read from {expand_span_ext_file}, {len(self.tables)} entries")
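Each line of expand_span_ext_file is expected to be a standalone JSON object carrying at least the four fields read above; an illustrative line (all values made up, with the types only guessed from the field names) might look like:

    {"key": "doc1_sent2_tok5", "head": 5, "start": 3, "end": 7}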
Example #24
 def inference_on_batch(self, insts: List[GeneralSentence], **kwargs):
     conf = self.conf
     self.refresh_batch(False)
     with BK.no_grad_env():
         # special mode
         # use: CUDA_VISIBLE_DEVICES=3 PYTHONPATH=../../src/ python3 -m pdb ../../src/tasks/cmd.py zmlm.main.test ${RUN_DIR}/_conf device:0 dict_dir:${RUN_DIR}/ model_load_name:${RUN_DIR}/zmodel.best test:./_en.debug test_interactive:1
         if conf.test_interactive:
             iinput_sent = input(">> (Interactive testing) Input sent sep by blanks: ")
             iinput_tokens = iinput_sent.split()
             if len(iinput_tokens) > 0:  # check the tokenized input so whitespace-only lines are skipped
                 iinput_inst = GeneralSentence.create(iinput_tokens)
                 iinput_inst.word_seq.set_idxes([self.word_vocab.get_else_unk(w) for w in iinput_inst.word_seq.vals])
                 iinput_inst.char_seq.build_idxes(self.inputter.vpack.get_voc("char"))
                 iinput_map = self.inputter([iinput_inst])
                 iinput_erase_mask = np.asarray([[z=="Z" for z in iinput_tokens]]).astype(dtype=np.float32)
                 iinput_masked_map = self.inputter.mask_input(iinput_map, iinput_erase_mask, {"pos"})  # one-element set of field names
                 emb_t, mask_t, enc_t, cache, enc_loss = self._emb_and_enc(iinput_masked_map, collect_loss=False, insts=[iinput_inst])
                 mlm_loss = self.masklm.loss(enc_t, iinput_erase_mask, iinput_map)
                 dpar_input_attn = self.prepr_f(cache, self._get_rel_dist(BK.get_shape(mask_t, -1)))
                 self.dpar.predict([iinput_inst], enc_t, dpar_input_attn, mask_t)
                 self.upos.predict([iinput_inst], enc_t, mask_t)
                 # print them
                 import pandas as pd
                 cur_fields = {
                     "idxes": list(range(1, len(iinput_inst)+1)),
                     "word": iinput_inst.word_seq.vals, "pos": iinput_inst.pred_pos_seq.vals,
                     "head": iinput_inst.pred_dep_tree.heads[1:], "dlab": iinput_inst.pred_dep_tree.labels[1:]}
                 zlog(f"Result:\n{pd.DataFrame(cur_fields).to_string()}")
             return {}  # simply return here for interactive mode
         # -----
         # test for MLM simply as in training (use special separate rand_gen to keep the masks the same for testing)
         # todo(+2): do we need to keep testing/validating during training the same? Currently not!
         info = self.fb_on_batch(insts, training=False, rand_gen=self.testing_rand_gen, assign_attns=conf.testing_get_attns)
         # -----
         if len(insts) == 0:
             return info
         # decode for dpar
         input_map = self.inputter(insts)
         emb_t, mask_t, enc_t, cache, _ = self._emb_and_enc(input_map, collect_loss=False, insts=insts)
         dpar_input_attn = self.prepr_f(cache, self._get_rel_dist(BK.get_shape(mask_t, -1)))
         self.dpar.predict(insts, enc_t, dpar_input_attn, mask_t)
         self.upos.predict(insts, enc_t, mask_t)
         if self.ner is not None:
             self.ner.predict(insts, enc_t, mask_t)
         # -----
         if conf.testing_get_attns:
             if conf.enc_choice == "vrec":
                 self._assign_attns_item(insts, "orig", cache=cache)
             elif conf.enc_choice in ["original"]:
                 pass
             else:
                 raise NotImplementedError()
         return info
Example #25
 def set_eff_max_layer(self, eff_max_layer=None):
     if eff_max_layer is None:
         return self.eff_max_layer  # simply query
     if eff_max_layer < 0:
         eff_max_layer = self.max_layer + 1 + eff_max_layer
     assert 0 < eff_max_layer <= self.max_layer, f"Err: layer out of range {eff_max_layer}"
     if self.eff_max_layer != eff_max_layer:
         zlog(
             f"Set current layer from {self.eff_max_layer} -> {eff_max_layer}!"
         )
         self.eff_max_layer = eff_max_layer
     return self.eff_max_layer
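Note the negative-index convention above: eff_max_layer = self.max_layer + 1 + eff_max_layer, so -1 selects max_layer, -2 selects max_layer - 1, and so on, while calling with no argument only queries the current value. For instance, on a hypothetical node with max_layer == 8:

    node.set_eff_max_layer(-1)  # -> 8, i.e. use all layers
    node.set_eff_max_layer(-2)  # -> 7
    node.set_eff_max_layer()    # -> query only, returns the current setting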
Example #26
 def inference_on_batch(self, insts: List[DocInstance], **kwargs):
     self.refresh_batch(False)
     # -----
     if len(insts) == 0:
         return {}
     # -----
     ndoc, nsent = len(insts), 0
     iconf = self.conf.iconf
     # =====
     # get tmp ms_items for each event
     input_ms_items = self._insts2msitems(insts)
     # -----
     if len(input_ms_items) == 0:
         return {}
     # -----
     with BK.no_grad_env():
         # splitting into buckets
         all_packs = self.bter.run(input_ms_items, training=False)
         for one_pack in all_packs:
             ms_items, bert_expr, basic_expr = one_pack
             nsent += len(ms_items)
             # cands
             if iconf.lookup_ef:
                 self._lookup_efs(ms_items)
             else:
                 self.cand_extractor.predict(ms_items, bert_expr,
                                             basic_expr)
             # args
             if iconf.pred_arg:
                 self.arg_linker.predict(ms_items, bert_expr, basic_expr)
             # span
             if iconf.pred_span:
                 self.span_expander.predict(ms_items, bert_expr)
     # put back all predictions
     self._putback_preds(input_ms_items)
     # collect all stats
     num_ef, num_evt, num_arg = 0, 0, 0
     for one_doc in insts:
         for one_sent in one_doc.sents:
             num_ef += len(one_sent.pred_entity_fillers)
             num_evt += len(one_sent.pred_events)
             num_arg += sum(len(z.links) for z in one_sent.pred_events)
     info = {
         "doc": ndoc,
         "sent": nsent,
         "num_ef": num_ef,
         "num_evt": num_evt,
         "num_arg": num_arg
     }
     if iconf.decode_verbose:
         zlog(f"Decode one mini-batch: {info}")
     return info
Example #27
 def special_pretrain_load(m: BaseParser, path, strict):
     if FileHelper.isfile(path):
         try:
             zlog(f"Trying to load pretrained model from {path}")
             m.load(path, strict)
             zlog(f"Finished loading pretrained model from {path}")
             return True
         except Exception:
             zlog(traceback.format_exc())
             zlog("Failed loading, keep the original ones.")
     else:
         zlog(f"File does not exist for pretraining loading: {path}")
     return False
Example #28
def init(cc=None):
    # set up common cc (cannot be changed later!!)
    if cc is None:
        pass
    elif isinstance(cc, NIConf):
        zlog("Updating NIConf with device=%s." % cc.device, func="config")
        COMMON_CONFIG.update_from_conf(cc)
    elif isinstance(cc, dict):
        COMMON_CONFIG.update_from_v(cc)
    else:
        COMMON_CONFIG.update_from_s(cc)
    #
    BK.init()
Example #29
def main(args):
    conf: OverallConf = init_everything(args)
    dconf = conf.dconf
    #
    bmodel = get_berter(dconf.bconf)
    for one_input, one_output in zip(
        [dconf.train, dconf.dev, dconf.test],
        [dconf.aux_repr_train, dconf.aux_repr_dev, dconf.aux_repr_test]):
        zlog(f"Read from {one_input} and write to {one_output}")
        num_doc, num_sent = 0, 0
        if one_input and one_output:
            one_streamer = get_data_reader(one_input, dconf.input_format,
                                           dconf.use_label0, dconf.noef_link0,
                                           None)
            bertaug_streamer = BerterDataAuger(one_streamer, bmodel,
                                               "aux_repr")
            with zopen(one_output, 'wb') as fd:
                for one_doc in bertaug_streamer:
                    PickleRW.save_list(
                        [s.extra_features["aux_repr"] for s in one_doc.sents],
                        fd)
                    num_doc += 1
                    num_sent += len(one_doc.sents)
            zlog(f"Finish with doc={num_doc}, sent={num_sent}")
        else:
            zlog("Skip empty files")
    zlog("Finish all.")
Example #30
 def end(self):
     # sorting by idx of reading
     self.insts.sort(key=lambda x: x.inst_idx)
     # todo(+1): write other output file
     if self.outf is not None:
         with zopen(self.outf, "w") as fd:
             data_writer = get_data_writer(fd, self.out_format)
             data_writer.write(self.insts)
     #
     evaler = ParserEvaler()
     # evaler2 = ParserEvaler(ignore_punct=True, punct_set={"PUNCT", "SYM"})
     eval_arg_names = [
         "poses", "heads", "labels", "pred_poses", "pred_heads",
         "pred_labels"
     ]
     for one_inst in self.insts:
         # todo(warn): exclude the ROOT symbol; the model should assign pred_*
         real_values = one_inst.get_real_values_select(eval_arg_names)
         evaler.eval_one(*real_values)
         # evaler2.eval_one(*real_values)
     report_str, res = evaler.summary()
     # _, res2 = evaler2.summary()
     #
     zlog("Results of %s vs. %s" % (self.outf, self.goldf), func="result")
     zlog(report_str, func="result")
     res["gold"] = self.goldf  # record which file
     # res2["gold"] = self.goldf            # record which file
     zlog("zzzzztest: testing result is " + str(res))
     # zlog("zzzzztest2: testing result is " + str(res2))
     zlog("zzzzzpar: %s" % res["res"], func="result")
     return res