def aug_word2_vocab(self, stream, extra_stream, extra_embed_file: str):
    zlog(f"Aug another word vocab from streams and extra_embed_file={extra_embed_file}")
    word_builder = VocabBuilder("word2")
    for inst in stream:
        word_builder.feed_stream(inst.word_seq.vals)
    # embeddings
    if len(extra_embed_file) > 0:
        extra_word_set = set(w for inst in extra_stream for w in inst.word_seq.vals)
        w2vec = WordVectors.load(extra_embed_file)
        for w in extra_word_set:
            if w2vec.has_key(w) and (not word_builder.has_key_currently(w)):
                word_builder.feed_one(w)
        word_vocab = word_builder.finish()  # no filtering!!
        word_embed1 = word_vocab.filter_embed(w2vec, init_nohit=1.0, scale=1.0)
    else:
        zwarn("WARNING: No pretrain file for aug node!!")
        word_vocab = word_builder.finish()  # no filtering!!
        word_embed1 = None
    self.put_voc("word2", word_vocab)
    self.put_emb("word2", word_embed1)

def adjust_at_ckp(self, sname, cur_idxes):
    the_idx = cur_idxes[self.sv_conf.which_idx]
    old_val, new_val = self._set(the_idx)
    if self.cur_val != old_val:
        utils.zlog("Change scheduled value %s at %s: %s => %s." % (self.name, sname, old_val, self.cur_val))
    else:
        utils.zlog("Keep scheduled value %s at %s as %s." % (self.name, sname, self.cur_val))

def inference_on_batch(self, insts: List[DocInstance], **kwargs):
    self.refresh_batch(False)
    # -----
    if len(insts) == 0:
        return {}
    # -----
    # todo(note): first do shallow copy!
    for one_doc in insts:
        for one_sent in one_doc.sents:
            one_sent.pred_entity_fillers = [z for z in one_sent.entity_fillers]
            one_sent.pred_events = [shallow_copy(z) for z in one_sent.events]
    # -----
    ndoc, nsent = len(insts), 0
    iconf = self.conf.iconf
    with BK.no_grad_env():
        # splitting into buckets
        all_packs = self.bter.run(insts, training=False)
        for one_pack in all_packs:
            ms_items, bert_expr, basic_expr = one_pack
            nsent += len(ms_items)
            self.predictor.predict(ms_items, bert_expr, basic_expr)
    info = {"doc": ndoc, "sent": nsent, "num_evt": sum(len(z.pred_events) for z in insts)}
    if iconf.decode_verbose:
        zlog(f"Decode one mini-batch: {info}")
    return info

def do_filter(self, insts_target: str, fcode: str) -> List:
    vs = self.vars
    _ff = compile(fcode, "", "eval")
    insts = self.get_and_check_type(insts_target, list)
    ret = [d for d in insts if eval(_ff)]
    zlog(f"Filter by {fcode}: from {len(insts)} to {len(ret)}, {len(ret)/(len(insts)+1e-7)}")
    return ret

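# Usage sketch (hypothetical, not from the source): `fcode` is evaluated once per
# instance with the comprehension variable `d` bound to that instance and `vs`
# exposing the stored analyzer variables, e.g.
#   kept = analyzer.do_filter("insts", "len(d.word_seq.vals) > 5")
# would keep only instances longer than 5 tokens; the `analyzer` object, the
# "insts" variable name and the `word_seq.vals` attribute are assumptions here.
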
def end(self):
    # sorting by idx of reading
    self.insts.sort(key=lambda x: x.inst_idx)
    # todo(+1): write other output file
    # self._set_type(self.insts)  # todo(note): no need for this step
    if self.outf is not None:
        with zopen(self.outf, "w") as fd:
            data_writer = get_data_writer(fd, self.out_format)
            data_writer.write(self.insts)
    # evaluation
    evaler = MyIEEvaler(self.eval_conf)
    res = evaler.eval(self.insts, self.insts)
    # the criterion will be average of U/L-evt/arg; now using only labeled results
    # all_results = [res["event"][0], res["event"][1], res["argument"][0], res["argument"][1]]
    # all_results = [res["event"][1], res["argument"][1]]
    all_results = [res[z][1] for z in self.eval_conf.res_list]
    res["res"] = float(np.average([float(z) for z in all_results]))
    # make it loadable by json
    for k in ["event", "argument", "argument2", "entity_filler"]:
        res[k] = str(res.get(k))
    zlog("zzzzzevent: %s" % res["res"], func="result")
    # =====
    # clear pred ones for possible reusing
    for one_doc in self.insts:
        for one_sent in one_doc.sents:
            one_sent.pred_events.clear()
            one_sent.pred_entity_fillers.clear()
    return res

def __init__(self, pc: BK.ParamCollection, conf: MaskLMNodeConf, vpack: VocabPackage):
    super().__init__(pc, None, None)
    self.conf = conf
    # vocab and padder
    self.word_vocab = vpack.get_voc("word")
    self.padder = DataPadder(2, pad_vals=self.word_vocab.pad, mask_range=2)  # todo(note): <pad>-id is very large
    # models
    self.hid_layer = self.add_sub_node("hid", Affine(pc, conf._input_dim, conf.hid_dim, act=conf.hid_act))
    self.pred_layer = self.add_sub_node("pred", Affine(pc, conf.hid_dim, conf.max_pred_rank+1, init_rop=NoDropRop()))
    if conf.init_pred_from_pretrain:
        npvec = vpack.get_emb("word")
        if npvec is None:
            zwarn("Pretrained vector not provided, skip init pred embeddings!!")
        else:
            with BK.no_grad_env():
                self.pred_layer.ws[0].copy_(BK.input_real(npvec[:conf.max_pred_rank+1].T))
            zlog(f"Init pred embeddings from pretrained vectors (size={conf.max_pred_rank+1}).")

def disable_final_dropout(self):
    if len(self.layers) < 1:
        zwarn("Cannot disable final dropout since this Enc layer is empty!!")
    else:
        # get the final one from sequential
        final_layer = self.layers[-1]
        while isinstance(final_layer, Sequential):
            final_layer = final_layer.ns_[-1] if len(final_layer.ns_) else None
        # get final dropout node
        final_drop_node: Dropout = None
        if isinstance(final_layer, RnnLayerBatchFirstWrapper):
            final_drop_nodes = final_layer.rnn_node.drop_nodes
            if final_drop_nodes is not None and len(final_drop_nodes) > 0:
                final_drop_node = final_drop_nodes[-1]
        elif isinstance(final_layer, CnnLayer):
            final_drop_node = final_layer.drop_node
        elif isinstance(final_layer, TransformerEncoder):
            pass  # todo(note): final is LayerNorm?
        if final_drop_node is None:
            zwarn(f"Failed at disabling final enc-layer dropout: type={type(final_layer)}: {final_layer}")
        else:
            final_drop_node.rop.add_fixed_value("hdrop", 0.)
            zlog(f"Ok at disabling final enc-layer dropout: type={type(final_layer)}: {final_layer}")

def init_everything(args, ConfType=None):
    # search for basic confs
    all_argv, basic_argv = Conf.search_args(args, ["model_type", "conf_output", "log_file", "msp_seed"],
                                            [str, str, str, int], [None, None, Logger.MAGIC_CODE, None])
    # for basic argvs
    model_type = basic_argv["model_type"]
    conf_output = basic_argv["conf_output"]
    log_file = basic_argv["log_file"]
    msp_seed = basic_argv["msp_seed"]
    if conf_output:
        with zopen(conf_output, "w") as fd:
            for k, v in all_argv.items():
                # todo(note): do not save this one
                if k != "conf_output":
                    fd.write(f"{k}:{v}\n")
    utils.init(log_file, msp_seed)
    # real init of the conf
    if model_type is None:
        utils.zlog("Using the default model type = simple!")
        model_type = "simple"
    if ConfType is None:
        conf = OverallConf(model_type, args)
    else:
        conf = ConfType(model_type, args)
    nn.init(conf.niconf)
    return conf

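# Illustration (hypothetical values): with conf_output given, every parsed option
# except "conf_output" itself is dumped one per line in "key:value" form, so the
# written file might look like:
#   model_type:simple
#   log_file:_log
# where "simple" matches the default above and "_log" is only an example value.
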
def _next(self):
    if not self.is_active():
        return None  # make sure requiring restart!
    # first go to the one with budget!!
    cur_ptr = self.current_ptr
    cur_budget = self.current_budget
    while cur_budget <= 0. or next(self.random_sampler) >= cur_budget:
        cur_ptr = (cur_ptr + 1) % len(self.base_streams)
        cur_budget = float(self.budgets[cur_ptr])
    # find it
    self.current_ptr = cur_ptr
    self.current_budget = cur_budget - 1  # cost one for the current step (or decreasing for empty streamer)
    # get one
    cur_streamer = self.base_streams[cur_ptr]
    read_times = 0
    while read_times <= 2:  # avoid loop with empty streamer
        one = cur_streamer.next()
        read_times += 1
        if cur_streamer.is_eos(one):
            if cur_ptr == self.stop_sidx:  # actually restart
                zlog(f"From the multi-streamer, this epoch stats: {self.stats}")
                self.stats = [0 for _ in self.stats]
                return None  # stop here (need restart)
            else:
                cur_streamer.restart()  # restart right now!
        else:
            self.stats[cur_ptr] += 1
            return one
    # when we get here, it means that there are empty loops, thus tail-calling the function again
    return self._next()

def write_txt(self, fname):
    with zopen(fname, "w") as fd:
        for pack in self.yield_infos():
            i, w, count, perc, accu_count, accu_perc = pack
            ss = f"{i} {w} {count}({perc:.3f}) {accu_count}({accu_perc:.3f})\n"
            fd.write(ss)
    zlog("Write (txt) to %s: %s" % (fname, str(self)))

def check_budgets(self):
    POINTS = [0., 0.5, 0.75, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]
    for k in sorted(self.argbu_counts.keys()):
        sorted_counts = self.argbu_counts[k]
        sorted_counts_size = len(sorted_counts)
        piece_counts = [(p, sorted_counts[min(sorted_counts_size-1, int(sorted_counts_size*p))]) for p in POINTS]
        zlog(f"k={k}: {piece_counts}")

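# Worked example (values illustrative only): with sorted_counts = [1, 2, ..., 10]
# (size 10), p=0.5 picks sorted_counts[int(10*0.5)] = sorted_counts[5] = 6, p=0.9
# picks sorted_counts[9] = 10, and p=1.0 clamps to the last element via the
# min(size-1, ...) guard; so check_budgets logs approximate percentile cutoffs of
# each per-key count distribution.
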
def do_cal_pd(self, inst_pd: str, scode: str):
    vs = self.vars
    _ff = compile(scode, "", "eval")
    d = self.get_and_check_type(inst_pd, pd.DataFrame)
    #
    ret = eval(_ff)
    zlog(f"Calculation on pd.DataFrame by {scode}, and get another one as: {str(ret)}")
    return ret

def process(self, args: List[str]):
    assert len(args) > 0, "Empty command"
    cmd_name, real_args = args[0], args[1:]
    # similar to the cmd package
    method_name = "do_" + cmd_name
    assert hasattr(self, method_name), f"Unknown command {cmd_name}"
    zlog(f"Performing command: {args}")
    return getattr(self, method_name)(*real_args)

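# Dispatch sketch (hypothetical call, names are assumptions): the first token is
# mapped to a `do_*` method and the remaining tokens are passed through as
# positional string arguments, e.g.
#   analyzer.process(["sort", "insts", "len(d.word_seq.vals)"])
# resolves to analyzer.do_sort("insts", "len(d.word_seq.vals)").
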
def do_join(self, insts_target: str, jcode: str) -> List:
    vs = self.vars
    _ff = compile(jcode, "", "eval")
    insts = self.get_and_check_type(insts_target, list)
    ret = [eval(_ff) for d in insts]
    ret = Helper.join_list(ret)
    zlog(f"Join-list by {jcode}: from {len(insts)} to {len(ret)}")
    return ret

def build_by_reading(prefix):
    zlog("Load vocabs from files.")
    possible_vocabs = ["word", "char", "pos", "deplabel", "ner", "word2"]
    one = MLMVocabPackage({n: None for n in possible_vocabs}, {n: None for n in possible_vocabs})
    one.load(prefix=prefix)
    return one

def set_var(self, target, v, explanation=None, history_idx=None):
    if hasattr(self.vars, target):
        zlog(f"Overwriting the existing var `{target}`")
    if target not in self.traces:
        self.traces[target] = []
    # (explanation, history-idx)
    history_idx = len(self.history) if history_idx is None else history_idx
    self.traces[target].append((explanation, history_idx))  # the current cmd will be recorded into history
    setattr(self.vars, target, v)

def build_by_reading(dconf):
    zlog("Load vocabs from files.")
    possible_vocabs = ["word", "char", "pos", "label"]
    one = ParserVocabPackage({n: None for n in possible_vocabs}, {n: None for n in possible_vocabs}, dconf)
    one.load(dconf.dict_dir)
    return one

def do_sort(self, insts_target: str, kcode: str) -> List:
    vs = self.vars
    _ff = compile(kcode, "", "eval")
    insts = self.get_and_check_type(insts_target, list)
    tmp_tuples = [(d, eval(_ff)) for d in insts]
    tmp_tuples.sort(key=lambda x: x[1])
    ret = [x[0] for x in tmp_tuples]
    zlog(f"Sort by key={kcode}: len = {len(ret)}")
    return ret

def do_get_pd(self, insts_target: str, gcode: str) -> pd.DataFrame:
    vs = self.vars
    _ff = compile(gcode, "", "eval")
    insts = self.get_and_check_type(insts_target, list)
    #
    fields = [eval(_ff) for d in insts]
    ret = pd.DataFrame(fields)
    zlog(f"Group {len(insts)} instances by {gcode} to pd.DataFrame shape={ret.shape}.")
    return ret

def __init__(self, conf: KeyWordConf, num_doc=0, w2f=None, w2d=None):
    self.conf = conf
    # -----
    self.num_doc = num_doc
    self.w2f = {} if w2f is None else w2f  # word -> freq
    self.w2d = {} if w2d is None else w2d  # word -> num-doc (dfs)
    # -----
    self._calc()
    zlog(f"Create(or Load) {self}")

def do_ann_start(self, insts_target: str) -> AnnotationTask:
    assert self.cur_cmd_target is not None, "Should assign this to a var to avoid accidental loss!"
    vs = self.vars
    insts = self.get_and_check_type(insts_target, list)
    new_task = PerrAnnotationTask(insts)
    # todo(note): here need auto save?
    if self.cur_ann_task is not None and self.cur_ann_task.remaining > 0:
        zlog("Warn: the previous annotation task has not been finished yet!")
    self.cur_ann_task = new_task
    return new_task

def set_focus(self, idx=None, offset=0):
    if idx is not None:
        new_focus = idx
    else:
        new_focus = self.focus + offset
    if new_focus >= 0 and new_focus < self.length:
        self.focus = new_focus
    else:
        zlog(f"Focus setting failed: at the boundary: {self.focus}, {self.length}")
    return self.focus

def __init__(self, expand_span_ext_file, **kwargs):
    self.tables = {}
    with open(expand_span_ext_file) as fd:
        for line in fd:
            data = json.loads(line)
            key, head, start, end = data["key"], data["head"], data["start"], data["end"]
            assert key not in self.tables
            self.tables[key] = (head, start, end)
    zlog(f"Read from {expand_span_ext_file}, {len(self.tables)} entries")

def inference_on_batch(self, insts: List[GeneralSentence], **kwargs):
    conf = self.conf
    self.refresh_batch(False)
    with BK.no_grad_env():
        # special mode
        # use: CUDA_VISIBLE_DEVICES=3 PYTHONPATH=../../src/ python3 -m pdb ../../src/tasks/cmd.py zmlm.main.test ${RUN_DIR}/_conf device:0 dict_dir:${RUN_DIR}/ model_load_name:${RUN_DIR}/zmodel.best test:./_en.debug test_interactive:1
        if conf.test_interactive:
            iinput_sent = input(">> (Interactive testing) Input sent sep by blanks: ")
            iinput_tokens = iinput_sent.split()
            if len(iinput_sent) > 0:
                iinput_inst = GeneralSentence.create(iinput_tokens)
                iinput_inst.word_seq.set_idxes([self.word_vocab.get_else_unk(w) for w in iinput_inst.word_seq.vals])
                iinput_inst.char_seq.build_idxes(self.inputter.vpack.get_voc("char"))
                iinput_map = self.inputter([iinput_inst])
                iinput_erase_mask = np.asarray([[z=="Z" for z in iinput_tokens]]).astype(dtype=np.float32)
                iinput_masked_map = self.inputter.mask_input(iinput_map, iinput_erase_mask, set("pos"))
                emb_t, mask_t, enc_t, cache, enc_loss = self._emb_and_enc(iinput_masked_map, collect_loss=False, insts=[iinput_inst])
                mlm_loss = self.masklm.loss(enc_t, iinput_erase_mask, iinput_map)
                dpar_input_attn = self.prepr_f(cache, self._get_rel_dist(BK.get_shape(mask_t, -1)))
                self.dpar.predict([iinput_inst], enc_t, dpar_input_attn, mask_t)
                self.upos.predict([iinput_inst], enc_t, mask_t)
                # print them
                import pandas as pd
                cur_fields = {"idxes": list(range(1, len(iinput_inst)+1)), "word": iinput_inst.word_seq.vals,
                              "pos": iinput_inst.pred_pos_seq.vals, "head": iinput_inst.pred_dep_tree.heads[1:],
                              "dlab": iinput_inst.pred_dep_tree.labels[1:]}
                zlog(f"Result:\n{pd.DataFrame(cur_fields).to_string()}")
            return {}  # simply return here for interactive mode
        # -----
        # test for MLM simply as in training (use special separate rand_gen to keep the masks the same for testing)
        # todo(+2): do we need to keep testing/validing during training the same? Currently not!
        info = self.fb_on_batch(insts, training=False, rand_gen=self.testing_rand_gen, assign_attns=conf.testing_get_attns)
        # -----
        if len(insts) == 0:
            return info
        # decode for dpar
        input_map = self.inputter(insts)
        emb_t, mask_t, enc_t, cache, _ = self._emb_and_enc(input_map, collect_loss=False, insts=insts)
        dpar_input_attn = self.prepr_f(cache, self._get_rel_dist(BK.get_shape(mask_t, -1)))
        self.dpar.predict(insts, enc_t, dpar_input_attn, mask_t)
        self.upos.predict(insts, enc_t, mask_t)
        if self.ner is not None:
            self.ner.predict(insts, enc_t, mask_t)
        # -----
        if conf.testing_get_attns:
            if conf.enc_choice == "vrec":
                self._assign_attns_item(insts, "orig", cache=cache)
            elif conf.enc_choice in ["original"]:
                pass
            else:
                raise NotImplementedError()
        return info

def set_eff_max_layer(self, eff_max_layer=None):
    if eff_max_layer is None:
        return self.eff_max_layer  # simply query
    if eff_max_layer < 0:
        eff_max_layer = self.max_layer + 1 + eff_max_layer
    assert eff_max_layer > 0 and eff_max_layer <= self.max_layer, f"Err: layer out of range {eff_max_layer}"
    if self.eff_max_layer != eff_max_layer:
        zlog(f"Set current layer from {self.eff_max_layer} -> {eff_max_layer}!")
        self.eff_max_layer = eff_max_layer
    return self.eff_max_layer

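# Worked example: negative values index from the top, so with max_layer = 6,
# set_eff_max_layer(-1) gives 6 + 1 + (-1) = 6 (the topmost layer) and
# set_eff_max_layer(-2) gives 5; passing None only queries the current setting.
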
def inference_on_batch(self, insts: List[DocInstance], **kwargs):
    self.refresh_batch(False)
    # -----
    if len(insts) == 0:
        return {}
    # -----
    ndoc, nsent = len(insts), 0
    iconf = self.conf.iconf
    # =====
    # get tmp ms_items for each event
    input_ms_items = self._insts2msitems(insts)
    # -----
    if len(input_ms_items) == 0:
        return {}
    # -----
    with BK.no_grad_env():
        # splitting into buckets
        all_packs = self.bter.run(input_ms_items, training=False)
        for one_pack in all_packs:
            ms_items, bert_expr, basic_expr = one_pack
            nsent += len(ms_items)
            # cands
            if iconf.lookup_ef:
                self._lookup_efs(ms_items)
            else:
                self.cand_extractor.predict(ms_items, bert_expr, basic_expr)
            # args
            if iconf.pred_arg:
                self.arg_linker.predict(ms_items, bert_expr, basic_expr)
            # span
            if iconf.pred_span:
                self.span_expander.predict(ms_items, bert_expr)
        # put back all predictions
        self._putback_preds(input_ms_items)
    # collect all stats
    num_ef, num_evt, num_arg = 0, 0, 0
    for one_doc in insts:
        for one_sent in one_doc.sents:
            num_ef += len(one_sent.pred_entity_fillers)
            num_evt += len(one_sent.pred_events)
            num_arg += sum(len(z.links) for z in one_sent.pred_events)
    info = {"doc": ndoc, "sent": nsent, "num_ef": num_ef, "num_evt": num_evt, "num_arg": num_arg}
    if iconf.decode_verbose:
        zlog(f"Decode one mini-batch: {info}")
    return info

def special_pretrain_load(m: BaseParser, path, strict):
    if FileHelper.isfile(path):
        try:
            zlog(f"Trying to load pretrained model from {path}")
            m.load(path, strict)
            zlog(f"Finished loading pretrained model from {path}")
            return True
        except:
            zlog(traceback.format_exc())
            zlog("Failed loading, keep the original ones.")
    else:
        zlog(f"File does not exist for pretraining loading: {path}")
    return False

def init(cc=None):
    # set up common cc (cannot be changed later!!)
    if cc is None:
        pass
    elif isinstance(cc, NIConf):
        zlog("Updating NIConf with device=%s." % cc.device, func="config")
        COMMON_CONFIG.update_from_conf(cc)
    elif isinstance(cc, dict):
        COMMON_CONFIG.update_from_v(cc)
    else:
        COMMON_CONFIG.update_from_s(cc)
    #
    BK.init()

def main(args):
    conf: OverallConf = init_everything(args)
    dconf = conf.dconf
    #
    bmodel = get_berter(dconf.bconf)
    for one_input, one_output in zip([dconf.train, dconf.dev, dconf.test],
                                     [dconf.aux_repr_train, dconf.aux_repr_dev, dconf.aux_repr_test]):
        zlog(f"Read from {one_input} and write to {one_output}")
        num_doc, num_sent = 0, 0
        if one_input and one_output:
            one_streamer = get_data_reader(one_input, dconf.input_format, dconf.use_label0, dconf.noef_link0, None)
            bertaug_streamer = BerterDataAuger(one_streamer, bmodel, "aux_repr")
            with zopen(one_output, 'wb') as fd:
                for one_doc in bertaug_streamer:
                    PickleRW.save_list([s.extra_features["aux_repr"] for s in one_doc.sents], fd)
                    num_doc += 1
                    num_sent += len(one_doc.sents)
            zlog(f"Finish with doc={num_doc}, sent={num_sent}")
        else:
            zlog("Skip empty files")
    zlog("Finish all.")

def end(self):
    # sorting by idx of reading
    self.insts.sort(key=lambda x: x.inst_idx)
    # todo(+1): write other output file
    if self.outf is not None:
        with zopen(self.outf, "w") as fd:
            data_writer = get_data_writer(fd, self.out_format)
            data_writer.write(self.insts)
    #
    evaler = ParserEvaler()
    # evaler2 = ParserEvaler(ignore_punct=True, punct_set={"PUNCT", "SYM"})
    eval_arg_names = ["poses", "heads", "labels", "pred_poses", "pred_heads", "pred_labels"]
    for one_inst in self.insts:
        # todo(warn): exclude the ROOT symbol; the model should assign pred_*
        real_values = one_inst.get_real_values_select(eval_arg_names)
        evaler.eval_one(*real_values)
        # evaler2.eval_one(*real_values)
    report_str, res = evaler.summary()
    # _, res2 = evaler2.summary()
    #
    zlog("Results of %s vs. %s" % (self.outf, self.goldf), func="result")
    zlog(report_str, func="result")
    res["gold"] = self.goldf  # record which file
    # res2["gold"] = self.goldf  # record which file
    zlog("zzzzztest: testing result is " + str(res))
    # zlog("zzzzztest2: testing result is " + str(res2))
    zlog("zzzzzpar: %s" % res["res"], func="result")
    return res