def _run_end(self):
    """Summarize the test recorder, merge in the result-manager's final results, compute accuracy, print and return."""
    result = self.test_recorder.summary()
    final_res = self.res_manager.end()
    result.update(final_res)
    MltDevResult.calc_acc(result)
    Helper.printd(result, sep=" || ")
    return result
def _run_train_report(self):
    """Summarize training stats (merging any global-recorder stats), compute accuracy, print, and wrap with a score."""
    report = self.train_recorder.summary()
    global_stats = GLOBAL_RECORDER.summary()
    if global_stats:
        report.update(global_stats)
    MltDevResult.calc_acc(report)
    Helper.printd(report, " || ")
    return RecordResult(report, score=report.get("res", 0.))
def _run_train_report(self):
    """Summarize training stats (merging any global-recorder stats), print, and wrap as a RecordResult.

    NOTE: loss/tok is intentionally not computed here — too complex to divide at this point,
    only the sums are accumulated (see the sibling variant that does the division).
    """
    report = self.train_recorder.summary()
    global_stats = GLOBAL_RECORDER.summary()
    if global_stats:
        report.update(global_stats)
    Helper.printd(report, " || ")
    return RecordResult(report)
def _run_train_report(self):
    """Summarize training stats, derive loss-per-token, merge global-recorder stats, print, and wrap as RecordResult."""
    x = self.train_recorder.summary()
    y = GLOBAL_RECORDER.summary()
    # todo(warn): get loss/tok
    # fix: guard against a missing or zero token count (e.g. an empty report interval);
    # the original `x.get("loss_sum", 0.) / x["tok"]` raised KeyError/ZeroDivisionError there
    num_tok = x.get("tok", 0)
    x["loss_tok"] = (x.get("loss_sum", 0.) / num_tok) if num_tok else 0.
    if len(y) > 0:
        x.update(y)
    # zlog(x, "report")
    Helper.printd(x, " || ")
    return RecordResult(x)
def main(args):
    """Decode-only entry point: load conf/vocab/model, run ArgAugDecoder over the test stream,
    write outputs, and optionally evaluate the decoded docs."""
    conf: DecodeAConf = init_everything(args, DecodeAConf)
    dconf, mconf = conf.dconf, conf.mconf
    iconf = mconf.iconf
    # vocab
    vpack = IEVocabPackage.build_by_reading(conf)
    # prepare data
    test_streamer = get_data_reader(dconf.test, dconf.input_format, dconf.use_label0, dconf.noef_link0, dconf.aux_repr_test, max_evt_layers=dconf.max_evt_layers)
    # model
    model = build_model(conf.model_type, conf, vpack)
    model.load(dconf.model_load_name)
    # use bert?
    if dconf.use_bert:
        bmodel = get_berter(dconf.bconf)
        test_streamer = BerterDataAuger(test_streamer, bmodel, "aux_repr")
    # finally prepare iter (No Cache!!, actually no batch_stream)
    test_inst_preparer = model.get_inst_preper(False)
    test_iter = index_stream(test_streamer, vpack, False, False, test_inst_preparer)
    # =====
    # run
    decoder = ArgAugDecoder(conf.aconf, model)
    all_docs = []
    stat_recorder = StatRecorder(False)
    with Timer(tag="Decode", info="Decoding", print_date=True):
        with zopen(dconf.output_file, 'w') as fd:
            data_writer = get_data_writer(fd, dconf.output_format)
            for one_doc in test_iter:
                info = decoder.decode(one_doc)
                stat_recorder.record(info)
                if conf.verbose:
                    zlog(f"Decode one doc, id={one_doc.doc_id} info={info}")
                # release resources
                for one_sent in one_doc.sents:
                    one_sent.extra_features["aux_repr"] = None  # todo(note): special name!
                # write output
                data_writer.write([one_doc])
                # fix: docs were never collected (append was commented out), so `do_eval`
                # previously evaluated two empty lists; keep docs only when eval is requested
                # so the memory cost is unchanged in the common (no-eval) path
                if conf.do_eval:
                    all_docs.append(one_doc)
    if conf.verbose:
        zlog(f"Finish decoding, overall: {stat_recorder.summary()}")
    # eval?
    if conf.do_eval:
        evaler = MyIEEvaler(MyIEEvalerConf())
        result = evaler.eval(all_docs, all_docs)
        Helper.printd(result)
    zlog("The end.")
def main():
    """Smoke test: exercise StatRecorder with random records and Conf0 arg-updating."""
    utils.init("zlog", 1234)
    recorder = StatRecorder(True)
    n_rounds = Random.randint(100)
    for _ in range(n_rounds):
        with recorder.go():
            recorder.record_kv("distr_n", Random.randint(10))
    Helper.printd(recorder.summary(), "\n")
    #
    cc = Conf0()
    cc.update_from_args(["a:10", "y:www", "z.x:1"])
def __init__(self, conf: MyIEModelConf, vpack: VocabPackage):
    """Build the IE model: a shared bottom encoder plus task decoders, each part with its own
    optimizer and scheduled learning-rate factor, and event-type constraints for train/test."""
    self.conf = conf
    self.vpack = vpack
    tconf = conf.tconf
    # ===== Vocab =====
    # ===== Model =====
    self.pc = BK.ParamCollection(True)
    # bottom-part: input + encoder
    self.bter: MyIEBT = self.build_encoder()
    self.lexi_output_dim = self.bter.emb_output_dim
    # NOTE(review): element [0] of get_output_dims() is unpacked into two dims — presumably
    # it returns ((ef_dim, evt_dim), ...); confirm against MyIEBT
    self.enc_ef_output_dim, self.enc_evt_output_dim = self.bter.get_output_dims()[0]
    # encoder gets its own scheduled LR factor and optimizer
    self.enc_lrf_sv = ScheduledValue("enc_lrf", tconf.enc_lrf)
    self.pc.optimizer_set(tconf.enc_optim.optim, self.enc_lrf_sv, tconf.enc_optim, params=self.bter.get_parameters(), check_repeat=True, check_full=True)
    # upper-parts: the decoders (all share one optimizer/LR schedule)
    self.decoders: List = self.build_decoders()
    self.dec_lrf_sv = ScheduledValue("dec_lrf", tconf.dec_lrf)
    self.pc.optimizer_set(tconf.dec_optim.optim, self.dec_lrf_sv, tconf.dec_optim, params=Helper.join_list(z.get_parameters() for z in self.decoders), check_repeat=True, check_full=True)
    # ===== For training =====
    # schedule values
    self.margin = ScheduledValue("margin", tconf.margin)
    self._scheduled_values = [self.margin, self.enc_lrf_sv, self.dec_lrf_sv]
    # for refreshing dropouts
    self.previous_refresh_training = True
    # =====
    # others
    # map config string -> event-type constraint set ("" means unconstrained; unknown keys raise KeyError)
    self.train_constrain_evt_types = {"": None, "kbp17": KBP17_TYPES}[conf.tconf.constrain_evt_types]
    self.test_constrain_evt_types = {"": None, "kbp17": KBP17_TYPES}[conf.iconf.constrain_evt_types]
def filter_vals(word_vals, word_filter=(lambda ww, rank, val: True)):
    """Keep only entries of word_vals that pass word_filter(word, rank, value),
    where rank is 1-based in the order produced by Helper.rank_key."""
    ordered_words = Helper.rank_key(word_vals)
    kept = {}
    for rank, word in enumerate(ordered_words, 1):
        value = word_vals[word]
        if word_filter(word, rank, value):
            kept[word] = value
    return kept
def do_join(self, insts_target: str, jcode: str) -> List:
    """Evaluate `jcode` once per instance (bound as `d`) and flatten the per-instance lists into one.

    NOTE(review): `jcode` is compiled and eval'ed — only ever feed it trusted input.
    """
    vs = self.vars  # presumably a convenience binding for jcode; NOTE(review): eval inside the
    # comprehension may not actually see `vs` (enclosing-function names are not in the
    # comprehension frame's locals) — confirm before relying on it
    _ff = compile(jcode, "", "eval")
    insts = self.get_and_check_type(insts_target, list)
    ret = [eval(_ff) for d in insts]  # the loop variable `d` is visible to the eval'ed expression
    ret = Helper.join_list(ret)  # flatten the list-of-lists
    zlog(f"Join-list by {jcode}: from {len(insts)} to {len(ret)}")
    return ret
def _fb_batch(self, insts):
    """Forward-backward over one batch, split into sub-batches (rconf.split_batch) with scaled loss."""
    n_parts = self.rconf.split_batch
    scale = 1. / n_parts
    sub_batches = Helper.split_list(insts, n_parts)
    with self.train_recorder.go():
        for sub in sub_batches:
            self.train_recorder.record(self._run_fb(sub, scale))
    # advance the instance-index counter by the effective batch size
    self._tp.iidx += self.batch_size_f(insts)
def set_children_info(self, oracle_strategy, label_ranking_dict: Dict = None, free_dist_alpha: float = 0.):
    """Build per-head children lists (ordered according to `oracle_strategy`) and breadth-first
    descendant lists for every token.

    oracle_strategy: "l2r" (keep left-to-right), "i2o" (re-sort around the head), "label"
    (sort by label_ranking_dict over first-level labels), "n2f"/"free" (shuffled via helpers).
    Assumes index 0 is the artificial root.
    """
    heads = self.heads.vals
    the_len = len(heads)
    # self.children_set = [set() for _ in range(the_len)]
    self.children_list = [[] for _ in range(the_len)]
    tmp_descendant_list = [None for _ in range(the_len)]
    # exclude root: token 0 has no incoming edge
    for m, h in enumerate(heads[1:], 1):
        # self.children_set[h].add(m)
        self.children_list[h].append(m)  # l2r order
    # re-arrange list order (left -> right)
    if oracle_strategy == "i2o":
        # sort key negates left children: left children end up nearest-first, then right children
        # nearest-first — presumably the intended "inside-out" oracle order; confirm
        for h in range(the_len):
            self.children_list[h].sort(key=lambda x: -x if x < h else x)
    elif oracle_strategy == "label":
        # todo(warn): only use first level!
        level0_labels = [z.split(":")[0] for z in self.labels.vals]
        for h in range(the_len):
            self.children_list[h].sort(key=lambda x: label_ranking_dict[level0_labels[x]])
    elif oracle_strategy == "n2f":
        self.shuffle_children_n2f()
    elif oracle_strategy == "free":
        self.free_dist_alpha = free_dist_alpha
        self.shuffle_children_free()
    else:
        assert oracle_strategy == "l2r"
        pass
    # todo(+N): does the order of descendant list matter?
    # todo(+N): depth-first or breadth-first? (currently select the latter)
    # recursively get descendant list: do this
    # =====
    def _recursive_add(cur_n):
        # post-order: fill tmp_descendant_list[cur_n] as a list of layers
        # (children, grandchildren, ...) — children must be done first
        cur_children = self.children_list[cur_n]  # List[int]
        for i in cur_children:
            _recursive_add(i)
        new_dlist = [cur_children]
        cur_layer = 0
        while True:
            # layer k+1 of cur_n = concatenation of layer k of each child; stops when all empty
            another_layer = Helper.join_list(tmp_descendant_list[i][cur_layer] if cur_layer < len(tmp_descendant_list[i]) else [] for i in cur_children)
            if len(another_layer) == 0:
                break
            new_dlist.append(another_layer)
            cur_layer += 1
        tmp_descendant_list[cur_n] = new_dlist
    # =====
    _recursive_add(0)
    # flatten the layers: descendant_list[i] = all descendants of i in breadth-first order
    self.descendant_list = [Helper.join_list(tmp_descendant_list[i]) for i in range(the_len)]
def yield_data(self, files):
    """Yield tokens from one or more files, logging per-file progress and periodic stat summaries."""
    #
    if not isinstance(files, (list, tuple)):
        files = [files]
    #
    file_count = 0
    for fname in files:
        file_count += 1
        zlog("-----\nDataReader: [#%d] Start reading file %s." % (file_count, fname))
        with zopen(fname) as stream:
            yield from self._yield_tokens(stream)
        if file_count % self.report_freq == 0:
            zlog("** DataReader: [#%d] Summary till now:" % file_count)
            Helper.printd(self.stats)
    zlog("=====\nDataReader: End reading ALL (#%d) ==> Summary ALL:" % file_count)
    Helper.printd(self.stats)
def main():
    """Sanity-check the streamer stack: caching, shuffling, concatenation and batching all preserve the item set."""
    plain = IterStreamer(range(200))
    cached_shuffled = InstCacher(range(200), shuffle=True)
    cached_concat = InstCacher(MultiCatStreamer([IterStreamer(range(100, 200)), IterStreamer(range(100))]))
    batched = BatchArranger(InstCacher(IterStreamer(range(200))), 8, 10, None, lambda x: x == 48, None, lambda x: (x - 24) ** 2, True)
    #
    expected = set(list(plain))
    for _ in range(10):
        assert expected == set(list(cached_shuffled))
        assert expected == set(list(cached_concat))
        batches = list(batched)
        # 48 is filtered out by the batcher, so add it back before comparing
        assert expected == set(Helper.join_list(batches) + [48])
def _recursive_add(cur_n): cur_children = self.children_list[cur_n] # List[int] for i in cur_children: _recursive_add(i) new_dlist = [cur_children] cur_layer = 0 while True: another_layer = Helper.join_list( tmp_descendant_list[i][cur_layer] if cur_layer < len(tmp_descendant_list[i]) else [] for i in cur_children) if len(another_layer) == 0: break new_dlist.append(another_layer) cur_layer += 1 tmp_descendant_list[cur_n] = new_dlist
def finish(self, word_filter=(lambda ww, rank, val: True), sort_by_count=True, target_range=DEFAULT_TARGET_RANGE):
    """Finalize the vocab under construction: rank/filter the counted words, then run the standard build steps."""
    v = self.v
    if sort_by_count:
        source_counts = self.counts_
    else:
        # preserve the original insertion order of keys_ when not ranking by count
        source_counts = OrderedDict([(k, self.counts_[k]) for k in self.keys_])
    v.v, v.final_vals = VocabBuilder.ranking_vals(source_counts, v.pre_list, v.post_list, self.default_val_, bool(sort_by_count), word_filter=word_filter)
    v.final_words = Helper.reverse_idx(v.v)
    printing("Build Vocab %s ok, from %d to %d, as %s." % (v.name, len(self.counts_), len(v), str(v)))
    # VocabBuilder._build_check(v)
    VocabBuilder._build_target_range(v, target_range[0], target_range[1])
    VocabBuilder._build_prop(v)
    return v
def ranking_vals(word_vals, pre_list, post_list, default_val, sort_vals, word_filter=(lambda ww, rank, val: True)):
    """Build a word->index map plus a parallel value list.

    Layout: pre_list words first (value = default_val), then the words of word_vals that pass
    word_filter(word, 1-based-rank, value) — ranked by value when sort_vals, else in dict order —
    and finally post_list words (value = default_val again).
    """
    candidates = Helper.rank_key(word_vals) if sort_vals else word_vals.keys()
    #
    vals = [default_val] * len(pre_list)
    index_map = {word: pos for pos, word in enumerate(pre_list)}
    for rank0, word in enumerate(candidates):
        value = word_vals[word]
        if word_filter(word, rank0 + 1, value):
            index_map[word] = len(index_map)
            vals.append(value)
    for word in post_list:
        index_map[word] = len(index_map)
        vals.append(default_val)
    return index_map, vals
def inference_on_batch(self, insts: List[ParseInstance], **kwargs):
    """Decode heads + labels for a batch of parses, write predictions back into the instances,
    and return simple counts ({"sent": ..., "tok": ...})."""
    # iconf = self.conf.iconf
    with BK.no_grad_env():
        self.refresh_batch(False)
        # pruning and scores from g1
        valid_mask, go1_pack = self._get_g1_pack(insts, self.lambda_g1_arc_testing, self.lambda_g1_lab_testing)
        # encode
        input_repr, enc_repr, jpos_pack, mask_arr = self.bter.run(insts, False)
        mask_expr = BK.input_real(mask_arr)
        # decode
        final_valid_expr = self._make_final_valid(valid_mask, mask_expr)
        ret_heads, ret_labels, _, _ = self.dl.decode(insts, enc_repr, final_valid_expr, go1_pack, False, 0.)
        # collect the results together
        all_heads = Helper.join_list(ret_heads)
        if ret_labels is None:
            # todo(note): simply get labels from the go1-label classifier; must provide g1parser
            if go1_pack is None:
                _, go1_pack = self._get_g1_pack(insts, 1., 1.)
            _, go1_label_max_idxes = go1_pack[1].max(-1)  # [bs, slen, slen]
            pred_heads_arr, _ = self.predict_padder.pad(all_heads)  # [bs, slen]
            pred_heads_expr = BK.input_idx(pred_heads_arr)
            # pick, for each token, the best label of its predicted head
            pred_labels_expr = BK.gather_one_lastdim(go1_label_max_idxes, pred_heads_expr).squeeze(-1)
            all_labels = BK.get_value(pred_labels_expr)  # [bs, slen]
        else:
            all_labels = np.concatenate(ret_labels, 0)
        # ===== assign, todo(warn): here, the labels are directly original idx, no need to change
        for one_idx, one_inst in enumerate(insts):
            cur_length = len(one_inst) + 1  # +1 presumably for the artificial root slot — confirm
            one_inst.pred_heads.set_vals(all_heads[one_idx][:cur_length])  # directly int-val for heads
            one_inst.pred_labels.build_vals(all_labels[one_idx][:cur_length], self.label_vocab)
            # one_inst.pred_par_scores.set_vals(all_scores[one_idx][:cur_length])
        # =====
        # put jpos result (possibly)
        self.jpos_decode(insts, jpos_pack)
    # -----
    info = {"sent": len(insts), "tok": sum(map(len, insts))}
    return info
def eval(self, quiet=True, breakdown=False):
    """Compute unlabeled and labeled F1 over the collected preds/golds.

    Precision comes from preds-vs-golds, recall from golds-vs-preds; returns
    (unlabeled F1, labeled F1, per-label F1 dict — empty unless breakdown).
    """
    all_pre_u, all_pre_l, label_pres = self._calc_result(self.preds, self.golds)
    all_rec_u, all_rec_l, label_recs = self._calc_result(self.golds, self.preds)
    all_f_u = F1Result(all_pre_u, all_rec_u)
    all_f_l = F1Result(all_pre_l, all_rec_l)
    if breakdown:
        label_fs = {one_lab: F1Result(label_pres[one_lab], label_recs[one_lab]) for one_lab in self.labels}
    else:
        label_fs = {}
    if not quiet:
        zlog(f"Overall f1 score for {self.name}: unlabeled {all_f_u}; labeled {all_f_l}")
        zlog("Breakdowns: \n" + Helper.printd_str(label_fs))
    return all_f_u, all_f_l, label_fs
def _combine_recursive_keys(values, bidxes, keys):
    """Recursively combine a list of parallel structures along the batch dim, guided by `keys`.

    Dispatch on the type of `keys`: dict -> recurse per key with its sub-keys;
    set -> combine each named member directly; anything else -> treat `keys` as an
    attribute path (None => empty path) applied via Helper.apply_keys, then combine.
    """
    if isinstance(keys, dict):
        ret = {}
        for k in keys:
            ret[k] = SliceManager._combine_recursive_keys([z[k] for z in values], bidxes, keys[k])
    elif isinstance(keys, set):
        ret = {}
        for k in keys:
            ret[k] = SliceManager._combine_recursive([z[k] for z in values], bidxes)
    else:
        # direct through
        if keys is None:
            keys = []
        elif not isinstance(keys, Iterable):
            keys = [keys]
        next_values = [Helper.apply_keys(z, keys) for z in values]
        ret = SliceManager._combine_recursive(next_values, bidxes)
    return ret
def _arrange_idxes(slices): values, bidxes = [], [] # tmp tmp_bidx_bases = [ 0, ] tmp_id2idx = {} for s in slices: one_ew, one_sidx = s.ew, s.slice_idx ew_id = one_ew.id if ew_id not in tmp_id2idx: tmp_id2idx[ew_id] = len(values) values.append(one_ew.val) tmp_bidx_bases.append(one_ew.bsize + tmp_bidx_bases[-1]) # idx_in_vals = tmp_id2idx[ew_id] bidxes.append(tmp_bidx_bases[idx_in_vals] + one_sidx) # check for perfect match if Helper.check_is_range(bidxes, tmp_bidx_bases[-1]): bidxes = None return values, bidxes
def arange_cache(self, bidxes): new_bsize = len(bidxes) # if the idxes are already fine, then no need to select if not Helper.check_is_range(bidxes, self.cur_bsize): # mask is on CPU to make assigning easier bidxes_ct = BK.input_idx(bidxes, BK.CPU_DEVICE) self.scoring_fixed_mask_ct = self.scoring_fixed_mask_ct.index_select( 0, bidxes_ct) self.scoring_mask_ct = self.scoring_mask_ct.index_select( 0, bidxes_ct) self.oracle_mask_ct = self.oracle_mask_ct.index_select( 0, bidxes_ct) # other things are all on target-device (possibly GPU) bidxes_device = BK.to_device(bidxes_ct) self.enc_repr = self.enc_repr.index_select(0, bidxes_device) self.scoring_cache.arange_cache(bidxes_device) # oracles self.oracle_mask_t = self.oracle_mask_t.index_select( 0, bidxes_device) self.oracle_label_t = self.oracle_label_t.index_select( 0, bidxes_device) # update bsize self.update_bsize(new_bsize)
def main(args):
    """Analysis script: compare gold / MST-decoded / greedy-marginal arcs of a 1st-order parser,
    record pairwise agreement counts, and bucket arc confidence by marginal probability."""
    conf, model, vpack, test_iter = prepare_test(args, AnalyzeConf)
    # make sure the model is order 1 graph model, otherwise cannot run through
    assert isinstance(model, G1Parser) and isinstance(conf.pconf, G1ParserConf)
    # =====
    # helpers
    all_stater = StatRecorder(False)

    def _stat(k, v):
        # accumulate a key->count into the global stat recorder
        all_stater.record_kv(k, v)

    # check agreement
    def _agree2(a, b, name):
        # count positions where the two head sequences agree and record under `name`
        agreement = (np.asarray(a) == np.asarray(b))
        num_agree = int(agreement.sum())
        _stat(name, num_agree)

    # do not care about efficiency here!
    step2_pack = []
    for cur_insts in test_iter:
        # score and prune
        valid_mask, arc_score, label_score, mask_expr, marginals = model.prune_on_batch(cur_insts, conf.zprune)
        # greedy on raw scores
        greedy_label_scores, greedy_label_mat_idxes = label_score.max(-1)  # [*, m, h]
        greedy_all_scores, greedy_arc_idxes = (arc_score + greedy_label_scores).max(-1)  # [*, m]
        greedy_label_idxes = greedy_label_mat_idxes.gather(-1, greedy_arc_idxes.unsqueeze(-1)).squeeze(-1)  # [*, m]
        # greedy on marginals (arc only)
        greedy_marg_arc_scores, greedy_marg_arc_idxes = marginals.max(-1)  # [*, m]
        # entropy of the marginal head distribution; tiny epsilon avoids log(0)
        entropy_marg = -(marginals * (marginals + 1e-10 * (marginals == 0.).float()).log()).sum(-1)  # [*, m]
        # decode
        model.inference_on_batch(cur_insts)
        # =====
        # snapshot every tensor-valued local onto `z` as a numpy array (non-tensors fail setattr's
        # .cpu() call and are silently skipped) — fragile but deliberate reflection trick
        z = ZObject()
        keys = list(locals().keys())
        for k in keys:
            v = locals()[k]
            try:
                setattr(z, k, v.cpu().detach().numpy())
            except:
                pass
        # =====
        for idx in range(len(cur_insts)):
            one_inst: ParseInstance = cur_insts[idx]
            one_len = len(one_inst) + 1  # [1, len)
            _stat("all_edges", one_len - 1)
            arc_gold = one_inst.heads.vals[1:]
            arc_mst = one_inst.pred_heads.vals[1:]
            arc_gma = z.greedy_marg_arc_idxes[idx][1:one_len]
            # step 1: decoding agreement, how many edges agree: gold, mst-decode, greedy-marginal
            arcs = {"gold": arc_gold, "mst": arc_mst, "gma": arc_gma}
            cmp_keys = sorted(arcs.keys())
            for i in range(len(cmp_keys)):
                for j in range(i + 1, len(cmp_keys)):
                    n1, n2 = cmp_keys[i], cmp_keys[j]
                    _agree2(arcs[n1], arcs[n2], f"{n1}_{n2}")
            # step 2: confidence
            arc_agree = (np.asarray(arc_gold) == np.asarray(arc_mst))
            arc_marginals_mst = z.marginals[idx][range(1, one_len), arc_mst]
            arc_marginals_gold = z.marginals[idx][range(1, one_len), arc_gold]
            arc_entropy = z.entropy_marg[idx][1:one_len]
            for tidx in range(one_len - 1):
                step2_pack.append([int(arc_agree[tidx]), min(1., float(arc_marginals_mst[tidx])), min(1., float(arc_marginals_gold[tidx])), float(arc_entropy[tidx])])
    # step 2: bucket by marginals
    if True:
        NUM_BUCKET = 10
        df = pd.DataFrame(step2_pack, columns=['agree', 'm_mst', 'm_gold', 'entropy'])
        z = df.sort_values(by='m_mst', ascending=False)
        z.to_csv('res.csv')
        # describe each marginal-probability bucket [r0, r1)
        for cur_b in range(NUM_BUCKET):
            interval = 1. / NUM_BUCKET
            r0, r1 = cur_b * interval, (cur_b + 1) * interval
            cur_v = df[(df.m_mst >= r0) & ((df.m_mst < r1))]
            zlog(f"#===== [{r0}, {r1}): {cur_v.shape}\n" + str(cur_v.describe()))
    # =====
    d = all_stater.summary(get_v=False, get_str=True)
    Helper.printd(d, "\n\n")
def pred_events(self):
    """All predicted events across self.sents, flattened into one list."""
    return Helper.join_list(one_sent.pred_events for one_sent in self.sents)
def pred_entity_fillers(self):
    """All predicted entity-fillers across self.sents, flattened into one list."""
    return Helper.join_list(one_sent.pred_entity_fillers for one_sent in self.sents)
def subword_is_start(self):
    """Per-subword is-start flags, flattened across all pieces."""
    start_flags = self.cur_is_starts
    return Helper.join_list(start_flags)
def subword_typeids(self):
    """Flattened subword type-ids, or None when type-ids are not set."""
    typeids = self.cur_typeids
    return None if typeids is None else Helper.join_list(typeids)
def subword_ids(self):
    """Per-subword ids, flattened across all pieces."""
    id_pieces = self.cur_ids
    return Helper.join_list(id_pieces)
def _run_end(self):
    """Summarize the test recorder, merge the result-manager's final results, print and return."""
    result = self.test_recorder.summary()
    final_res = self.res_manager.end()
    result.update(final_res)
    Helper.printd(result, sep=" ")
    return result
def get_split_params(self):
    """Return (transformation params, scorer params) as two flat parameter lists."""
    transform_modules = [self.arc_m, self.arc_h, self.lab_m, self.lab_h]
    scorer_modules = [self.arc_scorer, self.lab_scorer]
    params0 = Helper.join_list(mod.get_parameters() for mod in transform_modules)
    params1 = Helper.join_list(mod.get_parameters() for mod in scorer_modules)
    return params0, params1
def main(args):
    """Jackknifing driver: train once on all data (building the dict and scoring dev/test), then
    train `pieces` leave-one-out models, score each held-out piece, and stitch the piece scores
    back into the original training-instance order."""
    conf = PsConf()
    conf.update_from_args(args)
    # read the data
    path_train, path_dev, path_test = [get_data(z) for z in [conf.train, conf.dev, conf.test]]
    pretrain_file = get_data(conf.pretrain_file)
    train_insts = list(get_data_reader(path_train, "conllu", "", False, ""))
    dev_insts = list(get_data_reader(path_dev, "conllu", "", False, ""))
    test_insts = list(get_data_reader(path_test, "conllu", "", False, ""))
    use_pos = conf.use_pos
    num_pieces = conf.pieces
    max_epoch = conf.max_epoch
    reg_scores_lambda = conf.reg_scores_lambda
    cur_run = conf.cur_run
    zlog(f"Read from train/dev/test: {len(train_insts)}/{len(dev_insts)}/{len(test_insts)}, split train into {num_pieces}")
    # others
    RGPU = os.getenv("RGPU", "")
    # first train on all: 1. get dict (only build once), 2: score dev/test
    with Timer("train", "Train-ALL"):
        cur_conf, cur_model = "_conf.all", "_model.all"
        cur_load_model = cur_model + ".best"
        cur_base_opt = get_base_opt(cur_conf, cur_model, use_pos, True, max_epoch, reg_scores_lambda, cur_run)
        # shell out: full training run, then scoring of dev and test
        system(get_train_cmd(RGPU, cur_base_opt, path_train, path_dev, path_test, pretrain_file), pp=True)
        system(get_score_cmd(RGPU, cur_conf, cur_load_model, path_dev, "dev.scores.pkl"), pp=True)
        system(get_score_cmd(RGPU, cur_conf, cur_load_model, path_test, "test.scores.pkl"), pp=True)
    # then training on the pieces (leaving one out)
    # first split into pieces
    Random.shuffle(train_insts)
    piece_length = math.ceil(len(train_insts) / num_pieces)
    train_pieces = []
    cur_idx = 0
    while cur_idx < len(train_insts):
        next_idx = min(len(train_insts), cur_idx + piece_length)
        train_pieces.append(train_insts[cur_idx:next_idx])
        cur_idx = next_idx
    zlog(f"Split training into {num_pieces}: {[len(x) for x in train_pieces]}")
    assert len(train_pieces) == num_pieces
    # next train each of the pieces
    for piece_id in range(num_pieces):
        with Timer("train", f"Train-{piece_id}"):
            # get current training pieces (all pieces except the held-out one)
            cur_training_insts = Helper.join_list([train_pieces[x] for x in range(num_pieces) if x != piece_id])
            cur_testing_insts = train_pieces[piece_id]
            # write files
            cur_path_train, cur_path_test = f"tmp.train.{piece_id}.conllu", f"tmp.test.{piece_id}.conllu"
            write_insts(cur_path_train, cur_training_insts)
            write_insts(cur_path_test, cur_testing_insts)
            cur_conf, cur_model = f"_conf.{piece_id}", f"_model.{piece_id}"
            cur_load_model = cur_model + ".best"
            # no build dict, reuse previous
            cur_base_opt = get_base_opt(cur_conf, cur_model, use_pos, False, max_epoch, reg_scores_lambda, cur_run)
            system(get_train_cmd(RGPU, cur_base_opt, cur_path_train, path_dev, cur_path_test, pretrain_file), pp=True)
            system(get_score_cmd(RGPU, cur_conf, cur_load_model, cur_path_test, f"tmp.test.{piece_id}.scores.pkl"), pp=True)
    # finally put them in order
    all_results = []
    for piece_id in range(num_pieces):
        all_results.extend(read_results(f"tmp.test.{piece_id}.scores.pkl"))
    # reorder to the original order (pieces were taken from the shuffled list, so map back via inst_idx)
    orig_indexes = [z.inst_idx for z in train_insts]
    orig_results = [None] * len(orig_indexes)
    for new_idx, orig_idx in enumerate(orig_indexes):
        assert orig_results[orig_idx] is None
        orig_results[orig_idx] = all_results[new_idx]
    # saving
    write_results("train.scores.pkl", orig_results)
    zlog("The end.")