def validate(self):
    """Run one validation checkpoint: report, evaluate on dev, record and save.

    Steps, in order:
      1. If at least one update has happened (``self.tp.uidx > 0``), report the
         training statistics and reset the training recorder; otherwise the
         training result is ``None`` (the validate-first case).
      2. Return early, without the trailing blank log line, when either the
         epoch or the update index is still below the configured validation
         start.
      3. Evaluate the dev runners, or reuse the training result when there are
         no dev runners.
      4. Update the checkpoint bookkeeping and save the current model, the best
         model (when flagged) and a periodic snapshot named after the
         checkpoint.
      5. Reload the best model when annealing was triggered and
         ``conf.anneal_restore`` is set.

    Returns:
        None.
    """
    conf = self.conf
    # Report & reset the training stat; stays None for validate_first.
    train_result = None
    if self.tp.uidx > 0:
        train_result = self._run_train_report()  # first report training stat
        self.train_recorder.reset()  # then reset it
    # Names for this checkpoint are taken before any evaluation runs.
    ckpt_name = self.current_name()
    ckpt_idx = self.tp.cidx
    zlog("", func="plain")  # empty line
    # NOTE(review): ema_wrap_dev presumably swaps in averaged weights for the
    # duration of the block -- confirm against the model class.
    with Timer(info=f"Valid {ckpt_name}", print_date=True), self.model.ema_wrap_dev():
        # No validation before the configured starting epoch/update.
        too_early = (self.tp.eidx < conf.valid_start_eidx) or (
            self.tp.uidx < conf.valid_start_uidx)
        if too_early:
            zlog("No validation since not the time yet!\n", func="plain")
            return
        # Validate; simply use the training result if there are no dev runners.
        if len(self.dev_runners) > 0:
            dev_result = self._run_validate(self.dev_runners)
        else:
            zlog(
                "Use training results for dev since there are no dev set provided!",
                func="warn")
            dev_result = train_result
        # Record this checkpoint.
        skip_bad_count = (self.tp.eidx < conf.bad_start_eidx) or (
            self.tp.uidx < conf.bad_start_uidx)
        may_record_best = (self.tp.cidx >= conf.record_best_cidx)
        if_overall_best, if_best, if_anneal = self.tp.update_checkpoint(
            train_result, dev_result, skip_bad_count, may_record_best,
            conf.anneal_patience)
        # Save curr & best.
        self.save(conf.model_prefix + conf.model_suffix_curr)
        overall_msg = ("Curr is overall best " if if_overall_best else
                       "Curr not overall best, the overall best is ")
        zlog(overall_msg + str(self.tp.info_overall_best()), func="result")
        if if_best:
            self.save(conf.model_prefix + conf.model_suffix_best)
        best_msg = "Curr is best: " if if_best else "Curr not best, the best is "
        zlog(best_msg + str(self.tp.info_best()), func="result")
        # Special save: periodic snapshot named after the checkpoint.
        if ckpt_idx >= conf.save_start_cidx and ckpt_idx % conf.save_cfreq == 0:
            self.save(conf.model_prefix + ckpt_name)
        # Optionally fall back to the best model when annealing.
        if if_anneal and conf.anneal_restore:
            zlog("Restore from previous best model!!", func="plain")
            self.load(conf.model_prefix + conf.model_suffix_best, False)
    zlog("", func="plain")  # empty line
def run(self):
    """Run every batch of ``self.test_stream`` and return the final result.

    Each batch goes through ``self._run_batch`` inside the recorder's ``go()``
    context, and its return value is passed to the recorder. After the stream
    is exhausted, ``self._run_end()`` produces the value that is returned.

    Returns:
        Whatever ``self._run_end()`` returns.
    """
    recorder = self.test_recorder
    with Timer(info="Run-test", print_date=True):
        # Results are stored in the instances themselves.
        for batch in self.test_stream:
            with recorder.go():
                batch_res = self._run_batch(batch)
            recorder.record(batch_res)
        final_res = self._run_end()
    return final_res
def do_dev(self):
    """Run one dev checkpoint: report training stats, evaluate, record and save.

    Steps, in order:
      1. If at least one update has happened (``self.tp.uidx > 0``), report the
         training statistics and reset the training recorder; otherwise use
         ``ResultRecord.get_nil()`` (the validate-first case).
      2. Return early, without the trailing blank log line, when the update
         index is still below ``conf.valid_start_uidx``.
      3. Evaluate via ``self.do_test("dev")``, or reuse the training result when
         the data center has no "dev" datasets.
      4. Update the checkpoint bookkeeping and save the current model, the best
         model (when flagged) and a periodic snapshot named after the
         checkpoint.

    Returns:
        None.
    """
    conf = self.conf
    # Report & reset the training stat.
    if self.tp.uidx <= 0:  # for validate_first
        train_result = ResultRecord.get_nil()
    else:
        train_result = self.run_train_report()  # first report training stat
        self.train_recorder.reset()  # then reset it
    # Names for this checkpoint are taken before any evaluation runs.
    ckpt_name = self.current_name()
    ckpt_idx = self.tp.cidx
    zlog("", func="plain")  # empty line
    with Timer(info=f"Valid {ckpt_name}", print_date=True):
        # No validation before the configured starting update.
        if self.tp.uidx < conf.valid_start_uidx:
            zlog("No validation since not the time yet!\n", func="plain")
            return
        # Validate; simply use the training result if there is no dev data.
        if len(self.d_center.get_datasets(wset="dev")) > 0:
            dev_result = self.do_test("dev")
        else:
            zlog(
                "Use training results for dev since there are no dev set provided!",
                func="warn")
            dev_result = train_result
        # Record this checkpoint.
        may_record_best = (self.tp.cidx >= conf.record_best_start_cidx)
        if_overall_best, if_best, if_anneal = self.tp.update_checkpoint(
            train_result, dev_result, record_best=may_record_best)
        # Save curr & best.
        self.save(conf.model_save_prefix + conf.model_save_suffix_curr)
        overall_msg = ("Curr is overall best " if if_overall_best else
                       "Curr not overall best, the overall best is ")
        zlog(overall_msg + str(self.tp.info_overall_best()), func="result")
        if if_best:
            self.save(conf.model_save_prefix + conf.model_save_suffix_best)
        best_msg = "Curr is best: " if if_best else "Curr not best, the best is "
        zlog(best_msg + str(self.tp.info_best()), func="result")
        # Special save: periodic snapshot named after the checkpoint.
        if (ckpt_idx >= conf.save_special_start_cidx
                and ckpt_idx % conf.save_special_cfreq == 0):
            self.save(conf.model_save_prefix + ckpt_name)
    # --
    zlog("", func="plain")  # empty line
def _run_oracle_batching(conf, dconf, model, test_iter):
    """Decode once, regroup frames by their recorded exit layer, then re-run per group.

    The first pass calls ``model.predict_on_batch`` on every batch and buckets
    each instance's event frames by ``frame.info["exit_lidx"]``. The second
    pass re-batches every bucket (with ``test_count_mode`` forced to "tok")
    and feeds the batches to a fresh ``ZmtlTestingRunner``.

    Returns:
        Whatever the runner's ``_run_end()`` returns.
    """
    zlog("First decode to get oracle!")
    decoded_insts = []
    frames_by_exit = defaultdict(list)
    # Simply decode and collect the frames.
    for batch in test_iter:
        model.predict_on_batch(batch)
        decoded_insts.extend(batch)
        for inst in batch:
            for frame in inst.events:
                frames_by_exit[frame.info["exit_lidx"]].append(frame)
    group_info = {key: len(frames_by_exit[key]) for key in sorted(frames_by_exit)}
    zlog(f"group: {group_info}")
    # Then feed them within groups.
    runner = ZmtlTestingRunner(model, None, conf, dconf.output, dconf.test,
                               do_score=dconf.test_do_score)
    recorder = runner.test_recorder
    with Timer(info="Run-test", print_date=True):
        tconf = conf.tconf
        # note: here we already get the frames!!
        tconf.test_count_mode = "tok"
        for frames in frames_by_exit.values():
            stream, _ = batch_stream(IterStreamer(frames), tconf, False)
            for frame_batch in stream:
                with recorder.go():
                    batch_res = runner._run_batch(frame_batch)
                recorder.record(batch_res)
        runner.all_insts = decoded_insts  # replace by sents!!
        res = runner._run_end()
    return res


def main(args):
    """Testing entry point: prepare the model and data, run the test, log the result.

    When ``conf.tconf.test_do_oracle_batching`` is set, the special two-pass
    oracle-batching mode is used; otherwise a ``ZmtlTestingRunner`` is run
    directly over the prepared test iterator.

    Args:
        args: Command-line style arguments forwarded to ``prepare_test``.
    """
    conf, model, vpack, test_iter = prepare_test(args)
    dconf = conf.dconf
    # --
    if conf.tconf.test_do_oracle_batching:
        # note: special mode!!
        res = _run_oracle_batching(conf, dconf, model, test_iter)
    else:
        # go
        runner = ZmtlTestingRunner(model, test_iter, conf, dconf.output,
                                   dconf.test, do_score=dconf.test_do_score)
        res = runner.run()
    zlog(f"zzzfinal: {res}")
    zlog("The end of testing.")
def do_test(self, wset="test"):
    """Evaluate every dataset of the given split and return the aggregated result.

    The datasets are fetched from the data center, re-prepared by the task
    center, and then evaluated one by one through ``self.run_test_dataset``.
    Each result is added to a ``ResultAggregator`` under the dataset's name
    with the dataset's ``conf.group_eval_weight``.

    Args:
        wset: Name of the split passed to ``d_center.get_datasets`` (default
            "test").

    Returns:
        Whatever ``ResultAggregator.get_res()`` returns.
    """
    t_center, d_center = self.t_center, self.d_center
    # --
    to_test_datasets = d_center.get_datasets(wset=wset)
    t_center.prepare_datasets(to_test_datasets)  # re-prepare!!
    aggr = ResultAggregator()
    n_datasets = len(to_test_datasets)  # loop-invariant, used only for the log
    for position, one_dataset in enumerate(to_test_datasets, 1):
        with Timer(info=f"Test({position}/{n_datasets}): {one_dataset}",
                   print_date=True):
            one_res = self.run_test_dataset(one_dataset)
            aggr.add(one_dataset.name, one_res,
                     one_dataset.conf.group_eval_weight)
    return aggr.get_res()
# --
def main(args):
    """Build the vocabulary package from the configured data and save it.

    The training streamers feed the first ``MultiCatStreamer`` passed to
    ``ZmtlVocabPackage.build_from_stream``; the dev streamers (plus the test
    streamer when one exists) feed the second. The resulting package is saved
    to ``dconf.dict_dir``.

    Args:
        args: Command-line style arguments forwarded to ``init_everything``.
    """
    conf: OverallConf = init_everything(OverallConf(), args)
    dconf = conf.dconf
    # data (imported here, as in the original, rather than at module level)
    from .train import prepare_train_data
    train_streamers, dev_streamers, test_streamer, _ = prepare_train_data(dconf)
    extra_streamers = dev_streamers
    if test_streamer is not None:
        extra_streamers = dev_streamers + [test_streamer]
    # vocab
    vpack = ZmtlVocabPackage.build_from_stream(
        dconf, MultiCatStreamer(train_streamers),
        MultiCatStreamer(extra_streamers))
    vpack.save(dconf.dict_dir)
    zlog("The end of Building.")


if __name__ == '__main__':
    import sys
    with Timer(info="Building", print_date=True) as et:
        main(sys.argv[1:])

# example: for building vocab and filtering embeds
"""
# filter for pb
PYTHONPATH=../src/ python3 -m msp2.tasks.zmtl.main.build train:../pb/conll05/train.conll.ud.json dev:../pb/conll05/dev.conll.ud.json,../pb/conll05/test.wsj.conll.ud.json,../pb/conll05/test.brown.conll.ud.json dict_dir:./ pretrain_hits_outf:hits_conll05.vec pretrain_wv_file:wiki-news-300d-1M-subword.vec |& tee _log.voc_conll05
"""
ms_budgets = [ScheduledValue(f"ms_budget{i}", c) for i,c in enumerate(dconf.get_ms_train_budgets())] joined_train_streamer = MultiJoinStreamer(prepared_train_streamers, dconf.ms_stop_idx, ratios=ms_budgets) else: ms_budgets = [] joined_train_streamer = prepared_train_streamers[0] train_iter, train_batch_f = batch_stream(joined_train_streamer, tconf, True) dev_iters = [batch_stream(index_stream( z, vpack, tconf.dev_use_cache, 0, test_inst_preparer), tconf, False)[0] for z in dev_streamers] # training runner tr = ZmtlTrainingRunner.create(model, train_iter, train_batch_f, conf, dev_iters, [dconf.output+f".dev{i}" for i in range(len(dev_golds))], dev_golds) for mv in ms_budgets: # add them for scheduling! tr.add_scheduled_value(mv) # load? if tconf.load_model: # tr.load(dconf.model_load_name, tconf.load_process, load_strict=dconf.model_load_strict) tr.load(dconf.model_load_name, tconf.load_process) # go tr.run() zlog("The end of Training.") if __name__ == '__main__': import sys with Timer(info=f"Training", print_date=True) as et: main(sys.argv[1:]) # -- """ CUDA_VISIBLE_DEVICES= PYTHONPATH=../src:../../src/:../../../src python3 -m msp2.tasks.zmtl.main.train _conf """
new_vocab = aug_words_and_embs(_embedder, vpack.get_voc("word"), extra_vocab, extra_embedding, aug_scale=dconf.pretrain_scale) vpack.put_voc("word", new_vocab) # ===== # No Cache!! test_inst_preparer = model.get_inst_preper(False) test_iter, _ = batch_stream( index_stream(test_streamer, vpack, False, False, test_inst_preparer), tconf, False) return conf, model, vpack, test_iter # ----- def main(args): conf, model, vpack, test_iter = prepare_test(args) dconf = conf.dconf # go rr = ZsfpTestingRunner(model, test_iter, conf, dconf.output, dconf.test) res = rr.run() zlog(f"zzzfinal: {res}") zlog("The end of testing.") if __name__ == '__main__': import sys with Timer(info=f"Testing", print_date=True) as et: main(sys.argv[1:])
1000) # 1000 sents once time test_dataset = ZDataset(d_center.conf.testM, 'testM', 'decode', _no_load=True) # use testM for other options! for lines in yield_lines(sys.stdin, BATCH_LINE): insts = [Sent.create(one.split()) for one in lines] # note: simply split as sentence!! test_dataset.set_insts(insts) # directly set it! cc["sent"] += len(insts) if cc["sent"] % 50000 == 0: zlog(f"Decode for {cc}") # -- t_center.prepare_datasets([test_dataset]) # re-prepare!! for ibatch in test_dataset.yield_batches(loop=False): one_res = model.predict_on_batch(ibatch) # -- for inst in insts: sys.stdout.write( json.dumps(inst.to_json(), ensure_ascii=False) + "\n") # ===== zlog(f"The end of Decoding: {cc}") # -- # MDIR=?? # PYTHONPATH=../src/ CUDA_VISIBLE_DEVICES=0 python3 -m msp2.tasks.zmtl2.main.decode ${MDIR}/_conf model_load_name:${MDIR}/zmodel.best.m vocab_load_dir:${MDIR}/ log_stderr:1 testM.group_tasks:?? if __name__ == '__main__': with Timer(info=f"Decoding", print_date=True) as et: main(sys.argv[1:])
# conf conf: ZOverallConf = init_everything(ZOverallConf(), args) # task t_center = TaskCenter(conf.tconf) enc = t_center.tasks['enc'] # data d_center = DataCenter(conf.dconf) for dataset in d_center.get_datasets(): enc.prepare_dataset(dataset) vv = SimpleVocab.build_by_static([]) vv2 = SimpleVocab.build_by_static([]) for item in dataset.items: vv.feed_one(item._batch_len) vv2.feed_one(sum(len(z) for z in item.sents) + 1) vv.build_sort(lambda w, i, c: w) vv2.build_sort(lambda w, i, c: w) zlog( f"#== For {dataset} (subword):\n{vv.get_info_table().to_string()}") zlog(f"#== For {dataset} (word):\n{vv2.get_info_table().to_string()}") # -- zlog("The end of Building.") if __name__ == '__main__': import sys with Timer(info=f"CheckLength", print_date=True) as et: main(sys.argv[1:]) # -- # python3 -m msp2.tasks.zmtl2.main.check_length train0.input_dir:ud train0.input_format:conllu train0.group_files:_ud14/en2 train0.approx_prev_next:1 train0.left_extend_nsent:1 train0.right_extend_nsent:1