def execute(self, env_prefix: str):
    conf = self.conf
    # --
    dir_name = self.dir_name
    system(f"mkdir -p {dir_name}")
    # train
    system(f"cd {dir_name}; {env_prefix} PYTHONPATH=../src:../../src/:../../../src python3"
           f" -m msp2.tasks.{conf.task_name}.main.train {conf.conf_infile} {self.arg_str}"
           f" conf_output:{conf.conf_outfile} >_log_train 2>&1;")
    rr = self._read_results(dir_name + "/_log_train")
    # test
    # system(f"cd {dir_name}; {env_prefix} PYTHONPATH=../../src/ python3 -m msp2.tasks.zsfp.main.test {conf.conf_outfile} device:0 {self.arg_str} >_log_test 2>&1")
    return rr
def get_eval_script():
    this_dir = os.path.dirname(os.path.abspath(__file__))
    srl_eval_script = os.path.join(this_dir, "srl-eval.pl")
    if not os.path.exists(srl_eval_script):
        srl_eval_gz_file = os.path.join(this_dir, "srl-eval.pl.gz")
        if os.path.isfile(srl_eval_gz_file):  # decompress
            system(f"gzip -c -d {srl_eval_gz_file} >{srl_eval_script}", ass=True)
        else:
            raise RuntimeError("Cannot find srl_eval!!")
    # --
    return srl_eval_script
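# -- Hedged usage sketch (this wrapper is an assumption, not original code):
# srl-eval.pl is a perl program that compares a gold props file against a
# predicted one, so a typical call over the returned path looks like:
def run_srl_eval(gold_props: str, pred_props: str) -> str:
    return system(f"perl {get_eval_script()} {gold_props} {pred_props}",
                  ass=True, popen=True)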
def run(self):
    conf = self.conf
    # --
    train_base_opt = self.get_train_base_opt()
    test_base_opt = self.get_test_base_opt()
    _L_PRE = conf.log_prefix
    DEBUG_OPTION = "-m pdb" if conf.debug else ""
    PRINT_OPTION = "log_stderr:0" if conf.quite else ""
    DEVICE_OPTION = "nn.device:0" if self._get_gpus_from_env_prefix(conf._env_prefix) else ""
    # --
    # special mode for convenience
    if conf.do_test2:  # test without logging?
        TEST2_CMD = f"cd {conf.run_dir}; {conf._env_prefix} PYTHONPATH={conf.src_dir}:$PYTHONPATH python3 {DEBUG_OPTION} -m {conf._module}.test {conf.conf_output} {test_base_opt} log_file: log_stderr:1 {DEVICE_OPTION} {conf.test_extras} {conf._test_extras}"
        system(TEST2_CMD, pp=(not conf.quite))
    else:
        # train?
        TRAIN_CMD = f"cd {conf.run_dir}; {conf._env_prefix} PYTHONPATH={conf.src_dir}:$PYTHONPATH python3 {conf.train_pyopts} {DEBUG_OPTION} -m {conf._module}.train {conf.conf_input} {train_base_opt} log_file:{_L_PRE}_train {PRINT_OPTION} {DEVICE_OPTION} conf_output:{conf.conf_output} {conf.train_extras} {conf._train_extras}"
        if conf.do_train:
            system(TRAIN_CMD, pp=(not conf.quite))
        # test?
        TEST_CMD = f"cd {conf.run_dir}; {conf._env_prefix} PYTHONPATH={conf.src_dir}:$PYTHONPATH python3 {DEBUG_OPTION} -m {conf._module}.test {conf.conf_output} {test_base_opt} log_file:{_L_PRE}_test {PRINT_OPTION} {DEVICE_OPTION} {conf.test_extras} {conf._test_extras}"
        if conf.do_test:
            system(TEST_CMD, pp=(not conf.quite))
        # test-all?
        if conf.do_test_all:
            for extras in self.get_all_dt_opts():  # note: mainly input/output/log
                _TMP_CMD = TEST_CMD + f" {extras}"
                system(_TMP_CMD, pp=True)
def get_result(self):
    output = system(f"cat {self.conf.run_dir}/_log_train | grep zzzzzfinal", popen=True)
    result_res, result_dict = re.search(r"\"Result\(([0-9.]+)\): (.*)\"", output).groups()
    result_res, result_dict = eval(result_res), eval(result_dict)
    return MyResult(result_res, result_dict)
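# -- A hedged alternative (not part of the original code): the grep above
# expects the training log to contain a line roughly like
#   zzzzzfinal ... "Result(85.37): {'res': 85.37}"
# (the exact fields are assumptions). ast.literal_eval only accepts Python
# literals, so it is a safer drop-in for eval() when parsing such lines:
def parse_result_line(output: str):
    import ast, re
    m = re.search(r"\"Result\(([0-9.]+)\): (.*)\"", output)
    score_s, dict_s = m.groups()
    return float(score_s), ast.literal_eval(dict_s)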
def annotate(self, insts: List):
    conf: AnnotatorP2DConf = self.conf
    # --
    # get all sentences and run in batch
    all_sents = list(yield_sents(insts))
    tmp_input = os.path.join(conf.p2d_tmp_dir, "_input.penn")
    with zopen(tmp_input, 'w') as fd:
        for sent in all_sents:
            fd.write(sent2tree(sent) + "\n")
    # run
    tmp_output = os.path.join(conf.p2d_tmp_dir, "_output.conllu")
    log_cmd = f'2>{conf.p2d_log}' if conf.p2d_log else ''
    system(f"{self.cmd} -treeFile {tmp_input} >{tmp_output} {log_cmd}")
    # read output and add back
    conll_reader_conf = ReaderGetterConf()
    conll_reader_conf.input_conf.use_multiline = True
    conll_reader_conf.input_conf.mtl_ignore_f = "'ignore_#'"
    conll_reader_conf.input_format = "conllu"
    conll_reader_conf.input_path = tmp_output
    conll_reader = get_reader(conll_reader_conf)
    new_sents = list(conll_reader)
    # --
    assert len(all_sents) == len(new_sents)
    for s0, s1 in zip(all_sents, new_sents):
        assert len(s0) == len(s1)
        mismatched_tokens = [(v1, v2) for v1, v2 in zip(s0.seq_word.vals, s1.seq_word.vals) if v1 != v2]
        if len(mismatched_tokens) > 0:
            zwarn(f"Mismatch token NUM={len(mismatched_tokens)}: {mismatched_tokens}")
            if conf.p2d_change_words:
                s0.build_words(s1.seq_word.vals)  # use the other one!!
            # breakpoint()
        # note: build again!
        s0.build_dep_tree(s1.tree_dep.seq_head.vals,
                          [self.p2d_udp_converter(z) for z in s1.tree_dep.seq_label.vals])
        if conf.p2d_use_xpos:
            trg_pos_list = s1.info.get("xpos")
        else:
            trg_pos_list = s1.seq_upos.vals
        s0.build_uposes([self.p2d_upos_converter(z) for z in trg_pos_list])
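# -- Hedged sketch (assumption: the actual self.cmd is assembled elsewhere
# from conf): a phrase-to-dependency command compatible with the call above
# could be Stanford CoreNLP's constituency-to-UD converter, e.g. something like
#   java -cp stanford-corenlp.jar \
#     edu.stanford.nlp.trees.ud.UniversalDependenciesConverter -treeFile ...
# which reads one PTB tree per line and prints CoNLL-U dependencies, matching
# the "conllu" reader configured above.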
def eval(self, gold_insts: List, pred_insts: List):
    # --
    conf: PbEvalConf = self.conf
    tmp_gold, tmp_pred = conf.tmp_file_prefix + ".gold.props", conf.tmp_file_prefix + ".pred.props"
    with zopen(tmp_gold, 'w') as fd:
        fd.write(insts2props(gold_insts))
    with zopen(tmp_pred, 'w') as fd:
        fd.write(insts2props(pred_insts))
    # --
    # note: run the eval script twice with the two files swapped;
    # PbEvalResult reads the precision side from one direction and recall from the other
    precision_output = system(f"{self.cmd} {tmp_pred} {tmp_gold} 2>/dev/null", ass=True, popen=True)
    recall_output = system(f"{self.cmd} {tmp_gold} {tmp_pred} 2>/dev/null", ass=True, popen=True)
    # parse output
    res = PbEvalResult(precision_output, recall_output)
    return res
def run(aa):
    fin, args = aa
    # --
    fout = eval(args.output_pattern)(fin)
    CMD = args.cmd
    CMD = CMD.replace("[IN]", fin)
    CMD = CMD.replace("[OUT]", fout)
    with _global_lock:  # get resource
        cur_idx = list(global_resources.keys())[0]
        del global_resources[cur_idx]
    try:
        CMD = CMD.replace("[IDX]", str(cur_idx))
        system(CMD, pp=True)
    except:
        import traceback
        traceback.print_exc()
        raise RuntimeError()
    finally:
        # put resource back
        with _global_lock:
            global_resources[cur_idx] = 1
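# -- A minimal sketch (assumption: this setup is not shown in the original
# file) of how the shared pool used by run() could be initialized, with one
# slot per device index and one worker thread per slot:
import threading
from multiprocessing.pool import ThreadPool

_global_lock = threading.Lock()
global_resources = {0: 1, 1: 1}  # e.g. device ids 0 and 1, one slot each

# hypothetical driver: size the pool to the number of slots so that
# list(global_resources.keys())[0] never runs on an empty dict
# ThreadPool(len(global_resources)).map(run, [(fin, args) for fin in input_files])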
def prepare_from_treebanks(wset: str, section: str):
    system(f"mkdir -p {CONLL05}/{wset}/words")
    system(f"mkdir -p {CONLL05}/{wset}/synt")
    # --
    if "brown" in wset:
        assert section is None
        trees = [f"{PTB3}/PARSED/MRG/BROWN/CK/CK0{z}.MRG" for z in "123"]
        _tmp_in_cmd = " | grep -v '^\\*x\\*' "
        sec_infix = ""
    else:
        trees = sorted(zglob(f"{PTB3}/PARSED/MRG/WSJ/{section}/*.MRG"))
        _tmp_in_cmd = ""
        sec_infix = f".{section}" if "test" not in wset else ""
    # --
    # remove files if existing!
    system(f"mkdir -p {CONLL05}/{wset}/docids/")
    f_word, f_docid, f_synt = f"{CONLL05}/{wset}/words/{wset}{sec_infix}.words.gz", \
        f"{CONLL05}/{wset}/docids/{wset}{sec_infix}.docids.gz", \
        f"{CONLL05}/{wset}/synt/{wset}{sec_infix}.synt.gz"
    for f in [f_word, f_docid, f_synt]:
        if os.path.exists(f):
            os.remove(f)
    # add all
    for one_tree in trees:
        one_doc_id = one_tree.split("/")[-1].split(".")[0]
        my_system(f"cat {one_tree} {_tmp_in_cmd} | {GO_PERL} {SRLCONLL}/bin/wsj-removetraces.pl | {GO_PERL} {SRLCONLL}/bin/wsj-to-se.pl -w 1 | awk '{{print $1}}' | gzip >>{f_word}")
        my_system(f"cat {one_tree} {_tmp_in_cmd} | {GO_PERL} {SRLCONLL}/bin/wsj-removetraces.pl | {GO_PERL} {SRLCONLL}/bin/wsj-to-se.pl -w 1 | awk '{{print $1}}' | sed 's/^.\\+$/{one_doc_id}/' | gzip >>{f_docid}")
        my_system(f"cat {one_tree} {_tmp_in_cmd} | {GO_PERL} {SRLCONLL}/bin/wsj-removetraces.pl | {GO_PERL} {SRLCONLL}/bin/wsj-to-se.pl -w 0 -p 1 | gzip >>{f_synt}")
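# -- A hedged driver sketch (the function name and set/section names below are
# assumptions, following the usual CoNLL-2005 split of WSJ train 02-21,
# devel 24, test.wsj 23, plus the Brown test set):
def prepare_all_conll05():
    for sec in [f"{z:02d}" for z in range(2, 22)]:  # WSJ train sections
        prepare_from_treebanks("train", sec)
    prepare_from_treebanks("devel", "24")
    prepare_from_treebanks("test.wsj", "23")
    prepare_from_treebanks("test.brown", None)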
def annotate(self, insts: List[DataInstance]):
    conf: AnnotatorSemaforConf = self.conf
    # --
    # get all sentences and run in batch
    all_sents = list(yield_sents(insts))
    # run all in batch
    # step 1: prepare input
    tmp_input = os.path.join(f"{conf.semafor_tmp_dir}", "_input.txt")
    tmp_input = os.path.abspath(tmp_input)  # require absolute path
    tmp_input = self.delete_and_get_file(tmp_input)
    with zopen(tmp_input, 'w') as fd:
        for sent in all_sents:  # write one line per sent
            fd.write(" ".join(sent.seq_word.vals) + "\n")
    # step 2: run semafor
    tmp_output = os.path.join(f"{conf.semafor_tmp_dir}", "_output.json")
    tmp_output = os.path.abspath(tmp_output)  # require absolute path
    tmp_output = self.delete_and_get_file(tmp_output, delete=(not conf.semafor_use_cached))
    if not conf.semafor_use_cached:  # otherwise simply skip running
        _semafor_log = conf.semafor_log if conf.semafor_log else "/dev/null"
        # append to log!
        system(f"bash {self.semafor_sh} {tmp_input} {tmp_output} {conf.semafor_num_threads} >>{_semafor_log} 2>&1", ass=True)
    # step 3: read output and put them in sents
    semafor_results = default_json_serializer.load_list(tmp_output)
    assert len(semafor_results) == len(all_sents), "Error: predict inst number mismatch!"
    for one_res, one_sent in zip(semafor_results, all_sents):
        one_semafor_sent: Sent = SemaforHelper.semafor2sent(one_res)
        one_idx_map = SemaforHelper.find_sent_map(one_semafor_sent, one_sent)
        # put them back
        one_sent.clear_events()
        one_sent.clear_entity_fillers()
        # add them all
        for evt in one_semafor_sent.events:
            evt_widx, evt_wlen = evt.mention.widx, evt.mention.wlen
            mapped_posi = SemaforHelper.map_span(evt_widx, evt_wlen, one_idx_map)
            if mapped_posi is None:
                zwarn(f"Failed mapping evt of {evt}: {evt.mention} to {one_sent.seq_word}")
                continue
            evt2 = one_sent.make_event(mapped_posi[0], mapped_posi[1], type=evt.type)
            for alink in evt.args:
                ef = alink.arg
                ef_widx, ef_wlen = ef.mention.widx, ef.mention.wlen
                mapped_posi = SemaforHelper.map_span(ef_widx, ef_wlen, one_idx_map)
                if mapped_posi is None:
                    zwarn(f"Failed mapping arg of {alink}: {ef.mention} to {one_sent.seq_word}")
                    continue
                ef2 = one_sent.make_entity_filler(mapped_posi[0], mapped_posi[1])  # make new ef for each arg
                evt2.add_arg(ef2, role=alink.role)
    # --
    self.count += 1
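# -- Note (assumption about the wrapped script): self.semafor_sh is expected to
# follow SEMAFOR's runSemafor.sh interface, i.e.
#   bash runSemafor.sh <input.txt> <output.json> <num_threads>
# writing one json frame analysis per input line, which
# default_json_serializer.load_list() then reads back line by line.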
def _read_results(self, file):
    output = system(f"cat {file} | grep zzzzzfinal", popen=True)
    result_res, result_dict = re.search(r"Result\(([0-9.]+)\): (\{.*\})", output).groups()
    result_res, result_dict = eval(result_res), eval(result_dict)
    return MyResult(result_res, result_dict)
def main(*args):
    conf: RunConf = init_everything(RunConf(), args, add_utils=False, add_nn=False)
    # =====
    # get paths
    RUN_DIR = conf.run_dir
    if RUN_DIR:
        mkdir_p(RUN_DIR, raise_error=True)
        os.chdir(RUN_DIR)  # change to it!!
    SRC_DIR = zglob1(conf.src_dir, check_prefix="..", check_iter=10)
    VOC_DIR = zglob1(conf.voc_dir, check_prefix="..", check_iter=10)
    DATA_DIR = zglob1(conf.dataset.data_dir, check_prefix="..", check_iter=10)
    zlog(f"RUN with RUN={RUN_DIR}, SRC={SRC_DIR}, VOC={VOC_DIR}, DATA={DATA_DIR}")
    # =====
    # modes
    dataset_choice = conf.dataset._choice
    is_pb, is_fn = [dataset_choice.startswith(z) for z in ["pb", "fn"]]
    assert is_pb or is_fn
    # =====
    # options
    # --
    # base ones
    base_opt = "conf_output:_conf"
    # eval
    if is_pb:
        base_opt += f" eval_conf:pb"
    elif is_fn:
        base_opt += f" dict_frame_file:{DATA_DIR}/{conf.dataset.frame_file}"
        base_opt += f" eval_conf:fn eval_conf.frame_file:{DATA_DIR}/{conf.dataset.frame_file}"  # eval
    # --
    # =====
    # modeling
    if conf.use_word_input:
        base_opt += " ec_word.dim:300 ec_word.drop_rate:0.2 ec_word.init_from_pretrain:1 ec_word.rare_unk_thr:2"  # word
    # base_opt += " ec_posi.dim:512"  # posi?
    # base_opt += " ec_char.dim:50 ec_char.init_scale:5."  # char?
    if conf.use_bert_input:
        base_opt += " ec_bert.dim:768 bert_model:bert-base-cased bert_output_layers:7,8,9"  # bert features?
    base_opt += " eproj_dim:512"
    # --
    if conf.use_rel_posi:
        base_opt += " enc_conf.enc_att.n_layers:2 enc_conf.enc_att.use_posi:0 enc_conf.clip_dist:16"  # enc1
    else:
        base_opt += " enc_conf.enc_att.n_layers:2 enc_conf.enc_att.use_posi:1 enc_conf.clip_dist:0"  # enc1
    # base_opt += " enc_conf.enc_tatt.n_layers:2 enc_conf.enc_tatt.use_posi:1"  # enc1
    # base_opt += " enc_conf.enc_rnn.n_layers:1 enc_conf.enc_hidden:1024"  # enc1
    # --
    # frame
    base_opt += " loss_evt:0.5 pred_evt:1"  # with evts
    base_opt += " evt_conf.cand_label_smoothing:0.05 evt_conf.label_smoothing:0.1"  # label smooth
    base_opt += " evt_conf.lookup_conf.use_emb:0"  # no adding frame embeddings?
    base_opt += " evt_conf.span_conf.sconf.hid_nlayer:1"  # pred scorer?
    if conf.assume_frame:  # no need for the evt module!!
        base_opt += " loss_evt:0 pred_evt:0 eval_conf.weight_frame:0."
    elif conf.assume_trg:  # no need for cand, but still need to identify frame types
        base_opt += " evt_conf.loss_cand:0. evt_conf.loss_use_posi:1 evt_conf.pred_use_posi:1"  # use-posi for evt
        base_opt += " evt_conf.pred_addition_non_score:-100000."  # NEGINF-non
        if is_fn:  # further use cons for fn
            base_opt += f" evt_cons_lex_file:{VOC_DIR}/cons_lex_{dataset_choice}.json evt_conf.pred_use_cons:1 evt_conf.pred_use_lu:1 evt_conf.loss_use_cons:0 evt_conf.loss_use_lu:0"  # cons & use-lu for evt
    else:  # evt_conf -> direct
        base_opt += " evt_conf.loss_cand:1.0 evt_conf.loss_lab:1.0"  # loss_cand
        base_opt += " evt_conf.span_train_sample_rate:1.0 evt_conf.span_topk_rate:1.0 evt_conf.span_train_sample:1"  # some rates
        # --
        if is_pb:  # lab is aux for pb
            base_opt += " evt_conf.loss_lab:0.5 evt_conf.pred_score_prune:0. evt_conf.pred_addition_non_score:-100000."
        elif is_fn:  # lab is essential for fn
            base_opt += " loss_evt:1 evt_conf.loss_cand:0.5 evt_conf.span_train_sample_rate:0.33 evt_conf.span_topk_rate:0.4 evt_conf.span_train_sample:1"
    # --
    if conf.no_frame_label:
        base_opt += " evt_conf.loss_lab:0. evt_conf.pred_score_prune:0. evt_conf.pred_addition_non_score:-100000."
    # --
    # arg
    base_opt += " arg_use_finput:0"
    base_opt += f" fenc_conf.enc_att.n_layers:8 fenc_conf.clip_dist:{16 if conf.use_rel_posi else 0}"  # fenc
    # base_opt += " fenc_conf.enc_tatt.n_layers:6"  # fenc
    # base_opt += " fenc_conf.enc_rnn.n_layers:3 fenc_conf.enc_hidden:1024"  # enc1
    base_opt += " loss_arg:1. pred_arg:1"  # with args
    base_opt += " arg_conf.label_smoothing:0.1"  # label smooth
    if conf.arg_mode in ["span", "head"]:  # arg_conf -> direct
        base_opt += " arg_conf.loss_cand:0.5"  # loss_cand
        # base_opt += " arg_conf.span_train_sample_rate:0.33 arg_conf.span_topk_rate:0.4"  # some rates
        base_opt += " arg_conf.span_topk_rate:1. arg_conf.span_topk_count:10 arg_conf.span_train_sample:0"  # some rates
        base_opt += " arg_conf.loss_weight_non:1."  # less penalizing this?
        base_opt += " arg_conf.pp_check_more:1"  # check non-overlapping
        if conf.arg_mode == "span":
            base_opt += " arg_conf.max_width:30 arg_conf.softhead_topk:5 arg_conf.pred_non_overlapping:1"  # span
        elif conf.arg_mode == "head":
            base_opt += " arg_conf.core_span_mode:shead arg_conf.max_width:1"  # head
            # extender
            base_opt += " arg_conf.loss_ext:0.5 arg_conf.pred_ext:1 arg_conf.ext_use_finput:0"
            base_opt += f" arg_conf.ext_conf.eenc_conf.enc_att.n_layers:1 arg_conf.ext_conf.eenc_conf.enc_att.aconf.clip_dist:{16 if conf.use_rel_posi else 0}"
        else:
            raise NotImplementedError()
    elif conf.arg_mode == "soft":
        base_opt += " arg_conf:soft"
        base_opt += " arg_conf.loss_ext:0.5 arg_conf.pred_ext:1 arg_conf.ext_use_finput:0"
        base_opt += f" arg_conf.ext_conf.eenc_conf.enc_att.n_layers:1 arg_conf.ext_conf.eenc_conf.enc_att.aconf.clip_dist:{16 if conf.use_rel_posi else 0}"
        base_opt += " arg_conf.pp_check_more:1"
    elif conf.arg_mode in ["anchor", "anchor2"]:
        base_opt += " arg_conf:anchor"
        if conf.arg_mode == "anchor2":  # yet another head mode!
            base_opt += " arg_conf.core_span_mode:shead"
        base_opt += " arg_conf.loss_ext:0.5 arg_conf.pred_ext:1 arg_conf.ext_use_finput:0"
        base_opt += f" arg_conf.ext_conf.eenc_conf.enc_att.n_layers:1 arg_conf.ext_conf.eenc_conf.enc_att.aconf.clip_dist:{16 if conf.use_rel_posi else 0}"
        base_opt += " arg_conf.pp_check_more:1"
    elif conf.arg_mode in ["seq", "seq0"]:  # arg_conf -> seq
        base_opt += " arg_conf:seq arg_conf.seq_scheme:BIO"  # use seq mode!
        base_opt += " arg_conf.loss_weight_non:1."  # less penalizing this?
        # --
        if conf.arg_mode == "seq":
            base_opt += " arg_conf.beam_k:150 arg_conf.use_bigram:0 arg_conf.pred_use_seq_cons:1"  # viterbi with constraints
            if conf.arg_seq_mod == "crf":  # crf-mode
                base_opt += " arg_conf.loss_mode:crf arg_conf.use_bigram:1 arg_conf.local_normalize:0"
        elif conf.arg_mode == "seq0":  # greedy mode: no crf and no viterbi
            base_opt += " arg_conf.pred_use_seq_cons:0 arg_conf.loss_mode:mle arg_conf.use_bigram:0 arg_conf.local_normalize:1"
        else:
            raise NotImplementedError()
    else:
        raise NotImplementedError()
    # --
    # =====
    # training
    base_opt += " ema_decay:0. ema_update_step:1"  # ema
    if 1:
        UPE = 1000  # 1000 updates as one epoch
        base_opt += " lrate.val:0.0002 anneal_times:10 anneal_patience:10 lrate.m:0.75"
        base_opt += f" valid_ufreq:{UPE} valid_epoch:0 max_uidx:{UPE*150} lrate_warmup_uidx:{8*UPE} lrate_decrease_alpha:0."
        if conf.use_rel_posi:
            base_opt += " train_count_mode:ftok train_batch_size:4096 accu_batch:1"  # actually bs=bs*accu
            base_opt += " test_count_mode:ftok test_batch_size:2048"
        else:
            base_opt += " train_count_mode:ftok train_batch_size:4096 accu_batch:1"  # actually bs=bs*accu
            base_opt += " test_count_mode:ftok test_batch_size:2048"
        base_opt += " df_hdrop:0.2"  # general dropout
    else:  # possibly for rnn
        base_opt += " lrate.val:0.002 anneal_times:10 anneal_patience:10"
        base_opt += " train_count_mode:frame max_eidx:100 train_batch_size:32"
        base_opt += " df_hdrop:0.33"  # general dropout
    if is_pb:
        base_opt += " train_skip_noevt_rate:0.0"
    elif is_fn:
        base_opt += " train_skip_noevt_rate:1.0"  # skip sents where no targets!
    # data
    base_opt += " " + conf.dataset.get_data_str(DATA_DIR, conf.do_ms_train)
    base_opt += f" pretrain_wv_file:{VOC_DIR}/hits_{dataset_choice}.vec pretrain_scale:10."  # filtered pretrain file
    # nn
    base_opt += f" nn.device:0 nn.random_seed:9347{conf.cur_run} nn.random_cuda_seed:9349{conf.cur_run}"
    # =====
    # note: base_opt is only for training!!
    _L_PRE = conf.log_prefix
    DEBUG_OPTION = "-m pdb" if conf.debug else ""
    TRAIN_CMD = f"CUDA_VISIBLE_DEVICES={conf.rgpu} PYTHONPATH={SRC_DIR}:$PYTHONPATH python3 {DEBUG_OPTION} -m msp2.tasks.zsfp.main.train {base_opt} log_file:{_L_PRE}_train {conf.train_extras}"
    # --
    TEST_CMD = f"CUDA_VISIBLE_DEVICES={conf.rgpu} PYTHONPATH={SRC_DIR}:$PYTHONPATH python3 {DEBUG_OPTION} -m msp2.tasks.zsfp.main.test {conf.conf_output} log_file:{_L_PRE}_test {conf.test_extras}"
    # --
    if conf.do_train:
        system(TRAIN_CMD, pp=True)
    # --
    if conf.do_test:
        system(TEST_CMD, pp=True)
    # --
    if conf.do_test_all:
        for tfile in conf.dataset.all_dt_files:
            _TMP_CMD = f"CUDA_VISIBLE_DEVICES={conf.rgpu} PYTHONPATH={SRC_DIR}:$PYTHONPATH python3 {DEBUG_OPTION} -m msp2.tasks.zsfp.main.test {conf.conf_output} test:{DATA_DIR}/{tfile} output:{conf.out_prefix}.{tfile} log_file:{_L_PRE}.{tfile} test_extra_pretrain_wv_files:{VOC_DIR}/hits_{dataset_choice}.vec {conf.test_extras}"
            system(_TMP_CMD, pp=True)
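# -- Hedged usage sketch (an assumption: the script filename is illustrative,
# but the key:value flags mirror the RunConf fields referenced above, such as
# run_dir, rgpu, do_train, do_test and cur_run):
#   python3 run.py run_dir:run_pb0 rgpu:0 do_train:1 do_test:1 cur_run:1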
#
# prepare data for conll05
# reference: https://github.com/strubell/preprocess-conll05

import sys
import os
from typing import List
from msp2.utils import system, zlog, zglob

# --
my_system = lambda *args, **kwargs: system(*args, **kwargs, pp=True)
# --
# first collect the data
"""
# -----
# step0: data & paths
wget https://www.cs.upc.edu/~srlconll/conll05st-release.tar.gz
tar -xzvf conll05st-release.tar.gz
wget https://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz
tar -xzvf conll05st-tests.tar.gz
wget https://www.cs.upc.edu/~srlconll/srlconll-1.1.tgz
tar -xzvf srlconll-1.1.tgz
# --
SRLCONLL="`pwd`/srlconll-1.1"
CONLL05="`pwd`/conll05st-release"
PTB3="`pwd`/TREEBANK_3"
export PERL5LIB=$SRLCONLL/lib:$PERL5LIB
"""