Example #1
def __init__(self,
             path: str,
             dir_file_f: Callable = (lambda x: x.id),
             dir_file_suffix=".json",
             end="\n"):
    super().__init__()
    self.path = path
    self.dir_file_f = dir_file_f
    self.dir_file_suffix = dir_file_suffix
    self.end = end
    mkdir_p(path, True)  # create the output directory if it does not already exist
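All four examples rely on the project's mkdir_p helper to make sure an output directory exists before anything is written into it. Its implementation is not shown on this page; the snippet below is only a minimal sketch of what such an "mkdir -p"-style helper typically does, and the raise_error flag mirrors the keyword seen in Examples #2 and #4 (its exact semantics here are an assumption).

import os

def mkdir_p(path: str, raise_error: bool = False):
    # minimal sketch, not the zmsp implementation: create `path` and any
    # missing parents, treating an empty path as a no-op
    if path and not os.path.isdir(path):
        try:
            os.makedirs(path, exist_ok=True)
        except OSError:
            if raise_error:
                raise  # surface the failure only when the caller asks for it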
Example #2
def execute(self, env_prefix: str):
    conf = self.conf
    # first prepare the run directory, if one is given
    RUN_DIR = conf.run_dir
    if RUN_DIR:
        mkdir_p(RUN_DIR, raise_error=True)
        # os.chdir(RUN_DIR)  # optionally change into it
    else:
        conf.run_dir = "."
    # update conf in place: resolve src_dir and make it absolute
    conf.src_dir = zglob1(conf.src_dir, check_prefix="..", check_iter=10)
    conf.src_dir = os.path.abspath(conf.src_dir)
    conf._env_prefix = env_prefix
    # --
    self.run()
    res = self.get_result()
    return res
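Example #2 (and Example #4 below) also calls zglob1 to locate a directory that may sit one or more levels above the current working directory. The real function belongs to the project's utilities and is not reproduced here; the following is a hypothetical sketch that assumes check_prefix is prepended up to check_iter times until a glob pattern matches, which is consistent with how it is called in these examples but not confirmed by them.

import glob
import os

def zglob1(pattern: str, check_prefix: str = "..", check_iter: int = 10) -> str:
    # hypothetical sketch: try the pattern as-is, then keep prefixing it with
    # check_prefix (e.g. "../", "../../", ...) until a glob match is found
    cur = pattern
    for _ in range(check_iter + 1):
        hits = glob.glob(cur)
        if hits:
            return hits[0]
        cur = os.path.join(check_prefix, cur)
    return pattern  # nothing matched; fall back to the original pattern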
Example #3
def save(self, prefix="./"):
    fname = prefix + "zsfp.voc.pkl"
    mkdir_p(os.path.dirname(fname))  # make the target directory if it is not already there
    default_pickle_serializer.to_file([self.vocabs, self.embeds], fname)
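Example #3 shows the common save idiom in these snippets: build the target filename from a caller-supplied prefix, create its parent directory, then serialize. One subtlety is that a prefix with no directory part (for instance a bare filename prefix) makes os.path.dirname return an empty string, so the directory helper has to accept "" as a no-op, as the sketch after Example #1 does. A quick standalone illustration of that edge case:

import os

for prefix in ["./", "run1/models/", "zsfp."]:
    fname = prefix + "zsfp.voc.pkl"
    # prints '.', 'run1/models', and '' respectively
    print(repr(os.path.dirname(fname)))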
Example #4
File: go.py Project: zzsfornlp/zmsp
def main(*args):
    conf: RunConf = init_everything(RunConf(),
                                    args,
                                    add_utils=False,
                                    add_nn=False)
    # =====
    # get paths
    RUN_DIR = conf.run_dir
    if RUN_DIR:
        mkdir_p(RUN_DIR, raise_error=True)
        os.chdir(RUN_DIR)  # change into the run directory
    SRC_DIR = zglob1(conf.src_dir, check_prefix="..", check_iter=10)
    VOC_DIR = zglob1(conf.voc_dir, check_prefix="..", check_iter=10)
    DATA_DIR = zglob1(conf.dataset.data_dir, check_prefix="..", check_iter=10)
    zlog(
        f"RUN with RUN={RUN_DIR}, SRC={SRC_DIR}, VOC={VOC_DIR}, DATA={DATA_DIR}"
    )
    # =====
    # modes
    dataset_choice = conf.dataset._choice
    is_pb, is_fn = [dataset_choice.startswith(z) for z in ["pb", "fn"]]
    assert is_pb or is_fn
    # =====
    # options
    # --
    # base ones
    base_opt = "conf_output:_conf"
    # eval
    if is_pb:
        base_opt += f" eval_conf:pb"
    elif is_fn:
        base_opt += f" dict_frame_file:{DATA_DIR}/{conf.dataset.frame_file}"
        base_opt += f" eval_conf:fn eval_conf.frame_file:{DATA_DIR}/{conf.dataset.frame_file}"  # eval
    # --
    # =====
    # modeling
    if conf.use_word_input:
        base_opt += " ec_word.dim:300 ec_word.drop_rate:0.2 ec_word.init_from_pretrain:1 ec_word.rare_unk_thr:2"  # word
    # base_opt += " ec_posi.dim:512"  # posi?
    # base_opt += " ec_char.dim:50 ec_char.init_scale:5."  # char?
    if conf.use_bert_input:
        base_opt += " ec_bert.dim:768 bert_model:bert-base-cased bert_output_layers:7,8,9"  # bert features?
    base_opt += " eproj_dim:512"  # --
    if conf.use_rel_posi:
        base_opt += " enc_conf.enc_att.n_layers:2 enc_conf.enc_att.use_posi:0 enc_conf.clip_dist:16"  # enc1
    else:
        base_opt += " enc_conf.enc_att.n_layers:2 enc_conf.enc_att.use_posi:1 enc_conf.clip_dist:0"  # enc1
    # base_opt += " enc_conf.enc_tatt.n_layers:2 enc_conf.enc_tatt.use_posi:1"  # enc1
    # base_opt += " enc_conf.enc_rnn.n_layers:1 enc_conf.enc_hidden:1024"  # enc1
    # --
    # frame
    base_opt += " loss_evt:0.5 pred_evt:1"  # with evts
    base_opt += " evt_conf.cand_label_smoothing:0.05 evt_conf.label_smoothing:0.1"  # label smooth
    base_opt += " evt_conf.lookup_conf.use_emb:0"  # no adding frame embeddings?
    base_opt += " evt_conf.span_conf.sconf.hid_nlayer:1"  # pred scorer?
    if conf.assume_frame:  # no need for the evt module!!
        base_opt += " loss_evt:0 pred_evt:0 eval_conf.weight_frame:0."
    elif conf.assume_trg:  # no need for cand, but still need to identify frame types
        base_opt += " evt_conf.loss_cand:0. evt_conf.loss_use_posi:1 evt_conf.pred_use_posi:1"  # use-posi for evt
        base_opt += " evt_conf.pred_addition_non_score:-100000."  # NEGINF-non
        if is_fn:  # further use cons for fn
            base_opt += f" evt_cons_lex_file:{VOC_DIR}/cons_lex_{dataset_choice}.json evt_conf.pred_use_cons:1 evt_conf.pred_use_lu:1 evt_conf.loss_use_cons:0 evt_conf.loss_use_lu:0"  # cons & use-lu for evt
    else:
        # evt_conf -> direct
        base_opt += " evt_conf.loss_cand:1.0 evt_conf.loss_lab:1.0"  # loss_cand
        base_opt += " evt_conf.span_train_sample_rate:1.0 evt_conf.span_topk_rate:1.0 evt_conf.span_train_sample:1"  # some rates
        # --
        if is_pb:  # lab is aux for pb
            base_opt += " evt_conf.loss_lab:0.5 evt_conf.pred_score_prune:0. evt_conf.pred_addition_non_score:-100000."
        elif is_fn:  # lab is essential for fn
            base_opt += " loss_evt:1 evt_conf.loss_cand:0.5 evt_conf.span_train_sample_rate:0.33 evt_conf.span_topk_rate:0.4 evt_conf.span_train_sample:1"
        # --
        if conf.no_frame_label:
            base_opt += " evt_conf.loss_lab:0. evt_conf.pred_score_prune:0. evt_conf.pred_addition_non_score:-100000."
    # --
    # arg
    base_opt += " arg_use_finput:0"
    base_opt += f" fenc_conf.enc_att.n_layers:8 fenc_conf.clip_dist:{16 if conf.use_rel_posi else 0}"  # fenc
    # base_opt += " fenc_conf.enc_tatt.n_layers:6"  # fenc
    # base_opt += " fenc_conf.enc_rnn.n_layers:3 fenc_conf.enc_hidden:1024"  # enc1
    base_opt += " loss_arg:1. pred_arg:1"  # with args
    base_opt += " arg_conf.label_smoothing:0.1"  # label smooth
    if conf.arg_mode in ["span", "head"]:
        # arg_conf -> direct
        base_opt += " arg_conf.loss_cand:0.5"  # loss_cand
        # base_opt+=" arg_conf.span_train_sample_rate:0.33 arg_conf.span_topk_rate:0.4"  # some rates
        base_opt += " arg_conf.span_topk_rate:1. arg_conf.span_topk_count:10 arg_conf.span_train_sample:0"  # some rates
        base_opt += " arg_conf.loss_weight_non:1."  # less penalizing this?
        base_opt += " arg_conf.pp_check_more:1"  # check non-overlapping
        if conf.arg_mode == "span":
            base_opt += " arg_conf.max_width:30 arg_conf.softhead_topk:5 arg_conf.pred_non_overlapping:1"  # span
        elif conf.arg_mode == "head":
            base_opt += " arg_conf.core_span_mode:shead arg_conf.max_width:1"  # head
            # extender
            base_opt += " arg_conf.loss_ext:0.5 arg_conf.pred_ext:1 arg_conf.ext_use_finput:0"
            base_opt += f" arg_conf.ext_conf.eenc_conf.enc_att.n_layers:1 arg_conf.ext_conf.eenc_conf.enc_att.aconf.clip_dist:{16 if conf.use_rel_posi else 0}"
        else:
            raise NotImplementedError()
    elif conf.arg_mode == "soft":
        base_opt += " arg_conf:soft"
        base_opt += " arg_conf.loss_ext:0.5 arg_conf.pred_ext:1 arg_conf.ext_use_finput:0"
        base_opt += f" arg_conf.ext_conf.eenc_conf.enc_att.n_layers:1 arg_conf.ext_conf.eenc_conf.enc_att.aconf.clip_dist:{16 if conf.use_rel_posi else 0}"
        base_opt += " arg_conf.pp_check_more:1"
    elif conf.arg_mode in ["anchor", "anchor2"]:
        base_opt += " arg_conf:anchor"
        if conf.arg_mode == "anchor2":  # yet another head mode!
            base_opt += " arg_conf.core_span_mode:shead"
        base_opt += " arg_conf.loss_ext:0.5 arg_conf.pred_ext:1 arg_conf.ext_use_finput:0"
        base_opt += f" arg_conf.ext_conf.eenc_conf.enc_att.n_layers:1 arg_conf.ext_conf.eenc_conf.enc_att.aconf.clip_dist:{16 if conf.use_rel_posi else 0}"
        base_opt += " arg_conf.pp_check_more:1"
    elif conf.arg_mode in ["seq", "seq0"]:
        # arg_conf -> seq
        base_opt += " arg_conf:seq arg_conf.seq_scheme:BIO"  # use seq mode!
        base_opt += " arg_conf.loss_weight_non:1."  # less penalizing this?
        # --
        if conf.arg_mode == "seq":
            base_opt += " arg_conf.beam_k:150 arg_conf.use_bigram:0 arg_conf.pred_use_seq_cons:1"  # viterbi with constraints
            if conf.arg_seq_mod == "crf":  # crf-mode
                base_opt += " arg_conf.loss_mode:crf arg_conf.use_bigram:1 arg_conf.local_normalize:0"
        elif conf.arg_mode == "seq0":  # greedy mode: no crf and no viterbi
            base_opt += " arg_conf.pred_use_seq_cons:0 arg_conf.loss_mode:mle arg_conf.use_bigram:0 arg_conf.local_normalize:1"
        else:
            raise NotImplementedError()
    else:
        raise NotImplementedError()
    # --
    # =====
    # training
    base_opt += " ema_decay:0. ema_update_step:1"  # ema
    if True:
        UPE = 1000  # 1000 update as one epoch
        base_opt += " lrate.val:0.0002 anneal_times:10 anneal_patience:10 lrate.m:0.75"
        base_opt += f" valid_ufreq:{UPE} valid_epoch:0 max_uidx:{UPE*150} lrate_warmup_uidx:{8*UPE} lrate_decrease_alpha:0."
        # same batching settings whether or not use_rel_posi is set
        base_opt += " train_count_mode:ftok train_batch_size:4096 accu_batch:1"  # actually bs=bs*accu
        base_opt += " test_count_mode:ftok test_batch_size:2048"
        base_opt += " df_hdrop:0.2"  # general dropout
    else:  # possibly for rnn
        base_opt += " lrate.val:0.002 anneal_times:10 anneal_patience:10"
        base_opt += " train_count_mode:frame max_eidx:100 train_batch_size:32"
        base_opt += " df_hdrop:0.33"  # general dropout
    if is_pb:
        base_opt += " train_skip_noevt_rate:0.0"
    elif is_fn:
        base_opt += " train_skip_noevt_rate:1.0"  # skip sents where no targets!
    # data
    base_opt += " " + conf.dataset.get_data_str(DATA_DIR, conf.do_ms_train)
    base_opt += f" pretrain_wv_file:{VOC_DIR}/hits_{dataset_choice}.vec pretrain_scale:10."  # filtered pretrain file
    # nn
    base_opt += f" nn.device:0 nn.random_seed:9347{conf.cur_run} nn.random_cuda_seed:9349{conf.cur_run}"
    # =====
    # note: base_opt is only for training!!
    _L_PRE = conf.log_prefix
    DEBUG_OPTION = "-m pdb" if conf.debug else ""
    TRAIN_CMD = f"CUDA_VISIBLE_DEVICES={conf.rgpu} PYTHONPATH={SRC_DIR}:$PYTHONPATH python3 {DEBUG_OPTION} -m msp2.tasks.zsfp.main.train {base_opt} log_file:{_L_PRE}_train {conf.train_extras}"
    # --
    TEST_CMD = f"CUDA_VISIBLE_DEVICES={conf.rgpu} PYTHONPATH={SRC_DIR}:$PYTHONPATH python3 {DEBUG_OPTION} -m msp2.tasks.zsfp.main.test {conf.conf_output} log_file:{_L_PRE}_test {conf.test_extras}"
    # --
    if conf.do_train:
        system(TRAIN_CMD, pp=True)
    # --
    if conf.do_test:
        system(TEST_CMD, pp=True)
    # --
    if conf.do_test_all:
        for tfile in conf.dataset.all_dt_files:
            _TMP_CMD = f"CUDA_VISIBLE_DEVICES={conf.rgpu} PYTHONPATH={SRC_DIR}:$PYTHONPATH python3 {DEBUG_OPTION} -m msp2.tasks.zsfp.main.test {conf.conf_output} test:{DATA_DIR}/{tfile} output:{conf.out_prefix}.{tfile} log_file:{_L_PRE}.{tfile} test_extra_pretrain_wv_files:{VOC_DIR}/hits_{dataset_choice}.vec {conf.test_extras}"
            system(_TMP_CMD, pp=True)
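Example #4 assembles the whole training configuration as one space-separated string of key:value pairs and splices it into a shell command; dotted keys such as evt_conf.loss_lab address nested configuration fields. How the project actually parses this string is not shown here, but a rough standalone sketch of the convention (illustrative only, not the msp2 loader) could look like this:

def parse_opt_string(opt_str: str) -> dict:
    # illustrative parser: split on whitespace, split each item at the first
    # colon, and turn dotted keys into nested dictionaries
    conf = {}
    for item in opt_str.split():
        key, _, value = item.partition(":")
        node = conf
        parts = key.split(".")
        for p in parts[:-1]:
            node = node.setdefault(p, {})
        node[parts[-1]] = value
    return conf

# e.g. parse_opt_string("loss_evt:0.5 evt_conf.label_smoothing:0.1")
# -> {'loss_evt': '0.5', 'evt_conf': {'label_smoothing': '0.1'}}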