Code example #1
 def _run_end(self):
     x = self.test_recorder.summary()
     res = self.res_manager.end()
     x.update(res)
     MltDevResult.calc_acc(x)
     Helper.printd(x, sep=" || ")
     return x
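
A note on Helper.printd, which this and most of the following examples use to report results: its implementation is not shown here, but from the call sites it takes a dict and a separator. A minimal illustrative stand-in (print_dict and its signature are assumptions, not the zmsp API) might be:

def print_dict(d, sep="\n"):
    # hypothetical stand-in for Helper.printd: join "key: value" pairs
    # with the given separator and print the result
    print(sep.join(f"{k}: {v}" for k, v in d.items()))

print_dict({"loss": 0.12, "acc": 0.97}, sep=" || ")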
Code example #2
 def _run_train_report(self):
     x = self.train_recorder.summary()
     y = GLOBAL_RECORDER.summary()
     if len(y) > 0:
         x.update(y)
     MltDevResult.calc_acc(x)
     Helper.printd(x, " || ")
     return RecordResult(x, score=x.get("res", 0.))
Code example #3
File: run.py Project: ValentinaPy/zmsp
 def _run_train_report(self):
     x = self.train_recorder.summary()
     y = GLOBAL_RECORDER.summary()
     # todo(warn): get loss/tok
     # todo(note): too complex to div here, only accumulating the sums.
     # x["loss_tok"] = x.get("loss_sum", 0.)/x["tok"]
     if len(y) > 0:
         x.update(y)
     Helper.printd(x, " || ")
     return RecordResult(x)
Code example #4
File: run.py Project: ValentinaPy/zmsp
 def _run_train_report(self):
     x = self.train_recorder.summary()
     y = GLOBAL_RECORDER.summary()
     # todo(warn): get loss/tok
     x["loss_tok"] = x.get("loss_sum", 0.) / x["tok"]
     if len(y) > 0:
         x.update(y)
     # zlog(x, "report")
     Helper.printd(x, " || ")
     return RecordResult(x)
Code example #5
File: testA.py Project: ValentinaPy/zmsp
def main(args):
    conf: DecodeAConf = init_everything(args, DecodeAConf)
    dconf, mconf = conf.dconf, conf.mconf
    iconf = mconf.iconf
    # vocab
    vpack = IEVocabPackage.build_by_reading(conf)
    # prepare data
    test_streamer = get_data_reader(dconf.test,
                                    dconf.input_format,
                                    dconf.use_label0,
                                    dconf.noef_link0,
                                    dconf.aux_repr_test,
                                    max_evt_layers=dconf.max_evt_layers)
    # model
    model = build_model(conf.model_type, conf, vpack)
    model.load(dconf.model_load_name)
    # use bert?
    if dconf.use_bert:
        bmodel = get_berter(dconf.bconf)
        test_streamer = BerterDataAuger(test_streamer, bmodel, "aux_repr")
    # finally prepare iter (No Cache!!, actually no batch_stream)
    test_inst_preparer = model.get_inst_preper(False)
    test_iter = index_stream(test_streamer, vpack, False, False,
                             test_inst_preparer)
    # =====
    # run
    decoder = ArgAugDecoder(conf.aconf, model)
    all_docs = []
    stat_recorder = StatRecorder(False)
    with Timer(tag="Decode", info="Decoding", print_date=True):
        with zopen(dconf.output_file, 'w') as fd:
            data_writer = get_data_writer(fd, dconf.output_format)
            for one_doc in test_iter:
                info = decoder.decode(one_doc)
                stat_recorder.record(info)
                if conf.verbose:
                    zlog(f"Decode one doc, id={one_doc.doc_id} info={info}")
                # release resources
                for one_sent in one_doc.sents:
                    one_sent.extra_features[
                        "aux_repr"] = None  # todo(note): special name!
                # write output
                data_writer.write([one_doc])
                #
                all_docs.append(one_doc)
    if conf.verbose:
        zlog(f"Finish decoding, overall: {stat_recorder.summary()}")
    # eval?
    if conf.do_eval:
        evaler = MyIEEvaler(MyIEEvalerConf())
        result = evaler.eval(all_docs, all_docs)
        Helper.printd(result)
    zlog("The end.")
Code example #6
def main():
    utils.init("zlog", 1234)
    z = StatRecorder(True)
    times = Random.randint(100)
    for _ in range(times):
        with z.go():
            z.record_kv("distr_n", Random.randint(10))
    Helper.printd(z.summary(), "\n")
    #
    cc = Conf0()
    cc.update_from_args(["a:10", "y:www", "z.x:1"])
    pass
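
This example exercises StatRecorder's interface: go() opens a recording context, record_kv accumulates a keyed value, and summary() reports the totals. The real zmsp class does more, but a minimal sketch under those assumptions (MiniStatRecorder is illustrative only) could look like:

from collections import defaultdict
from contextlib import contextmanager

class MiniStatRecorder:
    # illustrative stand-in for StatRecorder, not the zmsp implementation
    def __init__(self):
        self.counts = defaultdict(float)
        self.num_records = 0

    @contextmanager
    def go(self):
        self.num_records += 1  # count one recorded unit per with-block
        yield self

    def record_kv(self, key, value):
        self.counts[key] += value  # accumulate per-key sums

    def summary(self):
        return {"num_records": self.num_records, **dict(self.counts)}

z = MiniStatRecorder()
for _ in range(5):
    with z.go():
        z.record_kv("distr_n", 3)
print(z.summary())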
Code example #7
File: model.py Project: ValentinaPy/zmsp
 def __init__(self, conf: MyIEModelConf, vpack: VocabPackage):
     self.conf = conf
     self.vpack = vpack
     tconf = conf.tconf
     # ===== Vocab =====
     # ===== Model =====
     self.pc = BK.ParamCollection(True)
     # bottom-part: input + encoder
     self.bter: MyIEBT = self.build_encoder()
     self.lexi_output_dim = self.bter.emb_output_dim
     self.enc_ef_output_dim, self.enc_evt_output_dim = self.bter.get_output_dims()[0]
     self.enc_lrf_sv = ScheduledValue("enc_lrf", tconf.enc_lrf)
     self.pc.optimizer_set(tconf.enc_optim.optim, self.enc_lrf_sv, tconf.enc_optim,
                           params=self.bter.get_parameters(), check_repeat=True, check_full=True)
     # upper-parts: the decoders
     self.decoders: List = self.build_decoders()
     self.dec_lrf_sv = ScheduledValue("dec_lrf", tconf.dec_lrf)
     self.pc.optimizer_set(tconf.dec_optim.optim, self.dec_lrf_sv, tconf.dec_optim,
                           params=Helper.join_list(z.get_parameters() for z in self.decoders),
                           check_repeat=True, check_full=True)
     # ===== For training =====
     # schedule values
     self.margin = ScheduledValue("margin", tconf.margin)
     self._scheduled_values = [self.margin, self.enc_lrf_sv, self.dec_lrf_sv]
     # for refreshing dropouts
     self.previous_refresh_training = True
     # =====
     # others
     self.train_constrain_evt_types = {"": None, "kbp17": KBP17_TYPES}[conf.tconf.constrain_evt_types]
     self.test_constrain_evt_types = {"": None, "kbp17": KBP17_TYPES}[conf.iconf.constrain_evt_types]
Code example #8
File: vocab.py Project: zzsfornlp/zmsp
 def filter_vals(word_vals, word_filter=(lambda ww, rank, val: True)):
     ranked_list = Helper.rank_key(word_vals)
     truncated_vals = {}
     for ii, ww in enumerate(ranked_list):
         rank, val = ii+1, word_vals[ww]
         if word_filter(ww, rank, val):
             truncated_vals[ww] = val
     return truncated_vals
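
filter_vals iterates over Helper.rank_key(word_vals); judging from how ranking_vals (code example #16) uses the same call when sort_vals is true, rank_key presumably returns the keys ordered from highest to lowest value. A hedged one-function sketch (not the zmsp source):

def rank_key(word_vals):
    # assumed behaviour: keys sorted by descending value
    return sorted(word_vals.keys(), key=lambda w: word_vals[w], reverse=True)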
Code example #9
File: analyzer.py Project: zzsfornlp/zmsp
 def do_join(self, insts_target: str, jcode: str) -> List:
     vs = self.vars
     _ff = compile(jcode, "", "eval")
     insts = self.get_and_check_type(insts_target, list)
     ret = [eval(_ff) for d in insts]
     ret = Helper.join_list(ret)
     zlog(f"Join-list by {jcode}: from {len(insts)} to {len(ret)}")
     return ret
Code example #10
File: process_train.py Project: zzsfornlp/zmsp
 def _fb_batch(self, insts):
     num_splits = self.rconf.split_batch
     loss_factor = 1. / num_splits
     splitted_insts = Helper.split_list(insts, num_splits)
     with self.train_recorder.go():
         for one_insts in splitted_insts:
             res = self._run_fb(one_insts, loss_factor)
             self.train_recorder.record(res)
     self._tp.iidx += self.batch_size_f(insts)
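
Here Helper.split_list(insts, num_splits) divides one batch into num_splits sub-batches whose losses are scaled by loss_factor and accumulated before the parameter update. A rough sketch of such a splitter, assuming consecutive and roughly equal chunks (split_list below is illustrative, not the project's code):

import math

def split_list(items, n):
    # assumed behaviour: cut `items` into n consecutive, roughly equal chunks
    chunk = math.ceil(len(items) / n)
    return [items[i * chunk:(i + 1) * chunk] for i in range(n)]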
Code example #11
File: data.py Project: ValentinaPy/zmsp
    def set_children_info(self,
                          oracle_strategy,
                          label_ranking_dict: Dict = None,
                          free_dist_alpha: float = 0.):
        heads = self.heads.vals
        the_len = len(heads)
        # self.children_set = [set() for _ in range(the_len)]
        self.children_list = [[] for _ in range(the_len)]
        tmp_descendant_list = [None for _ in range(the_len)]
        # exclude root
        for m, h in enumerate(heads[1:], 1):
            # self.children_set[h].add(m)
            self.children_list[h].append(m)  # l2r order
        # re-arrange list order (left -> right)
        if oracle_strategy == "i2o":
            for h in range(the_len):
                self.children_list[h].sort(key=lambda x: -x if x < h else x)
        elif oracle_strategy == "label":
            # todo(warn): only use first level!
            level0_labels = [z.split(":")[0] for z in self.labels.vals]
            for h in range(the_len):
                self.children_list[h].sort(
                    key=lambda x: label_ranking_dict[level0_labels[x]])
        elif oracle_strategy == "n2f":
            self.shuffle_children_n2f()
        elif oracle_strategy == "free":
            self.free_dist_alpha = free_dist_alpha
            self.shuffle_children_free()
        else:
            assert oracle_strategy == "l2r"
            pass
        # todo(+N): does the order of descendant list matter?
        # todo(+N): depth-first or breadth-first? (currently select the latter)
        # recursively get descendant list: do this
        # =====
        def _recursive_add(cur_n):
            cur_children = self.children_list[cur_n]  # List[int]
            for i in cur_children:
                _recursive_add(i)
            new_dlist = [cur_children]
            cur_layer = 0
            while True:
                another_layer = Helper.join_list(
                    tmp_descendant_list[i][cur_layer]
                    if cur_layer < len(tmp_descendant_list[i]) else []
                    for i in cur_children)
                if len(another_layer) == 0:
                    break
                new_dlist.append(another_layer)
                cur_layer += 1
            tmp_descendant_list[cur_n] = new_dlist

        # =====
        _recursive_add(0)
        self.descendant_list = [
            Helper.join_list(tmp_descendant_list[i]) for i in range(the_len)
        ]
Code example #12
File: scorpus.py Project: ValentinaPy/zmsp
 def yield_data(self, files):
     #
     if not isinstance(files, (list, tuple)):
         files = [files]
     #
     cur_num = 0
     for f in files:
         cur_num += 1
         zlog("-----\nDataReader: [#%d] Start reading file %s." %
              (cur_num, f))
         with zopen(f) as fd:
             for z in self._yield_tokens(fd):
                 yield z
         if cur_num % self.report_freq == 0:
             zlog("** DataReader: [#%d] Summary till now:" % cur_num)
             Helper.printd(self.stats)
     zlog("=====\nDataReader: End reading ALL (#%d) ==> Summary ALL:" %
          cur_num)
     Helper.printd(self.stats)
Code example #13
File: test_streamer.py Project: ValentinaPy/zmsp
def main():
    s0 = IterStreamer(range(200))
    s1 = InstCacher(range(200), shuffle=True)
    s2 = InstCacher(
        MultiCatStreamer(
            [IterStreamer(range(100, 200)),
             IterStreamer(range(100))]))
    s3 = BatchArranger(InstCacher(IterStreamer(range(200))), 8, 10, None,
                       lambda x: x == 48, None, lambda x: (x - 24)**2, True)
    #
    nums = set(list(s0))
    for R in range(10):
        assert nums == set(list(s1))
        assert nums == set(list(s2))
        zz = list(s3)
        assert nums == set(Helper.join_list(zz) + [48])
Code example #14
File: data.py Project: ValentinaPy/zmsp
 def _recursive_add(cur_n):
     cur_children = self.children_list[cur_n]  # List[int]
     for i in cur_children:
         _recursive_add(i)
     new_dlist = [cur_children]
     cur_layer = 0
     while True:
         another_layer = Helper.join_list(
             tmp_descendant_list[i][cur_layer]
             if cur_layer < len(tmp_descendant_list[i]) else []
             for i in cur_children)
         if len(another_layer) == 0:
             break
         new_dlist.append(another_layer)
         cur_layer += 1
     tmp_descendant_list[cur_n] = new_dlist
Code example #15
File: vocab.py Project: zzsfornlp/zmsp
 def finish(self, word_filter=(lambda ww, rank, val: True), sort_by_count=True, target_range=DEFAULT_TARGET_RANGE):
     v = self.v
     if sort_by_count:
         v.v, v.final_vals = VocabBuilder.ranking_vals(
             self.counts_, v.pre_list, v.post_list, self.default_val_, True, word_filter=word_filter)
     else:
         tmp_counts_ = OrderedDict([(k, self.counts_[k]) for k in self.keys_])
         v.v, v.final_vals = VocabBuilder.ranking_vals(
             tmp_counts_, v.pre_list, v.post_list, self.default_val_, False, word_filter=word_filter)
     v.final_words = Helper.reverse_idx(v.v)
     printing("Build Vocab %s ok, from %d to %d, as %s." % (v.name, len(self.counts_), len(v), str(v)))
     #
     VocabBuilder._build_check(v)
     VocabBuilder._build_target_range(v, target_range[0], target_range[1])
     VocabBuilder._build_prop(v)
     return v
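
finish() converts the word-to-index dict v.v back into an index-to-word list via Helper.reverse_idx. Assuming the indices form a dense 0..N-1 range (which ranking_vals below guarantees by always assigning len(v) as the next index), a minimal sketch would be:

def reverse_idx(word2idx):
    # assumed behaviour: invert a dense {word: index} dict into a list
    # where position i holds the word mapped to index i
    words = [None] * len(word2idx)
    for w, i in word2idx.items():
        words[i] = w
    return words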
Code example #16
File: vocab.py Project: zzsfornlp/zmsp
 def ranking_vals(word_vals, pre_list, post_list, default_val, sort_vals, word_filter=(lambda ww, rank, val: True)):
     if sort_vals:
         valid_word_list = Helper.rank_key(word_vals)
     else:
         valid_word_list = word_vals.keys()
     #
     truncated_vals = [default_val] * len(pre_list)
     v = dict(zip(pre_list, range(len(pre_list))))
     for ii, ww in enumerate(valid_word_list):
         rank, val = ii+1, word_vals[ww]
         if word_filter(ww, rank, val):
             v[ww] = len(v)
             truncated_vals.append(val)
     for one in post_list:
         v[one] = len(v)
         truncated_vals.append(default_val)
     return v, truncated_vals
Code example #17
File: g2p.py Project: ValentinaPy/zmsp
 def inference_on_batch(self, insts: List[ParseInstance], **kwargs):
     # iconf = self.conf.iconf
     with BK.no_grad_env():
         self.refresh_batch(False)
         # pruning and scores from g1
         valid_mask, go1_pack = self._get_g1_pack(
             insts, self.lambda_g1_arc_testing, self.lambda_g1_lab_testing)
         # encode
         input_repr, enc_repr, jpos_pack, mask_arr = self.bter.run(
             insts, False)
         mask_expr = BK.input_real(mask_arr)
         # decode
         final_valid_expr = self._make_final_valid(valid_mask, mask_expr)
         ret_heads, ret_labels, _, _ = self.dl.decode(
             insts, enc_repr, final_valid_expr, go1_pack, False, 0.)
         # collect the results together
         all_heads = Helper.join_list(ret_heads)
         if ret_labels is None:
             # todo(note): simply get labels from the go1-label classifier; must provide g1parser
             if go1_pack is None:
                 _, go1_pack = self._get_g1_pack(insts, 1., 1.)
             _, go1_label_max_idxes = go1_pack[1].max(
                 -1)  # [bs, slen, slen]
             pred_heads_arr, _ = self.predict_padder.pad(
                 all_heads)  # [bs, slen]
             pred_heads_expr = BK.input_idx(pred_heads_arr)
             pred_labels_expr = BK.gather_one_lastdim(
                 go1_label_max_idxes, pred_heads_expr).squeeze(-1)
             all_labels = BK.get_value(pred_labels_expr)  # [bs, slen]
         else:
             all_labels = np.concatenate(ret_labels, 0)
         # ===== assign, todo(warn): here, the labels are directly original idx, no need to change
         for one_idx, one_inst in enumerate(insts):
             cur_length = len(one_inst) + 1
             one_inst.pred_heads.set_vals(
                 all_heads[one_idx]
                 [:cur_length])  # directly int-val for heads
             one_inst.pred_labels.build_vals(
                 all_labels[one_idx][:cur_length], self.label_vocab)
             # one_inst.pred_par_scores.set_vals(all_scores[one_idx][:cur_length])
         # =====
         # put jpos result (possibly)
         self.jpos_decode(insts, jpos_pack)
         # -----
         info = {"sent": len(insts), "tok": sum(map(len, insts))}
         return info
Code example #18
File: evaler.py Project: zzsfornlp/zmsp
 def eval(self, quiet=True, breakdown=False):
     all_pre_u, all_pre_l, label_pres = self._calc_result(
         self.preds, self.golds)
     all_rec_u, all_rec_l, label_recs = self._calc_result(
         self.golds, self.preds)
     all_f_u = F1Result(all_pre_u, all_rec_u)
     all_f_l = F1Result(all_pre_l, all_rec_l)
     label_fs = {
         k: F1Result(label_pres[k], label_recs[k])
         for k in self.labels
     } if breakdown else {}
     if not quiet:
         zlog(
             f"Overall f1 score for {self.name}: unlabeled {all_f_u}; labeled {all_f_l}"
         )
         zlog("Breakdowns: \n" + Helper.printd_str(label_fs))
     return all_f_u, all_f_l, label_fs
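
The breakdown message is built with Helper.printd_str, which from this call site appears to be the string-returning counterpart of the Helper.printd used in the earlier examples. A hedged sketch (illustrative, not the library's code):

def printd_str(d, sep="\n"):
    # assumed behaviour: same formatting as printd, but return the string
    return sep.join(f"{k}: {v}" for k, v in d.items())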
Code example #19
File: expr.py Project: zzsfornlp/zmsp
 def _combine_recursive_keys(values, bidxes, keys):
     if isinstance(keys, dict):
         ret = {}
         for k in keys:
             ret[k] = SliceManager._combine_recursive_keys(
                 [z[k] for z in values], bidxes, keys[k])
     elif isinstance(keys, set):
         ret = {}
         for k in keys:
             ret[k] = SliceManager._combine_recursive(
                 [z[k] for z in values], bidxes)
     else:
         # direct through
         if keys is None:
             keys = []
         elif not isinstance(keys, Iterable):
             keys = [keys]
         next_values = [Helper.apply_keys(z, keys) for z in values]
         ret = SliceManager._combine_recursive(next_values, bidxes)
     return ret
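
In the fall-through branch, keys is normalized to a list and Helper.apply_keys(z, keys) is applied to each value; the most natural reading is that it indexes into a nested container with each key in turn, with an empty key list returning the value unchanged. A speculative sketch:

def apply_keys(obj, keys):
    # assumed behaviour: successively index into a nested structure,
    # e.g. apply_keys(d, ["a", 0]) -> d["a"][0]; [] returns obj as-is
    for k in keys:
        obj = obj[k]
    return obj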
Code example #20
File: expr.py Project: zzsfornlp/zmsp
 def _arrange_idxes(slices):
     values, bidxes = [], []
     # tmp
     tmp_bidx_bases = [
         0,
     ]
     tmp_id2idx = {}
     for s in slices:
         one_ew, one_sidx = s.ew, s.slice_idx
         ew_id = one_ew.id
         if ew_id not in tmp_id2idx:
             tmp_id2idx[ew_id] = len(values)
             values.append(one_ew.val)
             tmp_bidx_bases.append(one_ew.bsize + tmp_bidx_bases[-1])
         #
         idx_in_vals = tmp_id2idx[ew_id]
         bidxes.append(tmp_bidx_bases[idx_in_vals] + one_sidx)
     # check for perfect match
     if Helper.check_is_range(bidxes, tmp_bidx_bases[-1]):
         bidxes = None
     return values, bidxes
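
Both _arrange_idxes and arange_cache (code example #21) skip the re-indexing work when Helper.check_is_range(bidxes, n) holds, i.e. when the gathered indices are already exactly 0..n-1 in order. A small sketch of that check (names illustrative):

def check_is_range(idxes, length):
    # assumed behaviour: True iff idxes == [0, 1, ..., length-1]
    return len(idxes) == length and all(i == v for i, v in enumerate(idxes))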
Code example #21
File: base_search.py Project: ValentinaPy/zmsp
 def arange_cache(self, bidxes):
     new_bsize = len(bidxes)
     # if the idxes are already fine, then no need to select
     if not Helper.check_is_range(bidxes, self.cur_bsize):
         # mask is on CPU to make assigning easier
         bidxes_ct = BK.input_idx(bidxes, BK.CPU_DEVICE)
         self.scoring_fixed_mask_ct = self.scoring_fixed_mask_ct.index_select(
             0, bidxes_ct)
         self.scoring_mask_ct = self.scoring_mask_ct.index_select(
             0, bidxes_ct)
         self.oracle_mask_ct = self.oracle_mask_ct.index_select(
             0, bidxes_ct)
         # other things are all on target-device (possibly GPU)
         bidxes_device = BK.to_device(bidxes_ct)
         self.enc_repr = self.enc_repr.index_select(0, bidxes_device)
         self.scoring_cache.arange_cache(bidxes_device)
         # oracles
         self.oracle_mask_t = self.oracle_mask_t.index_select(
             0, bidxes_device)
         self.oracle_label_t = self.oracle_label_t.index_select(
             0, bidxes_device)
         # update bsize
         self.update_bsize(new_bsize)
Code example #22
def main(args):
    conf, model, vpack, test_iter = prepare_test(args, AnalyzeConf)
    # make sure the model is order 1 graph model, otherwise cannot run through
    assert isinstance(model, G1Parser) and isinstance(conf.pconf, G1ParserConf)
    # =====
    # helpers
    all_stater = StatRecorder(False)

    def _stat(k, v):
        all_stater.record_kv(k, v)

    # check agreement
    def _agree2(a, b, name):
        agreement = (np.asarray(a) == np.asarray(b))
        num_agree = int(agreement.sum())
        _stat(name, num_agree)

    # do not care about efficiency here!
    step2_pack = []
    for cur_insts in test_iter:
        # score and prune
        valid_mask, arc_score, label_score, mask_expr, marginals = model.prune_on_batch(
            cur_insts, conf.zprune)
        # greedy on raw scores
        greedy_label_scores, greedy_label_mat_idxes = label_score.max(
            -1)  # [*, m, h]
        greedy_all_scores, greedy_arc_idxes = (arc_score +
                                               greedy_label_scores).max(
                                                   -1)  # [*, m]
        greedy_label_idxes = greedy_label_mat_idxes.gather(
            -1, greedy_arc_idxes.unsqueeze(-1)).squeeze(-1)  # [*, m]
        # greedy on marginals (arc only)
        greedy_marg_arc_scores, greedy_marg_arc_idxes = marginals.max(
            -1)  # [*, m]
        entropy_marg = -(marginals *
                         (marginals + 1e-10 *
                          (marginals == 0.).float()).log()).sum(-1)  # [*, m]
        # decode
        model.inference_on_batch(cur_insts)
        # =====
        z = ZObject()
        keys = list(locals().keys())
        for k in keys:
            v = locals()[k]
            try:
                setattr(z, k, v.cpu().detach().numpy())
            except:
                pass
        # =====
        for idx in range(len(cur_insts)):
            one_inst: ParseInstance = cur_insts[idx]
            one_len = len(one_inst) + 1  # [1, len)
            _stat("all_edges", one_len - 1)
            arc_gold = one_inst.heads.vals[1:]
            arc_mst = one_inst.pred_heads.vals[1:]
            arc_gma = z.greedy_marg_arc_idxes[idx][1:one_len]
            # step 1: decoding agreement, how many edges agree: gold, mst-decode, greedy-marginal
            arcs = {"gold": arc_gold, "mst": arc_mst, "gma": arc_gma}
            cmp_keys = sorted(arcs.keys())
            for i in range(len(cmp_keys)):
                for j in range(i + 1, len(cmp_keys)):
                    n1, n2 = cmp_keys[i], cmp_keys[j]
                    _agree2(arcs[n1], arcs[n2], f"{n1}_{n2}")
            # step 2: confidence
            arc_agree = (np.asarray(arc_gold) == np.asarray(arc_mst))
            arc_marginals_mst = z.marginals[idx][range(1, one_len), arc_mst]
            arc_marginals_gold = z.marginals[idx][range(1, one_len), arc_gold]
            arc_entropy = z.entropy_marg[idx][1:one_len]
            for tidx in range(one_len - 1):
                step2_pack.append([
                    int(arc_agree[tidx]),
                    min(1., float(arc_marginals_mst[tidx])),
                    min(1., float(arc_marginals_gold[tidx])),
                    float(arc_entropy[tidx])
                ])
    # step 2: bucket by marginals
    if True:
        NUM_BUCKET = 10
        df = pd.DataFrame(step2_pack,
                          columns=['agree', 'm_mst', 'm_gold', 'entropy'])
        z = df.sort_values(by='m_mst', ascending=False)
        z.to_csv('res.csv')
        for cur_b in range(NUM_BUCKET):
            interval = 1. / NUM_BUCKET
            r0, r1 = cur_b * interval, (cur_b + 1) * interval
            cur_v = df[(df.m_mst >= r0) & ((df.m_mst < r1))]
            zlog(f"#===== [{r0}, {r1}): {cur_v.shape}\n" +
                 str(cur_v.describe()))
    # =====
    d = all_stater.summary(get_v=False, get_str=True)
    Helper.printd(d, "\n\n")
Code example #23
File: data.py Project: ValentinaPy/zmsp
 def pred_events(self):
     return Helper.join_list(x.pred_events for x in self.sents)
Code example #24
File: data.py Project: ValentinaPy/zmsp
 def pred_entity_fillers(self):
     return Helper.join_list(x.pred_entity_fillers for x in self.sents)
Code example #25
 def subword_is_start(self):
     return Helper.join_list(self.cur_is_starts)
Code example #26
 def subword_typeids(self):
     if self.cur_typeids is None:
         return None
     else:
         return Helper.join_list(self.cur_typeids)
Code example #27
 def subword_ids(self):
     return Helper.join_list(self.cur_ids)
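
Code examples #23 through #27 (and several earlier ones) all funnel through Helper.join_list, which by its usage flattens an iterable of lists into a single list. A minimal sketch, assuming plain one-level flattening (itertools.chain.from_iterable does the same job):

from itertools import chain

def join_list(list_of_lists):
    # assumed behaviour: flatten one level, e.g. [[1, 2], [3]] -> [1, 2, 3]
    return list(chain.from_iterable(list_of_lists))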
Code example #28
File: run.py Project: ValentinaPy/zmsp
 def _run_end(self):
     x = self.test_recorder.summary()
     res = self.res_manager.end()
     x.update(res)
     Helper.printd(x, sep=" ")
     return x
Code example #29
 def get_split_params(self):
     params0 = Helper.join_list(z.get_parameters() for z in [self.arc_m, self.arc_h, self.lab_m, self.lab_h])
     params1 = Helper.join_list(z.get_parameters() for z in [self.arc_scorer, self.lab_scorer])
     return params0, params1
Code example #30
File: score_go.py Project: ValentinaPy/zmsp
def main(args):
    conf = PsConf()
    conf.update_from_args(args)
    # read the data
    path_train, path_dev, path_test = [
        get_data(z) for z in [conf.train, conf.dev, conf.test]
    ]
    pretrain_file = get_data(conf.pretrain_file)
    train_insts = list(get_data_reader(path_train, "conllu", "", False, ""))
    dev_insts = list(get_data_reader(path_dev, "conllu", "", False, ""))
    test_insts = list(get_data_reader(path_test, "conllu", "", False, ""))
    use_pos = conf.use_pos
    num_pieces = conf.pieces
    max_epoch = conf.max_epoch
    reg_scores_lambda = conf.reg_scores_lambda
    cur_run = conf.cur_run
    zlog(
        f"Read from train/dev/test: {len(train_insts)}/{len(dev_insts)}/{len(test_insts)}, split train into {num_pieces}"
    )
    # others
    RGPU = os.getenv("RGPU", "")
    # first train on all: 1. get dict (only build once), 2: score dev/test
    with Timer("train", "Train-ALL"):
        cur_conf, cur_model = "_conf.all", "_model.all"
        cur_load_model = cur_model + ".best"
        cur_base_opt = get_base_opt(cur_conf, cur_model, use_pos, True,
                                    max_epoch, reg_scores_lambda, cur_run)
        system(get_train_cmd(RGPU, cur_base_opt, path_train, path_dev,
                             path_test, pretrain_file),
               pp=True)
        system(get_score_cmd(RGPU, cur_conf, cur_load_model, path_dev,
                             "dev.scores.pkl"),
               pp=True)
        system(get_score_cmd(RGPU, cur_conf, cur_load_model, path_test,
                             "test.scores.pkl"),
               pp=True)
    # then training on the pieces (leaving one out)
    # first split into pieces
    Random.shuffle(train_insts)
    piece_length = math.ceil(len(train_insts) / num_pieces)
    train_pieces = []
    cur_idx = 0
    while cur_idx < len(train_insts):
        next_idx = min(len(train_insts), cur_idx + piece_length)
        train_pieces.append(train_insts[cur_idx:next_idx])
        cur_idx = next_idx
    zlog(f"Split training into {num_pieces}: {[len(x) for x in train_pieces]}")
    assert len(train_pieces) == num_pieces
    # next train each of the pieces
    for piece_id in range(num_pieces):
        with Timer("train", f"Train-{piece_id}"):
            # get current training pieces
            cur_training_insts = Helper.join_list(
                [train_pieces[x] for x in range(num_pieces) if x != piece_id])
            cur_testing_insts = train_pieces[piece_id]
            # write files
            cur_path_train, cur_path_test = f"tmp.train.{piece_id}.conllu", f"tmp.test.{piece_id}.conllu"
            write_insts(cur_path_train, cur_training_insts)
            write_insts(cur_path_test, cur_testing_insts)
            cur_conf, cur_model = f"_conf.{piece_id}", f"_model.{piece_id}"
            cur_load_model = cur_model + ".best"
            # no build dict, reuse previous
            cur_base_opt = get_base_opt(cur_conf, cur_model, use_pos, False,
                                        max_epoch, reg_scores_lambda, cur_run)
            system(get_train_cmd(RGPU, cur_base_opt, cur_path_train, path_dev,
                                 cur_path_test, pretrain_file),
                   pp=True)
            system(get_score_cmd(RGPU, cur_conf, cur_load_model, cur_path_test,
                                 f"tmp.test.{piece_id}.scores.pkl"),
                   pp=True)
    # finally put them in order
    all_results = []
    for piece_id in range(num_pieces):
        all_results.extend(read_results(f"tmp.test.{piece_id}.scores.pkl"))
    # reorder to the original order
    orig_indexes = [z.inst_idx for z in train_insts]
    orig_results = [None] * len(orig_indexes)
    for new_idx, orig_idx in enumerate(orig_indexes):
        assert orig_results[orig_idx] is None
        orig_results[orig_idx] = all_results[new_idx]
    # saving
    write_results("train.scores.pkl", orig_results)
    zlog("The end.")