Example #1
 def _next(self):
     # buffered read
     if len(self.buckets_) == 0:
         # read into buffer
         while self.buffered_bsize_ < self.k:
             one = self.base_streamer_.next()
             if self.base_streamer_.is_eos(one):
                 # todo(+N): this actually does not ensure the end if base_streamer can re-produce things
                 break  # should check 'active' here; currently skipped, assuming this is the base_streamer's responsibility
             # dump instances (like short or long instances)
             dump_instance = any(f_(one) for f_ in self.dump_detectors)
             if dump_instance:
                 continue
             # single instances
             single_instance = any(f_(one) for f_ in self.single_detectors)
             if single_instance:
                 # immediately emit this special one as its own batch
                 return [one]
             # add this instance to buffer
             self.buffer_.append(one)
             self.buffered_bsize_ += self.batch_size_f(one)
         # prepare buffering
         if len(self.buffer_) > 0:
             # sorting
             sorted_buffer = self.buffer_
             if self.sorting_keyer is not None:
                 sorted_buffer.sort(key=self.sorting_keyer)  # small first
             # prepare buckets
             buckets = []
             tmp_bsize = 0
             tmp_bucket = []
             for one in sorted_buffer:
                 tmp_bsize += self.batch_size_f(one)
                 tmp_bucket.append(one)
                 if tmp_bsize >= self.batch_size:
                     buckets.append(tmp_bucket)
                     tmp_bsize = 0
                     tmp_bucket = []
             if len(tmp_bucket) > 0:
                 buckets.append(tmp_bucket)
             # another shuffle?
             if self.shuffling:
                 Random.shuffle(buckets, "data")
             else:
                 # todo(warn): reverse so that the later pop() keeps the sorted order (or the original order if not sorting)
                 buckets.reverse()
             # clear here
             self.buckets_ = buckets
             self.buffer_ = []
             self.buffered_bsize_ = 0
     # return buckets
     if len(self.buckets_) > 0:
         ret = self.buckets_.pop()
         return ret
     else:
         return None
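
The pattern above — buffer instances, optionally sort them (e.g. by length), cut the sorted buffer into buckets up to a batch-size budget, then shuffle (or reverse) the buckets before popping — can be shown standalone. A minimal sketch assuming plain Python lists and a user-supplied per-instance cost function (the names here are illustrative, not the project's API):

import random

def make_buckets(items, cost_f, budget, sort_key=None, shuffle=True):
    # optionally sort so that similarly sized items share a bucket
    buf = sorted(items, key=sort_key) if sort_key is not None else list(items)
    buckets, cur, cur_cost = [], [], 0
    for one in buf:
        cur.append(one)
        cur_cost += cost_f(one)
        if cur_cost >= budget:  # close the bucket once the budget is reached
            buckets.append(cur)
            cur, cur_cost = [], 0
    if cur:
        buckets.append(cur)
    if shuffle:
        random.shuffle(buckets)  # randomize bucket order for training
    else:
        buckets.reverse()  # so a later pop() from the end keeps the order
    return buckets

# usage: bucket sentences by token count with a budget of ~64 tokens
sents = [["w"] * n for n in (3, 17, 5, 30, 9, 2, 25)]
for b in make_buckets(sents, cost_f=len, budget=64, sort_key=len):
    print([len(s) for s in b])
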
Example #2
def main():
    utils.init("zlog", 1234)
    z = StatRecorder(True)
    times = Random.randint(100)
    for _ in range(times):
        with z.go():
            z.record_kv("distr_n", Random.randint(10))
    Helper.printd(z.summary(), "\n")
    #
    cc = Conf0()
    cc.update_from_args(["a:10", "y:www", "z.x:1"])
    pass
Example #3
 def __init__(self, ignore_chs_label_mask, fdrop_chs: float,
              fdrop_par: float):
     self.ignore_chs_label_mask = ignore_chs_label_mask
     if fdrop_chs > 0.:
         self.chs_getter_f = self.get_fchs_dropped
         self.chs_rand_gen = Random.stream_bool(fdrop_chs)
     else:
         self.chs_getter_f = self.get_fchs_base
     if fdrop_par > 0.:
         self.par_getter_f = self.get_fpar_dropped
         self.par_rand_gen = Random.stream_bool(fdrop_par)
     else:
         self.par_getter_f = self.get_fpar_base
Example #4
def _my_get_params_init(shape, init, lookup):
    # shape is a tuple of dims
    assert init in [
        "default", "random", "glorot", "ortho", "gaussian", "zeros"
    ], "Unknown init method %s" % init
    poss_scale = COMMON_CONFIG.init_scale_l if lookup else COMMON_CONFIG.init_scale_nl
    if len(shape) == 1:  # set bias to 0
        return np.zeros((shape[0], ))
    else:
        # get defaults
        if init == "default":
            init = COMMON_CONFIG.init_def_l if lookup else COMMON_CONFIG.init_def_nl
        # specifics
        if init == "glorot":
            if lookup:  # special for lookups
                shape_g = (shape[-1], )  # fan-out for lookup
            else:
                shape_g = shape
            w0 = Random.random_sample(shape, "winit")  # [0,1)
            w0 = (w0 - 0.5) * (2 * (np.sqrt(3.0 * len(shape_g) /
                                            (sum(shape_g)))))
            return w0 * poss_scale
        elif init == "random":
            w0 = Random.random_sample(shape, "winit")  # [0,1)
            w0 = (w0 - 0.5) * 2
            return w0 * poss_scale
        elif init == "gaussian":
            w0 = Random.randn_clip(shape, "winit")
            return w0 * poss_scale
        elif init == "ortho":
            # todo(note): assumes one dim is a multiple of the other (built from square blocks)
            assert len(shape) == 2 and (shape[0] % shape[1] == 0
                                        or shape[1] % shape[0] == 0
                                        ), f"Bad shape {shape} for ortho_init!"
            orig_num = shape[0] // shape[1]
            if orig_num == 0:
                num = shape[1] // shape[0]
            else:
                num = orig_num
            if num == 1:
                w0 = Random.ortho_weight(shape[1], "winit")
            else:
                w0 = np.concatenate([
                    Random.ortho_weight(shape[1], "winit") for _ in range(num)
                ])
            if orig_num == 0:  # reverse it!
                w0 = np.transpose(w0)
            return w0 * poss_scale
        elif init == "zeros":
            return np.zeros(shape)
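
The "glorot" branch above draws from a uniform distribution whose half-width is sqrt(3 * len(shape) / sum(shape)); for a 2-d weight this equals the usual Glorot/Xavier-uniform bound sqrt(6 / (fan_in + fan_out)). A small numpy sketch of just that step, using plain np.random instead of the project's seeded Random wrapper:

import numpy as np

def glorot_uniform(shape, rng=np.random):
    # limit = sqrt(3 * ndim / sum(dims)); for ndim == 2 this is sqrt(6 / (fan_in + fan_out))
    limit = np.sqrt(3.0 * len(shape) / sum(shape))
    w = rng.random_sample(shape)      # uniform in [0, 1)
    return (w - 0.5) * (2 * limit)    # uniform in [-limit, limit)

w = glorot_uniform((256, 512))
print(w.shape, float(w.min()), float(w.max()))
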
Example #5
 def shuffle_children_free(self):
     alpha = self.free_dist_alpha
     if alpha <= 0.:
         for one_list in self.children_list:
             if len(one_list) > 1:
                 Random.shuffle(one_list)
     else:
         for i, one_list in enumerate(self.children_list):
             if len(one_list) > 1:
                 values = [abs(i - z) * alpha for z in one_list]
                 # TODO(+N): is it correct to use the Gumbel trick for ranking?
                 logprobs = np.log(MathHelper.softmax(values))
                 G = np.random.random_sample(len(logprobs))
                 ranking_values = np.log(-np.log(G)) - logprobs
                 self.children_list[i] = [
                     one_list[z] for z in np.argsort(ranking_values)
                 ]
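
The ranking step is an application of the Gumbel trick: perturbing log-probabilities with -log(-log(U)) noise and sorting yields a random permutation biased toward higher-probability items (a Plackett-Luce draw, which is presumably what the todo above questions). A self-contained numpy sketch with the softmax written out, since MathHelper is not shown here:

import numpy as np

def gumbel_rank(scores, rng=np.random):
    # softmax over the raw scores
    e = np.exp(scores - np.max(scores))
    logprobs = np.log(e / e.sum())
    # Gumbel trick: sort by logprob + Gumbel noise, larger perturbed values first
    g = -np.log(-np.log(rng.random_sample(len(scores))))
    return np.argsort(-(logprobs + g))

print(gumbel_rank(np.array([0.1, 2.0, 1.0])))
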
Example #6
 def _input_f_obtain(self, edrop):
     if edrop <= 0.:
         return lambda x: x
     else:
         # todo(warn): sampled with replacement; an efficient but approximate impl & only works for 2d
         edrop_rands = Random.random_sample(
             (int(self.dropout_wordceil * edrop), ))  # [0,1)
         edrop_idxes = [int(self.dropout_wordceil * z) for z in edrop_rands]
         edrop_set = set(edrop_idxes)
         return lambda x: [0 if one in edrop_set else one
                           for one in x]  # drop to 0 if fall in the set
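
Instead of flipping a coin per token, the code pre-samples a fixed set of word ids to zero out, so the returned lambda is a cheap set lookup; the trade-off is that the same ids stay dropped for the lifetime of the function. A minimal sketch of the same idea with plain numpy (make_word_dropper and vocab_size are illustrative names):

import numpy as np

def make_word_dropper(edrop, vocab_size, rng=np.random):
    if edrop <= 0.:
        return lambda xs: xs
    # pre-sample ~edrop * vocab_size ids (with replacement) and later drop any
    # token whose id falls into that set -- an approximation of per-id dropout
    rands = rng.random_sample(int(vocab_size * edrop))
    dropped = set(int(vocab_size * z) for z in rands)
    return lambda xs: [0 if x in dropped else x for x in xs]

drop_f = make_word_dropper(0.3, vocab_size=1000)
print(drop_f([1, 42, 999, 7]))
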
Example #7
 def prepare(self, insts: List[ParseInstance], training):
     conf = self.conf
     word_idxes = [z.words.idxes for z in insts]
     word_arr, input_mask = self.padder.pad(word_idxes)  # [bsize, slen]
     # prepare for the masks
     input_word_mask = (Random.random_sample(word_arr.shape) <
                        conf.mask_rate) & (input_mask > 0.)
     input_word_mask &= (word_arr >= conf.min_mask_rank)
     input_word_mask[:, 0] = False  # no masking for special ROOT
     output_pred_mask = (input_word_mask & (word_arr <= conf.max_pred_rank))
     return input_word_mask.astype(np.float32), output_pred_mask.astype(
         np.float32), word_arr
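
The masking above is the usual masked-LM recipe in numpy terms: a Bernoulli(mask_rate) draw per position, restricted to non-padding tokens and to ids at or above a minimum rank, never the ROOT column; prediction targets are additionally capped at a maximum rank. A standalone sketch with plain numpy and illustrative parameter values:

import numpy as np

def make_masks(word_arr, input_mask, mask_rate=0.15,
               min_mask_rank=2, max_pred_rank=20000, rng=np.random):
    m = (rng.random_sample(word_arr.shape) < mask_rate) & (input_mask > 0.)
    m &= (word_arr >= min_mask_rank)        # skip special / reserved ids
    m[:, 0] = False                         # never mask the special ROOT column
    pred = m & (word_arr <= max_pred_rank)  # only predict ids inside the output vocab
    return m.astype(np.float32), pred.astype(np.float32)

words = np.array([[1, 5, 30, 70000], [1, 8, 2, 0]])
imask = np.array([[1., 1., 1., 1.], [1., 1., 1., 0.]])
m, p = make_masks(words, imask)
print(m)
print(p)
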
Example #8
 def __init__(self, base_streams: List[Streamer], budgets: List,
              stop_sidx: int):
     super().__init__()
     # -----
     assert len(base_streams) == len(budgets)
     assert len(base_streams) > 0
     self.base_streams = base_streams
     self.budgets = budgets
     self.stop_sidx = stop_sidx
     self.random_sampler = Random.stream(Random.random_sample)
     # status
     self.current_ptr = len(base_streams) - 1
     self.current_budget = 0.
     self.stats = [0 for _ in self.base_streams]
Example #9
 def __init__(self, pc: BK.ParamCollection, bconf: BTConf,
              vpack: VocabPackage):
     super().__init__(pc, None, None)
     self.bconf = bconf
     # ===== Vocab =====
     self.word_vocab = vpack.get_voc("word")
     self.char_vocab = vpack.get_voc("char")
     self.pos_vocab = vpack.get_voc("pos")
     # ===== Model =====
     # embedding
     self.emb = self.add_sub_node(
         "emb", MyEmbedder(self.pc, bconf.emb_conf, vpack))
     emb_output_dim = self.emb.get_output_dims()[0]
     # encoder0 for jpos
     # todo(note): will do nothing if not use_jpos
     bconf.jpos_conf._input_dim = emb_output_dim
     self.jpos_enc = self.add_sub_node(
         "enc0", JPosModule(self.pc, bconf.jpos_conf, self.pos_vocab))
     enc0_output_dim = self.jpos_enc.get_output_dims()[0]
     # encoder
     # todo(0): feed compute-on-the-fly hp
     bconf.enc_conf._input_dim = enc0_output_dim
     self.enc = self.add_sub_node("enc", MyEncoder(self.pc, bconf.enc_conf))
     self.enc_output_dim = self.enc.get_output_dims()[0]
     # ===== Input Specification =====
     # inputs (word, char, pos) and vocabulary
     self.need_word = self.emb.has_word
     self.need_char = self.emb.has_char
     # todo(warn): currently only allow extra fields for POS
     self.need_pos = False
     if len(self.emb.extra_names) > 0:
         assert len(
             self.emb.extra_names) == 1 and self.emb.extra_names[0] == "pos"
         self.need_pos = True
     # todo(warn): currently only allow one aux field
     self.need_aux = False
     if len(self.emb.dim_auxes) > 0:
         assert len(self.emb.dim_auxes) == 1
         self.need_aux = True
     #
     self.word_padder = DataPadder(2,
                                   pad_vals=self.word_vocab.pad,
                                   mask_range=2)
     self.char_padder = DataPadder(3,
                                   pad_lens=(0, 0, bconf.char_max_length),
                                   pad_vals=self.char_vocab.pad)
     self.pos_padder = DataPadder(2, pad_vals=self.pos_vocab.pad)
     #
     self.random_sample_stream = Random.stream(Random.random_sample)
Example #10
 def __init__(self, conf: M3AIEModelConf, vpack: IEVocabPackage):
     super().__init__(conf, vpack)
     # components
     self.cand_extractor: CandidateExtractor = self.decoders[0]
     self.arg_linker: ArgLinker = self.decoders[1]
     self.span_expander: ArgSpanExpander = self.decoders[2]
     # vocab
     self.hl_arg: HLabelVocab = self.vpack.get_voc("hl_arg")
     # lambdas for training
     self.lambda_cand = ScheduledValue("lambda_cand",
                                       conf.tconf.lambda_cand)
     self.lambda_arg = ScheduledValue("lambda_arg", conf.tconf.lambda_arg)
     self.lambda_span = ScheduledValue("lambda_span",
                                       conf.tconf.lambda_span)
     self.add_scheduled_values(self.lambda_cand)
     self.add_scheduled_values(self.lambda_arg)
     self.add_scheduled_values(self.lambda_span)
     # others
     self.random_sample_stream = Random.stream(Random.random_sample)
Example #11
 def __init__(self, conf: M3IEModelConf, vpack: IEVocabPackage):
     super().__init__(conf, vpack)
     # components
     self.ef_extractor: MentionExtractor = self.decoders[0]
     self.evt_extractor: MentionExtractor = self.decoders[1]
     self.arg_linker: ArgLinker = self.decoders[2]
     self.span_expander: ArgSpanExpander = self.decoders[3]
     # vocab
     self.hl_ef: HLabelVocab = self.vpack.get_voc("hl_ef")
     self.hl_evt: HLabelVocab = self.vpack.get_voc("hl_evt")
     self.hl_arg: HLabelVocab = self.vpack.get_voc("hl_arg")
     # lambdas for training
     self.lambda_mention_ef = ScheduledValue("lambda_mention_ef",
                                             conf.tconf.lambda_mention_ef)
     self.lambda_mention_evt = ScheduledValue("lambda_mention_evt",
                                              conf.tconf.lambda_mention_evt)
     self.lambda_affi_ef = ScheduledValue("lambda_affi_ef",
                                          conf.tconf.lambda_affi_ef)
     self.lambda_affi_evt = ScheduledValue("lambda_affi_evt",
                                           conf.tconf.lambda_affi_evt)
     self.lambda_arg = ScheduledValue("lambda_arg", conf.tconf.lambda_arg)
     self.lambda_span = ScheduledValue("lambda_span",
                                       conf.tconf.lambda_span)
     self.add_scheduled_values(self.lambda_mention_ef)
     self.add_scheduled_values(self.lambda_mention_evt)
     self.add_scheduled_values(self.lambda_affi_ef)
     self.add_scheduled_values(self.lambda_affi_evt)
     self.add_scheduled_values(self.lambda_arg)
     self.add_scheduled_values(self.lambda_span)
     #
     self.random_sample_stream = Random.stream(Random.random_sample)
     self.pos_ef_getter_set = set(conf.pos_ef_getter_list)
     #
     if conf.iconf.expand_span_method == "dep":
         self.static_span_expander = SpanExpanderDep()
     elif conf.iconf.expand_span_method == "ext":
         self.static_span_expander = SpanExpanderExternal(
             conf.iconf.expand_span_ext_file)
     else:
         zlog("No static span expander!")
         self.static_span_expander = None
Example #12
 def filter_pembed(self,
                   wv: WordVectors,
                   init_nohit=0.,
                   scale=1.0,
                   assert_all_hit=True,
                   set_init=True):
     if init_nohit <= 0.:
         get_nohit = lambda s: np.zeros((s, ), dtype=np.float32)
     else:
         get_nohit = lambda s: (Random.random_sample(
             (s, )).astype(np.float32) - 0.5) * (2 * init_nohit)
     #
     ret = [np.zeros((wv.embed_size, ),
                     dtype=np.float32)]  # init NIL is zero
     record = defaultdict(int)
     for ws in self.pools_hint_lexicon[1:]:
         res = np.zeros((wv.embed_size, ), dtype=np.float32)
         for w in ws:
             hit, norm_name, norm_w = wv.norm_until_hit(w)
             if hit:
                 value = np.asarray(wv.get_vec(norm_w, norm=False),
                                    dtype=np.float32)
                 record[norm_name] += 1
             else:
                 value = get_nohit(wv.embed_size)
                 record["no-hit"] += 1
             res += value
         ret.append(res)
     #
     assert not assert_all_hit or record[
         "no-hit"] == 0, f"Filter-embed error: assert all-hit but get no-hit of {record['no-hit']}"
     zlog(
         f"Filter pre-trained Pembed: {record}, no-hit is inited with {init_nohit}."
     )
     ret = np.asarray(ret, dtype=np.float32) * scale
     if set_init:
         self.set_pool_init(ret)
     return ret
Example #13
 def filter_embed(self, wv: 'WordVectors', init_nohit=0., scale=1.0, assert_all_hit=False):
     if init_nohit <= 0.:
         get_nohit = lambda s: np.zeros((s,), dtype=np.float32)
     else:
         get_nohit = lambda s: (Random.random_sample((s,)).astype(np.float32)-0.5) * (2*init_nohit)
     #
     ret = []
     res = defaultdict(int)
     for w in self.final_words:
         hit, norm_name, norm_w = wv.norm_until_hit(w)
         if hit:
             value = np.asarray(wv.get_vec(norm_w, norm=False), dtype=np.float32)
             res[norm_name] += 1
         else:
             value = get_nohit(wv.embed_size)
             # value = np.zeros((wv.embed_size,), dtype=np.float32)
             res["no-hit"] += 1
         ret.append(value)
     #
     if assert_all_hit:
         zcheck(res["no-hit"]==0, f"Filter-embed error: assert all-hit but get no-hit of {res['no-hit']}")
     printing("Filter pre-trained embed: %s, no-hit is inited with %s." % (res, init_nohit))
     return np.asarray(ret, dtype=np.float32) * scale
Example #14
 def __init__(self, pc: BK.ParamCollection, bconf: BTConf, tconf: 'BaseTrainingConf', vpack: VocabPackage):
     super().__init__(pc, None, None)
     self.bconf = bconf
     # ===== Vocab =====
     self.word_vocab = vpack.get_voc("word")
     self.char_vocab = vpack.get_voc("char")
     self.lemma_vocab = vpack.get_voc("lemma")
     self.upos_vocab = vpack.get_voc("upos")
     self.ulabel_vocab = vpack.get_voc("ulabel")
     # ===== Model =====
     # embedding
     self.emb = self.add_sub_node("emb", MyEmbedder(self.pc, bconf.emb_conf, vpack))
     emb_output_dim = self.emb.get_output_dims()[0]
     self.emb_output_dim = emb_output_dim
     # doc hint
     self.use_doc_hint = bconf.use_doc_hint
     self.dh_combine_method = bconf.dh_combine_method
     if self.use_doc_hint:
         assert len(bconf.emb_conf.dim_auxes)>0
         # todo(note): currently use the concat of them if multiple layers are input
         bconf.dh_conf._input_dim = bconf.emb_conf.dim_auxes[0]  # same as input bert dim
         bconf.dh_conf._output_dim = emb_output_dim  # same as emb_output_dim
         self.dh_node = self.add_sub_node("dh", DocHintModule(pc, bconf.dh_conf))
     else:
         self.dh_node = None
     # encoders
     # shared
     # todo(note): feed compute-on-the-fly hp
     bconf.enc_conf._input_dim = emb_output_dim
     self.enc = self.add_sub_node("enc", MyEncoder(self.pc, bconf.enc_conf))
     tmp_enc_output_dim = self.enc.get_output_dims()[0]
     # privates
     bconf.enc_ef_conf._input_dim = tmp_enc_output_dim
     self.enc_ef = self.add_sub_node("enc_ef", MyEncoder(self.pc, bconf.enc_ef_conf))
     self.enc_ef_output_dim = self.enc_ef.get_output_dims()[0]
     bconf.enc_evt_conf._input_dim = tmp_enc_output_dim
     self.enc_evt = self.add_sub_node("enc_evt", MyEncoder(self.pc, bconf.enc_evt_conf))
     self.enc_evt_output_dim = self.enc_evt.get_output_dims()[0]
     # ===== Input Specification =====
     # inputs (word, lemma, char, upos, ulabel) and vocabulary
     self.need_word = self.emb.has_word
     self.need_char = self.emb.has_char
     # extra fields
     # todo(warn): need to
     self.need_lemma = False
     self.need_upos = False
     self.need_ulabel = False
     for one_extra_name in self.emb.extra_names:
         if one_extra_name == "lemma":
             self.need_lemma = True
         elif one_extra_name == "upos":
             self.need_upos = True
         elif one_extra_name == "ulabel":
             self.need_ulabel = True
         else:
             raise NotImplementedError("UNK extra input name: " + one_extra_name)
     # todo(warn): currently only allow one aux field
     self.need_aux = False
     if len(self.emb.dim_auxes) > 0:
         assert len(self.emb.dim_auxes) == 1
         self.need_aux = True
     # padders
     self.word_padder = DataPadder(2, pad_vals=self.word_vocab.pad, mask_range=2)
     self.char_padder = DataPadder(3, pad_lens=(0, 0, bconf.char_max_length), pad_vals=self.char_vocab.pad)
     self.lemma_padder = DataPadder(2, pad_vals=self.lemma_vocab.pad)
     self.upos_padder = DataPadder(2, pad_vals=self.upos_vocab.pad)
     self.ulabel_padder = DataPadder(2, pad_vals=self.ulabel_vocab.pad)
     #
     self.random_sample_stream = Random.stream(Random.random_sample)
     self.train_skip_noevt_rate = tconf.train_skip_noevt_rate
     self.train_skip_length = tconf.train_skip_length
     self.train_min_length = tconf.train_min_length
     self.test_min_length = tconf.test_min_length
     self.test_skip_noevt_rate = tconf.test_skip_noevt_rate
     self.train_sent_based = tconf.train_sent_based
     #
     assert not self.train_sent_based, "The basic model should not use this sent-level mode!"
Example #15
 def __init__(self, conf: MtlMlmModelConf, vpack: VocabPackage):
     super().__init__(conf)
     # for easier checking
     self.word_vocab = vpack.get_voc("word")
     # components
     self.embedder = self.add_node("emb", EmbedderNode(self.pc, conf.emb_conf, vpack))
     self.inputter = Inputter(self.embedder, vpack)  # not a node
     self.emb_out_dim = self.embedder.get_output_dims()[0]
     self.enc_attn_count = conf.default_attn_count
     if conf.enc_choice == "vrec":
         self.encoder = self.add_component("enc", VRecEncoder(self.pc, self.emb_out_dim, conf.venc_conf))
         self.enc_attn_count = self.encoder.attn_count
     elif conf.enc_choice == "original":
         conf.oenc_conf._input_dim = self.emb_out_dim
         self.encoder = self.add_node("enc", MyEncoder(self.pc, conf.oenc_conf))
     else:
         raise NotImplementedError()
     zlog(f"Finished building model's encoder {self.encoder}, all size is {self.encoder.count_allsize_parameters()}")
     self.enc_out_dim = self.encoder.get_output_dims()[0]
     # --
     conf.rprep_conf._rprep_vr_conf.matt_conf.head_count = self.enc_attn_count  # make head-count agree
     self.rpreper = self.add_node("rprep", RPrepNode(self.pc, self.enc_out_dim, conf.rprep_conf))
     # --
     self.lambda_agree = self.add_scheduled_value(ScheduledValue(f"agr:lambda", conf.lambda_agree))
     self.agree_loss_f = EntropyHelper.get_method(conf.agree_loss_f)
     # --
     self.masklm = self.add_component("mlm", MaskLMNode(self.pc, self.enc_out_dim, conf.mlm_conf, self.inputter))
     self.plainlm = self.add_component("plm", PlainLMNode(self.pc, self.enc_out_dim, conf.plm_conf, self.inputter))
     # todo(note): here we use attn as dim_pair, do not use pair if not using vrec!!
     self.orderpr = self.add_component("orp", OrderPredNode(
         self.pc, self.enc_out_dim, self.enc_attn_count, conf.orp_conf, self.inputter))
     # =====
     # pre-training pre-load point!!
     if conf.load_pretrain_model_name:
         zlog(f"At preload_pretrain point: Loading from {conf.load_pretrain_model_name}")
         self.pc.load(conf.load_pretrain_model_name, strict=False)
     # =====
     self.dpar = self.add_component("dpar", DparG1Decoder(
         self.pc, self.enc_out_dim, self.enc_attn_count, conf.dpar_conf, self.inputter))
     self.upos = self.add_component("upos", SeqLabNode(
         self.pc, "pos", self.enc_out_dim, self.conf.upos_conf, self.inputter))
     if conf.do_ner:
         if conf.ner_use_crf:
             self.ner = self.add_component("ner", SeqCrfNode(
                 self.pc, "ner", self.enc_out_dim, self.conf.ner_conf, self.inputter))
         else:
             self.ner = self.add_component("ner", SeqLabNode(
                 self.pc, "ner", self.enc_out_dim, self.conf.ner_conf, self.inputter))
     else:
         self.ner = None
     # for pairwise reprs (no trainable params here!)
     self.rel_dist_embed = self.add_node("oremb", PosiEmbedding2(self.pc, n_dim=self.enc_attn_count, max_val=100))
     self._prepr_f_attn_sum = lambda cache, rdist: BK.stack(cache.list_attn, 0).sum(0) if (len(cache.list_attn))>0 else None
     self._prepr_f_attn_avg = lambda cache, rdist: BK.stack(cache.list_attn, 0).mean(0) if (len(cache.list_attn))>0 else None
     self._prepr_f_attn_max = lambda cache, rdist: BK.stack(cache.list_attn, 0).max(0)[0] if (len(cache.list_attn))>0 else None
     self._prepr_f_attn_last = lambda cache, rdist: cache.list_attn[-1] if (len(cache.list_attn))>0 else None
     self._prepr_f_rdist = lambda cache, rdist: self._get_rel_dist_embed(rdist, False)
     self._prepr_f_rdist_abs = lambda cache, rdist: self._get_rel_dist_embed(rdist, True)
     self.prepr_f = getattr(self, "_prepr_f_"+conf.prepr_choice)  # shortcut
     # --
     self.testing_rand_gen = Random.create_sep_generator(conf.testing_rand_gen_seed)  # separate gen for testing
     # =====
     if conf.orp_loss_special:
         self.orderpr.add_node_special(self.masklm)
     # =====
     # extra one!!
     self.aug_word2 = self.aug_encoder = self.aug_mixturer = None
     if conf.aug_word2:
         self.aug_word2 = self.add_node("aug2", AugWord2Node(self.pc, conf.emb_conf, vpack,
                                                             "word2", conf.aug_word2_dim, self.emb_out_dim))
         if conf.aug_word2_aug_encoder:
             assert conf.enc_choice == "vrec"
             self.aug_detach_drop = self.add_node("dd", Dropout(self.pc, (self.enc_out_dim,), fix_rate=conf.aug_detach_dropout))
             self.aug_encoder = self.add_component("Aenc", VRecEncoder(self.pc, self.emb_out_dim, conf.venc_conf))
             self.aug_mixturer = self.add_node("Amix", BertFeaturesWeightLayer(self.pc, conf.aug_detach_numlayer))
Example #16
class ParseInstance(Instance):
    #
    ROOT_SYMBOL = VocabHelper.convert_special_pattern("r")

    def __init__(self, words, poses=None, heads=None, labels=None, code=""):
        super().__init__()
        # todo(0): always include special ROOT symbol
        _tmp_root_list = [ParseInstance.ROOT_SYMBOL]
        self.code = code
        if code:
            aug_words = get_aug_words(words, code)
        else:
            aug_words = words
        self.words = SeqFactor(_tmp_root_list + aug_words)
        self.chars = InputCharFactor([""] + words)  # empty pad chars
        #
        if poses is not None:
            poses = _tmp_root_list + poses
        if heads is not None:
            heads = [0] + heads
        if labels is not None:
            labels = _tmp_root_list + labels
        self.poses = SeqFactor(poses)
        self.heads = SeqFactor(heads)
        self.labels = SeqFactor(
            labels
        )  # todo(warn): for td, no processing, directly use 0 as padding!!
        # =====
        # for top-down parsing, but deprecated now
        self.children_mask_arr: np.ndarray = None  # [N, N]
        # self.children_se: List[Set] = None
        self.children_list: List[List] = None  # list of children
        self.descendant_list: List[List] = None  # list of all descendants
        #
        self.free_dist_alpha: float = None
        # =====
        # other helpful info (calculate in need)
        self._unprojs = None
        self._sibs = None
        self._gps = None
        # all children should be arranged l2r
        self._children_left = None
        self._children_right = None
        self._children_all = None
        # predictions (optional prediction probs)
        self.pred_poses = SeqFactor(None)
        self.pred_pos_scores = SeqFactor(None)
        self.pred_heads = SeqFactor(None)
        self.pred_labels = SeqFactor(None)
        self.pred_par_scores = SeqFactor(None)
        self.pred_miscs = SeqFactor(None)
        # for real length
        self.length = InstanceHelper.check_equal_length(
            [self.words, self.chars, self.poses, self.heads, self.labels]) - 1
        # extra inputs, for example, those from mbert
        self.extra_features = {"aux_repr": None}
        # extra preds info
        self.extra_pred_misc = {}

    def __len__(self):
        return self.length

    # =====
    # helpful functions
    @staticmethod
    def get_children(heads):
        cur_len = len(heads)
        children_left, children_right = [[] for _ in range(cur_len)
                                         ], [[] for _ in range(cur_len)]
        for i in range(1, cur_len):
            h = heads[i]
            if i < h:
                children_left[h].append(i)
            else:
                children_right[h].append(i)
        return children_left, children_right

    @staticmethod
    def get_sibs(children_left, children_right):
        # get sibling list: sided nearest sibling (self if single)
        sibs = [-1] * len(children_left)
        for vs in children_left:
            if len(vs) > 0:
                prev_s = vs[-1]
                for cur_m in reversed(vs):
                    sibs[cur_m] = prev_s
                    prev_s = cur_m
        for vs in children_right:
            if len(vs) > 0:
                prev_s = vs[0]
                for cur_m in vs:
                    sibs[cur_m] = prev_s
                    prev_s = cur_m
        # todo(+2): how about 0? currently set to 0
        sibs[0] = 0
        return sibs

    @staticmethod
    def get_gps(heads):
        # get grandparent list
        # todo(+2): how about 0? currently will be 0
        return [heads[x] for x in heads]

    # =====
    # other properties
    @property
    def unprojs(self):
        # todo(warn): calculate the unproj situations: 0 proj edge, 1 unproj edge
        if self._unprojs is None:
            self._unprojs = [0] + ConlluParse.calc_crossed(self.heads.vals[1:])
        return self._unprojs

    @property
    def sibs(self):
        if self._sibs is None:
            self._sibs = ParseInstance.get_sibs(self.children_left,
                                                self.children_right)
        return self._sibs

    @property
    def gps(self):
        if self._gps is None:
            heads = self.heads.vals
            self._gps = ParseInstance.get_gps(heads)
        return self._gps

    @property
    def children_left(self):
        if self._children_left is None:
            heads = self.heads.vals
            self._children_left, self._children_right = ParseInstance.get_children(
                heads)
        return self._children_left

    @property
    def children_right(self):
        if self._children_right is None:
            heads = self.heads.vals
            self._children_left, self._children_right = ParseInstance.get_children(
                heads)
        return self._children_right

    @property
    def children_all(self):
        if self._children_all is None:
            self._children_all = [
                a + b for a, b in zip(self.children_left, self.children_right)
            ]
        return self._children_all

    # =====
    # special processing for training
    # for h-local-loss
    def get_children_mask_arr(self, add_self_if_leaf=True):
        if self.children_mask_arr is None:
            # on need
            heads = self.heads.vals
            the_len = len(heads)
            masks = np.zeros([the_len, the_len], dtype=np.float32)
            # exclude root
            for m, h in enumerate(heads[1:], 1):
                masks[h, m] = 1.  # this one is [h,m]
            if add_self_if_leaf:
                for i in range(the_len):
                    if sum(masks[i]) == 0.:
                        masks[i, i] = 1.
            self.children_mask_arr = masks
        return self.children_mask_arr

    # set once when reading!
    def set_children_info(self,
                          oracle_strategy,
                          label_ranking_dict: Dict = None,
                          free_dist_alpha: float = 0.):
        heads = self.heads.vals
        the_len = len(heads)
        # self.children_set = [set() for _ in range(the_len)]
        self.children_list = [[] for _ in range(the_len)]
        tmp_descendant_list = [None for _ in range(the_len)]
        # exclude root
        for m, h in enumerate(heads[1:], 1):
            # self.children_set[h].add(m)
            self.children_list[h].append(m)  # l2r order
        # re-arrange list order (left -> right)
        if oracle_strategy == "i2o":
            for h in range(the_len):
                self.children_list[h].sort(key=lambda x: -x if x < h else x)
        elif oracle_strategy == "label":
            # todo(warn): only use first level!
            level0_labels = [z.split(":")[0] for z in self.labels.vals]
            for h in range(the_len):
                self.children_list[h].sort(
                    key=lambda x: label_ranking_dict[level0_labels[x]])
        elif oracle_strategy == "n2f":
            self.shuffle_children_n2f()
        elif oracle_strategy == "free":
            self.free_dist_alpha = free_dist_alpha
            self.shuffle_children_free()
        else:
            assert oracle_strategy == "l2r"
            pass
        # todo(+N): does the order of descendant list matter?
        # todo(+N): depth-first or breadth-first? (currently select the latter)
        # recursively get descendant list: do this
        # =====
        def _recursive_add(cur_n):
            cur_children = self.children_list[cur_n]  # List[int]
            for i in cur_children:
                _recursive_add(i)
            new_dlist = [cur_children]
            cur_layer = 0
            while True:
                another_layer = Helper.join_list(
                    tmp_descendant_list[i][cur_layer]
                    if cur_layer < len(tmp_descendant_list[i]) else []
                    for i in cur_children)
                if len(another_layer) == 0:
                    break
                new_dlist.append(another_layer)
                cur_layer += 1
            tmp_descendant_list[cur_n] = new_dlist

        # =====
        _recursive_add(0)
        self.descendant_list = [
            Helper.join_list(tmp_descendant_list[i]) for i in range(the_len)
        ]

    # =====
    # todo(warn): does not shuffle descendant list since this can disturb the depth-order

    # shuffle once before each running for free mode
    def shuffle_children_free(self):
        alpha = self.free_dist_alpha
        if alpha <= 0.:
            for one_list in self.children_list:
                if len(one_list) > 1:
                    Random.shuffle(one_list)
        else:
            for i, one_list in enumerate(self.children_list):
                if len(one_list) > 1:
                    values = [abs(i - z) * alpha for z in one_list]
                    # TODO(+N): is it correct to use the Gumbel trick for ranking?
                    logprobs = np.log(MathHelper.softmax(values))
                    G = np.random.random_sample(len(logprobs))
                    ranking_values = np.log(-np.log(G)) - logprobs
                    self.children_list[i] = [
                        one_list[z] for z in np.argsort(ranking_values)
                    ]

    INST_RAND = Random.stream(Random.random_sample)

    # shuffle for n2f mode
    def shuffle_children_n2f(self):
        rr = ParseInstance.INST_RAND
        for i, one_list in enumerate(self.children_list):
            if len(one_list) > 1:
                # todo(warn): add small random noise to break ties
                values = [abs(i - z) + next(rr) for z in one_list]
                self.children_list[i] = [
                    one_list[z] for z in np.argsort(values)
                ]

    # =====
    # todo(warn): exclude artificial root node

    def get_real_values_select(self, selections):
        ret = []
        for name in selections:
            zv = getattr(self, name)
            if zv.has_vals():
                ret.append(zv.vals[1:])
            else:
                ret.append(None)
        return ret

    def get_real_values_all(self):
        ret = {}
        for zn, zv in vars(self).items():
            if isinstance(zv, SeqFactor):
                if zv.has_vals():
                    ret[zn] = zv.vals[1:]
                else:
                    ret[zn] = None
        return ret
Example #17
 def run(self, train_stream, dev_streams):
     rconf = self.rconf
     last_report_uidx, last_dev_uidx = 0, 0
     if rconf.validate_first:
         self._validate(dev_streams)
     # =====
     # for lrate warmup and annealing
     if rconf.lrate_warmup < 0:
         # calculate epochs
         steps_per_epoch = 0
         for _ in train_stream:
             steps_per_epoch += 1
         n_epoch = -rconf.lrate_warmup
         n_steps = n_epoch * steps_per_epoch
         utils.zlog(f"Calculating warmup steps for {n_epoch} epochs: {steps_per_epoch} steps per epoch.")
     elif rconf.lrate_warmup > 0:
         n_steps = rconf.lrate_warmup
     else:
         n_steps = 0
     max_lrate = self.lrate.value
     # final_lrate = lrate * anneal_factor * (step)^lrate_anneal_alpha
     # final_lrate(n_steps) = max_lrate
     lrate_anneal_alpha = rconf.lrate_anneal_alpha
     if n_steps > 0:
         anneal_factor = 1. / (n_steps**lrate_anneal_alpha)
     else:
         anneal_factor = 1.
     self.lrate_warmup_steps = n_steps
     utils.zlog(f"For lrate-warmup, will go with the first {n_steps} steps up to {max_lrate}, "
                f"then anneal with lrate*{anneal_factor}*step^{lrate_anneal_alpha}")
     # =====
     while not self._finished():
         # todo(note): epochs start from 1!!
         self._tp.eidx += 1
         with Timer(tag="Train-Iter", info="Iter %s" % self._tp.eidx, print_date=True) as et:
             self._adjust_scheduled_values()  # adjust at the start of each epoch
             act_lrate = 0.
             # for batches
             for insts in train_stream:
                 # skip this batch
                 if Random.random_bool(rconf.skip_batch):
                     continue
                 # train on batch, return a dictionary
                 # possibly split batch to save memory
                 self._fb_batch(insts)
                 self._tp.uidx += 1
                 # get the effective lrate
                 act_lrate = self.lrate.value
                 if self._tp.uidx < self.lrate_warmup_steps:
                     act_lrate *= (self._tp.uidx / self.lrate_warmup_steps)
                 else:
                     act_lrate *= anneal_factor * (self._tp.uidx**lrate_anneal_alpha)
                 #
                 self._run_update(act_lrate, 1.)
                 # report on training process
                 if rconf.flag_verbose and (self._tp.uidx-last_report_uidx)>=rconf.report_freq:
                     utils.zlog(f"Current act_lrate is {act_lrate}.")
                     self._run_train_report()
                     last_report_uidx = self._tp.uidx
                 # time for validating
                 if (self._tp.uidx-last_dev_uidx)>=rconf.valid_freq:
                     self._validate(dev_streams)
                     last_dev_uidx = self._tp.uidx
                     last_report_uidx = self._tp.uidx
                     # todo(+N): do we need to adjust scheduled values at a finer granularity?
                     self._adjust_scheduled_values()  # adjust after uidx validation
                     if self._finished():
                         break
             # validate at the end of epoch?
             utils.zlog(f"End of epoch: Current act_lrate is {act_lrate}.")
             if rconf.validate_epoch:
                 self._validate(dev_streams)
                 last_dev_uidx = self._tp.uidx
                 last_report_uidx = self._tp.uidx
         utils.zlog("")
     utils.zlog("zzzzzfinal: After training, the best point is: %s." % (str(self._tp.info_save_best())))
Example #18
 def reset(self):
     self.ptr = 0
     if self.shuffle:
         Random.shuffle(self.c, "data")
Example #19
 def get_random_bool_streamer(true_rate, batch_size=1024):
     return RandomStreamer(
         lambda size: Random.random_bool(true_rate, size, "data"),
         batch_size)
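
Both RandomStreamer here and Random.stream elsewhere follow the same batched-draw pattern: draw a whole array of random values at once and then yield them one by one, which is far cheaper than one RNG call per value. A plain-Python sketch of that pattern (the project's actual classes are not reproduced here):

import numpy as np

def stream(batch_f, batch_size=1024):
    # draw a whole batch, yield single values, repeat
    while True:
        for v in batch_f(batch_size):
            yield v

bool_stream = stream(lambda n: np.random.random_sample(n) < 0.3)
print([bool(next(bool_stream)) for _ in range(5)])
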
Example #20
 def __init__(self, pc: BK.ParamCollection, bconf: Berter2Conf):
     super().__init__(pc, None, None)
     self.bconf = bconf
     self.model_name = bconf.bert2_model
     zlog(
         f"Loading pre-trained bert model for Berter2 of {self.model_name}")
     # Load pretrained model/tokenizer
     self.tokenizer = BertTokenizer.from_pretrained(
         self.model_name,
         do_lower_case=bconf.bert2_lower_case,
         cache_dir=None if
         (not bconf.bert2_cache_dir) else bconf.bert2_cache_dir)
     self.model = BertModel.from_pretrained(
         self.model_name,
         output_hidden_states=True,
         cache_dir=None if
         (not bconf.bert2_cache_dir) else bconf.bert2_cache_dir)
     zlog(f"Load done, move to default device {BK.DEFAULT_DEVICE}")
     BK.to_device(self.model)
     # =====
     # zero padding embeddings?
     if bconf.bert2_zero_pademb:
         with BK.no_grad_env():
             # todo(warn): specific!!
             zlog(
                 f"Unusual operation: make bert's padding embedding (idx0) zero!!"
             )
             self.model.embeddings.word_embeddings.weight[0].fill_(0.)
     # =====
     # check trainable ones and add parameters
     # todo(+N): this part is specific and reaches into the lib's internals; it can break in future versions!!
     # the idx of layer is [1(embed)] + [N(enc)], that is, layer0 is the output of embeddings
     self.hidden_size = self.model.config.hidden_size
     self.num_bert_layers = len(
         self.model.encoder.layer) + 1  # +1 for embeddings
     self.output_layers = [
         i if i >= 0 else (self.num_bert_layers + i)
         for i in bconf.bert2_output_layers
     ]
     self.layer_is_output = [False] * self.num_bert_layers
     for i in self.output_layers:
         self.layer_is_output[i] = True
     # the highest used layer
     self.output_max_layer = max(
         self.output_layers) if len(self.output_layers) > 0 else -1
     # from max-layer down
     self.trainable_layers = list(range(self.output_max_layer, -1,
                                        -1))[:bconf.bert2_trainable_layers]
     # the lowest trainable layer
     self.trainable_min_layer = min(self.trainable_layers) if len(
         self.trainable_layers) > 0 else (self.output_max_layer + 1)
     zlog(f"Build Berter2: {self}")
     # add parameters
     prefix_name = self.pc.nnc_name(self.name, True) + "/"
     for layer_idx in self.trainable_layers:
         if layer_idx == 0:  # add the embedding layer
             infix_name = "embed"
             named_params = self.pc.param_add_external(
                 prefix_name + infix_name, self.model.embeddings)
         else:
             # here we should use the original (-1) index
             infix_name = "enc" + str(layer_idx)
             named_params = self.pc.param_add_external(
                 prefix_name + infix_name,
                 self.model.encoder.layer[layer_idx - 1])
         # add to self.params
         for one_name, one_param in named_params:
             assert f"{infix_name}_{one_name}" not in self.params
             self.params[f"{infix_name}_{one_name}"] = one_param
     # for dropout/mask input
     self.random_sample_stream = Random.stream(Random.random_sample)
     # =====
     # for other inputs; todo(note): still, 0 means all-zero embedding
     self.other_embeds = [
         self.add_sub_node(
             "OE", Embedding(self.pc,
                             vsize,
                             self.hidden_size,
                             fix_row0=True))
         for vsize in bconf.bert2_other_input_vsizes
     ]
     # =====
     # for output
     if bconf.bert2_output_mode == "layered":
         self.output_f = lambda x: x
         self.output_dims = (
             self.hidden_size,
             len(self.output_layers),
         )
     elif bconf.bert2_output_mode == "concat":
         self.output_f = lambda x: x.view(BK.get_shape(x)[:-2] + [-1]
                                          )  # combine the last two dims
         self.output_dims = (self.hidden_size * len(self.output_layers), )
     elif bconf.bert2_output_mode == "weighted":
         self.output_f = self.add_sub_node(
             "wb", BertFeaturesWeightLayer(pc, len(self.output_layers)))
         self.output_dims = (self.hidden_size, )
     else:
         raise NotImplementedError(
             f"UNK mode for bert2 output: {bconf.bert2_output_mode}")
Example #21
        # in fact, modified in place unless a model-specific preparer wraps it
        return inst


#
def index_stream(in_stream, vpack, cached, cache_shuffle, inst_preparer):
    i_stream = IndexerStreamer(in_stream, vpack, inst_preparer)
    if cached:
        return InstCacher(i_stream, shuffle=cache_shuffle)
    else:
        return i_stream


# for arrange batches:
# todo(warn): batch_size means number of sents, no sorting by #sent
_BS_sample_stream = Random.stream(Random.random_sample)


#
def batch_stream(in_stream, ticonf, training):
    # =====
    def _count_train_sents(d):
        # todo(note): here we count only "hit" sents for training, but the alternative may also be ok, especially for doc-hint?
        valid_sents = [
            x for x in d.sents if x.length < ticonf.train_skip_length
            and x.length >= ticonf.train_min_length
        ]
        return len(valid_sents) - len([
            x for x in valid_sents if len(x.events) == 0
        ]) * ticonf.train_skip_noevt_rate
Example #22
def main(args):
    conf = PsConf()
    conf.update_from_args(args)
    # read the data
    path_train, path_dev, path_test = [
        get_data(z) for z in [conf.train, conf.dev, conf.test]
    ]
    pretrain_file = get_data(conf.pretrain_file)
    train_insts = list(get_data_reader(path_train, "conllu", "", False, ""))
    dev_insts = list(get_data_reader(path_dev, "conllu", "", False, ""))
    test_insts = list(get_data_reader(path_test, "conllu", "", False, ""))
    use_pos = conf.use_pos
    num_pieces = conf.pieces
    max_epoch = conf.max_epoch
    reg_scores_lambda = conf.reg_scores_lambda
    cur_run = conf.cur_run
    zlog(
        f"Read from train/dev/test: {len(train_insts)}/{len(dev_insts)}/{len(test_insts)}, split train into {num_pieces}"
    )
    # others
    RGPU = os.getenv("RGPU", "")
    # first train on all: 1. get dict (only build once), 2: score dev/test
    with Timer("train", "Train-ALL"):
        cur_conf, cur_model = "_conf.all", "_model.all"
        cur_load_model = cur_model + ".best"
        cur_base_opt = get_base_opt(cur_conf, cur_model, use_pos, True,
                                    max_epoch, reg_scores_lambda, cur_run)
        system(get_train_cmd(RGPU, cur_base_opt, path_train, path_dev,
                             path_test, pretrain_file),
               pp=True)
        system(get_score_cmd(RGPU, cur_conf, cur_load_model, path_dev,
                             "dev.scores.pkl"),
               pp=True)
        system(get_score_cmd(RGPU, cur_conf, cur_load_model, path_test,
                             "test.scores.pkl"),
               pp=True)
    # then training on the pieces (leaving one out)
    # first split into pieces
    Random.shuffle(train_insts)
    piece_length = math.ceil(len(train_insts) / num_pieces)
    train_pieces = []
    cur_idx = 0
    while cur_idx < len(train_insts):
        next_idx = min(len(train_insts), cur_idx + piece_length)
        train_pieces.append(train_insts[cur_idx:next_idx])
        cur_idx = next_idx
    zlog(f"Split training into {num_pieces}: {[len(x) for x in train_pieces]}")
    assert len(train_pieces) == num_pieces
    # next train each of the pieces
    for piece_id in range(num_pieces):
        with Timer("train", f"Train-{piece_id}"):
            # get current training pieces
            cur_training_insts = Helper.join_list(
                [train_pieces[x] for x in range(num_pieces) if x != piece_id])
            cur_testing_insts = train_pieces[piece_id]
            # write files
            cur_path_train, cur_path_test = f"tmp.train.{piece_id}.conllu", f"tmp.test.{piece_id}.conllu"
            write_insts(cur_path_train, cur_training_insts)
            write_insts(cur_path_test, cur_testing_insts)
            cur_conf, cur_model = f"_conf.{piece_id}", f"_model.{piece_id}"
            cur_load_model = cur_model + ".best"
            # no build dict, reuse previous
            cur_base_opt = get_base_opt(cur_conf, cur_model, use_pos, False,
                                        max_epoch, reg_scores_lambda, cur_run)
            system(get_train_cmd(RGPU, cur_base_opt, cur_path_train, path_dev,
                                 cur_path_test, pretrain_file),
                   pp=True)
            system(get_score_cmd(RGPU, cur_conf, cur_load_model, cur_path_test,
                                 f"tmp.test.{piece_id}.scores.pkl"),
                   pp=True)
    # finally put them in order
    all_results = []
    for piece_id in range(num_pieces):
        all_results.extend(read_results(f"tmp.test.{piece_id}.scores.pkl"))
    # reorder to the original order
    orig_indexes = [z.inst_idx for z in train_insts]
    orig_results = [None] * len(orig_indexes)
    for new_idx, orig_idx in enumerate(orig_indexes):
        assert orig_results[orig_idx] is None
        orig_results[orig_idx] = all_results[new_idx]
    # saving
    write_results("train.scores.pkl", orig_results)
    zlog("The end.")