def _next(self):
    # buffered read
    if len(self.buckets_) == 0:
        # read into buffer
        while self.buffered_bsize_ < self.k:
            one = self.base_streamer_.next()
            if self.base_streamer_.is_eos(one):
                # todo(+N): this actually does not ensure the end if base_streamer can re-produce things
                break
            # should have checked active here; currently skip this, assuming it is base_streamer's responsibility
            # dump instances (like short or long instances)
            dump_instance = any(f_(one) for f_ in self.dump_detectors)
            if dump_instance:
                continue
            # single instances
            single_instance = any(f_(one) for f_ in self.single_detectors)
            if single_instance:
                # immediately arrange this special one
                return [one]
            # add this instance to buffer
            self.buffer_.append(one)
            self.buffered_bsize_ += self.batch_size_f(one)
        # prepare buffering
        if len(self.buffer_) > 0:
            # sorting
            sorted_buffer = self.buffer_
            if self.sorting_keyer is not None:
                sorted_buffer.sort(key=self.sorting_keyer)  # smallest first
            # prepare buckets
            buckets = []
            tmp_bsize = 0
            tmp_bucket = []
            for one in sorted_buffer:
                tmp_bsize += self.batch_size_f(one)
                tmp_bucket.append(one)
                if tmp_bsize >= self.batch_size:
                    buckets.append(tmp_bucket)
                    tmp_bsize = 0
                    tmp_bucket = []
            if len(tmp_bucket) > 0:
                buckets.append(tmp_bucket)
            # another shuffle?
            if self.shuffling:
                Random.shuffle(buckets, "data")
            else:
                # todo(warn): reverse so that the later pop() keeps sorting-order if sorted, else the original order
                buckets.reverse()
            # clear here
            self.buckets_ = buckets
            self.buffer_ = []
            self.buffered_bsize_ = 0
    # return buckets
    if len(self.buckets_) > 0:
        ret = self.buckets_.pop()
        return ret
    else:
        return None
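# A minimal standalone sketch of the sort-then-bucket-then-pop pattern above
# (hypothetical names, plain Python, not from this codebase): sort by size, cut
# greedily by budget, then reverse so that pop() consumes in sorted order.
def make_buckets(items, budget, key=len):
    sorted_items = sorted(items, key=key)  # smallest first
    buckets, cur, cur_size = [], [], 0
    for one in sorted_items:
        cur.append(one)
        cur_size += key(one)
        if cur_size >= budget:
            buckets.append(cur)
            cur, cur_size = [], 0
    if cur:
        buckets.append(cur)
    buckets.reverse()  # so pop() from the end yields buckets in sorted order
    return buckets

# e.g. make_buckets(["a", "bb", "ccc", "dddd"], budget=4) -> [['dddd'], ['a', 'bb', 'ccc']]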
def main():
    utils.init("zlog", 1234)
    z = StatRecorder(True)
    times = Random.randint(100)
    for _ in range(times):
        with z.go():
            z.record_kv("distr_n", Random.randint(10))
    Helper.printd(z.summary(), "\n")
    #
    cc = Conf0()
    cc.update_from_args(["a:10", "y:www", "z.x:1"])
def __init__(self, ignore_chs_label_mask, fdrop_chs: float, fdrop_par: float):
    self.ignore_chs_label_mask = ignore_chs_label_mask
    if fdrop_chs > 0.:
        self.chs_getter_f = self.get_fchs_dropped
        self.chs_rand_gen = Random.stream_bool(fdrop_chs)
    else:
        self.chs_getter_f = self.get_fchs_base
    if fdrop_par > 0.:
        self.par_getter_f = self.get_fpar_dropped
        self.par_rand_gen = Random.stream_bool(fdrop_par)
    else:
        self.par_getter_f = self.get_fpar_base
def _my_get_params_init(shape, init, lookup):
    # shape is a tuple of dims
    assert init in ["default", "random", "glorot", "ortho", "gaussian", "zeros"], "Unknown init method %s" % init
    poss_scale = COMMON_CONFIG.init_scale_l if lookup else COMMON_CONFIG.init_scale_nl
    if len(shape) == 1:
        # set bias to 0
        return np.zeros((shape[0],))
    else:
        # get defaults
        if init == "default":
            init = COMMON_CONFIG.init_def_l if lookup else COMMON_CONFIG.init_def_nl
        # specifics
        if init == "glorot":
            if lookup:
                # special for lookups: only count fan-out
                shape_g = (shape[-1],)
            else:
                shape_g = shape
            w0 = Random.random_sample(shape, "winit")  # [0,1)
            w0 = (w0 - 0.5) * (2 * (np.sqrt(3.0 * len(shape_g) / (sum(shape_g)))))
            return w0 * poss_scale
        elif init == "random":
            w0 = Random.random_sample(shape, "winit")  # [0,1)
            w0 = (w0 - 0.5) * 2
            return w0 * poss_scale
        elif init == "gaussian":
            w0 = Random.randn_clip(shape, "winit")
            return w0 * poss_scale
        elif init == "ortho":
            # todo(note): always assume square (or stacked-square) matrices for ortho init
            assert len(shape) == 2 and (shape[0] % shape[1] == 0 or shape[1] % shape[0] == 0), f"Bad shape {shape} for ortho_init!"
            orig_num = shape[0] // shape[1]
            if orig_num == 0:
                num = shape[1] // shape[0]
            else:
                num = orig_num
            if num == 1:
                w0 = Random.ortho_weight(shape[1], "winit")
            else:
                w0 = np.concatenate([Random.ortho_weight(shape[1], "winit") for _ in range(num)])
            if orig_num == 0:
                # reverse it!
                w0 = np.transpose(w0)
            return w0 * poss_scale
        elif init == "zeros":
            return np.zeros(shape)
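# For 2-d weights the "glorot" branch above reduces to the usual Glorot-uniform
# bound: (U[0,1) - 0.5) * 2 * sqrt(3*len(shape)/sum(shape)) draws from U(-b, b)
# with b = sqrt(6/(fan_in+fan_out)). A quick standalone check with plain numpy:
import numpy as np

shape = (256, 512)
bound = np.sqrt(3.0 * len(shape) / sum(shape))
assert abs(bound - np.sqrt(6.0 / (shape[0] + shape[1]))) < 1e-12
w0 = (np.random.random_sample(shape) - 0.5) * (2 * bound)  # entries in [-bound, bound)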
def shuffle_children_free(self):
    alpha = self.free_dist_alpha
    if alpha <= 0.:
        for one_list in self.children_list:
            if len(one_list) > 1:
                Random.shuffle(one_list)
    else:
        for i, one_list in enumerate(self.children_list):
            if len(one_list) > 1:
                values = [abs(i - z) * alpha for z in one_list]
                # TODO(+N): is it correct to use Gumbel for ranking?
                logprobs = np.log(MathHelper.softmax(values))
                G = np.random.random_sample(len(logprobs))
                ranking_values = np.log(-np.log(G)) - logprobs
                self.children_list[i] = [one_list[z] for z in np.argsort(ranking_values)]
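# The else-branch above is the Gumbel-top-k trick: sorting log(-log(U)) - logprob
# ascending equals sorting (logprob + Gumbel noise) descending, which samples a
# full ranking without replacement from the softmax weights. A standalone
# illustration (plain numpy, hypothetical values):
import numpy as np

probs = np.array([0.5, 0.3, 0.2])
g = -np.log(-np.log(np.random.random_sample(3)))  # standard Gumbel(0, 1) noise
ranking = np.argsort(-(np.log(probs) + g))        # index 0 ranks first with prob 0.5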
def _input_f_obtain(self, edrop):
    if edrop <= 0.:
        return lambda x: x
    else:
        # todo(warn): sampling with replacement, maybe an efficient but approximate impl; only works for 2d
        edrop_rands = Random.random_sample((int(self.dropout_wordceil * edrop),))  # [0,1)
        edrop_idxes = [int(self.dropout_wordceil * z) for z in edrop_rands]
        edrop_set = set(edrop_idxes)
        # drop to 0 if the idx falls in the set
        return lambda x: [0 if one in edrop_set else one for one in x]
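# The approximation above pre-selects ~V*edrop word types once (sampled with
# replacement, so slightly fewer after dedup) and zeroes every occurrence of them,
# instead of flipping a coin per token. A standalone sketch with made-up sizes:
import numpy as np

V, edrop = 10000, 0.1
drop_set = set((np.random.random_sample(int(V * edrop)) * V).astype(int).tolist())
sent = [5, 123, 9876, 3]
dropped = [0 if w in drop_set else w for w in sent]  # expected drop rate is roughly edrop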
def prepare(self, insts: List[ParseInstance], training):
    conf = self.conf
    word_idxes = [z.words.idxes for z in insts]
    word_arr, input_mask = self.padder.pad(word_idxes)  # [bsize, slen]
    # prepare the masks
    input_word_mask = (Random.random_sample(word_arr.shape) < conf.mask_rate) & (input_mask > 0.)
    input_word_mask &= (word_arr >= conf.min_mask_rank)
    input_word_mask[:, 0] = False  # no masking for the special ROOT
    output_pred_mask = (input_word_mask & (word_arr <= conf.max_pred_rank))
    return input_word_mask.astype(np.float32), output_pred_mask.astype(np.float32), word_arr
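# A small numpy illustration of the rank-gated masking above, assuming (as the
# names suggest) that a smaller word idx means a more frequent or special token;
# all values here are hypothetical:
import numpy as np

word_arr = np.array([[1, 7, 42, 3, 0]])        # idx 0 = padding, position 0 = ROOT
input_mask = np.array([[1., 1., 1., 1., 0.]])
mask_rate, min_mask_rank, max_pred_rank = 0.5, 2, 40
m = (np.random.random_sample(word_arr.shape) < mask_rate) & (input_mask > 0.)
m &= (word_arr >= min_mask_rank)               # skip the most frequent/special idxes
m[:, 0] = False                                # never mask the ROOT position
pred = m & (word_arr <= max_pred_rank)         # only predict not-too-rare words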
def __init__(self, base_streams: List[Streamer], budgets: List, stop_sidx: int):
    super().__init__()
    # -----
    assert len(base_streams) == len(budgets)
    assert len(base_streams) > 0
    self.base_streams = base_streams
    self.budgets = budgets
    self.stop_sidx = stop_sidx
    self.random_sampler = Random.stream(Random.random_sample)
    # status
    self.current_ptr = len(base_streams) - 1
    self.current_budget = 0.
    self.stats = [0 for _ in self.base_streams]
def __init__(self, pc: BK.ParamCollection, bconf: BTConf, vpack: VocabPackage):
    super().__init__(pc, None, None)
    self.bconf = bconf
    # ===== Vocab =====
    self.word_vocab = vpack.get_voc("word")
    self.char_vocab = vpack.get_voc("char")
    self.pos_vocab = vpack.get_voc("pos")
    # ===== Model =====
    # embedding
    self.emb = self.add_sub_node("emb", MyEmbedder(self.pc, bconf.emb_conf, vpack))
    emb_output_dim = self.emb.get_output_dims()[0]
    # encoder0 for jpos
    # todo(note): will do nothing if not use_jpos
    bconf.jpos_conf._input_dim = emb_output_dim
    self.jpos_enc = self.add_sub_node("enc0", JPosModule(self.pc, bconf.jpos_conf, self.pos_vocab))
    enc0_output_dim = self.jpos_enc.get_output_dims()[0]
    # encoder
    # todo(0): feed compute-on-the-fly hp
    bconf.enc_conf._input_dim = enc0_output_dim
    self.enc = self.add_sub_node("enc", MyEncoder(self.pc, bconf.enc_conf))
    self.enc_output_dim = self.enc.get_output_dims()[0]
    # ===== Input Specification =====
    # inputs (word, char, pos) and vocabulary
    self.need_word = self.emb.has_word
    self.need_char = self.emb.has_char
    # todo(warn): currently only allow extra fields for POS
    self.need_pos = False
    if len(self.emb.extra_names) > 0:
        assert len(self.emb.extra_names) == 1 and self.emb.extra_names[0] == "pos"
        self.need_pos = True
    # todo(warn): currently only allow one aux field
    self.need_aux = False
    if len(self.emb.dim_auxes) > 0:
        assert len(self.emb.dim_auxes) == 1
        self.need_aux = True
    #
    self.word_padder = DataPadder(2, pad_vals=self.word_vocab.pad, mask_range=2)
    self.char_padder = DataPadder(3, pad_lens=(0, 0, bconf.char_max_length), pad_vals=self.char_vocab.pad)
    self.pos_padder = DataPadder(2, pad_vals=self.pos_vocab.pad)
    #
    self.random_sample_stream = Random.stream(Random.random_sample)
def __init__(self, conf: M3AIEModelConf, vpack: IEVocabPackage):
    super().__init__(conf, vpack)
    # components
    self.cand_extractor: CandidateExtractor = self.decoders[0]
    self.arg_linker: ArgLinker = self.decoders[1]
    self.span_expander: ArgSpanExpander = self.decoders[2]
    # vocab
    self.hl_arg: HLabelVocab = self.vpack.get_voc("hl_arg")
    # lambdas for training
    self.lambda_cand = ScheduledValue("lambda_cand", conf.tconf.lambda_cand)
    self.lambda_arg = ScheduledValue("lambda_arg", conf.tconf.lambda_arg)
    self.lambda_span = ScheduledValue("lambda_span", conf.tconf.lambda_span)
    self.add_scheduled_values(self.lambda_cand)
    self.add_scheduled_values(self.lambda_arg)
    self.add_scheduled_values(self.lambda_span)
    # others
    self.random_sample_stream = Random.stream(Random.random_sample)
def __init__(self, conf: M3IEModelConf, vpack: IEVocabPackage):
    super().__init__(conf, vpack)
    # components
    self.ef_extractor: MentionExtractor = self.decoders[0]
    self.evt_extractor: MentionExtractor = self.decoders[1]
    self.arg_linker: ArgLinker = self.decoders[2]
    self.span_expander: ArgSpanExpander = self.decoders[3]
    # vocab
    self.hl_ef: HLabelVocab = self.vpack.get_voc("hl_ef")
    self.hl_evt: HLabelVocab = self.vpack.get_voc("hl_evt")
    self.hl_arg: HLabelVocab = self.vpack.get_voc("hl_arg")
    # lambdas for training
    self.lambda_mention_ef = ScheduledValue("lambda_mention_ef", conf.tconf.lambda_mention_ef)
    self.lambda_mention_evt = ScheduledValue("lambda_mention_evt", conf.tconf.lambda_mention_evt)
    self.lambda_affi_ef = ScheduledValue("lambda_affi_ef", conf.tconf.lambda_affi_ef)
    self.lambda_affi_evt = ScheduledValue("lambda_affi_evt", conf.tconf.lambda_affi_evt)
    self.lambda_arg = ScheduledValue("lambda_arg", conf.tconf.lambda_arg)
    self.lambda_span = ScheduledValue("lambda_span", conf.tconf.lambda_span)
    self.add_scheduled_values(self.lambda_mention_ef)
    self.add_scheduled_values(self.lambda_mention_evt)
    self.add_scheduled_values(self.lambda_affi_ef)
    self.add_scheduled_values(self.lambda_affi_evt)
    self.add_scheduled_values(self.lambda_arg)
    self.add_scheduled_values(self.lambda_span)
    #
    self.random_sample_stream = Random.stream(Random.random_sample)
    self.pos_ef_getter_set = set(conf.pos_ef_getter_list)
    #
    if conf.iconf.expand_span_method == "dep":
        self.static_span_expander = SpanExpanderDep()
    elif conf.iconf.expand_span_method == "ext":
        self.static_span_expander = SpanExpanderExternal(conf.iconf.expand_span_ext_file)
    else:
        zlog("No static span expander!")
        self.static_span_expander = None
def filter_pembed(self, wv: WordVectors, init_nohit=0., scale=1.0, assert_all_hit=True, set_init=True):
    if init_nohit <= 0.:
        get_nohit = lambda s: np.zeros((s,), dtype=np.float32)
    else:
        get_nohit = lambda s: (Random.random_sample((s,)).astype(np.float32) - 0.5) * (2 * init_nohit)
    #
    ret = [np.zeros((wv.embed_size,), dtype=np.float32)]  # init NIL as zero
    record = defaultdict(int)
    for ws in self.pools_hint_lexicon[1:]:
        res = np.zeros((wv.embed_size,), dtype=np.float32)
        for w in ws:
            hit, norm_name, norm_w = wv.norm_until_hit(w)
            if hit:
                value = np.asarray(wv.get_vec(norm_w, norm=False), dtype=np.float32)
                record[norm_name] += 1
            else:
                value = get_nohit(wv.embed_size)
                record["no-hit"] += 1
            res += value
        ret.append(res)
    #
    assert not assert_all_hit or record["no-hit"] == 0, f"Filter-embed error: assert all-hit but get no-hit of {record['no-hit']}"
    zlog(f"Filter pre-trained Pembed: {record}, no-hit is inited with {init_nohit}.")
    ret = np.asarray(ret, dtype=np.float32) * scale
    if set_init:
        self.set_pool_init(ret)
    return ret
def filter_embed(self, wv: 'WordVectors', init_nohit=0., scale=1.0, assert_all_hit=False):
    if init_nohit <= 0.:
        get_nohit = lambda s: np.zeros((s,), dtype=np.float32)
    else:
        get_nohit = lambda s: (Random.random_sample((s,)).astype(np.float32) - 0.5) * (2 * init_nohit)
    #
    ret = []
    res = defaultdict(int)
    for w in self.final_words:
        hit, norm_name, norm_w = wv.norm_until_hit(w)
        if hit:
            value = np.asarray(wv.get_vec(norm_w, norm=False), dtype=np.float32)
            res[norm_name] += 1
        else:
            value = get_nohit(wv.embed_size)
            # value = np.zeros((wv.embed_size,), dtype=np.float32)
            res["no-hit"] += 1
        ret.append(value)
    #
    if assert_all_hit:
        zcheck(res["no-hit"] == 0, f"Filter-embed error: assert all-hit but get no-hit of {res['no-hit']}")
    printing("Filter pre-trained embed: %s, no-hit is inited with %s." % (res, init_nohit))
    return np.asarray(ret, dtype=np.float32) * scale
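# In both filter functions above, the no-hit fallback draws each dimension from
# U(-init_nohit, +init_nohit). A trivial standalone check with plain numpy:
import numpy as np

init_nohit = 0.2
v = (np.random.random_sample((300,)).astype(np.float32) - 0.5) * (2 * init_nohit)
assert np.all(np.abs(v) <= init_nohit)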
def __init__(self, pc: BK.ParamCollection, bconf: BTConf, tconf: 'BaseTrainingConf', vpack: VocabPackage):
    super().__init__(pc, None, None)
    self.bconf = bconf
    # ===== Vocab =====
    self.word_vocab = vpack.get_voc("word")
    self.char_vocab = vpack.get_voc("char")
    self.lemma_vocab = vpack.get_voc("lemma")
    self.upos_vocab = vpack.get_voc("upos")
    self.ulabel_vocab = vpack.get_voc("ulabel")
    # ===== Model =====
    # embedding
    self.emb = self.add_sub_node("emb", MyEmbedder(self.pc, bconf.emb_conf, vpack))
    emb_output_dim = self.emb.get_output_dims()[0]
    self.emb_output_dim = emb_output_dim
    # doc hint
    self.use_doc_hint = bconf.use_doc_hint
    self.dh_combine_method = bconf.dh_combine_method
    if self.use_doc_hint:
        assert len(bconf.emb_conf.dim_auxes) > 0
        # todo(note): currently use the concat of them if input multiple layers
        bconf.dh_conf._input_dim = bconf.emb_conf.dim_auxes[0]  # same as input bert dim
        bconf.dh_conf._output_dim = emb_output_dim  # same as emb_output_dim
        self.dh_node = self.add_sub_node("dh", DocHintModule(pc, bconf.dh_conf))
    else:
        self.dh_node = None
    # encoders
    # shared
    # todo(note): feed compute-on-the-fly hp
    bconf.enc_conf._input_dim = emb_output_dim
    self.enc = self.add_sub_node("enc", MyEncoder(self.pc, bconf.enc_conf))
    tmp_enc_output_dim = self.enc.get_output_dims()[0]
    # privates
    bconf.enc_ef_conf._input_dim = tmp_enc_output_dim
    self.enc_ef = self.add_sub_node("enc_ef", MyEncoder(self.pc, bconf.enc_ef_conf))
    self.enc_ef_output_dim = self.enc_ef.get_output_dims()[0]
    bconf.enc_evt_conf._input_dim = tmp_enc_output_dim
    self.enc_evt = self.add_sub_node("enc_evt", MyEncoder(self.pc, bconf.enc_evt_conf))
    self.enc_evt_output_dim = self.enc_evt.get_output_dims()[0]
    # ===== Input Specification =====
    # inputs (word, lemma, char, upos, ulabel) and vocabulary
    self.need_word = self.emb.has_word
    self.need_char = self.emb.has_char
    # extra fields
    # todo(warn): need to
    self.need_lemma = False
    self.need_upos = False
    self.need_ulabel = False
    for one_extra_name in self.emb.extra_names:
        if one_extra_name == "lemma":
            self.need_lemma = True
        elif one_extra_name == "upos":
            self.need_upos = True
        elif one_extra_name == "ulabel":
            self.need_ulabel = True
        else:
            raise NotImplementedError("UNK extra input name: " + one_extra_name)
    # todo(warn): currently only allow one aux field
    self.need_aux = False
    if len(self.emb.dim_auxes) > 0:
        assert len(self.emb.dim_auxes) == 1
        self.need_aux = True
    # padders
    self.word_padder = DataPadder(2, pad_vals=self.word_vocab.pad, mask_range=2)
    self.char_padder = DataPadder(3, pad_lens=(0, 0, bconf.char_max_length), pad_vals=self.char_vocab.pad)
    self.lemma_padder = DataPadder(2, pad_vals=self.lemma_vocab.pad)
    self.upos_padder = DataPadder(2, pad_vals=self.upos_vocab.pad)
    self.ulabel_padder = DataPadder(2, pad_vals=self.ulabel_vocab.pad)
    #
    self.random_sample_stream = Random.stream(Random.random_sample)
    self.train_skip_noevt_rate = tconf.train_skip_noevt_rate
    self.train_skip_length = tconf.train_skip_length
    self.train_min_length = tconf.train_min_length
    self.test_min_length = tconf.test_min_length
    self.test_skip_noevt_rate = tconf.test_skip_noevt_rate
    self.train_sent_based = tconf.train_sent_based
    #
    assert not self.train_sent_based, "The basic model should not use this sent-level mode!"
def __init__(self, conf: MtlMlmModelConf, vpack: VocabPackage):
    super().__init__(conf)
    # for easier checking
    self.word_vocab = vpack.get_voc("word")
    # components
    self.embedder = self.add_node("emb", EmbedderNode(self.pc, conf.emb_conf, vpack))
    self.inputter = Inputter(self.embedder, vpack)  # not a node
    self.emb_out_dim = self.embedder.get_output_dims()[0]
    self.enc_attn_count = conf.default_attn_count
    if conf.enc_choice == "vrec":
        self.encoder = self.add_component("enc", VRecEncoder(self.pc, self.emb_out_dim, conf.venc_conf))
        self.enc_attn_count = self.encoder.attn_count
    elif conf.enc_choice == "original":
        conf.oenc_conf._input_dim = self.emb_out_dim
        self.encoder = self.add_node("enc", MyEncoder(self.pc, conf.oenc_conf))
    else:
        raise NotImplementedError()
    zlog(f"Finished building model's encoder {self.encoder}, all size is {self.encoder.count_allsize_parameters()}")
    self.enc_out_dim = self.encoder.get_output_dims()[0]
    # --
    conf.rprep_conf._rprep_vr_conf.matt_conf.head_count = self.enc_attn_count  # make head-count agree
    self.rpreper = self.add_node("rprep", RPrepNode(self.pc, self.enc_out_dim, conf.rprep_conf))
    # --
    self.lambda_agree = self.add_scheduled_value(ScheduledValue(f"agr:lambda", conf.lambda_agree))
    self.agree_loss_f = EntropyHelper.get_method(conf.agree_loss_f)
    # --
    self.masklm = self.add_component("mlm", MaskLMNode(self.pc, self.enc_out_dim, conf.mlm_conf, self.inputter))
    self.plainlm = self.add_component("plm", PlainLMNode(self.pc, self.enc_out_dim, conf.plm_conf, self.inputter))
    # todo(note): here we use attn as dim_pair; do not use pair if not using vrec!!
    self.orderpr = self.add_component("orp", OrderPredNode(
        self.pc, self.enc_out_dim, self.enc_attn_count, conf.orp_conf, self.inputter))
    # =====
    # pre-training pre-load point!!
    if conf.load_pretrain_model_name:
        zlog(f"At preload_pretrain point: Loading from {conf.load_pretrain_model_name}")
        self.pc.load(conf.load_pretrain_model_name, strict=False)
    # =====
    self.dpar = self.add_component("dpar", DparG1Decoder(
        self.pc, self.enc_out_dim, self.enc_attn_count, conf.dpar_conf, self.inputter))
    self.upos = self.add_component("upos", SeqLabNode(
        self.pc, "pos", self.enc_out_dim, self.conf.upos_conf, self.inputter))
    if conf.do_ner:
        if conf.ner_use_crf:
            self.ner = self.add_component("ner", SeqCrfNode(
                self.pc, "ner", self.enc_out_dim, self.conf.ner_conf, self.inputter))
        else:
            self.ner = self.add_component("ner", SeqLabNode(
                self.pc, "ner", self.enc_out_dim, self.conf.ner_conf, self.inputter))
    else:
        self.ner = None
    # for pairwise reprs (no trainable params here!)
    self.rel_dist_embed = self.add_node("oremb", PosiEmbedding2(self.pc, n_dim=self.enc_attn_count, max_val=100))
    self._prepr_f_attn_sum = lambda cache, rdist: BK.stack(cache.list_attn, 0).sum(0) if (len(cache.list_attn)) > 0 else None
    self._prepr_f_attn_avg = lambda cache, rdist: BK.stack(cache.list_attn, 0).mean(0) if (len(cache.list_attn)) > 0 else None
    self._prepr_f_attn_max = lambda cache, rdist: BK.stack(cache.list_attn, 0).max(0)[0] if (len(cache.list_attn)) > 0 else None
    self._prepr_f_attn_last = lambda cache, rdist: cache.list_attn[-1] if (len(cache.list_attn)) > 0 else None
    self._prepr_f_rdist = lambda cache, rdist: self._get_rel_dist_embed(rdist, False)
    self._prepr_f_rdist_abs = lambda cache, rdist: self._get_rel_dist_embed(rdist, True)
    self.prepr_f = getattr(self, "_prepr_f_" + conf.prepr_choice)  # shortcut
    # --
    self.testing_rand_gen = Random.create_sep_generator(conf.testing_rand_gen_seed)  # special gen for testing
    # =====
    if conf.orp_loss_special:
        self.orderpr.add_node_special(self.masklm)
    # =====
    # extra one!!
    self.aug_word2 = self.aug_encoder = self.aug_mixturer = None
    if conf.aug_word2:
        self.aug_word2 = self.add_node("aug2", AugWord2Node(self.pc, conf.emb_conf, vpack,
                                                            "word2", conf.aug_word2_dim, self.emb_out_dim))
        if conf.aug_word2_aug_encoder:
            assert conf.enc_choice == "vrec"
            self.aug_detach_drop = self.add_node("dd", Dropout(self.pc, (self.enc_out_dim,),
                                                               fix_rate=conf.aug_detach_dropout))
            self.aug_encoder = self.add_component("Aenc", VRecEncoder(self.pc, self.emb_out_dim, conf.venc_conf))
            self.aug_mixturer = self.add_node("Amix", BertFeaturesWeightLayer(self.pc, conf.aug_detach_numlayer))
class ParseInstance(Instance):
    #
    ROOT_SYMBOL = VocabHelper.convert_special_pattern("r")

    def __init__(self, words, poses=None, heads=None, labels=None, code=""):
        super().__init__()
        # todo(0): always include the special ROOT symbol
        _tmp_root_list = [ParseInstance.ROOT_SYMBOL]
        self.code = code
        if code:
            aug_words = get_aug_words(words, code)
        else:
            aug_words = words
        self.words = SeqFactor(_tmp_root_list + aug_words)
        self.chars = InputCharFactor([""] + words)  # empty pad chars
        #
        if poses is not None:
            poses = _tmp_root_list + poses
        if heads is not None:
            heads = [0] + heads
        if labels is not None:
            labels = _tmp_root_list + labels
        self.poses = SeqFactor(poses)
        self.heads = SeqFactor(heads)
        self.labels = SeqFactor(labels)  # todo(warn): for td, no processing, directly use 0 as padding!!
        # =====
        # for top-down parsing, but deprecated now
        self.children_mask_arr: np.ndarray = None  # [N, N]
        # self.children_se: List[Set] = None
        self.children_list: List[List] = None  # list of children
        self.descendant_list: List[List] = None  # list of all descendants
        #
        self.free_dist_alpha: float = None
        # =====
        # other helpful info (calculated on need)
        self._unprojs = None
        self._sibs = None
        self._gps = None
        # all children should be arranged l2r
        self._children_left = None
        self._children_right = None
        self._children_all = None
        # predictions (optional prediction probs)
        self.pred_poses = SeqFactor(None)
        self.pred_pos_scores = SeqFactor(None)
        self.pred_heads = SeqFactor(None)
        self.pred_labels = SeqFactor(None)
        self.pred_par_scores = SeqFactor(None)
        self.pred_miscs = SeqFactor(None)
        # for real length
        self.length = InstanceHelper.check_equal_length([self.words, self.chars, self.poses, self.heads, self.labels]) - 1
        # extra inputs, for example, those from mbert
        self.extra_features = {"aux_repr": None}
        # extra preds info
        self.extra_pred_misc = {}

    def __len__(self):
        return self.length

    # =====
    # helpful functions

    @staticmethod
    def get_children(heads):
        cur_len = len(heads)
        children_left, children_right = [[] for _ in range(cur_len)], [[] for _ in range(cur_len)]
        for i in range(1, cur_len):
            h = heads[i]
            if i < h:
                children_left[h].append(i)
            else:
                children_right[h].append(i)
        return children_left, children_right

    @staticmethod
    def get_sibs(children_left, children_right):
        # get sibling list: sided nearest sibling (self if single)
        sibs = [-1] * len(children_left)
        for vs in children_left:
            if len(vs) > 0:
                prev_s = vs[-1]
                for cur_m in reversed(vs):
                    sibs[cur_m] = prev_s
                    prev_s = cur_m
        for vs in children_right:
            if len(vs) > 0:
                prev_s = vs[0]
                for cur_m in vs:
                    sibs[cur_m] = prev_s
                    prev_s = cur_m
        # todo(+2): how about 0? currently set to 0
        sibs[0] = 0
        return sibs

    @staticmethod
    def get_gps(heads):
        # get grandparent list
        # todo(+2): how about 0? currently will be 0
        return [heads[x] for x in heads]

    # =====
    # other properties

    @property
    def unprojs(self):
        # todo(warn): calculate the unproj situations: 0 for a proj edge, 1 for an unproj edge
        if self._unprojs is None:
            self._unprojs = [0] + ConlluParse.calc_crossed(self.heads.vals[1:])
        return self._unprojs

    @property
    def sibs(self):
        if self._sibs is None:
            self._sibs = ParseInstance.get_sibs(self.children_left, self.children_right)
        return self._sibs

    @property
    def gps(self):
        if self._gps is None:
            heads = self.heads.vals
            self._gps = ParseInstance.get_gps(heads)
        return self._gps

    @property
    def children_left(self):
        if self._children_left is None:
            heads = self.heads.vals
            self._children_left, self._children_right = ParseInstance.get_children(heads)
        return self._children_left

    @property
    def children_right(self):
        if self._children_right is None:
            heads = self.heads.vals
            self._children_left, self._children_right = ParseInstance.get_children(heads)
        return self._children_right

    @property
    def children_all(self):
        if self._children_all is None:
            self._children_all = [a + b for a, b in zip(self.children_left, self.children_right)]
        return self._children_all

    # =====
    # special processing for training

    # for h-local-loss
    def get_children_mask_arr(self, add_self_if_leaf=True):
        if self.children_mask_arr is None:
            # build on need
            heads = self.heads.vals
            the_len = len(heads)
            masks = np.zeros([the_len, the_len], dtype=np.float32)
            # exclude root
            for m, h in enumerate(heads[1:], 1):
                masks[h, m] = 1.  # this one is [h, m]
            if add_self_if_leaf:
                for i in range(the_len):
                    if sum(masks[i]) == 0.:
                        masks[i, i] = 1.
            self.children_mask_arr = masks
        return self.children_mask_arr

    # set once when reading!
    def set_children_info(self, oracle_strategy, label_ranking_dict: Dict = None, free_dist_alpha: float = 0.):
        heads = self.heads.vals
        the_len = len(heads)
        # self.children_set = [set() for _ in range(the_len)]
        self.children_list = [[] for _ in range(the_len)]
        tmp_descendant_list = [None for _ in range(the_len)]
        # exclude root
        for m, h in enumerate(heads[1:], 1):
            # self.children_set[h].add(m)
            self.children_list[h].append(m)  # l2r order
        # re-arrange list order (left -> right)
        if oracle_strategy == "i2o":
            for h in range(the_len):
                self.children_list[h].sort(key=lambda x: -x if x < h else x)
        elif oracle_strategy == "label":
            # todo(warn): only use the first level!
            level0_labels = [z.split(":")[0] for z in self.labels.vals]
            for h in range(the_len):
                self.children_list[h].sort(key=lambda x: label_ranking_dict[level0_labels[x]])
        elif oracle_strategy == "n2f":
            self.shuffle_children_n2f()
        elif oracle_strategy == "free":
            self.free_dist_alpha = free_dist_alpha
            self.shuffle_children_free()
        else:
            assert oracle_strategy == "l2r"
        # todo(+N): does the order of the descendant list matter?
        # todo(+N): depth-first or breadth-first? (currently the latter)
        # recursively build the descendant list
        # =====
        def _recursive_add(cur_n):
            cur_children = self.children_list[cur_n]  # List[int]
            for i in cur_children:
                _recursive_add(i)
            new_dlist = [cur_children]
            cur_layer = 0
            while True:
                another_layer = Helper.join_list(
                    tmp_descendant_list[i][cur_layer] if cur_layer < len(tmp_descendant_list[i]) else []
                    for i in cur_children)
                if len(another_layer) == 0:
                    break
                new_dlist.append(another_layer)
                cur_layer += 1
            tmp_descendant_list[cur_n] = new_dlist
        # =====
        _recursive_add(0)
        self.descendant_list = [Helper.join_list(tmp_descendant_list[i]) for i in range(the_len)]

    # =====
    # todo(warn): do not shuffle the descendant list since this can disturb the depth-order

    # shuffle once before each run for free mode
    def shuffle_children_free(self):
        alpha = self.free_dist_alpha
        if alpha <= 0.:
            for one_list in self.children_list:
                if len(one_list) > 1:
                    Random.shuffle(one_list)
        else:
            for i, one_list in enumerate(self.children_list):
                if len(one_list) > 1:
                    values = [abs(i - z) * alpha for z in one_list]
                    # TODO(+N): is it correct to use Gumbel for ranking?
                    logprobs = np.log(MathHelper.softmax(values))
                    G = np.random.random_sample(len(logprobs))
                    ranking_values = np.log(-np.log(G)) - logprobs
                    self.children_list[i] = [one_list[z] for z in np.argsort(ranking_values)]

    INST_RAND = Random.stream(Random.random_sample)

    # shuffle for n2f mode
    def shuffle_children_n2f(self):
        rr = ParseInstance.INST_RAND
        for i, one_list in enumerate(self.children_list):
            if len(one_list) > 1:
                # todo(warn): use a small random value to break ties
                values = [abs(i - z) + next(rr) for z in one_list]
                self.children_list[i] = [one_list[z] for z in np.argsort(values)]

    # =====
    # todo(warn): exclude the artificial root node
    def get_real_values_select(self, selections):
        ret = []
        for name in selections:
            zv = getattr(self, name)
            if zv.has_vals():
                ret.append(zv.vals[1:])
            else:
                ret.append(None)
        return ret

    def get_real_values_all(self):
        ret = {}
        for zn, zv in vars(self).items():
            if isinstance(zv, SeqFactor):
                if zv.has_vals():
                    ret[zn] = zv.vals[1:]
                else:
                    ret[zn] = None
        return ret
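# A tiny standalone check of the breadth-first descendant layering built in
# set_children_info (plain lists, no dependence on this codebase):
heads = [0, 0, 1, 1, 3]                        # node 0 is the artificial root
children = [[] for _ in heads]
for m, h in enumerate(heads[1:], 1):
    children[h].append(m)                      # -> [[1], [2, 3], [], [4], []]
layers, frontier = [], children[0]
while frontier:
    layers.append(frontier)
    frontier = [c for n in frontier for c in children[n]]
assert layers == [[1], [2, 3], [4]]            # layer-by-layer, matching _recursive_add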
def run(self, train_stream, dev_streams):
    rconf = self.rconf
    last_report_uidx, last_dev_uidx = 0, 0
    if rconf.validate_first:
        self._validate(dev_streams)
    # =====
    # for lrate warmup and annealing
    if rconf.lrate_warmup < 0:
        # calculate epochs
        steps_per_epoch = 0
        for _ in train_stream:
            steps_per_epoch += 1
        n_epoch = -rconf.lrate_warmup
        n_steps = n_epoch * steps_per_epoch
        utils.zlog(f"Calculating warmup steps for {n_epoch} epochs: {steps_per_epoch} steps per epoch.")
    elif rconf.lrate_warmup > 0:
        n_steps = rconf.lrate_warmup
    else:
        n_steps = 0
    max_lrate = self.lrate.value
    # final_lrate = lrate * anneal_factor * (step)^lrate_anneal_alpha
    # and we want final_lrate(n_steps) == max_lrate
    lrate_anneal_alpha = rconf.lrate_anneal_alpha
    if n_steps > 0:
        anneal_factor = 1. / (n_steps ** lrate_anneal_alpha)
    else:
        anneal_factor = 1.
    self.lrate_warmup_steps = n_steps
    utils.zlog(f"For lrate-warmup, will go with the first {n_steps} steps up to {max_lrate}, "
               f"then anneal with lrate*{anneal_factor}*step^{lrate_anneal_alpha}")
    # =====
    while not self._finished():
        # todo(note): epoch starts from 1!!
        self._tp.eidx += 1
        with Timer(tag="Train-Iter", info="Iter %s" % self._tp.eidx, print_date=True) as et:
            self._adjust_scheduled_values()  # adjust at the start of each epoch
            act_lrate = 0.
            # for batches
            for insts in train_stream:
                # skip this batch?
                if Random.random_bool(rconf.skip_batch):
                    continue
                # train on batch; possibly split the batch to save memory
                self._fb_batch(insts)
                self._tp.uidx += 1
                # get the effective lrate
                act_lrate = self.lrate.value
                if self._tp.uidx < self.lrate_warmup_steps:
                    act_lrate *= (self._tp.uidx / self.lrate_warmup_steps)
                else:
                    act_lrate *= anneal_factor * (self._tp.uidx ** lrate_anneal_alpha)
                #
                self._run_update(act_lrate, 1.)
                # report on the training process
                if rconf.flag_verbose and (self._tp.uidx - last_report_uidx) >= rconf.report_freq:
                    utils.zlog(f"Current act_lrate is {act_lrate}.")
                    self._run_train_report()
                    last_report_uidx = self._tp.uidx
                # time for validating
                if (self._tp.uidx - last_dev_uidx) >= rconf.valid_freq:
                    self._validate(dev_streams)
                    last_dev_uidx = self._tp.uidx
                    last_report_uidx = self._tp.uidx
                    # todo(+N): do we need to adjust sv at a finer grain?
                    self._adjust_scheduled_values()  # adjust after uidx validation
                    if self._finished():
                        break
            # validate at the end of the epoch?
            utils.zlog(f"End of epoch: Current act_lrate is {act_lrate}.")
            if rconf.validate_epoch:
                self._validate(dev_streams)
                last_dev_uidx = self._tp.uidx
                last_report_uidx = self._tp.uidx
            utils.zlog("")
    utils.zlog("zzzzzfinal: After training, the best point is: %s." % (str(self._tp.info_save_best())))
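# The schedule above, factored out: linear warmup to max_lrate over n_steps, then
# power-law decay; anneal_factor = n_steps**(-alpha) makes the two pieces meet at
# uidx == n_steps. A standalone sketch (helper name is hypothetical; alpha is
# expected to be negative for decay):
def act_lrate_at(uidx, max_lrate, n_steps, alpha):
    if uidx < n_steps:
        return max_lrate * uidx / n_steps                       # warmup
    return max_lrate * (n_steps ** (-alpha)) * (uidx ** alpha)  # anneal

# e.g. with max_lrate=1e-3, n_steps=1000, alpha=-0.5:
# act_lrate_at(500, ...) == 5e-4, act_lrate_at(1000, ...) == 1e-3, act_lrate_at(4000, ...) == 5e-4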
def reset(self):
    self.ptr = 0
    if self.shuffle:
        Random.shuffle(self.c, "data")
def get_random_bool_streamer(true_rate, batch_size=1024):
    return RandomStreamer(lambda size: Random.random_bool(true_rate, size, "data"), batch_size)
def __init__(self, pc: BK.ParamCollection, bconf: Berter2Conf):
    super().__init__(pc, None, None)
    self.bconf = bconf
    self.model_name = bconf.bert2_model
    zlog(f"Loading pre-trained bert model for Berter2 of {self.model_name}")
    # Load pretrained model/tokenizer
    self.tokenizer = BertTokenizer.from_pretrained(
        self.model_name, do_lower_case=bconf.bert2_lower_case,
        cache_dir=None if (not bconf.bert2_cache_dir) else bconf.bert2_cache_dir)
    self.model = BertModel.from_pretrained(
        self.model_name, output_hidden_states=True,
        cache_dir=None if (not bconf.bert2_cache_dir) else bconf.bert2_cache_dir)
    zlog(f"Load done, move to default device {BK.DEFAULT_DEVICE}")
    BK.to_device(self.model)
    # =====
    # zero padding embeddings?
    if bconf.bert2_zero_pademb:
        with BK.no_grad_env():
            # todo(warn): specific!!
            zlog(f"Unusual operation: make bert's padding embedding (idx0) zero!!")
            self.model.embeddings.word_embeddings.weight[0].fill_(0.)
    # =====
    # check trainable ones and add parameters
    # todo(+N): this part is specific and looks into the lib; it can break in future versions!!
    # the layer idxes are [1(embed)] + [N(enc)], that is, layer0 is the output of the embeddings
    self.hidden_size = self.model.config.hidden_size
    self.num_bert_layers = len(self.model.encoder.layer) + 1  # +1 for embeddings
    self.output_layers = [i if i >= 0 else (self.num_bert_layers + i) for i in bconf.bert2_output_layers]
    self.layer_is_output = [False] * self.num_bert_layers
    for i in self.output_layers:
        self.layer_is_output[i] = True
    # the highest used layer
    self.output_max_layer = max(self.output_layers) if len(self.output_layers) > 0 else -1
    # from max-layer down
    self.trainable_layers = list(range(self.output_max_layer, -1, -1))[:bconf.bert2_trainable_layers]
    # the lowest trainable layer
    self.trainable_min_layer = min(self.trainable_layers) if len(self.trainable_layers) > 0 else (self.output_max_layer + 1)
    zlog(f"Build Berter2: {self}")
    # add parameters
    prefix_name = self.pc.nnc_name(self.name, True) + "/"
    for layer_idx in self.trainable_layers:
        if layer_idx == 0:
            # add the embedding layer
            infix_name = "embed"
            named_params = self.pc.param_add_external(prefix_name + infix_name, self.model.embeddings)
        else:
            # here we should use the original (-1) index
            infix_name = "enc" + str(layer_idx)
            named_params = self.pc.param_add_external(prefix_name + infix_name, self.model.encoder.layer[layer_idx - 1])
        # add to self.params
        for one_name, one_param in named_params:
            assert f"{infix_name}_{one_name}" not in self.params
            self.params[f"{infix_name}_{one_name}"] = one_param
    # for dropout/mask input
    self.random_sample_stream = Random.stream(Random.random_sample)
    # =====
    # for other inputs; todo(note): still, 0 means the all-zero embedding
    self.other_embeds = [
        self.add_sub_node("OE", Embedding(self.pc, vsize, self.hidden_size, fix_row0=True))
        for vsize in bconf.bert2_other_input_vsizes]
    # =====
    # for output
    if bconf.bert2_output_mode == "layered":
        self.output_f = lambda x: x
        self.output_dims = (self.hidden_size, len(self.output_layers),)
    elif bconf.bert2_output_mode == "concat":
        self.output_f = lambda x: x.view(BK.get_shape(x)[:-2] + [-1])  # combine the last two dims
        self.output_dims = (self.hidden_size * len(self.output_layers),)
    elif bconf.bert2_output_mode == "weighted":
        self.output_f = self.add_sub_node("wb", BertFeaturesWeightLayer(pc, len(self.output_layers)))
        self.output_dims = (self.hidden_size,)
    else:
        raise NotImplementedError(f"UNK mode for bert2 output: {bconf.bert2_output_mode}")
    # in fact, in-place if not wrapping a model-specific preparer
    return inst

#
def index_stream(in_stream, vpack, cached, cache_shuffle, inst_preparer):
    i_stream = IndexerStreamer(in_stream, vpack, inst_preparer)
    if cached:
        return InstCacher(i_stream, shuffle=cache_shuffle)
    else:
        return i_stream

# for arranging batches:
# todo(warn): batch_size means the number of sents; no sorting by #sent
_BS_sample_stream = Random.stream(Random.random_sample)

#
def batch_stream(in_stream, ticonf, training):
    # =====
    def _count_train_sents(d):
        # todo(note): here, we include only "hit" sents for training, but maybe the alternative can also be ok, especially for doc-hint?
        valid_sents = [x for x in d.sents if x.length < ticonf.train_skip_length and x.length >= ticonf.train_min_length]
        return len(valid_sents) - len([x for x in valid_sents if len(x.events) == 0]) * ticonf.train_skip_noevt_rate
def main(args):
    conf = PsConf()
    conf.update_from_args(args)
    # read the data
    path_train, path_dev, path_test = [get_data(z) for z in [conf.train, conf.dev, conf.test]]
    pretrain_file = get_data(conf.pretrain_file)
    train_insts = list(get_data_reader(path_train, "conllu", "", False, ""))
    dev_insts = list(get_data_reader(path_dev, "conllu", "", False, ""))
    test_insts = list(get_data_reader(path_test, "conllu", "", False, ""))
    use_pos = conf.use_pos
    num_pieces = conf.pieces
    max_epoch = conf.max_epoch
    reg_scores_lambda = conf.reg_scores_lambda
    cur_run = conf.cur_run
    zlog(f"Read from train/dev/test: {len(train_insts)}/{len(dev_insts)}/{len(test_insts)}, split train into {num_pieces}")
    # others
    RGPU = os.getenv("RGPU", "")
    # first train on all: 1. get dict (only build once), 2. score dev/test
    with Timer("train", "Train-ALL"):
        cur_conf, cur_model = "_conf.all", "_model.all"
        cur_load_model = cur_model + ".best"
        cur_base_opt = get_base_opt(cur_conf, cur_model, use_pos, True, max_epoch, reg_scores_lambda, cur_run)
        system(get_train_cmd(RGPU, cur_base_opt, path_train, path_dev, path_test, pretrain_file), pp=True)
        system(get_score_cmd(RGPU, cur_conf, cur_load_model, path_dev, "dev.scores.pkl"), pp=True)
        system(get_score_cmd(RGPU, cur_conf, cur_load_model, path_test, "test.scores.pkl"), pp=True)
    # then train on the pieces (leaving one out)
    # first split into pieces
    Random.shuffle(train_insts)
    piece_length = math.ceil(len(train_insts) / num_pieces)
    train_pieces = []
    cur_idx = 0
    while cur_idx < len(train_insts):
        next_idx = min(len(train_insts), cur_idx + piece_length)
        train_pieces.append(train_insts[cur_idx:next_idx])
        cur_idx = next_idx
    zlog(f"Split training into {num_pieces}: {[len(x) for x in train_pieces]}")
    assert len(train_pieces) == num_pieces
    # next train on each of the pieces
    for piece_id in range(num_pieces):
        with Timer("train", f"Train-{piece_id}"):
            # get the current training pieces
            cur_training_insts = Helper.join_list([train_pieces[x] for x in range(num_pieces) if x != piece_id])
            cur_testing_insts = train_pieces[piece_id]
            # write files
            cur_path_train, cur_path_test = f"tmp.train.{piece_id}.conllu", f"tmp.test.{piece_id}.conllu"
            write_insts(cur_path_train, cur_training_insts)
            write_insts(cur_path_test, cur_testing_insts)
            cur_conf, cur_model = f"_conf.{piece_id}", f"_model.{piece_id}"
            cur_load_model = cur_model + ".best"
            # no dict building, reuse the previous one
            cur_base_opt = get_base_opt(cur_conf, cur_model, use_pos, False, max_epoch, reg_scores_lambda, cur_run)
            system(get_train_cmd(RGPU, cur_base_opt, cur_path_train, path_dev, cur_path_test, pretrain_file), pp=True)
            system(get_score_cmd(RGPU, cur_conf, cur_load_model, cur_path_test, f"tmp.test.{piece_id}.scores.pkl"), pp=True)
    # finally put them back in order
    all_results = []
    for piece_id in range(num_pieces):
        all_results.extend(read_results(f"tmp.test.{piece_id}.scores.pkl"))
    # reorder to the original order
    orig_indexes = [z.inst_idx for z in train_insts]
    orig_results = [None] * len(orig_indexes)
    for new_idx, orig_idx in enumerate(orig_indexes):
        assert orig_results[orig_idx] is None
        orig_results[orig_idx] = all_results[new_idx]
    # saving
    write_results("train.scores.pkl", orig_results)
    zlog("The end.")