def __init__(self):
    # embedding and encoding layer
    # self.emb_conf = EmbedConf().init_from_kwargs(dim_word=300, dim_char=30, dim_extras='[300,50]',
    #                                              extra_names='["lemma","upos"]', emb_proj_dim=512)
    # first only use word and char
    self.emb_conf = EmbedConf().init_from_kwargs(dim_word=300, dim_char=30)
    # doc hint?
    self.use_doc_hint = False
    self.dh_conf = DocHintConf()
    self.dh_combine_method = "both"  # how to combine features from dh: add=adding, both=appending at both ends, cls=replace-i0
    # shared encoder and private encoders
    self.enc_conf = EncConf().init_from_kwargs(enc_rnn_type="lstm2", enc_hidden=512, enc_rnn_layer=1)
    self.enc_ef_conf = EncConf().init_from_kwargs(enc_rnn_type="lstm2", enc_hidden=512, enc_rnn_layer=0)
    self.enc_evt_conf = EncConf().init_from_kwargs(enc_rnn_type="lstm2", enc_hidden=512, enc_rnn_layer=0)
    # stop certain gradients: detach inputs for the specific encoders?
    self.enc_ef_input_detach = False
    self.enc_evt_input_detach = False
    # =====
    # other options
    # inputs
    self.char_max_length = 45
    # dropouts
    self.drop_embed = 0.33
    self.dropmd_embed = 0.
    self.drop_hidden = 0.33
    self.gdrop_rnn = 0.33  # gdrop (always fixed for recurrent connections)
    self.idrop_rnn = 0.33  # idrop for rnn
    self.fix_drop = True  # fix drop for one run for each dropout
    self.singleton_unk = 0.5  # replace singleton words with UNK when training
    self.singleton_thr = 2  # only replace a singleton if freq(val) <= this (also decay with 1/freq)
    # =====
    # how to batch and deal with long sentences in a doc
    # todo(note): decide to encode long sentences by themselves rather than split and merge
    self.enc_bucket_range = 10  # bucket the sentences (but needs multiple calculations); also not the best choice since it is not smart at detecting splitting/bucketing points

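# Hedged usage sketch (not part of the original code): the sub-confs above are all built with the
# init_from_kwargs pattern, so callers can swap in different settings. Only classes and keyword
# names already visible in this listing are used; these particular values are illustrative.
_emb_conf_alt = EmbedConf().init_from_kwargs(dim_word=100, dim_char=0)  # smaller word dim, no char CNN
_enc_conf_alt = EncConf().init_from_kwargs(enc_rnn_type="lstm2", enc_hidden=256, enc_rnn_layer=2)  # deeper but narrower shared encoder
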
def __init__(self):
    # embedding
    self.emb_conf = EmbedConf().init_from_kwargs(dim_word=0)  # by default no embedding inputs
    # bert
    self.bert_conf = Berter2Conf().init_from_kwargs(bert2_retinc_cls=True, bert2_training_mask_rate=0., bert2_output_mode="concat")
    # middle layer to reduce dim
    self.middle_dim = 0  # 0 means no middle one
    # encoder
    self.enc_conf = EncConf().init_from_kwargs(enc_rnn_type="lstm2", enc_hidden=1024, enc_rnn_layer=0)
    # =====
    # other options
    # inputs
    self.char_max_length = 45
    # dropouts
    self.drop_embed = 0.33
    self.dropmd_embed = 0.
    self.drop_hidden = 0.33
    self.gdrop_rnn = 0.33  # gdrop (always fixed for recurrent connections)
    self.idrop_rnn = 0.33  # idrop for rnn
    self.fix_drop = True  # fix drop for one run for each dropout

def __init__(self):
    # dimension specifications
    self._input_dim = 0  # dim of sent-embed CLS
    self._output_dim = 0  # dim of sent-encoder's input
    # 1. doc encoding
    self.enc_doc_conf = EncConf().init_from_kwargs(enc_rnn_type="lstm2", enc_hidden=512, enc_rnn_layer=0)
    # 2. doc hints
    # keyword model
    self.kconf = KeyWordConf()
    self.katt_conf = AttConf().init_from_kwargs(d_kqv=128, head_count=2, out_act="elu")
    # keyword part
    self.use_keyword = True
    self.keyword_min_count = 2  # in-doc freq should be >= this
    self.keyword_min_rank = 100  # global rank should be >= this
    self.num_keyword_general = 3
    self.num_keyword_noun = 3
    self.num_keyword_verb = 3
    # keysent part
    self.use_keysent = True
    self.num_keysent_first = 3
    self.num_keysent_top = 3
    self.keysent_min_len = 1  # only include sents whose len >= this
    self.keysent_topktok_score = 10  # how many topk tokens to include when calculating top-scored sents

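# Illustrative sketch only (hypothetical helper, not from the original code): how the two keyword
# thresholds configured above are meant to combine, per their comments. Interpretation: a candidate
# must be frequent enough inside the doc, and its global frequency rank must be at least the
# threshold (i.e. it is not among the globally most common words).
def _keep_keyword_sketch(in_doc_freq, global_rank, conf):
    return in_doc_freq >= conf.keyword_min_count and global_rank >= conf.keyword_min_rank
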
def __init__(self):
    super().__init__()
    # by default no encoders
    self.enc_conf.enc_rnn_layer = 0
    # =====
    self.m2e_use_basic = False  # use basic encoder (at least embeddings)
    self.m2e_use_basic_dep = False  # use the special dep features as a replacement of basic_plus
    self.m2e_use_basic_plus = True  # use basic encoder with the new mode
    self.bert_conf = Berter2Conf()
    self.ms_extend_step = 0  # multi-sent extending for each side (win=2*mse+1) for encoding
    self.ms_extend_budget = 510  # overall subword budget, do not make it too large
    self.benc_bucket_msize = Constants.INT_PRAC_MAX  # max bsize for one bucket
    self.benc_bucket_range = 10  # similar to enc_bucket_range, but for bert enc
    # extra encoder over bert?
    self.m3_enc_conf = EncConf().init_from_kwargs(enc_rnn_type="lstm2", enc_hidden=512, enc_rnn_layer=0)
    # simple dep based basic part
    self.dep_output_dim = 512
    self.dep_label_dim = 50
    # =====
    self.bert_use_center_typeids = True  # current (center) sent 1, others 0
    self.bert_use_special_typeids = False  # pred as 0
    # other inputs for bert
    self.bert_other_inputs = []  # names of factors in Sentence (direct field name like 'uposes', 'entity_labels', ...)

def __init__(self):
    self._input_dim = -1  # to be filled
    self.jpos_multitask = False  # the overall switch for multitask
    self.jpos_lambda = 0.  # lambda (switch) for training: loss_parsing + lambda*loss_pos
    self.jpos_decode = True  # switch for decoding
    self.jpos_enc = EncConf().init_from_kwargs(enc_rnn_type="lstm2", enc_hidden=1024, enc_rnn_layer=0)
    self.jpos_stacking = True  # stacking as inputs (using adding here)

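# Minimal sketch (hypothetical helper name and arguments, not from the original code) of the loss
# combination described by the jpos_lambda comment above; it only restates loss_parsing + lambda*loss_pos.
def _combine_jpos_loss_sketch(loss_parsing, loss_pos, conf):
    if conf.jpos_multitask and conf.jpos_lambda > 0.:
        return loss_parsing + conf.jpos_lambda * loss_pos
    return loss_parsing
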
def __init__(self):
    super().__init__()
    self._input_dim = 0
    self._lexi_dim = 0
    # the first-pass selector
    self.use_selector = False
    self.sel_conf = NodeSelectorConf()
    # the specific encoder
    # basic modeling modes: 1) model specifically for each cand position (DMXNN); 2) one general encoder
    self.dmxnn = False
    # specific to dmxnn mode
    self.posi_dim = 5  # dimension for PF
    self.posi_cut = 20  # [-cut, cut]
    # encoder
    self.e_enc = EncConf().init_from_kwargs(enc_hidden=300, enc_cnn_layer=0, enc_cnn_windows='[3]', enc_rnn_layer=0, no_final_dropout=True)
    # before labeler
    self.use_lab_f = True
    self.lab_f_use_lexi = False  # also include lexi repr for lab_f
    self.lab_f_act = "elu"
    # whether to exclude nil
    self.exclude_nil = True
    # selection for training (only if use_selector=False)
    self.train_ng_ratio = 1.  # neg samples' ratio to golds' count
    self.train_min_num = 1.  # min number per sentence
    self.train_min_rate = 0.5  # a min selecting rate for neg sentences
    self.train_min_rate_s2 = 0.5  # sampling for the secondary type, within gold_mask1
    # correct labels in the training return
    self.train_gold_corr = True
    # loss weights for NodeSelector and NodeExtractor
    self.lambda_ns = 0.5  # if use_selector=True
    self.lambda_ne = 1.
    self.lambda_ne2 = 1.  # secondary type
    # secondary type
    self.use_secondary_type = False  # the second type for each trigger
    self.sectype_reuse_hl = False  # use the same predictor?
    self.sectype_t2ift1 = True  # only output t2 if there is a t1
    self.sectype_noback_enc = False  # stop grad to enc

def __init__(self):
    # embedding and encoding layer
    self.emb_conf = EmbedConf().init_from_kwargs(dim_word=300, dim_char=30, dim_extras='[50]', extra_names='["pos"]')
    self.enc_conf = EncConf().init_from_kwargs(enc_rnn_type="lstm2", enc_hidden=1024, enc_rnn_layer=3)
    # joint pos encoder (layer0)
    self.jpos_conf = JPosConf()
    # =====
    # other options
    # inputs
    self.char_max_length = 45
    # dropouts
    self.drop_embed = 0.33
    self.dropmd_embed = 0.
    self.drop_hidden = 0.33
    self.gdrop_rnn = 0.33  # gdrop (always fixed for recurrent connections)
    self.idrop_rnn = 0.33  # idrop for rnn
    self.fix_drop = True  # fix drop for one run for each dropout
    self.singleton_unk = 0.5  # replace singleton words with UNK when training
    self.singleton_thr = 2  # only replace singleton if freq(val) <= this (also decay with 1/freq)

def __init__(self):
    super().__init__()
    # components
    self.emb_conf = EmbedderNodeConf()
    self.mlm_conf = MaskLMNodeConf()
    self.plm_conf = PlainLMNodeConf()
    self.orp_conf = OrderPredNodeConf()
    self.orp_loss_special = False  # special loss for orp
    # non-pretraining parts
    self.dpar_conf = DparG1DecoderConf()
    self.upos_conf = SeqLabNodeConf()
    self.ner_conf = SeqCrfNodeConf()
    self.do_ner = False  # an extra flag
    self.ner_use_crf = True  # whether to use CRF for NER?
    # where pairwise repr comes from
    self.default_attn_count = 128  # by default, setting this to what?
    self.prepr_choice = "attn_max"  # rdist, rdist_abs, attn_sum, attn_avg, attn_max, attn_last
    # which encoder to use
    self.enc_choice = "vrec"  # by default, use the new one
    self.venc_conf = VRecEncoderConf()
    self.oenc_conf = EncConf().init_from_kwargs(enc_hidden=512, enc_rnn_layer=0)  # another small encoder
    self.rprep_conf = RPrepConf()
    # agreement module
    self.lambda_agree = SVConf().init_from_kwargs(val=0., which_idx="eidx", mode="none")
    self.agree_loss_f = "js"
    # separate generator seed especially for testing mask
    self.testing_rand_gen_seed = 0
    self.testing_get_attns = False
    # todo(+N): should make this into another conf
    # for training and testing
    self.no_build_dict = False
    # -- length thresh
    self.train_max_length = 80
    self.train_min_length = 5
    self.test_single_length = 80
    # -- batch size
    self.train_shuffle = True
    self.train_batch_size = 80
    self.train_inst_copy = 1  # how many times to copy the training insts (for different masks)
    self.test_batch_size = 32
    # -- maxibatch for BatchArranger
    self.train_maxibatch_size = 20  # cache this number of batches in BatchArranger
    self.test_maxibatch_size = -1  # -1 means cache all
    # -- batch_size_f
    self.train_batch_on_len = False  # whether to use sent length as the budget rather than sent count
    self.test_batch_on_len = False
    # -----
    self.train_preload_model = False
    self.train_preload_process = False
    # interactive testing
    self.test_interactive = False  # special mode
    # load pre-training model?
    self.load_pretrain_model_name = ""
    # -----
    # special mode with extra word2
    # todo(+N): ugly patch!
    self.aug_word2 = False
    self.aug_word2_dim = 300
    self.aug_word2_pretrain = ""
    self.aug_word2_save_dir = ""
    self.aug_word2_aug_encoder = True  # special model's special mode: feature-based original encoder, with another one stacked on top!!
    self.aug_detach_ratio = 0.5
    self.aug_detach_dropout = 0.33
    self.aug_detach_numlayer = 6  # use a mixture of how many last layers?

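# Hedged sketch (hypothetical function, not from the original code) of the "-- batch_size_f" switch
# above: when train_batch_on_len is on, a sentence's cost against train_batch_size is its length
# instead of a flat count of 1.
def _batch_size_f_sketch(sent, on_len):
    return len(sent) if on_len else 1
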
def main():
    np.random.seed(1234)
    NUM_POS = 10
    # build vocabs
    reader = TextReader("./test_utils.py")
    vb_word = VocabBuilder("w")
    vb_char = VocabBuilder("c")
    for one in reader:
        vb_word.feed_stream(one.tokens)
        vb_char.feed_stream((c for w in one.tokens for c in w))
    voc_word = vb_word.finish()
    voc_char = vb_char.finish()
    voc_pos = VocabBuilder.build_from_stream(range(NUM_POS), name="pos")
    vpack = VocabPackage({"word": voc_word, "char": voc_char, "pos": voc_pos}, {"word": None})
    # build model
    pc = BK.ParamCollection()
    conf_emb = EmbedConf().init_from_kwargs(init_words_from_pretrain=False, dim_char=10, dim_posi=10, emb_proj_dim=400, dim_extras="50", extra_names="pos")
    conf_emb.do_validate()
    mod_emb = MyEmbedder(pc, conf_emb, vpack)
    conf_enc = EncConf().init_from_kwargs(enc_rnn_type="lstm2", enc_cnn_layer=1, enc_att_layer=1)
    conf_enc._input_dim = mod_emb.get_output_dims()[0]
    mod_enc = MyEncoder(pc, conf_enc)
    enc_output_dim = mod_enc.get_output_dims()[0]
    mod_scorer = BiAffineScorer(pc, enc_output_dim, enc_output_dim, 10)
    # build data
    word_padder = DataPadder(2, pad_lens=(0, 50), mask_range=2)
    char_padder = DataPadder(3, pad_lens=(0, 50, 20))
    word_idxes = []
    char_idxes = []
    pos_idxes = []
    for toks in reader:
        one_words = []
        one_chars = []
        for w in toks.tokens:
            one_words.append(voc_word.get_else_unk(w))
            one_chars.append([voc_char.get_else_unk(c) for c in w])
        word_idxes.append(one_words)
        char_idxes.append(one_chars)
        pos_idxes.append(np.random.randint(voc_pos.trg_len(), size=len(one_words)) + 1)  # pred->trg
    word_arr, word_mask_arr = word_padder.pad(word_idxes)
    pos_arr, _ = word_padder.pad(pos_idxes)
    char_arr, _ = char_padder.pad(char_idxes)
    #
    # run
    rop = layers.RefreshOptions(hdrop=0.2, gdrop=0.2, fix_drop=True)
    for _ in range(5):
        mod_emb.refresh(rop)
        mod_enc.refresh(rop)
        mod_scorer.refresh(rop)
        #
        expr_emb = mod_emb(word_arr, char_arr, [pos_arr])
        zlog(BK.get_shape(expr_emb))
        expr_enc = mod_enc(expr_emb, word_mask_arr)
        zlog(BK.get_shape(expr_enc))
        #
        mask_expr = BK.input_real(word_mask_arr)
        score0 = mod_scorer.paired_score(expr_enc, expr_enc, mask_expr, mask_expr)
        score1 = mod_scorer.plain_score(expr_enc.unsqueeze(-2), expr_enc.unsqueeze(-3), mask_expr.unsqueeze(-1), mask_expr.unsqueeze(-2))
        #
        zmiss = float(BK.avg(score0 - score1))
        assert zmiss < 0.0001
    zlog("OK")
    pass
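
# Entry-point guard (added as an assumption; the original listing shows no such guard) so that the
# smoke test above can be run directly as a script.
if __name__ == "__main__":
    main()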