Example 1
 def __init__(self):
     # embedding and encoding layer
     # self.emb_conf = EmbedConf().init_from_kwargs(dim_word=300, dim_char=30, dim_extras='[300,50]',
     #                                              extra_names='["lemma","upos"]', emb_proj_dim=512)
     # first only use word and char
     self.emb_conf = EmbedConf().init_from_kwargs(dim_word=300, dim_char=30)
     # doc hint?
     self.use_doc_hint = False
     self.dh_conf = DocHintConf()
     self.dh_combine_method = "both"  # how to combine doc-hint features: add=add them together, both=append at both ends, cls=replace the index-0 (CLS) position
     # shared encoder and private encoders
     self.enc_conf = EncConf().init_from_kwargs(enc_rnn_type="lstm2", enc_hidden=512, enc_rnn_layer=1)
     self.enc_ef_conf = EncConf().init_from_kwargs(enc_rnn_type="lstm2", enc_hidden=512, enc_rnn_layer=0)
     self.enc_evt_conf = EncConf().init_from_kwargs(enc_rnn_type="lstm2", enc_hidden=512, enc_rnn_layer=0)
     # stop certain gradients, detach input for the specific encoders?
     self.enc_ef_input_detach = False
     self.enc_evt_input_detach = False
     # =====
     # other options
     # inputs
     self.char_max_length = 45
     # dropouts
     self.drop_embed = 0.33
     self.dropmd_embed = 0.
     self.drop_hidden = 0.33
     self.gdrop_rnn = 0.33            # gdrop (always fixed for recurrent connections)
     self.idrop_rnn = 0.33            # idrop for rnn
     self.fix_drop = True            # fix drop for one run for each dropout
     self.singleton_unk = 0.5        # replace singleton words with UNK when training
     self.singleton_thr = 2          # only replace singleton if freq(val) <= this (also decay with 1/freq)
     # =====
     # how to batch and deal with long sentences in doc
     # todo(note): decided to encode long sentences by themselves rather than splitting and merging
     self.enc_bucket_range = 10  # bucket the sentences (though this requires multiple calculations); also not the best choice, since it is not smart about detecting splitting/bucketing points
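
Every configuration class in these examples builds its sub-configurations with an init_from_kwargs call that overrides default attribute values from keyword arguments. Below is a minimal, hypothetical sketch of that pattern, for illustration only; the library's real helper likely also handles string-encoded lists such as dim_extras='[300,50]'.

# Hypothetical sketch of the init_from_kwargs pattern used by the Conf classes above;
# not the library's actual implementation.
class BaseConfSketch:
    def init_from_kwargs(self, **kwargs):
        for k, v in kwargs.items():
            if not hasattr(self, k):
                raise KeyError(f"unknown config field: {k}")
            setattr(self, k, v)
        return self  # returning self lets construction and overriding be chained

class EmbedConfSketch(BaseConfSketch):
    def __init__(self):
        self.dim_word = 100
        self.dim_char = 0
        self.emb_proj_dim = 0

# usage mirroring Example 1: only override the fields that differ from the defaults
emb_conf = EmbedConfSketch().init_from_kwargs(dim_word=300, dim_char=30)
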
Example 2
 def __init__(self):
     # embedding
     self.emb_conf = EmbedConf().init_from_kwargs(
         dim_word=0)  # by default no embedding inputs
     # bert
     self.bert_conf = Berter2Conf().init_from_kwargs(
         bert2_retinc_cls=True,
         bert2_training_mask_rate=0.,
         bert2_output_mode="concat")
     # middle layer to reduce dim
     self.middle_dim = 0  # 0 means no middle one
     # encoder
     self.enc_conf = EncConf().init_from_kwargs(enc_rnn_type="lstm2",
                                                enc_hidden=1024,
                                                enc_rnn_layer=0)
     # =====
     # other options
     # inputs
     self.char_max_length = 45
     # dropouts
     self.drop_embed = 0.33
     self.dropmd_embed = 0.
     self.drop_hidden = 0.33
     self.gdrop_rnn = 0.33  # gdrop (always fixed for recurrent connections)
     self.idrop_rnn = 0.33  # idrop for rnn
     self.fix_drop = True  # fix drop for one run for each dropout
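
middle_dim above controls an optional dimensionality-reducing layer between the concatenated BERT output and the encoder, with 0 meaning no such layer. A plausible sketch of how that kind of switch is typically wired, shown here with plain PyTorch modules and hypothetical names rather than the library's own:

import torch.nn as nn

class MiddleLayerSketch(nn.Module):
    # Hypothetical sketch: optional projection controlled by middle_dim (0 = pass-through).
    def __init__(self, input_dim: int, middle_dim: int):
        super().__init__()
        self.proj = nn.Linear(input_dim, middle_dim) if middle_dim > 0 else nn.Identity()
        self.output_dim = middle_dim if middle_dim > 0 else input_dim

    def forward(self, x):
        return self.proj(x)
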
Example 3
 def __init__(self):
     # dimension specifications
     self._input_dim = 0  # dim of sent-embed CLS
     self._output_dim = 0  # dim of sent-encoder's input
     # 1. doc encoding
     self.enc_doc_conf = EncConf().init_from_kwargs(enc_rnn_type="lstm2",
                                                    enc_hidden=512,
                                                    enc_rnn_layer=0)
     # 2. doc hints
     # keyword model
     self.kconf = KeyWordConf()
     self.katt_conf = AttConf().init_from_kwargs(d_kqv=128,
                                                 head_count=2,
                                                 out_act="elu")
     # keyword part
     self.use_keyword = True
     self.keyword_min_count = 2  # in-doc frequency should be >= this
     self.keyword_min_rank = 100  # global rank should be >= this
     self.num_keyword_general = 3
     self.num_keyword_noun = 3
     self.num_keyword_verb = 3
     # keysent part
     self.use_keysent = True
     self.num_keysent_first = 3
     self.num_keysent_top = 3
     self.keysent_min_len = 1  # only include sentences whose length is >= this
     self.keysent_topktok_score = 10  # how many top-k tokens to include when computing the top-scored sentences
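
The keyword options above say that a candidate keyword must appear at least keyword_min_count times in the document and have a global rank of at least keyword_min_rank (i.e. not be among the most common words overall), after which only the top few are kept. A hedged sketch of that filtering follows; the helper name and the choice to rank candidates by in-document frequency are assumptions, not the library's code.

from collections import Counter

def select_keywords_sketch(doc_tokens, global_rank, min_count=2, min_rank=100, num_keep=3):
    # Hypothetical: keep tokens frequent enough in the document but not among the
    # globally most frequent ones, then take the top-N by in-doc frequency.
    counts = Counter(doc_tokens)
    cands = [w for w, c in counts.items()
             if c >= min_count and global_rank.get(w, min_rank) >= min_rank]
    cands.sort(key=lambda w: counts[w], reverse=True)  # assumption: rank by in-doc frequency
    return cands[:num_keep]
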
Example 4
 def __init__(self):
     super().__init__()
     # by default no encoders
     self.enc_conf.enc_rnn_layer = 0
     # =====
     self.m2e_use_basic = False  # use basic encoder (at least embeddings)
     self.m2e_use_basic_dep = False  # use the special dep features as a replacement for basic_plus
     self.m2e_use_basic_plus = True  # use basic encoder with the new mode
     self.bert_conf = Berter2Conf()
     self.ms_extend_step = 0  # multi-sentence extension on each side (window = 2*ms_extend_step+1) for encoding
     self.ms_extend_budget = 510  # overall subword budget, do not make it too large
     self.benc_bucket_msize = Constants.INT_PRAC_MAX  # max bsize for one bucket
     self.benc_bucket_range = 10  # similar to enc_bucket_range, but for bert enc
     # extra encoder over bert?
     self.m3_enc_conf = EncConf().init_from_kwargs(enc_rnn_type="lstm2",
                                                   enc_hidden=512,
                                                   enc_rnn_layer=0)
     # simple dep based basic part
     self.dep_output_dim = 512
     self.dep_label_dim = 50
     # =====
     self.bert_use_center_typeids = True  # current (center) sentence gets type-id 1, others 0
     self.bert_use_special_typeids = False  # pred as 0
     # other inputs for bert
     self.bert_other_inputs = []  # names of factors in Sentence (direct field names like 'uposes', 'entity_labels', ...)
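
ms_extend_step and ms_extend_budget above control a multi-sentence context window: up to ms_extend_step sentences are taken on each side of the center sentence (a window of 2*step+1), subject to an overall subword budget. A hedged sketch of one way such a window could be grown, as a hypothetical helper rather than the library's code:

def extend_window_sketch(sent_subword_lens, center, step=0, budget=510):
    # Hypothetical helper: widen the window by up to `step` sentences on each side,
    # but stop growing a side once adding it would exceed the subword budget.
    left, right = center, center
    total = sent_subword_lens[center]
    for _ in range(step):
        if left - 1 >= 0 and total + sent_subword_lens[left - 1] <= budget:
            left -= 1
            total += sent_subword_lens[left]
        if right + 1 < len(sent_subword_lens) and total + sent_subword_lens[right + 1] <= budget:
            right += 1
            total += sent_subword_lens[right]
    return left, right  # inclusive range of sentence indices to encode together
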
Example 5
 def __init__(self):
     self._input_dim = -1  # to be filled
     self.jpos_multitask = False  # the overall switch for multitask
     self.jpos_lambda = 0.  # lambda(switch) for training: loss_parsing + lambda*loss_pos
     self.jpos_decode = True  # switch for decoding
     self.jpos_enc = EncConf().init_from_kwargs(enc_rnn_type="lstm2",
                                                enc_hidden=1024,
                                                enc_rnn_layer=0)
     self.jpos_stacking = True  # stacking as inputs (using adding here)
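
The comment on jpos_lambda spells out the training objective: the parsing loss plus lambda times the POS loss when the multitask switch is on. As a direct, minimal illustration (the function name is hypothetical):

def joint_loss_sketch(loss_parsing, loss_pos, jpos_multitask=False, jpos_lambda=0.0):
    # loss = loss_parsing + jpos_lambda * loss_pos, only when multitasking is enabled
    if jpos_multitask and jpos_lambda > 0.0:
        return loss_parsing + jpos_lambda * loss_pos
    return loss_parsing
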
Example 6
 def __init__(self):
     super().__init__()
     self._input_dim = 0
     self._lexi_dim = 0
     # the first pass selector
     self.use_selector = False
     self.sel_conf = NodeSelectorConf()
     # the specific encoder
     # basic modeling modes: 1) a specific model for each candidate position (DMXNN); 2) one general encoder
     self.dmxnn = False
     # specific to dmxnn mode
     self.posi_dim = 5  # dimension for PF
     self.posi_cut = 20  # [-cut, cut]
     # encoder
     self.e_enc = EncConf().init_from_kwargs(enc_hidden=300,
                                             enc_cnn_layer=0,
                                             enc_cnn_windows='[3]',
                                             enc_rnn_layer=0,
                                             no_final_dropout=True)
     # before labeler
     self.use_lab_f = True
     self.lab_f_use_lexi = False  # also include lexi repr for lab_f
     self.lab_f_act = "elu"
     # whether exclude nil
     self.exclude_nil = True
     # selection for training (only if use_selector=False)
     self.train_ng_ratio = 1.  # ratio of negative samples to the gold count
     self.train_min_num = 1.  # min number per sentence
     self.train_min_rate = 0.5  # a min selecting rate for neg sentences
     self.train_min_rate_s2 = 0.5  # sampling for the secondary type, within gold_mask1
     # correct labels for training return
     self.train_gold_corr = True
     # loss weights for NodeSelector and NodeExtractor
     self.lambda_ns = 0.5  # if use_selector=True
     self.lambda_ne = 1.
     self.lambda_ne2 = 1.  # secondary type
     # secondary type
     self.use_secondary_type = False  # the second type for each trigger
     self.sectype_reuse_hl = False  # use the same predictor?
     self.sectype_t2ift1 = True  # only output t2 if there is a t1
     self.sectype_noback_enc = False  # stop grad to enc
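
When use_selector=False, the train_ng_ratio / train_min_num / train_min_rate options above govern how many negative candidate positions are sampled for training. Their exact interaction is not shown here, so the numpy sketch below is only one plausible reading; the helper is hypothetical, and treating train_min_rate as a floor for sentences with no gold at all is an assumption.

import numpy as np

def neg_sample_sketch(gold_mask, ng_ratio=1.0, min_num=1.0, min_rate=0.5, rng=None):
    # gold_mask: numpy array with 1.0 at gold positions of one sentence, 0.0 elsewhere.
    rng = rng or np.random.default_rng()
    num_cand, num_gold = len(gold_mask), int(gold_mask.sum())
    # target number of negatives: ratio * #gold, at least min_num,
    # and (assumption) at least min_rate * #candidates when the sentence has no gold
    target = max(ng_ratio * num_gold, min_num,
                 min_rate * num_cand if num_gold == 0 else 0.0)
    neg_idx = np.where(gold_mask == 0)[0]
    keep = min(int(round(target)), len(neg_idx))
    chosen = rng.choice(neg_idx, size=keep, replace=False) if keep > 0 else np.array([], dtype=int)
    train_mask = gold_mask.copy()
    train_mask[chosen] = 1.0  # positions that receive a loss: gold plus sampled negatives
    return train_mask
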
Example 7
 def __init__(self):
     # embedding and encoding layer
     self.emb_conf = EmbedConf().init_from_kwargs(dim_word=300,
                                                  dim_char=30,
                                                  dim_extras='[50]',
                                                  extra_names='["pos"]')
     self.enc_conf = EncConf().init_from_kwargs(enc_rnn_type="lstm2",
                                                enc_hidden=1024,
                                                enc_rnn_layer=3)
     # joint pos encoder (layer0)
     self.jpos_conf = JPosConf()
     # =====
     # other options
     # inputs
     self.char_max_length = 45
     # dropouts
     self.drop_embed = 0.33
     self.dropmd_embed = 0.
     self.drop_hidden = 0.33
     self.gdrop_rnn = 0.33  # gdrop (always fixed for recurrent connections)
     self.idrop_rnn = 0.33  # idrop for rnn
     self.fix_drop = True  # fix drop for one run for each dropout
     self.singleton_unk = 0.5  # replace singleton words with UNK when training
     self.singleton_thr = 2  # only replace singleton if freq(val) <= this (also decay with 1/freq)
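
singleton_unk and singleton_thr above implement the usual rare-word UNK dropout: low-frequency training words are sometimes replaced by UNK, with the probability decaying as 1/freq per the comment. A small hedged sketch of that rule, with a hypothetical helper name:

import random

def maybe_replace_with_unk(word_idx, freq, unk_idx, singleton_unk=0.5, singleton_thr=2):
    # Replace rare words (freq <= singleton_thr) with UNK, with probability
    # singleton_unk / freq, i.e. the "decay with 1/freq" mentioned in the comment.
    if 0 < freq <= singleton_thr and random.random() < singleton_unk / freq:
        return unk_idx
    return word_idx
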
Example 8
 def __init__(self):
     super().__init__()
     # components
     self.emb_conf = EmbedderNodeConf()
     self.mlm_conf = MaskLMNodeConf()
     self.plm_conf = PlainLMNodeConf()
     self.orp_conf = OrderPredNodeConf()
     self.orp_loss_special = False  # special loss for orp
     # non-pretraining parts
     self.dpar_conf = DparG1DecoderConf()
     self.upos_conf = SeqLabNodeConf()
     self.ner_conf = SeqCrfNodeConf()
     self.do_ner = False  # an extra flag
     self.ner_use_crf = True  # whether to use CRF for NER
     # where pairwise repr comes from
     self.default_attn_count = 128  # by default, setting this to what?
     self.prepr_choice = "attn_max"  # rdist, rdist_abs, attn_sum, attn_avg, attn_max, attn_last
     # which encoder to use
     self.enc_choice = "vrec"  # by default, use the new one
     self.venc_conf = VRecEncoderConf()
     self.oenc_conf = EncConf().init_from_kwargs(enc_hidden=512, enc_rnn_layer=0)
     # another small encoder
     self.rprep_conf = RPrepConf()
     # agreement module
     self.lambda_agree = SVConf().init_from_kwargs(val=0., which_idx="eidx", mode="none")
     self.agree_loss_f = "js"
     # separate generator seed especially for testing mask
     self.testing_rand_gen_seed = 0
     self.testing_get_attns = False
     # todo(+N): should make this into another conf
     # for training and testing
     self.no_build_dict = False
     # -- length thresh
     self.train_max_length = 80
     self.train_min_length = 5
     self.test_single_length = 80
     # -- batch size
     self.train_shuffle = True
     self.train_batch_size = 80
     self.train_inst_copy = 1  # how many times to copy the training insts (for different masks)
     self.test_batch_size = 32
     # -- maxibatch for BatchArranger
     self.train_maxibatch_size = 20  # cache for this number of batches in BatchArranger
     self.test_maxibatch_size = -1  # -1 means cache all
     # -- batch_size_f
     self.train_batch_on_len = False  # whether to use sentence length as the budget rather than sentence count
     self.test_batch_on_len = False
     # -----
     self.train_preload_model = False
     self.train_preload_process = False
     # interactive testing
     self.test_interactive = False  # special mode
     # load pre-training model?
     self.load_pretrain_model_name = ""
     # -----
     # special mode with extra word2
     # todo(+N): ugly patch!
     self.aug_word2 = False
     self.aug_word2_dim = 300
     self.aug_word2_pretrain = ""
     self.aug_word2_save_dir = ""
     self.aug_word2_aug_encoder = True  # special mode: keep the original encoder feature-based (frozen) and stack another encoder on top!
     self.aug_detach_ratio = 0.5
     self.aug_detach_dropout = 0.33
     self.aug_detach_numlayer = 6  # use a mixture of how many of the last layers?
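
train_batch_on_len above switches the batch-size budget from a sentence count to a length-based budget. A hedged sketch of that kind of batching, assuming simple greedy packing and a hypothetical helper name:

def make_batches_sketch(sents, batch_size=80, on_len=False):
    # If on_len is True, batch_size is treated as a token budget; otherwise as a sentence count.
    batches, cur, cur_cost = [], [], 0
    for s in sents:
        cost = len(s) if on_len else 1
        if cur and cur_cost + cost > batch_size:
            batches.append(cur)
            cur, cur_cost = [], 0
        cur.append(s)
        cur_cost += cost
    if cur:
        batches.append(cur)
    return batches
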
Example 9
def main():
    np.random.seed(1234)
    NUM_POS = 10
    # build vocabs
    reader = TextReader("./test_utils.py")
    vb_word = VocabBuilder("w")
    vb_char = VocabBuilder("c")
    for one in reader:
        vb_word.feed_stream(one.tokens)
        vb_char.feed_stream((c for w in one.tokens for c in w))
    voc_word = vb_word.finish()
    voc_char = vb_char.finish()
    voc_pos = VocabBuilder.build_from_stream(range(NUM_POS), name="pos")
    vpack = VocabPackage({
        "word": voc_word,
        "char": voc_char,
        "pos": voc_pos
    }, {"word": None})
    # build model
    pc = BK.ParamCollection()
    conf_emb = EmbedConf().init_from_kwargs(init_words_from_pretrain=False,
                                            dim_char=10,
                                            dim_posi=10,
                                            emb_proj_dim=400,
                                            dim_extras="50",
                                            extra_names="pos")
    conf_emb.do_validate()
    mod_emb = MyEmbedder(pc, conf_emb, vpack)
    conf_enc = EncConf().init_from_kwargs(enc_rnn_type="lstm2",
                                          enc_cnn_layer=1,
                                          enc_att_layer=1)
    conf_enc._input_dim = mod_emb.get_output_dims()[0]
    mod_enc = MyEncoder(pc, conf_enc)
    enc_output_dim = mod_enc.get_output_dims()[0]
    mod_scorer = BiAffineScorer(pc, enc_output_dim, enc_output_dim, 10)
    # build data
    word_padder = DataPadder(2, pad_lens=(0, 50), mask_range=2)
    char_padder = DataPadder(3, pad_lens=(0, 50, 20))
    word_idxes = []
    char_idxes = []
    pos_idxes = []
    for toks in reader:
        one_words = []
        one_chars = []
        for w in toks.tokens:
            one_words.append(voc_word.get_else_unk(w))
            one_chars.append([voc_char.get_else_unk(c) for c in w])
        word_idxes.append(one_words)
        char_idxes.append(one_chars)
        pos_idxes.append(np.random.randint(voc_pos.trg_len(), size=len(one_words)) + 1)  # pred->trg
    word_arr, word_mask_arr = word_padder.pad(word_idxes)
    pos_arr, _ = word_padder.pad(pos_idxes)
    char_arr, _ = char_padder.pad(char_idxes)
    #
    # run
    rop = layers.RefreshOptions(hdrop=0.2, gdrop=0.2, fix_drop=True)
    for _ in range(5):
        mod_emb.refresh(rop)
        mod_enc.refresh(rop)
        mod_scorer.refresh(rop)
        #
        expr_emb = mod_emb(word_arr, char_arr, [pos_arr])
        zlog(BK.get_shape(expr_emb))
        expr_enc = mod_enc(expr_emb, word_mask_arr)
        zlog(BK.get_shape(expr_enc))
        #
        mask_expr = BK.input_real(word_mask_arr)
        score0 = mod_scorer.paired_score(expr_enc, expr_enc, mask_expr,
                                         mask_expr)
        score1 = mod_scorer.plain_score(expr_enc.unsqueeze(-2),
                                        expr_enc.unsqueeze(-3),
                                        mask_expr.unsqueeze(-1),
                                        mask_expr.unsqueeze(-2))
        #
        zmiss = float(BK.avg(score0 - score1))
        assert zmiss < 0.0001
    zlog("OK")