def __init__(self, pc, input_dims: List[int], use_affs: List[bool], out_dim: int,
             out_act='linear', out_drop=0., param_init_scale=1.):
    super().__init__(pc, None, None)
    # -----
    self.input_dims = input_dims
    self.use_affs = use_affs
    self.out_dim = out_dim
    # =====
    assert len(input_dims) == len(use_affs)
    self.aff_nodes = []
    for d, use in zip(input_dims, use_affs):
        if use:
            one_aff = self.add_sub_node(
                "aff", Affine(pc, d, out_dim, init_scale=param_init_scale, init_rop=NoDropRop()))
        else:
            assert d == out_dim, f"Dimension mismatch for skipping affine: {d} vs {out_dim}!"
            one_aff = None
        self.aff_nodes.append(one_aff)
    self.out_act_f = ActivationHelper.get_act(out_act)
    self.out_drop = self.add_sub_node("drop", Dropout(pc, (out_dim,), fix_rate=out_drop))
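# The forward pass of this combiner is not shown here; the following is a minimal sketch
# (an assumption, not the library's actual method) of how the pieces built above would
# plausibly compose: apply each configured Affine (or pass the input through), sum the
# aligned [..., out_dim] pieces, then apply the output activation and dropout. `List` and
# `BK.stack` are assumed available as used elsewhere in this code.
def __call__(self, input_exprs: List):
    projected = [(aff(e) if aff is not None else e)
                 for aff, e in zip(self.aff_nodes, input_exprs)]
    combined = BK.stack(projected, 0).sum(0)
    return self.out_drop(self.out_act_f(combined))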
def create_dropout_node(self):
    if self.comp_dropout > 0.:
        self.dropout = self.add_sub_node(
            f"D{self.comp_name}", Dropout(self.pc, (self.output_dim,), fix_rate=self.comp_dropout))
    else:
        self.dropout = lambda x: x
    return self.dropout
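# Usage sketch (hypothetical caller; `component` stands for any object whose __init__ sets
# comp_dropout, comp_name and output_dim): both branches return a callable, so downstream
# code can apply it unconditionally.
#
#   drop_f = component.create_dropout_node()
#   hidden = drop_f(hidden)  # a real Dropout node, or the identity when comp_dropout == 0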
def __init__(self, pc, conf: NodeExtractorConfHead, vocab: HLabelVocab, extract_type: str):
    super().__init__(pc, conf, vocab, extract_type)
    # node selector
    conf.sel_conf._input_dim = conf._input_dim  # make dims fit
    self.sel: NodeSelector = self.add_sub_node("sel", NodeSelector(pc, conf.sel_conf))
    # encoding
    self.dmxnn = conf.dmxnn
    self.posi_embed = self.add_sub_node("pe", RelPosiEmbedding(pc, conf.posi_dim, max=conf.posi_cut))
    if self.dmxnn:
        conf.e_enc._input_dim = conf._input_dim + conf.posi_dim
    else:
        conf.e_enc._input_dim = conf._input_dim
    self.e_encoder = self.add_sub_node("ee", MyEncoder(pc, conf.e_enc))
    e_enc_dim = self.e_encoder.get_output_dims()[0]
    # decoding
    # todo(note): dropout after pooling; todo(+N): cannot go to previous layers if there are no encoders
    self.special_drop = self.add_sub_node("sd", Dropout(pc, (e_enc_dim,)))
    self.use_lab_f = conf.use_lab_f
    self.lab_f_use_lexi = conf.lab_f_use_lexi
    if self.use_lab_f:
        lab_f_input_dims = [e_enc_dim] * 3 if self.dmxnn else [e_enc_dim]
        if self.lab_f_use_lexi:
            lab_f_input_dims.append(conf._lexi_dim)
        self.lab_f = self.add_sub_node(
            "lab", Affine(pc, lab_f_input_dims, conf.lab_conf.n_dim, act=conf.lab_f_act))
    else:
        self.lab_f = lambda x: x[0]  # only use the first one
    # secondary type
    self.use_secondary_type = conf.use_secondary_type
    if self.use_secondary_type:
        # todo(note): re-use vocab; or totally reuse the predictor?
        if conf.sectype_reuse_hl:
            self.hl2: HLabelNode = self.hl
        else:
            new_lab_conf = deepcopy(conf.lab_conf)
            new_lab_conf.zero_nil = False  # todo(note): not zero_nil here!
            self.hl2: HLabelNode = self.add_sub_node("hl", HLabelNode(pc, new_lab_conf, vocab))
        # enc+t1 -> t2
        self.t1tot2 = self.add_sub_node("1to2", Embedding(pc, self.hl_output_size, conf.lab_conf.n_dim))
    else:
        self.hl2 = None
        self.t1tot2 = None
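# Usage sketch (assumption): both branches leave `lab_f` callable on a list of inputs
# matching `lab_f_input_dims` above (three e_enc_dim features under dmxnn, one otherwise,
# plus the optional lexical feature), e.g.
#
#   lab_repr = self.lab_f(lab_inputs)  # -> [*, conf.lab_conf.n_dim] if use_lab_f, else lab_inputs[0]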
def __init__(self, pc, input_dim, hid_dim: int, hid_act='linear', hid_drop=0., hid_piece4init=1,
             out_dim=0, out_fbias=0., out_fact="linear", out_piece4init=1, init_scale=1.):
    super().__init__(pc, None, None)
    # -----
    # hidden layer
    self.hid_aff = self.add_sub_node(
        "hidden", Affine(pc, input_dim, hid_dim, n_piece4init=hid_piece4init,
                         init_scale=init_scale, init_rop=NoDropRop()))
    self.hid_act_f = ActivationHelper.get_act(hid_act)
    self.hid_drop = self.add_sub_node("drop", Dropout(pc, (hid_dim,), fix_rate=hid_drop))
    # -----
    # output layer (optional)
    self.final_output_dim = hid_dim
    # todo(+N): how about split hidden layers for each specific output
    self.out_fbias = out_fbias  # fixed extra bias
    self.out_act_f = ActivationHelper.get_act(out_fact)
    # no output dropouts
    if out_dim > 0:
        assert hid_act != "linear", "Please use a non-linear activation for the MLP!"
        assert hid_piece4init == 1, "Strange hid_piece4init for hidden layer with out_dim>0"
        self.final_aff = self.add_sub_node(
            "final", Affine(pc, hid_dim, out_dim, n_piece4init=out_piece4init,
                            init_scale=init_scale, init_rop=NoDropRop()))
        self.final_output_dim = out_dim
    else:
        self.final_aff = None
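# The forward pass is not shown here; a rough sketch (an assumption, with a hypothetical
# signature) under the reading that the fixed bias `out_fbias` and the `out_fact`
# activation are applied only to the optional output layer, and dropout only to the hidden:
def __call__(self, input_expr):
    hid = self.hid_drop(self.hid_act_f(self.hid_aff(input_expr)))
    if self.final_aff is None:
        return hid
    return self.out_act_f(self.final_aff(hid) + self.out_fbias)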
def __init__(self, pc: BK.ParamCollection, head_count, conf: MAttNormerConf):
    super().__init__(pc, None, None)
    self.conf = conf
    self.head_count = head_count
    # -----
    self._norm_f = getattr(self, "_norm_" + conf.norm_mode)  # shortcut
    self._norm_dims = {
        'flatten': [-1], 'head': [-1], 'cand': [-2], 'head_cand': [-1, -2], 'binary': [-1],
    }[conf.norm_mode]
    self.norm_prune = conf.norm_prune
    # cnode: special attention
    self.cnode = self.add_sub_node("cn", ConcreteNode(pc, conf.cconf))
    # attention dropout:
    # no-fix attention dropout, not elegant here
    rr = NoFixRop()
    self.adrop = self.add_sub_node("adrop", Dropout(pc, (), init_rop=rr, fix_rate=conf.attn_dropout))
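# Note (assumption): conf.norm_mode picks both the normalizer method `_norm_<mode>`
# (defined elsewhere) and the dimensions it normalizes over, e.g. the last dim for
# 'head', the candidate dim (-2) for 'cand', and both for 'head_cand'. A caller would
# then use the shortcut uniformly:
#
#   normed = self._norm_f(scores)  # hypothetical call; the concrete _norm_* methods are not shown here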
def __init__(self, conf: MtlMlmModelConf, vpack: VocabPackage):
    super().__init__(conf)
    # for easier checking
    self.word_vocab = vpack.get_voc("word")
    # components
    self.embedder = self.add_node("emb", EmbedderNode(self.pc, conf.emb_conf, vpack))
    self.inputter = Inputter(self.embedder, vpack)  # not a node
    self.emb_out_dim = self.embedder.get_output_dims()[0]
    self.enc_attn_count = conf.default_attn_count
    if conf.enc_choice == "vrec":
        self.encoder = self.add_component("enc", VRecEncoder(self.pc, self.emb_out_dim, conf.venc_conf))
        self.enc_attn_count = self.encoder.attn_count
    elif conf.enc_choice == "original":
        conf.oenc_conf._input_dim = self.emb_out_dim
        self.encoder = self.add_node("enc", MyEncoder(self.pc, conf.oenc_conf))
    else:
        raise NotImplementedError()
    zlog(f"Finished building model's encoder {self.encoder}, "
         f"all size is {self.encoder.count_allsize_parameters()}")
    self.enc_out_dim = self.encoder.get_output_dims()[0]
    # --
    conf.rprep_conf._rprep_vr_conf.matt_conf.head_count = self.enc_attn_count  # make head-count agree
    self.rpreper = self.add_node("rprep", RPrepNode(self.pc, self.enc_out_dim, conf.rprep_conf))
    # --
    self.lambda_agree = self.add_scheduled_value(ScheduledValue("agr:lambda", conf.lambda_agree))
    self.agree_loss_f = EntropyHelper.get_method(conf.agree_loss_f)
    # --
    self.masklm = self.add_component("mlm", MaskLMNode(self.pc, self.enc_out_dim, conf.mlm_conf, self.inputter))
    self.plainlm = self.add_component("plm", PlainLMNode(self.pc, self.enc_out_dim, conf.plm_conf, self.inputter))
    # todo(note): here we use attn as dim_pair, do not use pair if not using vrec!!
    self.orderpr = self.add_component("orp", OrderPredNode(
        self.pc, self.enc_out_dim, self.enc_attn_count, conf.orp_conf, self.inputter))
    # =====
    # pre-training pre-load point!!
    if conf.load_pretrain_model_name:
        zlog(f"At preload_pretrain point: Loading from {conf.load_pretrain_model_name}")
        self.pc.load(conf.load_pretrain_model_name, strict=False)
    # =====
    self.dpar = self.add_component("dpar", DparG1Decoder(
        self.pc, self.enc_out_dim, self.enc_attn_count, conf.dpar_conf, self.inputter))
    self.upos = self.add_component("upos", SeqLabNode(
        self.pc, "pos", self.enc_out_dim, self.conf.upos_conf, self.inputter))
    if conf.do_ner:
        if conf.ner_use_crf:
            self.ner = self.add_component("ner", SeqCrfNode(
                self.pc, "ner", self.enc_out_dim, self.conf.ner_conf, self.inputter))
        else:
            self.ner = self.add_component("ner", SeqLabNode(
                self.pc, "ner", self.enc_out_dim, self.conf.ner_conf, self.inputter))
    else:
        self.ner = None
    # for pairwise reprs (no trainable params here!)
    self.rel_dist_embed = self.add_node("oremb", PosiEmbedding2(self.pc, n_dim=self.enc_attn_count, max_val=100))
    self._prepr_f_attn_sum = lambda cache, rdist: \
        BK.stack(cache.list_attn, 0).sum(0) if len(cache.list_attn) > 0 else None
    self._prepr_f_attn_avg = lambda cache, rdist: \
        BK.stack(cache.list_attn, 0).mean(0) if len(cache.list_attn) > 0 else None
    self._prepr_f_attn_max = lambda cache, rdist: \
        BK.stack(cache.list_attn, 0).max(0)[0] if len(cache.list_attn) > 0 else None
    self._prepr_f_attn_last = lambda cache, rdist: \
        cache.list_attn[-1] if len(cache.list_attn) > 0 else None
    self._prepr_f_rdist = lambda cache, rdist: self._get_rel_dist_embed(rdist, False)
    self._prepr_f_rdist_abs = lambda cache, rdist: self._get_rel_dist_embed(rdist, True)
    self.prepr_f = getattr(self, "_prepr_f_" + conf.prepr_choice)  # shortcut
    # --
    self.testing_rand_gen = Random.create_sep_generator(conf.testing_rand_gen_seed)  # special gen for testing
    # =====
    if conf.orp_loss_special:
        self.orderpr.add_node_special(self.masklm)
    # =====
    # extra one!!
    self.aug_word2 = self.aug_encoder = self.aug_mixturer = None
    if conf.aug_word2:
        self.aug_word2 = self.add_node("aug2", AugWord2Node(
            self.pc, conf.emb_conf, vpack, "word2", conf.aug_word2_dim, self.emb_out_dim))
        if conf.aug_word2_aug_encoder:
            assert conf.enc_choice == "vrec"
            self.aug_detach_drop = self.add_node(
                "dd", Dropout(self.pc, (self.enc_out_dim,), fix_rate=conf.aug_detach_dropout))
            self.aug_encoder = self.add_component("Aenc", VRecEncoder(self.pc, self.emb_out_dim, conf.venc_conf))
            self.aug_mixturer = self.add_node("Amix", BertFeaturesWeightLayer(self.pc, conf.aug_detach_numlayer))
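    # Note (assumption): all the `_prepr_f_*` variants above share one (cache, rdist)
    # signature, so the `prepr_f` shortcut can be called uniformly when building the
    # pairwise representations, e.g.
    #
    #   pair_repr = self.prepr_f(enc_cache, rel_dist)  # attention-based, distance-based, or None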
def __init__(self, pc: BK.ParamCollection, econf: EmbedConf, vpack: VocabPackage):
    super().__init__(pc, None, None)
    self.conf = econf
    #
    repr_sizes = []
    # word
    self.has_word = (econf.dim_word > 0)
    if self.has_word:
        npvec = vpack.get_emb("word") if econf.init_words_from_pretrain else None
        self.word_embed = self.add_sub_node(
            "ew", Embedding(self.pc, len(vpack.get_voc("word")), econf.dim_word,
                            npvec=npvec, name="word", freeze=econf.word_freeze))
        repr_sizes.append(econf.dim_word)
    # char
    self.has_char = (econf.dim_char > 0)
    if self.has_char:
        # todo(warn): cnns will also use emb's drop?
        self.char_embed = self.add_sub_node(
            "ec", Embedding(self.pc, len(vpack.get_voc("char")), econf.dim_char, name="char"))
        per_cnn_size = econf.char_cnn_hidden // len(econf.char_cnn_windows)
        self.char_cnns = [
            self.add_sub_node("cnnc", CnnLayer(self.pc, econf.dim_char, per_cnn_size, z,
                                               pooling="max", act="tanh"))
            for z in econf.char_cnn_windows
        ]
        repr_sizes.append(econf.char_cnn_hidden)
    # posi: absolute positional embeddings
    self.has_posi = (econf.dim_posi > 0)
    if self.has_posi:
        self.posi_embed = self.add_sub_node(
            "ep", PosiEmbedding(self.pc, econf.dim_posi, econf.posi_clip,
                                econf.posi_fix_sincos, econf.posi_freeze))
        repr_sizes.append(econf.dim_posi)
    # extras: like POS, ...
    self.dim_extras = econf.dim_extras
    self.extra_names = econf.extra_names
    zcheck(len(self.dim_extras) == len(self.extra_names), "Unmatched dims and names!")
    self.extra_embeds = []
    for one_extra_dim, one_name in zip(self.dim_extras, self.extra_names):
        self.extra_embeds.append(self.add_sub_node(
            "ext", Embedding(self.pc, len(vpack.get_voc(one_name)), one_extra_dim,
                             npvec=vpack.get_emb(one_name, None), name="extra:" + one_name)))
        repr_sizes.append(one_extra_dim)
    # auxes
    self.dim_auxes = econf.dim_auxes
    self.fold_auxes = econf.fold_auxes
    self.aux_overall_gammas = []
    self.aux_fold_lambdas = []
    for one_aux_dim, one_aux_fold in zip(self.dim_auxes, self.fold_auxes):
        repr_sizes.append(one_aux_dim)
        # aux gamma and fold trainable lambdas
        self.aux_overall_gammas.append(self.add_param("AG", (), 1.))  # scalar
        self.aux_fold_lambdas.append(self.add_param(
            "AL", (), [1. / one_aux_fold for _ in range(one_aux_fold)]))  # [#fold]
    # =====
    # another projection layer? & set final dim
    if len(repr_sizes) <= 0:
        zwarn("No inputs??")
        # zcheck(len(repr_sizes)>0, "No inputs?")
    self.repr_sizes = repr_sizes
    self.has_proj = (econf.emb_proj_dim > 0)
    if self.has_proj:
        proj_layer = Affine(self.pc, sum(repr_sizes), econf.emb_proj_dim)
        if econf.emb_proj_norm:
            norm_layer = LayerNorm(self.pc, econf.emb_proj_dim)
            self.final_layer = self.add_sub_node("fl", Sequential(self.pc, [proj_layer, norm_layer]))
        else:
            self.final_layer = self.add_sub_node("fl", proj_layer)
        self.output_dim = econf.emb_proj_dim
    else:
        self.final_layer = None
        self.output_dim = sum(repr_sizes)
    # =====
    # special MdDropout: dropout the entire last dim (for word, char, extras, but not posi)
    self.dropmd_word = self.add_sub_node("md", DropoutLastN(pc, lastn=1))
    self.dropmd_char = self.add_sub_node("md", DropoutLastN(pc, lastn=1))
    self.dropmd_extras = [self.add_sub_node("md", DropoutLastN(pc, lastn=1)) for _ in self.extra_names]
    # dropouts for aux
    self.drop_auxes = [self.add_sub_node("aux", Dropout(pc, (one_aux_dim,))) for one_aux_dim in self.dim_auxes]
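# Forward-pass sketch (assumption; the actual method is not shown here): each active
# embedder yields a [*, slen, d_i] piece in the same order as `repr_sizes`; the pieces
# are concatenated on the last dimension (sum(repr_sizes)) and, when has_proj, mapped by
# the projection (plus optional LayerNorm) to output_dim. The DropoutLastN nodes drop
# whole per-token vectors for word/char/extras, while drop_auxes apply ordinary dropout
# to the aux inputs, presumably after weighting by their gammas and fold lambdas.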
def __init__(self, pc, conf: HLabelNodeConf, hl_vocab: HLabelVocab, eff_max_layer=None):
    super().__init__(pc, None, None)
    self.conf = conf
    self.hl_vocab = hl_vocab
    assert self.hl_vocab.nil_as_zero  # for each layer, the idx=0 is the full-NIL
    # basic pool embeddings
    npvec = hl_vocab.pool_init_vec
    if not conf.pool_init_hint:
        npvec = None
    else:
        assert npvec is not None, "pool-init not provided by the Vocab!"
    n_dim, n_pool = conf.n_dim, len(hl_vocab.pools_k)
    self.pool_pred = self.add_sub_node(
        "pp", Embedding(pc, n_pool, n_dim, fix_row0=conf.zero_nil, npvec=npvec,
                        init_rop=(NoDropRop() if conf.nodrop_pred_embeds else None)))
    if conf.tie_embeds:
        self.pool_lookup = self.pool_pred
    else:
        self.pool_lookup = self.add_sub_node(
            "pl", Embedding(pc, n_pool, n_dim, fix_row0=conf.zero_nil, npvec=npvec,
                            init_rop=(NoDropRop() if conf.nodrop_lookup_embeds else None)))
    # layered labels embeddings (to be refreshed)
    self.max_layer = hl_vocab.max_layer
    self.layered_embeds_pred = [None] * self.max_layer
    self.layered_embeds_lookup = [None] * self.max_layer
    self.layered_prei = [None] * self.max_layer  # previous layer i, for score combining
    self.layered_isnil = [None] * self.max_layer  # whether is nil(None)
    self.zero_nil = conf.zero_nil
    # lookup summer
    assert conf.strategy_predict == "sum"
    self.lookup_is_sum, self.lookup_is_ff = [conf.strategy_lookup == z for z in ["sum", "ff"]]
    if self.lookup_is_ff:
        self.lookup_summer = self.add_sub_node(
            "summer", Affine(pc, [n_dim] * self.max_layer, n_dim, act="tanh"))
    elif self.lookup_is_sum:
        self.sum_dropout = self.add_sub_node("sdrop", Dropout(pc, (n_dim,)))
        self.lookup_summer = lambda embeds: self.sum_dropout(BK.stack(embeds, 0).sum(0))
    else:
        raise NotImplementedError(f"UNK strategy_lookup: {conf.strategy_lookup}")
    # bias for prediction
    self.prediction_sizes = [len(hl_vocab.layered_pool_links_padded[i]) for i in range(self.max_layer)]
    if conf.bias_predict:
        self.biases_pred = [self.add_param(name="B", shape=(x,)) for x in self.prediction_sizes]
    else:
        self.biases_pred = [None] * self.max_layer
    # =====
    # training
    self.is_hinge_loss, self.is_prob_loss = [conf.loss_function == z for z in ["hinge", "prob"]]
    self.loss_lambdas = conf.loss_lambdas + [1.] * (self.max_layer - len(conf.loss_lambdas))  # loss scale
    self.margin_lambdas = conf.margin_lambdas + [0.] * (self.max_layer - len(conf.margin_lambdas))  # margin scale
    self.lookup_soft_alphas = conf.lookup_soft_alphas + [1.] * (self.max_layer - len(conf.lookup_soft_alphas))
    self.loss_fullnil_weight = conf.loss_fullnil_weight
    # ======
    # set current effective max_layer
    self.eff_max_layer = self.max_layer
    if eff_max_layer is not None:
        self.set_eff_max_layer(eff_max_layer)
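# Usage sketch (assumption): both branches give `lookup_summer` the same interface, a
# callable over the list of per-layer [*, n_dim] embeddings, e.g.
#
#   summed = self.lookup_summer(per_layer_embeds)  # "ff": affine over the concat; "sum": dropout(sum)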
def __init__(self, pc, dim: int, conf: VRecConf):
    super().__init__(pc, None, None)
    self.conf = conf
    self.dim = dim
    # =====
    # Feat
    if conf.feat_mod == "matt":
        self.feat_node = self.add_sub_node("feat", MAttNode(pc, dim, dim, dim, conf.matt_conf))
        self.attn_count = conf.matt_conf.head_count
    elif conf.feat_mod == "fcomb":
        self.feat_node = self.add_sub_node("feat", FCombNode(pc, dim, dim, dim, conf.fc_conf))
        self.attn_count = conf.fc_conf.fc_count
    else:
        raise NotImplementedError()
    feat_out_dim = self.feat_node.get_output_dims()[0]
    # =====
    # Combiner
    if conf.comb_mode == "affine":
        self.comb_aff = self.add_sub_node(
            "aff", AffineCombiner(pc, [dim, feat_out_dim], [conf.comb_affine_q, conf.comb_affine_v],
                                  dim, out_act=conf.comb_affine_act, out_drop=conf.comb_affine_drop))
        self.comb_f = lambda q, v, c: (self.comb_aff([q, v]), None)
    elif conf.comb_mode == "lstm":
        self.comb_lstm = self.add_sub_node("lstm", LstmNode2(pc, feat_out_dim, dim))
        self.comb_f = self._call_lstm
    else:
        raise NotImplementedError()
    # =====
    # ff
    if conf.ff_dim > 0:
        self.has_ff = True
        self.linear1 = self.add_sub_node(
            "l1", Affine(pc, dim, conf.ff_dim, act=conf.ff_act, init_rop=NoDropRop()))
        self.dropout1 = self.add_sub_node("d1", Dropout(pc, (conf.ff_dim,), fix_rate=conf.ff_drop))
        self.linear2 = self.add_sub_node(
            "l2", Affine(pc, conf.ff_dim, dim, act="linear", init_rop=NoDropRop()))
        self.dropout2 = self.add_sub_node("d2", Dropout(pc, (dim,), fix_rate=conf.ff_drop))
    else:
        self.has_ff = False
    # layer norms
    if conf.use_pre_norm:
        self.att_pre_norm = self.add_sub_node("aln1", LayerNorm(pc, dim))
        self.ff_pre_norm = self.add_sub_node("fln1", LayerNorm(pc, dim))
    else:
        self.att_pre_norm = self.ff_pre_norm = None
    if conf.use_post_norm:
        self.att_post_norm = self.add_sub_node("aln2", LayerNorm(pc, dim))
        self.ff_post_norm = self.add_sub_node("fln2", LayerNorm(pc, dim))
    else:
        self.att_post_norm = self.ff_post_norm = None
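# Sketch of how the feed-forward sub-block and its norms would typically be wired
# (an assumption in the style of a Transformer layer; the real forward pass, with its
# caching and the attention/combiner path, is defined elsewhere). The helper name below
# is hypothetical.
def _apply_ff_sketch(self, x):
    h = self.ff_pre_norm(x) if self.ff_pre_norm is not None else x
    h = self.dropout2(self.linear2(self.dropout1(self.linear1(h))))
    out = x + h  # residual connection (assumed)
    if self.ff_post_norm is not None:
        out = self.ff_post_norm(out)
    return out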