# Imports required by the functions below (MXNet and GluonNLP's ELECTRA model zoo).
import mxnet as mx
from mxnet.initializer import TruncNorm
from gluonnlp.models.electra import ElectraModel, ElectraForPretrain, get_pretrained_electra


def get_pretraining_model(model_name, ctx_l,
                          max_seq_length=128,
                          hidden_dropout_prob=0.1,
                          attention_dropout_prob=0.1,
                          generator_units_scale=None,
                          generator_layers_scale=None):
    """
    An ELECTRA pretraining model is built with a generator and a discriminator,
    in which the generator shares its embeddings with the discriminator but uses
    a different (typically smaller) backbone.
    """
    cfg, tokenizer, _, _ = get_pretrained_electra(model_name, load_backbone=False)
    cfg = ElectraModel.get_cfg().clone_merge(cfg)
    cfg.defrost()
    cfg.MODEL.hidden_dropout_prob = hidden_dropout_prob
    cfg.MODEL.attention_dropout_prob = attention_dropout_prob
    cfg.MODEL.max_length = max_seq_length
    # Keep the original generator size unless explicitly overridden
    if generator_layers_scale:
        cfg.MODEL.generator_layers_scale = generator_layers_scale
    if generator_units_scale:
        cfg.MODEL.generator_units_scale = generator_units_scale
    cfg.freeze()
    model = ElectraForPretrain(cfg,
                               uniform_generator=False,
                               tied_generator=False,
                               tied_embeddings=True,
                               disallow_correct=False,
                               weight_initializer=TruncNorm(stdev=0.02))
    model.initialize(ctx=ctx_l)
    model.hybridize()
    return cfg, tokenizer, model
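
# Usage sketch (an addition, not part of the original script): build the pretraining
# model on CPU. The registered model name 'google_electra_small' and the single-CPU
# context list are assumptions for illustration; swap in the architecture and devices
# you actually train with.
def _example_build_pretraining_model():
    ctx_l = [mx.cpu()]
    cfg, tokenizer, model = get_pretraining_model('google_electra_small', ctx_l,
                                                  max_seq_length=128)
    return cfg, tokenizer, model
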
def get_test_cfg():
    """Return a tiny ELECTRA config for fast unit tests."""
    cfg = ElectraModel.get_cfg()
    cfg.defrost()
    cfg.MODEL.vocab_size = 100
    cfg.MODEL.units = 12 * 8
    cfg.MODEL.hidden_size = 128
    cfg.MODEL.num_heads = 2
    cfg.MODEL.num_layers = 2
    cfg.freeze()
    return cfg
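
# Smoke-test sketch (an addition, not part of the original helpers): combine the tiny
# config above with ElectraForPretrain on CPU, mirroring the construction in
# get_pretraining_model. It only verifies that the model builds, initializes and
# hybridizes; no pretrained weights or forward pass are involved.
def _example_build_tiny_pretrain_model():
    cfg = get_test_cfg()
    model = ElectraForPretrain(cfg,
                               uniform_generator=False,
                               tied_generator=False,
                               tied_embeddings=True,
                               disallow_correct=False,
                               weight_initializer=TruncNorm(stdev=0.02))
    model.initialize(ctx=mx.cpu())
    model.hybridize()
    return cfg, model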