def add_args(parser):
    NATransformerModel.add_args(parser)
    parser.add_argument(
        "--load-weight-level",
        default="all",
        choices=["all", "encoder_decoder", "encoder"],
        help="which components to load weights for from the checkpoint. "
             "all: load all. encoder_decoder: load encoder and decoder only. "
             "encoder: load encoder only.",
    )
    parser.add_argument("--latent-dim", type=int,
                        help="dimension of the latent vector.")
    parser.add_argument("--posterior-layers", type=int,
                        help="number of layers in the posterior transformer.")
    parser.add_argument(
        "--kl-div-loss-factor",
        type=float,
        help="weight on the KL-divergence term in the ELBO (or the initial "
             "budget); ignored when using ControlVAE.",
    )
    parser.add_argument(
        "--control-vae",
        action="store_true",
        help="use the PI algorithm introduced in ControlVAE to calculate the "
             "weight on the KL divergence of the latent.",
    )
    parser.add_argument(
        "--control-vae-args",
        type=str,
        metavar="JSON",
        help="""args for ControlVAE, a valid setup is:
            '{"v_kl": 3.0, "Kp": 0.01, "Ki": 0.0001, "beta_min": 0.0, "beta_max": 1.0}'
        """,
    )
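# A minimal sketch of the PI controller described in the ControlVAE paper,
# assuming the keys in --control-vae-args map directly onto it. The class
# name `ControlVAEBeta` and its state layout are illustrative assumptions,
# not part of this codebase.
import json
import math


class ControlVAEBeta:
    """PI controller driving the measured KL toward the target v_kl."""

    def __init__(self, v_kl, Kp, Ki, beta_min, beta_max):
        self.v_kl, self.Kp, self.Ki = v_kl, Kp, Ki
        self.beta_min, self.beta_max = beta_min, beta_max
        self.integral = 0.0

    def step(self, kl_value):
        # error is positive while KL is still below the target budget
        error = self.v_kl - kl_value
        p_term = self.Kp / (1.0 + math.exp(error))
        beta = p_term + self.integral + self.beta_min
        if self.beta_min <= beta <= self.beta_max:
            # only accumulate the integral while the output is not saturated
            self.integral -= self.Ki * error
            return beta
        return min(max(beta, self.beta_min), self.beta_max)


# e.g. controller = ControlVAEBeta(**json.loads(args.control_vae_args))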
def add_args(parser):
    NATransformerModel.add_args(parser)
    parser.add_argument(
        "--load-encoder-only",
        action="store_true",
        # type=bool, nargs='?', const=True, default=False,
        help="whether to load only the encoder states from the checkpoint.",
    )
def add_args(parser):
    NATransformerModel.add_args(parser)
    parser.add_argument(
        "--decoder-positional-attention",
        action="store_true",
        help="add positional attention when decoding",
    )
    parser.add_argument(
        "--decoder-positional-attention-head-num",
        type=int,
        help="number of heads for positional attention in decoder layers",
    )
    parser.add_argument(
        "--load-encoder-only",
        action="store_true",
        # type=bool, nargs='?', const=True, default=False,
        help="whether to load only the encoder states from the checkpoint.",
    )
def add_args(parser):
    NATransformerModel.add_args(parser)
    parser.add_argument(
        "--train-step",
        type=int,
        help="number of refinement iterations during training",
    )
    parser.add_argument(
        "--dae-ratio",
        type=float,
        help="probability of switching to the denoising auto-encoder loss",
    )
    parser.add_argument(
        "--stochastic-approx",
        action="store_true",
        help="sample from the decoder output as the input for the next iteration",
    )
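# A hedged sketch of what --stochastic-approx implies for one refinement
# iteration: sample the next-iteration inputs from the decoder distribution
# instead of taking the argmax. The function name and tensor shapes are
# illustrative, not this repo's API.
import torch


def next_iteration_tokens(logits, stochastic_approx):
    # logits: (batch, length, vocab) scores from the current decoding pass
    if stochastic_approx:
        probs = torch.softmax(logits, dim=-1)
        # draw one token per position from the predicted distribution
        return torch.multinomial(
            probs.view(-1, probs.size(-1)), num_samples=1
        ).view(logits.size(0), logits.size(1))
    return logits.argmax(dim=-1)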
def add_args(parser):
    NATransformerModel.add_args(parser)
    parser.add_argument(
        "--crf-lowrank-approx",
        type=int,
        help="dimension of the low-rank approximation of the transition matrix",
    )
    parser.add_argument(
        "--crf-beam-approx",
        type=int,
        help="beam size for approximating the normalizing factor",
    )
    parser.add_argument(
        "--word-ins-loss-factor",
        type=float,
        help="weight on the NAT loss used for co-training with the CRF loss",
    )
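# A minimal sketch of the low-rank transition trick --crf-lowrank-approx
# controls: instead of a full V x V transition matrix, transitions are scored
# through two rank-d embedding tables. The module and tensor names here are
# illustrative assumptions, not this repo's implementation.
import torch.nn as nn


class LowRankTransition(nn.Module):
    def __init__(self, vocab_size, rank):
        super().__init__()
        self.E1 = nn.Embedding(vocab_size, rank)  # outgoing-token factor
        self.E2 = nn.Embedding(vocab_size, rank)  # incoming-token factor

    def forward(self, prev_tokens, next_tokens):
        # transition score T[i, j] ~= E1[i] . E2[j]
        return (self.E1(prev_tokens) * self.E2(next_tokens)).sum(-1)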
def add_args(parser):
    """Add task-specific arguments to the parser."""
    NATransformerModel.add_args(parser)
    parser.add_argument(
        "--vgg-config",
        type=str,
        help="""config in JSON format, e.g.
            '[{"in_channels": 64, "subsample": 2}, {"in_channels": 64, "subsample": 2}]'.
            If a dict is empty, default values are used.""",
    )
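# A small sketch of how the --vgg-config JSON could be consumed: parse the
# list and fill each (possibly empty) dict with defaults. The helper name and
# the default values shown are assumptions for illustration.
import json

VGG_BLOCK_DEFAULTS = {"in_channels": 64, "subsample": 2}


def parse_vgg_config(vgg_config):
    blocks = json.loads(vgg_config) if vgg_config else [{}]
    # an empty dict falls back entirely to the defaults
    return [{**VGG_BLOCK_DEFAULTS, **block} for block in blocks]


# parse_vgg_config('[{"in_channels": 32}, {}]')
# -> [{'in_channels': 32, 'subsample': 2}, {'in_channels': 64, 'subsample': 2}]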
def add_args(parser): """Add model-specific arguments to the parser.""" NATransformerModel.add_args(parser) parser.add_argument('--share-encoder-embeddings', action='store_true', help='share encoder embeddings across languages') parser.add_argument('--share-decoder-embeddings', action='store_true', help='share decoder embeddings across languages') parser.add_argument('--share-encoders', action='store_true', help='share encoders across languages') parser.add_argument('--student-arch', default="nonautoregressive_transformer", help='determine the type of student network to mutual learn from.') parser.add_argument('--teacher-arch', default="transformer", help='determine the type of teacher network to mutual learn from.') parser.add_argument('--load-to-teacher', action='store_true', help='load checkpoint to teacher network.') parser.add_argument('--freeze-teacher', action='store_true', help='whether to freeze teacher.') parser.add_argument("--student-kd-factor", default=.5, type=float, help="weights on the knowledge distillation loss for training student" ) parser.add_argument("--teacher-kd-factor", default=.5, type=float, help="weights on the knowledge distillation loss for training teacher" ) parser.add_argument("--control-kd-factor", action="store_true", help="use the PI algorithm introduced in ControlVAE to calculate the weight on KL-divergence on latent.") parser.add_argument('--control-kd-args', type=str, metavar='JSON', help="""args for ControlVAE, a valid setup is: '{"v_kl": 3.0, "Kp": 0.01, "Ki": 0.0001, "beta_min": 0.0, "beta_max": 1.0 }' """) # inference flags parser.add_argument('--reduce-to-student', action='store_true', help='when inference, only load student network.') parser.add_argument('--reduce-to-teacher', action='store_true', help='when inference, only load teacher network.')
def build_model(cls, args, task, src_dict, tgt_dict,
                encoder_embed_tokens, decoder_embed_tokens):
    posterior_args = copy.deepcopy(args)
    posterior_args.encoder_layers = args.posterior_layers
    posterior_args.decoder_layers = args.posterior_layers
    # posterior_args.share_decoder_input_output_embed = False
    # posterior_args.share_all_embeddings = False
    posterior_args.src_embedding_copy = False
    # TODO: swap encoder & decoder values; assumed same args for now.
    transfo = cls(
        args=posterior_args,
        # in posterior, encoder eats target
        encoder=NATransformerModel.build_encoder(
            posterior_args, tgt_dict, decoder_embed_tokens),
        # in posterior, decoder eats source
        decoder=NATransformerModel.build_decoder(
            posterior_args, src_dict, encoder_embed_tokens),
    )
    transfo.predict_head = LatentPredictor(
        hidden_dim=args.decoder_embed_dim, latent_dim=args.latent_dim)
    return transfo
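# `LatentPredictor` is referenced above but not shown. A plausible minimal
# sketch, assuming it maps decoder states to a diagonal Gaussian over the
# latent space with the reparameterization trick; this is a guess at the
# interface, not the actual module.
import torch
import torch.nn as nn


class LatentPredictor(nn.Module):
    def __init__(self, hidden_dim, latent_dim):
        super().__init__()
        self.mean = nn.Linear(hidden_dim, latent_dim)
        self.logvar = nn.Linear(hidden_dim, latent_dim)

    def forward(self, hidden_states):
        # hidden_states: (batch, length, hidden_dim)
        mean = self.mean(hidden_states)
        logvar = self.logvar(hidden_states)
        # reparameterization trick: z = mu + sigma * eps
        z = mean + torch.exp(0.5 * logvar) * torch.randn_like(mean)
        return z, mean, logvar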
def add_args(parser):
    NATransformerModel.add_args(parser)
    parser.add_argument(
        "--load-weight-level",
        default="all",
        choices=["all", "encoder_decoder", "encoder"],
        help="which components to load weights for from the checkpoint. "
             "all: load all. encoder_decoder: load encoder and decoder only. "
             "encoder: load encoder only.",
    )
    parser.add_argument("--latent-dim", type=int,
                        help="dimension of the latent vector.")
    parser.add_argument("--posterior-layers", type=int,
                        help="number of layers in the posterior transformer.")
    parser.add_argument(
        "--kl-div-loss-factor",
        type=float,
        help="weight on the KL-divergence term in the ELBO (or the initial "
             "budget); ignored when using ControlVAE.",
    )
    parser.add_argument(
        "--posterior-attention-heads",
        type=int,
        help="number of attention heads in the posterior transformer.",
    )
    parser.add_argument(
        "--sg-latent-prediction",
        action="store_true",
        help="stop the gradients back-propagated from the latent embedding "
             "aligner predictor.",
    )
    parser.add_argument(
        "--latent-use-embed",
        action="store_true",
        help="use encoder embeddings instead of encoder_out as input to the "
             "LEA module.",
    )
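# A tiny sketch of what --sg-latent-prediction implies: detach the latent
# before the aligner/predictor so its loss does not update the upstream
# encoder. `predictor` is a hypothetical module standing in for the LEA
# predictor.
def predict_latent(latent, predictor, sg_latent_prediction):
    if sg_latent_prediction:
        latent = latent.detach()  # block gradients into the latent encoder
    return predictor(latent)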
def add_args(parser):
    NATransformerModel.add_args(parser)