Example #1
 def add_args(parser):
     NATransformerModel.add_args(parser)
     parser.add_argument(
         "--load-weight-level",
         default='all',
         choices=['all', 'encoder_decoder', 'encoder'],
         help="which components need to load weights from the checkpoint: all (load everything), encoder_decoder (encoder and decoder only), or encoder (encoder only)."
     )
     parser.add_argument("--latent-dim",
                         type=int,
                         help="dimension for latent vector.")
     parser.add_argument("--posterior-layers",
                         type=int,
                         help="num layers for posterior transformer.")
     parser.add_argument(
         "--kl-div-loss-factor",
         type=float,
         help="weight on the KL-divergence term in the ELBO (or the initial budget); ignored when using ControlVAE."
     )
     parser.add_argument(
         "--control-vae",
         action="store_true",
         help="use the PI algorithm introduced in ControlVAE to calculate the weight on the KL divergence of the latent."
     )
     parser.add_argument(
         '--control-vae-args',
         type=str,
         metavar='JSON',
         help="""args for ControlVAE, a valid setup is: '{"v_kl": 3.0, "Kp": 0.01, "Ki": 0.0001, "beta_min": 0.0, "beta_max": 1.0 }' """
     )
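
The --control-vae-args JSON above feeds a PI controller. As a rough illustration, here is a minimal sketch of the ControlVAE-style update (the function name and the mutable state dict are assumptions for illustration, not this model's actual code):

import math

def pi_controlled_beta(kl_value, state, v_kl=3.0, Kp=0.01, Ki=0.0001,
                       beta_min=0.0, beta_max=1.0):
    # state is a dict such as {"i_term": 0.0} carried across updates
    # e(t): desired KL budget minus the observed KL divergence
    error = v_kl - kl_value
    # proportional term: a sigmoid of the error, scaled by Kp
    p_term = Kp / (1.0 + math.exp(error))
    beta = p_term + state["i_term"] + beta_min
    if beta_min <= beta <= beta_max:
        # integral term accumulates only while beta is unsaturated (anti-windup)
        state["i_term"] -= Ki * error
    return min(max(beta, beta_min), beta_max)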
Example #2
 def add_args(parser):
     NATransformerModel.add_args(parser)
     parser.add_argument(
         "--load-encoder-only",
         action="store_true",
         help="whether to load only the encoder states from the checkpoint.")
Example #3
 def add_args(parser):
     NATransformerModel.add_args(parser)
     parser.add_argument("--decoder-positional-attention", action="store_true",
                         help="add postional attention when decoding")
     parser.add_argument("--decoder-positional-attention-head-num", type=int, 
         help="num of heads of positional attention in decoder layers")
     parser.add_argument("--load-encoder-only", action="store_true", #type=bool, nargs='?', const=True, default=False,
     help="whether only load encoder states from checkpoint.")
 def add_args(parser):
     NATransformerModel.add_args(parser)
     parser.add_argument("--train-step", type=int,
                         help="number of refinement iterations during training")
     parser.add_argument("--dae-ratio", type=float,
                         help="the probability of switching to the denoising auto-encoder loss")
     parser.add_argument("--stochastic-approx", action="store_true",
                         help="sampling from the decoder as the inputs for next iteration")
Example #5
 def add_args(parser):
     NATransformerModel.add_args(parser)
     parser.add_argument("--crf-lowrank-approx", type=int,
                         help="the dimension of low-rank approximation of transition")
     parser.add_argument("--crf-beam-approx", type=int,
                         help="the beam size for apporixmating the normalizing factor")
     parser.add_argument("--word-ins-loss-factor", type=float,
                         help="weights on NAT loss used to co-training with CRF loss.")
Example #6
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        NATransformerModel.add_args(parser)

        parser.add_argument(
            "--vgg-config",
            type=str,
            help="""config in JSON format, e.g. '[{"in_channels": 64, "subsample": 2}, {"in_channels": 64, "subsample": 2}]'. If a dict is empty, default values are used.""",
        )
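
A hedged sketch of how such a --vgg-config value could be consumed, parsing the JSON list and filling defaults for any empty dict (the default values shown are assumptions for illustration):

import json

DEFAULT_BLOCK = {"in_channels": 64, "subsample": 2}

def parse_vgg_config(config_str):
    blocks = json.loads(config_str)
    # an empty dict {} inherits every default; partial dicts override per key
    return [{**DEFAULT_BLOCK, **block} for block in blocks]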
Example #7
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        NATransformerModel.add_args(parser)
        parser.add_argument('--share-encoder-embeddings', action='store_true',
                            help='share encoder embeddings across languages')
        parser.add_argument('--share-decoder-embeddings', action='store_true',
                            help='share decoder embeddings across languages')
        parser.add_argument('--share-encoders', action='store_true',
                            help='share encoders across languages')
        parser.add_argument('--student-arch', default="nonautoregressive_transformer",
                            help='the type of student network to mutually learn from.')
        parser.add_argument('--teacher-arch', default="transformer",
                            help='the type of teacher network to mutually learn from.')

        parser.add_argument('--load-to-teacher', action='store_true',
                            help='load checkpoint to teacher network.')
        parser.add_argument('--freeze-teacher', action='store_true',
                            help='whether to freeze the teacher.')

        parser.add_argument("--student-kd-factor",
                            default=.5,
                            type=float,
                            help="weights on the knowledge distillation loss for training student"
                            )
        parser.add_argument("--teacher-kd-factor",
                            default=.5,
                            type=float,
                            help="weights on the knowledge distillation loss for training teacher"
                            )
        parser.add_argument("--control-kd-factor", action="store_true",
                            help="use the PI algorithm introduced in ControlVAE to calculate the weight on KL-divergence on latent.")
        parser.add_argument('--control-kd-args', type=str, metavar='JSON',
                            help="""args for ControlVAE, a valid setup is: '{"v_kl": 3.0, "Kp": 0.01, "Ki": 0.0001, "beta_min": 0.0, "beta_max": 1.0 }' """)


        # inference flags
        parser.add_argument('--reduce-to-student', action='store_true',
                            help='at inference time, load only the student network.')
        parser.add_argument('--reduce-to-teacher', action='store_true',
                            help='at inference time, load only the teacher network.')
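
To clarify how the two KD factors might enter the mutual-learning objective, a minimal sketch (the function and weighting scheme are assumptions, not the repo's actual loss code): each network mixes its own cross-entropy with a KL term toward the other network's detached output distribution.

import torch.nn.functional as F

def mutual_learning_losses(student_logits, teacher_logits, targets,
                           student_kd_factor=0.5, teacher_kd_factor=0.5):
    # logits: (num_tokens, vocab); targets: (num_tokens,)
    ce_student = F.cross_entropy(student_logits, targets)
    ce_teacher = F.cross_entropy(teacher_logits, targets)
    # each network distills from the other's detached output distribution
    kd_student = F.kl_div(F.log_softmax(student_logits, dim=-1),
                          F.softmax(teacher_logits.detach(), dim=-1),
                          reduction="batchmean")
    kd_teacher = F.kl_div(F.log_softmax(teacher_logits, dim=-1),
                          F.softmax(student_logits.detach(), dim=-1),
                          reduction="batchmean")
    student_loss = (1 - student_kd_factor) * ce_student + student_kd_factor * kd_student
    teacher_loss = (1 - teacher_kd_factor) * ce_teacher + teacher_kd_factor * kd_teacher
    return student_loss, teacher_loss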
Example #8
    def build_model(cls, args, task, src_dict, tgt_dict, encoder_embed_tokens,
                    decoder_embed_tokens):
        posterior_args = copy.deepcopy(args)
        posterior_args.encoder_layers = args.posterior_layers
        posterior_args.decoder_layers = args.posterior_layers
        # posterior_args.share_decoder_input_output_embed = False
        # posterior_args.share_all_embeddings = False
        posterior_args.src_embedding_copy = False

        # TODO: swap encoder & decoder values
        # assumed same args for now.

        transfo = cls(
            args=posterior_args,
            # in posterior, encoder eats target
            encoder=NATransformerModel.build_encoder(posterior_args, tgt_dict,
                                                     decoder_embed_tokens),
            # in posterior, decoder eats source
            decoder=NATransformerModel.build_decoder(posterior_args, src_dict,
                                                     encoder_embed_tokens))
        transfo.predict_head = LatentPredictor(
            hidden_dim=args.decoder_embed_dim, latent_dim=args.latent_dim)
        return transfo
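
LatentPredictor is referenced above but not shown. A minimal sketch under the usual VAE assumption that it projects decoder hidden states to the mean and log-variance of a Gaussian latent of size latent_dim:

import torch.nn as nn

class LatentPredictor(nn.Module):
    def __init__(self, hidden_dim, latent_dim):
        super().__init__()
        self.mean = nn.Linear(hidden_dim, latent_dim)
        self.logvar = nn.Linear(hidden_dim, latent_dim)

    def forward(self, hidden):
        # hidden: (..., hidden_dim) -> Gaussian parameters of size latent_dim
        return self.mean(hidden), self.logvar(hidden)

Note that build_model above also relies on import copy for the copy.deepcopy call.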
Example #9
 def add_args(parser):
     NATransformerModel.add_args(parser)
     parser.add_argument(
         "--load-weight-level",
         default='all',
         choices=['all', 'encoder_decoder', 'encoder'],
         "which components need to load weights from the checkpoint: all (load everything), encoder_decoder (encoder and decoder only), or encoder (encoder only)."
     )
     parser.add_argument("--latent-dim",
                         type=int,
                         help="dimension for latent vector.")
     parser.add_argument("--posterior-layers",
                         type=int,
                         help="num layers for posterior transformer.")
     parser.add_argument(
         "--kl-div-loss-factor",
         type=float,
         help="weight on the KL-divergence term in the ELBO (or the initial budget); ignored when using ControlVAE."
     )
     parser.add_argument(
         "--posterior-attention-heads",
         type=int,
         help="number of attention heads for the posterior transformer.")
     parser.add_argument(
         "--sg-latent-prediction",
         action="store_true",
         help="stop the gradients back-propagated from the latent embedding aligner predictor"
     )
     parser.add_argument(
         "--latent-use-embed",
         action="store_true",
         help="use encoder embeddings instead of encoder out as input to the LEA module."
     )
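
The --sg-latent-prediction flag amounts to a detach on the predictor's input; a one-function sketch (the surrounding names are assumptions):

def maybe_stop_gradient(encoder_out, sg_latent_prediction):
    # detach so the latent predictor's loss cannot update the encoder
    return encoder_out.detach() if sg_latent_prediction else encoder_out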
Example #10
 def add_args(parser):
     NATransformerModel.add_args(parser)