Example #1
    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure all arguments are present in older models
        base_lm_architecture(args)

        if getattr(args, "max_target_positions", None) is not None:
            max_target_positions = args.max_target_positions
        else:
            max_target_positions = getattr(args, "tokens_per_sample",
                                           DEFAULT_MAX_TARGET_POSITIONS)

        def load_pretrained_embedding_from_file(embed_path, dictionary,
                                                embed_dim):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        if args.is_wordlm and hasattr(task, "word_dictionary"):
            dictionary = task.word_dictionary
        elif isinstance(task, SpeechRecognitionEspressoTask):
            dictionary = task.target_dictionary
        else:
            dictionary = task.source_dictionary

        # separate decoder input embeddings
        pretrained_decoder_embed = None
        if args.decoder_embed_path:
            pretrained_decoder_embed = load_pretrained_embedding_from_file(
                args.decoder_embed_path, dictionary, args.decoder_embed_dim)
        # one last double check of parameter combinations
        if args.share_embed and (
            args.decoder_embed_dim != args.decoder_out_embed_dim
        ):
            raise ValueError(
                "--share-embed requires "
                "--decoder-embed-dim to match --decoder-out-embed-dim")

        if args.decoder_freeze_embed:
            # assumes --decoder-embed-path was given above; otherwise there is
            # no pretrained embedding tensor to freeze here
            pretrained_decoder_embed.weight.requires_grad = False

        decoder = SpeechLSTMDecoder(
            dictionary=dictionary,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            attn_type=None,
            encoder_output_units=0,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=args.share_embed,
            adaptive_softmax_cutoff=(
                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
                if args.criterion == "adaptive_loss"
                else None
            ),
            max_target_positions=max_target_positions,
        )
        return cls(decoder, args)
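
The `Embedding` helper called inside `load_pretrained_embedding_from_file` above is defined elsewhere in the module. Below is a minimal sketch of the fairseq-style constructor it typically corresponds to; the exact initialization range is an assumption here, not copied from Espresso.

import torch.nn as nn

def Embedding(num_embeddings, embed_dim, padding_idx):
    # fairseq-style embedding constructor (sketch): small uniform init and a
    # zeroed row for the padding symbol
    m = nn.Embedding(num_embeddings, embed_dim, padding_idx=padding_idx)
    nn.init.uniform_(m.weight, -0.1, 0.1)  # init range is an assumption
    nn.init.constant_(m.weight[padding_idx], 0)
    return m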
Example #2
    @classmethod
    def build_decoder(cls, cfg, tgt_dict, embed_tokens):
        return SpeechLSTMDecoder(
            tgt_dict,
            embed_dim=cfg.decoder.embed_dim,
            hidden_size=cfg.decoder.hidden_size,
            out_embed_dim=cfg.decoder.hidden_size,
            num_layers=cfg.decoder.layers,
            dropout_in=cfg.decoder.dropout_in,
            dropout_out=cfg.decoder.dropout_out,
            residual=cfg.decoder.residual,
            pretrained_embed=embed_tokens,
            share_input_output_embed=True,  # disallow fc_out in decoder
            max_target_positions=DEFAULT_MAX_TARGET_POSITIONS,
        )
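
Example #2 reads its hyperparameters from a structured config (`cfg.decoder.*`) rather than the flat `args` namespace of Example #1. Below is a hedged sketch of how such a config could be assembled and the classmethod invoked; the `Namespace`-based config, the placeholder values, and the `SpeechLSTMModel` name are illustrative assumptions, not Espresso's actual entry point.

from argparse import Namespace

# Illustrative decoder sub-config mirroring the cfg.decoder.* fields used above;
# the concrete values are placeholders, not Espresso defaults.
cfg = Namespace(
    decoder=Namespace(
        embed_dim=512,
        hidden_size=512,
        layers=3,
        dropout_in=0.1,
        dropout_out=0.1,
        residual=False,
    )
)

# tgt_dict would come from the task (task.target_dictionary) and embed_tokens
# from an embedding constructor such as the one sketched after Example #1:
#   embed_tokens = Embedding(len(tgt_dict), cfg.decoder.embed_dim, tgt_dict.pad())
#   decoder = SpeechLSTMModel.build_decoder(cfg, tgt_dict, embed_tokens)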