def get_decoder(lang, shared_decoder_embed_tokens=None):
    """
    Fetch the decoder for the input `lang`, which denotes the target
    language of the model.
    """
    if lang not in lang_decoders:
        tgt_dict = task.dicts[lang]
        if shared_decoder_embed_tokens is None:
            decoder_embed_tokens = common_layers.Embedding(
                num_embeddings=len(tgt_dict),
                embedding_dim=args.decoder_embed_dim,
                padding_idx=tgt_dict.pad(),
                freeze_embed=args.decoder_freeze_embed,
            )
            utils.load_embedding(
                embedding=decoder_embed_tokens,
                dictionary=tgt_dict,
                pretrained_embed=args.decoder_pretrained_embed,
            )
        else:
            decoder_embed_tokens = shared_decoder_embed_tokens
        lang_decoders[lang] = cls.single_model_cls.build_decoder(
            args, task.dicts[lang], tgt_dict, embed_tokens=decoder_embed_tokens
        )
    return lang_decoders[lang]
def get_encoder(lang):
    """
    Fetch the encoder for the input `lang`, which denotes the source
    language of the model.
    """
    lang = strip_suffix(lang)
    if lang not in lang_encoders:
        src_dict = task.dicts[lang]
        encoder_embed_tokens = common_layers.Embedding(
            num_embeddings=len(src_dict),
            embedding_dim=args.encoder_embed_dim,
            padding_idx=src_dict.pad(),
            freeze_embed=args.encoder_freeze_embed,
            normalize_embed=getattr(args, "encoder_normalize_embed", False),
        )
        utils.load_embedding(
            embedding=encoder_embed_tokens,
            dictionary=src_dict,
            pretrained_embed=args.encoder_pretrained_embed,
        )
        lang_encoders[lang] = cls.single_model_cls.build_encoder(
            args, src_dict, embed_tokens=encoder_embed_tokens
        )
    return lang_encoders[lang]
def get_decoder(lang_pair, shared_decoder_embed_tokens=None):
    """
    Fetch the decoder for the input `lang_pair`; the target language of the
    pair determines which decoder is built.
    """
    if args.share_decoders:
        args.remove_vr_if_same_lang_at_enc_and_dec = False
    source_lang, target_lang = (strip_suffix(lang) for lang in lang_pair.split("-"))
    if target_lang not in lang_decoders:
        # Hack to prevent vocab reduction for the denoising autoencoder:
        # remove the vocab reduction params if we have lang-lang_any_suffix.
        args_maybe_modified = copy.deepcopy(args)
        if (
            source_lang == target_lang
            and not args.remove_vr_if_same_lang_at_enc_and_dec
        ):
            args_maybe_modified.vocab_reduction_params = None
        tgt_dict = task.dicts[target_lang]
        if shared_decoder_embed_tokens is None:
            decoder_embed_tokens = common_layers.Embedding(
                num_embeddings=len(tgt_dict),
                embedding_dim=args.decoder_embed_dim,
                padding_idx=tgt_dict.pad(),
                freeze_embed=args.decoder_freeze_embed,
            )
            utils.load_embedding(
                embedding=decoder_embed_tokens,
                dictionary=tgt_dict,
                pretrained_embed=args.decoder_pretrained_embed,
            )
        else:
            decoder_embed_tokens = shared_decoder_embed_tokens
        lang_decoders[target_lang] = cls.single_model_cls.build_decoder(
            args_maybe_modified,
            task.dicts[source_lang],
            tgt_dict,
            embed_tokens=decoder_embed_tokens,
        )
    return lang_decoders[target_lang]
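# The helpers above lazily build one encoder/decoder per language and cache it,
# so every lang pair that shares a language reuses the same module. A minimal,
# self-contained sketch of that caching pattern (the builder passed in is a
# hypothetical stand-in, not the real fairseq build_encoder/build_decoder):
def _make_component_cache(build_fn):
    cache = {}

    def get(lang):
        # Build the component only on the first request; later requests for
        # the same language return the already-built (shared) instance.
        if lang not in cache:
            cache[lang] = build_fn(lang)
        return cache[lang]

    return get

get_shared_encoder = _make_component_cache(lambda lang: object())
assert get_shared_encoder("de") is get_shared_encoder("de")  # reused across pairs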
def __init__(
    self,
    src_dict,
    dst_dict,
    vocab_reduction_params=None,
    encoder_hidden_dim=512,
    embed_dim=512,
    freeze_embed=False,
    hidden_dim=512,
    out_embed_dim=512,
    cell_type="lstm",
    num_layers=1,
    dropout_in=0.1,
    dropout_out=0.1,
    attention_type="dot",
    residual_level=None,
    averaging_encoder=False,
    project_output=True,
    tie_embeddings=False,
    pretrained_embed=None,
    projection_pretrained_embed=None,
):
    super().__init__(
        src_dict,
        dst_dict,
        vocab_reduction_params,
        out_embed_dim,
        project_output=project_output,
        pretrained_embed=projection_pretrained_embed,
    )
    encoder_hidden_dim = max(1, encoder_hidden_dim)
    self.encoder_hidden_dim = encoder_hidden_dim
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.out_embed_dim = out_embed_dim
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.attention_type = attention_type
    self.residual_level = residual_level
    self.tie_embeddings = tie_embeddings

    num_embeddings = len(dst_dict)
    padding_idx = dst_dict.pad()
    self.embed_tokens = Embedding(
        num_embeddings=num_embeddings,
        embedding_dim=embed_dim,
        padding_idx=padding_idx,
        freeze_embed=freeze_embed,
    )
    if self.tie_embeddings:
        assert self.embed_dim == self.out_embed_dim, (
            "Input embeddings and output projections must have the same "
            "dimension for the weights to be tied"
        )
        self.embed_tokens.weight = self.output_projection_w
    else:
        pytorch_translate_utils.load_embedding(
            embedding=self.embed_tokens,
            dictionary=dst_dict,
            pretrained_embed=pretrained_embed,
        )

    self.averaging_encoder = averaging_encoder
    if cell_type == "lstm":
        cell_class = rnn_cell.LSTMCell
    elif cell_type == "milstm":
        cell_class = rnn_cell.MILSTMCell
    elif cell_type == "layer_norm_lstm":
        cell_class = rnn_cell.LayerNormLSTMCell
    else:
        raise ValueError(f"Unsupported cell_type: {cell_type}")

    # Project the final encoder states to the decoder's hidden size when the
    # two dimensions differ.
    if hidden_dim != encoder_hidden_dim:
        hidden_init_fc_list = []
        cell_init_fc_list = []
        for _ in range(num_layers):
            hidden_init_fc_list.append(Linear(encoder_hidden_dim, hidden_dim))
            cell_init_fc_list.append(Linear(encoder_hidden_dim, hidden_dim))
        self.hidden_init_fc_list = nn.ModuleList(hidden_init_fc_list)
        self.cell_init_fc_list = nn.ModuleList(cell_init_fc_list)

    self.attention = attention.build_attention(
        attention_type=attention_type,
        decoder_hidden_state_dim=hidden_dim,
        context_dim=encoder_hidden_dim,
    )
    if self.attention.context_dim:
        self.initial_attn_context = nn.Parameter(
            torch.Tensor(self.attention.context_dim).zero_()
        )
    self.combined_output_and_context_dim = self.attention.context_dim + hidden_dim

    layers = []
    for layer in range(num_layers):
        # The first layer consumes the token embedding concatenated with the
        # attention context; later layers consume the previous hidden state.
        if layer == 0:
            cell_input_dim = embed_dim + self.attention.context_dim
        else:
            cell_input_dim = hidden_dim
        layers.append(cell_class(input_dim=cell_input_dim, hidden_dim=hidden_dim))
    self.layers = nn.ModuleList(layers)

    if self.combined_output_and_context_dim != out_embed_dim:
        self.additional_fc = Linear(
            self.combined_output_and_context_dim, out_embed_dim
        )
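# When tie_embeddings is set, the decoder's input embedding matrix is the same
# tensor as its output projection, which is why embed_dim must equal
# out_embed_dim. A minimal stand-alone PyTorch sketch of that tying (the class
# name is illustrative, not the pytorch_translate module):
import torch
import torch.nn as nn

class TiedProjectionStub(nn.Module):
    def __init__(self, vocab_size=100, embed_dim=16):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, embed_dim)
        self.output_projection = nn.Linear(embed_dim, vocab_size, bias=False)
        # Tie: both modules now share one (vocab_size x embed_dim) weight.
        self.output_projection.weight = self.embed_tokens.weight

stub = TiedProjectionStub()
assert stub.output_projection.weight.data_ptr() == stub.embed_tokens.weight.data_ptr()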
def __init__(
    self,
    dictionary,
    word_dropout_params=None,
    embed_dim=512,
    freeze_embed=False,
    hidden_dim=512,
    num_layers=1,
    cell_type="lstm",
    dropout_in=0.1,
    dropout_out=0.1,
    residual_level=None,
    bidirectional=False,
    pretrained_embed=None,
    padding_value=0,
    left_pad=True,
):
    super().__init__(dictionary)
    self.dictionary = dictionary
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.residual_level = residual_level
    self.hidden_dim = hidden_dim
    self.output_units = hidden_dim  # fairseq LSTM compatibility
    self.bidirectional = bidirectional

    num_embeddings = len(dictionary)
    self.padding_idx = dictionary.pad()
    self.padding_value = padding_value
    self.left_pad = left_pad
    self.embed_tokens = Embedding(
        num_embeddings=num_embeddings,
        embedding_dim=embed_dim,
        padding_idx=self.padding_idx,
        freeze_embed=freeze_embed,
    )
    pytorch_translate_utils.load_embedding(
        embedding=self.embed_tokens,
        dictionary=dictionary,
        pretrained_embed=pretrained_embed,
    )
    self.word_dim = embed_dim
    self.cell_type = cell_type

    self.layers = nn.ModuleList([])
    for layer in range(num_layers):
        self.layers.append(
            RNNLayer(
                self.word_dim if layer == 0 else hidden_dim,
                hidden_dim,
                self.cell_type,
                bidirectional and layer == 0,
            )
        )
    self.num_layers = len(self.layers)

    self.word_dropout_module = None
    if (
        word_dropout_params
        and word_dropout_params["word_dropout_freq_threshold"] is not None
        and word_dropout_params["word_dropout_freq_threshold"] > 0
    ):
        self.word_dropout_module = word_dropout.WordDropout(
            dictionary, word_dropout_params
        )
def __init__(
    self,
    dictionary,
    embed_dim=512,
    freeze_embed=False,
    cell_type="lstm",
    hidden_dim=512,
    num_layers=1,
    dropout_in=0.1,
    dropout_out=0.1,
    residual_level=None,
    bidirectional=False,
    pretrained_embed=None,
    word_dropout_params=None,
    padding_value=0,
    left_pad=True,
):
    assert cell_type == "lstm", 'sequence-lstm requires cell_type="lstm"'
    super().__init__(dictionary)
    self.dictionary = dictionary
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.residual_level = residual_level
    self.hidden_dim = hidden_dim
    self.bidirectional = bidirectional

    num_embeddings = len(dictionary)
    self.padding_idx = dictionary.pad()
    self.padding_value = padding_value
    self.left_pad = left_pad
    self.embed_tokens = Embedding(
        num_embeddings=num_embeddings,
        embedding_dim=embed_dim,
        padding_idx=self.padding_idx,
        freeze_embed=freeze_embed,
    )
    pytorch_translate_utils.load_embedding(
        embedding=self.embed_tokens,
        dictionary=dictionary,
        pretrained_embed=pretrained_embed,
    )
    self.word_dim = embed_dim

    self.layers = nn.ModuleList([])
    for layer in range(num_layers):
        is_layer_bidirectional = self.bidirectional and layer == 0
        self.layers.append(
            LSTMSequenceEncoder.LSTM(
                self.word_dim if layer == 0 else hidden_dim,
                hidden_dim // 2 if is_layer_bidirectional else hidden_dim,
                num_layers=1,
                dropout=self.dropout_out,
                bidirectional=is_layer_bidirectional,
            )
        )
    self.num_layers = len(self.layers)

    self.word_dropout_module = None
    if (
        word_dropout_params
        and word_dropout_params["word_dropout_freq_threshold"] is not None
        and word_dropout_params["word_dropout_freq_threshold"] > 0
    ):
        self.word_dropout_module = word_dropout.WordDropout(
            dictionary, word_dropout_params
        )

    # Variable tracker
    self.tracker = VariableTracker()
    # Initialize adversarial mode
    self.set_gradient_tracking_mode(False)
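# In the encoder above only the first layer is bidirectional, and its
# per-direction size is hidden_dim // 2 so that the concatenated forward and
# backward outputs still have width hidden_dim and can feed the unidirectional
# layers that follow. A stand-alone sketch with plain nn.LSTM (dimensions here
# are illustrative defaults, not the configured model sizes):
import torch
import torch.nn as nn

hidden_dim, embed_dim = 512, 512
first_layer = nn.LSTM(embed_dim, hidden_dim // 2, num_layers=1, bidirectional=True)
second_layer = nn.LSTM(hidden_dim, hidden_dim, num_layers=1)

x = torch.randn(7, 3, embed_dim)  # (seq_len, batch, embed_dim)
out1, _ = first_layer(x)          # last dim is 2 * (hidden_dim // 2) == hidden_dim
out2, _ = second_layer(out1)      # widths line up without any extra projection
assert out1.shape[-1] == hidden_dim and out2.shape[-1] == hidden_dim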
def __init__(
    self,
    src_dict,
    dst_dict,
    vocab_reduction_params=None,
    n=4,
    encoder_hidden_dim=512,
    embed_dim=512,
    freeze_embed=False,
    hidden_dim=512,
    out_embed_dim=512,
    num_layers=1,
    dropout_in=0.1,
    dropout_out=0.1,
    attention_type="dot",
    residual_level=None,
    activation_fn=nn.ReLU,
    project_output=True,
    pretrained_embed=None,
    projection_pretrained_embed=None,
):
    super().__init__(
        src_dict,
        dst_dict,
        vocab_reduction_params,
        out_embed_dim,
        project_output=project_output,
        pretrained_embed=projection_pretrained_embed,
    )
    self.history_len = n - 1
    self.encoder_hidden_dim = encoder_hidden_dim
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.out_embed_dim = out_embed_dim
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.attention_type = attention_type
    self.residual_level = residual_level
    self.dst_dict = dst_dict
    self.activation_fn = activation_fn

    num_embeddings = len(dst_dict)
    padding_idx = dst_dict.pad()
    self.embed_tokens = Embedding(
        num_embeddings=num_embeddings,
        embedding_dim=embed_dim,
        padding_idx=padding_idx,
        freeze_embed=freeze_embed,
    )
    pytorch_translate_utils.load_embedding(
        embedding=self.embed_tokens,
        dictionary=dst_dict,
        pretrained_embed=pretrained_embed,
    )

    self.history_conv = nn.Sequential(
        torch.nn.Conv1d(embed_dim, hidden_dim, self.history_len), activation_fn()
    )
    self.layers = nn.ModuleList(
        [
            NonlinearLayer(hidden_dim, hidden_dim, activation_fn=activation_fn)
            for _ in range(num_layers)
        ]
    )
    self.attention = attention.build_attention(
        attention_type=attention_type,
        decoder_hidden_state_dim=hidden_dim,
        context_dim=encoder_hidden_dim,
        force_projection=True,
    )
    self.combined_output_and_context_dim = self.attention.context_dim + hidden_dim
    if self.combined_output_and_context_dim != out_embed_dim:
        self.additional_fc = Linear(
            self.combined_output_and_context_dim, out_embed_dim
        )
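# The n-gram decoder above summarizes the previous n-1 target embeddings with a
# Conv1d whose kernel width equals that whole history window, collapsing it to
# one hidden vector per step. A shape-only sketch (batch size and dimensions
# are illustrative, not the model's configured values):
import torch
import torch.nn as nn

n, embed_dim, hidden_dim = 4, 512, 512
history_len = n - 1
history_conv = nn.Sequential(nn.Conv1d(embed_dim, hidden_dim, history_len), nn.ReLU())

batch = 3
# Embeddings of the last n-1 generated tokens, laid out as (batch, channels, time).
prev_token_embeds = torch.randn(batch, embed_dim, history_len)
summary = history_conv(prev_token_embeds)
assert summary.shape == (batch, hidden_dim, 1)  # one vector summarizing the window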
def build_model(cls, args, task):
    """Build a new model instance."""
    src_dict, dst_dict = task.source_dictionary, task.target_dictionary
    base_architecture(args)

    assert args.sequence_lstm, "CharRNNModel only supports sequence_lstm"
    assert args.cell_type == "lstm", "CharRNNModel only supports cell_type lstm"
    assert hasattr(args, "char_source_dict_size"), (
        "args.char_source_dict_size required. "
        "Should be set by load_binarized_dataset()"
    )

    if hasattr(args, "char_cnn_params"):
        args.embed_bytes = getattr(args, "embed_bytes", False)

        # If we embed bytes, the number of indices is fixed and does not
        # depend on the dictionary.
        if args.embed_bytes:
            num_chars = vocab_constants.NUM_BYTE_INDICES + len(TAGS) + 1
        else:
            num_chars = args.char_source_dict_size

        # If use_pretrained_weights is true, verify that the model params
        # are set correctly.
        if args.embed_bytes and getattr(args, "use_pretrained_weights", False):
            verify_pretrain_params(args)

        encoder = CharCNNEncoder(
            src_dict,
            num_chars=num_chars,
            unk_only_char_encoding=args.unk_only_char_encoding,
            embed_dim=args.char_embed_dim,
            token_embed_dim=args.encoder_embed_dim,
            freeze_embed=args.encoder_freeze_embed,
            normalize_embed=args.encoder_normalize_embed,
            char_cnn_params=args.char_cnn_params,
            char_cnn_nonlinear_fn=args.char_cnn_nonlinear_fn,
            char_cnn_num_highway_layers=args.char_cnn_num_highway_layers,
            char_cnn_output_dim=getattr(args, "char_cnn_output_dim", -1),
            num_layers=args.encoder_layers,
            hidden_dim=args.encoder_hidden_dim,
            dropout_in=args.encoder_dropout_in,
            dropout_out=args.encoder_dropout_out,
            residual_level=args.residual_level,
            bidirectional=bool(args.encoder_bidirectional),
            use_pretrained_weights=getattr(args, "use_pretrained_weights", False),
            finetune_pretrained_weights=getattr(
                args, "finetune_pretrained_weights", False
            ),
            weights_file=getattr(args, "pretrained_weights_file", ""),
        )
    else:
        assert (
            args.unk_only_char_encoding is False
        ), "unk_only_char_encoding should be False when using CharRNNEncoder"
        encoder = CharRNNEncoder(
            src_dict,
            num_chars=args.char_source_dict_size,
            char_embed_dim=args.char_embed_dim,
            token_embed_dim=args.encoder_embed_dim,
            normalize_embed=args.encoder_normalize_embed,
            char_rnn_units=args.char_rnn_units,
            char_rnn_layers=args.char_rnn_layers,
            num_layers=args.encoder_layers,
            hidden_dim=args.encoder_hidden_dim,
            dropout_in=args.encoder_dropout_in,
            dropout_out=args.encoder_dropout_out,
            residual_level=args.residual_level,
            bidirectional=bool(args.encoder_bidirectional),
        )

    decoder_embed_tokens = Embedding(
        num_embeddings=len(dst_dict),
        embedding_dim=args.decoder_embed_dim,
        padding_idx=dst_dict.pad(),
        freeze_embed=args.decoder_freeze_embed,
    )
    utils.load_embedding(
        embedding=decoder_embed_tokens,
        dictionary=dst_dict,
        pretrained_embed=args.decoder_pretrained_embed,
    )
    decoder = rnn.RNNDecoder(
        src_dict=src_dict,
        dst_dict=dst_dict,
        embed_tokens=decoder_embed_tokens,
        vocab_reduction_params=args.vocab_reduction_params,
        encoder_hidden_dim=args.encoder_hidden_dim,
        embed_dim=args.decoder_embed_dim,
        out_embed_dim=args.decoder_out_embed_dim,
        cell_type=args.cell_type,
        num_layers=args.decoder_layers,
        hidden_dim=args.decoder_hidden_dim,
        attention_type=args.attention_type,
        dropout_in=args.decoder_dropout_in,
        dropout_out=args.decoder_dropout_out,
        residual_level=args.residual_level,
        averaging_encoder=args.averaging_encoder,
    )
    return cls(task, encoder, decoder)
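# build_model picks the character-level encoder purely from which options are
# present on `args`: having char_cnn_params selects the CNN encoder, otherwise
# the character-RNN encoder is used. A tiny stand-alone illustration of that
# dispatch (the option values below are made up for the example):
from argparse import Namespace

def pick_encoder_kind(args):
    # Mirrors the hasattr(args, "char_cnn_params") branch above.
    return "CharCNNEncoder" if hasattr(args, "char_cnn_params") else "CharRNNEncoder"

assert pick_encoder_kind(Namespace(char_cnn_params="[(128, 3)]")) == "CharCNNEncoder"
assert pick_encoder_kind(Namespace(char_rnn_units=256)) == "CharRNNEncoder"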