def __init__(self, model_dim, num_heads, dropout=0.1, batch_first=False, masked_layers=False):
    super().__init__()
    self.num_heads = num_heads
    self.model_dim = model_dim
    self.batch_first = batch_first
    self.masked_layers = masked_layers

    assert model_dim % num_heads == 0
    self.head_dim = model_dim // num_heads

    self.query_projection = MaskedFunction(XavierLinear(model_dim, model_dim, bias=False))
    self.key_projection = MaskedFunction(XavierLinear(model_dim, model_dim, bias=False))
    self.value_projection = MaskedFunction(XavierLinear(model_dim, model_dim, bias=False))
    self.out_projection = MaskedFunction(XavierLinear(model_dim, model_dim, bias=False))

    self.attn_dropout = nn.Dropout(dropout)
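# A minimal sketch (not this module's actual forward method) of the scaled
# dot-product attention that the four projections above feed into, ignoring
# MaskedFunction's mask argument and any attention bias; shapes assume
# batch-first inputs of (batch, length, model_dim).
import math

import torch
import torch.nn.functional as F

def _attention_sketch(self, query, key, value):
    batch, q_len = query.size(0), query.size(1)
    # Project, then split model_dim into (num_heads, head_dim)
    q = self.query_projection(query).view(batch, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    k = self.key_projection(key).view(batch, -1, self.num_heads, self.head_dim).transpose(1, 2)
    v = self.value_projection(value).view(batch, -1, self.num_heads, self.head_dim).transpose(1, 2)
    # Scaled dot-product attention, computed independently per head
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
    weights = self.attn_dropout(F.softmax(scores, dim=-1))
    # Merge the heads back together and apply the output projection
    out = torch.matmul(weights, v).transpose(1, 2).contiguous().view(batch, q_len, -1)
    return self.out_projection(out), weights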
def __init__(self, h, d_model, attn_p=0.1, static=True, share=3):
    super(MultiHeadAttention, self).__init__()
    self.h = h
    self.d = d_model
    self.share = share

    assert d_model % h == 0
    self.d_head = d_model // h

    self.fc_query = MaskedFunction(XavierLinear(d_model, h * self.d_head, bias=False))
    self.fc_key = MaskedFunction(XavierLinear(d_model, h * self.d_head, bias=False))
    self.fc_value = MaskedFunction(XavierLinear(d_model, h * self.d_head, bias=False))
    self.fc_concat = MaskedFunction(XavierLinear(h * self.d_head, d_model, bias=False))

    self.sm = nn.Softmax(dim=-1)

    if static:
        self.attn_dropout = StaticDropout(attn_p)
    else:
        self.attn_dropout = nn.Dropout(attn_p)
def __init__(self, model_dim, dropout=0.1, feed_forward=None, batch_first=False, masked_layers=False):
    super().__init__()
    self.batch_first = batch_first
    self.model_dim = model_dim
    self.masked_layers = masked_layers
    self.dropout = nn.Dropout(dropout)
    self.feed_forward = feed_forward
    self.input_proj = XavierLinear(model_dim * 2, model_dim, bias=False)
    self.forget_proj = XavierLinear(model_dim * 2, model_dim, bias=False)
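# A hypothetical sketch of how input_proj and forget_proj might combine a
# sub-layer's input and output into a gated (highway-style) residual; this is
# an assumption inferred from the projection shapes (model_dim * 2 -> model_dim),
# not code taken from this module.
import torch

def _gated_residual_sketch(self, x, layer_out):
    combined = torch.cat((x, layer_out), dim=-1)
    input_gate = torch.sigmoid(self.input_proj(combined))
    forget_gate = torch.sigmoid(self.forget_proj(combined))
    return input_gate * layer_out + forget_gate * x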
def _build_model(self, model_args):
    logger.info('Building {} model'.format(model_args.model))
    model = build_model(model_args.model, model_args)

    embedding_size = model_args.word_vec_size or getattr(model_args, 'model_size', None)
    if embedding_size is None:
        raise ValueError('Could not infer embedding size')
    if model_args.copy_decoder and not model_args.join_vocab:
        raise NotImplementedError('In order to use the copy decoder, the source and target language must '
                                  'use the same vocabulary')
    if model_args.join_vocab and model_args.pre_word_vecs_dec:
        raise ValueError('Cannot join vocabularies when loading pre-trained target embeddings')

    dummy_input = torch.zeros(1, 1, embedding_size)
    dummy_output, _ = model(dummy_input, dummy_input)
    output_size = dummy_output.size(-1)

    src_embedding = self._get_embedding(model_args, self.src_dict, embedding_size,
                                        getattr(self.args, 'pre_word_vecs_enc', None))
    if model_args.join_vocab:
        tgt_embedding = src_embedding
    else:
        tgt_embedding = self._get_embedding(model_args, self.tgt_dict, embedding_size,
                                            getattr(self.args, 'pre_word_vecs_dec', None))

    tgt_linear = XavierLinear(output_size, len(self.tgt_dict))
    if model_args.tie_weights:
        tgt_linear.weight = tgt_embedding.weight

    encoder = NMTEncoder(model.encoder, src_embedding, model_args.word_dropout)

    if model_args.copy_decoder:
        masked_layers = getattr(model_args, 'masked_layers', False)
        attention_dropout = getattr(model_args, 'attn_dropout', 0.0)
        decoder = NMTDecoder(model.decoder, tgt_embedding, model_args.word_dropout, tgt_linear,
                             copy_decoder=True,
                             batch_first=model_args.batch_first,
                             extra_attention=model_args.extra_attention,
                             masked_layers=masked_layers,
                             attention_dropout=attention_dropout)
    else:
        decoder = NMTDecoder(model.decoder, tgt_embedding, model_args.word_dropout, tgt_linear)

    if model_args.freeze_model:
        logger.info('Freezing model parameters')
        for param in itertools.chain(encoder.parameters(),
                                     decoder.decoder.parameters(),
                                     tgt_embedding.parameters(),
                                     tgt_linear.parameters()):
            param.requires_grad_(False)

    self.model = EncoderDecoderModel(encoder, decoder)
    self.model.batch_first = model_args.batch_first
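# Standalone illustration of why the weight tying above is shape-compatible:
# nn.Linear stores its weight as (out_features, in_features) =
# (vocab_size, output_size), and the embedding weight is
# (vocab_size, embedding_size), so tying requires output_size == embedding_size.
# Assigning the Parameter makes both modules share a single tensor.
import torch.nn as nn

emb = nn.Embedding(1000, 512)           # weight is (vocab, emb_dim)
lin = nn.Linear(512, 1000, bias=False)  # weight is (1000, 512)
lin.weight = emb.weight
assert lin.weight.data_ptr() == emb.weight.data_ptr()  # shared storage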
def __init__(self, model_dim, hidden_dim, dropout, weight_norm=False):
    super().__init__()
    self.model_dim = model_dim
    self.hidden_dim = hidden_dim
    self.layer_1 = XavierLinear(model_dim, hidden_dim, weight_norm=weight_norm)
    self.layer_2 = XavierLinear(hidden_dim, model_dim, weight_norm=weight_norm)
    self.dropout = nn.Dropout(dropout)
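# A minimal sketch of the position-wise feed-forward computation these two
# layers conventionally implement; the ReLU activation is an assumption, since
# the constructor above does not show which non-linearity the module applies.
import torch

def _feed_forward_sketch(self, x):
    # expand to hidden_dim, apply non-linearity and dropout, project back
    return self.layer_2(self.dropout(torch.relu(self.layer_1(x))))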
def __init__(self, encoder_dim, hidden_dim, output_dim, dropout=0.1, num_layers=3):
    super().__init__()
    layers = [
        XavierLinear(encoder_dim, hidden_dim),
        nn.LeakyReLU(0.2),
        nn.Dropout(dropout)
    ]
    for _ in range(num_layers - 1):
        layers.extend([
            XavierLinear(hidden_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(dropout)
        ])
    layers.append(XavierLinear(hidden_dim, output_dim))
    self.layers = nn.Sequential(*layers)
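# Standalone equivalent of the stack built above for num_layers=3, with plain
# nn.Linear standing in for XavierLinear and made-up dimensions; the enclosing
# module presumably just forwards its input through self.layers.
import torch
import torch.nn as nn

head = nn.Sequential(
    nn.Linear(512, 256), nn.LeakyReLU(0.2), nn.Dropout(0.1),
    nn.Linear(256, 256), nn.LeakyReLU(0.2), nn.Dropout(0.1),
    nn.Linear(256, 256), nn.LeakyReLU(0.2), nn.Dropout(0.1),
    nn.Linear(256, 2),
)
logits = head(torch.randn(8, 512))  # -> shape (8, 2)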
def _build_model(self, model_args):
    logger.info('Building translation model')
    super()._build_model(model_args)
    translation_model = self.model

    logger.info('Building denoising model')
    denoising_model = build_model(model_args.model, model_args)
    del denoising_model.encoder

    embedding = translation_model.encoder.embedded_dropout.embedding
    linear = XavierLinear(translation_model.decoder.linear.weight.size(1), len(self.src_dict))
    if model_args.tie_dual_weights:
        linear.weight = embedding.weight

    if model_args.copy_decoder:
        masked_layers = getattr(model_args, 'masked_layers', False)
        attention_dropout = getattr(model_args, 'attn_dropout', 0.0)
        decoder = NMTDecoder(denoising_model.decoder, embedding, model_args.word_dropout, linear,
                             copy_decoder=True,
                             batch_first=model_args.batch_first,
                             extra_attention=model_args.extra_attention,
                             masked_layers=masked_layers,
                             attention_dropout=attention_dropout)
    else:
        decoder = NMTDecoder(denoising_model.decoder, embedding, model_args.word_dropout, linear)

    denoising_model = EncoderDecoderModel(translation_model.encoder, decoder)
    compound_model = self.DialectTranslationModel(translation_model, denoising_model)
    compound_model.batch_first = translation_model.batch_first
    self.model = compound_model
def _build_model(self, model_args):
    logger.info('Building {} model'.format(model_args.model))

    num_models = 1
    if model_args.output_select == 'separate_decoders':
        num_models = max(num_models, len(self.target_languages))
    if model_args.separate_encoders:
        num_models = max(num_models, len(self.source_languages))
    models = [build_model(model_args.model, model_args) for _ in range(num_models)]
    model = models[0]

    embedding_size = model_args.word_vec_size or getattr(model_args, 'model_size', None)
    if embedding_size is None:
        raise ValueError('Could not infer embedding size')
    if model_args.copy_decoder and not model_args.join_src_tgt_vocab:
        raise NotImplementedError('In order to use the copy decoder, the source and target language must '
                                  'use the same vocabulary')

    dummy_input = torch.zeros(1, 1, embedding_size)
    dummy_output, _ = model(dummy_input, dummy_input)
    output_size = dummy_output.size(-1)

    # Pre-trained embedding argument comes from self.args, not model_args, so we don't load them
    # again when loading the model from a checkpoint
    if hasattr(self.args, 'pre_word_vecs'):
        if len(self.args.pre_word_vecs) == 1 and '=' not in self.args.pre_word_vecs[0]:
            preload = {'': self.args.pre_word_vecs[0]}
        else:
            preload = {key: filename for key, filename
                       in map(lambda x: x.split('='), self.args.pre_word_vecs)}
    else:
        preload = {}

    dict_keys = self._get_dict_keys()
    inv_dictionaries = {dictionary: key for key, dictionary in self.dictionaries.items()}
    embeddings = {}
    for dictionary, key in inv_dictionaries.items():
        embedding = self._get_embedding(model_args, dictionary, embedding_size,
                                        path=preload.get(dict_keys[key][:-5], None))
        embeddings[dictionary] = embedding

    if model_args.separate_encoders:
        encoders = {lang: NMTEncoder(m.encoder,
                                     embeddings[self.dictionaries[lang + '.src']],
                                     model_args.word_dropout)
                    for lang, m in zip(self.source_languages, models)}
        logger.debug('Number of encoders {}'.format(len(encoders)))
    else:
        source_dicts = set(self.dictionaries[lang + '.src'] for lang in self.source_languages)
        encoders = {dictionary: NMTEncoder(m.encoder, embeddings[dictionary], model_args.word_dropout)
                    for dictionary, m in zip(source_dicts, models)}
        logger.debug('Number of encoders {}'.format(len(encoders)))
        encoders = {lang: encoders[self.dictionaries[lang + '.src']]
                    for lang in self.source_languages}

    if model_args.output_select == 'decoder_every_step':
        language_embedding = nn.Embedding(len(self.target_languages), embedding_size)
    else:
        language_embedding = None

    def make_decoder(decoder, embedding, linear):
        if model_args.copy_decoder:
            masked_layers = getattr(model_args, 'masked_layers', False)
            attention_dropout = getattr(model_args, 'attn_dropout', 0.0)
            return NMTDecoder(decoder, embedding, model_args.word_dropout, linear,
                              copy_decoder=True,
                              batch_first=model_args.batch_first,
                              extra_attention=model_args.extra_attention,
                              masked_layers=masked_layers,
                              attention_dropout=attention_dropout,
                              language_embedding=language_embedding)
        else:
            return NMTDecoder(decoder, embedding, model_args.word_dropout, linear,
                              language_embedding=language_embedding)

    target_dicts = set(self.dictionaries[lang + '.tgt'] for lang in self.target_languages)
    linears = {dictionary: XavierLinear(output_size, len(dictionary))
               for dictionary in target_dicts}
    if model_args.tie_weights:
        for dictionary in target_dicts:
            linears[dictionary].weight = embeddings[dictionary].weight

    if model_args.output_select == 'separate_decoders':
        decoders = {lang: make_decoder(m.decoder,
                                       embeddings[self.dictionaries[lang + '.tgt']],
                                       linears[self.dictionaries[lang + '.tgt']])
                    for lang, m in zip(self.target_languages, models)}
        logger.debug('Number of decoders {}'.format(len(decoders)))
    else:
        decoders = {dictionary: make_decoder(model.decoder, embeddings[dictionary], linears[dictionary])
                    for dictionary in target_dicts}
        logger.debug('Number of decoders {}'.format(len(decoders)))
        decoders = {lang: decoders[self.dictionaries[lang + '.tgt']]
                    for lang in self.target_languages}

    # Share extra decoder parameters
    if model_args.copy_decoder:
        first_decoder = decoders[self.target_languages[0]]
        for decoder in decoders.values():
            if model_args.extra_attention:
                decoder.attention = first_decoder.attention
        if model_args.join_lang_vocab == ['all']:
            for decoder in decoders.values():
                decoder.merge_layer = first_decoder.merge_layer
        elif len(model_args.join_lang_vocab) > 0:
            first_decoder = decoders[model_args.join_lang_vocab[0]]
            for other_lang in model_args.join_lang_vocab[1:]:
                decoders[other_lang].merge_layer = first_decoder.merge_layer

    self.model = self.MultilingualNMTModel(encoders, decoders)
    self.model.batch_first = model_args.batch_first
    self.model.output_select = model_args.output_select
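# Illustration of the --pre_word_vecs parsing above (hypothetical file names):
# a single bare path is stored under the '' key, while 'key=filename' pairs map
# individual dictionaries (matched against dict_keys[key] with its 5-character
# suffix stripped) to embedding files.
#   --pre_word_vecs vectors.txt          -> {'': 'vectors.txt'}
#   --pre_word_vecs de=de.vec en=en.vec  -> {'de': 'de.vec', 'en': 'en.vec'}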
class NMTDecoder(IncrementalDecoder):
    """Wraps a Decoder and adds embedding and projection"""

    def __init__(self, decoder, embedding, dropout, linear, *, copy_decoder=False,
                 batch_first=False, extra_attention=False, masked_layers=False,
                 attention_dropout=0.1, language_embedding=None):
        super().__init__()
        self.decoder = decoder
        self.embedded_dropout = EmbeddingDropout(embedding, dropout)
        self.linear = linear
        self.copy_decoder = copy_decoder
        self.batch_first = batch_first
        self.extra_attention = extra_attention
        if self.copy_decoder:
            model_dim = linear.weight.size(1)
            self.gate_layer = XavierLinear(model_dim, 1)
            if extra_attention:
                self.attention = MultiHeadAttention(model_dim, 1, attention_dropout,
                                                    batch_first, masked_layers)
            self._register_load_state_dict_pre_hook(self._load_nmt_model_compatibility)
        if language_embedding is not None:
            self.language_embedding = language_embedding
            model_dim = self.embedded_dropout.embedding.weight.size(1)
            emb_dim = language_embedding.weight.size(1)
            self.merge_layer = XavierLinear(model_dim + emb_dim, model_dim)
        else:
            self.language_embedding = None

    def forward(self, decoder_inputs, encoder_outputs, decoder_mask=None, encoder_mask=None):
        if self.language_embedding is not None:
            indices, language_id = decoder_inputs
            emb = torch.cat((self.embedded_dropout(indices),
                             self.language_embedding(language_id)), dim=-1)
            emb = self.merge_layer(emb)
        else:
            emb = self.embedded_dropout(decoder_inputs)
        out, attention_weights = self.decoder(emb, encoder_outputs, decoder_mask, encoder_mask)

        if self.copy_decoder:
            if self.extra_attention:
                source_attention_bias = self.get_encoder_attention_bias(
                    encoder_outputs, self.batch_first, encoder_mask)
                _, attention_weights = self.attention(out, encoder_outputs, encoder_outputs,
                                                      source_attention_bias,
                                                      decoder_mask, encoder_mask)
            gates = torch.sigmoid(self.gate_layer(out)).squeeze(-1)

        if self.training and decoder_mask is not None:
            # Optimize the projection by calculating only those positions where
            # the input was not padding
            nonpad_indices = torch.nonzero(decoder_mask.view(-1)).squeeze(1)
            out = out.view(-1, out.size(-1))
            out = out.index_select(0, nonpad_indices)
            # For multihead attention, the batch size dimension will be bigger.
            # That means the results of this operation are garbage
            if attention_weights is not None:
                attention_weights = attention_weights.view(-1, attention_weights.size(-1))
                attention_weights = attention_weights.index_select(0, nonpad_indices)
            if self.copy_decoder:
                gates = gates.masked_select(decoder_mask)

        if self.copy_decoder:
            attention_weights = {'attn': attention_weights, 'gates': gates}
        return self.linear(out), attention_weights

    def _step(self, decoder_inputs, encoder_outputs, incremental_state,
              decoder_mask=None, encoder_mask=None):
        emb = self.embedded_dropout(decoder_inputs)
        out, attention_weights = self.decoder.step(emb, encoder_outputs, incremental_state,
                                                   decoder_mask, encoder_mask)
        if self.copy_decoder:
            if self.extra_attention:
                source_attention_bias = self.get_encoder_attention_bias(
                    encoder_outputs, self.batch_first, encoder_mask)
                _, attention_weights = self.attention(out, encoder_outputs, encoder_outputs,
                                                      source_attention_bias,
                                                      decoder_mask, encoder_mask)
            gates = torch.sigmoid(self.gate_layer(out)).squeeze(-1)
            attention_weights = {'attn': attention_weights, 'gates': gates}
        return self.linear(out), attention_weights

    def get_normalized_probs(self, decoder_outputs, attention_weights, encoder_inputs=None,
                             encoder_mask=None, decoder_mask=None, log_probs=False):
        decoder_probs = self.decoder.get_normalized_probs(
            decoder_outputs, attention_weights, encoder_inputs,
            encoder_mask, decoder_mask, log_probs)
        if not self.copy_decoder:
            return decoder_probs

        attention_weights, gates = attention_weights['attn'], attention_weights['gates']
        gates = gates.unsqueeze(-1)
        optimized = decoder_outputs.dim() == 2

        if not self.batch_first:
            encoder_inputs = encoder_inputs.transpose(0, 1).unsqueeze(0)  # (1, batch, src_len)
        if optimized:
            # (batch, tgt_len, src_len) | (tgt_len, batch, src_len)
            new_size = list(decoder_mask.size()) + [encoder_inputs.size(-1)]
            nonpad_indices = torch.nonzero(decoder_mask.view(-1)).squeeze(1)
            encoder_inputs = encoder_inputs.expand(new_size).contiguous() \
                .view(-1, encoder_inputs.size(-1)) \
                .index_select(0, nonpad_indices)
            # encoder_inputs is now (decoder_outputs.size(0), src_len)
        else:
            encoder_inputs = encoder_inputs.expand_as(attention_weights)
        assert encoder_inputs.size() == attention_weights.size()

        encoder_probs = decoder_probs.new_full(decoder_probs.size(), 1e-20)
        encoder_probs.scatter_add_(1 if optimized else 2, encoder_inputs, attention_weights)

        if log_probs:
            encoder_probs.log_()
            encoder_probs.add_(torch.log(gates))
            decoder_probs.add_(torch.log(1 - gates))
            # Very important to have it this way around,
            # otherwise we will add -inf + inf = NaN
            res = decoder_probs + torch.log1p(torch.exp(encoder_probs - decoder_probs))
            return res
        else:
            return gates * encoder_probs + (1 - gates) * decoder_probs

    def reorder_incremental_state(self, incremental_state, new_order):
        self.decoder.reorder_incremental_state(incremental_state, new_order)
        if self.extra_attention:
            self.attention.reorder_incremental_state(incremental_state, new_order)

    def _load_nmt_model_compatibility(self, state_dict, prefix, local_metadata, strict,
                                      missing_keys, unexpected_keys, error_msgs):
        if prefix + 'gate_layer.weight' in state_dict:
            return
        logger.info('Augmenting NMTModel with a copy decoder')
        items = self.gate_layer.state_dict(prefix=prefix + 'gate_layer.').items()
        if self.extra_attention:
            items = itertools.chain(
                items,
                self.attention.state_dict(prefix=prefix + 'attention.').items())
        for key, value in items:
            assert key not in state_dict
            state_dict[key] = value
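# Standalone numeric check of the log-space mixing used in get_normalized_probs
# above: log(g * p_enc + (1 - g) * p_dec) is computed anchored on the decoder
# term, log((1-g) * p_dec) + log1p(exp(log(g * p_enc) - log((1-g) * p_dec))),
# which stays finite even when p_enc is (near) zero; anchoring on the encoder
# term instead would produce the -inf + inf = NaN the comment warns about.
import torch

g = torch.tensor(0.3)
p_enc = torch.tensor([1e-20, 0.5])  # copy distribution (1e-20 stands in for zero)
p_dec = torch.tensor([0.7, 0.5])    # decoder softmax distribution

log_enc = torch.log(p_enc) + torch.log(g)
log_dec = torch.log(p_dec) + torch.log(1 - g)
mixed = log_dec + torch.log1p(torch.exp(log_enc - log_dec))

assert torch.allclose(mixed, torch.log(g * p_enc + (1 - g) * p_dec))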