Example #1
    def __init__(self,
                 model_dim,
                 num_heads,
                 dropout=0.1,
                 batch_first=False,
                 masked_layers=False):
        super().__init__()
        self.num_heads = num_heads
        self.model_dim = model_dim
        self.batch_first = batch_first
        self.masked_layers = masked_layers

        assert model_dim % num_heads == 0

        self.head_dim = model_dim // num_heads

        self.query_projection = MaskedFunction(
            XavierLinear(model_dim, model_dim, bias=False))
        self.key_projection = MaskedFunction(
            XavierLinear(model_dim, model_dim, bias=False))
        self.value_projection = MaskedFunction(
            XavierLinear(model_dim, model_dim, bias=False))

        self.out_projection = MaskedFunction(
            XavierLinear(model_dim, model_dim, bias=False))

        self.attn_dropout = nn.Dropout(dropout)
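The constructor above relies on two helpers that are not shown in any of these examples: XavierLinear and MaskedFunction. The sketch below is only a guess at their shape, assuming XavierLinear is an nn.Linear with Xavier-uniform initialisation and MaskedFunction is a thin wrapper that applies a module (the real MaskedFunction presumably skips computation at padded positions, which the sketch omits):

import torch.nn as nn


class XavierLinear(nn.Linear):
    """Sketch: a Linear layer with Xavier-uniform initialised weights."""

    def __init__(self, in_features, out_features, bias=True, weight_norm=False):
        super().__init__(in_features, out_features, bias=bias)
        nn.init.xavier_uniform_(self.weight)
        if bias:
            nn.init.zeros_(self.bias)
        if weight_norm:
            # the real class may handle weight normalisation differently
            nn.utils.weight_norm(self)


class MaskedFunction(nn.Module):
    """Sketch: applies the wrapped module; mask handling is omitted here."""

    def __init__(self, function):
        super().__init__()
        self.function = function

    def forward(self, inputs, mask=None):
        # A full implementation would restrict the computation to positions
        # selected by the mask; this sketch applies the function everywhere.
        return self.function(inputs)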
Example #2
    def __init__(self, h, d_model, attn_p=0.1, static=True, share=3):
        super(MultiHeadAttention, self).__init__()
        self.h = h
        self.d = d_model
        self.share = share

        assert d_model % h == 0

        self.d_head = d_model // h
        self.fc_query = MaskedFunction(
            XavierLinear(d_model, h * self.d_head, bias=False))
        self.fc_key = MaskedFunction(
            XavierLinear(d_model, h * self.d_head, bias=False))
        self.fc_value = MaskedFunction(
            XavierLinear(d_model, h * self.d_head, bias=False))

        self.fc_concat = MaskedFunction(
            XavierLinear(h * self.d_head, d_model, bias=False))

        self.sm = nn.Softmax(dim=-1)

        if static:
            self.attn_dropout = StaticDropout(attn_p)
        else:
            self.attn_dropout = nn.Dropout(attn_p)
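Examples #1 and #2 only show the constructor of the attention module. A forward pass built on these projections would normally be scaled dot-product attention over h heads; the following standalone sketch assumes (length, batch, d_model) inputs and reuses the attribute names from Example #2, but it is not the project's actual forward method:

import math

import torch


def attention_forward(self, query, key, value, mask=None):
    """Sketch of multi-head scaled dot-product attention.

    query, key, value: (length, batch, d_model)
    mask: optional bool tensor broadcastable to (batch * h, q_len, k_len),
          True at positions that should be ignored.
    """
    q_len, batch, _ = query.size()
    k_len = key.size(0)

    def split_heads(x, length):
        # (length, batch, d_model) -> (batch * h, length, d_head)
        return x.view(length, batch * self.h, self.d_head).transpose(0, 1)

    q = split_heads(self.fc_query(query), q_len)
    k = split_heads(self.fc_key(key), k_len)
    v = split_heads(self.fc_value(value), k_len)

    scores = torch.bmm(q, k.transpose(1, 2)) / math.sqrt(self.d_head)
    if mask is not None:
        scores = scores.masked_fill(mask, float('-inf'))

    weights = self.attn_dropout(self.sm(scores))
    context = torch.bmm(weights, v)                     # (batch * h, q_len, d_head)
    context = context.transpose(0, 1).contiguous().view(q_len, batch,
                                                        self.h * self.d_head)
    return self.fc_concat(context), weights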
Example #3
    def __init__(self, model_dim, dropout=0.1, feed_forward=None, batch_first=False, masked_layers=False):
        super().__init__()
        self.batch_first = batch_first
        self.model_dim = model_dim
        self.masked_layers = masked_layers
        self.dropout = nn.Dropout(dropout)
        self.feed_forward = feed_forward
        self.input_proj = XavierLinear(model_dim * 2, model_dim, bias=False)
        self.forget_proj = XavierLinear(model_dim * 2, model_dim, bias=False)
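Example #3 also shows only a constructor. The paired input/forget projections over a concatenated (x, context) vector suggest a gating layer; the forward below is one plausible reading under that assumption (the gating scheme, argument names, and use of feed_forward are all guesses, not the project's code):

import torch


def gated_forward(self, x, context):
    """Sketch: gate a (possibly transformed) context against the input x."""
    if self.feed_forward is not None:
        context = self.feed_forward(context)

    combined = torch.cat([x, context], dim=-1)            # (..., 2 * model_dim)
    input_gate = torch.sigmoid(self.input_proj(combined))
    forget_gate = torch.sigmoid(self.forget_proj(combined))

    return self.dropout(input_gate * context + forget_gate * x)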
Example #4
    def _build_model(self, model_args):
        logger.info('Building {} model'.format(model_args.model))
        model = build_model(model_args.model, model_args)

        embedding_size = model_args.word_vec_size or getattr(model_args, 'model_size', None)
        if embedding_size is None:
            raise ValueError('Could not infer embedding size')

        if model_args.copy_decoder and not model_args.join_vocab:
            raise NotImplementedError('In order to use the copy decoder, the source and target language must '
                                      'use the same vocabulary')

        if model_args.join_vocab and model_args.pre_word_vecs_dec:
            raise ValueError('Cannot join vocabularies when loading pre-trained target embeddings')

        dummy_input = torch.zeros(1, 1, embedding_size)
        dummy_output, _ = model(dummy_input, dummy_input)
        output_size = dummy_output.size(-1)

        src_embedding = self._get_embedding(model_args, self.src_dict, embedding_size,
                                            getattr(self.args, 'pre_word_vecs_enc', None))

        if model_args.join_vocab:
            tgt_embedding = src_embedding
        else:
            tgt_embedding = self._get_embedding(model_args, self.tgt_dict, embedding_size,
                                                getattr(self.args, 'pre_word_vecs_dec', None))

        tgt_linear = XavierLinear(output_size, len(self.tgt_dict))

        if model_args.tie_weights:
            tgt_linear.weight = tgt_embedding.weight

        encoder = NMTEncoder(model.encoder, src_embedding, model_args.word_dropout)

        if model_args.copy_decoder:
            masked_layers = getattr(model_args, 'masked_layers', False)
            attention_dropout = getattr(model_args, 'attn_dropout', 0.0)
            decoder = NMTDecoder(model.decoder, tgt_embedding, model_args.word_dropout, tgt_linear,
                                 copy_decoder=True,
                                 batch_first=model_args.batch_first,
                                 extra_attention=model_args.extra_attention,
                                 masked_layers=masked_layers,
                                 attention_dropout=attention_dropout)
        else:
            decoder = NMTDecoder(model.decoder, tgt_embedding, model_args.word_dropout, tgt_linear)

        if model_args.freeze_model:
            logger.info('Freezing model parameters')
            for param in itertools.chain(encoder.parameters(), decoder.decoder.parameters(),
                                         tgt_embedding.parameters(),
                                         tgt_linear.parameters()):
                param.requires_grad_(False)

        self.model = EncoderDecoderModel(encoder, decoder)
        self.model.batch_first = model_args.batch_first
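The dummy forward pass in this example (and in Examples #7 and #9) is a small trick for discovering the feature size a freshly built model produces, without inspecting its internals. Isolated, with a plain nn.Sequential standing in for whatever build_model returns, it looks like this:

import torch
import torch.nn as nn

embedding_size = 512
# Stand-in for the built model: anything mapping (len, batch, embedding_size)
# to (len, batch, output_size) works for the illustration.
model = nn.Sequential(nn.Linear(embedding_size, 256), nn.ReLU())

with torch.no_grad():
    dummy_output = model(torch.zeros(1, 1, embedding_size))

output_size = dummy_output.size(-1)
print(output_size)   # 256 -- used to size the output projection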
Example #5
    def __init__(self, model_dim, hidden_dim, dropout, weight_norm=False):
        super().__init__()
        self.model_dim = model_dim
        self.hidden_dim = hidden_dim
        self.layer_1 = XavierLinear(model_dim,
                                    hidden_dim,
                                    weight_norm=weight_norm)
        self.layer_2 = XavierLinear(hidden_dim,
                                    model_dim,
                                    weight_norm=weight_norm)

        self.dropout = nn.Dropout(dropout)
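Example #5 defines the two projections of a position-wise feed-forward block but not its forward pass. The conventional form, assuming a ReLU activation and dropout on the hidden layer (the real module may differ in both respects), is:

import torch.nn.functional as F


def feed_forward(self, x):
    """Sketch: position-wise feed-forward, (..., model_dim) -> (..., model_dim)."""
    hidden = F.relu(self.layer_1(x))
    return self.layer_2(self.dropout(hidden))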
Example #6
    def __init__(self,
                 encoder_dim,
                 hidden_dim,
                 output_dim,
                 dropout=0.1,
                 num_layers=3):
        super().__init__()
        layers = [
            XavierLinear(encoder_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(dropout)
        ]
        for _ in range(num_layers - 1):
            layers.extend([
                XavierLinear(hidden_dim, hidden_dim),
                nn.LeakyReLU(0.2),
                nn.Dropout(dropout)
            ])
        layers.append(XavierLinear(hidden_dim, output_dim))
        self.layers = nn.Sequential(*layers)
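Assuming the constructor in Example #6 belongs to an nn.Module whose forward simply calls self.layers, using it is straightforward. The wrapper below is hypothetical (the class name is made up, and plain nn.Linear replaces XavierLinear so the snippet runs on its own):

import torch
import torch.nn as nn


class EncoderClassifier(nn.Module):
    """Hypothetical wrapper around the constructor shown in Example #6."""

    def __init__(self, encoder_dim, hidden_dim, output_dim, dropout=0.1, num_layers=3):
        super().__init__()
        layers = [nn.Linear(encoder_dim, hidden_dim), nn.LeakyReLU(0.2), nn.Dropout(dropout)]
        for _ in range(num_layers - 1):
            layers.extend([nn.Linear(hidden_dim, hidden_dim), nn.LeakyReLU(0.2), nn.Dropout(dropout)])
        layers.append(nn.Linear(hidden_dim, output_dim))
        self.layers = nn.Sequential(*layers)

    def forward(self, encoder_states):
        return self.layers(encoder_states)


classifier = EncoderClassifier(encoder_dim=512, hidden_dim=256, output_dim=4)
logits = classifier(torch.randn(8, 20, 512))   # (batch, seq_len, encoder_dim)
print(logits.shape)                            # torch.Size([8, 20, 4])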
Example #7
    def _build_model(self, model_args):
        logger.info('Building translation model')
        super()._build_model(model_args)
        translation_model = self.model

        logger.info('Building denoising model')
        denoising_model = build_model(model_args.model, model_args)
        del denoising_model.encoder

        embedding = translation_model.encoder.embedded_dropout.embedding
        linear = XavierLinear(translation_model.decoder.linear.weight.size(1),
                              len(self.src_dict))

        if model_args.tie_dual_weights:
            linear.weight = embedding.weight

        if model_args.copy_decoder:
            masked_layers = getattr(model_args, 'masked_layers', False)
            attention_dropout = getattr(model_args, 'attn_dropout', 0.0)
            decoder = NMTDecoder(denoising_model.decoder,
                                 embedding,
                                 model_args.word_dropout,
                                 linear,
                                 copy_decoder=True,
                                 batch_first=model_args.batch_first,
                                 extra_attention=model_args.extra_attention,
                                 masked_layers=masked_layers,
                                 attention_dropout=attention_dropout)
        else:
            decoder = NMTDecoder(denoising_model.decoder, embedding,
                                 model_args.word_dropout, linear)

        denoising_model = EncoderDecoderModel(translation_model.encoder,
                                              decoder)
        compound_model = self.DialectTranslationModel(translation_model,
                                                      denoising_model)
        compound_model.batch_first = translation_model.batch_first
        self.model = compound_model
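The weight tying used in Examples #4 and #7 (linear.weight = embedding.weight) works because assigning one module's Parameter to another makes both modules hold the very same tensor, so a gradient step through either updates the shared matrix. A small demonstration with plain nn.Linear and nn.Embedding in place of the project's classes:

import torch
import torch.nn as nn

vocab_size, model_dim = 1000, 512
embedding = nn.Embedding(vocab_size, model_dim)
projection = nn.Linear(model_dim, vocab_size, bias=False)

# Tie the weights: both modules now reference one Parameter of shape (vocab_size, model_dim).
projection.weight = embedding.weight
assert projection.weight.data_ptr() == embedding.weight.data_ptr()

loss = projection(embedding(torch.tensor([3]))).sum()
loss.backward()
print(embedding.weight.grad is projection.weight.grad)   # True: one shared gradient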
Example #8
    def __init__(self,
                 decoder,
                 embedding,
                 dropout,
                 linear,
                 *,
                 copy_decoder=False,
                 batch_first=False,
                 extra_attention=False,
                 masked_layers=False,
                 attention_dropout=0.1,
                 language_embedding=None):
        super().__init__()
        self.decoder = decoder
        self.embedded_dropout = EmbeddingDropout(embedding, dropout)
        self.linear = linear
        self.copy_decoder = copy_decoder
        self.batch_first = batch_first
        self.extra_attention = extra_attention

        if self.copy_decoder:
            model_dim = linear.weight.size(1)
            self.gate_layer = XavierLinear(model_dim, 1)
            if extra_attention:
                self.attention = MultiHeadAttention(model_dim, 1,
                                                    attention_dropout,
                                                    batch_first, masked_layers)

            self._register_load_state_dict_pre_hook(
                self._load_nmt_model_compatibility)

        if language_embedding is not None:
            self.language_embedding = language_embedding
            model_dim = self.embedded_dropout.embedding.weight.size(1)
            emb_dim = language_embedding.weight.size(1)
            self.merge_layer = XavierLinear(model_dim + emb_dim, model_dim)
        else:
            self.language_embedding = None
Example #9
    def _build_model(self, model_args):
        logger.info('Building {} model'.format(model_args.model))

        num_models = 1
        if model_args.output_select == 'separate_decoders':
            num_models = max(num_models, len(self.target_languages))
        if model_args.separate_encoders:
            num_models = max(num_models, len(self.source_languages))
        models = [
            build_model(model_args.model, model_args)
            for _ in range(num_models)
        ]
        model = models[0]

        embedding_size = model_args.word_vec_size or getattr(
            model_args, 'model_size', None)
        if embedding_size is None:
            raise ValueError('Could not infer embedding size')

        if model_args.copy_decoder and not model_args.join_src_tgt_vocab:
            raise NotImplementedError(
                'In order to use the copy decoder, the source and target language must '
                'use the same vocabulary')

        dummy_input = torch.zeros(1, 1, embedding_size)
        dummy_output, _ = model(dummy_input, dummy_input)
        output_size = dummy_output.size(-1)

        # The pre-trained embedding argument comes from self.args, not model_args, so we don't load
        # the embeddings again when loading the model from a checkpoint
        if hasattr(self.args, 'pre_word_vecs'):
            if len(self.args.pre_word_vecs
                   ) == 1 and '=' not in self.args.pre_word_vecs[0]:
                preload = {'': self.args.pre_word_vecs[0]}
            else:
                preload = {
                    key: filename
                    for key, filename in map(lambda x: x.split('='),
                                             self.args.pre_word_vecs)
                }
        else:
            preload = {}

        dict_keys = self._get_dict_keys()
        inv_dictionaries = {
            dictionary: key
            for key, dictionary in self.dictionaries.items()
        }

        embeddings = {}
        for dictionary, key in inv_dictionaries.items():
            embedding = self._get_embedding(model_args,
                                            dictionary,
                                            embedding_size,
                                            path=preload.get(
                                                dict_keys[key][:-5], None))
            embeddings[dictionary] = embedding

        if model_args.separate_encoders:
            encoders = {
                lang: NMTEncoder(m.encoder,
                                 embeddings[self.dictionaries[lang + '.src']],
                                 model_args.word_dropout)
                for lang, m in zip(self.source_languages, models)
            }
            logger.debug('Number of encoders {}'.format(len(encoders)))
        else:
            source_dicts = set(self.dictionaries[lang + '.src']
                               for lang in self.source_languages)
            encoders = {
                dictionary: NMTEncoder(m.encoder, embeddings[dictionary],
                                       model_args.word_dropout)
                for dictionary, m in zip(source_dicts, models)
            }
            logger.debug('Number of encoders {}'.format(len(encoders)))
            encoders = {
                lang: encoders[self.dictionaries[lang + '.src']]
                for lang in self.source_languages
            }

        if model_args.output_select == 'decoder_every_step':
            language_embedding = nn.Embedding(len(self.target_languages),
                                              embedding_size)
        else:
            language_embedding = None

        def make_decoder(decoder, embedding, linear):
            if model_args.copy_decoder:
                masked_layers = getattr(model_args, 'masked_layers', False)
                attention_dropout = getattr(model_args, 'attn_dropout', 0.0)
                return NMTDecoder(decoder,
                                  embedding,
                                  model_args.word_dropout,
                                  linear,
                                  copy_decoder=True,
                                  batch_first=model_args.batch_first,
                                  extra_attention=model_args.extra_attention,
                                  masked_layers=masked_layers,
                                  attention_dropout=attention_dropout,
                                  language_embedding=language_embedding)
            else:
                return NMTDecoder(decoder,
                                  embedding,
                                  model_args.word_dropout,
                                  linear,
                                  language_embedding=language_embedding)

        target_dicts = set(self.dictionaries[lang + '.tgt']
                           for lang in self.target_languages)
        linears = {
            dictionary: XavierLinear(output_size, len(dictionary))
            for dictionary in target_dicts
        }

        if model_args.tie_weights:
            for dictionary in target_dicts:
                linears[dictionary].weight = embeddings[dictionary].weight

        if model_args.output_select == 'separate_decoders':
            decoders = {
                lang:
                make_decoder(m.decoder,
                             embeddings[self.dictionaries[lang + '.tgt']],
                             linears[self.dictionaries[lang + '.tgt']])
                for lang, m in zip(self.target_languages, models)
            }
            logger.debug('Number of decoders {}'.format(len(decoders)))
        else:
            decoders = {
                dictionary: make_decoder(model.decoder, embeddings[dictionary],
                                         linears[dictionary])
                for dictionary in target_dicts
            }
            logger.debug('Number of decoders {}'.format(len(decoders)))
            decoders = {
                lang: decoders[self.dictionaries[lang + '.tgt']]
                for lang in self.target_languages
            }

        # Share extra decoder parameters
        if model_args.copy_decoder:
            first_decoder = decoders[self.target_languages[0]]
            for decoder in decoders.values():
                if model_args.extra_attention:
                    decoder.attention = first_decoder.attention

            if model_args.join_lang_vocab == ['all']:
                for decoder in decoders.values():
                    decoder.merge_layer = first_decoder.merge_layer
            elif len(model_args.join_lang_vocab) > 0:
                first_decoder = decoders[model_args.join_lang_vocab[0]]
                for other_lang in model_args.join_lang_vocab[1:]:
                    decoders[
                        other_lang].merge_layer = first_decoder.merge_layer

        self.model = self.MultilingualNMTModel(encoders, decoders)
        self.model.batch_first = model_args.batch_first
        self.model.output_select = model_args.output_select
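The pre_word_vecs handling near the top of this method accepts either a single bare path (applied to every dictionary, stored under the empty key) or several key=filename entries. The standalone function below mirrors that parsing with hypothetical argument values:

def parse_preload(pre_word_vecs):
    """Mirror of the preload parsing above; names and inputs are hypothetical."""
    if not pre_word_vecs:
        return {}
    if len(pre_word_vecs) == 1 and '=' not in pre_word_vecs[0]:
        return {'': pre_word_vecs[0]}
    return dict(entry.split('=', 1) for entry in pre_word_vecs)


print(parse_preload(['vectors.pt']))
# {'': 'vectors.pt'}
print(parse_preload(['de.src=de_vecs.pt', 'en.tgt=en_vecs.pt']))
# {'de.src': 'de_vecs.pt', 'en.tgt': 'en_vecs.pt'}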
Example #10
class NMTDecoder(IncrementalDecoder):
    """Wraps a Decoder and adds embedding and projection"""
    def __init__(self,
                 decoder,
                 embedding,
                 dropout,
                 linear,
                 *,
                 copy_decoder=False,
                 batch_first=False,
                 extra_attention=False,
                 masked_layers=False,
                 attention_dropout=0.1,
                 language_embedding=None):
        super().__init__()
        self.decoder = decoder
        self.embedded_dropout = EmbeddingDropout(embedding, dropout)
        self.linear = linear
        self.copy_decoder = copy_decoder
        self.batch_first = batch_first
        self.extra_attention = extra_attention

        if self.copy_decoder:
            model_dim = linear.weight.size(1)
            self.gate_layer = XavierLinear(model_dim, 1)
            if extra_attention:
                self.attention = MultiHeadAttention(model_dim, 1,
                                                    attention_dropout,
                                                    batch_first, masked_layers)

            self._register_load_state_dict_pre_hook(
                self._load_nmt_model_compatibility)

        if language_embedding is not None:
            self.language_embedding = language_embedding
            model_dim = self.embedded_dropout.embedding.weight.size(1)
            emb_dim = language_embedding.weight.size(1)
            self.merge_layer = XavierLinear(model_dim + emb_dim, model_dim)
        else:
            self.language_embedding = None

    def forward(self,
                decoder_inputs,
                encoder_outputs,
                decoder_mask=None,
                encoder_mask=None):
        if self.language_embedding is not None:
            indices, language_id = decoder_inputs

            emb = torch.cat((self.embedded_dropout(indices),
                             self.language_embedding(language_id)),
                            dim=-1)
            emb = self.merge_layer(emb)
        else:
            emb = self.embedded_dropout(decoder_inputs)

        out, attention_weights = self.decoder(emb, encoder_outputs,
                                              decoder_mask, encoder_mask)

        if self.copy_decoder:
            if self.extra_attention:
                source_attention_bias = self.get_encoder_attention_bias(
                    encoder_outputs, self.batch_first, encoder_mask)
                _, attention_weights = self.attention(out, encoder_outputs,
                                                      encoder_outputs,
                                                      source_attention_bias,
                                                      decoder_mask,
                                                      encoder_mask)

            gates = torch.sigmoid(self.gate_layer(out)).squeeze(-1)

        if self.training and decoder_mask is not None:
            # Optimize the projection by computing it only for positions where
            # the input was not padding
            nonpad_indices = torch.nonzero(decoder_mask.view(-1)).squeeze(1)
            out = out.view(-1, out.size(-1))
            out = out.index_select(0, nonpad_indices)

            # For multi-head attention, the batch dimension will be larger than the
            # decoder mask, so the result of this operation is not meaningful there
            if attention_weights is not None:
                attention_weights = attention_weights.view(
                    -1, attention_weights.size(-1))
                attention_weights = attention_weights.index_select(
                    0, nonpad_indices)
            if self.copy_decoder:
                gates = gates.masked_select(decoder_mask)

        if self.copy_decoder:
            attention_weights = {'attn': attention_weights, 'gates': gates}

        return self.linear(out), attention_weights

    def _step(self,
              decoder_inputs,
              encoder_outputs,
              incremental_state,
              decoder_mask=None,
              encoder_mask=None):
        emb = self.embedded_dropout(decoder_inputs)
        out, attention_weights = self.decoder.step(emb, encoder_outputs,
                                                   incremental_state,
                                                   decoder_mask, encoder_mask)

        if self.copy_decoder:
            if self.extra_attention:
                source_attention_bias = self.get_encoder_attention_bias(
                    encoder_outputs, self.batch_first, encoder_mask)
                _, attention_weights = self.attention(out, encoder_outputs,
                                                      encoder_outputs,
                                                      source_attention_bias,
                                                      decoder_mask,
                                                      encoder_mask)

            gates = torch.sigmoid(self.gate_layer(out)).squeeze(-1)
            attention_weights = {'attn': attention_weights, 'gates': gates}

        return self.linear(out), attention_weights

    def get_normalized_probs(self,
                             decoder_outputs,
                             attention_weights,
                             encoder_inputs=None,
                             encoder_mask=None,
                             decoder_mask=None,
                             log_probs=False):
        decoder_probs = self.decoder.get_normalized_probs(
            decoder_outputs, attention_weights, encoder_inputs, encoder_mask,
            decoder_mask, log_probs)

        if not self.copy_decoder:
            return decoder_probs

        attention_weights, gates = attention_weights[
            'attn'], attention_weights['gates']
        gates = gates.unsqueeze(-1)

        optimized = decoder_outputs.dim() == 2
        if not self.batch_first:
            encoder_inputs = encoder_inputs.transpose(0, 1).unsqueeze(
                0)  # (1, batch, src_len)
        if optimized:
            # (batch, tgt_len, src_len) | (tgt_len, batch, src_len)
            new_size = list(decoder_mask.size()) + [encoder_inputs.size(-1)]
            nonpad_indices = torch.nonzero(decoder_mask.view(-1)).squeeze(1)
            encoder_inputs = encoder_inputs.expand(new_size).contiguous() \
                .view(-1, encoder_inputs.size(-1)) \
                .index_select(0, nonpad_indices)
            # encoder_inputs is now (decoder_outputs.size(0), src_len)
        else:
            encoder_inputs = encoder_inputs.expand_as(attention_weights)

        assert encoder_inputs.size() == attention_weights.size()

        encoder_probs = decoder_probs.new_full(decoder_probs.size(), 1e-20)
        encoder_probs.scatter_add_(1 if optimized else 2, encoder_inputs,
                                   attention_weights)

        if log_probs:
            encoder_probs.log_()
            encoder_probs.add_(torch.log(gates))
            decoder_probs.add_(torch.log(1 - gates))
            # Very important to have it this way around, otherwise we will add -inf + inf = NaN
            res = decoder_probs + torch.log1p(
                torch.exp(encoder_probs - decoder_probs))
            return res
        else:
            return gates * encoder_probs + (1 - gates) * decoder_probs

    def reorder_incremental_state(self, incremental_state, new_order):
        self.decoder.reorder_incremental_state(incremental_state, new_order)
        if self.extra_attention:
            self.attention.reorder_incremental_state(incremental_state,
                                                     new_order)

    def _load_nmt_model_compatibility(self, state_dict, prefix, local_metadata,
                                      strict, missing_keys, unexpected_keys,
                                      error_msgs):
        if prefix + 'gate_layer.weight' in state_dict:
            return

        logger.info('Augmenting NMTModel with a copy decoder')
        items = self.gate_layer.state_dict(prefix=prefix +
                                           'gate_layer.').items()
        if self.extra_attention:
            items = itertools.chain(
                items,
                self.attention.state_dict(prefix=prefix +
                                          'attention.').items())
        for key, value in items:
            assert key not in state_dict
            state_dict[key] = value
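The log-probability branch of get_normalized_probs in this example mixes the copy and decoder distributions without leaving log space: after adding log(g) and log(1 - g) to the two log distributions, a + log1p(exp(b - a)) equals log(exp(a) + exp(b)), and anchoring on the decoder term (as the comment in the code notes) avoids adding -inf + inf. A small numerical check, independent of the model code:

import torch

# Toy distributions over a 5-word vocabulary and a copy gate of 0.3.
decoder_probs = torch.tensor([0.5, 0.2, 0.1, 0.1, 0.1]).log()
encoder_probs = torch.tensor([1e-20, 1e-20, 0.7, 0.3, 1e-20]).log()
gate = torch.tensor(0.3)

a = decoder_probs + torch.log(1 - gate)   # log((1 - g) * p_decoder)
b = encoder_probs + torch.log(gate)       # log(g * p_copy)

mixed = a + torch.log1p(torch.exp(b - a))          # stable log(exp(a) + exp(b))
reference = torch.log((1 - gate) * decoder_probs.exp() + gate * encoder_probs.exp())
print(torch.allclose(mixed, reference))   # True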