def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the training phase, or a snapshot
                    resumed from a stopped training run.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build embeddings.
    src_field = fields["src"]
    src_emb = build_embeddings(model_opt, src_field)
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"

        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    # Build encoder.
    encoder_x2y = build_encoder(model_opt, src_emb)
    encoder_y2x = build_encoder(model_opt, tgt_emb)

    # Build decoder.
    decoder_x2y = build_decoder(model_opt, tgt_emb)
    decoder_y2x = build_decoder(model_opt, src_emb)


    def share_attn_weight_and_bias(attn1, attn2, 
                                   share_relative_pos_embeddings=False):
        attn2.linear_keys = attn1.linear_keys
        attn2.linear_values = attn1.linear_values
        attn2.linear_query = attn1.linear_query
        attn2.final_linear = attn1.final_linear
        if share_relative_pos_embeddings:
            assert model_opt.max_relative_positions > 0
            attn2.relative_positions_embeddings = \
                attn1.relative_positions_embeddings

    # logger.info('share encoder')
    encoder_y2x = encoder_x2y
    # logger.info('share cross_attns btw fwd & bwd decoders')
    for dec1, dec2 in zip(decoder_x2y.transformer_layers, 
                            decoder_y2x.transformer_layers):
        share_attn_weight_and_bias(dec1.context_attn, dec2.context_attn)

    # logger.info('share self_attns btw fwd & bwd decoders')
    for dec1, dec2 in zip(decoder_x2y.transformer_layers, 
                            decoder_y2x.transformer_layers):
        share_attn_weight_and_bias(dec1.self_attn, dec2.self_attn,
                                    model_opt.share_relative_pos_embeddings)
    # logger.info('share feed_forwards btw fwd & bwd decoders')
    for dec1, dec2 in zip(decoder_x2y.transformer_layers, 
                            decoder_y2x.transformer_layers):
        dec2.feed_forward.w_1 = dec1.feed_forward.w_1
        dec2.feed_forward.w_2 = dec1.feed_forward.w_2
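
    # After the sharing above, the x2y and y2x directions share the encoder and the
    # decoders' self-attention, context-attention and feed-forward weights; the
    # embeddings, layer norms and generators remain direction-specific.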

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder_x2y, encoder_y2x, 
                                 decoder_x2y, decoder_y2x)

    # Build prior modeling
    prior = None
    if model_opt.learned_prior:
        assert model_opt.num_experts > 1
        prior = onmt.models.Classifier(
            model_opt.enc_rnn_size, model_opt.num_experts, 
            dropout=(model_opt.dropout[0] if type(model_opt.dropout) is list
                     else model_opt.dropout))

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator_x2y = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32),
            gen_func
        )
        generator_y2x = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["src"].base_field.vocab)),
            Cast(torch.float32),
            gen_func
        )
        if model_opt.share_decoder_embeddings:
            generator_x2y[0].weight = decoder_x2y.embeddings.word_lut.weight
            generator_y2x[0].weight = decoder_y2x.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        tgt_pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator_x2y = CopyGenerator(model_opt.dec_rnn_size,
                                      len(tgt_base_field.vocab), tgt_pad_idx)
        src_base_field = fields["src"].base_field
        src_pad_idx = src_base_field.vocab.stoi[src_base_field.pad_token]
        generator_y2x = CopyGenerator(model_opt.dec_rnn_size,
                                      len(src_base_field.vocab), src_pad_idx)
        if model_opt.share_decoder_embeddings:
            generator_x2y.linear.weight = decoder_x2y.embeddings.word_lut.weight
            generator_y2x.linear.weight = decoder_y2x.embeddings.word_lut.weight

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using custom layer norm
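        # e.g. 'decoder.layer_norm_1.a_2' -> 'decoder.layer_norm_1.weight'
        #      'decoder.layer_norm_1.b_2' -> 'decoder.layer_norm_1.bias'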
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator_x2y.load_state_dict(checkpoint['generator_x2y'], strict=False)
        generator_y2x.load_state_dict(checkpoint['generator_y2x'], strict=False)
        if model_opt.learned_prior:
            prior.load_state_dict(checkpoint['prior'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            def init_param(target_model):
                for p in target_model.parameters():
                    p.data.uniform_(-model_opt.param_init, 
                                    model_opt.param_init)
            init_param(model)
            init_param(generator_x2y)
            init_param(generator_y2x)
            if model_opt.learned_prior:
                init_param(prior)
        if model_opt.param_init_glorot:
            def init_glorot(target_model):
                for p in target_model.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)
            init_glorot(model)
            init_glorot(generator_x2y)
            init_glorot(generator_y2x)
            if model_opt.learned_prior:
                init_glorot(prior)

    model.generator_x2y = generator_x2y
    model.generator_y2x = generator_y2x
    model.prior = prior
    model.to(device)
    if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam':
        model.half()
    return model
Example #2
def build_base_model(model_opt, gpu, tokenizer, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        gpu (bool): whether to use gpu.
        tokenizer: tokenizer used to build the embedding layer. If opt.share_tokenizer
                   is true, tokenizer is an EasyTokenizer instance; otherwise it is a
                   dict containing 'src' and 'tgt' tokenizers.
        checkpoint: the model generated by the training phase, or a snapshot
                    resumed from a stopped training run.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # Build source embeddings.
    if model_opt.share_tokenizer:
        src_emb = build_embeddings(model_opt, tokenizer)
    else:
        src_emb = build_embeddings(model_opt, tokenizer['src'])
    # Build encoder.
    encoder = TransformerEncoder.from_opt(model_opt, src_emb)

    # Build target embeddings.
    if model_opt.share_tokenizer:
        tgt_emb = build_embeddings(model_opt, tokenizer, for_encoder=False)
    else:
        tgt_emb = build_embeddings(model_opt, tokenizer['tgt'], for_encoder=False)
    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        if not model_opt.share_tokenizer:
            # src/tgt vocab should be the same if `-share_vocab` is specified.
            assert tokenizer['src'].vocab == tokenizer['tgt'].vocab, \
                "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight
    # Build decoder.
    decoder = TransformerDecoder.from_opt(model_opt, tgt_emb)

    # Build TransformerModel(= encoder + decoder).
    model = TransformerModel(encoder, decoder)

    # Build Generator.
    # copy attention is a technique proposed in a separate paper
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_dim_size,
                      len(tokenizer.vocab) if model_opt.share_tokenizer else len(tokenizer['tgt'].vocab)),
            Cast(torch.float32),
            gen_func
        )
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_dim_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using custom layer norm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        # decide how to initialize the parameters
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        # Xavier (Glorot) initialization
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
        # load pretrained word embedding parameters
        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)
    # generator part
    model.generator = generator

    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model.to(device)
    if model_opt.model_dtype == 'fp16':
        model.half()

    return model
Example #3
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the training phase, or a snapshot
                    resumed from a stopped training run.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # Build embeddings.
    if model_opt.model_type == "text":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"

        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    if model_opt.share_position_embeddings:
        tgt_emb.make_embedding.pe.pe.weight = src_emb.make_embedding.pe.pe.weight

    decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")

    # Build separate LM if doing simple fusion
    if model_opt.simple_fusion:
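        # These values match the GPT-2 small configuration
        # (12 layers, hidden size 768, 12 attention heads).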
        layers = 12
        size = 768
        heads = 12

        lm_decoder_opt = copy.deepcopy(model_opt)
        lm_decoder_opt.dec_layers = layers
        lm_decoder_opt.use_GPT_version_ctxattn = False
        lm_decoder_opt.use_GPT_version_psa = False
        lm_decoder_opt.use_GPT_version_unconditional = True
        lm_decoder_opt.tgt_word_vec_size = size
        lm_decoder_opt.rnn_size = size
        lm_decoder_opt.dec_rnn_size = size
        lm_decoder_opt.transformer_ff = size * 4
        lm_decoder_opt.dec_heads = heads
        lm_decoder_opt.position_encoding_learned_dec = True
        lm_decoder_opt.share_decoder_embeddings = True
        lm_decoder_opt.dropout = 0

        lm_decoder_emb = build_embeddings(lm_decoder_opt,
                                          tgt_field,
                                          for_encoder=False)
        logger.info(lm_decoder_emb)

        lm_decoder = build_decoder(lm_decoder_opt, lm_decoder_emb)
        load_decoder = lm_decoder

        model = onmt.models.SimpleFusionModel(encoder, decoder, lm_decoder)

        generator = SimpleFusionGenerator(model_opt.dec_rnn_size,
                                          lm_decoder_opt.dec_rnn_size,
                                          len(fields["tgt"].base_field.vocab))
        generator.lm_linear.weight = lm_decoder.embeddings.word_lut.weight

        if model_opt.share_decoder_embeddings:
            generator.decoder_linear.weight = decoder.embeddings.word_lut.weight
        gen_linear = generator.lm_linear
    else:
        load_decoder = decoder
        if model_opt.unconditional:
            model = onmt.models.UncondModel(decoder)
        else:
            model = onmt.models.NMTModel(encoder, decoder)

        # Build Generator.
        if not model_opt.copy_attn:
            if model_opt.generator_function == "sparsemax":
                gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
            else:
                gen_func = nn.LogSoftmax(dim=-1)

            if model_opt.padded_vocab_fix_me_later:
                gen_func = nn.Sequential(PadGen(), gen_func)

            generator = nn.Sequential(
                nn.Linear(model_opt.dec_rnn_size,
                          len(fields["tgt"].base_field.vocab)),
                Cast(torch.float32), gen_func)
            if model_opt.share_decoder_embeddings:
                generator[0].weight = decoder.embeddings.word_lut.weight
            gen_linear = generator[0]
        else:
            tgt_base_field = fields["tgt"].base_field
            vocab_size = len(tgt_base_field.vocab)
            pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
            generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size,
                                      pad_idx)
            if model_opt.share_decoder_embeddings:
                generator.linear.weight = decoder.embeddings.word_lut.weight
            gen_linear = generator.linear

    if model_opt.encdec_share_params:
        for name, p in decoder.named_parameters():
            if 'ctx' in name or 'context' in name:
                continue
            pointer = encoder
            attrs = name.split('.')
            for attr_name in attrs[:-1]:
                pointer = getattr(pointer, attr_name)

            # pointer now has the encoder version of the parameter parent
            setattr(pointer, attrs[-1], p)
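            # assigning the decoder's Parameter onto the encoder module registers it
            # there too, so the two modules share (and jointly update) one tensor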

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # Normally, just load the model parameters from checkpoint
        if 'gpt2_params' not in checkpoint and 'enc_model' not in checkpoint:
            # This preserves backward-compat for models using custom layer norm
            def fix_key(s):
                s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                           r'\1.layer_norm\2.bias', s)
                s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                           r'\1.layer_norm\2.weight', s)
                return s

            checkpoint['model'] = {
                fix_key(k): v
                for k, v in checkpoint['model'].items()
            }
            # end of patch for backward compatibility

            # Initialize rest of parameters normally
            if hasattr(model_opt,
                       'load_uncond_from') and model_opt.load_uncond_from:
                for p in decoder.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)

                # Always initialize encoder parameters normally
                for p in encoder.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)

                if model_opt.ctx_weight_param:
                    for name, p in decoder.named_parameters():
                        if 'ctx_weight' in name:
                            p.data.zero_()
                        if 'ctx_bias' in name:
                            p.data.fill_(-10)

            model.load_state_dict(checkpoint['model'], strict=False)
            generator.load_state_dict(checkpoint['generator'], strict=False)
        else:
            # load the gpt parameters
            if 'gpt2_params' in checkpoint:
                init_something = (model_opt.gpt2_init_embanddec
                                  or model_opt.simple_fusion
                                  or model_opt.gpt2_init_embandenc
                                  or model_opt.GPT_representation_mode != 'none')

                if init_something:
                    # Initialize all the weights first
                    if model_opt.gpt2_init_zero:
                        for p in decoder.parameters():
                            p.data.zero_()
                        if model_opt.simple_fusion:
                            generator.decoder_linear.weight.data.zero_()
                            generator.decoder_linear.bias.data.zero_()
                    else:
                        for p in decoder.parameters():
                            if p.dim() > 1:
                                xavier_uniform_(p)

                    # Always initialize encoder parameters normally
                    if encoder is not None:
                        for p in encoder.parameters():
                            if p.dim() > 1:
                                xavier_uniform_(p)
                    for p in generator.parameters():
                        if p.dim() > 1:
                            xavier_uniform_(p)
                    if model_opt.zero_bias_init:
                        gen_linear.bias.data.zero_()

                    if model_opt.ctx_weight_param:
                        for name, p in decoder.named_parameters():
                            if 'ctx_weight' in name:
                                p.data.zero_()
                            if 'ctx_bias' in name:
                                p.data.fill_(-10)
                        gen_linear.bias.data.zero_()

                load_models = []
                if model_opt.GPT_representation_mode != 'none':
                    load_embs = []
                    if model_opt.GPT_representation_loc in ['both', 'src']:
                        load_models.append(src_emb.gpt_model)
                        load_embs.append(src_emb)
                    if model_opt.GPT_representation_loc in ['both', 'tgt']:
                        load_models.append(tgt_emb.gpt_model)
                        load_embs.append(tgt_emb)

                else:
                    if model_opt.gpt2_init_embanddec or model_opt.simple_fusion:
                        load_models = [load_decoder]
                    elif model_opt.gpt2_init_embandenc:
                        load_models = [encoder]

                it_list = list(checkpoint['gpt2_params'])
                for lm_idx, load_model in enumerate(load_models):
                    #print(lm_idx, load_model)
                    for name, array in it_list:
                        name = name[12:]  # skip "transformer."
                        name = name.split('.')

                        assigned = False
                        if name[0] == 'wpe':
                            if model_opt.GPT_representation_mode != 'none':
                                pointer = load_embs[lm_idx].make_embedding.pe.pe.weight
                            else:
                                pointer = load_model.embeddings.make_embedding.pe.pe.weight

                        elif name[0] == 'wte':
                            if model_opt.GPT_representation_mode != 'none':
                                pointer = [
                                    load_embs[lm_idx].make_embedding.emb_luts[0].weight,
                                    gen_linear.weight
                                ]
                            else:
                                pointer = [
                                    load_model.embeddings.make_embedding.emb_luts[0].weight
                                ]
                                if not model_opt.nopretrain_decemb:
                                    pointer.append(gen_linear.weight)
                                if model_opt.simple_fusion and model_opt.sf_pretrain_dec_emb:
                                    pointer.append(
                                        decoder.embeddings.make_embedding.emb_luts[0].weight)

                        elif name[0] == 'ln_f':
                            if name[1] == 'weight':
                                pointer = load_model.layer_norm.weight
                            elif name[1] == 'bias':
                                pointer = load_model.layer_norm.bias
                            else:
                                raise ValueError(
                                    'I am missing something here!')

                        elif name[0] == 'h':
                            layer_num = name[1]
                            pointer = getattr(load_model.transformer_layers,
                                              layer_num)
                            if name[2] == 'attn':
                                assigned = True
                                pointer = pointer.self_attn
                                full_data = torch.from_numpy(array)
                                if name[3] == 'c_attn':
                                    end_size = full_data.shape[-1] // 3
                                    assert full_data.shape[-1] % 3 == 0
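                                    # GPT-2 stores the query/key/value projections as one
                                    # fused c_attn tensor; split it into thirds for the
                                    # separate linear_query / linear_keys / linear_values.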
                                    if name[4] == 'bias':
                                        if init_something:
                                            pointer.linear_query.bias.data = full_data[:end_size]
                                            pointer.linear_keys.bias.data = full_data[end_size:end_size * 2]
                                            pointer.linear_values.bias.data = full_data[end_size * 2:]
                                        if model_opt.gpt2_params_std > 0:
                                            pointer.linear_query.bias.orig = full_data[:end_size].clone()
                                            pointer.linear_keys.bias.orig = full_data[end_size:end_size * 2].clone()
                                            pointer.linear_values.bias.orig = full_data[end_size * 2:].clone()
                                    elif name[4] == 'weight':
                                        if init_something:
                                            pointer.linear_query.weight.data = \
                                                full_data[:, :end_size].t().contiguous()
                                            pointer.linear_keys.weight.data = \
                                                full_data[:, end_size:end_size * 2].t().contiguous()
                                            pointer.linear_values.weight.data = \
                                                full_data[:, end_size * 2:].t().contiguous()
                                        if model_opt.gpt2_params_std > 0:
                                            pointer.linear_query.weight.orig = \
                                                full_data[:, :end_size].t().contiguous().clone()
                                            pointer.linear_keys.weight.orig = \
                                                full_data[:, end_size:end_size * 2].t().contiguous().clone()
                                            pointer.linear_values.weight.orig = \
                                                full_data[:, end_size * 2:].t().contiguous().clone()
                                    else:
                                        raise ValueError(
                                            'I am missing something here!')
                                elif name[3] == 'c_proj':
                                    if name[4] == 'bias':
                                        if init_something:
                                            pointer.final_linear.bias.data = full_data
                                        if model_opt.gpt2_params_std > 0:
                                            pointer.final_linear.bias.orig = full_data.clone()
                                    elif name[4] == 'weight':
                                        if init_something:
                                            pointer.final_linear.weight.data = full_data.t().contiguous()
                                        if model_opt.gpt2_params_std > 0:
                                            pointer.final_linear.weight.orig = full_data.t().contiguous().clone()
                                    else:
                                        raise ValueError(
                                            'I am missing something here!')

                            elif name[2] == 'ln_1' or name[2] == 'ln_2':
                                num = name[2][3]
                                pointer = getattr(pointer, 'layer_norm_' + num)
                                if name[3] == 'bias':
                                    pointer = pointer.bias
                                elif name[3] == 'weight':
                                    pointer = pointer.weight
                                else:
                                    raise ValueError(
                                        'I am missing something here!')
                            elif name[2] == 'mlp':
                                pointer = pointer.feed_forward
                                pointer = getattr(pointer, name[3])
                                if name[4] == 'bias':
                                    pointer = pointer.bias
                                elif name[4] == 'weight':
                                    pointer = pointer.weight
                                else:
                                    raise ValueError(
                                        'I am missing something here!')
                            else:
                                raise ValueError(
                                    'I am missing something here!')
                        else:
                            raise ValueError('I am missing something here!')

                        if not assigned:
                            # if name[0] == 'wte':
                            #     print(array.shape)
                            #     continue
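                            # GPT-2 checkpoints store Conv1D weights as (in, out);
                            # transpose them to the (out, in) layout of nn.Linear.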
                            if name[-1] == 'weight':
                                array = array.T

                            if not isinstance(pointer, list):
                                pointer = [pointer]
                            for pointer_i in pointer:
                                target_size = int(math.ceil(array.shape[0] / 8)) * 8
                                padded_vocab = (name[0] == 'wte'
                                                and pointer_i.shape[0] == target_size
                                                and pointer_i.shape[1:] == array.shape[1:])
                                try:
                                    assert pointer_i.shape == array.shape or padded_vocab
                                except AssertionError as e:
                                    e.args += (pointer_i.shape, array.shape)
                                    raise
                                if init_something:
                                    print("Initialize PyTorch weight {}".format(name))
                                    if padded_vocab:
                                        pointer_i.data[:array.shape[0]] = torch.from_numpy(array)
                                    else:
                                        pointer_i.data = torch.from_numpy(array)
                                if model_opt.gpt2_params_std > 0:
                                    if padded_vocab:
                                        raise NotImplementedError
                                    else:
                                        pointer_i.orig = torch.from_numpy(array).clone()
            if 'enc_model' in checkpoint:
                load_dict = {
                    k[8:]: v
                    for k, v in checkpoint['enc_model'].items() if 'encoder' in k
                }
                encoder.load_state_dict(load_dict, strict=True)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if not model_opt.unconditional and hasattr(model.encoder, 'embeddings') \
                and model.encoder.embeddings is not None:
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    # remove requires_grad from params that are not trained:
    if model_opt.notrain_emb or model_opt.notrain_embanddec:
        if model_opt.position_encoding_learned_enc and model_opt.share_position_embeddings:
            model.encoder.embeddings.make_embedding.pe.pe.weight.requires_grad = False
        if model_opt.share_embeddings:
            model.encoder.embeddings.make_embedding.emb_luts[
                0].weight.requires_grad = False
        model.decoder.embeddings.make_embedding.pe.pe.weight.requires_grad = False
        model.decoder.embeddings.make_embedding.emb_luts[
            0].weight.requires_grad = False
        generator[0].weight.requires_grad = False

    if model_opt.notrain_genbias:
        generator[0].bias.requires_grad = False

    if model_opt.notrain_embanddec:
        for name, p in load_decoder.layer_norm.named_parameters():
            p.requires_grad = False
        for name, p in load_decoder.transformer_layers.named_parameters():
            if 'context' not in name and 'ctx' not in name:  # Takes care of normal and psa versions
                p.requires_grad = False

    if model_opt.onlytrainln:
        for name, p in model.decoder.named_parameters():
            if 'layer_norm' not in name:
                p.requires_grad = False
        for p in generator.parameters():
            p.requires_grad = False

    if model_opt.onlytrainoutp:
        if model_opt.share_decoder_embeddings:
            raise ValueError

        for p in model.decoder.parameters():
            p.requires_grad = False

    if model_opt.simple_fusion:
        for p in lm_decoder.parameters():
            p.requires_grad = False
        for p in generator.lm_linear.parameters():
            p.requires_grad = False

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16':
        model.half()

    for p in model.parameters():
        if hasattr(p, 'orig'):
            p.orig = p.orig.to(device)
            if model_opt.model_dtype == 'fp16':
                p.orig = p.orig.half()

    return model
Example #4
    def build_base_model(cls,
            src_types: List[str],
            model_opt,
            fields,
            gpu,
            checkpoint=None,
            gpu_id=None
    ):
        """Build a model from opts.

        Args:
            src_types (List[str]): names of the source inputs; each gets its own
                ``src.{src_type}`` field, embeddings, and encoder.
            model_opt: the option loaded from checkpoint. It's important that
                the opts have been updated and validated. See
                :class:`onmt.utils.parse.ArgumentParser`.
            fields (dict[str, torchtext.data.Field]):
                `Field` objects for the model.
            gpu (bool): whether to use gpu.
            checkpoint: the model generated by the training phase, or a snapshot
                        resumed from a stopped training run.
            gpu_id (int or NoneType): Which GPU to use.

        Returns:
            the NMTModel.
        """
        # for back compat when attention_dropout was not defined
        try:
            model_opt.attention_dropout
        except AttributeError:
            model_opt.attention_dropout = model_opt.dropout

        # Build embeddings.
        src_embs: Dict[str, Optional[nn.Module]] = dict()
        # PN: we always have text srcs for now
        for src_type in src_types:
            src_field = fields[f"src.{src_type}"]
            src_embs[src_type] = cls.build_embeddings(model_opt, src_field)
        # end for

        # Build encoders.
        encoders: List[EncoderBase] = list()
        for src_type in src_types:
            encoders.append(cls.build_encoder(model_opt, src_embs[src_type]))
        # end for

        # Build decoder.
        tgt_field = fields["tgt"]
        tgt_emb = cls.build_embeddings(model_opt, tgt_field, for_encoder=False)

        # No share embedding in this model
        assert not model_opt.share_embeddings, "share embeddings not supported"
        # # Share the embedding matrix - preprocess with share_vocab required.
        # if model_opt.share_embeddings:
        #     # src/tgt vocab should be the same if `-share_vocab` is specified.
        #     assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
        #         "preprocess with -share_vocab if you use share_embeddings"
        #
        #     tgt_emb.word_lut.weight = src_emb.word_lut.weight

        decoder = cls.build_decoder(model_opt, tgt_emb)

        # Build MultiSourceNMTModel(= encoders + decoder).
        if gpu and gpu_id is not None:
            device = torch.device("cuda", gpu_id)
        elif gpu and not gpu_id:
            device = torch.device("cuda")
        elif not gpu:
            device = torch.device("cpu")
        # end if
        model = MultiSourceNMTModel(encoders, decoder)
        # Build Generator.
        if not model_opt.copy_attn:
            if model_opt.generator_function == "sparsemax":
                gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
            else:
                gen_func = nn.LogSoftmax(dim=-1)
            generator = nn.Sequential(
                nn.Linear(model_opt.dec_rnn_size,
                    len(fields["tgt"].base_field.vocab)),
                Cast(torch.float32),
                gen_func
            )
            if model_opt.share_decoder_embeddings:
                generator[0].weight = decoder.embeddings.word_lut.weight
        else:
            tgt_base_field = fields["tgt"].base_field
            vocab_size = len(tgt_base_field.vocab)
            pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
            generator = MultiSourceCopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

        # Load the model states from checkpoint or initialize them.
        if checkpoint is not None:
            # This preserves backward-compat for models using custom layer norm
            def fix_key(s):
                s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                    r'\1.layer_norm\2.bias', s)
                s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                    r'\1.layer_norm\2.weight', s)
                return s

            checkpoint['model'] = {fix_key(k): v
                for k, v in checkpoint['model'].items()}
            # end of patch for backward compatibility

            model.load_state_dict(checkpoint['model'], strict=False)
            generator.load_state_dict(checkpoint['generator'], strict=False)
        else:
            if model_opt.param_init != 0.0:
                for p in model.parameters():
                    p.data.uniform_(-model_opt.param_init, model_opt.param_init)
                for p in generator.parameters():
                    p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            if model_opt.param_init_glorot:
                for p in model.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)
                for p in generator.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)

            for encoder in model.encoders:
                if hasattr(encoder, 'embeddings'):
                    encoder.embeddings.load_pretrained_vectors(
                        model_opt.pre_word_vecs_enc)
            if hasattr(model.decoder, 'embeddings'):
                model.decoder.embeddings.load_pretrained_vectors(
                    model_opt.pre_word_vecs_dec)

        model.generator = generator
        model.to(device)

        return model
Example #5
def build_base_model(model_opt,
                     fields,
                     gpu,
                     args,
                     checkpoint=None,
                     gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the training phase, or a snapshot
                    resumed from a stopped training run.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build embeddings.
    if model_opt.model_type == "text" or model_opt.model_type == "vec":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = TransformerEncoder.from_opt(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"

        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = TransformerDecoder.from_opt(model_opt, tgt_emb, args)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32), gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)
        if model_opt.share_decoder_embeddings:
            generator.linear.weight = decoder.embeddings.word_lut.weight

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using custom layer norm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {
            fix_key(k): v
            for k, v in checkpoint['model'].items()
        }
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)

        if args.model_type == 'decoder_ext':
            w = []
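            # Gather each decoder layer's parameters (weights transposed from
            # nn.Linear's (out, in) layout) in the order FTDecoderLayer consumes.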
            for i in range(model_opt.dec_layers):
                layer = decoder.transformer_layers[i]
                w.append([
                    layer.layer_norm_1.weight.data,
                    layer.layer_norm_1.bias.data,
                    layer.self_attn.linear_query.weight.data.transpose(-1, -2).contiguous(),
                    layer.self_attn.linear_keys.weight.data.transpose(-1, -2).contiguous(),
                    layer.self_attn.linear_values.weight.data.transpose(-1, -2).contiguous(),
                    layer.self_attn.linear_query.bias.data,
                    layer.self_attn.linear_keys.bias.data,
                    layer.self_attn.linear_values.bias.data,
                    layer.self_attn.final_linear.weight.data.transpose(-1, -2).contiguous(),
                    layer.self_attn.final_linear.bias.data,
                    layer.layer_norm_2.weight.data,
                    layer.layer_norm_2.bias.data,
                    layer.context_attn.linear_query.weight.data.transpose(-1, -2).contiguous(),
                    layer.context_attn.linear_keys.weight.data.transpose(-1, -2).contiguous(),
                    layer.context_attn.linear_values.weight.data.transpose(-1, -2).contiguous(),
                    layer.context_attn.linear_query.bias.data,
                    layer.context_attn.linear_keys.bias.data,
                    layer.context_attn.linear_values.bias.data,
                    layer.context_attn.final_linear.weight.data.transpose(-1, -2).contiguous(),
                    layer.context_attn.final_linear.bias.data,
                    layer.feed_forward.layer_norm.weight.data,
                    layer.feed_forward.layer_norm.bias.data,
                    layer.feed_forward.w_1.weight.data.transpose(-1, -2).contiguous(),
                    layer.feed_forward.w_1.bias.data,
                    layer.feed_forward.w_2.weight.data.transpose(-1, -2).contiguous(),
                    layer.feed_forward.w_2.bias.data
                ])
                w[-1] = [t.cuda() for t in w[-1]]
                if args.data_type == 'fp16':
                    w[-1] = [t.half() for t in w[-1]]
            decoder_layers = nn.ModuleList([
                FTDecoderLayer(model_opt.heads,
                               model_opt.dec_rnn_size // model_opt.heads, w[i],
                               args) for i in range(model_opt.dec_layers)
            ])
            model.decoder.transformer_layers = decoder_layers
        elif args.model_type == 'decoding_ext':
            vocab_size = len(fields["tgt"].base_field.vocab)
            bos_idx = fields["tgt"].base_field.vocab.stoi[
                fields["tgt"].base_field.init_token]
            eos_idx = fields["tgt"].base_field.vocab.stoi[
                fields["tgt"].base_field.eos_token]
            decoding_weights = DecodingWeights(model_opt.dec_layers,
                                               model_opt.dec_rnn_size,
                                               vocab_size, checkpoint)
            decoding_weights.to_cuda()
            if args.data_type == 'fp16':
                decoding_weights.to_half()
            model.decoder = CustomDecoding(model_opt.dec_layers,
                                           model_opt.heads,
                                           model_opt.dec_rnn_size //
                                           model_opt.heads,
                                           vocab_size,
                                           bos_idx,
                                           eos_idx,
                                           decoding_weights,
                                           args=args)
        elif args.model_type in ('torch_decoding', 'torch_decoding_with_decoder_ext'):
            vocab_size = len(fields["tgt"].base_field.vocab)
            bos_idx = fields["tgt"].base_field.vocab.stoi[
                fields["tgt"].base_field.init_token]
            eos_idx = fields["tgt"].base_field.vocab.stoi[
                fields["tgt"].base_field.eos_token]
            decoding_weights = DecodingWeights(model_opt.dec_layers,
                                               model_opt.dec_rnn_size,
                                               vocab_size, checkpoint)
            decoding_weights.to_cuda()
            if args.data_type == 'fp16':
                decoding_weights.to_half()
            model.decoder = TorchDecoding(model_opt.dec_layers,
                                          model_opt.heads,
                                          model_opt.dec_rnn_size //
                                          model_opt.heads,
                                          vocab_size,
                                          bos_idx,
                                          eos_idx,
                                          decoding_weights,
                                          args=args)

    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam':
        model.half()
    return model
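
The `decoder_ext` branch above packs each decoder layer into a flat list of 26 tensors (the two layer norms, the transposed self- and context-attention projections with their biases, and the feed-forward weights), and that list is what each `FTDecoderLayer` receives. A purely illustrative sanity check for one packed layer; the helper name and the `hidden_size` argument are assumptions, not part of the original example:

def check_packed_layer(tensors, hidden_size):
    # illustrative helper: verify one packed decoder layer before handing
    # it to FTDecoderLayer
    assert len(tensors) == 26, "expected 26 tensors per decoder layer"
    # the transposed attention projections are square [hidden_size, hidden_size]
    assert tensors[2].shape == (hidden_size, hidden_size)
    # every tensor must already sit on the GPU (and in fp16 when requested)
    assert all(t.is_cuda for t in tensors)
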
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build Model
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")

    model = build_task_specific_model(model_opt, fields)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32),
            gen_func
        )
        if model_opt.share_decoder_embeddings:
            generator[0].weight = model.decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)
        if model_opt.share_decoder_embeddings:
            generator.linear.weight = model.decoder.embeddings.word_lut.weight

    # Load the model states from checkpoint or initialize them.
    if checkpoint is None or model_opt.update_vocab:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model, "encoder") and hasattr(model.encoder, "embeddings"):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    if checkpoint is not None:
        # This preserves backward-compat for models using a custom LayerNorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        if model_opt.update_vocab:
            # Update model embeddings with those from the checkpoint after initialization
            use_embeddings_from_checkpoint(fields, model, generator, checkpoint)

            # Remove old vocabulary associated embeddings
            # Embedding layers
            enc_emb_name = "encoder.embeddings.make_embedding.emb_luts.0.weight"
            dec_emb_name = "decoder.embeddings.make_embedding.emb_luts.0.weight"
            del checkpoint["model"][enc_emb_name], checkpoint["model"][dec_emb_name]
            del checkpoint["generator"]["0.weight"], checkpoint["generator"]["0.bias"]

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)        

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam':
        model.half()
    return model
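
For reference, the `fix_key` patch above only renames the legacy layer-norm parameters (`a_2`/`b_2`) to the current `weight`/`bias` names and leaves every other checkpoint key untouched:

import re

def fix_key(s):
    s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2', r'\1.layer_norm\2.bias', s)
    s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2', r'\1.layer_norm\2.weight', s)
    return s

assert fix_key('decoder.transformer_layers.0.layer_norm_1.a_2') \
    == 'decoder.transformer_layers.0.layer_norm_1.weight'
assert fix_key('encoder.layer_norm.b_2') == 'encoder.layer_norm.bias'
assert fix_key('generator.0.weight') == 'generator.0.weight'  # unchanged
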
Example #7
    def __init__(self, generator_function, dec_rnn_size, base_field):
        super(BertGenerator, self).__init__()

        self.generator = nn.Sequential(
            nn.Linear(dec_rnn_size, len(base_field.vocab)),
            Cast(torch.float32), generator_function)
Example #8
    def __init__(self, d_in, d_out, gen_func):
        super().__init__()
        self.proj = nn.Linear(d_in, d_out)
        self.cast = Cast(torch.float32)
        self.gen_func = gen_func
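
The snippet above only shows the constructor. A plausible forward pass for such a generator module, written here as an assumed completion with a hypothetical class name, simply chains projection, cast to fp32 and the generator function:

import torch.nn as nn

class ProjGenerator(nn.Module):
    # hypothetical stand-in mirroring the constructor above
    def __init__(self, d_in, d_out, gen_func):
        super().__init__()
        self.proj = nn.Linear(d_in, d_out)
        self.gen_func = gen_func

    def forward(self, hidden):
        # project decoder states to vocabulary logits, cast to fp32
        # (the example uses onmt's Cast(torch.float32) module for this),
        # then apply the generator function, e.g. nn.LogSoftmax(dim=-1)
        return self.gen_func(self.proj(hidden).float())
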
Example #9
    def __init__(self,
                 rnn_type,
                 bidirectional_encoder,
                 num_layers,
                 hidden_size,
                 attn_type="general",
                 attn_func="softmax",
                 coverage_attn=False,
                 context_gate=None,
                 copy_attn=False,
                 dropout=0.0,
                 embeddings=None,
                 text_field=None,
                 reuse_copy_attn=False,
                 copy_attn_type="general"):
        super(RNNDecoderBase, self).__init__(
            attentional=attn_type != "none" and attn_type is not None)

        self.bidirectional_encoder = bidirectional_encoder
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embeddings = embeddings
        self.text_field = text_field
        self.dropout = nn.Dropout(dropout)

        # Decoder state
        self.state = {}

        # Build the RNN.
        self.rnn = self._build_rnn(rnn_type,
                                   input_size=self._input_size,
                                   hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   dropout=dropout)
        vocab_size = len(self.text_field.base_field.vocab)
        self.generator = nn.Sequential(nn.Linear(self.hidden_size, vocab_size),
                                       Cast(torch.float32), nn.Softmax(dim=-1))
        # Set up the context gate.
        self.context_gate = None
        if context_gate is not None:
            self.context_gate = context_gate_factory(context_gate,
                                                     self._input_size,
                                                     hidden_size, hidden_size,
                                                     hidden_size)

        # Set up the standard attention.
        self._coverage = coverage_attn
        if not self.attentional:
            if self._coverage:
                raise ValueError("Cannot use coverage term with no attention.")
            self.attn = None
        else:
            self.attn = TopicAttention(hidden_size,
                                       coverage=coverage_attn,
                                       attn_type=attn_type,
                                       attn_func=attn_func)

        if copy_attn and not reuse_copy_attn:
            if copy_attn_type == "none" or copy_attn_type is None:
                raise ValueError(
                    "Cannot use copy_attn with copy_attn_type none")
            self.copy_attn = GlobalAttention(hidden_size,
                                             attn_type=copy_attn_type,
                                             attn_func=attn_func)
        else:
            self.copy_attn = None

        self._reuse_copy_attn = reuse_copy_attn and copy_attn
        if self._reuse_copy_attn and not self.attentional:
            raise ValueError("Cannot reuse copy attention with no attention.")
Example #10
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.
    Returns:
        the NMTModel.
    """

    assert model_opt.model_type in ["text", "img", "audio"], \
        "Unsupported model type %s" % model_opt.model_type

    # for backward compatibility
    if model_opt.rnn_size != -1:
        model_opt.enc_rnn_size = model_opt.rnn_size
        model_opt.dec_rnn_size = model_opt.rnn_size

    # Build embeddings.
    if model_opt.model_type == "text":
        src_fields = [f for n, f in fields['src']]
        assert len(src_fields) == 1
        src_field = src_fields[0]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_fields = [f for n, f in fields['tgt']]
    assert len(tgt_fields) == 1
    tgt_field = tgt_fields[0]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"

        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"][0][1].base_field.vocab)),
            Cast(torch.float32), gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        assert len(fields["tgt"]) == 1
        tgt_base_field = fields["tgt"][0][1].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using a custom LayerNorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {
            fix_key(k): v
            for k, v in checkpoint['model'].items()
        }
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16':
        logger.warning('FP16 is experimental, the generated checkpoints may '
                       'be incompatible with a future version')
        model.half()

    return model
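
The two initialization branches above recur in every variant in this listing: `param_init` draws every parameter from a uniform distribution, while `param_init_glorot` additionally re-initializes weight matrices (parameters with more than one dimension) with Xavier uniform and leaves biases untouched. A condensed sketch of the same logic:

import torch.nn as nn
from torch.nn.init import xavier_uniform_

def init_params(module: nn.Module, param_init: float, glorot: bool) -> None:
    # uniform initialization of every parameter, as in the branch above
    if param_init != 0.0:
        for p in module.parameters():
            p.data.uniform_(-param_init, param_init)
    # optionally overwrite weight matrices with Glorot/Xavier uniform
    if glorot:
        for p in module.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
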
Example #11
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # Build embeddings.
    if model_opt.model_type == "text":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings and model_opt.encoder_type != 'bert':
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"

        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32), gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using a custom LayerNorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {
            fix_key(k): v
            for k, v in checkpoint['model'].items()
        }
        # end of patch for backward compatibility
        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    elif model_opt.encoder_type != 'bert' or model_opt.decoder_type != 'bert':
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if (hasattr(model.encoder, 'embeddings')
                and not model_opt.encoder_type == 'bert'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if (hasattr(model.decoder, 'embeddings')
                and not model_opt.decoder_type == 'bert'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    if model_opt.encoder_type == 'bert' or model_opt.decoder_type == 'bert':
        if model_opt.bert_type != 'none':
            model_opt.enc_bert_type = model_opt.bert_type
            model_opt.dec_bert_type = model_opt.bert_type

        if model_opt.enc_bert_type != 'none' and checkpoint is None:
            model.encoder.initialize_bert(model_opt.enc_bert_type)

        if model_opt.dec_bert_type != 'none' and checkpoint is None:
            model.decoder.initialize_bert(model_opt.dec_bert_type)

        # Tie word embedding layer of encoder BERT and decoder
        if model_opt.encoder_type == 'bert' and model_opt.share_embeddings:
            decoder.embeddings.word_lut.weight = \
                encoder.embeddings.word_lut.weight

        # Tie decoder word embedding layer with generator weights
        if model_opt.share_decoder_embeddings:
            if not model_opt.copy_attn:
                generator[0].weight = \
                    decoder.embeddings.word_lut.weight
            else:
                generator.linear.weight = \
                    decoder.embeddings.word_lut.weight

    if model_opt.encoder_type == 'bert' and model_opt.decoder_type == 'bert':
        # Tie word, position and token_type embedding
        # layers of encoder and decoder BERT
        if model_opt.share_embeddings:
            decoder.embeddings.position_embeddings.weight = \
                encoder.embeddings.position_embeddings.weight
            decoder.embeddings.token_type_embeddings.weight = \
                encoder.embeddings.token_type_embeddings.weight

        # Tie self-attention between encoder and decoder
        if model_opt.share_self_attn:
            for encoder_layer, decoder_layer in zip(
                    encoder.encoder.layer, decoder.transformer_layers):
                # QUERY
                clone_or_share_layer(decoder_layer.self_attn.linear_query,
                                     encoder_layer.attention.self.query,
                                     share=True)

                # KEY
                clone_or_share_layer(decoder_layer.self_attn.linear_keys,
                                     encoder_layer.attention.self.key,
                                     share=True)

                # VALUE
                clone_or_share_layer(decoder_layer.self_attn.linear_values,
                                     encoder_layer.attention.self.value,
                                     share=True)

                # MULTIHEAD ATTN FINAL LINEAR LAYER
                clone_or_share_layer(decoder_layer.self_attn.final_linear,
                                     encoder_layer.attention.output.dense,
                                     share=True)

        # Tie context-attention with self-attention
        if model_opt.tie_context_attn:
            for decoder_layer in decoder.transformer_layers:
                # QUERY
                clone_or_share_layer(decoder_layer.context_attn.linear_query,
                                     decoder_layer.self_attn.linear_query,
                                     share=True)

                # KEY
                clone_or_share_layer(decoder_layer.context_attn.linear_keys,
                                     decoder_layer.self_attn.linear_keys,
                                     share=True)

                # VALUE
                clone_or_share_layer(decoder_layer.context_attn.linear_values,
                                     decoder_layer.self_attn.linear_values,
                                     share=True)

                # MULTIHEAD ATTN FINAL LINEAR LAYER
                clone_or_share_layer(decoder_layer.context_attn.final_linear,
                                     decoder_layer.self_attn.final_linear,
                                     share=True)

        # Tie positionwise feedforward between encoder and decoder
        if model_opt.share_feed_forward:
            for encoder_layer, decoder_layer in zip(
                    encoder.encoder.layer, decoder.transformer_layers):

                # TRANSFORMER FF
                clone_or_share_layer(decoder_layer.intermediate.dense,
                                     encoder_layer.intermediate.dense,
                                     share=True)

                clone_or_share_layer(decoder_layer.output.dense,
                                     encoder_layer.output.dense,
                                     share=True)

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16':
        model.half()

    return model
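
`clone_or_share_layer` is not defined in this example. Judging from how it is called, with `share=True` it ties the weight and bias of two `nn.Linear` layers, and with `share=False` it would presumably copy them instead. A guessed implementation consistent with that usage, not taken from the original source:

import torch.nn as nn

def clone_or_share_layer(layer1, layer2, share=False):
    # assumed reconstruction: layer1 ends up with layer2's parameters,
    # either tied (shared tensors) or cloned (copied values)
    if share:
        layer1.weight, layer1.bias = layer2.weight, layer2.bias
    else:
        layer1.weight.data.copy_(layer2.weight.data)
        layer1.bias.data.copy_(layer2.bias.data)
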
Example #12
def build_base_model(model_opt, fields, gpu, length_model, length_penalty_a, length_penalty_b, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build embeddings.
    if model_opt.model_type == "text" or model_opt.model_type == "vec":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"

        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        # MMM: commented the lines below
        # generator = nn.Sequential(
        #     nn.Linear(model_opt.dec_rnn_size,
        #               len(fields["tgt"].base_field.vocab)),
        #     Cast(torch.float32),
        #     gen_func
        # )

        # MMM
        class tune_out_prob(nn.Module):

            def __init__(self):
                super(tune_out_prob, self).__init__()
                self.t_lens = None
                self.eos_ind = None
                self.batch_max_len = None
                self.word_index = None
                self.tgt_vocab_size = None
                self.validation = False

            def length_model_loss(self, scale, value, a, b):
                # return -(value / scale) ** 2 - scale.log()
                # return -((value / scale) **2)/2 - (2.5066*scale).log()
                return -a * (value / scale) ** 2 + b  # *abs(scale)
                # return -((value / scale) ** 2)*scale + scale
                # return -(value / scale)*4 + scale

            def forward(self, x):
                y = x.clone()
                # mask = np.ones(x.size())
                # for i in range(self.t_lens.size(-1)):
                #     y[i*self.batch_size + self.t_lens[i], self.eos_ind] = \
                #         y[i * self.batch_size + self.t_lens[i], self.eos_ind].clone() + math.log(0.9)
                if self.training or self.validation:  # training phase
                    y = y.view(self.batch_max_len, -1, self.tgt_vocab_size)
                    # eos_list = [(i * self.batch_max_len + self.t_lens.data.cpu().numpy()[i]) for i in
                    #             range(self.t_lens.size(-1))]
                    # other_list = list(set(list(range(x.size(0)))) - set(eos_list))
                    # y[other_list, self.eos_ind] = -100
                    # y[eos_list, self.eos_ind] = 0
                    for wi in range(self.batch_max_len):
                        delta_p = (self.t_lens - wi - 1).float()
                        delta_p[delta_p < 0] = 0.05 * delta_p[delta_p < 0]
                        scale = (self.t_lens.float()).sqrt() / 2.0
                        penalties = self.length_model_loss(scale, delta_p, length_penalty_a, length_penalty_b)
                        # penalties[penalties > 0] = 0
                        y[wi, :, self.eos_ind] += penalties
                    y = y.view(-1, self.tgt_vocab_size)
                    # mask[eos_list, self.eos_ind] = +2
                    # mask[other_list, self.eos_ind] = -2
                else:  # translation phase
                    if len(x.size()) == 3:  # x of shape [ tgt_len, batch_size, vocab ] is a full sentence
                        # for i in range(len(self.t_lens)):
                        #     other_list = list(set(list(range(x.size(0)))) - set(list([self.t_lens.data.cpu().numpy()[i]])))
                        #     #mask[other_list, i, self.eos_ind] = -2
                        #     y[other_list, i, self.eos_ind] = -100
                        #     if self.t_lens[i] < x.size(0):
                        #         #mask[self.t_lens[i], i, self.eos_ind] = +2
                        #         y[self.t_lens[i], i, self.eos_ind] = 0
                        pass
                    else:  # x of shape [(batch_size x beam_size) , vocab ] is only for one step
                        beam_size = x.size(0) // self.t_lens.numel()
                        wi = self.word_index
                        delta_p = (self.t_lens - wi - 2).float()
                        delta_p[delta_p < 0] = 0.005 * delta_p[delta_p < 0]
                        delta_p = delta_p.unsqueeze(1).expand(self.t_lens.numel(), beam_size).flatten()
                        scale = (self.t_lens.float()).sqrt() / 2.0
                        scale = scale.unsqueeze(1).expand(self.t_lens.numel(), beam_size).flatten()
                        penalties = self.length_model_loss(scale, delta_p, length_penalty_a, length_penalty_b)
                        # penalties[penalties > 0] = 0
                        y[:, self.eos_ind] += penalties
                        # y[eos_list ^ 1, self.eos_ind] = -100
                return y
                # mask = torch.tensor(mask, dtype=x.dtype).to(device)
                # x= x+mask
                # return x

                # y = x.clone()
                # # 1. since y is the output of log_softmax, apply exponential
                # # to convert it to probabilistic form
                # y = torch.exp(y)
                # # 2. tune probabilities
                # eos_list = [(i * self.batch_max_len + self.t_lens.data.cpu().numpy()[i]) for i in
                #             range(self.t_lens.size(-1))]
                # other_list = list(set(list(range(y.size(0)))) - set(eos_list))
                #
                # z = y.clone()
                # # 2.1. tune probabilities for eos positions
                # z[eos_list, self.eos_ind] = 1
                # z[eos_list, 0:self.eos_ind] = 0
                # z[eos_list, self.eos_ind+1:-1] = 0
                #
                # # 2.2. tune probabilities for non-eos positions
                # p_val = z[other_list, self.eos_ind] / (self.tgt_vocab_size - 1)
                # z[other_list, self.eos_ind] = 0
                # non_eos_inds = list(set(list(range(self.tgt_vocab_size))) - set([self.eos_ind]))
                # for i in range(len(other_list)):
                #     z[other_list[i], non_eos_inds] = y[other_list[i], non_eos_inds] + p_val[i]
                #
                # # 3. convert y back to log-probability form
                # z = torch.log(z)
                # return z

        # MMM
        if length_model in ('oracle', 'fixed_ratio', 'lstm'):
            generator = nn.Sequential(
                nn.Linear(model_opt.dec_rnn_size,
                          len(fields["tgt"].base_field.vocab)),
                Cast(torch.float32),
                gen_func,
                tune_out_prob()
            )
        else:
            generator = nn.Sequential(
                nn.Linear(model_opt.dec_rnn_size,
                          len(fields["tgt"].base_field.vocab)),
                Cast(torch.float32),
                gen_func
            )
        # /MMM
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using a custom LayerNorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam':
        model.half()
    return model
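
Numerically, the `tune_out_prob` layer above adds a quadratic bonus/penalty to the EOS log-probability that is largest when the current step sits exactly at `t_lens - 1` and falls off on either side, more gently once the target length is exceeded because negative distances are damped by 0.05 (0.005 in the one-step translation branch). A toy evaluation of `length_model_loss` with assumed penalty constants a=1, b=0 (in the example these come from `length_penalty_a` / `length_penalty_b`):

import torch

def length_model_loss(scale, value, a, b):
    # same functional form as in the example above
    return -a * (value / scale) ** 2 + b

t_lens = torch.tensor([10.0])            # target length of one sentence
scale = t_lens.sqrt() / 2.0              # ~1.58
for wi in (4, 8, 9, 12):                 # current decoding step
    delta = (t_lens - wi - 1).float()
    delta[delta < 0] = 0.05 * delta[delta < 0]
    print(wi, length_model_loss(scale, delta, a=1.0, b=0.0).item())
# the bonus peaks (value b) at wi == 9, i.e. right before the target length
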
Example #13
    def __init__(self, self_ref_mask_dict, proj, gen_func):
        super(SelfRefMaskGenerator, self).__init__()
        self.sr_dict = self_ref_mask_dict
        self.proj = proj
        self.cast = Cast(torch.float32)
        self.gen_func = gen_func
Example #14
def build_base_model(model_opt,
                     opt,
                     fields,
                     gpu,
                     checkpoint=None,
                     gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # Build embeddings.
    if model_opt.model_type == "text":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    redr_encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"

        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(redr_encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32), gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using a custom LayerNorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {
            fix_key(k): v
            for k, v in checkpoint['model'].items()
        }
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder.reference_encoder, 'embeddings'):
            model.encoder.reference_encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.encoder.history_encoder, 'embeddings'):
            model.encoder.history_encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16':
        model.half()

    return model
Example #15
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # Build embeddings.
    if model_opt.model_type == "text" or model_opt.model_type == "vec":
        src_field = fields["sent1"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    # tgt_field = fields["tgt"]
    # tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    #if model_opt.share_embeddings:
    #    # src/tgt vocab should be the same if `-share_vocab` is specified.
    #    assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
    #        "preprocess with -share_vocab if you use share_embeddings"

    #    tgt_emb.word_lut.weight = src_emb.word_lut.weight

    #decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    # model = onmt.models.CLSModel(encoder)

    # Build Generator.
    gen_func = nn.LogSoftmax(dim=-1)
    classifier = nn.Sequential(
        nn.Linear(model_opt.enc_rnn_size * 4, model_opt.enc_rnn_size),
        nn.ReLU(),
        nn.Linear(model_opt.enc_rnn_size, model_opt.n_label),
        Cast(torch.float32),
        # gen_func
    )

    model = onmt.models.CLSModel(encoder, classifier)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using a custom LayerNorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {
            fix_key(k): v
            for k, v in checkpoint['model'].items()
        }
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        classifier.load_state_dict(checkpoint['classifier'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in classifier.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in classifier.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        # if hasattr(model.decoder, 'embeddings'):
        #     model.decoder.embeddings.load_pretrained_vectors(
        #         model_opt.pre_word_vecs_dec)

    model.classifier = classifier
    model.to(device)

    return model
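
The classifier above expects a feature vector of size `4 * enc_rnn_size`. The example does not show how `CLSModel` builds it, but a common choice for sentence-pair classification, and one consistent with that factor of four, is concatenating the two pooled encodings with their absolute difference and element-wise product. This is an assumption, not taken from the example:

import torch

def pair_features(u, v):
    # u, v: [batch, enc_rnn_size] pooled encodings of sent1 / sent2
    # returns [batch, 4 * enc_rnn_size], matching the classifier input above
    return torch.cat([u, v, (u - v).abs(), u * v], dim=-1)
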
Example #16
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    #  model = onmt.models.NMTModel(encoder, decoder, model_opt.pos_enc, model_opt.pos_dec)
    gen_func = nn.LogSoftmax(dim=-1)

    # model = nn.Sequential(
    #     nn.Linear(model_opt.enc_rnn_size,
    #               model_opt.enc_rnn_size),
    #     # nn.BatchNorm1d(model_opt.enc_rnn_size),
    #     nn.ReLU(),
    #     nn.Dropout()
    # )
    input_size = model_opt.dec_rnn_size if model_opt.rl_step else model_opt.enc_rnn_size
    output_size = 54 if model_opt.sample_method == "topk" else 20

    generators = {}
    for i, kv in enumerate(model_opt.generators.split(",")):
        k, _ = kv.split(":")
        # output_size = 54 if k == "0" else output_size
        generators[k] = nn.Sequential(
            nn.Linear(input_size, input_size),
            Cast(torch.float32),
            # nn.BatchNorm1d(model_opt.enc_rnn_size),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(input_size, output_size),
            Cast(torch.float32),
            gen_func)

    class TMEPModel(nn.Module):
        def __init__(self, gens):
            super(TMEPModel, self).__init__()
            self.generators = gens

        def forward(self, inputs, fix_k=None):
            outputs = {}
            for name, gen in self.generators.items():
                if name == fix_k:
                    with torch.no_grad():
                        outputs[name] = gen(inputs)
                else:
                    outputs[name] = gen(inputs)
            return outputs

    model = TMEPModel(generators)
    for k, v in model.generators.items():
        setattr(model, k, v)
        # v.to(device)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using a custom LayerNorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {
            fix_key(k): v
            for k, v in checkpoint['model'].items()
        }
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

    model.to(device)
    if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam':
        model.half()
    return model
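
One detail of `TMEPModel` worth illustrating is the `fix_k` argument: the named head runs under `torch.no_grad()`, so its output carries no gradient while the other heads stay trainable. A small usage sketch with made-up head names and sizes (the real names come from `model_opt.generators`, and the class is defined inline in the example above):

import torch
import torch.nn as nn

heads = {name: nn.Sequential(nn.Linear(16, 16), nn.ReLU(),
                             nn.Linear(16, 4), nn.LogSoftmax(dim=-1))
         for name in ("head_a", "head_b")}
model = TMEPModel(heads)
out = model(torch.randn(8, 16), fix_k="head_a")
print({k: v.requires_grad for k, v in out.items()})
# -> {'head_a': False, 'head_b': True}
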
Example #17
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build embeddings.
    if model_opt.model_type == "text" or model_opt.model_type == "vec":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"

        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)

    if "continuous" in model_opt.generator_function:
        #make target embeddings
        tgt_out_vectors = tgt_field.base_field.vocab.vectors
        if model_opt.center:
            center_emb = tgt_out_vectors.sum(
                dim=0, keepdim=True) / (tgt_out_vectors.size(0))
            tgt_out_vectors = tgt_out_vectors - center_emb
        tgt_out_vectors_unitnorm = nn.functional.normalize(tgt_out_vectors,
                                                           p=2,
                                                           dim=1)

        tgt_out_emb = nn.Embedding(tgt_out_vectors.size(0),
                                   tgt_out_vectors.size(1))
        tgt_out_emb.weight.data.copy_(tgt_out_vectors_unitnorm)
        tgt_out_emb.weight.requires_grad = False  # do not train the embeddings

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == 'continuous-linear':
            generator_modules = [
                nn.Linear(model_opt.dec_rnn_size, tgt_out_vectors.size(1))
            ]
            if model_opt.generator_layer_norm:
                generator_modules.append(
                    nn.LayerNorm(tgt_out_vectors.size(1), eps=1e-6))
            generator = nn.Sequential(*generator_modules)
        elif model_opt.generator_function == 'continuous-nonlinear':  #add a non-linear layer before generating the continuous vector
            generator_modules = [
                nn.Linear(model_opt.dec_rnn_size, tgt_out_vectors.size(1)),
                nn.ReLU(),
                nn.Linear(tgt_out_vectors.size(1), tgt_out_vectors.size(1))
            ]
            if model_opt.generator_layer_norm:
                generator_modules.append(
                    nn.LayerNorm(tgt_out_vectors.size(1), eps=1e-6))
            generator = nn.Sequential(*generator_modules)
        else:
            if model_opt.generator_function == "sparsemax":
                gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
            else:
                gen_func = nn.LogSoftmax(dim=-1)
            generator = nn.Sequential(
                nn.Linear(model_opt.dec_rnn_size,
                          len(fields["tgt"].base_field.vocab)),
                Cast(torch.float32), gen_func)
            if model_opt.share_decoder_embeddings:
                generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using a custom LayerNorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {
            fix_key(k): v
            for k, v in checkpoint['model'].items()
        }
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    model.generator = generator
    if "continuous" in model_opt.generator_function:
        model.decoder.tgt_out_emb = tgt_out_emb
        if model_opt.share_decoder_embeddings:
            model.decoder.embeddings.tie_embeddings(tgt_out_emb.weight)

    model.to(device)
    if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam':
        model.half()
    return model
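
With the continuous generator the model emits an embedding-sized vector instead of a distribution over the vocabulary, so decoding a token typically means finding the nearest row of the frozen, unit-normalized `tgt_out_emb` table, e.g. by cosine similarity. A sketch under that assumption; the helper below is not part of the example:

import torch
import torch.nn.functional as F

def nearest_tokens(pred_vecs, tgt_out_emb):
    # pred_vecs: [batch, emb_dim] continuous generator outputs
    # tgt_out_emb.weight: [vocab, emb_dim], frozen and L2-normalized above
    pred = F.normalize(pred_vecs, p=2, dim=-1)
    scores = pred @ tgt_out_emb.weight.t()    # cosine similarities
    return scores.argmax(dim=-1)              # predicted token ids
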