Example no. 1
    def __init__(self,
                 config,
                 sequence_length,
                 use_pretrained=True,
                 pretrained_model=None):
        """Constructor"""
        super().__init__(config, sequence_length)
        # suspend logging due to hellish verbosity
        lvl = logging.getLogger().level
        logging.getLogger().setLevel(logging.WARN)
        config_args = {"pretrained_model_name_or_path": self.pretrained_id}

        if pretrained_model is None:
            if use_pretrained:
                model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                    self.pretrained_id, self.pretrained_id)
            else:
                enc, dec = BertConfig(), BertConfig()
                dec.is_decoder = True
                dec.add_cross_attention = True
                enc_dec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                    enc, dec)
                model = EncoderDecoderModel(config=enc_dec_config)

            logging.getLogger().setLevel(lvl)
            self.model = model
        else:
            self.model = pretrained_model
        logging.getLogger().setLevel(self.config.print.log_level.upper())
    def test_real_bert_model_save_load_from_pretrained(self):
        model_2 = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "bert-base-uncased", "bert-base-uncased")
        model_2.to(torch_device)
        input_ids = ids_tensor([13, 5], model_2.config.encoder.vocab_size)
        decoder_input_ids = ids_tensor([13, 1],
                                       model_2.config.encoder.vocab_size)
        attention_mask = ids_tensor([13, 5], vocab_size=2)
        with torch.no_grad():
            outputs = model_2(
                input_ids=input_ids,
                decoder_input_ids=decoder_input_ids,
                attention_mask=attention_mask,
            )
            out_2 = outputs[0].cpu().numpy()
            out_2[np.isnan(out_2)] = 0

            with tempfile.TemporaryDirectory() as tmp_dirname:
                model_2.save_pretrained(tmp_dirname)
                model_1 = EncoderDecoderModel.from_pretrained(tmp_dirname)
                model_1.to(torch_device)

                after_outputs = model_1(
                    input_ids=input_ids,
                    decoder_input_ids=decoder_input_ids,
                    attention_mask=attention_mask,
                )
                out_1 = after_outputs[0].cpu().numpy()
                out_1[np.isnan(out_1)] = 0
                max_diff = np.amax(np.abs(out_1 - out_2))
                self.assertLessEqual(max_diff, 1e-5)
Example no. 3
    def check_equivalence_tf_to_pt(self, config, decoder_config, inputs_dict):

        encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)

        # Using `_tf_model`, the test will fail, because the weights of `_tf_model` get extended before saving
        # the encoder/decoder models.
        # There was a (very) ugly potential fix, which wasn't integrated into `transformers`: see
        #   https://github.com/huggingface/transformers/pull/13222/commits/dbb3c9de76eee235791d2064094654637c99f36d#r697304245
        #   (the change in `src/transformers/modeling_tf_utils.py`)
        _tf_model = TFEncoderDecoderModel(encoder_decoder_config)
        # Make sure model is built
        _tf_model(**inputs_dict)

        # Using `tf_model` to pass the test.
        encoder = _tf_model.encoder.__class__(encoder_decoder_config.encoder)
        decoder = _tf_model.decoder.__class__(encoder_decoder_config.decoder)
        # Make sure models are built
        encoder(encoder.dummy_inputs)
        decoder(decoder.dummy_inputs)
        tf_model = TFEncoderDecoderModel(encoder=encoder, decoder=decoder)

        with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname:

            tf_model.encoder.save_pretrained(encoder_tmp_dirname)
            tf_model.decoder.save_pretrained(decoder_tmp_dirname)
            pt_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder_tmp_dirname, decoder_tmp_dirname, encoder_from_tf=True, decoder_from_tf=True
            )
            # This is only for copying some specific attributes of this particular model.
            pt_model.config = tf_model.config

        self.check_pt_tf_equivalence(pt_model, tf_model, inputs_dict)
 def __init__(
     self,
     model_name_or_path,
     tokenizer_name,
     model_cache_dir,
     input_max_length,
     target_max_length,
     summary_column_name,
     document_column_name,
     wandb_project,
     wandb_run_name,
     **kwargs,
 ):
     super().__init__(
         input_max_length,
         target_max_length,
         summary_column_name,
         document_column_name,
         wandb_project,
         wandb_run_name,
     )
     self.tokenizer = BertTokenizer.from_pretrained(
         tokenizer_name if tokenizer_name else model_name_or_path,
         cache_dir=model_cache_dir,
     )
     self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
         model_name_or_path, model_name_or_path, cache_dir=model_cache_dir,
     )
Example no. 5
    def __init__(
        self,
        is_eval=False,
    ):
        super().__init__()

        self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            'bert-base-uncased',
            'bert-base-uncased',
        )

        if is_eval:
            self.model = self.model.eval()

        self.optimizer = torch.optim.Adam(self.parameters(), lr=config.lr)

        if config.use_sgd:
            self.optimizer = torch.optim.SGD(self.parameters(), lr=config.lr)

        if config.USE_CUDA:
            self.model = self.model.cuda()

        self.model_dir = config.save_path
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        self.best_path = ""
    def check_encoder_decoder_model_from_pretrained(
            self, config, input_ids, attention_mask, encoder_hidden_states,
            decoder_config, decoder_input_ids, decoder_attention_mask,
            **kwargs):
        encoder_model, decoder_model = self.get_encoder_decoder_model(
            config, decoder_config)
        kwargs = {
            "encoder_model": encoder_model,
            "decoder_model": decoder_model
        }
        enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            **kwargs)
        enc_dec_model.to(torch_device)
        outputs_encoder_decoder = enc_dec_model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
        )

        self.assertEqual(outputs_encoder_decoder[0].shape,
                         (decoder_input_ids.shape +
                          (decoder_config.vocab_size, )))
        self.assertEqual(outputs_encoder_decoder[1].shape,
                         (input_ids.shape + (config.hidden_size, )))
Example no. 7
    def check_save_and_load_encoder_decoder_model(
        self,
        config,
        input_ids,
        attention_mask,
        encoder_hidden_states,
        decoder_config,
        decoder_input_ids,
        decoder_attention_mask,
        **kwargs,
    ):
        encoder_model, decoder_model = self.get_encoder_decoder_model(
            config, decoder_config)
        enc_dec_model = EncoderDecoderModel(encoder=encoder_model,
                                            decoder=decoder_model)
        enc_dec_model.to(torch_device)
        enc_dec_model.eval()
        with torch.no_grad():
            outputs = enc_dec_model(
                input_ids=input_ids,
                decoder_input_ids=decoder_input_ids,
                attention_mask=attention_mask,
                decoder_attention_mask=decoder_attention_mask,
            )
            out_2 = outputs[0].cpu().numpy()
            out_2[np.isnan(out_2)] = 0

            with tempfile.TemporaryDirectory(
            ) as encoder_tmp_dirname, tempfile.TemporaryDirectory(
            ) as decoder_tmp_dirname:
                enc_dec_model.encoder.save_pretrained(encoder_tmp_dirname)
                enc_dec_model.decoder.save_pretrained(decoder_tmp_dirname)
                enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                    encoder_pretrained_model_name_or_path=encoder_tmp_dirname,
                    decoder_pretrained_model_name_or_path=decoder_tmp_dirname,
                )
                enc_dec_model.to(torch_device)

                after_outputs = enc_dec_model(
                    input_ids=input_ids,
                    decoder_input_ids=decoder_input_ids,
                    attention_mask=attention_mask,
                    decoder_attention_mask=decoder_attention_mask,
                )
                out_1 = after_outputs[0].cpu().numpy()
                out_1[np.isnan(out_1)] = 0
                max_diff = np.amax(np.abs(out_1 - out_2))
                self.assertLessEqual(max_diff, 1e-5)
Example no. 8
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):

        # must assign tokenizers before init
        if cfg.language_model.pretrained_model_name:
            if cfg.language_model.pretrained_encoder_model_name or cfg.language_model.pretrained_decoder_model_name:
                raise ValueError(
                    "Must have either pretrained_model_name or both pretrained_encoder_model name and "
                    "pretrained_decoder_model_name."
                )
            # setup tokenizer
            self.encoder_tokenizer = self.setup_tokenizer(cfg.encoder_tokenizer)
            self.encoder_add_special_tokens = cfg.encoder_tokenizer.add_special_tokens

            # set decoder to encoder
            self.decoder_tokenizer = self.encoder_tokenizer
            self.decoder_add_special_tokens = self.encoder_add_special_tokens
        else:
            if not (
                cfg.language_model.pretrained_encoder_model_name and cfg.language_model.pretrained_decoder_model_name
            ):
                raise ValueError("Both encoder and decoder must be specified")

            # setup tokenizers
            self.encoder_tokenizer = self.setup_tokenizer(cfg.encoder_tokenizer)
            self.encoder_add_special_tokens = cfg.encoder_tokenizer.add_special_tokens

            self.decoder_tokenizer = self.setup_tokenizer(cfg.decoder_tokenizer)
            self.decoder_add_special_tokens = cfg.decoder_tokenizer.add_special_tokens

        if not self.encoder_tokenizer:
            raise TypeError("encoder_tokenizer failed to initialize")
        if not self.decoder_tokenizer:
            raise TypeError("decoder_tokenizer failed to initialize")

        # init superclass
        super().__init__(cfg=cfg, trainer=trainer)

        # must assign modules after init
        if cfg.language_model.pretrained_model_name:
            # Setup end-to-end model
            if "bart" in cfg.language_model.pretrained_model_name:
                self.model = BartForConditionalGeneration.from_pretrained(cfg.language_model.pretrained_model_name)
            else:
                self.model = AutoModel.from_pretrained(cfg.language_model.pretrained_model_name)
        else:
            if not (
                cfg.language_model.pretrained_encoder_model_name and cfg.language_model.pretrained_decoder_model_name
            ):
                raise ValueError("Both encoder and decoder must be specified")

            # Setup encoder/decoder model
            self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder=cfg.language_model.pretrained_encoder_model_name,
                decoder=cfg.language_model.pretrained_decoder_model_name,
            )

        self.validation_perplexity = Perplexity(compute_on_step=False)

        self.setup_optimization(cfg.optim)
Example no. 9
 def __init__(self):
     super().__init__()
     from transformers import EncoderDecoderModel
     from transformers import BertTokenizer
     self.seq2seq_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
         'bert-base-uncased', 'bert-base-uncased'
     )  # initialize Bert2Bert from pre-trained checkpoints
     self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def prep(encoder_model, decoder_model, seq_length):
    tokenizer = AutoTokenizer.from_pretrained(encoder_model,
                                              model_max_length=seq_length)
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        encoder_model,
        decoder_model,
        max_length=40,
    )
    return tokenizer, model
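
A minimal usage sketch for the prep helper above; the checkpoint names and sequence length are illustrative placeholders, not values taken from the original code.

# Hypothetical usage of prep(): build a bert2bert EncoderDecoderModel and a tokenizer
# whose model_max_length is set to the given sequence length.
tokenizer, model = prep("bert-base-uncased", "bert-base-uncased", seq_length=128)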
Example no. 11
    def build(self):

        # to be further set
        # breakpoint()
        self.image_feature_module = build_image_encoder(
            self.config.image_feature_processor, direct_features=True
        )
        if self.config.concate_trace:
            self.trace_feature_module = build_encoder(self.config.trace_feature_encoder)

        if self.config.base_model_name == "bert-base-uncased":
            self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
                "bert-base-uncased", "bert-base-uncased"
            )
        elif self.config.base_model_name == "2layer-base":
            config_encoder = BertConfig()
            config_decoder = BertConfig()
            config_encoder.max_position_embeddings = 1090
            config_encoder.num_hidden_layers = 2
            config_decoder.num_hidden_layers = 2
            self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                config_encoder, config_decoder
            )
            self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
        elif self.config.base_model_name == "3layer-base":
            config_encoder = BertConfig()
            config_decoder = BertConfig()
            config_encoder.num_hidden_layers = 3
            config_decoder.num_hidden_layers = 3
            self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                config_encoder, config_decoder
            )
            self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
        if self.config.loop_contrastive:
            self.trace_caption_contrastive = TraceCaptionContrastiveModel(
                self.config.tc_contrastive_aggregate_method
            )
        if (
            hasattr(self.config, "pretrans_attention")
            and self.config.pretrans_attention
        ):

            # import ipdb; ipdb.set_trace()
            tempconf = self.encoderdecoder.config.encoder
            num_heads = tempconf.num_attention_heads
            num_layers = tempconf.num_hidden_layers
            self.attention_trans = AttentionTransform(num_layers, num_heads, 100)
        self.BOS_ID = 101
        self.vae = OpenAIDiscreteVAE()
        image_code_dim = 768
        image_fmap_size = self.vae.image_size // (2 ** self.vae.num_layers)
        self.image_seq_len = image_fmap_size ** 2
        self.image_emb = torch.nn.Embedding(self.vae.num_tokens, image_code_dim)
        self.image_pos_emb = AxialPositionalEmbedding(
            image_code_dim, axial_shape=(image_fmap_size, image_fmap_size)
        )
Example no. 12
 def from_encoder_decoder_pretrained(
         cls,
         encoder_pretrained_model_name_or_path: str = None,
         decoder_pretrained_model_name_or_path: str = None,
         *model_args,
         **kwargs):
     instance = EncoderDecoderModel.from_encoder_decoder_pretrained(
         encoder_pretrained_model_name_or_path,
         decoder_pretrained_model_name_or_path, *model_args, **kwargs)
     return BottleneckEncoderDecoderModel(instance.config, instance.encoder,
                                          instance.decoder)
    def __init__(self,
                 args=args,
                 max_seq_len=64,
                 max_seq_len_title=32,
                 max_img_seq_len=args.num_features,
                 tr_name=args.tr):
        """
        max_seq_len: Or Repo - VQA: 128
        max_img_seq_len: Or Repo - NLVR2: 40 // GQA: 45 // VQA: 50 --- Set to args.num_features, as we don't have padding implemented
        tr_name: transformer model
        """
        super().__init__()
        self.max_seq_len = max_seq_len
        self.max_seq_len_title = max_seq_len_title
        self.tr_name = tr_name
        self.max_img_seq_len = max_img_seq_len

        ### BUILD TOKENIZER ###
        self.tokenizer = AutoTokenizer.from_pretrained(tr_name)

        ### BUILD MODEL ###
        if tr_name.startswith("bert"):
            self.model, loading_info = BertO.from_pretrained(
                tr_name,
                output_loading_info=True,
                img_feature_dim=2048 + args.num_pos)

        print("UNEXPECTED: ", loading_info["unexpected_keys"])
        print("MISSING: ", loading_info["missing_keys"])
        print("ERRORS: ", loading_info["error_msgs"])

        ### CLASSIFICATION HEADS ###
        # The LXRT default classifier tends to perform best; for ALBERT, gelu_new outperforms gelu.
        # Make sure to keep only the components that are actually used below, as unused ones seem to affect random initialization!

        self.encoder_decoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
            'bert-base-uncased', 'gpt2')

        self.decoder = self.encoder_decoder.decoder
        self.decoder = self.decoder.cuda(0)

        self.decoder.config.max_length = 128
        self.decoder.config.min_length = 8
        self.decoder.config.no_repeat_ngram_size = 3
        self.decoder.config.early_stopping = True
        self.decoder.config.length_penalty = 2.0
        self.decoder.config.num_beams = 4

        if args.from_scratch:
            print("initializing all the weights")
            self.model.apply(self.model.init_weights)
Example no. 14
def define_G(model, source='en', dest='de', gpu_ids=[], use_init_net=True, freeze_encoder=False):
    """Create a generator.

    Parameters:
        model (str)           -- the type of network: 't5' | 'marianMT' | anything else builds a BERT
                                 encoder-decoder ('bert-base-cased' encoder, 'bert-base-german-cased' decoder)
        source (str)          -- source language code, e.g. 'en'
        dest (str)            -- target language code, e.g. 'de'
        gpu_ids (int list)    -- which GPUs the network runs on: e.g., [0, 1, 2]
        use_init_net (bool)   -- if True, wrap the returned network with <init_net>
        freeze_encoder (bool) -- if True, freeze the encoder weights

    Returns a generator (a translation model wrapping a pretrained encoder-decoder).
    """
    net = None


    if model == 't5':
        src_lang = define_language(source)
        tgt_lang = define_language(dest)
        model_name = 't5-small'
        net = EncDecT5Model(model_name, freeze_encoder=freeze_encoder, source_language=src_lang, target_language=tgt_lang)
    elif model == 'marianMT':
        model_name = 'Helsinki-NLP/opus-mt-'+source+'-'+dest
        net = EncDecModel(model_name, freeze_encoder=freeze_encoder)
    else:
        net = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'bert-base-german-cased')    #net=SentenceTransformer(netG)

    if use_init_net:
        return init_net(net, gpu_ids)
    else:
        return net
Example no. 15
    def __init__(self):
        super().__init__()
        # Model - load pretrained BERT-based encoder-decoder.
        self.bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "bert-base-uncased", "bert-base-uncased")

        # Set special tokens.
        self.bert2bert.config.decoder_start_token_id = tokenizer.bos_token_id
        self.bert2bert.config.eos_token_id = tokenizer.eos_token_id
        self.bert2bert.config.pad_token_id = tokenizer.pad_token_id

        # Sensible parameters for beam search.
        self.bert2bert.config.vocab_size = self.bert2bert.config.decoder.vocab_size
        self.bert2bert.config.max_length = 142
        self.bert2bert.config.min_length = 56
        self.bert2bert.config.no_repeat_ngram_size = 3
        self.bert2bert.config.early_stopping = True
        self.bert2bert.config.length_penalty = 2.0
        self.bert2bert.config.num_beams = 4
Example no. 16
    def build(self):

        # to be further set
        # breakpoint()
        self.image_feature_module = build_image_encoder(
            self.config.image_feature_processor, direct_features=True)
        if self.config.concate_trace:
            self.trace_feature_module = build_encoder(
                self.config.trace_feature_encoder)

        if self.config.base_model_name == "bert-base-uncased":
            self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
                "bert-base-uncased", "bert-base-uncased")
        elif self.config.base_model_name == "2layer-base":
            config_encoder = BertConfig()
            config_decoder = BertConfig()
            config_encoder.num_hidden_layers = 2
            config_decoder.num_hidden_layers = 2
            self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                config_encoder, config_decoder)
            self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
        elif self.config.base_model_name == "3layer-base":
            config_encoder = BertConfig()
            config_decoder = BertConfig()
            config_encoder.num_hidden_layers = 3
            config_decoder.num_hidden_layers = 3
            self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                config_encoder, config_decoder)
            self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
        if self.config.loop_contrastive:
            self.trace_caption_contrastive = TraceCaptionContrastiveModel(
                self.config.tc_contrastive_aggregate_method)
        if (hasattr(self.config, "pretrans_attention")
                and self.config.pretrans_attention):

            # import ipdb; ipdb.set_trace()
            tempconf = self.encoderdecoder.config.encoder
            num_heads = tempconf.num_attention_heads
            num_layers = tempconf.num_hidden_layers
            self.attention_trans = AttentionTransform(num_layers, num_heads,
                                                      100)
        self.BOS_ID = 101
Example no. 17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_path', default='model/epoch_0/model.pth', type=str, required=False, help='model checkpoint path')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    device = args.device
    model_path = args.model_path

    # device
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    # model
    model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-multilingual-cased", "bert-base-multilingual-cased")
    model.load_state_dict(torch.load(model_path))
    model.eval()

    # dataset
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")

    # print the number of parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    while True:
        question = input('Enter a question: ')
        ids = tokenizer.encode(question)
        input_ids = torch.tensor([ids], dtype=torch.long)
        generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
        answer = tokenizer.decode(generated[0,:])
        print(answer)
def generate_summaries(
    examples: list, out_file: str, model_name: str, batch_size: int = 8, device: str = DEFAULT_DEVICE
):
    fout = Path(out_file).open("w")
#    model = EncoderDecoderModel.from_pretrained(model_name, output_past=True).to(device)
    model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    model.to(device)

#    max_length = 140
#    min_length = 55

    for batch in tqdm(list(chunks(examples, batch_size))):
        dct = tokenizer.batch_encode_plus(batch, max_length=128, return_tensors="pt", pad_to_max_length=True, add_special_tokens=True)
        print(dct["input_ids"][0])
        print(dct["attention_mask"][0])
        summaries = model.generate(
            input_ids=dct["input_ids"].to(device),
            attention_mask=dct["attention_mask"].to(device),
            num_beams=4,
            length_penalty=10.0,
            repetition_penalty=5.0,
            max_length=20,  # +2 from original because we start at step=1 and stop before max_length
            min_length=3,  # +1 from original because we start at step=1
            no_repeat_ngram_size=3,
            early_stopping=True,
        #    decoder_start_token_id=model.config.decoder.bos_token_id
            decoder_start_token_id=0
        )
        dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
        in_ids = dct["input_ids"].to(device)
        in_dec = [tokenizer.decode(id, skip_special_tokens=True, clean_up_tokenization_spaces=False) for id in in_ids]
        for input, hypothesis in zip(in_dec, dec):
            fout.write(input + ' ||| ' + hypothesis + "\n")
            fout.flush()
    def check_encoder_decoder_model_from_pretrained_using_model_paths(
            self, config, input_ids, attention_mask, encoder_hidden_states,
            decoder_config, decoder_input_ids, decoder_attention_mask,
            **kwargs):
        encoder_model, decoder_model = self.get_encoder_decoder_model(
            config, decoder_config)
        with tempfile.TemporaryDirectory(
        ) as encoder_tmp_dirname, tempfile.TemporaryDirectory(
        ) as decoder_tmp_dirname:
            encoder_model.save_pretrained(encoder_tmp_dirname)
            decoder_model.save_pretrained(decoder_tmp_dirname)
            model_kwargs = {"encoder_hidden_dropout_prob": 0.0}

            # BartConfig has no hidden_dropout_prob.
            if not hasattr(decoder_config, "hidden_dropout_prob"):
                model_kwargs["decoder_activation_function"] = "gelu"
            else:
                model_kwargs["decoder_hidden_dropout_prob"] = 0.0

            enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder_tmp_dirname, decoder_tmp_dirname, **model_kwargs)
        enc_dec_model.to(torch_device)
        outputs_encoder_decoder = enc_dec_model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            return_dict=True,
        )

        self.assertEqual(outputs_encoder_decoder["logits"].shape,
                         (decoder_input_ids.shape +
                          (decoder_config.vocab_size, )))
        self.assertEqual(
            outputs_encoder_decoder["encoder_last_hidden_state"].shape,
            (input_ids.shape + (config.hidden_size, )))
from transformers import EncoderDecoderModel

model_path = 'rubert_cased_L-12_H-768_A-12_pt'
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    model_path, model_path)
model.save_pretrained('pretrained_init_enc_dec')
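
For reference, the directory written above can later be reloaded as a single checkpoint; a minimal sketch, assuming the same working directory and an installed transformers package:

from transformers import EncoderDecoderModel

# Reload the combined encoder-decoder checkpoint saved by save_pretrained above.
model = EncoderDecoderModel.from_pretrained('pretrained_init_enc_dec')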
    def test_finetune_bert2bert(self):
        bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("prajjwal1/bert-tiny", "prajjwal1/bert-tiny")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

        bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size
        bert2bert.config.eos_token_id = tokenizer.sep_token_id
        bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
        bert2bert.config.max_length = 128

        train_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
        val_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]")

        train_dataset = train_dataset.select(range(32))
        val_dataset = val_dataset.select(range(16))

        batch_size = 4

        def _map_to_encoder_decoder_inputs(batch):
            # Tokenizer will automatically set [BOS] <text> [EOS]
            inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512)
            outputs = tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=128)
            batch["input_ids"] = inputs.input_ids
            batch["attention_mask"] = inputs.attention_mask

            batch["decoder_input_ids"] = outputs.input_ids
            batch["labels"] = outputs.input_ids.copy()
            batch["labels"] = [
                [-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]
            ]
            batch["decoder_attention_mask"] = outputs.attention_mask

            assert all([len(x) == 512 for x in inputs.input_ids])
            assert all([len(x) == 128 for x in outputs.input_ids])

            return batch

        def _compute_metrics(pred):
            labels_ids = pred.label_ids
            pred_ids = pred.predictions

            # all unnecessary tokens are removed
            pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
            label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

            accuracy = sum([int(pred_str[i] == label_str[i]) for i in range(len(pred_str))]) / len(pred_str)

            return {"accuracy": accuracy}

        # map train dataset
        train_dataset = train_dataset.map(
            _map_to_encoder_decoder_inputs,
            batched=True,
            batch_size=batch_size,
            remove_columns=["article", "highlights"],
        )
        train_dataset.set_format(
            type="torch",
            columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
        )

        # same for validation dataset
        val_dataset = val_dataset.map(
            _map_to_encoder_decoder_inputs,
            batched=True,
            batch_size=batch_size,
            remove_columns=["article", "highlights"],
        )
        val_dataset.set_format(
            type="torch",
            columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
        )

        output_dir = self.get_auto_remove_tmp_dir()

        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            predict_with_generate=True,
            evaluation_strategy="steps",
            do_train=True,
            do_eval=True,
            warmup_steps=0,
            eval_steps=2,
            logging_steps=2,
        )

        # instantiate trainer
        trainer = Seq2SeqTrainer(
            model=bert2bert,
            args=training_args,
            compute_metrics=_compute_metrics,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
        )

        # start training
        trainer.train()
 def get_pretrained_model(self):
     return EncoderDecoderModel.from_encoder_decoder_pretrained(
         "bert-large-uncased",
         "patrickvonplaten/prophetnet-decoder-clm-large-uncased")
 def get_pretrained_model(self):
     return EncoderDecoderModel.from_encoder_decoder_pretrained(
         "bert-base-cased", "gpt2")
 def get_pretrained_model(self):
     return EncoderDecoderModel.from_encoder_decoder_pretrained(
         "roberta-base", "roberta-base")
 def get_pretrained_model(self):
     return EncoderDecoderModel.from_encoder_decoder_pretrained(
         "google/bert_for_seq_generation_L-24_bbc_encoder",
         "google/bert_for_seq_generation_L-24_bbc_encoder")
Example no. 26
 def __init__(self):
     super(BERTEncDecModel, self).__init__()
     print("Model creation...")
     self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
         'bert-base-uncased', 'bert-base-uncased')
Example no. 27
 def test_real_bert_model_from_pretrained_add_cross_attention(self):
     model = EncoderDecoderModel.from_encoder_decoder_pretrained(
         "bert-base-uncased", "bert-base-uncased")
     self.assertTrue(
         hasattr(model.decoder.bert.encoder.layer[0], "crossattention"))
Example no. 28
 def test_real_bert_model_from_pretrained(self):
     model = EncoderDecoderModel.from_encoder_decoder_pretrained(
         "bert-base-uncased", "bert-base-uncased")
     self.assertIsNotNone(model)
val_data = val_data.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    #     remove_columns=['name', 'note'],
)
val_data.set_format(
    type='torch',
    columns=[
        'input_ids', 'attention_mask', 'decoder_input_ids',
        'decoder_attention_mask', 'labels'
    ],
)

ed_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    'bert-base-uncased', 'bert-base-uncased')

# set special tokens
ed_model.config.decoder_start_token_id = 1
ed_model.config.eos_token_id = input_tokenizer.eos_token_id
ed_model.config.pad_token_id = input_tokenizer.pad_token_id

# sensible parameters for beam search
ed_model.config.vocab_size = len(output_vocab)
ed_model.config.max_length = 142
ed_model.config.min_length = 56
ed_model.config.no_repeat_ngram_size = 3
ed_model.config.early_stopping = True
ed_model.config.length_penalty = 2.0
ed_model.config.num_beams = 4
Example no. 30
def get_model(model=0, seed=8888):
    set_seed(seed)
    print("loading :", config.MODEL_LIST[model])
    config.TOKENIZER = AutoTokenizer.from_pretrained(config.MODEL_LIST[model])
    return EncoderDecoderModel.from_encoder_decoder_pretrained(
        config.MODEL_LIST[model], config.MODEL_LIST[model])
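
A minimal usage sketch for get_model above; it assumes a config module whose MODEL_LIST holds at least one checkpoint name, as the function body implies.

# Hypothetical usage: build a tied encoder/decoder model from the first configured checkpoint.
seq2seq_model = get_model(model=0, seed=8888)
tokenizer = config.TOKENIZER  # set by get_model() as a side effect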