def train_model(config_path: str):
    """Train a BERT2BERT encoder-decoder translation model.

    Reads the pipeline parameters from ``config_path``, builds source/target
    vocabularies from the training split, wires a randomly-initialized BERT
    encoder to a BERT decoder sized to those vocabularies, and fine-tunes the
    pair with the HF ``Trainer``. The final model is saved to ``bert2bert``.

    Args:
        config_path: Path to the training pipeline parameter file.
    """
    writer = SummaryWriter()  # TensorBoard writer (side effect: creates the runs/ dir)
    # BUG FIX: keep the pipeline params under their own name; the original
    # rebound `config` to the EncoderDecoderConfig further down, silently
    # losing access to the pipeline parameters.
    params = read_training_pipeline_params(config_path)
    logger.info("pretrained_emb {b}", b=params.net_params.pretrained_emb)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info("Device is {device}", device=device)

    SRC, TRG, dataset = get_dataset(params.dataset_path, False)
    train_data, valid_data, test_data = split_data(
        dataset, **params.split_ration.__dict__)
    # Vocabulary is built on the training split only (min_freq filters rare tokens).
    SRC.build_vocab(train_data, min_freq=3)
    TRG.build_vocab(train_data, min_freq=3)
    torch.save(SRC.vocab, params.src_vocab_name)
    torch.save(TRG.vocab, params.trg_vocab_name)
    logger.info("Vocab saved")
    print(f"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}")
    print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=params.BATCH_SIZE,
        device=device,
        sort_key=_len_sort_key,
    )

    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)
    # BUG FIX: the original built a throwaway EncoderDecoderModel, then
    # patched is_decoder/add_cross_attention on its config and built a second
    # model. Mark the decoder as a causal decoder up front and build once.
    config_encoder = BertConfig(vocab_size=INPUT_DIM)
    config_decoder = BertConfig(vocab_size=OUTPUT_DIM)
    config_decoder.is_decoder = True
    config_decoder.add_cross_attention = True
    model_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder)
    model = EncoderDecoderModel(config=model_config)

    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="steps",
        eval_steps=500,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        num_train_epochs=10,
        save_steps=3000,
        seed=0,
        load_best_model_at_end=True,
    )
    # args.place_model_on_device = device
    # NOTE(review): BucketIterator is passed where Trainer expects a Dataset —
    # confirm this works with the project's transformers version.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_iterator,
        eval_dataset=valid_iterator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    trainer.train()
    model.save_pretrained("bert2bert")
def build(self):
    """Build image/trace encoders, the BERT2BERT captioner, and VAE image-token
    embeddings for this model, according to ``self.config``."""
    # to be further set
    # breakpoint()
    self.image_feature_module = build_image_encoder(
        self.config.image_feature_processor, direct_features=True
    )
    if self.config.concate_trace:
        self.trace_feature_module = build_encoder(self.config.trace_feature_encoder)
    # Select the seq2seq backbone: pretrained BERT2BERT, or a small randomly
    # initialized 2- or 3-layer variant.
    if self.config.base_model_name == "bert-base-uncased":
        self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "bert-base-uncased", "bert-base-uncased"
        )
    elif self.config.base_model_name == "2layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        # Enlarged position table (default is 512) — presumably to fit the
        # concatenated image+trace sequence; TODO confirm against the caller.
        config_encoder.max_position_embeddings = 1090
        config_encoder.num_hidden_layers = 2
        config_decoder.num_hidden_layers = 2
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder
        )
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    elif self.config.base_model_name == "3layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.num_hidden_layers = 3
        config_decoder.num_hidden_layers = 3
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder
        )
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    if self.config.loop_contrastive:
        self.trace_caption_contrastive = TraceCaptionContrastiveModel(
            self.config.tc_contrastive_aggregate_method
        )
    if (
        hasattr(self.config, "pretrans_attention")
        and self.config.pretrans_attention
    ):
        # import ipdb; ipdb.set_trace()
        # Size the attention transform to the encoder's layer/head counts.
        tempconf = self.encoderdecoder.config.encoder
        num_heads = tempconf.num_attention_heads
        num_layers = tempconf.num_hidden_layers
        self.attention_trans = AttentionTransform(num_layers, num_heads, 100)
    self.BOS_ID = 101  # BERT [CLS] id used as the decoding start token
    # Discrete VAE supplies image tokens; embed them plus 2D axial positions.
    self.vae = OpenAIDiscreteVAE()
    image_code_dim = 768
    image_fmap_size = self.vae.image_size // (2 ** self.vae.num_layers)
    self.image_seq_len = image_fmap_size ** 2
    self.image_emb = torch.nn.Embedding(self.vae.num_tokens, image_code_dim)
    self.image_pos_emb = AxialPositionalEmbedding(
        image_code_dim, axial_shape=(image_fmap_size, image_fmap_size)
    )
def __init__(self, config, pad_id):
    """Wrap a BERT encoder + BERT causal decoder as one seq2seq transformer.

    Args:
        config: Hyper-parameter namespace (vocab sizes, layer counts, h_size,
            n_heads, d_ff, dropout, joined_vocab flag).
        pad_id: Token id used for padding in both vocabularies.
    """
    super(Transformer, self).__init__()
    # Hyper-parameters shared by encoder and decoder configs.
    shared = dict(
        hidden_size=config.h_size,
        num_attention_heads=config.n_heads,
        intermediate_size=config.d_ff,
        hidden_dropout_prob=config.dropout,
        pad_token_id=pad_id,
    )
    enc_cfg = BertConfig(
        vocab_size=config.src_vocab_size,
        num_hidden_layers=config.enc_layers,
        **shared,
    )
    dec_cfg = BertConfig(
        vocab_size=config.tgt_vocab_size,
        num_hidden_layers=config.dec_layers,
        is_decoder=True,
        add_cross_attention=True,
        **shared,
    )
    joint_cfg = EncoderDecoderConfig.from_encoder_decoder_configs(enc_cfg, dec_cfg)
    self.tr = EncoderDecoderModel(config=joint_cfg)
    if config.joined_vocab:
        # Single shared embedding matrix for source and target tokens.
        self.tr.encoder.embeddings.word_embeddings = (
            self.tr.decoder.bert.embeddings.word_embeddings
        )
def check_equivalence_tf_to_pt(self, config, decoder_config, inputs_dict): encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) # Using `_tf_model`, the test will fail, because the weights of `_tf_model` get extended before saving # the encoder/decoder models. # There was a (very) ugly potential fix, which wasn't integrated to `transformers`: see # https://github.com/huggingface/transformers/pull/13222/commits/dbb3c9de76eee235791d2064094654637c99f36d#r697304245 # (the change in `src/transformers/modeling_tf_utils.py`) _tf_model = TFEncoderDecoderModel(encoder_decoder_config) # Make sure model is built _tf_model(**inputs_dict) # Using `tf_model` to pass the test. encoder = _tf_model.encoder.__class__(encoder_decoder_config.encoder) decoder = _tf_model.decoder.__class__(encoder_decoder_config.decoder) # Make sure models are built encoder(encoder.dummy_inputs) decoder(decoder.dummy_inputs) tf_model = TFEncoderDecoderModel(encoder=encoder, decoder=decoder) with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: tf_model.encoder.save_pretrained(encoder_tmp_dirname) tf_model.decoder.save_pretrained(decoder_tmp_dirname) pt_model = EncoderDecoderModel.from_encoder_decoder_pretrained( encoder_tmp_dirname, decoder_tmp_dirname, encoder_from_tf=True, decoder_from_tf=True ) # This is only for copying some specific attributes of this particular model. pt_model.config = tf_model.config self.check_pt_tf_equivalence(pt_model, tf_model, inputs_dict)
def get_encoder_decoder_config(self):
    """Return an EncoderDecoderConfig pairing two bert-base-uncased configs,
    with the decoder flipped into causal/cross-attention mode."""
    name = "bert-base-uncased"
    enc_cfg = AutoConfig.from_pretrained(name)
    dec_cfg = AutoConfig.from_pretrained(
        name, is_decoder=True, add_cross_attention=True
    )
    return EncoderDecoderConfig.from_encoder_decoder_configs(enc_cfg, dec_cfg)
def __init__(self, config, dataset):
    """Build a BERT2BERT generator from one pretrained BERT checkpoint.

    Args:
        config: Mapping providing 'pretrained_model_path'.
        dataset: Dataset handle forwarded to the base class.
    """
    super(BERT2BERT, self).__init__(config, dataset)
    # BERT's [CLS]/[SEP] ids double as BOS/EOS for generation.
    self.sos_token_idx = 101
    self.eos_token_idx = 102
    self.pretrained_model_path = config['pretrained_model_path']
    self.tokenizer = BertTokenizer.from_pretrained(self.pretrained_model_path)

    # Both halves share the same base checkpoint's configuration.
    self.encoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
    self.decoder_configure = BertConfig.from_pretrained(self.pretrained_model_path)
    self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config=self.encoder_configure,
        decoder_config=self.decoder_configure,
    )

    self.encoder = BertGenerationEncoder.from_pretrained(
        self.pretrained_model_path,
        bos_token_id=self.sos_token_idx,
        eos_token_id=self.eos_token_idx,
    )
    self.decoder = BertGenerationDecoder.from_pretrained(
        self.pretrained_model_path,
        bos_token_id=self.sos_token_idx,
        eos_token_id=self.eos_token_idx,
        add_cross_attention=True,
        is_decoder=True,
    )
    self.model = EncoderDecoderModel(
        encoder=self.encoder,
        decoder=self.decoder,
        config=self.encoder_decoder_config,
    )

    # Per-token loss; padding positions are masked out of the objective.
    self.padding_token_idx = self.tokenizer.pad_token_id
    self.loss = nn.CrossEntropyLoss(
        ignore_index=self.padding_token_idx, reduction='none'
    )
def __init__(self, config, sequence_length, use_pretrained=True, pretrained_model=None):
    """Constructor.

    Builds (or adopts) a BERT2BERT encoder-decoder model.

    Args:
        config: Experiment configuration (provides print.log_level).
        sequence_length: Maximum sequence length, forwarded to the base class.
        use_pretrained: When building fresh, load pretrained weights for both
            halves instead of random-initializing from BertConfig.
        pretrained_model: Optional already-constructed model to adopt as-is.
    """
    super().__init__(config, sequence_length)
    # suspend logging due to hellish verbosity of model construction
    lvl = logging.getLogger().level
    logging.getLogger().setLevel(logging.WARN)
    # FIX: removed dead local `config_args` — it was built but never used.
    if pretrained_model is None:
        if use_pretrained:
            model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                self.pretrained_id, self.pretrained_id)
        else:
            enc, dec = BertConfig(), BertConfig()
            dec.is_decoder = True
            dec.add_cross_attention = True
            enc_dec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                enc, dec)
            model = EncoderDecoderModel(config=enc_dec_config)
        logging.getLogger().setLevel(lvl)
        self.model = model
    else:
        self.model = pretrained_model
    # Restore the level configured for this experiment.
    logging.getLogger().setLevel(self.config.print.log_level.upper())
def __init__(self, word_num, embedding_dim, batch_size):
    """Set up a 6-layer BERT encoder plus a scoring head.

    Args:
        word_num: Vocabulary size for both encoder and decoder configs.
        embedding_dim: Hidden size of the transformer.
        batch_size: Stored for later use by the owner.
    """
    super().__init__()
    self.word_num = word_num
    self.embedding_dim = embedding_dim
    self.batch_size = batch_size
    # Settings common to both configs.
    common = dict(
        vocab_size=word_num,
        hidden_size=embedding_dim,
        num_hidden_layers=6,
        num_attention_heads=2,
        intermediate_size=512,
        output_attentions=False,
    )
    # shape (bs, inp_len, inp_len)
    self.config_encoder = BertConfig(output_hidden_states=False, **common)
    # shape (bs, tar_len, tar_len)
    self.config_decoder = BertConfig(output_hidden_states=True, **common)
    self.config = EncoderDecoderConfig.from_encoder_decoder_configs(
        self.config_encoder, self.config_decoder
    )
    self.encoder = BertModel(config=self.config_encoder)
    # self.seq2seq = EncoderDecoderModel(config=self.config)
    # self.fc1 = nn.Linear(word_num, 1)
    self.fc2 = nn.Linear(embedding_dim, 1)
def check_encoder_decoder_model_from_pretrained_configs(
        self, config, input_ids, attention_mask, encoder_hidden_states,
        decoder_config, decoder_input_ids, decoder_attention_mask, **kwargs):
    """Build an EncoderDecoderModel purely from configs and check its output
    shapes on a forward pass."""
    encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config, decoder_config)
    # from_encoder_decoder_configs must flip the decoder into causal mode.
    self.assertTrue(encoder_decoder_config.decoder.is_decoder)
    enc_dec_model = EncoderDecoderModel(encoder_decoder_config)
    enc_dec_model.to(torch_device)
    enc_dec_model.eval()
    self.assertTrue(enc_dec_model.config.is_encoder_decoder)
    outputs_encoder_decoder = enc_dec_model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
        return_dict=True,
    )
    # Logits: one score per decoder position per target-vocab token.
    self.assertEqual(outputs_encoder_decoder["logits"].shape,
                     (decoder_input_ids.shape + (decoder_config.vocab_size, )))
    # Encoder states: one hidden vector per source position.
    self.assertEqual(
        outputs_encoder_decoder["encoder_last_hidden_state"].shape,
        (input_ids.shape + (config.hidden_size, )))
def get_encoder_decoder_config_small(self):
    """Return an EncoderDecoderConfig built from two tiny-bert test configs,
    with the decoder set up for causal generation with cross-attention."""
    tiny = "hf-internal-testing/tiny-bert"
    enc_cfg = AutoConfig.from_pretrained(tiny)
    dec_cfg = AutoConfig.from_pretrained(
        tiny, is_decoder=True, add_cross_attention=True
    )
    return EncoderDecoderConfig.from_encoder_decoder_configs(enc_cfg, dec_cfg)
def check_equivalence_flax_to_pt(self, config, decoder_config, inputs_dict):
    """Port Flax weights into a PT EncoderDecoderModel and compare outputs."""
    joint_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config, decoder_config
    )
    pt_model = EncoderDecoderModel(joint_config)
    fx_model = FlaxEncoderDecoderModel(joint_config)
    # Overwrite the PT parameters with the Flax ones, then check equivalence.
    pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
    self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
def load_model(path, model=0):
    """Load a fine-tuned EncoderDecoderModel from ``path``.

    Args:
        path: Checkpoint directory of the fine-tuned model.
        model: Index into ``config.MODEL_LIST`` selecting the base model;
            the name is then rebound to the loaded model object.

    Side effect: sets the module-level ``config.TOKENIZER``.
    """
    base_name = config.MODEL_LIST[model]
    config_encoder = AutoConfig.from_pretrained(base_name)
    config_decoder = AutoConfig.from_pretrained(base_name)
    configer = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder
    )
    config.TOKENIZER = AutoTokenizer.from_pretrained(base_name)
    model = EncoderDecoderModel.from_pretrained(path, config=configer)
    print('MODEL LOADED!')
    return model
def build(self):
    """Build the image encoder, optional trace encoder, and the BERT2BERT
    captioning backbone selected by ``self.config.base_model_name``."""
    # to be further set
    # breakpoint()
    self.image_feature_module = build_image_encoder(
        self.config.image_feature_processor, direct_features=True)
    if self.config.concate_trace:
        self.trace_feature_module = build_encoder(
            self.config.trace_feature_encoder)
    # Backbone: pretrained BERT2BERT, or a small random-init 2/3-layer variant.
    if self.config.base_model_name == "bert-base-uncased":
        self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "bert-base-uncased", "bert-base-uncased")
    elif self.config.base_model_name == "2layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.num_hidden_layers = 2
        config_decoder.num_hidden_layers = 2
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder)
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    elif self.config.base_model_name == "3layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.num_hidden_layers = 3
        config_decoder.num_hidden_layers = 3
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder)
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    if self.config.loop_contrastive:
        self.trace_caption_contrastive = TraceCaptionContrastiveModel(
            self.config.tc_contrastive_aggregate_method)
    if (hasattr(self.config, "pretrans_attention")
            and self.config.pretrans_attention):
        # import ipdb; ipdb.set_trace()
        # Size the attention transform to the encoder's layer/head counts.
        tempconf = self.encoderdecoder.config.encoder
        num_heads = tempconf.num_attention_heads
        num_layers = tempconf.num_hidden_layers
        self.attention_trans = AttentionTransform(num_layers, num_heads, 100)
    self.BOS_ID = 101  # BERT [CLS] id used as the decoding start token
def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict):
    """Port PT weights into a Flax EncoderDecoderModel and compare outputs."""
    joint_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config, decoder_config
    )
    pt_model = EncoderDecoderModel(joint_config)
    fx_model = FlaxEncoderDecoderModel(joint_config)
    # Convert the PT state dict into Flax parameters, then check equivalence.
    fx_model.params = convert_pytorch_state_dict_to_flax(
        pt_model.state_dict(), fx_model
    )
    self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict)
def test_pt_tf_equivalence(self): config_inputs_dict = self.prepare_config_and_inputs() # Keep only common arguments arg_names = [ "config", "input_ids", "attention_mask", "decoder_config", "decoder_input_ids", "decoder_attention_mask", "encoder_hidden_states", ] config_inputs_dict = {k: v for k, v in config_inputs_dict.items() if k in arg_names} config = config_inputs_dict.pop("config") decoder_config = config_inputs_dict.pop("decoder_config") inputs_dict = config_inputs_dict # `encoder_hidden_states` is not used in model call/forward del inputs_dict["encoder_hidden_states"] # Avoid the case where a sequence has no place to attend (after combined with the causal attention mask) batch_size = inputs_dict["decoder_attention_mask"].shape[0] inputs_dict["decoder_attention_mask"] = tf.constant( np.concatenate([np.ones(shape=(batch_size, 1)), inputs_dict["decoder_attention_mask"][:, 1:]], axis=1) ) # TF models don't use the `use_cache` option and cache is not returned as a default. # So we disable `use_cache` here for PyTorch model. decoder_config.use_cache = False self.assertTrue(decoder_config.cross_attention_hidden_size is None) # check without `enc_to_dec_proj` projection self.assertTrue(config.hidden_size == decoder_config.hidden_size) self.check_equivalence_pt_to_tf(config, decoder_config, inputs_dict) self.check_equivalence_tf_to_pt(config, decoder_config, inputs_dict) # This is not working, because pt/tf equivalence test for encoder-decoder use `from_encoder_decoder_pretrained`, # which randomly initialize `enc_to_dec_proj`. 
# # check `enc_to_dec_proj` work as expected # decoder_config.hidden_size = decoder_config.hidden_size * 2 # self.assertTrue(config.hidden_size != decoder_config.hidden_size) # self.check_equivalence_pt_to_tf(config, decoder_config, inputs_dict) # self.check_equivalence_tf_to_pt(config, decoder_config, inputs_dict) # Let's just check `enc_to_dec_proj` can run for now decoder_config.hidden_size = decoder_config.hidden_size * 2 self.assertTrue(config.hidden_size != decoder_config.hidden_size) encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) model = TFEncoderDecoderModel(encoder_decoder_config) model(**inputs_dict)
def inference():
    """Generate abstractive summaries for the test set with a KoBERT-based
    BERT2BERT checkpoint, writing one `id,summary` row per article to
    ``submission_{step}.csv``. The checkpoint step is taken from argv[1].
    """
    step = sys.argv[1]
    encoder_config = BertConfig.from_pretrained("monologg/kobert")
    decoder_config = BertConfig.from_pretrained("monologg/kobert")
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config, decoder_config)
    tokenizer = KoBertTokenizer()
    model = EncoderDecoderModel(config=config)
    ckpt = "model.pt"
    device = "cuda"
    model.load_state_dict(
        torch.load(f"saved/{ckpt}.{step}", map_location="cuda"),
        strict=True,
    )
    # fp16 eval for speed/memory.
    model = model.half().eval().to(device)

    # BUG FIX: the original leaked both file handles (opened, never closed);
    # use context managers so they are closed even on error.
    with open("dataset/abstractive_test_v2.jsonl", "r") as f:
        test_data = f.read().splitlines()

    test_set = []
    for data in test_data:
        data = json.loads(data)
        article_original = " ".join(data["article_original"])
        test_set.append((data["id"], article_original))

    with open(f"submission_{step}.csv", "w") as submission:
        for i, (news_id, text) in tqdm(enumerate(test_set)):
            tokens = tokenizer.encode_batch([text], max_length=512)
            generated = model.generate(
                input_ids=tokens["input_ids"].to(device),
                attention_mask=tokens["attention_mask"].to(device),
                use_cache=True,
                bos_token_id=tokenizer.token2idx["[CLS]"],
                eos_token_id=tokenizer.token2idx["[SEP]"],
                pad_token_id=tokenizer.token2idx["[PAD]"],
                num_beams=12,
                do_sample=False,
                temperature=1.0,
                no_repeat_ngram_size=3,
                bad_words_ids=[[tokenizer.token2idx["[UNK]"]]],
                length_penalty=1.0,
                repetition_penalty=1.5,
                max_length=512,
            )
            output = tokenizer.decode_batch(generated.tolist())[0]
            submission.write(f"{news_id},{output}" + "\n")
            print(news_id, output)
def get_model(vocab_size=30000):
    """Build a randomly-initialized BERT2BERT encoder-decoder model.

    Args:
        vocab_size: Vocabulary size applied to both encoder and decoder.

    Returns:
        An untrained ``EncoderDecoderModel``.
    """
    enc_cfg, dec_cfg = BertConfig(), BertConfig()
    enc_cfg.vocab_size = vocab_size
    dec_cfg.vocab_size = vocab_size
    # The decoder must run causally and attend over encoder states.
    dec_cfg.is_decoder = True
    dec_cfg.add_cross_attention = True
    joint_cfg = EncoderDecoderConfig.from_encoder_decoder_configs(enc_cfg, dec_cfg)
    return EncoderDecoderModel(config=joint_cfg)
def get_model(args):
    """Build or reload a BERT-encoder/GPT2-decoder model plus tokenizers,
    optionally wrapping it in DistributedDataParallel.

    NOTE(review): relies on module-level ``local_rank`` and ``device`` —
    presumably set during distributed setup; confirm in the surrounding file.

    Returns:
        (model, src_tokenizer, tgt_tokenizer, optimizer, scheduler)
    """
    if args.model_path:
        # Resume: model and both tokenizers come from the saved directory.
        model = EncoderDecoderModel.from_pretrained(args.model_path)
        src_tokenizer = BertTokenizer.from_pretrained(
            os.path.join(args.model_path, "src_tokenizer")
        )
        tgt_tokenizer = GPT2Tokenizer.from_pretrained(
            os.path.join(args.model_path, "tgt_tokenizer")
        )
        tgt_tokenizer.build_inputs_with_special_tokens = types.MethodType(
            build_inputs_with_special_tokens, tgt_tokenizer
        )
        if local_rank == 0 or local_rank == -1:
            print("model and tokenizer load from save success")
    else:
        # Fresh start: pretrained halves, GPT2 side extended with BOS/EOS/PAD.
        src_tokenizer = BertTokenizer.from_pretrained(args.src_pretrain_dataset_name)
        tgt_tokenizer = GPT2Tokenizer.from_pretrained(args.tgt_pretrain_dataset_name)
        tgt_tokenizer.add_special_tokens(
            {"bos_token": "[BOS]", "eos_token": "[EOS]", "pad_token": "[PAD]"}
        )
        tgt_tokenizer.build_inputs_with_special_tokens = types.MethodType(
            build_inputs_with_special_tokens, tgt_tokenizer
        )
        encoder = BertGenerationEncoder.from_pretrained(args.src_pretrain_dataset_name)
        decoder = GPT2LMHeadModel.from_pretrained(
            args.tgt_pretrain_dataset_name, add_cross_attention=True, is_decoder=True
        )
        # Embeddings must grow to cover the newly added special tokens.
        decoder.resize_token_embeddings(len(tgt_tokenizer))
        decoder.config.bos_token_id = tgt_tokenizer.bos_token_id
        decoder.config.eos_token_id = tgt_tokenizer.eos_token_id
        decoder.config.vocab_size = len(tgt_tokenizer)
        decoder.config.add_cross_attention = True
        decoder.config.is_decoder = True
        model_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            encoder.config, decoder.config
        )
        model = EncoderDecoderModel(
            encoder=encoder, decoder=decoder, config=model_config
        )
    if local_rank != -1:
        model = model.to(device)
    if args.ngpu > 1:
        print("{}/{} GPU start".format(local_rank, torch.cuda.device_count()))
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank
        )
    optimizer, scheduler = get_optimizer_and_schedule(args, model)
    return model, src_tokenizer, tgt_tokenizer, optimizer, scheduler
def convert_to_huggingface(self):
    """Round-trip this model's encoder/decoder through disk and reassemble
    them as a standard HF EncoderDecoderModel.

    Returns:
        The reassembled ``EncoderDecoderModel``.
    """
    # Export both halves so the HF auto classes can reload them.
    self.encoder.save_pretrained('./tmp_encoder')
    self.decoder.save_pretrained('./tmp_decoder')
    joint_config = EncoderDecoderConfig.from_pretrained(
        './models/checkpoint-1500')
    enc = AutoModel.from_pretrained('./tmp_encoder')
    dec = AutoModelForCausalLM.from_pretrained(
        './tmp_decoder', add_cross_attention=True)
    return EncoderDecoderModel(config=joint_config, encoder=enc, decoder=dec)
def test_relative_position_embeds(self):
    """Check the model runs with relative_key_query position embeddings and
    produces logits of the expected batch/sequence shape."""
    config_and_inputs = self.prepare_config_and_inputs()
    encoder_config = config_and_inputs["config"]
    decoder_config = config_and_inputs["decoder_config"]
    encoder_config.position_embedding_type = "relative_key_query"
    decoder_config.position_embedding_type = "relative_key_query"
    config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
    model = EncoderDecoderModel(config).eval().to(torch_device)
    logits = model(
        input_ids=config_and_inputs["input_ids"], decoder_input_ids=config_and_inputs["decoder_input_ids"]
    ).logits
    # BUG FIX: the original `self.assertTrue(logits.shape, (13, 7))` treated
    # the tuple as the failure *message*, so the assertion always passed.
    # Assert equality of the batch/sequence dims for real (the trailing dim
    # is the decoder vocab size).
    self.assertEqual(tuple(logits.shape[:2]), (13, 7))
def check_equivalence_pt_to_tf(self, config, decoder_config, inputs_dict):
    """Build a PT encoder-decoder, export its parts, reload them as a TF
    EncoderDecoderModel, and check both produce equivalent outputs."""
    encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config)
    pt_model = EncoderDecoderModel(encoder_decoder_config)
    with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname:
        pt_model.encoder.save_pretrained(encoder_tmp_dirname)
        pt_model.decoder.save_pretrained(decoder_tmp_dirname)
        # Cross-framework load: TF model initialized from the PT checkpoints.
        tf_model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
            encoder_tmp_dirname, decoder_tmp_dirname, encoder_from_pt=True, decoder_from_pt=True
        )
        # This is only for copying some specific attributes of this particular model.
        tf_model.config = pt_model.config
        self.check_pt_tf_equivalence(pt_model, tf_model, inputs_dict)
class EncoderDecoderAdapterTestBase(AdapterTestBase):
    """Adapter test base wired to a tiny randomly-initialized BERT2BERT
    EncoderDecoderModel (4 layers, hidden size 32)."""

    model_class = EncoderDecoderModel
    config_class = EncoderDecoderConfig
    # staticmethod wrapper keeps the lambda from being bound as an instance
    # method; calling it yields a fresh joint config each time.
    config = staticmethod(
        lambda: EncoderDecoderConfig.from_encoder_decoder_configs(
            BertConfig(
                hidden_size=32,
                num_hidden_layers=4,
                num_attention_heads=4,
                intermediate_size=37,
            ),
            # Decoder half: causal with cross-attention over the encoder.
            BertConfig(
                hidden_size=32,
                num_hidden_layers=4,
                num_attention_heads=4,
                intermediate_size=37,
                is_decoder=True,
                add_cross_attention=True,
            ),
        ))
    tokenizer_name = "bert-base-uncased"
def __init__(
    self,
    model_save_path: str,
    batch_size: int,
    num_gpus: int,
    max_len: int = 512,
    lr: float = 3e-5,
    weight_decay: float = 1e-4,
    save_step_interval: int = 1000,
    accelerator: str = "ddp",
    precision: int = 16,
    use_amp: bool = True,
) -> None:
    """Build a KoBERT-based BERT2BERT summarizer.

    The encoder-decoder pair is assembled from two KoBERT configs, and both
    halves are warm-started from the pretrained KoBERT weights (the decoder
    non-strictly, since its cross-attention weights are new).
    """
    super(Bert2Bert, self).__init__(
        model_save_path=model_save_path,
        max_len=max_len,
        batch_size=batch_size,
        num_gpus=num_gpus,
        lr=lr,
        weight_decay=weight_decay,
        save_step_interval=save_step_interval,
        accelerator=accelerator,
        precision=precision,
        use_amp=use_amp,
    )
    enc_cfg = BertConfig.from_pretrained("monologg/kobert")
    dec_cfg = BertConfig.from_pretrained("monologg/kobert")
    joint_cfg = EncoderDecoderConfig.from_encoder_decoder_configs(enc_cfg, dec_cfg)
    self.model = EncoderDecoderModel(joint_cfg)
    self.tokenizer = KoBertTokenizer()
    # Warm-start both halves from pretrained KoBERT weights.
    state_dict = BertModel.from_pretrained("monologg/kobert").state_dict()
    self.model.encoder.load_state_dict(state_dict)
    # strict=False: the decoder has extra cross-attention parameters.
    self.model.decoder.bert.load_state_dict(state_dict, strict=False)
def __init__(self, config, dataset):
    """Build a bert-base-cased BERT2BERT generation model.

    Args:
        config: Mapping providing 'source_max_seq_length' and
            'target_max_seq_length'.
        dataset: Supplies sos_token/eos_token and is forwarded to the base.
    """
    super(BERT2BERT, self).__init__(config, dataset)
    base = 'bert-base-cased'
    self.tokenizer = BertTokenizer.from_pretrained(base)
    self.encoder_configure = BertConfig.from_pretrained(base)
    self.decoder_configure = BertConfig.from_pretrained(base)
    self.encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        encoder_config=self.encoder_configure,
        decoder_config=self.decoder_configure,
    )
    # 101/102 are BERT's [CLS]/[SEP] ids, reused as BOS/EOS.
    self.encoder = BertGenerationEncoder.from_pretrained(
        base, bos_token_id=101, eos_token_id=102)
    self.decoder = BertGenerationDecoder.from_pretrained(
        base,
        add_cross_attention=True,
        is_decoder=True,
        bos_token_id=101,
        eos_token_id=102,
    )
    self.encoder_decoder = EncoderDecoderModel(
        encoder=self.encoder,
        decoder=self.decoder,
        config=self.encoder_decoder_config,
    )
    self.sos_token = dataset.sos_token
    self.eos_token = dataset.eos_token
    self.padding_token_idx = self.tokenizer.pad_token_id
    self.max_source_length = config['source_max_seq_length']
    self.max_target_length = config['target_max_seq_length']
    # Per-token loss; padding positions are masked out of the objective.
    self.loss = nn.CrossEntropyLoss(
        ignore_index=self.padding_token_idx, reduction='none')
def create_and_check_encoder_decoder_shared_weights(
        self, config, input_ids, attention_mask, encoder_hidden_states,
        decoder_config, decoder_input_ids, decoder_attention_mask, labels,
        **kwargs):
    """Check that tying encoder/decoder weights reduces the parameter count
    while leaving outputs equal, including after a save/load round trip."""
    torch.manual_seed(0)
    encoder_model, decoder_model = self.get_encoder_decoder_model(
        config, decoder_config)
    model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
    model.to(torch_device)
    model.eval()
    # load state dict copies weights but does not tie them
    decoder_state_dict = model.decoder._modules[
        model.decoder.base_model_prefix].state_dict()
    model.encoder.load_state_dict(decoder_state_dict, strict=False)
    # Same seed so the tied pair starts from identical weights.
    torch.manual_seed(0)
    tied_encoder_model, tied_decoder_model = self.get_encoder_decoder_model(
        config, decoder_config)
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        tied_encoder_model.config,
        tied_decoder_model.config,
        tie_encoder_decoder=True)
    tied_model = EncoderDecoderModel(encoder=tied_encoder_model,
                                     decoder=tied_decoder_model,
                                     config=config)
    tied_model.to(torch_device)
    tied_model.eval()
    model_result = model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
    )
    tied_model_result = tied_model(
        input_ids=input_ids,
        decoder_input_ids=decoder_input_ids,
        attention_mask=attention_mask,
        decoder_attention_mask=decoder_attention_mask,
    )
    # check that models has less parameters
    self.assertLess(sum(p.numel() for p in tied_model.parameters()),
                    sum(p.numel() for p in model.parameters()))
    random_slice_idx = ids_tensor((1, ), model_result[0].shape[-1]).item()
    # check that outputs are equal
    self.assertTrue(
        torch.allclose(model_result[0][0, :, random_slice_idx],
                       tied_model_result[0][0, :, random_slice_idx],
                       atol=1e-4))
    # check that outputs after saving and loading are equal
    with tempfile.TemporaryDirectory() as tmpdirname:
        tied_model.save_pretrained(tmpdirname)
        tied_model = EncoderDecoderModel.from_pretrained(tmpdirname)
        tied_model.to(torch_device)
        tied_model.eval()
        # check that models has less parameters
        self.assertLess(sum(p.numel() for p in tied_model.parameters()),
                        sum(p.numel() for p in model.parameters()))
        random_slice_idx = ids_tensor((1, ), model_result[0].shape[-1]).item()
        tied_model_result = tied_model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
        )
        # check that outputs are equal
        self.assertTrue(
            torch.allclose(model_result[0][0, :, random_slice_idx],
                           tied_model_result[0][0, :, random_slice_idx],
                           atol=1e-4))
def encoder_decoder_example():
    """Demo of building, running, saving, reloading and generating with an
    HF EncoderDecoderModel (BERT2BERT or GPT2 configs).

    NOTE(review): indentation of the `if False:` regions was reconstructed
    from a collapsed source — verify the intended nesting.
    """
    from transformers import EncoderDecoderConfig, EncoderDecoderModel
    from transformers import BertConfig, GPT2Config

    pretrained_model_name = 'bert-base-uncased'
    #pretrained_model_name = 'gpt2'

    if 'bert' in pretrained_model_name:
        # Initialize a BERT bert-base-uncased style configuration.
        config_encoder, config_decoder = BertConfig(), BertConfig()
    elif 'gpt2' in pretrained_model_name:
        config_encoder, config_decoder = GPT2Config(), GPT2Config()
    else:
        print('Invalid model, {}.'.format(pretrained_model_name))
        return

    config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

    if 'bert' in pretrained_model_name:
        # Initialize a Bert2Bert model from the bert-base-uncased style configurations.
        model = EncoderDecoderModel(config=config)
        #model = EncoderDecoderModel.from_encoder_decoder_pretrained(pretrained_model_name, pretrained_model_name)  # Initialize Bert2Bert from pre-trained checkpoints.
        tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
    elif 'gpt2' in pretrained_model_name:
        model = EncoderDecoderModel(config=config)
        tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name)

    #print('Configuration of the encoder & decoder:\n{}.\n{}.'.format(model.config.encoder, model.config.decoder))
    #print('Encoder type = {}, decoder type = {}.'.format(type(model.encoder), type(model.decoder)))

    if False:
        # Access the model configuration.
        config_encoder = model.config.encoder
        config_decoder = model.config.decoder
        # Set decoder config to causal LM.
        config_decoder.is_decoder = True
        config_decoder.add_cross_attention = True

    #--------------------
    input_ids = torch.tensor(tokenizer.encode('Hello, my dog is cute', add_special_tokens=True)).unsqueeze(0)  # Batch size 1.

    if False:
        # Forward.
        outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)

    # Train.
    outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
    loss, logits = outputs.loss, outputs.logits

    # Save the model, including its configuration.
    model.save_pretrained('my-model')

    #--------------------
    # Load model and config from pretrained folder.
    encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
    model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)

    #--------------------
    # Generate.
    # REF [site] >>
    #	https://huggingface.co/transformers/internal/generation_utils.html
    #	https://huggingface.co/blog/how-to-generate
    generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
    #generated = model.generate(input_ids, max_length=50, num_beams=5, no_repeat_ngram_size=2, num_return_sequences=5, do_sample=True, top_k=0, temperature=0.7, early_stopping=True, decoder_start_token_id=model.config.decoder.pad_token_id)

    print('Generated = {}.'.format(tokenizer.decode(generated[0], skip_special_tokens=True)))
# Special tokens used to mark persona/speaker segments in dialogue inputs.
SPECIAL_TOKENS = [
    "<bos>", "<eos>", "<persona>", "<speaker1>", "<speaker2>", "<pad>"
]
ATTR_TO_SPECIAL_TOKEN = {
    'bos_token': '<bos>',
    'eos_token': '<eos>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<speaker1>', '<speaker2>', '<persona>']
}
tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-tiny")
tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
encoder_decoder_config = EncoderDecoderConfig.from_pretrained(
    './models/checkpoint-1200')
model = EncoderDecoderModel.from_pretrained('./models/checkpoint-1200',
                                            config=encoder_decoder_config)
# Grow both embedding matrices to cover the newly added special tokens.
model.get_encoder().resize_token_embeddings(len(tokenizer))
model.get_decoder().resize_token_embeddings(len(tokenizer))
print(type(model.get_encoder()), type(model.get_decoder()))
# model = SimpleEncoderDecoder(tokenizer)
# model = load()
# model.to('cpu')
# create ids of encoded input vectors
input_ids = tokenizer("I want to buy a car", return_tensors="pt").input_ids
# create BOS token
# NOTE(review): the source chunk is truncated here — the call below
# continues beyond this excerpt.
decoder_input_ids = tokenizer("<bos>", add_special_tokens=False,
# NOTE(review): this is an excerpt — `tokenized_decoder`, `tokenized_encoder`,
# `max_len` and `optimizer` are defined elsewhere in the original script.
# Right-pad decoder token lists with 0 up to max_len; mask = 1 on real tokens.
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized_decoder])
input_ids_ = torch.LongTensor(np.array(padded))
attention_mask_ = np.where(padded != 0, 1, 0)
# Same padding/masking for the encoder side.
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized_encoder])
input_ids = torch.LongTensor(np.array(padded))
attention_mask = np.where(padded != 0, 1, 0)
attention_mask = torch.Tensor(attention_mask)
attention_mask_ = torch.Tensor(attention_mask_)
device = torch.device("cuda:7" if torch.cuda.is_available() else "cpu")
#model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-multilingual-cased', 'bert-base-multilingual-cased')
config_encoder = BertConfig()
config_decoder = BertConfig()
# Generation length caps for each side.
config_encoder.max_length = 1566
config_decoder.max_length = 101
config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-multilingual-cased', 'bert-base-multilingual-cased', config=config)  # initialize Bert2Bert
model.to(device)
# Overfit a single example for 10 steps (smoke test).
# NOTE(review): `lm_labels` looks like the pre-4.x transformers keyword for
# `labels` — confirm the pinned library version.
for i in range(10):
    optimizer.zero_grad()
    loss = model(input_ids=input_ids[:1].to(device), decoder_input_ids=input_ids_[:1].to(device), lm_labels=input_ids_[:1].to(device), attention_mask=attention_mask[:1].to(device), decoder_attention_mask=attention_mask_[:1].to(device))[:1]
    print(loss[0].item())
    loss[0].backward()
    optimizer.step()
def load_config(model_name_or_path):
    """Load an ``EncoderDecoderConfig`` from a checkpoint directory or hub id."""
    loaded = EncoderDecoderConfig.from_pretrained(model_name_or_path)
    return loaded
def sample_generate(top_k=50, temperature=1.0,
                    model_path='/content/BERT checkpoints/model-9.pth',
                    gpu_id=0):
    """Sample responses token-by-token from a trained BERT2BERT checkpoint on
    the saved test set, printing per-example text and corpus metrics
    (BLEU/NIST/METEOR, entropy, distinct-n, length stats)."""
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    # ------------------------LOAD MODEL-----------------
    print('load the model....')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_encoder = BertConfig.from_pretrained('bert-base-uncased')
    bert_decoder = BertConfig.from_pretrained('bert-base-uncased', is_decoder=True)
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        bert_encoder, bert_decoder)
    model = EncoderDecoderModel(config)
    model.load_state_dict(torch.load(model_path, map_location='cuda'))
    model = model.to(device)
    encoder = model.get_encoder()
    decoder = model.get_decoder()
    model.eval()
    print('load success')
    # ------------------------END LOAD MODEL--------------

    # ------------------------LOAD VALIDATE DATA------------------
    test_data = torch.load("/content/test_data.pth")
    test_dataset = TensorDataset(*test_data)
    test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=1)
    # ------------------------END LOAD VALIDATE DATA--------------

    # ------------------------START GENERETE-------------------
    update_count = 0
    bleu_2scores = 0
    bleu_4scores = 0
    nist_2scores = 0
    nist_4scores = 0
    sentences = []
    meteor_scores = 0
    print('start generating....')
    for batch in test_dataloader:
        with torch.no_grad():
            batch = [item.to(device) for item in batch]
            encoder_input, decoder_input, mask_encoder_input, _ = batch
            # Encode the context once; reuse the states for every decode step.
            past, _ = encoder(encoder_input, mask_encoder_input)
            # Seed generation with the first gold decoder token.
            prev_pred = decoder_input[:, :1]
            sentence = prev_pred
            # decoding loop: top-k sampling, at most 100 new tokens
            for i in range(100):
                logits = decoder(sentence, encoder_hidden_states=past)
                logits = logits[0][:, -1]
                logits = logits.squeeze(1) / temperature
                logits = top_k_logits(logits, k=top_k)
                probs = F.softmax(logits, dim=-1)
                prev_pred = torch.multinomial(probs, num_samples=1)
                sentence = torch.cat([sentence, prev_pred], dim=-1)
                if prev_pred[0][0] == 102:  # stop at BERT's [SEP]
                    break
            predict = tokenizer.convert_ids_to_tokens(sentence[0].tolist())
            # Strip padding (id 0) before detokenizing input and reference.
            encoder_input = encoder_input.squeeze(dim=0)
            encoder_input_num = (encoder_input != 0).sum()
            inputs = tokenizer.convert_ids_to_tokens(
                encoder_input[:encoder_input_num].tolist())
            decoder_input = decoder_input.squeeze(dim=0)
            decoder_input_num = (decoder_input != 0).sum()
            reference = tokenizer.convert_ids_to_tokens(
                decoder_input[:decoder_input_num].tolist())
            print('-' * 20 + f"example {update_count}" + '-' * 20)
            print(f"input: {' '.join(inputs)}")
            print(f"output: {' '.join(reference)}")
            print(f"predict: {' '.join(predict)}")
            # Metrics on the inner tokens only ([CLS]/[SEP] stripped).
            temp_bleu_2, \
            temp_bleu_4, \
            temp_nist_2, \
            temp_nist_4, \
            temp_meteor_scores = calculate_metrics(predict[1:-1], reference[1:-1])
            bleu_2scores += temp_bleu_2
            bleu_4scores += temp_bleu_4
            nist_2scores += temp_nist_2
            nist_4scores += temp_nist_4
            meteor_scores += temp_meteor_scores
            sentences.append(" ".join(predict[1:-1]))
            update_count += 1
    # Corpus-level statistics over all generated sentences.
    entro, dist = cal_entropy(sentences)
    mean_len, var_len = cal_length(sentences)
    print(f'avg: {mean_len}, var: {var_len}')
    print(f'entro: {entro}')
    print(f'dist: {dist}')
    print(f'test bleu_2scores: {bleu_2scores / update_count}')
    print(f'test bleu_4scores: {bleu_4scores / update_count}')
    print(f'test nist_2scores: {nist_2scores / update_count}')
    print(f'test nist_4scores: {nist_4scores / update_count}')
    print(f'test meteor_scores: {meteor_scores / update_count}')