def load_hf_model(config, pretrained=True, path=None):
    if pretrained:
        if path:
            # Load fine-tuned weights from a local checkpoint
            model = BartForConditionalGeneration.from_pretrained(
                "bart-large-cnn",
                state_dict=torch.load(path, map_location=torch.device(settings.DEVICE)),
                config=config,
            )
        else:
            model = BartForConditionalGeneration.from_pretrained("bart-large-cnn", config=config)
    else:
        # Randomly initialized model; the constructor requires a config
        model = BartForConditionalGeneration(config)
    return model.to(settings.DEVICE)
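# A minimal usage sketch for load_hf_model, assuming a `settings` module that
# defines DEVICE and that BartConfig is imported as in the other snippets.
# The checkpoint path is hypothetical:
config = BartConfig.from_pretrained("bart-large-cnn")
model = load_hf_model(config, pretrained=True, path="checkpoints/bart_finetuned.pt")
model.eval()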
def test_resize_tokens_embeddings_more(self):
    config, input_ids, _ = self._get_config_and_data()

    def _get_embs(m):
        return (m.get_input_embeddings().weight.data.clone(), m.get_output_embeddings().weight.data.clone())

    model = BartForConditionalGeneration(config).eval().to(torch_device)
    input, output = _get_embs(model)
    self.assertTrue(torch.eq(input, output).all())

    new_vocab_size = 45
    model.resize_token_embeddings(new_vocab_size)
    input_new, output_new = _get_embs(model)
    self.assertEqual(input_new.shape, (new_vocab_size, config.d_model))
    self.assertEqual(output_new.shape, (new_vocab_size, config.d_model))
    self.assertTrue(torch.eq(input_new, output_new).all())
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path):
    """
    Copy/paste/tweak the fairseq model's weights into our BART structure.
    """
    bart = torch.hub.load("pytorch/fairseq", checkpoint_path)
    bart.eval()  # disable dropout
    bart.model.upgrade_state_dict(bart.model.state_dict())
    hf_model_name = checkpoint_path.replace(".", "-")
    config = BartConfig.from_pretrained(hf_model_name)

    # Sanity check: both tokenizers must agree on the sample text.
    # encode(..., return_tensors="pt") already returns a batch dimension,
    # so no extra unsqueeze is needed on the HF side.
    tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
    tokens2 = BartTokenizer.from_pretrained(hf_model_name).encode(SAMPLE_TEXT, return_tensors="pt")
    assert torch.eq(tokens, tokens2).all()

    if checkpoint_path in ["bart.large", "bart.large.cnn"]:
        state_dict = bart.model.state_dict()
        for k in IGNORE_KEYS:
            state_dict.pop(k, None)
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        model = BartModel(config)
        their_output = bart.extract_features(tokens)
    else:  # MNLI case
        state_dict = bart.state_dict()
        for k in IGNORE_KEYS:
            state_dict.pop(k, None)
        state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
        for src, dest in rename_keys:
            rename_key(state_dict, src, dest)
        model = BartForSequenceClassification(config)
        their_output = bart.predict("mnli", tokens, return_logits=True)

    # Load state dict
    model.load_state_dict(state_dict)
    model.eval()

    # Check results
    if checkpoint_path == "bart.large.cnn":
        model = BartForConditionalGeneration(config, base_model=model)
        assert "lm_head.weight" in model.state_dict()
        # lm_head projects d_model -> vocab_size, not max_position_embeddings
        assert model.lm_head.out_features == config.vocab_size
        model.eval()
        our_outputs = model.model(tokens)[0]
    else:
        our_outputs = model(tokens)[0]
    assert their_output.shape == our_outputs.shape
    assert (their_output == our_outputs).all().item()
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
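# Hypothetical driver; the upstream conversion script wires this up through
# argparse, so the literal call below is just a sketch:
if __name__ == "__main__":
    convert_bart_checkpoint("bart.large.cnn", "./bart-large-cnn-hf")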
def test_lm_uneven_forward(self):
    config = BartConfig(
        vocab_size=self.vocab_size,
        d_model=14,
        encoder_layers=2,
        decoder_layers=2,
        encoder_attention_heads=2,
        decoder_attention_heads=2,
        encoder_ffn_dim=8,
        decoder_ffn_dim=8,
        max_position_embeddings=48,
    )
    lm_model = BartForConditionalGeneration(config).to(torch_device)
    context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device)
    summary = torch.Tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]]).long().to(torch_device)
    loss, logits, enc_features = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary)
    expected_shape = (*summary.shape, config.vocab_size)
    self.assertEqual(logits.shape, expected_shape)
def main(args):
    # If output_dir is not provided, a timestamped folder is created in pwd
    if not args.output_dir:
        args.output_dir = os.path.join("./results", f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}")
        os.makedirs(args.output_dir)
    model = SummarizationTrainer(args)

    # Stretch the pretrained position embeddings (1024 positions + 2 offset
    # rows = 1026) to a longer maximum length by tiling them.
    sd = model.model.state_dict()
    shorter_pos_embeds = sd['model.encoder.embed_positions.weight']
    new_config = model.config
    new_config.max_position_embeddings = 3076  # + 2 offset rows = 3 * 1026
    new_model = BartForConditionalGeneration(new_config)
    # Use .data so the slice assignments below bypass autograd
    correctly_shaped_pos_weight = new_model.model.encoder.embed_positions.weight.data.cuda()
    n = shorter_pos_embeds.shape[0]
    correctly_shaped_pos_weight[:n] = shorter_pos_embeds.cuda()
    correctly_shaped_pos_weight[n:2052] = shorter_pos_embeds.cuda()
    correctly_shaped_pos_weight[2052:] = shorter_pos_embeds.cuda()
    sd['model.decoder.embed_positions.weight'] = correctly_shaped_pos_weight
    sd['model.encoder.embed_positions.weight'] = correctly_shaped_pos_weight
    new_model.load_state_dict(sd, strict=True)
    model.model = new_model.cuda()

    trainer = generic_train(model, args)

    # Optionally, predict on the dev set and write to output_dir
    if args.do_predict:
        # See https://github.com/huggingface/transformers/issues/3159
        # PyTorch Lightning uses this format to create a checkpoint:
        # https://github.com/PyTorchLightning/pytorch-lightning/blob/master\
        # /pytorch_lightning/callbacks/model_checkpoint.py#L169
        checkpoints = sorted(glob.glob(os.path.join(args.output_dir, "checkpointepoch=*.ckpt"), recursive=True))
        model = model.load_from_checkpoint(checkpoints[-1])
        trainer.test(model)
def test_generate_beam_search(self):
    input_ids = torch.Tensor([[71, 82, 2], [68, 34, 2]]).long().to(torch_device)
    config = BartConfig(
        vocab_size=self.vocab_size,
        d_model=24,
        encoder_layers=2,
        decoder_layers=2,
        encoder_attention_heads=2,
        decoder_attention_heads=2,
        encoder_ffn_dim=32,
        decoder_ffn_dim=32,
        max_position_embeddings=48,
        output_past=True,
    )
    lm_model = BartForConditionalGeneration(config).to(torch_device)
    lm_model.eval()
    new_input_ids = lm_model.generate(
        input_ids.clone(),
        num_return_sequences=1,
        num_beams=2,
        no_repeat_ngram_size=3,
        max_length=5,
    )
    self.assertEqual(new_input_ids.shape, (input_ids.shape[0], 5))
def load(self):
    history = []
    learning_rate = []
    best_loss = 0.0
    model = BartForConditionalGeneration(self.config)
    model.to(self.device)
    optimizer, scheduler = self.get_optim()
    checkpoint_file = self.path + 'checkpoint.tar'
    if os.path.exists(checkpoint_file):
        # map_location keeps the load working when the checkpoint was saved
        # on a different device than the current one
        checkpoint = torch.load(checkpoint_file, map_location=self.device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        history = checkpoint['history']
        learning_rate = checkpoint['learning_rate']
        best_loss = checkpoint['best_loss']
    return model, optimizer, scheduler, history, learning_rate, best_loss
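# Sketch of resuming from the returned state (assumes `trainer` is an
# instance of the surrounding class, which is not shown here):
model, optimizer, scheduler, history, learning_rate, best_loss = trainer.load()
if history:
    print(f"Resumed after {len(history)} logged epochs, best loss {best_loss:.4f}")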
def pre_init(self, hparams):
    # Dump an empty student model at a path, then call from_pretrained on it
    teacher = BartForConditionalGeneration.from_pretrained(hparams.teacher).eval()
    student_updates = {
        "decoder_layers": hparams.student_decoder_layers,
        "encoder_layers": hparams.student_encoder_layers,
    }
    d_layers_to_copy = get_layers_to_copy(student_updates["decoder_layers"], teacher.config.decoder_layers)
    e_layers_to_copy: List = get_layers_to_copy(student_updates["encoder_layers"], teacher.config.encoder_layers)
    hparams.d_layer_to_copy = d_layers_to_copy
    hparams.e_layer_to_copy = e_layers_to_copy
    kw = teacher.config.to_diff_dict()
    kw.update(student_updates)

    # Copy weights
    student_cfg = BartConfig(**kw)
    student = BartForConditionalGeneration(student_cfg)
    student, _ = init_student(student, teacher)
    self.copy_to_student(d_layers_to_copy, e_layers_to_copy, hparams, student, teacher)
    Path(hparams.output_dir).mkdir(exist_ok=True)
    return d_layers_to_copy, student, student_cfg, teacher
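# get_layers_to_copy is not shown above; a plausible sketch that spreads the
# copied teacher layers evenly (the actual helper may choose differently):
def get_layers_to_copy(n_student_layers: int, n_teacher_layers: int) -> List[int]:
    # e.g. 3 of 12 -> [0, 6, 11]: keep the first and last layers, spread the rest
    step = (n_teacher_layers - 1) / max(n_student_layers - 1, 1)
    return [round(i * step) for i in range(n_student_layers)]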
def main():
    batch_size = 4
    vocab_size = 16384
    max_source_length = 1024
    max_target_length = 1024
    num_workers = 3
    dataset = nlp.load_dataset("iwslt2017.py", "nl-en")

    # Train tokenizer
    tokenizer_filename = "tokenizer.json"
    if os.path.exists(tokenizer_filename):
        tokenizer = Tokenizer.from_file(tokenizer_filename)
    else:
        data_filename = "whole_data.txt"
        with open(data_filename, "w") as f:
            for item in dataset["train"]:
                f.write(item["source"] + "\n")
                f.write(item["target"] + "\n\n")
        tokenizer = CharBPETokenizer()
        tokenizer.train([data_filename], vocab_size=vocab_size)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        tokenizer.add_tokens([pad_token])
        tokenizer.save(tokenizer_filename)
    tokenizer.pad_token_id = vocab_size  # [PAD] was appended after the trained vocab

    # Loaders
    train_dataset = Seq2SeqDataset(tokenizer, dataset["train"], max_source_length, max_target_length)
    val_dataset = Seq2SeqDataset(tokenizer, dataset["validation"], max_source_length, max_target_length)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=val_dataset.collate_fn,
        num_workers=num_workers,
    )

    # Train model
    config = BartConfig(
        vocab_size=vocab_size + 1,  # +1 for [PAD]
        d_model=1024,
        encoder_ffn_dim=1024,
        encoder_layers=6,
        encoder_attention_heads=4,
        decoder_ffn_dim=1024,
        decoder_layers=6,
        decoder_attention_heads=4,
    )
    model = BartForConditionalGeneration(config)
    translator = Translate(model, tokenizer)
    trainer = pl.Trainer(gpus=1)
    trainer.fit(translator, train_loader, val_loader)
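# After training, a hypothetical inference sketch (assumes access to the
# `model` and `tokenizer` built in main, and that `torch` is imported;
# the Dutch sample and generation settings are illustrative):
sample = "Dit is een testzin."
ids = torch.tensor([tokenizer.encode(sample).ids])
generated = model.generate(ids, max_length=64, num_beams=4, early_stopping=True)
print(tokenizer.decode(generated[0].tolist()))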
def test_generate_fp16(self):
    config, input_ids, batch_size = self._get_config_and_data(output_past=True)
    attention_mask = input_ids.ne(1)
    lm_model = BartForConditionalGeneration(config).eval().to(torch_device).half()
    # attention_mask must be passed by keyword; positionally it would be
    # interpreted as max_length
    lm_model.generate(input_ids, attention_mask=attention_mask)
# %%
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

# %%
# Extend the position encoding when src_max_length is larger than 1024.
# Get the original model
model = BartForConditionalGeneration.from_pretrained('bart-large-cnn')
sd = model.state_dict()
shorter_pos_embeds = sd['model.encoder.embed_positions.weight']  # 1024 positions + 2 offset rows
new_config = model.config
new_config.max_position_embeddings = 2048  # embedding table grows to 2048 + 2 = 2050 rows
new_model = BartForConditionalGeneration(new_config)
# Work on .data so the slice assignments below do not trip autograd's
# in-place-on-leaf-Parameter restriction
correctly_shaped_pos_weight = new_model.model.encoder.embed_positions.weight.data
print(correctly_shaped_pos_weight)

# %%
n = shorter_pos_embeds.shape[0]  # 1026
correctly_shaped_pos_weight[:n] = shorter_pos_embeds
# Tile the remaining 1024 rows with the pretrained weights, skipping the 2 offset rows
correctly_shaped_pos_weight[n:] = shorter_pos_embeds[2:, :]

# %%
sd['model.decoder.embed_positions.weight'] = correctly_shaped_pos_weight.clone()
sd['model.encoder.embed_positions.weight'] = correctly_shaped_pos_weight.clone()
new_model.load_state_dict(sd, strict=True)
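# %%
# Quick shape check with an input longer than the original 1024-position limit.
# The random ids are purely illustrative; the tiled embedding rows past 1026
# reuse pretrained weights and would still need fine-tuning to be useful.
long_input_ids = torch.randint(0, new_config.vocab_size, (1, 1536))
with torch.no_grad():
    logits = new_model(input_ids=long_input_ids)[0]
print(logits.shape)  # (1, 1536, vocab_size)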
def __init__(
    self,
    model_name_or_path,  # teacher
    tokenizer_name,
    model_cache_dir,
    input_max_length,
    target_max_length,
    summary_column_name,
    document_column_name,
    wandb_project,
    wandb_run_name,
    student_encoder_layers,
    student_decoder_layers,
    **kwargs,
):
    super().__init__(
        input_max_length,
        target_max_length,
        summary_column_name,
        document_column_name,
        wandb_project,
        wandb_run_name,
    )
    self.tokenizer = BartTokenizer.from_pretrained(
        tokenizer_name if tokenizer_name else model_name_or_path,
        cache_dir=model_cache_dir,
    )
    teacher = BartForConditionalGeneration.from_pretrained(
        model_name_or_path,
        cache_dir=model_cache_dir,
    ).eval()

    student_updates = {
        "decoder_layers": student_decoder_layers,
        "encoder_layers": student_encoder_layers,
    }
    d_layers_to_copy = self._get_layers_to_copy(student_updates["decoder_layers"], teacher.config.decoder_layers)
    e_layers_to_copy: List = self._get_layers_to_copy(student_updates["encoder_layers"], teacher.config.encoder_layers)
    kw = teacher.config.to_diff_dict()
    kw.update(student_updates)

    # Copy weights
    student_cfg = BartConfig(**kw)
    student = BartForConditionalGeneration(student_cfg)
    student, _ = self._init_student(student, teacher)
    self._copy_to_student(d_layers_to_copy, e_layers_to_copy, student_encoder_layers, student_decoder_layers, student, teacher)
    self.model = student
    print(student)

    inputs = self.tokenizer.encode_plus("TEXT TO SUMMARIZE", max_length=1024, return_tensors="pt")
    # Summarize
    outputs = self.model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=400,
        min_length=150,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    # Decode
    summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    print(summary)
def test_generate_fp16(self):
    config, input_ids, batch_size = self._get_config_and_data(output_past=True)
    attention_mask = input_ids.ne(1).to(torch_device)
    model = BartForConditionalGeneration(config).eval().to(torch_device).half()
    model.generate(input_ids, attention_mask=attention_mask, do_sample=False, early_stopping=True)
def test_base_model_fp16(self):
    config, input_ids, batch_size = self._get_config_and_data()
    attention_mask = input_ids.ne(1).to(torch_device)
    lm_model = BartForConditionalGeneration(config).eval().to(torch_device).half()
    lm_model(input_ids, attention_mask=attention_mask)
        tot_val_loss += lang_loss * len(inputs['input_ids'])
        n_val += len(inputs['input_ids'])
    print("n_val", n_val)
    avg_val_loss = tot_val_loss.item() / n_val
    return n_val, avg_val_loss


tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
if pretrained:
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-base', dropout=args.dropout)
else:
    config = BartConfig.from_pretrained('facebook/bart-base')
    config.dropout = args.dropout
    model = BartForConditionalGeneration(config)
model.to(DEVICE)
optimizer = AdamW(list(model.parameters()), lr=args.lr)
print("Loaded model")

# Load data
dataset = load_data(args.data, ["walkthrough0"] + [f"randcmd{i}" for i in range(100)], tokenizer, max_seq_len, max_data_size=4000)
print("Loaded train data")
dev_dataset = load_data(args.data, [f"randcmd{i}" for i in range(100, 200)], tokenizer, max_seq_len, max_data_size=500)
print("Loaded dev data")

# Initial eval
print("Initial eval")
n_val, avg_val_loss = eval_model(args, model, dev_dataset, tokenizer, eval_batchsize)
print(f"INIT, avg val loss: {avg_val_loss}")
best_val_loss = avg_val_loss
def test_default_generate_kwargs(self):
    config, input_ids, _ = self._get_config_and_data(output_past=True)
    model = BartForConditionalGeneration(config).eval().to(torch_device)
    model.generate(input_ids)
    model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
def test_dummy_inputs(self):
    config, *_ = self._get_config_and_data()
    model = BartForConditionalGeneration(config).eval().to(torch_device)
    model(**model.dummy_inputs)
def __init__(
    self,
    pretrained_model=None,
    additional_special_tokens_encoder=None,
    additional_special_tokens_decoder=None,
    model_config=None,
    vocab_file=None,
    args=None,
    use_cuda=True,
    cuda_device=-1,
    **kwargs,
):
    self.args = self._load_model_args()
    if isinstance(args, dict):
        self.args.update_from_dict(args)
    elif isinstance(args, Seq2SeqArgs):
        self.args = args

    if "sweep_config" in kwargs:
        self.is_sweeping = True
        sweep_config = kwargs.pop("sweep_config")
        sweep_values = sweep_config_to_sweep_values(sweep_config)
        self.args.update_from_dict(sweep_values)
    else:
        self.is_sweeping = False

    if self.args.manual_seed:
        random.seed(self.args.manual_seed)
        np.random.seed(self.args.manual_seed)
        torch.manual_seed(self.args.manual_seed)
        if self.args.n_gpu > 0:
            torch.cuda.manual_seed_all(self.args.manual_seed)

    if use_cuda:
        if torch.cuda.is_available():
            if cuda_device == -1:
                self.device = torch.device("cuda")
            else:
                self.device = torch.device(f"cuda:{cuda_device}")
        else:
            raise ValueError(
                "'use_cuda' set to True when cuda is unavailable. "
                "Make sure CUDA is available or set `use_cuda=False`.")
    else:
        self.device = "cpu"

    self.results = {}
    if not use_cuda:
        self.args.fp16 = False

    # BartConfig, BartForConditionalGeneration, BartTokenizer
    # config = EncoderDecoderConfig.from_encoder_decoder_configs(config, config)
    model_config = BartConfig.from_json_file(model_config)
    if pretrained_model is None:
        self.model = BartForConditionalGeneration(config=model_config)
        self.encoder_tokenizer = BartTokenizer.from_pretrained(vocab_file)
    else:
        self.model = BartForConditionalGeneration.from_pretrained(pretrained_model)
        self.encoder_tokenizer = BartTokenizer.from_pretrained(vocab_file)
    self.decoder_tokenizer = self.encoder_tokenizer

    # Special AST tokens, e.g.:
    # additional_special_tokens_encoder = {'additional_special_tokens': ['Assertion', 'RegExp', 'Repetition', 'Quantifier', 'ClassRange', 'CharacterClass']}
    # additional_special_tokens_decoder = {'additional_special_tokens': ['Assertion', 'RegExp', 'Repetition', 'Quantifier', 'ClassRange', 'CharacterClass']}
    self.config = self.model.config
    if additional_special_tokens_encoder is not None:
        self.encoder_tokenizer.add_special_tokens(additional_special_tokens_encoder)
    if additional_special_tokens_decoder is not None:
        self.decoder_tokenizer.add_special_tokens(additional_special_tokens_decoder)

    if self.args.wandb_project and not wandb_available:
        warnings.warn("wandb_project specified but wandb is not available. Wandb disabled.")
        self.args.wandb_project = None

    self.args.model_type = 'bart'
    self.args.model_name = 'ExplainREGEX'