Example #1
def load_hf_model(config, pretrained=True, path=None):
    if pretrained:
        if path:
            model = BartForConditionalGeneration.from_pretrained(
                "bart-large-cnn",
                state_dict=torch.load(path,
                                      map_location=torch.device(
                                          settings.DEVICE)),
                config=config)
        else:
            model = BartForConditionalGeneration.from_pretrained(
                "bart-large-cnn", config=config)
    else:
        model = BartForConditionalGeneration(config)  # randomly initialized weights

    return model.to(settings.DEVICE)
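load_hf_model relies on a project-specific settings module for the target device. A self-contained sketch of the same loading pattern, where the device handling and the model name are assumptions rather than part of the original:

import torch
from transformers import BartConfig, BartForConditionalGeneration

# Stand-in for the example's settings.DEVICE.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = BartConfig.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn", config=config)
model = model.to(DEVICE)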
Example #2
    def test_resize_tokens_embeddings_more(self):
        config, input_ids, _ = self._get_config_and_data()

        def _get_embs(m):
            return (m.get_input_embeddings().weight.data.clone(),
                    m.get_output_embeddings().weight.data.clone())

        model = BartForConditionalGeneration(config).eval().to(torch_device)
        input, output = _get_embs(model)
        self.assertTrue(torch.eq(input, output).all())
        new_vocab_size = 45
        model.resize_token_embeddings(new_vocab_size)
        input_new, output_new = _get_embs(model)
        self.assertEqual(input_new.shape, (new_vocab_size, config.d_model))
        self.assertEqual(output_new.shape, (new_vocab_size, config.d_model))
        self.assertTrue(torch.eq(input_new, output_new).all())
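A self-contained sketch of the behaviour this test checks, using a tiny randomly initialized config so nothing needs to be downloaded; all sizes here are arbitrary assumptions:

import torch
from transformers import BartConfig, BartForConditionalGeneration

config = BartConfig(vocab_size=50, d_model=16, encoder_layers=1, decoder_layers=1,
                    encoder_attention_heads=2, decoder_attention_heads=2,
                    encoder_ffn_dim=32, decoder_ffn_dim=32, max_position_embeddings=64)
model = BartForConditionalGeneration(config)

model.resize_token_embeddings(45)
# Both embedding matrices are resized and remain identical (input and output embeddings are tied).
assert model.get_input_embeddings().weight.shape == (45, config.d_model)
assert torch.equal(model.get_input_embeddings().weight,
                   model.get_output_embeddings().weight)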
Example #3
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our BART structure.
    """
    bart = torch.hub.load("pytorch/fairseq", checkpoint_path)
    bart.eval()  # disable dropout
    bart.model.upgrade_state_dict(bart.model.state_dict())
    hf_model_name = checkpoint_path.replace(".", "-")
    config = BartConfig.from_pretrained(hf_model_name)
    tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
    tokens2 = BartTokenizer.from_pretrained(hf_model_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
    assert torch.eq(tokens, tokens2).all()

    if checkpoint_path in ["bart.large", "bart.large.cnn"]:
        state_dict = bart.model.state_dict()
        for k in IGNORE_KEYS:
            state_dict.pop(k, None)
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        model = BartModel(config)
        their_output = bart.extract_features(tokens)
    else:  # MNLI Case
        state_dict = bart.state_dict()
        for k in IGNORE_KEYS:
            state_dict.pop(k, None)
        state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
        for src, dest in rename_keys:
            rename_key(state_dict, src, dest)
        model = BartForSequenceClassification(config)
        their_output = bart.predict("mnli", tokens, return_logits=True)

    # Load state dict
    model.load_state_dict(state_dict)
    model.eval()
    # Check results

    if checkpoint_path == "bart.large.cnn":
        model = BartForConditionalGeneration(config, base_model=model)
        assert "lm_head.weight" in model.state_dict()
        assert model.lm_head.out_features == config.vocab_size
        model.eval()
        our_outputs = model.model(tokens)[0]
    else:
        our_outputs = model(tokens)[0]
    assert their_output.shape == our_outputs.shape
    assert (their_output == our_outputs).all().item()
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
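A sketch of how the converter above might be invoked; it needs the fairseq package and network access for torch.hub, and the output folder name is an arbitrary choice, not part of the original script:

# Hypothetical invocation of the converter defined above; the output path is an assumption.
convert_bart_checkpoint("bart.large.cnn", "./bart-large-cnn-converted")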
Example #4
 def test_lm_uneven_forward(self):
     config = BartConfig(
         vocab_size=self.vocab_size,
         d_model=14,
         encoder_layers=2,
         decoder_layers=2,
         encoder_attention_heads=2,
         decoder_attention_heads=2,
         encoder_ffn_dim=8,
         decoder_ffn_dim=8,
         max_position_embeddings=48,
     )
     lm_model = BartForConditionalGeneration(config).to(torch_device)
     context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device)
     summary = torch.Tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]]).long().to(torch_device)
     loss, logits, enc_features = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary)
     expected_shape = (*summary.shape, config.vocab_size)
     self.assertEqual(logits.shape, expected_shape)
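This old test feeds the same tensor as both decoder_input_ids and labels. In ordinary training the decoder inputs are the labels shifted right by one position; a sketch of that usual pattern, written out here rather than imported from transformers internals:

import torch

def shift_right(labels, decoder_start_token_id, pad_token_id):
    # Standard teacher forcing: the decoder sees <start> followed by labels[:-1].
    shifted = labels.new_full(labels.shape, pad_token_id)
    shifted[:, 1:] = labels[:, :-1].clone()
    shifted[:, 0] = decoder_start_token_id
    return shifted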
Example #5
def main(args):

    # If output_dir not provided, a folder will be generated in pwd
    if not args.output_dir:
        args.output_dir = os.path.join(
            "./results",
            f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",
        )
        os.makedirs(args.output_dir)
    model = SummarizationTrainer(args)
    sd = model.model.state_dict()
    shorter_pos_embeds = sd['model.encoder.embed_positions.weight']
    new_config = model.config
    new_config.max_position_embeddings = 3076
    new_model = BartForConditionalGeneration(new_config)
    correctly_shaped_pos_weight = new_model.model.encoder.embed_positions.weight.cuda()
    correctly_shaped_pos_weight[:shorter_pos_embeds.shape[0]] = shorter_pos_embeds.cuda()
    correctly_shaped_pos_weight[shorter_pos_embeds.shape[0]:2052] = shorter_pos_embeds.cuda()
    correctly_shaped_pos_weight[2052:] = shorter_pos_embeds.cuda()
    sd['model.decoder.embed_positions.weight'] = correctly_shaped_pos_weight
    sd['model.encoder.embed_positions.weight'] = correctly_shaped_pos_weight
    new_model.load_state_dict(sd, strict=True)
    model.model = new_model.cuda()
    trainer = generic_train(model, args)

    # Optionally, predict on dev set and write to output_dir
    if args.do_predict:
        # See https://github.com/huggingface/transformers/issues/3159
        # pl use this format to create a checkpoint:
        # https://github.com/PyTorchLightning/pytorch-lightning/blob/master\
        # /pytorch_lightning/callbacks/model_checkpoint.py#L169
        checkpoints = list(
            sorted(
                glob.glob(os.path.join(args.output_dir,
                                       "checkpointepoch=*.ckpt"),
                          recursive=True)))
        model = model.load_from_checkpoint(checkpoints[-1])
        trainer.test(model)
Example #6
    def test_generate_beam_search(self):
        input_ids = torch.Tensor([[71, 82, 2], [68, 34, 2]]).long().to(torch_device)
        config = BartConfig(
            vocab_size=self.vocab_size,
            d_model=24,
            encoder_layers=2,
            decoder_layers=2,
            encoder_attention_heads=2,
            decoder_attention_heads=2,
            encoder_ffn_dim=32,
            decoder_ffn_dim=32,
            max_position_embeddings=48,
            output_past=True,
        )
        lm_model = BartForConditionalGeneration(config).to(torch_device)
        lm_model.eval()

        new_input_ids = lm_model.generate(
            input_ids.clone(), num_return_sequences=1, num_beams=2, no_repeat_ngram_size=3, max_length=5
        )
        self.assertEqual(new_input_ids.shape, (input_ids.shape[0], 5))
Example #7
    def load(self):

        history = []
        learning_rate = []
        best_loss = .0

        model = BartForConditionalGeneration(self.config)
        model.to(self.device)
        optimizer, scheduler = self.get_optim()

        check_file = os.path.exists(self.path + 'checkpoint.tar')
        if check_file:
            checkpoint = torch.load(self.path + 'checkpoint.tar')

            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
            history = checkpoint['history']
            learning_rate = checkpoint['learning_rate']
            best_loss = checkpoint['best_loss']

        return model, optimizer, scheduler, history, learning_rate, best_loss
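load expects a checkpoint.tar containing specific keys; a sketch of the matching save step, using exactly the keys read above (the function itself is an assumption, not part of the original class):

import torch

def save_checkpoint(path, model, optimizer, scheduler, history, learning_rate, best_loss):
    # Persist everything that load() reads back, under the same keys.
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'history': history,
        'learning_rate': learning_rate,
        'best_loss': best_loss,
    }, path + 'checkpoint.tar')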
Example #8
 def pre_init(self, hparams):
     # Dump empty student model at a path, then call from_pretrained on it
     teacher = BartForConditionalGeneration.from_pretrained(
         hparams.teacher).eval()
     student_updates = {
         "decoder_layers": hparams.student_decoder_layers,
         "encoder_layers": hparams.student_encoder_layers,
     }
     d_layers_to_copy = get_layers_to_copy(
         student_updates["decoder_layers"], teacher.config.decoder_layers)
     e_layers_to_copy: List = get_layers_to_copy(
         student_updates["encoder_layers"], teacher.config.encoder_layers)
     hparams.d_layer_to_copy = d_layers_to_copy
     hparams.e_layer_to_copy = e_layers_to_copy
     kw = teacher.config.to_diff_dict()
     kw.update(student_updates)
     # Copy weights
     student_cfg = BartConfig(**kw)
     student = BartForConditionalGeneration(student_cfg)
     student, _ = init_student(student, teacher)
     self.copy_to_student(d_layers_to_copy, e_layers_to_copy, hparams,
                          student, teacher)
     Path(hparams.output_dir).mkdir(exist_ok=True)
     return d_layers_to_copy, student, student_cfg, teacher
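get_layers_to_copy is referenced but not shown in this excerpt. A plausible sketch, assuming it simply returns evenly spaced teacher layer indices when the student is shallower:

def get_layers_to_copy(n_student_layers, n_teacher_layers):
    # Hypothetical reconstruction: pick evenly spaced teacher layers to initialize the student.
    if n_student_layers == n_teacher_layers:
        return list(range(n_teacher_layers))
    step = n_teacher_layers / n_student_layers
    return [int(i * step) for i in range(n_student_layers)]

# e.g. get_layers_to_copy(3, 12) -> [0, 4, 8]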
Example #9
def main():
    batch_size = 4
    vocab_size = 16384
    max_source_length = 1024
    max_target_length = 1024
    num_workers = 3

    dataset = nlp.load_dataset("iwslt2017.py", "nl-en")

    # Train tokenizer
    tokenizer_filename = "tokenizer.json"
    if os.path.exists(tokenizer_filename):
        tokenizer = Tokenizer.from_file(tokenizer_filename)
    else:
        data_filename = "whole_data.txt"
        with open(data_filename, "w") as f:
            for item in dataset["train"]:
                f.write(item["source"] + "\n")
                f.write(item["target"] + "\n\n")

        tokenizer = CharBPETokenizer()
        tokenizer.train([data_filename], vocab_size=vocab_size)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        tokenizer.add_tokens([pad_token])
        tokenizer.save(tokenizer_filename)

    tokenizer.pad_token_id = vocab_size

    # Loaders
    train_dataset = Seq2SeqDataset(tokenizer, dataset["train"],
                                   max_source_length, max_target_length)
    val_dataset = Seq2SeqDataset(tokenizer, dataset["validation"],
                                 max_source_length, max_target_length)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=val_dataset.collate_fn,
        num_workers=num_workers,
    )

    # Train model
    config = BartConfig(
        vocab_size=vocab_size + 1,  # Pad
        d_model=1024,
        encoder_ffn_dim=1024,
        encoder_layers=6,
        encoder_attention_heads=4,
        decoder_ffn_dim=1024,
        decoder_layers=6,
        decoder_attention_heads=4,
    )
    model = BartForConditionalGeneration(config)
    translator = Translate(model, tokenizer)

    trainer = pl.Trainer(gpus=1)
    trainer.fit(translator, train_loader, val_loader)
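Seq2SeqDataset and Translate are project-specific classes that are not shown in this excerpt. A minimal sketch of the kind of collate_fn such a dataset might expose, padding source and target ids to the longest sequence in the batch; everything in it is an assumption, not the original implementation:

import torch

def collate_fn(batch, pad_token_id):
    # batch: list of (source_ids, target_ids) pairs of 1-D LongTensors with varying lengths.
    def pad(seqs):
        out = torch.full((len(seqs), max(len(s) for s in seqs)), pad_token_id, dtype=torch.long)
        for i, s in enumerate(seqs):
            out[i, :len(s)] = s
        return out

    sources, targets = zip(*batch)
    return {"input_ids": pad(sources), "labels": pad(targets)}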
Example #10
 def test_generate_fp16(self):
     config, input_ids, batch_size = self._get_config_and_data(output_past=True)
     attention_mask = input_ids.ne(1)
     lm_model = BartForConditionalGeneration(config).eval().to(torch_device).half()
     lm_model.generate(input_ids, attention_mask=attention_mask)
Example #11
# %%
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

# %%
# change the position encoding if src_max_length is larger than 1024
# Get original model
model = BartForConditionalGeneration.from_pretrained('bart-large-cnn')
sd = model.state_dict()

shorter_pos_embeds = sd['model.encoder.embed_positions.weight']  # 1024 + 2 embeddings

new_config = model.config
new_config.max_position_embeddings = 2048  # 2048 -> 2050
new_model = BartForConditionalGeneration(new_config)

correctly_shaped_pos_weight = new_model.model.encoder.embed_positions.weight
print(correctly_shaped_pos_weight)

# %%
for i in range(1):
    correctly_shaped_pos_weight[i * shorter_pos_embeds.shape[0]:
                                (i + 1) * shorter_pos_embeds.shape[0]] = shorter_pos_embeds

correctly_shaped_pos_weight[1 * shorter_pos_embeds.shape[0]:] = shorter_pos_embeds[2:, :]
# %%
sd['model.decoder.embed_positions.weight'] = torch.tensor(correctly_shaped_pos_weight.data)

sd['model.encoder.embed_positions.weight'] = torch.tensor(correctly_shaped_pos_weight.data)

new_model.load_state_dict(sd, strict=True)
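Depending on the PyTorch version, assigning into a slice of a Parameter that requires grad can raise an in-place-operation error, so the copy is usually done under torch.no_grad(). A sketch of the same extension written that way; the function name and tiling details are choices of this sketch, not the original:

import torch
from transformers import BartForConditionalGeneration

def extend_position_embeddings(model, new_max_positions):
    # Grow BART's learned position embeddings by tiling the pretrained rows.
    old = model.model.encoder.embed_positions.weight.data  # (old_positions + 2, d_model)
    cfg = model.config
    cfg.max_position_embeddings = new_max_positions
    new_model = BartForConditionalGeneration(cfg)
    with torch.no_grad():
        for module in (new_model.model.encoder, new_model.model.decoder):
            emb = module.embed_positions.weight
            for start in range(0, emb.shape[0], old.shape[0]):
                n = min(old.shape[0], emb.shape[0] - start)
                emb[start:start + n] = old[:n]
    return new_model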
Example #12
    def __init__(
        self,
        model_name_or_path, # teacher
        tokenizer_name,
        model_cache_dir,
        input_max_length,
        target_max_length,
        summary_column_name,
        document_column_name,
        wandb_project,
        wandb_run_name,
        student_encoder_layers,
        student_decoder_layers,
        **kwargs,
    ):
        super().__init__(
            input_max_length,
            target_max_length,
            summary_column_name,
            document_column_name,
            wandb_project,
            wandb_run_name,
        )
        self.tokenizer = BartTokenizer.from_pretrained(
            tokenizer_name if tokenizer_name else model_name_or_path,
            cache_dir=model_cache_dir,
        )
        teacher = BartForConditionalGeneration.from_pretrained(
            model_name_or_path, cache_dir=model_cache_dir,
        ).eval()

        student_updates = {
            "decoder_layers": student_decoder_layers,
            "encoder_layers": student_encoder_layers,
        }
        d_layers_to_copy = self._get_layers_to_copy(student_updates["decoder_layers"], teacher.config.decoder_layers)
        e_layers_to_copy: List = self._get_layers_to_copy(student_updates["encoder_layers"], teacher.config.encoder_layers)
        kw = teacher.config.to_diff_dict()
        kw.update(student_updates)
        # Copy weights
        student_cfg = BartConfig(**kw)
        student = BartForConditionalGeneration(student_cfg)
        student, _ = self._init_student(student, teacher)
        self._copy_to_student(d_layers_to_copy, e_layers_to_copy, student_encoder_layers, student_decoder_layers, student, teacher)
        self.model = student
        print(student)
        inputs = self.tokenizer.encode_plus("TEXT TO SUMMARIZE", max_length=1024, return_tensors="pt")

        # Summarize
        outputs = self.model.generate(
            input_ids=inputs['input_ids'], 
            attention_mask=inputs['attention_mask'], 
            max_length=400, 
            min_length=150, 
            length_penalty=2.0, 
            num_beams=4, 
            early_stopping=True
        )

        # Decode
        summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
        print(summary)
Example #13
 def test_generate_fp16(self):
     config, input_ids, batch_size = self._get_config_and_data(output_past=True)
     attention_mask = input_ids.ne(1).to(torch_device)
     model = BartForConditionalGeneration(config).eval().to(torch_device).half()
     model.generate(input_ids, attention_mask=attention_mask, do_sample=False, early_stopping=True)
Example #14
 def test_base_model_fp16(self):
     config, input_ids, batch_size = self._get_config_and_data()
     attention_mask = input_ids.ne(1).to(torch_device)
     lm_model = BartForConditionalGeneration(config).eval().to(
         torch_device).half()
     lm_model(input_ids, attention_mask=attention_mask)
Example #15
            tot_val_loss += lang_loss * len(inputs['input_ids'])
            n_val += len(inputs['input_ids'])

    print("n_val", n_val)
    avg_val_loss = tot_val_loss.item() / n_val
    return n_val, avg_val_loss


tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
if pretrained:
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-base', dropout=args.dropout)
else:
    config = BartConfig.from_pretrained('facebook/bart-base')
    config.dropout = args.dropout
    model = BartForConditionalGeneration(config)
model.to(DEVICE)
optimizer = AdamW(list(model.parameters()), lr=args.lr)
print("Loaded model")

# TODO load data
dataset = load_data(args.data, ["walkthrough0"] + [f"randcmd{i}" for i in range(100)], tokenizer, max_seq_len, max_data_size=4000)
print("Loaded train data")
dev_dataset = load_data(args.data, [f"randcmd{i}" for i in range(100,200)], tokenizer, max_seq_len, max_data_size=500)
print("Loaded dev data")

# initial eval
print("Initial eval")
n_val, avg_val_loss = eval_model(args, model, dev_dataset, tokenizer, eval_batchsize)
print(f"INIT, avg val loss: {avg_val_loss}")
best_val_loss = avg_val_loss
Example #16
 def test_default_generate_kwargs(self):
     config, input_ids, _ = self._get_config_and_data(output_past=True)
     model = BartForConditionalGeneration(config).eval().to(torch_device)
     model.generate(input_ids)
     model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
Example #17
 def test_dummy_inputs(self):
     config, *_ = self._get_config_and_data()
     model = BartForConditionalGeneration(config).eval().to(torch_device)
     model(**model.dummy_inputs)
Example #18
    def __init__(
        self,
        pretrained_model=None,
        additional_special_tokens_encoder=None,
        additional_special_tokens_decoder=None,
        model_config=None,
        vocab_file=None,
        args=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):
        self.args = self._load_model_args()
        if isinstance(args, dict):
            self.args.update_from_dict(args)
        elif isinstance(args, Seq2SeqArgs):
            self.args = args

        if "sweep_config" in kwargs:
            self.is_sweeping = True
            sweep_config = kwargs.pop("sweep_config")
            sweep_values = sweep_config_to_sweep_values(sweep_config)
            self.args.update_from_dict(sweep_values)
        else:
            self.is_sweeping = False

        if self.args.manual_seed:
            random.seed(self.args.manual_seed)
            np.random.seed(self.args.manual_seed)
            torch.manual_seed(self.args.manual_seed)
            if self.args.n_gpu > 0:
                torch.cuda.manual_seed_all(self.args.manual_seed)

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable."
                    "Make sure CUDA is available or set `use_cuda=False`.")
        else:
            self.device = "cpu"

        self.results = {}

        if not use_cuda:
            self.args.fp16 = False

        # BartConfig, BartForConditionalGeneration, BartTokenizer
        # config = EncoderDecoderConfig.from_encoder_decoder_configs(config, config)
        model_config = BartConfig.from_json_file(model_config)
        if pretrained_model is None:
            self.model = BartForConditionalGeneration(config=model_config)
            self.encoder_tokenizer = BartTokenizer.from_pretrained(vocab_file)

        else:
            self.model = BartForConditionalGeneration.from_pretrained(
                pretrained_model)
            self.encoder_tokenizer = BartTokenizer.from_pretrained(vocab_file)
        self.decoder_tokenizer = self.encoder_tokenizer

        # special AST token
        # additional_special_tokens_encoder = {'additional_special_tokens': ['Assertion', 'RegExp', 'Repetition', 'Quantifier',  'ClassRange', 'CharacterClass']}
        # additional_special_tokens_decoder = {'additional_special_tokens': ['Assertion', 'RegExp', 'Repetition', 'Quantifier',  'ClassRange', 'CharacterClass']}

        self.config = self.model.config

        if additional_special_tokens_encoder is not None:
            self.encoder_tokenizer.add_special_tokens(
                additional_special_tokens_encoder)

        if additional_special_tokens_decoder is not None:
            self.decoder_tokenizer.add_special_tokens(
                additional_special_tokens_decoder)

        if self.args.wandb_project and not wandb_available:
            warnings.warn(
                "wandb_project specified but wandb is not available. Wandb disabled."
            )
            self.args.wandb_project = None

        self.args.model_type = 'bart'
        self.args.model_name = 'ExplainREGEX'
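The class this __init__ belongs to is not named in the excerpt; assuming a wrapper class along the lines of Seq2SeqModel, construction might look like the following, where the class name, paths, and argument values are all assumptions:

# Hypothetical usage of the wrapper whose __init__ is shown above.
model = Seq2SeqModel(
    pretrained_model="facebook/bart-base",
    model_config="config.json",
    vocab_file="facebook/bart-base",
    args={"manual_seed": 42},
    use_cuda=False,
)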