Code example #1
    def load_tokenizer(self):
        if self.model in ["xlm-roberta-base"]:
            return XLMRobertaTokenizer.from_pretrained(self.model,
                                                       strip_accents=False)
        elif self.model in ['bert-base-multilingual-cased']:
            return BertTokenizer.from_pretrained(self.model,
                                                 strip_accents=False)
        elif self.model in ['facebook/mbart-large-cc25']:
            return MBartTokenizer.from_pretrained('facebook/mbart-large-cc25',
                                                  strip_accents=False)
Code example #2
    def __init__(
        self,
        hparams: Namespace,
    ):
        super().__init__()
        self.hparams = hparams

        self.tokenizer = MBartTokenizer.from_pretrained(
            self.hparams.model_checkpoint)
        self.model = MBartForConditionalGeneration.from_pretrained(
            self.hparams.model_checkpoint)
Code example #3
File: run_evalden.py  Project: Eymen3455/transformers
def generate_summaries_or_translations(
    examples: List[str],
    out_file: str,
    model_name: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=False,
    task="summarization",
    prefix=None,
    **generate_kwargs,
) -> Dict:
    """Save model.generate results to <out_file>, and return how long it took."""
    fout = Path(out_file).open("w", encoding="utf-8")
    model_name = str(model_name)
    #model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    model = MBartForConditionalGeneration.from_pretrained(model_name).to(
        device)
    if fp16:
        model = model.half()

    tokenizer = MBartTokenizer.from_pretrained(model_name)
    #tokenizer = AutoTokenizer.from_pretrained(model_name)
    #logger.info(f"Inferred tokenizer type: {tokenizer.__class__}")  # if this is wrong, check config.model_type.

    start_time = time.time()
    # update config with task specific params
    use_task_specific_params(model, task)
    if prefix is None:
        prefix = prefix or getattr(model.config, "prefix", "") or ""
    for examples_chunk in tqdm(list(chunks(examples, batch_size))):
        examples_chunk = [prefix + text for text in examples_chunk]
        batch = tokenizer(examples_chunk,
                          return_tensors="pt",
                          truncation=True,
                          padding="longest").to(device)
        summaries = model.generate(
            input_ids=batch.input_ids,
            attention_mask=batch.attention_mask,
            #**generate_kwargs,
        )
        dec = tokenizer.batch_decode(summaries,
                                     skip_special_tokens=True,
                                     clean_up_tokenization_spaces=False)
        for hypothesis in dec:
            fout.write(hypothesis + "\n")
            fout.flush()
    fout.close()
    runtime = int(time.time() - start_time)  # seconds
    n_obs = len(examples)
    return dict(n_obs=n_obs,
                runtime=runtime,
                seconds_per_sample=round(runtime / n_obs, 4))
Code example #4
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_path = os.path.join(bolt.ARTIFACT_DIR, 'MBart_translation.pt')

    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
    model = MBartForConditionalGeneration.from_pretrained(
        'facebook/mbart-large-cc25')
    print("loading model")
    model.load_state_dict(torch.load(model_path))
    print("model loaded")
    sentences_lst = "i love you"

    result = translate(sentences_lst, tokenizer, model, 3, device)
    print(result)
Code example #5
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
    model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')

    # example_english_phrase = ["I love you", 'you hate i']
    # expected_translation_chinese = ["我中意你", '你憎我']
    print("Loading and processing data")
    en, yue = read_file("../MARIAN/en2yue/train.en", "../MARIAN/en2yue/train.yue")
    val_en, val_yue = read_file("../MARIAN/en2yue/val.en", '../MARIAN/en2yue/val.yue')


    train_dataset = token_(tokenizer, en, yue)
    loader = create_data_loader(train_dataset, 8)

    val_dataset = token_(tokenizer, val_en, val_yue)
    val_loader = create_data_loader(val_dataset, 8)

    EPOCHS = 10
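    # AdamW optimizer with a linear learning-rate schedule (num_warmup_steps=0, so the rate simply decays over total_steps)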
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    resultdir = bolt.ARTIFACT_DIR
    MODEL_SAVE_PATH = os.path.join(resultdir, 'MBart_translation.pt')

    print("Start training")

    best_val_loss = float('inf')
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 30)
        train_loss = train_epoch(model, loader, optimizer, scheduler, device)
        val_loss = evaluate_epoch(model, val_loader, device)
        print(f'Train_loss: {train_loss} | Val_loss: {val_loss}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), MODEL_SAVE_PATH)

        bolt.send_metrics({
            "Total_train_loss": train_loss,
            "Total_val_loss": val_loss
        })
Code example #6
def test_mbart_dataset_truncation():
    tokenizer = MBartTokenizer.from_pretrained(MBART_TINY)
    tmp_dir = make_test_data_dir()
    max_len_source = max(len(tokenizer.encode(a)) for a in ARTICLES)
    max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
    max_src_len = 4
    max_tgt_len = 8
    assert max_len_target > max_src_len  # Truncated
    assert max_len_source > max_src_len
    src_lang, tgt_lang = "ro_RO", "de_DE"  # NOT WHAT IT WAS TRAINED ON
    train_dataset = MBartDataset(
        tokenizer,
        data_dir=tmp_dir,
        type_path="train",
        max_source_length=max_src_len,
        max_target_length=max_tgt_len,  # ignored
        src_lang=src_lang,
        tgt_lang=tgt_lang,
    )
    dataloader = DataLoader(train_dataset,
                            batch_size=2,
                            collate_fn=train_dataset.collate_fn)
    for batch in dataloader:
        assert isinstance(batch, dict)
        assert batch["attention_mask"].shape == batch["input_ids"].shape
        # show that articles were trimmed.
        assert batch["input_ids"].shape[1] == max_src_len
        # show that targets are the same len
        assert batch["decoder_input_ids"].shape[1] == max_tgt_len
        # check language codes in correct place
        assert batch["decoder_input_ids"][
            0, 0].item() == tokenizer.lang_code_to_id[tgt_lang]
        assert batch["decoder_input_ids"][0,
                                          -1].item() == tokenizer.eos_token_id
        assert batch["input_ids"][0, -2].item() == tokenizer.eos_token_id
        assert batch["input_ids"][
            0, -1].item() == tokenizer.lang_code_to_id[src_lang]

        break  # No need to test every batch
Code example #7
def train_MBart(data_path, tokenizer, output_path):
    model_config = MBartConfig(vocab_size=300,
                               d_model=10,
                               encoder_layers=1,
                               decoder_layers=1,
                               encoder_attention_heads=1,
                               decoder_attention_heads=1,
                               encoder_ffn_dim=10,
                               decoder_ffn_dim=10,
                               max_position_embeddings=512)
    model = MBartModel(config=model_config)

    sentences = {}  # associates lang_id with a list of sentences

    # read data files and separate language data into different lists
    lang_id = 0  # counter for languages in the dataset
    for sentence_file in os.listdir(data_path):
        with open(data_path + sentence_file, 'r') as data:
            sentences[lang_id] = []
            for line in data:
                sentences[lang_id].append(line)
        lang_id += 1

    # create token sequences to pass into the model
    src_lang, tgt_lang = (sentences[lang_id] for lang_id in sentences)
    batch = tokenizer.prepare_seq2seq_batch(src_texts=src_lang,
                                            tgt_texts=tgt_lang,
                                            return_tensors='pt')

    model(input_ids=batch['input_ids'],
          decoder_input_ids=batch['labels'])
    model.save_pretrained(output_path)


if __name__ == '__main__':
    args = get_args()

    # create a new MBartTokenizer from the file created by google/sentencepiece
    tokenizer = MBartTokenizer.from_pretrained(args.model_file)

    train_MBart(args.data_path, tokenizer, args.output_path)

Code example #8
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    check_output_dir(training_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = MBartConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
    for p in extra_model_params:
        if getattr(training_args, p, None):
            assert hasattr(config, p), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
            setattr(config, p, getattr(training_args, p))

    tokenizer = MBartTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    # model = MBartForConditionalGeneration.from_pretrained(
    #     model_args.model_name_or_path,
    #     from_tf=".ckpt" in model_args.model_name_or_path,
    #     config=config,
    #     cache_dir=model_args.cache_dir,
    # )
    # model = MBartForConditionalGeneration(config)
    # model = MBartForConditionalGeneration.from_pretrained(model_args.config_name)
    model_config = MBartConfig(vocab_size=300,
                               d_model=10,
                               encoder_layers=1,
                               decoder_layers=1,
                               encoder_attention_heads=1,
                               decoder_attention_heads=1,
                               encoder_ffn_dim=10,
                               decoder_ffn_dim=10,
                               max_position_embeddings=512)
    model = MBartModel(config=model_config)
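    # note: a small randomly initialized MBartModel is used here instead of the pretrained loads commented out above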

    # use task specific params
    use_task_specific_params(model, data_args.task)

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # set decoder_start_token_id for MBart
    if model.config.decoder_start_token_id is None and isinstance(tokenizer, MBartTokenizer):
        assert (
            data_args.tgt_lang is not None and data_args.src_lang is not None
        ), "mBart requires --tgt_lang and --src_lang"
        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang]

    if model_args.freeze_embeds:
        freeze_embeds(model)
    if model_args.freeze_encoder:
        freeze_params(model.get_encoder())
        assert_all_frozen(model.get_encoder())

    dataset_class = Seq2SeqDataset

    # Get datasets
    train_dataset = (
        dataset_class(
            tokenizer,
            type_path="train",
            data_dir=data_args.data_dir,
            n_obs=data_args.n_train,
            max_target_length=data_args.max_target_length,
            max_source_length=data_args.max_source_length,
            prefix=model.config.prefix or "",
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        dataset_class(
            tokenizer,
            type_path="val",
            data_dir=data_args.data_dir,
            n_obs=data_args.n_val,
            max_target_length=data_args.val_max_target_length,
            max_source_length=data_args.max_source_length,
            prefix=model.config.prefix or "",
        )
        if training_args.do_eval or training_args.evaluation_strategy != EvaluationStrategy.NO
        else None
    )
    test_dataset = (
        dataset_class(
            tokenizer,
            type_path="test",
            data_dir=data_args.data_dir,
            n_obs=data_args.n_test,
            max_target_length=data_args.test_max_target_length,
            max_source_length=data_args.max_source_length,
            prefix=model.config.prefix or "",
        )
        if training_args.do_predict
        else None
    )

    # Initialize our Trainer
    compute_metrics_fn = (
        build_compute_metrics_fn(data_args.task, tokenizer) if training_args.predict_with_generate else None
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=Seq2SeqDataCollator(tokenizer, data_args, training_args.tpu_num_cores),
        compute_metrics=compute_metrics_fn,
        tokenizer=tokenizer,
    )

    all_metrics = {}
    # Training
    if training_args.do_train:
        logger.info("*** Train ***")

        train_result = trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        metrics = train_result.metrics
        metrics["train_n_objs"] = data_args.n_train

        trainer.save_model()  # this also saves the tokenizer

        if trainer.is_world_process_zero():
            handle_metrics("train", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

            # For convenience, we also re-save the tokenizer to the same directory,
            # so that you can share your model easily on huggingface.co/models =)
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate(
            metric_key_prefix="val", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams
        )
        metrics["val_n_objs"] = data_args.n_val
        metrics["val_loss"] = round(metrics["val_loss"], 4)

        if trainer.is_world_process_zero():

            handle_metrics("val", metrics, training_args.output_dir)
            all_metrics.update(metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_output = trainer.predict(
            test_dataset=test_dataset,
            metric_key_prefix="test",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        metrics = test_output.metrics
        metrics["test_n_objs"] = data_args.n_test

        if trainer.is_world_process_zero():
            metrics["test_loss"] = round(metrics["test_loss"], 4)
            handle_metrics("test", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            if training_args.predict_with_generate:
                test_preds = tokenizer.batch_decode(
                    test_output.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                test_preds = lmap(str.strip, test_preds)
                write_txt_file(test_preds, os.path.join(training_args.output_dir, "test_generations.txt"))

    if trainer.is_world_process_zero():
        save_json(all_metrics, os.path.join(training_args.output_dir, "all_results.json"))

    return all_metrics
Code example #9
    def setUpClass(cls):
        cls.tokenizer: MBartTokenizer = MBartTokenizer.from_pretrained(
            cls.checkpoint_name)
        cls.pad_token_id = 1
        return cls
Code example #10
    def setup(self, process_on_fly=True, n_augment=0):

        if process_on_fly:
            data = load_dataset("csv", data_files=self.file_path)["train"]
            data = data.map(
                lambda x: {"article_length": len(x["Text"].split())})
            data = data.map(
                lambda x: {"summary_length": len(x["Headline"].split())})

            data = data.map(lambda x: {
                "CleanedText":
                preprocess_article(x["cleaned"], self.sep_token)
            })

            data = data.map(lambda x: {"CleanedHeadline": x["Headline"]})
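            # arguments forwarded to the translate() map function below: an mBART translation model, its tokenizer, and a cap on generated length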
            fn_kwargs = {
                "model":
                MBartForConditionalGeneration.from_pretrained(
                    "vasudevgupta/mbart-iitb-hin-eng"),
                "tokenizer":
                MBartTokenizer.from_pretrained(
                    "vasudevgupta/mbart-iitb-hin-eng"),
                "max_pred_length":
                32,
            }

            data = data.map(translate, fn_kwargs=fn_kwargs)
            data.to_csv(f"cleaned-{self.file_path}")

        else:
            data = load_dataset(
                "csv", data_files=f"cleaned-{self.file_path}")["train"]

        data = data.filter(
            lambda x: x["article_length"] > 32 and x["summary_length"] > 1)

        removed_samples = data.filter(lambda x: type(x["CleanedHeadline"]) !=
                                      str or type(x["CleanedText"]) != str)
        print(removed_samples["CleanedHeadline"])
        print(removed_samples["CleanedText"])

        data = data.filter(lambda x: type(x["CleanedHeadline"]) == str and
                           type(x["CleanedText"]) == str)
        print("Dataset", data)

        # print("Samples with article length > 560 are", data.filter(lambda x: x["article_length"] > 560))

        data = data.train_test_split(test_size=600,
                                     shuffle=True,
                                     seed=self.seed)
        tr_dataset = data["train"].map(lambda x: {"split": "TRAIN"})
        val_dataset = data["test"].map(lambda x: {"split": "VALIDATION"})

        if n_augment > 0:
            print("AUGMENTING")
            tr_dataset = tr_dataset.map(
                lambda x: {"augmentation_status": "Not Augmented"})
            val_dataset = val_dataset.map(
                lambda x: {"augmentation_status": "Not Augmented"})
            noisy_dataset = tr_dataset.filter(
                lambda x: x["Mobile_Tech_Flag"] == 1)

            noisy_datasets = []
            for _ in range(n_augment):
                noisy_datasets.append(
                    noisy_dataset.map(lambda x: {
                        "CleanedText":
                        get_noisy_sent(x["CleanedText"].split())
                    }))
            noisy_dataset = concatenate_datasets(noisy_datasets)
            noisy_dataset = noisy_dataset.map(
                lambda x: {"augmentation_status": "Augmented"})

            tr_dataset = concatenate_datasets([noisy_dataset, tr_dataset])

        return tr_dataset, val_dataset
Code example #11
        return features


if __name__ == '__main__':

    class args:
        batch_size: int = 2
        process_on_fly: bool = False
        num_workers: int = 2
        max_length: int = 512
        max_target_length: int = 20
        file_path: str = "data/dev_data_article.csv"
        seed: int = 42
        n_augment: int = 1

    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
    dl = DataLoader(tokenizer, args)

    tr_dataset, val_dataset = dl.setup(process_on_fly=args.process_on_fly,
                                       n_augment=args.n_augment)

    print(val_dataset)
    print(tr_dataset)

    dataset = concatenate_datasets([tr_dataset, val_dataset]).sort('Text_ID')
    print(dataset)

    print(tr_dataset[:2])

    tr_dataset = dl.train_dataloader(tr_dataset)
    val_dataset = dl.val_dataloader(val_dataset)
Code example #12
TGT_DATA = "./data_tgt_de.txt"
SRC_DATA = "./data_source_hsb.txt"

from transformers import MBartForConditionalGeneration, MBartTokenizer, MBartModel, MBartConfig

#Read from the data files
src_txts = []
tgt_txts = []
with open(SRC_DATA) as f:
    for line in f:
        src_txts.append(line)

with open(TGT_DATA) as f:
    for line in f:
        tgt_txts.append(line)

tokenizer = MBartTokenizer.from_pretrained('./tokenizer_de_hsb.model')
batch = tokenizer.prepare_seq2seq_batch(src_texts=src_txts,
                                        src_lang="en_XX",
                                        tgt_texts=tgt_txts,
                                        tgt_lang="ro_RO",
                                        return_tensors="pt")
config = MBartConfig()
model = MBartModel(config)
model(input_ids=batch['input_ids'],
      decoder_input_ids=batch['labels'])  # forward pass
model.save_pretrained('./trained_model')
Code example #13
import torch
from transformers import MBartTokenizer, BartForConditionalGeneration, MBartConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

model_name = 'facebook/mbart-large-cc25'
tokenizer_name = 'facebook/mbart-large-cc25'

config = MBartConfig.from_pretrained(model_name)
tokenizer = MBartTokenizer.from_pretrained(tokenizer_name)
config.attention_type = 'performer'

model = BartForConditionalGeneration.from_pretrained(model_name)
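# device map: the 12 encoder layers go to GPU 0 and the 12 decoder layers to GPU 1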
device_maps_flat = {
    model_name: {
        "encoder": {
            0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
        },
        "decoder": {
            1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
        },
    },
}
model.parallelize(device_maps_flat[model_name])

wiki = load_dataset("wikipedia", "20200501.frr", split='train[:3%]')

train_encodings = tokenizer(wiki['text'], padding=True, truncation=True, return_tensors="pt")
train_encodings = train_encodings.to("cuda:0")

Code example #14
    def setUpClass(cls):
        cls.tokenizer: MBartTokenizer = MBartTokenizer.from_pretrained(
            cls.checkpoint_name, src_lang="en_XX", tgt_lang="ro_RO"
        )
        cls.pad_token_id = 1
        return cls
Code example #15
    parser = argparse.ArgumentParser()
    parser.add_argument("--config",
                        type=str,
                        default="main",
                        help="configurations defined in config.py")
    p_args = parser.parse_args()

    args = getattr(config, p_args.config)
    print(args)

    ## use this for running sweep
    # wandb.init(config=args.__dict__)
    # args = wandb.config
    # print(dict(args))

    tokenizer = MBartTokenizer.from_pretrained(args.tokenizer_id)

    if args.load_dir:
        bart = MBartForConditionalGeneration(args.bart_config)
        print(f"model is loaded from {args.load_dir}")
    else:
        bart = MBartForConditionalGeneration.from_pretrained(args.model_id)
        print(f"model is loaded from {args.model_id}")

    print("====Working on layers freezing====")
    bart.ffn_requires_grad_(args.enc_ffn_grad, args.dec_ffn_grad)
    bart.attn_requires_grad_(args.enc_attn_grad, args.dec_attn_grad,
                             args.cross_attn_grad)
    bart.embed_requires_grad_(args.embed_grad, args.pos_embed_grad)
    bart.norm_requires_grad_(args.enc_norm_grad, args.dec_norm_grad,
                             args.cross_attn_norm_grad)
Code example #16
def train(
    model_name,
    train_file,
    val_file,
    batch_size,
    output_dir,
    learning_rate,
    logging_steps,
    eval_steps,
    save_steps,
    warmup_steps,
    num_train_epochs,
    gradient_accumulation_steps,
    max_grad_norm,
    weight_decay,
    max_source_tokens_count,
    max_target_tokens_count,
    fp16_opt_level,
    fp16=False
):
    if fp16:
        print("Using FP16")
    tokenizer = MBartTokenizer.from_pretrained(model_name)
    train_dataset = MBartSummarizationDataset(
        train_file,
        tokenizer,
        max_source_tokens_count,
        max_target_tokens_count)
    val_dataset = MBartSummarizationDataset(
        val_file,
        tokenizer,
        max_source_tokens_count,
        max_target_tokens_count)
    model = MBartForConditionalGeneration.from_pretrained(model_name)
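    # Trainer configuration: step-based evaluation/saving, label smoothing, and optional fp16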
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        do_train=True,
        do_eval=True,
        overwrite_output_dir=True,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        num_train_epochs=num_train_epochs,
        gradient_accumulation_steps=gradient_accumulation_steps,
        max_grad_norm=max_grad_norm,
        weight_decay=weight_decay,
        fp16=fp16,
        fp16_opt_level=fp16_opt_level,
        label_smoothing_factor=0.1,
        evaluation_strategy="steps"
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
    trainer.train()
Code example #17
    def setUpClass(cls):
        checkpoint_name = "facebook/mbart-large-en-ro"
        cls.tokenizer = MBartTokenizer.from_pretrained(checkpoint_name)
        cls.pad_token_id = 1
        return cls
Code example #18
    def test_special_tokens_unaffacted_by_save_load(self):
        tmpdirname = tempfile.mkdtemp()
        original_special_tokens = self.tokenizer.fairseq_tokens_to_ids
        self.tokenizer.save_pretrained(tmpdirname)
        new_tok = MBartTokenizer.from_pretrained(tmpdirname)
        self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens)
Code example #19
def main(args_dict=None):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if args_dict is not None:
        model_args, data_args, training_args = parser.parse_dict(args_dict)
    elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Set project name
    os.environ["WANDB_PROJECT"] = "multilingual_zeroshot"

    num_labels = 3
    labels = ['entailment', 'neutral', 'contradiction']

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = MBartConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        dropout=model_args.dropout,
        attention_dropout=model_args.attention_dropout,
        finetuning_task="mnli",
        cache_dir=model_args.cache_dir,
    )
    tokenizer = MBartTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = MBartForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    columns = ['input_ids', 'attention_mask', 'labels']
    map_fn = get_mnli_map_fn(data_args.lang, data_args.max_seq_length,
                             tokenizer)

    train_dataset = nlp.load_dataset("multi_nli", split="train")
    train_dataset = train_dataset.map(map_fn, batched=True, batch_size=512)
    train_dataset.set_format(type='torch', columns=columns)

    eval_dataset = (nlp.load_dataset("multi_nli", split="validation_matched")
                    if training_args.do_eval else None)
    eval_dataset = eval_dataset.map(map_fn, batched=True, batch_size=512)
    eval_dataset.set_format(type='torch', columns=columns)

    def compute_metrics_fn(p: EvalPrediction):
        preds = np.argmax(p.predictions, axis=1)
        return glue_compute_metrics("classification", preds, p.label_ids)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_fn,
        data_collator=DataCollator(tokenizer),
    )

    # disable wandb console logs
    logging.getLogger('wandb.run_manager').setLevel(logging.WARNING)

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        mis_matched_dataset = nlp.load_dataset("multi_nli",
                                               split="validation_mismatched")
        mis_matched_dataset = mis_matched_dataset.map(map_fn,
                                                      batched=True,
                                                      batch_size=512)
        mis_matched_dataset.set_format(type='torch', columns=columns)
        eval_datasets = [eval_dataset, mis_matched_dataset]

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = compute_metrics_fn
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(training_args.output_dir,
                                            f"eval_results.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results *****")
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)