Exemple #1
0
def run_finetuning_single_task(model_args,
                               data_args,
                               training_args,
                               last_checkpoint=None):
    """Train, evaluate, and optionally test a model on a single task.

    Args:
        model_args: Model options, forwarded to ``init_config``,
            ``init_tokenizer`` and ``init_model``. Its ``trainer_callbacks``
            attribute is handed to the trainer.
        data_args: Data options. ``task_name``, ``test_file`` and
            ``pad_to_max_length`` are read here.
        training_args: Training options. ``do_train``, ``do_eval``,
            ``do_predict``, ``fp16`` and ``output_dir`` are read here.
        last_checkpoint: Forwarded to ``train``. Defaults to None.

    Returns:
        The value returned by ``evaluate_tasks`` when
        ``training_args.do_eval`` is set, otherwise an empty dict.
    """

    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # For finetuning required to add labels and task name to config kwargs
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)

    # Tokenizing and preprocessing the datasets for downstream tasks
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info("Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(datasets, tokenizer,
                                                  data_args, model, num_labels,
                                                  label_list, is_regression)

    # MNLI ships matched/mismatched variants of its validation and test splits
    is_mnli = data_args.task_name == "mnli"

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets[
        "validation_matched" if is_mnli else "validation"]
    test_dataset = None
    if ((data_args.task_name is not None or data_args.test_file is not None)
            and training_args.do_predict):
        test_dataset = tokenized_datasets[
            "test_matched" if is_mnli else "test"]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

    # Train.
    # Reuse the model built above (with finetuning=True) instead of
    # initializing a second one: this is the instance that was handed to
    # preprocessing and the one moved to CPU at the end of the run.
    trainer = init_trainer(
        tokenizer=tokenizer,
        data_collator=data_collator,
        training_args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        model=model,
        trainer_callbacks=model_args.trainer_callbacks or None,
        finetuning=True,
        task_name=data_args.task_name,
        is_regression=is_regression)
    if training_args.do_train:
        train(trainer, training_args.output_dir, last_checkpoint)

    # Evaluate
    eval_results = {}
    if training_args.do_eval:
        logging.info("*** Evaluate ***")

        # Handle special case of extra validation dataset for MNLI
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if is_mnli:
            tasks.append("mnli-mm")
            eval_datasets.append(tokenized_datasets["validation_mismatched"])

        eval_results = evaluate_tasks(trainer, training_args.output_dir, tasks,
                                      eval_datasets)

    # Test/Predict
    if training_args.do_predict:
        logging.info("*** Test ***")

        # Handle special case of extra test dataset for MNLI
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if is_mnli:
            tasks.append("mnli-mm")
            test_datasets.append(tokenized_datasets["test_mismatched"])

        test_tasks(trainer, training_args.output_dir, tasks, test_datasets,
                   is_regression, label_list)

    # There is an existing issue on training multiple models in sequence in this code
    # There is a memory leakage on the model, a small amount of GPU memory remains after
    # the run and accumulates over several runs. It fails with OOM after about 20 runs,
    # even when all tensors on GPU are explicitly deleted, garbage is collected and
    # cache is cleared. Tried multiple solutions but this weird little hack is the only
    # thing that worked.
    model.to("cpu")

    return eval_results
Exemple #2
0
def run_finetuning_single_task(
    model_args,
    data_args,
    training_args,
    last_checkpoint=None,
    run_idx=None,
):
    """Train, evaluate, and optionally test a model on a single task.

    Args:
        model_args: Model options, forwarded to
            ``init_dataset_for_finetuning`` and ``evaluate_tasks_handler``.
            Its ``trainer_callbacks`` attribute is handed to the trainer.
        data_args: Data options; ``task_name`` is read here.
        training_args: Training options. ``do_train``, ``do_eval``,
            ``do_predict``, ``output_dir`` and ``rm_checkpoints`` are read
            here. The object is replaced by the return values of
            ``check_best_metric`` and ``update_run_number``.
        last_checkpoint: Forwarded to ``init_dataset_for_finetuning`` and
            ``train``. Defaults to None.
        run_idx: Forwarded to ``update_run_number``. Defaults to None.

    Returns:
        The value returned by ``evaluate_tasks_handler`` when
        ``training_args.do_eval`` is set, otherwise an empty dict.
    """

    tokenizer, data_collator, train_dataset, eval_dataset, test_dataset, model, \
        is_regression, tokenized_datasets, label_list, config = \
        init_dataset_for_finetuning(
            model_args, data_args, training_args, last_checkpoint
        )

    # Code safety
    check_eval_and_max_steps(training_args, train_dataset)
    training_args = check_best_metric(training_args, data_args.task_name)

    # Update where model is saved for each run
    training_args = update_run_number(training_args, run_idx)

    # Train
    trainer = init_trainer(
        tokenizer=tokenizer,
        data_collator=data_collator,
        training_args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        model=model,
        trainer_callbacks=model_args.trainer_callbacks or None,
        finetuning=True,
        task_name=data_args.task_name,
        is_regression=is_regression)

    if training_args.do_train:
        train(trainer, training_args.output_dir, training_args.rm_checkpoints,
              last_checkpoint)

    # Evaluate.
    # Default to an empty dict so the final return is always bound, even
    # when evaluation is disabled.
    eval_results = {}
    if training_args.do_eval:
        eval_results = evaluate_tasks_handler(trainer, data_args, model_args,
                                              training_args, eval_dataset,
                                              tokenized_datasets)

    # Test/Predict
    if training_args.do_predict:
        logging.info("*** Test ***")

        # Handle special case of extra test dataset for MNLI
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(tokenized_datasets["test_mismatched"])

        test_tasks(trainer, training_args.output_dir, tasks, test_datasets,
                   is_regression, label_list)

    # TODO: when training_args.load_best_model_at_end is set, find the best
    # model checkpoint and delete the rest to reduce space demands.

    # There is an existing issue on training multiple models in sequence in this code
    # There is a memory leakage on the model, a small amount of GPU memory remains after
    # the run and accumulates over several runs. It fails with OOM after about 20 runs,
    # even when all tensors on GPU are explicitly deleted, garbage is collected and
    # cache is cleared. Tried multiple solutions but this weird little hack is the only
    # thing that worked.
    model.to("cpu")

    return eval_results
Exemple #3
0
def run_pretraining(model_args,
                    data_args,
                    training_args,
                    last_checkpoint=None):
    """Pretrain and evaluate a masked language model.

    Args:
        model_args: Model options. ``hp_num_trials`` selects between
            hyperparameter search (``>= 1``) and regular training;
            ``trainer_class`` and ``trainer_callbacks`` are passed to the
            trainer.
        data_args: Data options. ``save_tokenized_data``, ``data_collator``
            (name of a collator class looked up on ``transformers``) and
            ``mlm_probability`` are read here.
        training_args: Training options. ``do_train``, ``do_eval`` and
            ``output_dir`` are read here.
        last_checkpoint: Forwarded to ``train``. Defaults to None.

    Returns:
        None.

    Raises:
        AssertionError: If ``transformers`` has no attribute named
            ``data_args.data_collator``.
    """

    logging.info("Pre-training a masked language model.")

    datasets, tokenized_datasets, dataset_path = init_datasets_mlm(data_args)

    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)

    if tokenized_datasets is None:
        # Tokenizing and preprocessing the datasets for language modeling
        if training_args.do_train:
            column_names = datasets["train"].column_names
        else:
            column_names = datasets["validation"].column_names
        text_column_name = "text" if "text" in column_names else column_names[0]

        logging.info("Tokenizing datasets for pretraining ...")
        tokenized_datasets = preprocess_datasets_mlm(datasets, tokenizer,
                                                     data_args, column_names,
                                                     text_column_name)

        # Save only if a dataset_path has been defined in the previous steps
        # that will be True only when loading from dataset hub
        if data_args.save_tokenized_data and dataset_path is not None:
            logging.info(f"Saving tokenized dataset to {dataset_path}")
            tokenized_datasets.save_to_disk(dataset_path)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will take care of randomly masking the tokens.
    # argument defined in experiment config
    assert hasattr(transformers, data_args.data_collator), \
        f"Data collator {data_args.data_collator} not available"
    data_collator = getattr(transformers, data_args.data_collator)(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Stays None on the hyperparameter-search path, which does not hand a
    # trainer back to this function.
    trainer = None

    # Run hp search or regular training
    if model_args.hp_num_trials >= 1:
        run_hyperparameter_search(
            model_args=model_args,
            config=config,
            tokenizer=tokenizer,
            data_collator=data_collator,
            training_args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )
    else:
        trainer = init_trainer(
            tokenizer=tokenizer,
            data_collator=data_collator,
            training_args=training_args,
            train_dataset=train_dataset if training_args.do_train else None,
            eval_dataset=eval_dataset if training_args.do_eval else None,
            model=init_model(model_args, config, tokenizer),
            trainer_class=model_args.trainer_class,
            trainer_callbacks=model_args.trainer_callbacks or None,
        )
        if training_args.do_train:
            train(trainer, training_args.output_dir, last_checkpoint)

    # Evaluate in full eval dataset.
    # TODO: if using hp search, load best model before running evaluate
    if training_args.do_eval:
        if trainer is None:
            # Without this guard the call below would fail on an unbound
            # `trainer` after the whole search had already completed.
            logging.warning(
                "Skipping evaluation: no trainer is available after "
                "hyperparameter search.")
        else:
            logging.info("*** Evaluate ***")
            evaluate_language_model(trainer, eval_dataset,
                                    training_args.output_dir)
Exemple #4
0
def run_finetuning_squad(
    model_args,
    data_args,
    training_args,
    last_checkpoint=None,
    run_idx=None,
):
    """Finetune on SQuAD: train, evaluate, and return the evaluation results.

    Returns the value of ``evaluate_task_handler`` when
    ``training_args.do_eval`` is set, otherwise an empty dict. Raises
    ``NotImplementedError`` when ``training_args.do_predict`` is set.
    """

    # Reconcile dataset name, task name, and version_2_with_negative
    # before any data gets loaded
    data_args = check_squad_version(data_args)

    squad_setup = init_dataset_for_squad(model_args, data_args, training_args,
                                         last_checkpoint)

    # Only the first seven entries of the setup result are consumed
    (tokenizer, data_collator, train_dataset, eval_dataset, eval_examples,
     model, answer_column_name) = (squad_setup[i] for i in range(7))

    # Code safety
    check_eval_and_max_steps(training_args, train_dataset)
    # squad and squad_v2 share a task_name but differ in metrics and
    # datasets, so the dataset_name is what identifies the best metric
    training_args = check_best_metric(training_args, data_args.dataset_name)

    def post_processing_function(examples,
                                 features,
                                 predictions,
                                 stage="eval"):
        # Map start/end logits back to answer spans in the original context

        # Keyword arguments common to both post-processing variants
        shared_kwargs = {
            "examples": examples,
            "features": features,
            "predictions": predictions,
            "version_2_with_negative": data_args.version_2_with_negative,
            "n_best_size": data_args.n_best_size,
            "max_answer_length": data_args.max_answer_length,
            "output_dir": training_args.output_dir,
            "prefix": stage,
        }

        use_beam_search = data_args.beam_search
        no_answer_scores = None
        if use_beam_search:
            answers, no_answer_scores = \
                postprocess_qa_predictions_with_beam_search(
                    start_n_top=model.config.start_n_top,
                    end_n_top=model.config.end_n_top,
                    **shared_kwargs,
                )
        else:
            answers = postprocess_qa_predictions(**shared_kwargs)

        with_negative = data_args.version_2_with_negative

        def build_entry(example_id, text):
            # One formatted prediction; the no-answer probability is only
            # attached when version_2_with_negative is set
            entry = {"id": example_id, "prediction_text": text}
            if with_negative:
                entry["no_answer_probability"] = (
                    no_answer_scores[example_id] if use_beam_search else 0.0)
            return entry

        formatted_predictions = [
            build_entry(example_id, text)
            for example_id, text in answers.items()
        ]
        references = [
            {"id": example["id"], "answers": example[answer_column_name]}
            for example in examples
        ]
        return EvalPrediction(predictions=formatted_predictions,
                              label_ids=references)

    # Update where model is saved for each run
    training_args = update_run_number(training_args, run_idx)

    training_args.trainer_class = QuestionAnsweringTrainer

    trainer_kwargs = {
        "model": model,
        "args": training_args,
        "tokenizer": tokenizer,
        "train_dataset": train_dataset,
        "eval_dataset": eval_dataset,
        "eval_examples": eval_examples,
        "data_collator": data_collator,
        "post_process_function": post_processing_function,
        "callbacks": model_args.trainer_callbacks or None,
    }

    # Train
    trainer = init_squad_trainer(trainer_kwargs, data_args,
                                 training_args.trainer_class,
                                 model_args.trainer_callbacks)

    if training_args.do_train:
        # Note, rm_checkpoints=True means one model will be saved
        # in the output_dir, and all checkpoint subdirectories will be
        # deleted when train() is called.
        train(trainer, training_args.output_dir, training_args.rm_checkpoints,
              last_checkpoint)

    eval_results = (
        evaluate_task_handler(trainer, data_args, model_args, training_args,
                              eval_dataset)
        if training_args.do_eval else {}
    )

    if training_args.do_predict:
        raise NotImplementedError(
            "Storing test results for squad not yet implemented")

    # Training several models in sequence leaks a small amount of GPU memory
    # per run, which accumulates and fails with OOM after about 20 runs, even
    # when all GPU tensors are explicitly deleted, garbage is collected and
    # the cache is cleared. Moving the model to the CPU is the only
    # workaround that was found to work.
    model.to("cpu")

    return eval_results