Example #1
def train():
    """Trains a BERT ethicality classifer."""

    args = transformers.TrainingArguments(
        "saved_models",
        evaluation_strategy="epoch",
        learning_rate=config['learning_rate'],
        per_device_train_batch_size=config['batch_size'],
        per_device_eval_batch_size=config['batch_size'],
        num_train_epochs=config['num_epochs'],
        weight_decay=config['weight_decay'],
        load_best_model_at_end=True,
        metric_for_best_model="f1")

    train, val, test = get_train_val_test_datasets()
    trainer = transformers.Trainer(model=get_model(),
                                   args=args,
                                   train_dataset=train,
                                   eval_dataset=val,
                                   compute_metrics=metrics)

    # Train the model.
    trainer.train()

    # Display model eval statistics.
    print(trainer.evaluate())

    # Test dataset metrics.
    trainer.predict(test).metrics
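
The `metrics` callable passed as compute_metrics is not shown in this example. A minimal sketch of what it might look like, assuming a binary label and scikit-learn for the scores (both assumptions; the original utils module is not included):

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def metrics(eval_pred):
    """Hypothetical compute_metrics callable for transformers.Trainer.

    eval_pred is an EvalPrediction whose .predictions holds the logits and
    .label_ids the gold labels; the returned dict must contain the "f1" key
    referenced by metric_for_best_model above.
    """
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds),
    }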
Example #2
 def __init__(self):
     """Initializes a Inference object."""
     # self.model = get_pretrained_model()
     self.tokenizer = get_tokenizer()
     self.model = transformers.Trainer(model=get_pretrained_model())
     self.summarizer = pipeline(
         "summarization")  # ~1.2 GB download the first time this is run.
Example #3
 def _train_model(self, model, tokenizer, train_dataset, val_dataset, **train_kwargs):
     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)
     train_args = self._get_train_args(**train_kwargs)
     trainer = transformers.Trainer(model=model,
                                    args=train_args,
                                    data_collator=data_collator,
                                    train_dataset=train_dataset,
                                    eval_dataset=val_dataset,
                                    )
     trainer.train()
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = transformers.HfArgumentParser(
        (ModelArguments, ynt.GenernalDataTrainingArguments,
         transformers.TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)
    logger.info(f"Model arguments: {model_args}")
    logger.info(f"Data Training arguments: {data_args}")

    # Set seed
    transformers.set_seed(training_args.seed)

    if data_args.task_name in ynt.genernal_tasks_num_labels:
        num_labels = ynt.genernal_tasks_num_labels[data_args.task_name]
        output_mode = ynt.genernal_output_modes[data_args.task_name]
    elif data_args.task_name in transformers.glue_tasks_num_labels:
        num_labels = transformers.glue_tasks_num_labels[data_args.task_name]
        output_mode = transformers.glue_output_modes[data_args.task_name]
    else:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    config = transformers.AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = transformers.AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    if data_args.task_name in ynt.genernal_tasks_num_labels:
        train_dataset = (ynt.GenernalDataset(data_args,
                                             tokenizer=tokenizer,
                                             mode='train',
                                             cache_dir=model_args.cache_dir)
                         if training_args.do_train else None)
        eval_dataset = (ynt.GenernalDataset(data_args,
                                            tokenizer=tokenizer,
                                            mode='dev',
                                            cache_dir=model_args.cache_dir)
                        if training_args.do_eval and not data_args.online else
                        None)
        test_dataset = (ynt.GenernalDataset(data_args,
                                            tokenizer=tokenizer,
                                            mode='test',
                                            cache_dir=model_args.cache_dir)
                        if training_args.do_predict and not data_args.online
                        else None)
    elif data_args.task_name in transformers.glue_tasks_num_labels:
        # Get datasets
        train_dataset = (transformers.GlueDataset(
            data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                         if training_args.do_train else None)
        eval_dataset = (transformers.GlueDataset(
            data_args,
            tokenizer=tokenizer,
            mode="dev",
            cache_dir=model_args.cache_dir) if training_args.do_eval else None)
        test_dataset = (transformers.GlueDataset(
            data_args,
            tokenizer=tokenizer,
            mode="test",
            cache_dir=model_args.cache_dir)
                        if training_args.do_predict else None)

    def build_compute_metrics_fn(
            task_name: str) -> Callable[[transformers.EvalPrediction], Dict]:
        def compute_metrics_fn(p: transformers.EvalPrediction):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            if task_name in ynt.genernal_tasks_num_labels:
                return ynt.genernal_compute_metrics(task_name, preds,
                                                    p.label_ids)
            elif task_name in transformers.glue_tasks_num_labels:
                return transformers.glue_compute_metrics(
                    task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Initialize our Trainer
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(data_args.task_name),
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args,
                                                    task_name="mnli-mm")
            eval_datasets.append(
                transformers.GlueDataset(mnli_mm_data_args,
                                         tokenizer=tokenizer,
                                         mode="dev",
                                         cache_dir=model_args.cache_dir))

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = build_compute_metrics_fn(
                eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir,
                f"eval_results_{eval_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(
                        eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args,
                                                    task_name="mnli-mm")
            test_datasets.append(
                transformers.GlueDataset(mnli_mm_data_args,
                                         tokenizer=tokenizer,
                                         mode="test",
                                         cache_dir=model_args.cache_dir))

        for test_dataset in test_datasets:
            predictions = trainer.predict(
                test_dataset=test_dataset).predictions
            if output_mode == "classification":
                predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(
                training_args.output_dir,
                f"test_results_{test_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(
                        test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if output_mode == "regression":
                            writer.write("%d\t%3.3f\n" % (index, item))
                        else:
                            item = test_dataset.get_labels()[item]
                            writer.write("%d\t%s\n" % (index, item))
    return eval_results
Example #5
    logging_dir=f"{_dir}/logging",
    logging_steps=256,
    dataloader_num_workers=64,
    evaluation_strategy="steps",
    eval_steps=256,
    save_steps=256,
    fp16=True,
    fp16_opt_level="O3",
    learning_rate=5e-4,
    run_name=_dir,
)

model = transformers.AlbertForSequenceClassification.from_pretrained(
    "albert-large-v2", num_labels=2)
tokenizer = transformers.AlbertTokenizerFast.from_pretrained("albert-large-v2")
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer,
                                                     pad_to_multiple_of=32)
trainer = transformers.Trainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
Example #6
import argparse
import transformers

parser = argparse.ArgumentParser()
parser.add_argument('--vocab', type=str)
parser.add_argument('--model', type=str)
parser.add_argument('--data', type=str)
args = parser.parse_args()

tokenizer = transformers.BertTokenizer(vocab_file=args.vocab,
                                       do_lower_case=False,
                                       do_basic_tokenize=True)
model = transformers.BertForMaskedLM.from_pretrained(args.model)

dataset = transformers.LineByLineTextDataset(tokenizer=tokenizer,
                                             file_path=args.data,
                                             block_size=128)
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
train_args = transformers.TrainingArguments(
    per_device_eval_batch_size=16, output_dir=f"/tmp/echau18/{args.model}")
trainer = transformers.Trainer(model=model,
                               eval_dataset=dataset,
                               data_collator=data_collator,
                               prediction_loss_only=True,
                               args=train_args)

eval_output = trainer.evaluate()
print(eval_output)
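
Since the Trainer is built with prediction_loss_only=True, eval_output contains an "eval_loss" entry. As a follow-up sketch, that masked-LM loss can be turned into perplexity the same way Example #12 does:

import math

# eval_loss is the mean masked-LM cross-entropy over the dataset;
# exponentiating it gives the perplexity.
perplexity = math.exp(eval_output["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")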
Example #7
def main(
    mode: str,
    num_examples_to_test: int = 5,
    num_repetitions: int = 4,
) -> List[Dict[str, Any]]:

    if mode not in ["only-correct", "only-incorrect"]:
        raise ValueError(f"Unrecognized mode {mode}")

    task_tokenizer, task_model = misc_utils.create_tokenizer_and_model(
        constants.MNLI_MODEL_PATH)
    train_dataset, eval_dataset = misc_utils.create_datasets(
        task_name="mnli", tokenizer=task_tokenizer)
    eval_instance_data_loader = misc_utils.get_dataloader(dataset=eval_dataset,
                                                          batch_size=1,
                                                          random=False)

    output_mode = glue_output_modes["mnli"]

    def build_compute_metrics_fn(task_name: str):
        def compute_metrics_fn(p):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Most of these arguments are placeholders
    # and are not really used at all, so ignore
    # the exact values of these.
    trainer = transformers.Trainer(
        model=task_model,
        args=TrainingArguments(output_dir="./tmp-output",
                               per_device_train_batch_size=128,
                               per_device_eval_batch_size=128,
                               learning_rate=5e-5,
                               logging_steps=100),
        data_collator=default_data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn("mnli"),
    )

    task_model.cuda()
    num_examples_tested = 0
    output_collections = []
    for test_index, test_inputs in enumerate(eval_instance_data_loader):
        if num_examples_tested >= num_examples_to_test:
            break

        # Skip when we only want correctly-predicted cases but the
        # prediction is incorrect, or vice versa.
        prediction_is_correct = misc_utils.is_prediction_correct(
            trainer=trainer, model=task_model, inputs=test_inputs)

        if mode == "only-correct" and prediction_is_correct is False:
            continue

        if mode == "only-incorrect" and prediction_is_correct is True:
            continue

        for k, v in test_inputs.items():
            if isinstance(v, torch.Tensor):
                test_inputs[k] = v.to(torch.device("cuda"))

        # with batch-size 128, 1500 iterations is enough
        for num_samples in range(700, 1300 + 1, 100):  # 7 choices
            for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]:  # 8 choices
                for repetition in range(num_repetitions):
                    print(
                        f"Running #{test_index} "
                        f"N={num_samples} "
                        f"B={batch_size} "
                        f"R={repetition} takes ...",
                        end=" ")
                    with Timer() as timer:
                        s_test = one_experiment(
                            model=task_model,
                            train_dataset=train_dataset,
                            test_inputs=test_inputs,
                            batch_size=batch_size,
                            random=True,
                            n_gpu=1,
                            device=torch.device("cuda"),
                            damp=constants.DEFAULT_INFLUENCE_HPARAMS["mnli"]
                            ["mnli"]["damp"],
                            scale=constants.DEFAULT_INFLUENCE_HPARAMS["mnli"]
                            ["mnli"]["scale"],
                            num_samples=num_samples)
                        time_elapsed = timer.elapsed
                        print(f"{time_elapsed:.2f} seconds")

                    outputs = {
                        "test_index": test_index,
                        "num_samples": num_samples,
                        "batch_size": batch_size,
                        "repetition": repetition,
                        "s_test": s_test,
                        "time_elapsed": time_elapsed,
                        "correct": prediction_is_correct,
                    }
                    output_collections.append(outputs)
                    remote_utils.save_and_mirror_scp_to_remote(
                        object_to_save=outputs,
                        file_name=f"stest.{mode}.{num_examples_to_test}."
                        f"{test_index}.{num_samples}."
                        f"{batch_size}.{repetition}.pth")

        num_examples_tested += 1

    return output_collections
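
Timer is an assumed helper in this example (only its elapsed attribute is used above). A minimal sketch of a compatible context manager:

import time

class Timer:
    """Hypothetical stand-in for the Timer context manager used above."""

    def __enter__(self):
        self._start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Expose the wall-clock time spent inside the `with` block.
        self.elapsed = time.perf_counter() - self._start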
Example #8
def main(
    train_task_name: str,
    train_heuristic: str,
    eval_heuristics: Optional[List[str]] = None,
    num_replicas: Optional[int] = None,
    use_parallel: bool = True,
    version: Optional[str] = None,
) -> Dict[str, List[Dict[str, Any]]]:

    if train_task_name not in ["mnli-2", "hans"]:
        raise ValueError

    if eval_heuristics is None:
        eval_heuristics = DEFAULT_EVAL_HEURISTICS

    if num_replicas is None:
        num_replicas = DEFAULT_NUM_REPLICAS

    if version not in ["new-only-z", "new-only-ztest", "new-z-and-ztest"]:
        raise ValueError

    task_tokenizer, task_model = misc_utils.create_tokenizer_and_model(
        constants.MNLI2_MODEL_PATH)

    (mnli_train_dataset,
     mnli_eval_dataset) = misc_utils.create_datasets(task_name="mnli-2",
                                                     tokenizer=task_tokenizer)

    (hans_train_dataset,
     hans_eval_dataset) = misc_utils.create_datasets(task_name="hans",
                                                     tokenizer=task_tokenizer)

    if train_task_name == "mnli-2":
        train_dataset = mnli_train_dataset

    if train_task_name == "hans":
        train_dataset = hans_train_dataset

    (s_test_damp, s_test_scale,
     s_test_num_samples) = influence_helpers.select_s_test_config(
         trained_on_task_name="mnli-2",
         train_task_name=train_task_name,
         eval_task_name="hans",
     )

    hans_helper = HansHelper(hans_train_dataset=hans_train_dataset,
                             hans_eval_dataset=hans_eval_dataset)

    # We will be running model trained on MNLI-2
    # but calculate influences on HANS dataset
    faiss_index = influence_helpers.load_faiss_index(
        trained_on_task_name="mnli-2", train_task_name=train_task_name)

    output_mode = glue_output_modes["mnli-2"]

    def build_compute_metrics_fn(task_name: str):
        def compute_metrics_fn(p):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Most of these arguments are placeholders
    # and are not really used at all, so ignore
    # the exact values of these.
    trainer = transformers.Trainer(
        model=task_model,
        args=TrainingArguments(output_dir="./tmp-output",
                               per_device_train_batch_size=128,
                               per_device_eval_batch_size=128,
                               learning_rate=5e-5,
                               logging_steps=100),
    )

    output_collections: Dict[str, List] = defaultdict(list)

    if version == "old":
        raise ValueError("Deprecated")

    else:
        NUM_STEPS = 10
        num_total_experiments = (len(EXPERIMENT_TYPES) * num_replicas *
                                 len(VERSION_2_NUM_DATAPOINTS_CHOICES) *
                                 len(VERSION_2_LEARNING_RATE_CHOICES) *
                                 NUM_STEPS)

        with tqdm(total=num_total_experiments) as pbar:
            for experiment_type in EXPERIMENT_TYPES:
                for replica_index in range(num_replicas):

                    (hans_eval_heuristic_inputs, hans_eval_heuristic_raw_inputs
                     ) = hans_helper.sample_batch_of_heuristic(
                         mode="eval",
                         heuristic=train_heuristic,
                         size=EVAL_HEURISTICS_SAMPLE_BATCH_SIZE,
                         return_raw_data=True)

                    misc_utils.move_inputs_to_device(
                        inputs=hans_eval_heuristic_inputs,
                        device=task_model.device)

                    for version_2_num_datapoints in VERSION_2_NUM_DATAPOINTS_CHOICES:
                        for version_2_learning_rate in VERSION_2_LEARNING_RATE_CHOICES:

                            # The model will be used for multiple
                            # steps so `deepcopy` it here.
                            _model = deepcopy(task_model)
                            for step in range(NUM_STEPS):
                                outputs_one_experiment, _model = one_experiment(
                                    use_parallel=use_parallel,
                                    train_heuristic=train_heuristic,
                                    eval_heuristics=eval_heuristics,
                                    experiment_type=experiment_type,
                                    hans_helper=hans_helper,
                                    train_dataset=train_dataset,
                                    task_model=_model,
                                    faiss_index=faiss_index,
                                    s_test_damp=s_test_damp,
                                    s_test_scale=s_test_scale,
                                    s_test_num_samples=s_test_num_samples,
                                    trainer=trainer,
                                    version=version,
                                    version_2_num_datapoints=version_2_num_datapoints,
                                    version_2_learning_rate=version_2_learning_rate,
                                    hans_eval_heuristic_inputs=hans_eval_heuristic_inputs,
                                    hans_eval_heuristic_raw_inputs=hans_eval_heuristic_raw_inputs,
                                )

                                output_collections[
                                    f"{experiment_type}-"
                                    f"{replica_index}-"
                                    f"{version_2_num_datapoints}-"
                                    f"{version_2_learning_rate}-"].append(
                                        outputs_one_experiment)

                                pbar.update(1)
                                pbar.set_description(
                                    f"{experiment_type} #{replica_index}")

        torch.save(
            output_collections, f"hans-augmentation-{version}."
            f"{train_task_name}."
            f"{train_heuristic}."
            f"{num_replicas}."
            f"{use_parallel}.pth")

    return output_collections
Example #9
writer = SummaryWriter()

training_args = transformers.TrainingArguments(
    output_dir="models/gpt2/",
    do_train=True,
    do_eval=True,
    evaluate_during_training=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    logging_first_step=True,
    save_steps=2000,
    save_total_limit=2,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_set,
    eval_dataset=valid_set,
    prediction_loss_only=True,
    tb_writer=writer
)

trainer.train()

# Save Model
trainer.save_model("models/gpt2/")
Example #10
def train(args):
    logging.basicConfig(level=logging.INFO)
    tokenizer = transformers.AlbertTokenizer.from_pretrained(
        'albert-base-v2', cache_dir=cache_dir)
    albert_for_math_config = transformers.AlbertConfig(
        hidden_size=768,
        num_attention_heads=12,
        intermediate_size=3072,
    )

    if args['--load']:
        model = transformers.AlbertForMaskedLM.from_pretrained(
            args['--load-from'])
        training_args = transformers.TrainingArguments(
            output_dir=args['--save-to'],
            overwrite_output_dir=True,
            num_train_epochs=int(args['--max-epoch']),
            per_gpu_train_batch_size=int(args['--batch-size']),
            per_gpu_eval_batch_size=int(args['--batch-size']),
            logging_steps=int(args['--log-every']),
            save_steps=int(args['--save-every']),
            save_total_limit=10,
            learning_rate=float(args['--lr']),
            seed=int(args['--seed']),
        )

    else:
        model = transformers.AlbertForMaskedLM(albert_for_math_config)
        training_args = transformers.TrainingArguments(
            output_dir=args['--save-to'],
            num_train_epochs=int(args['--max-epoch']),
            per_gpu_train_batch_size=int(args['--batch-size']),
            per_gpu_eval_batch_size=int(args['--batch-size']),
            logging_steps=int(args['--log-every']),
            save_steps=int(args['--save-every']),
            save_total_limit=10,
            learning_rate=float(args['--lr']),
            seed=int(args['--seed']),
        )

    # Load datasets
    print('Loading Data...')
    train_data = torch.load(
        './data/train_data_train-easy_algebra__linear_1d.pt')
    dev_data = torch.load('./data/dev_data_train-easy_algebra__linear_1d.pt')
    print('Finished loading data')
    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    model.to(device)
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=AnswerMaskDataCollator(tokenizer),
        train_dataset=train_data,
        eval_dataset=dev_data,
        prediction_loss_only=True,
    )

    if args['--load']:
        trainer.train(model_path=args['--load-from'])
    else:
        trainer.train()
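
train() reads docopt-style keys from its args mapping. A hedged invocation sketch with purely illustrative values (the data files and the cache_dir referenced inside the function still have to exist):

args = {
    '--load': False,
    '--load-from': None,
    '--save-to': 'saved_models/albert_math',
    '--max-epoch': '3',
    '--batch-size': '16',
    '--log-every': '100',
    '--save-every': '1000',
    '--lr': '1e-4',
    '--seed': '42',
    '--cuda': True,
}
train(args)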
Example #11
import transformers

from cfg import config
from data import get_train_val_test_datasets
from models import get_model
from utils import metrics


args = transformers.TrainingArguments(
  "saved_models",
  evaluation_strategy = "epoch",
  learning_rate=config['learning_rate'],
  per_device_train_batch_size=config['batch_size'],
  per_device_eval_batch_size=config['batch_size'],
  num_train_epochs=config['num_epochs'],
  weight_decay=config['weight_decay'],
  load_best_model_at_end=True,
  metric_for_best_model="f1"
)

train, val, test = get_train_val_test_datasets()
trainer = transformers.Trainer(model=get_model(), args=args, train_dataset=train, eval_dataset=val, compute_metrics=metrics)

# Train the model.
trainer.train()

# Display model eval statistics.
print(trainer.evaluate())

# Test dataset metrics.
trainer.predict(test).metrics
Example #12
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = transformers.HfArgumentParser(
        (ModelArguments, DataTrainingArguments,
         transformers.TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    transformers.set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = transformers.AutoConfig.from_pretrained(
            model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = transformers.AutoConfig.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = transformers.CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = transformers.AutoModelForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = transformers.AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling).")

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets

    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
Example #13
    # Load data
    dlnd_train_dset, dlnd_valid_dset, dlnd_test_dset = DlndData(
    ).return_datasets()
    # Load model
    model = create_model()
    # Training
    training_args = transformers.TrainingArguments(
        evaluation_strategy='epoch',
        load_best_model_at_end=True,
        logging_dir='training_logs',
        logging_first_step=True,
        logging_steps=10,
        num_train_epochs=10,
        output_dir='training_results',
        per_device_eval_batch_size=BATCH_SIZE,
        per_device_train_batch_size=BATCH_SIZE,
        weight_decay=0.01,
        metric_for_best_model='accuracy',
        disable_tqdm=True,
    )
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=dlnd_train_dset,
        eval_dataset=dlnd_valid_dset,
        callbacks=[LogCallback],
    )
    trainer.train()
    trainer.evaluate()
Example #14
def run_training(args, train_data):

    ## Checkpoint Loading ########################################################
    if args.load:
        if '2700' in args.load:
            model = transformers.GPTNeoForCausalLM.from_pretrained(args.load)
        else:
            model = transformers.GPT2LMHeadModel.from_pretrained(args.load)
        print(f"Loaded model from {args.load}")
    else:
        if "EleutherAI" in args.arch:
            model = transformers.GPTNeoForCausalLM.from_pretrained(args.arch)
        else:
            model = transformers.GPT2LMHeadModel.from_pretrained(args.arch)

    if args.resume:
        raise NotImplementedError
        model = transformers.GPT2LMHeadModel.from_pretrained(args.resume)
        print(f"Loaded model from {args.resume}")
        start_epoch = 0
        start_iteration = int(args.resume.split("-")[-1])
        print("start_iteration = ", start_iteration)
    else:
        start_iteration = 0

    ## Dataloading ########################################################
    train_data.start_iteration = start_iteration

    ## Start Loop ########################################################
    print(f"Starting main loop")

    training_args = transformers.TrainingArguments(
        output_dir=args.save_dir,
        overwrite_output_dir=False,
        do_train=True,
        do_eval=False,
        do_predict=True,
        evaluation_strategy='no',
        eval_steps=0,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size_per_replica,
        gradient_accumulation_steps=args.grad_acc_steps,
        learning_rate=args.lr,
        weight_decay=0.05,
        # warmup_steps=args.lr_warmup_steps,
        # max_grad_norm=100000.0,
        logging_dir=args.save_dir,
        logging_first_step=True,
        logging_steps=args.log_freq,
        save_steps=args.save_freq,
        save_total_limit=2,
        dataloader_drop_last=True,
        dataloader_num_workers=3,
        local_rank=args.local_rank,
        deepspeed=args.deepspeed,
        fp16=args.fp16,
    )

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
    )
    trainer.remove_callback(transformers.integrations.TensorBoardCallback)
    trainer.add_callback(CustomTensorBoardCallback())

    trainer.train()

    if args.local_rank == 0:
        model.save_pretrained(os.path.join(args.save_dir, "final_checkpoint"))
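
CustomTensorBoardCallback is not included in this example. A minimal hedged sketch of such a replacement callback, built on the standard TrainerCallback hooks (the exact logging behaviour of the original is an assumption):

import transformers
from torch.utils.tensorboard import SummaryWriter

class CustomTensorBoardCallback(transformers.TrainerCallback):
    """Hypothetical minimal replacement for the default TensorBoardCallback."""

    def __init__(self):
        self.writer = None

    def on_train_begin(self, args, state, control, **kwargs):
        # Only the main process writes TensorBoard events.
        if state.is_world_process_zero:
            self.writer = SummaryWriter(log_dir=args.logging_dir)

    def on_log(self, args, state, control, logs=None, **kwargs):
        if self.writer is not None and logs is not None:
            for key, value in logs.items():
                if isinstance(value, (int, float)):
                    self.writer.add_scalar(key, value, state.global_step)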
Example #15
def run_full_influence_functions(
        mode: str,
        num_examples_to_test: int,
        s_test_num_samples: int = 1000) -> Dict[int, Dict[str, Any]]:

    if mode not in ["only-correct", "only-incorrect"]:
        raise ValueError(f"Unrecognized mode {mode}")

    tokenizer, model = misc_utils.create_tokenizer_and_model(
        constants.MNLI_MODEL_PATH)

    (mnli_train_dataset,
     mnli_eval_dataset) = misc_utils.create_datasets(task_name="mnli",
                                                     tokenizer=tokenizer)

    batch_train_data_loader = misc_utils.get_dataloader(mnli_train_dataset,
                                                        batch_size=128,
                                                        random=True)

    instance_train_data_loader = misc_utils.get_dataloader(mnli_train_dataset,
                                                           batch_size=1,
                                                           random=False)

    eval_instance_data_loader = misc_utils.get_dataloader(
        dataset=mnli_eval_dataset, batch_size=1, random=False)

    output_mode = glue_output_modes["mnli"]

    def build_compute_metrics_fn(task_name: str):
        def compute_metrics_fn(p):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Most of these arguments are placeholders
    # and are not really used at all, so ignore
    # the exact values of these.
    trainer = transformers.Trainer(
        model=model,
        args=TrainingArguments(output_dir="./tmp-output",
                               per_device_train_batch_size=128,
                               per_device_eval_batch_size=128,
                               learning_rate=5e-5,
                               logging_steps=100),
        data_collator=default_data_collator,
        train_dataset=mnli_train_dataset,
        eval_dataset=mnli_eval_dataset,
        compute_metrics=build_compute_metrics_fn("mnli"),
    )

    params_filter = [
        n for n, p in model.named_parameters() if not p.requires_grad
    ]

    weight_decay_ignores = ["bias", "LayerNorm.weight"] + [
        n for n, p in model.named_parameters() if not p.requires_grad
    ]

    model.cuda()
    num_examples_tested = 0
    outputs_collections = {}
    for test_index, test_inputs in enumerate(eval_instance_data_loader):
        if num_examples_tested >= num_examples_to_test:
            break

        # Skip when we only want correctly-predicted cases but the
        # prediction is incorrect, or vice versa.
        prediction_is_correct = misc_utils.is_prediction_correct(
            trainer=trainer, model=model, inputs=test_inputs)

        if mode == "only-correct" and prediction_is_correct is False:
            continue

        if mode == "only-incorrect" and prediction_is_correct is True:
            continue

        with Timer() as timer:
            influences, _, s_test = nn_influence_utils.compute_influences(
                n_gpu=1,
                device=torch.device("cuda"),
                batch_train_data_loader=batch_train_data_loader,
                instance_train_data_loader=instance_train_data_loader,
                model=model,
                test_inputs=test_inputs,
                params_filter=params_filter,
                weight_decay=constants.WEIGHT_DECAY,
                weight_decay_ignores=weight_decay_ignores,
                s_test_damp=5e-3,
                s_test_scale=1e4,
                s_test_num_samples=s_test_num_samples,
                train_indices_to_include=None,
                s_test_iterations=1,
                precomputed_s_test=None)

            outputs = {
                "test_index": test_index,
                "influences": influences,
                "s_test": s_test,
                "time": timer.elapsed,
                "correct": prediction_is_correct,
            }
            num_examples_tested += 1
            outputs_collections[test_index] = outputs

            remote_utils.save_and_mirror_scp_to_remote(
                object_to_save=outputs,
                file_name=f"KNN-recall.{mode}.{num_examples_to_test}.{test_index}.pth")
            print(
                f"Status: #{test_index} | {num_examples_tested} / {num_examples_to_test}"
            )

    return outputs_collections
Example #16
from ray import tune  # assumed imports for the Ray Tune Bayesian search used below
from ray.tune.suggest.bayesopt import BayesOptSearch

training_arg = transformers.TrainingArguments(num_train_epochs=8,
                                              learning_rate=5e-5,
                                              output_dir='scratch/adv312/',
                                              evaluation_strategy="epoch",
                                              per_device_train_batch_size=8)

## TODO: Initialize a transformers.Trainer object and run a Bayesian
## hyperparameter search for at least 5 trials (but not too many) on the
## learning rate. Hint: use the model_init() and
## compute_metrics() methods from finetuning_utils.py as arguments to
## Trainer().

trainer = transformers.Trainer(
    model_init=finetuning_utils.model_init,
    args=training_arg,
    compute_metrics=finetuning_utils.compute_metrics,
    train_dataset=train_data)

##Use the hp_space parameter in hyperparameter_search() to specify
## your hyperparameter search space. (Note that this parameter takes a function
## as its value.)

bestrun = trainer.hyperparameter_search(
    hp_space=lambda _: {"learning rate": tune.uniform(1e-5, 5e-5)},
    n_trials=3,
    search_alg=BayesOptSearch(),
    metric='eval_loss',
    mode='min')

## Also print out the run ID, objective value
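
# hyperparameter_search() returns a transformers BestRun; as a sketch, the run ID
# and objective requested in the comment above can be printed directly from it:
print(f"Best run ID: {bestrun.run_id}")
print(f"Best objective (eval_loss): {bestrun.objective}")
print(f"Best hyperparameters: {bestrun.hyperparameters}")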
Example #17
def imitator_main(mode: str,
                  num_examples_to_test: int) -> List[Dict[str, Any]]:
    if mode not in ["only-correct", "only-incorrect"]:
        raise ValueError(f"Unrecognized mode {mode}")

    task_tokenizer, task_model = misc_utils.create_tokenizer_and_model(
        constants.MNLI_MODEL_PATH)

    imitator_tokenizer, imitator_model = misc_utils.create_tokenizer_and_model(
        constants.MNLI_IMITATOR_MODEL_PATH)

    (mnli_train_dataset,
     mnli_eval_dataset) = misc_utils.create_datasets(task_name="mnli",
                                                     tokenizer=task_tokenizer)

    task_model.cuda()
    imitator_model.cuda()
    if task_model.training is True or imitator_model.training is True:
        raise ValueError("One of the model is in training mode")
    print(task_model.device, imitator_model.device)

    # Most of these arguments are placeholders
    # and are not really used at all, so ignore
    # the exact values of these.
    trainer = transformers.Trainer(
        model=task_model,
        args=TrainingArguments(output_dir="./tmp-output",
                               per_device_train_batch_size=128,
                               per_device_eval_batch_size=128,
                               learning_rate=5e-5,
                               logging_steps=100),
    )

    eval_instance_data_loader = misc_utils.get_dataloader(
        mnli_eval_dataset, batch_size=1, data_collator=default_data_collator)

    train_inputs_collections = torch.load(
        constants.MNLI_TRAIN_INPUT_COLLECTIONS_PATH)

    inputs_by_label: Dict[str, List[int]] = defaultdict(list)
    for i in range(len(train_inputs_collections)):
        label = mnli_train_dataset.label_list[train_inputs_collections[i]
                                              ["labels"]]
        inputs_by_label[label].append(i)

    outputs_collections = []
    for i, test_inputs in enumerate(eval_instance_data_loader):
        if mode == "only-correct" and i not in CORRECT_INDICES[:
                                                               num_examples_to_test]:
            continue
        if mode == "only-incorrect" and i not in INCORRECT_INDICES[:
                                                                   num_examples_to_test]:
            continue

        start_time = time.time()
        for using_ground_truth in [True, False]:
            outputs = run_one_imitator_experiment(
                task_model=task_model,
                imitator_model=imitator_model,
                test_inputs=test_inputs,
                trainer=trainer,
                train_dataset=mnli_train_dataset,
                train_inputs_collections=train_inputs_collections,
                inputs_by_label=inputs_by_label,
                finetune_using_ground_truth_label=using_ground_truth)
            outputs["index"] = i
            outputs_collections.append(outputs)

        end_time = time.time()
        print(f"#{len(outputs_collections)}/{len(outputs_collections)}: "
              f"Elapsed {(end_time - start_time) / 60:.2f}")

    torch.save(outputs_collections,
               f"imiator_experiments.{mode}.{num_examples_to_test}.pt")

    return outputs_collections
Example #18
def train_bert(corpus_path, hebrew_model=False):
    """
    Bert model training
    :param corpus_path: Corpus to train Bert on
    :param hebrew_model: Model in Hebrew or not
    :return: The name of the new trained model
    """
    language = 'hebrew' if hebrew_model else 'english'
    df = pd.read_csv(corpus_path)
    corpus_name = get_corpus_name(corpus_path)
    print("Preprocess...")
    if hebrew_model:
        model_name, vocab, raw_text_file = preprocess_hebrew(df, corpus_name)
    else:
        model_name, vocab, raw_text_file = preprocess_english(df, corpus_name)
        pass

    print("Cuda availability :", torch.cuda.is_available())
    print("Getting tokenizer...")
    tokenizer = transformers.AutoTokenizer.from_pretrained(conf.bert_model[language], use_fast=True)
    model = transformers.AutoModelForMaskedLM.from_pretrained(conf.bert_model[language]).to('cuda')

    tokenizer.add_tokens(vocab)
    model.resize_token_embeddings(len(tokenizer))

    if os.path.exists(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name):
        shutil.rmtree(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)

    os.mkdir(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)
    tokenizer.save_pretrained(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)

    print("Tokenizing...")
    dataset = transformers.LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=raw_text_file,
        block_size=128,
    )

    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    training_args = transformers.TrainingArguments(
        output_dir=conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name,
        overwrite_output_dir=True,
        num_train_epochs=20,
        per_device_train_batch_size=16,
        save_steps=300,
        logging_steps=100,
        save_total_limit=3,
    )

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset
    )
    print("Begin training...")
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    trainer.train()
    trainer.save_model(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)
    print('The model has been saved under : ', conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)

    return conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name
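
The function returns the directory where both the tokenizer and the fine-tuned masked-LM model were saved, so a short hedged usage sketch with the fill-mask pipeline (the corpus path below is a placeholder):

model_path = train_bert('corpus.csv', hebrew_model=False)
fill_mask = transformers.pipeline("fill-mask", model=model_path, tokenizer=model_path)
print(fill_mask("The goal of language modeling is to predict the [MASK] word."))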
Example #19
 def testModel(self,
               train_val_split_iterator: typing.Iterator = [
                   sklearn.model_selection.train_test_split
               ],
               **kwargs):
     logger.info("Starting testing of RobertaModel")
     num_epochs = kwargs['epochs']
     batch_size = kwargs['batch_size']
     for i, train_test_split in enumerate(train_val_split_iterator):
         logger.debug(
             f'{i}-th enumeration of train_val split iterator under cross validation'
         )
         self.model = self.createModel()
         # optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
         # loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
         if callable(getattr(self.model, 'compile', None)):  # if tf model
             train_dataset, val_dataset = self.pipeLine.getEncodedDataset(
                 train_test_split, batch_size=batch_size)
             # self.model.compile(optimizer=optimizer, loss=loss, metrics=self._registeredMetrics)
             # self.model.fit(train_dataset, epochs=num_epochs)
             training_args = transformers.TFTrainingArguments(
                 output_dir=f'./results/{self._modelName}',  # output directory
                 num_train_epochs=num_epochs,  # total number of training epochs
                 per_device_train_batch_size=batch_size,  # batch size per device during training
                 per_device_eval_batch_size=batch_size,  # batch size for evaluation
                 warmup_steps=kwargs['warmup_steps'],  # number of warmup steps for learning rate scheduler
                 weight_decay=kwargs['weight_decay'],  # strength of weight decay
                 logging_dir='./logs',  # directory for storing logs
             )
             trainer = transformers.TFTrainer(
                 model=self.model,  # the instantiated 🤗 Transformers model to be trained
                 args=training_args,  # training arguments, defined above
                 train_dataset=train_dataset,  # tensorflow_datasets training dataset
                 eval_dataset=val_dataset,  # tensorflow_datasets evaluation dataset
                 compute_metrics=get_compute_metrics(self._registeredMetrics),  # metrics to compute while training
             )
         else:  # if pytorch model
             train_dataset, val_dataset = self.pipeLine.getEncodedDataset(
                 train_test_split,
                 batch_size=batch_size,
                 tfOrPyTorch=torchOrTFEnum.TORCH)
             training_args = transformers.TrainingArguments(
                 output_dir=f'./results/{self._modelName}',  # output directory
                 num_train_epochs=num_epochs,  # total number of training epochs
                 per_device_train_batch_size=batch_size,  # batch size per device during training
                 per_device_eval_batch_size=batch_size,  # batch size for evaluation
                 warmup_steps=kwargs['warmup_steps'],  # number of warmup steps for learning rate scheduler
                 weight_decay=kwargs['weight_decay'],  # strength of weight decay
                 logging_dir='./logs',  # directory for storing logs
                 logging_steps=10,
             )
             trainer = transformers.Trainer(
                 model=self.model,  # the instantiated 🤗 Transformers model to be trained
                 args=training_args,  # training arguments, defined above
                 train_dataset=train_dataset,  # training dataset
                 eval_dataset=val_dataset,  # evaluation dataset
                 compute_metrics=get_compute_metrics(self._registeredMetrics),  # metrics to compute while training
             )
         # Train whichever trainer (TF or PyTorch) was built above.
         trainer.train()