Example no. 1
def train():
    """Trains a BERT ethicality classifer."""

    args = transformers.TrainingArguments(
        "saved_models",
        evaluation_strategy="epoch",
        learning_rate=config['learning_rate'],
        per_device_train_batch_size=config['batch_size'],
        per_device_eval_batch_size=config['batch_size'],
        num_train_epochs=config['num_epochs'],
        weight_decay=config['weight_decay'],
        load_best_model_at_end=True,
        metric_for_best_model="f1")

    train, val, test = get_train_val_test_datasets()
    trainer = transformers.Trainer(model=get_model(),
                                   args=args,
                                   train_dataset=train,
                                   eval_dataset=val,
                                   compute_metrics=metrics)

    # Train the model.
    trainer.train()

    # Display model eval statistics.
    print(trainer.evaluate())

    # Test dataset metrics.
    print(trainer.predict(test).metrics)
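
The metrics callable passed as compute_metrics above comes from a utils module that is not shown in these snippets (see Example no. 11). A minimal sketch of what such a function might look like, assuming a binary classifier and scikit-learn; the project's actual implementation may differ:

import numpy as np
import sklearn.metrics


def metrics(eval_pred):
    # Turn logits into class predictions and report the metrics that the
    # TrainingArguments above rely on (metric_for_best_model="f1").
    predictions = np.argmax(eval_pred.predictions, axis=-1)
    labels = eval_pred.label_ids
    return {
        "accuracy": sklearn.metrics.accuracy_score(labels, predictions),
        "f1": sklearn.metrics.f1_score(labels, predictions),
    }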
Example no. 2
def _get_train_args(self, nepochs: int, eval_every: int, batch_size: int,
                    save_every: int) -> transformers.TrainingArguments:
    training_arguments = transformers.TrainingArguments(
        output_dir=self.checkpoints_dir,
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        evaluation_strategy="steps",
        eval_steps=eval_every,
        save_steps=save_every,
        num_train_epochs=nepochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        save_total_limit=3,
    )
    return training_arguments
Example no. 3
        metrics[f"{average}_f1"] = f1
    metrics["accuracy"] = sklearn.metrics.accuracy_score(labels, predictions)
    return metrics


_dir = pathlib.Path().resolve() / uuid.uuid4().hex
_dir.mkdir()
_dir = str(_dir)
args = transformers.TrainingArguments(
    output_dir=f"{_dir}/output",
    num_train_epochs=32,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    logging_dir=f"{_dir}/logging",
    logging_steps=256,
    dataloader_num_workers=64,
    evaluation_strategy="steps",
    eval_steps=256,
    save_steps=256,
    fp16=True,
    fp16_opt_level="O3",
    learning_rate=5e-4,
    run_name=_dir,
)

model = transformers.AlbertForSequenceClassification.from_pretrained(
    "albert-large-v2", num_labels=2)
tokenizer = transformers.AlbertTokenizerFast.from_pretrained("albert-large-v2")
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer,
                                                     pad_to_multiple_of=32)
trainer = transformers.Trainer(
    args=args,
Example no. 4
import argparse
import transformers

parser = argparse.ArgumentParser()
parser.add_argument('--vocab', type=str)
parser.add_argument('--model', type=str)
parser.add_argument('--data', type=str)
args = parser.parse_args()

tokenizer = transformers.BertTokenizer(vocab_file=args.vocab,
                                       do_lower_case=False,
                                       do_basic_tokenize=True)
model = transformers.BertForMaskedLM.from_pretrained(args.model)

dataset = transformers.LineByLineTextDataset(tokenizer=tokenizer,
                                             file_path=args.data,
                                             block_size=128)
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
train_args = transformers.TrainingArguments(
    per_device_eval_batch_size=16, output_dir=f"/tmp/echau18/{args.model}")
trainer = transformers.Trainer(model=model,
                               eval_dataset=dataset,
                               data_collator=data_collator,
                               prediction_loss_only=True,
                               args=train_args)

eval_output = trainer.evaluate()
print(eval_output)
Example no. 5
def main():
    parser = ArgumentParser()
    parser.add_argument('--corpus_dir', required=True)
    parser.add_argument('--text_column')
    parser.add_argument('--model_name')
    parser.add_argument('--max_seq_length', type=int)
    parser.add_argument('--num_epochs', type=int)
    parser.add_argument('--learning_rate', type=float, default=1e-5)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--output_dir')
    args = parser.parse_args()
    torch.manual_seed(42)

    logging.basicConfig(level=logging.INFO)

    corpus_dir = args.corpus_dir
    text_column_name = args.text_column
    model_name = args.model_name
    max_seq_length = args.max_seq_length
    num_epochs = args.num_epochs
    learning_rate = args.learning_rate
    batch_size = args.batch_size
    output_dir = args.output_dir

    train_df_1 = pd.read_csv(os.path.join(
        corpus_dir, "train_{}.csv".format("sentiments_cloudvision")),
                             encoding="utf-8")
    print("Train_1", train_df_1.shape)
    train_df_1.dropna(subset=[text_column_name], inplace=True)
    print("Train_1", train_df_1.shape)
    val_df_1 = pd.read_csv(os.path.join(
        corpus_dir, "val_{}.csv".format("sentiments_cloudvision")),
                           encoding="utf-8")
    print("Val_1", val_df_1.shape)
    val_df_1.dropna(subset=[text_column_name], inplace=True)
    print("Val_1", val_df_1.shape)
    train_df_2 = pd.read_csv(os.path.join(
        corpus_dir, "train_{}.csv".format("topics_cloudvision")),
                             encoding="utf-8")
    print("Train_2", train_df_2.shape)
    train_df_2.dropna(subset=[text_column_name], inplace=True)
    print("Train_2", train_df_2.shape)
    val_df_2 = pd.read_csv(os.path.join(
        corpus_dir, "val_{}.csv".format("topics_cloudvision")),
                           encoding="utf-8")
    print("Val_2", val_df_2.shape)
    val_df_2.dropna(subset=[text_column_name], inplace=True)
    print("Val_2", val_df_2.shape)
    train_dfs = {"task_1": train_df_1, "task_2": train_df_2}
    val_dfs = {"task_1": val_df_1, "task_2": val_df_2}
    dataset_dict_1, id_to_class_1 = load_dataset(train_df_1, val_df_1,
                                                 text_column_name)
    dataset_dict_2, id_to_class_2 = load_dataset(train_df_2, val_df_2,
                                                 text_column_name)
    classes_list_1 = []
    for i in range(len(id_to_class_1.keys())):
        class_label = id_to_class_1[i]
        classes_list_1.append(class_label)
    classes_list_2 = []
    for i in range(len(id_to_class_2.keys())):
        class_label = id_to_class_2[i]
        classes_list_2.append(class_label)
    dataset_dict = {"task_1": dataset_dict_1, "task_2": dataset_dict_2}
    id_to_class_dicts = {"task_1": id_to_class_1, "task_2": id_to_class_2}
    id_to_class = {"task_1": classes_list_1, "task_2": classes_list_2}

    multitask_model = MultitaskModel.create(
        model_name=model_name,
        model_type_dict={
            "task_1": transformers.AutoModelForSequenceClassification,
            "task_2": transformers.AutoModelForSequenceClassification,
        },
        model_config_dict={
            "task_1":
            transformers.AutoConfig.from_pretrained(
                model_name,
                num_labels=len(id_to_class_dicts["task_1"].keys())),
            "task_2":
            transformers.AutoConfig.from_pretrained(
                model_name,
                num_labels=len(id_to_class_dicts["task_2"].keys())),
        },
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    feature_fn = convert_features_function(tokenizer, max_seq_length)
    convert_func_dict = {
        "task_1": feature_fn,
        "task_2": feature_fn,
    }
    columns_dict = {
        "task_1": ['input_ids', 'attention_mask', 'labels'],
        "task_2": ['input_ids', 'attention_mask', 'labels'],
    }
    features_dict = data_to_features(dataset_dict, convert_func_dict,
                                     columns_dict)

    train_dataset = {
        task_name: dataset["train"]
        for task_name, dataset in features_dict.items()
    }
    val_dataset_dict = {
        task_name: dataset["validation"]
        for task_name, dataset in features_dict.items()
    }

    trainer = MultitaskTrainer(
        model=multitask_model,
        args=transformers.TrainingArguments(
            output_dir="./models/multitask_model",
            overwrite_output_dir=True,
            learning_rate=learning_rate,
            do_train=True,
            num_train_epochs=num_epochs,
            # Adjust batch size if this doesn't fit on the Colab GPU
            per_device_train_batch_size=batch_size,
            save_steps=3000,
        ),
        # compute_metrics=classification_metrics,
        data_collator=NLPDataCollator(),
        train_dataset=train_dataset,
        eval_dataset=val_dataset_dict)
    trainer.train()

    validation_results = evaluate_classification(trainer, features_dict,
                                                 dataset_dict)
    for task_name, results_dict in validation_results.items():
        for metric_name, value in results_dict.items():
            print(f"Validation quality: After training, task: {task_name},"
                  f" {metric_name} : {value}")
    training_results = evaluate_classification(trainer,
                                               features_dict,
                                               dataset_dict,
                                               collection="train")
    for task_name, results_dict in training_results.items():
        for metric_name, value in results_dict.items():
            print(f"Training quality: After training, task: {task_name},"
                  f" {metric_name} : {value}")

    validation_predictions = get_predictions(trainer,
                                             features_dict,
                                             id_to_class,
                                             collection="validation")
    train_predictions = get_predictions(trainer,
                                        features_dict,
                                        id_to_class,
                                        collection="train")
    # print("Pred train", train_predictions.shape)
    # print("Pred val", validation_predictions.shape)
    # train_embeddings = get_last_layer_embedding(multitask_model, trainer, features_dict, collection="train")
    # validation_embeddings = get_last_layer_embedding(multitask_model, trainer, features_dict, collection="validation")

    train_embeddings = get_embeddings(
        multitask_model,
        features_dict,
        collection="train",
    )
    validation_embeddings = get_embeddings(
        multitask_model,
        features_dict,
        collection="validation",
    )
    # print("Embe train", train_embeddings.shape)
    # print("Embe val", validation_embeddings.shape)

    for task_name in ["task_1", "task_2"]:
        train_df = train_dfs[task_name]
        prediction_df = train_predictions[task_name]
        cls_emb_df = train_embeddings[task_name]["cls"]
        mean_emb_df = train_embeddings[task_name]["mean"]
        train_df = pd.concat(
            [train_df, prediction_df, cls_emb_df, mean_emb_df],
            axis=1,
        )
        output_path = os.path.join(output_dir, task_name, "train.csv")
        d = os.path.dirname(output_path)
        if not os.path.exists(d):
            os.makedirs(d)
        prediction_df.to_csv(os.path.join(output_dir, task_name,
                                          "tr_prediction.csv"),
                             encoding="utf-8",
                             index=False)
        cls_emb_df.to_csv(os.path.join(output_dir, task_name,
                                       "tr_cls_emb.csv"),
                          encoding="utf-8",
                          index=False)
        mean_emb_df.to_csv(os.path.join(output_dir, task_name,
                                        "tr_mean_emb.csv"),
                           encoding="utf-8",
                           index=False)
        train_df.to_csv(output_path, encoding="utf-8", index=False)

        val_df = val_dfs[task_name]
        prediction_df = validation_predictions[task_name]
        cls_emb_df = validation_embeddings[task_name]["cls"]
        mean_emb_df = validation_embeddings[task_name]["mean"]
        val_df = pd.concat([val_df, prediction_df, cls_emb_df, mean_emb_df],
                           axis=1)
        output_path = os.path.join(output_dir, task_name, "val.csv")
        d = os.path.dirname(output_path)
        if not os.path.exists(d):
            os.makedirs(d)
        prediction_df.to_csv(os.path.join(output_dir, task_name,
                                          "val_prediction.csv"),
                             encoding="utf-8",
                             index=False)
        cls_emb_df.to_csv(os.path.join(output_dir, task_name,
                                       "val_cls_emb.csv"),
                          encoding="utf-8",
                          index=False)
        mean_emb_df.to_csv(os.path.join(output_dir, task_name,
                                        "val_mean_emb.csv"),
                           encoding="utf-8",
                           index=False)
        val_df.to_csv(output_path, encoding="utf-8", index=False)
Example no. 6
from utils import get_timestamp

logging.basicConfig(level=logging.ERROR)
torch.manual_seed(42)


trainer = MultitaskTrainer(
    model=multitask_model,
    args=transformers.TrainingArguments(
        output_dir="./models/multitask_model",
        overwrite_output_dir=True,
        learning_rate=2e-5,
        do_train=True,
        do_eval=True,
        # evaluation_strategy ="steps",
        num_train_epochs=epochs,
        fp16=True,
        # Adjust batch size if this doesn't fit on the Colab GPU
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        save_steps=3000,
        # eval_steps=50,
        load_best_model_at_end=True,
    ),
    data_collator=NLPDataCollator(tokenizer=tokenizer),
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    callbacks=[]
)
# train the model
trainer.train()
Example no. 7
        )

        return data_loader


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir')
    default_training_args = vars(
        transformers.TrainingArguments(
            output_dir="./models/rubert_cased_nplus1",
            overwrite_output_dir=True,
            do_train=True,
            do_eval=True,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=128,
            num_train_epochs=5,
            learning_rate=2e-5,
            logging_steps=500,
            logging_first_step=True,
            save_steps=1000,
            evaluate_during_training=True,
        ))
    for k, v in default_training_args.items():
        parser.add_argument('--' + k, default=v, type=type(v))
    args = parser.parse_args()
    training_args_dict = {
        k: v
        for k, v in vars(args).items() if k in default_training_args
    }

    data_dir = args.data_dir
Example no. 8
from data import features_dict

##UPDATE THIS
multitask_model.load_state_dict(torch.load("src/models/{}/pytorch_model.bin"))

trainer = MultitaskTrainer(
    model=multitask_model,
    args=transformers.TrainingArguments(
        learning_rate=learning_rate,
        output_dir="/tmp",
        do_train=False,
        do_eval=True,
        # evaluation_strategy ="steps",
        num_train_epochs=epochs,
        fp16=True,
        # Adjust batch size if this doesn't fit on the Colab GPU
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        save_steps=3000,
        # eval_steps=50,
        load_best_model_at_end=True,
    ),
    data_collator=NLPDataCollator(tokenizer=tokenizer),
    callbacks=[])

tests_dict = {}
for task_name in ["document", "paragraph", "sentence"]:
    test_dataloader = DataLoaderWithTaskname(
        task_name,
        trainer.get_eval_dataloader(features_dict[task_name]["test"]))
Example no. 9
    pad_token_id=t_tokenizer.pad_token_id,
    bos_token_id=t_tokenizer.bos_token_id,
    eos_token_id=t_tokenizer.eos_token_id,
    sep_token_id=t_tokenizer.sep_token_id)

# Create the ALBERT language model
albert_model = AutoModelForMaskedLM.from_config(albert_config)
# albert_model = AlbertForMaskedLM.from_pretrained("/home/hedan/tools/Github/NLP_Based_Transformer/model/checkpoint-5000")
# albert_model.resize_token_embeddings(len(t_tokenizer))

# Configure the training arguments
train_args = transformers.TrainingArguments(output_dir="./model",
                                            do_train=True,
                                            logging_steps=50,
                                            learning_rate=0.001,
                                            num_train_epochs=30,
                                            save_steps=1000,
                                            per_device_train_batch_size=32,
                                            lr_scheduler_type="polynomial",
                                            dataloader_num_workers=4)

# t = t_DataCollator(h["input_ids"])
# x = albert_model(torch.tensor(h["input_ids"]))
# Train
trainer = Trainer(model=albert_model,
                  args=train_args,
                  train_dataset=t_dataset,
                  tokenizer=t_tokenizer,
                  data_collator=t_DataCollator)

trainer.train()
Example no. 10
    n_layer=3,
    n_head=3
)

model = transformers.GPT2LMHeadModel(config=config)

print("Training Model...")

writer = SummaryWriter()

training_args = transformers.TrainingArguments(
    output_dir="models/gpt2/",
    do_train=True,
    do_eval=True,
    evaluate_during_training=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    logging_first_step=True,
    save_steps=2000,
    save_total_limit=2,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_set,
    eval_dataset=valid_set,
    prediction_loss_only=True,
    tb_writer=writer
)
Example no. 11
import torch
import transformers

from cfg import config
from data import get_train_val_test_datasets
from models import get_model
from utils import metrics


args = transformers.TrainingArguments(
  "saved_models",
  evaluation_strategy="epoch",
  learning_rate=config['learning_rate'],
  per_device_train_batch_size=config['batch_size'],
  per_device_eval_batch_size=config['batch_size'],
  num_train_epochs=config['num_epochs'],
  weight_decay=config['weight_decay'],
  load_best_model_at_end=True,
  metric_for_best_model="f1"
)

train, val, test = get_train_val_test_datasets()
trainer = transformers.Trainer(model=get_model(), args=args, train_dataset=train, eval_dataset=val, compute_metrics=metrics)

# Train the model.
trainer.train()

# Display model eval statistics.
print(trainer.evaluate())

# Test dataset metrics.
Example no. 12
def main():
    args = get_args()

    dataset_dict = {
        "stsb": nlp.load_dataset('glue', name="stsb"),
        "rte": nlp.load_dataset('glue', name="rte"),
        "commonsense_qa": nlp.load_dataset('commonsense_qa'),
    }

    for task_name, dataset in dataset_dict.items():
        print(task_name)
        print(dataset_dict[task_name]["train"][0])
        print()

    multitask_model = MultitaskModel.create(
        model_name=model_name,
        model_type_dict={
            "stsb": transformers.AutoModelForSequenceClassification,
            "rte": transformers.AutoModelForSequenceClassification,
            "commonsense_qa": transformers.AutoModelForMultipleChoice,
        },
        model_config_dict={
            "stsb":
            transformers.AutoConfig.from_pretrained(model_name, num_labels=1),
            "rte":
            transformers.AutoConfig.from_pretrained(model_name, num_labels=2),
            "commonsense_qa":
            transformers.AutoConfig.from_pretrained(model_name),
        })

    if model_name.startswith("roberta-"):
        print(multitask_model.encoder.embeddings.word_embeddings.weight.
              data_ptr())
        print(multitask_model.taskmodels_dict["stsb"].roberta.embeddings.
              word_embeddings.weight.data_ptr())
        print(multitask_model.taskmodels_dict["rte"].roberta.embeddings.
              word_embeddings.weight.data_ptr())
        print(multitask_model.taskmodels_dict["commonsense_qa"].roberta.
              embeddings.word_embeddings.weight.data_ptr())

    convert_func_dict = {
        "stsb": convert_to_stsb_features,
        "rte": convert_to_rte_features,
        "commonsense_qa": convert_to_commonsense_qa_features,
    }

    columns_dict = {
        "stsb": ['input_ids', 'attention_mask', 'labels'],
        "rte": ['input_ids', 'attention_mask', 'labels'],
        "commonsense_qa": ['input_ids', 'attention_mask', 'labels'],
    }

    features_dict = {}
    for task_name, dataset in dataset_dict.items():
        features_dict[task_name] = {}
        for phase, phase_dataset in dataset.items():
            features_dict[task_name][phase] = phase_dataset.map(
                convert_func_dict[task_name],
                batched=True,
                load_from_cache_file=False,
            )
            print(task_name, phase, len(phase_dataset),
                  len(features_dict[task_name][phase]))
            features_dict[task_name][phase].set_format(
                type="torch",
                columns=columns_dict[task_name],
            )
            print(task_name, phase, len(phase_dataset),
                  len(features_dict[task_name][phase]))

    train_dataset = {
        task_name: dataset["train"]
        for task_name, dataset in features_dict.items()
    }
    trainer = MultitaskTrainer(
        model=multitask_model,
        args=transformers.TrainingArguments(
            output_dir=args.job_dir,
            overwrite_output_dir=True,
            learning_rate=1e-5,
            do_train=True,
            num_train_epochs=3,
            per_device_train_batch_size=args.batch_size,
            save_steps=3000,
        ),
        data_collator=NLPDataCollator(),
        train_dataset=train_dataset,
    )
    trainer.train()

    preds_dict = {}
    for task_name in ["rte", "stsb", "commonsense_qa"]:
        eval_dataloader = DataLoaderWithTaskname(
            task_name,
            trainer.get_eval_dataloader(
                eval_dataset=features_dict[task_name]["validation"]))
        print(eval_dataloader.data_loader.collate_fn)
        preds_dict[task_name] = trainer._prediction_loop(
            eval_dataloader,
            description=f"Validation: {task_name}",
        )

    # Evaluate RTE
    nlp.load_metric('glue', name="rte").compute(
        np.argmax(preds_dict["rte"].predictions, axis=1),
        preds_dict["rte"].label_ids,
    )

    # Evaluate STS-B
    nlp.load_metric('glue', name="stsb").compute(
        preds_dict["stsb"].predictions.flatten(),
        preds_dict["stsb"].label_ids,
    )

    # Evaluate Commonsense QA
    np.mean(
        np.argmax(preds_dict["commonsense_qa"].predictions, axis=1) ==
        preds_dict["commonsense_qa"].label_ids)
Example no. 13
transformers.logging.set_verbosity_debug()

if __name__ == '__main__':
    # Load data
    dlnd_train_dset, dlnd_valid_dset, dlnd_test_dset = DlndData(
    ).return_datasets()
    # Load model
    model = create_model()
    # Training
    training_args = transformers.TrainingArguments(
        evaluation_strategy='epoch',
        load_best_model_at_end=True,
        logging_dir='training_logs',
        logging_first_step=True,
        logging_steps=10,
        num_train_epochs=10,
        output_dir='training_results',
        per_device_eval_batch_size=BATCH_SIZE,
        per_device_train_batch_size=BATCH_SIZE,
        weight_decay=0.01,
        metric_for_best_model='accuracy',
        disable_tqdm=True,
    )
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=dlnd_train_dset,
        eval_dataset=dlnd_valid_dset,
        callbacks=[LogCallback],
    )
    trainer.train()
Example no. 14
def run_training(args, train_data):

    ## Checkpoint Loading ########################################################
    if args.load:
        if '2700' in args.load:
            model = transformers.GPTNeoForCausalLM.from_pretrained(args.load)
        else:
            model = transformers.GPT2LMHeadModel.from_pretrained(args.load)
        print(f"Loaded model from {args.load}")
    else:
        if "EleutherAI" in args.arch:
            model = transformers.GPTNeoForCausalLM.from_pretrained(args.arch)
        else:
            model = transformers.GPT2LMHeadModel.from_pretrained(args.arch)

    if args.resume:
        raise NotImplementedError
        model = transformers.GPT2LMHeadModel.from_pretrained(args.resume)
        print(f"Loaded model from {args.resume}")
        start_epoch = 0
        start_iteration = int(args.resume.split("-")[-1])
        print("start_iteration = ", start_iteration)
    else:
        start_iteration = 0

    ## Dataloading ########################################################
    train_data.start_iteration = start_iteration

    ## Start Loop ########################################################
    print(f"Starting main loop")

    training_args = transformers.TrainingArguments(
        output_dir=args.save_dir,
        overwrite_output_dir=False,
        do_train=True,
        do_eval=False,
        do_predict=True,
        evaluation_strategy='no',
        eval_steps=0,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size_per_replica,
        gradient_accumulation_steps=args.grad_acc_steps,
        learning_rate=args.lr,
        weight_decay=0.05,
        # warmup_steps=args.lr_warmup_steps,
        # max_grad_norm=100000.0,
        logging_dir=args.save_dir,
        logging_first_step=True,
        logging_steps=args.log_freq,
        save_steps=args.save_freq,
        save_total_limit=2,
        dataloader_drop_last=True,
        dataloader_num_workers=3,
        local_rank=args.local_rank,
        deepspeed=args.deepspeed,
        fp16=args.fp16,
    )

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
    )
    trainer.remove_callback(transformers.integrations.TensorBoardCallback)
    trainer.add_callback(CustomTensorBoardCallback())

    trainer.train()

    if args.local_rank == 0:
        model.save_pretrained(os.path.join(args.save_dir, "final_checkpoint"))
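
CustomTensorBoardCallback is defined elsewhere in this project and is not shown in the snippet above. As a rough sketch only, a replacement built on the public transformers.TrainerCallback API could look like the following; the exact logging behaviour is an assumption, not the project's actual implementation:

import transformers
from torch.utils.tensorboard import SummaryWriter


class CustomTensorBoardCallback(transformers.TrainerCallback):
    """Hypothetical stand-in that forwards Trainer logs to TensorBoard."""

    def __init__(self):
        self.writer = SummaryWriter()

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Write each scalar the Trainer reports at the current global step.
        if state.is_world_process_zero and logs:
            for key, value in logs.items():
                if isinstance(value, (int, float)):
                    self.writer.add_scalar(key, value, state.global_step)

    def on_train_end(self, args, state, control, **kwargs):
        self.writer.close()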
Example no. 15
def train(args):
    logging.basicConfig(level=logging.INFO)
    tokenizer = transformers.AlbertTokenizer.from_pretrained(
        'albert-base-v2', cache_dir=cache_dir)
    albert_for_math_config = transformers.AlbertConfig(
        hidden_size=768,
        num_attention_heads=12,
        intermediate_size=3072,
    )

    if args['--load']:
        model = transformers.AlbertForMaskedLM.from_pretrained(
            args['--load-from'])
        training_args = transformers.TrainingArguments(
            output_dir=args['--save-to'],
            overwrite_output_dir=True,
            num_train_epochs=int(args['--max-epoch']),
            per_gpu_train_batch_size=int(args['--batch-size']),
            per_gpu_eval_batch_size=int(args['--batch-size']),
            logging_steps=int(args['--log-every']),
            save_steps=int(args['--save-every']),
            save_total_limit=10,
            learning_rate=float(args['--lr']),
            seed=int(args['--seed']),
        )

    else:
        model = transformers.AlbertForMaskedLM(albert_for_math_config)
        training_args = transformers.TrainingArguments(
            output_dir=args['--save-to'],
            num_train_epochs=int(args['--max-epoch']),
            per_gpu_train_batch_size=int(args['--batch-size']),
            per_gpu_eval_batch_size=int(args['--batch-size']),
            logging_steps=int(args['--log-every']),
            save_steps=int(args['--save-every']),
            save_total_limit=10,
            learning_rate=float(args['--lr']),
            seed=int(args['--seed']),
        )

    #load datasets
    print('Loading Data...')
    train_data = torch.load(
        './data/train_data_train-easy_algebra__linear_1d.pt')
    dev_data = torch.load('./data/dev_data_train-easy_algebra__linear_1d.pt')
    print('Finished loading data')
    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    model.to(device)
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=AnswerMaskDataCollator(tokenizer),
        train_dataset=train_data,
        eval_dataset=dev_data,
        prediction_loss_only=True,
    )

    if args['--load']:
        trainer.train(model_path=args['--load-from'])
    else:
        trainer.train()
Example no. 16
    pd.read_json(f"{args.data_dir}/val.jsonl", lines=True, orient="records"),
    test_size=0.5,
)

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
train_data = boolq.BoolQDataset(train_df, tokenizer)
val_data = boolq.BoolQDataset(val_df, tokenizer)
test_data = boolq.BoolQDataset(test_df, tokenizer)

## TODO: Initialize a transformers.TrainingArguments object here for use in
## training and tuning the model. Consult the assignment handout for some
## sample hyperparameter values.

training_arg = transformers.TrainingArguments(num_train_epochs=8,
                                              learning_rate=5e-5,
                                              output_dir='scratch/adv312/',
                                              evaluation_strategy="epoch",
                                              per_device_train_batch_size=8)

## TODO: Initialize a transformers.Trainer object and run a Bayesian
## hyperparameter search for at least 5 trials (but not too many) on the
## learning rate. Hint: use the model_init() and
## compute_metrics() methods from finetuning_utils.py as arguments to
## Trainer().

trainer = transformers.Trainer(
    model_init=finetuning_utils.model_init,
    args=training_arg,
    compute_metrics=finetuning_utils.compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data)
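
The second TODO asks for a Bayesian hyperparameter search over the learning rate, which the snippet stops before running. A minimal sketch using Trainer.hyperparameter_search, assuming the Optuna backend is installed; the search bounds and trial count are illustrative, not taken from the assignment handout:

def hp_space(trial):
    # Search only over the learning rate, as the TODO suggests.
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
    }


best_run = trainer.hyperparameter_search(
    hp_space=hp_space,
    backend="optuna",
    n_trials=5,
    direction="maximize",
)
print(best_run)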
Example no. 17
def run_training(args, train_data):
    
    if not args.save_steps:
        # Save every epoch
        if not args.tpu_num_cores:
            save_steps = len(train_data) 
            save_steps = int(save_steps / torch.cuda.device_count())
            save_steps = int(save_steps / args.grad_acc_steps)
            save_steps = int(save_steps / args.batch_size_per_replica)
        else:
            save_steps = len(train_data) 
            save_steps = int(save_steps / 8) # 8 TPU cores is constant for now.
            save_steps = int(save_steps / args.grad_acc_steps)
            save_steps = int(save_steps / args.batch_size_per_replica)
    else:
        save_steps = args.save_steps

    print("Save Steps = ", save_steps)

    ## Checkpoint Loading ######################################################## 
    if args.load:
        model = transformers.GPT2LMHeadModel.from_pretrained(args.load)
        print(f"Loaded model from {args.load}")
    else:
        model = transformers.GPT2LMHeadModel.from_pretrained(args.arch)

    start_epoch = 0
    start_iteration = 0
    
    ## Dataloading ######################################################## 
    train_data.start_iteration = start_iteration

    ## Start Loop ########################################################
    print(f"Setting up Trainer")

    training_args = transformers.TrainingArguments(
        output_dir=args.save_dir,
        overwrite_output_dir=False,

        do_train=True,
        do_eval=False,
        do_predict=True,
        evaluation_strategy='no',
        eval_steps=0, 

        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size_per_replica,
        gradient_accumulation_steps=args.grad_acc_steps,

        learning_rate=args.lr,
        weight_decay=args.weight_decay,
        warmup_steps=args.lr_warmup_steps,
        max_grad_norm=100000.0, # Essentially disable gradient clipping

        logging_dir=args.save_dir, 
        logging_first_step=True,
        logging_steps=args.log_freq,
        save_steps=save_steps,
        save_total_limit=10,  # Keep at most the 10 most recent checkpoints

        dataloader_drop_last=True,
        dataloader_num_workers=args.dataloader_num_workers,

        local_rank=args.local_rank,
        tpu_num_cores=args.tpu_num_cores,
    )

    trainer = GPT2Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
    )
    trainer.remove_callback(transformers.integrations.TensorBoardCallback)
    trainer.add_callback(CustomTensorBoardCallback())

    print(f"STARTING TRAINING. save_steps={save_steps}")
    trainer.train()
    
    trainer.save_model(os.path.join(args.save_dir, "final_checkpoint"))
    print("Finished")
Example no. 18
def train_bert(corpus_path, hebrew_model=False):
    """
    Bert model training
    :param corpus_path: Corpus to train Bert on
    :param hebrew_model: Model in Hebrew or not
    :return: The name of the new trained model
    """
    language = 'hebrew' if hebrew_model else 'english'
    df = pd.read_csv(corpus_path)
    corpus_name = get_corpus_name(corpus_path)
    print("Preprocess...")
    if hebrew_model:
        model_name, vocab, raw_text_file = preprocess_hebrew(df, corpus_name)
    else:
        model_name, vocab, raw_text_file = preprocess_english(df, corpus_name)

    print("Cuda availability :", torch.cuda.is_available())
    print("Getting tokenizer...")
    tokenizer = transformers.AutoTokenizer.from_pretrained(conf.bert_model[language], use_fast=True)
    model = transformers.AutoModelForMaskedLM.from_pretrained(conf.bert_model[language]).to('cuda')

    tokenizer.add_tokens(vocab)
    model.resize_token_embeddings(len(tokenizer))

    if os.path.exists(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name):
        shutil.rmtree(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)

    os.mkdir(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)
    tokenizer.save_pretrained(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)

    print("Tokenizing...")
    dataset = transformers.LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=raw_text_file,
        block_size=128,
    )

    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    training_args = transformers.TrainingArguments(
        output_dir=conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name,
        overwrite_output_dir=True,
        num_train_epochs=20,
        per_device_train_batch_size=16,
        save_steps=300,
        logging_steps=100,
        save_total_limit=3,
    )

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset
    )
    print("Begin training...")
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    trainer.train()
    trainer.save_model(conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)
    print('The model has been saved under : ', conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name)

    return conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name
Example no. 19
def testModel(self,
              train_val_split_iterator: typing.Iterator = [
                  sklearn.model_selection.train_test_split
              ],
              **kwargs):
    logger.info("Starting testing of RobertaModel")
    num_epochs = kwargs['epochs']
    batch_size = kwargs['batch_size']
    for i, train_test_split in enumerate(train_val_split_iterator):
        logger.debug(
            f'{i}-th enumeration of train_val split iterator under cross validation'
        )
        self.model = self.createModel()
        # optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
        # loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        if callable(getattr(self.model, 'compile', None)):  # if tf model
            train_dataset, val_dataset = self.pipeLine.getEncodedDataset(
                train_test_split, batch_size=batch_size)
            # self.model.compile(optimizer=optimizer, loss=loss, metrics=self._registeredMetrics)
            # self.model.fit(train_dataset, epochs=num_epochs)
            training_args = transformers.TFTrainingArguments(
                output_dir=f'./results/{self._modelName}',  # output directory
                num_train_epochs=num_epochs,  # total number of training epochs
                per_device_train_batch_size=batch_size,  # batch size per device during training
                per_device_eval_batch_size=batch_size,  # batch size for evaluation
                warmup_steps=kwargs['warmup_steps'],  # number of warmup steps for learning rate scheduler
                weight_decay=kwargs['weight_decay'],  # strength of weight decay
                logging_dir='./logs',  # directory for storing logs
            )
            trainer = transformers.TFTrainer(
                model=self.model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=train_dataset,  # tensorflow_datasets training dataset
                eval_dataset=val_dataset,  # tensorflow_datasets evaluation dataset
                compute_metrics=get_compute_metrics(
                    self._registeredMetrics),  # metrics to compute while training
            )
        else:  # if pytorch model
            train_dataset, val_dataset = self.pipeLine.getEncodedDataset(
                train_test_split,
                batch_size=batch_size,
                tfOrPyTorch=torchOrTFEnum.TORCH)
            training_args = transformers.TrainingArguments(
                output_dir=f'./results/{self._modelName}',  # output directory
                num_train_epochs=num_epochs,  # total number of training epochs
                per_device_train_batch_size=batch_size,  # batch size per device during training
                per_device_eval_batch_size=batch_size,  # batch size for evaluation
                warmup_steps=kwargs['warmup_steps'],  # number of warmup steps for learning rate scheduler
                weight_decay=kwargs['weight_decay'],  # strength of weight decay
                logging_dir='./logs',  # directory for storing logs
                logging_steps=10,
            )
            trainer = transformers.Trainer(
                model=self.model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=train_dataset,  # training dataset
                eval_dataset=val_dataset,  # evaluation dataset
                compute_metrics=get_compute_metrics(
                    self._registeredMetrics),  # metrics to compute while training
            )
            trainer.train()