bert.py
import os

# hide the GPUs whose memory is already in use (MUST be set before torch is imported)
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
# silences the tokenizers fork warning (BUT: if a deadlock occurs, remove this)
os.environ["TOKENIZERS_PARALLELISM"] = "true"

import random
from pathlib import Path

from labels import id2label, label2id, idx_to_labels_list
from transformers import EvaluationStrategy, AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from train_transformer import train_transformer_pipeline
from util import make_reproducible, compute_metrics, get_prediction_ids
import fire
import wandb
import torch
import pandas as pd
ROOT_DIR = Path(os.path.dirname(os.path.abspath(__file__)))
DATA_DIR = ROOT_DIR / "data"
train_transformer_pipeline(DATA_DIR) # preprocess csv files
extension = ".json"
# load_dataset expects string paths in data_files, so convert the Path objects
train_path = str(DATA_DIR / f"train_trans{extension}")
validation_path = str(DATA_DIR / f"validation_trans{extension}")
test_path = str(DATA_DIR / f"test_trans{extension}")
def run(base_model="dbmdz/bert-base-german-uncased", fine_tuned_checkpoint_name=None,
        dataset="joelito/sem_eval_2010_task_8",
        input_col_name="MailComplete", label_col_name="ServiceProcessed",
        num_train_epochs=100, do_train=False, do_eval=False, do_predict=True, test_set_sub_size=None, seed=42):
    """
    Runs the specified transformer model
    :param base_model: the name of the base model from huggingface transformers (e.g. roberta-base)
    :param fine_tuned_checkpoint_name: the name of the fine-tuned checkpoint (e.g. checkpoint-500)
    :param dataset: the name of the dataset from huggingface datasets (e.g. joelito/sem_eval_2010_task_8)
    :param input_col_name: the name of the dataset column containing the input text
    :param label_col_name: the name of the dataset column containing the labels
    :param num_train_epochs: number of epochs to train for
    :param do_train: whether to train the model
    :param do_eval: whether to evaluate the model on the validation set in the end
    :param do_predict: whether to do predictions on the test set in the end
    :param test_set_sub_size: speed up prediction by selecting only this many test examples; set to False/None to use the full test set
    :param seed: random seed for reproducibility
    :return:
    """
    wandb.init()
    dir_path = os.path.dirname(os.path.realpath(__file__))
    local_model_name = f"{dir_path}/{base_model}-local"
    make_reproducible(seed)
    training_args = TrainingArguments(
        output_dir=f'{local_model_name}/results',  # output directory
        num_train_epochs=num_train_epochs,  # total number of training epochs
        # max_steps=10,  # set to a small positive number to smoke-test models (training is short)
        per_device_train_batch_size=6,  # batch size per device during training
        per_device_eval_batch_size=6,  # batch size for evaluation
        warmup_steps=500,  # number of warmup steps for the learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir=f'{local_model_name}/logs',  # directory for storing logs
        logging_steps=10,
        save_steps=500,
        eval_steps=250,
        evaluation_strategy=EvaluationStrategy.STEPS,
        seed=seed,
        run_name=base_model,  # used for wandb
        load_best_model_at_end=True,
        metric_for_best_model="f1",  # "f1" must be among the keys returned by compute_metrics below
        greater_is_better=True,
    )
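    # NOTE (assumption): with CUDA_VISIBLE_DEVICES="2,3" two GPUs are visible,
    # so the effective training batch size is 2 devices * 6 = 12 examples per step.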
print("Loading Dataset")
# data = load_dataset('csv', data_files={'train': [train_path], 'validation': [validation_path], 'test': [test_path]}, delimiter=";")
#data = load_dataset('json', data_files={'train': [train_path], 'validation': [validation_path], 'test': [test_path]},field="data")
data = load_dataset('json', data_files={'train': [train_path], 'validation': [train_path], 'test': [test_path]}, field="data")
    model_path = base_model
    if fine_tuned_checkpoint_name:
        model_path = f"{training_args.output_dir}/{fine_tuned_checkpoint_name}"
    print("Loading Model")
    model = AutoModelForSequenceClassification.from_pretrained(model_path, id2label=id2label, label2id=label2id,
                                                               finetuning_task=dataset)
    print("Loading Tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    print("Tokenizing Dataset")
    # supervised_keys = data['train'].supervised_keys  # 'sentence' and 'relation'
    data = data.map(lambda ex: tokenizer(ex[input_col_name], truncation=True, padding='max_length'),
                    batched=True)
    data.rename_column_(original_column_name=label_col_name,
                        new_column_name='label')  # IMPORTANT: otherwise the loss cannot be computed
    data.set_format(type='pt', columns=['input_ids', 'attention_mask', 'label'], output_all_columns=True)
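    # At this point every split exposes 'input_ids', 'attention_mask' and 'label'
    # as torch tensors; the original text columns stay accessible because of
    # output_all_columns=True above.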
    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=data['train'],  # training dataset
        eval_dataset=data['validation'],  # evaluation dataset
        compute_metrics=compute_metrics,  # additional metrics to the loss
    )
    if do_train:
        print("Training on train set")
        trainer.train()
        trainer.save_model(training_args.output_dir)
        # For convenience, we also re-save the tokenizer to the same directory
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)
    if do_eval:
        print("Evaluating on validation set")
        metrics = trainer.evaluate()
        print(metrics)
    if do_predict:
        print("Predicting on test set")
        data['test'].remove_columns_(['label'])
        if test_set_sub_size:
            # IMPORTANT: This command somehow may delete some features in the dataset!
            data['test'] = data['test'].select(indices=range(test_set_sub_size))
        # save the inputs because they will be removed by trainer.predict()
        ids = data['test'][0:]['Id']
        subjects = data['test'][0:]['MailSubject']
        textBody = data['test'][0:]['MailTextBody']
        predictions, label_ids, metrics = trainer.predict(data['test'])
        # rename the metric entries from eval_{} to test/{} so that wandb groups them in a separate "test" section
        test_metrics = {}
        for old_key in metrics:
            new_key = old_key.replace("eval_", "test/")
            test_metrics[new_key] = metrics[old_key]
        print(test_metrics)
        wandb.log(test_metrics)
        prediction_ids = get_prediction_ids(predictions)  # get the ids of the predictions
        predicted_labels = [idx_to_labels_list[prediction_id] for prediction_id in
                            prediction_ids]  # get the labels of the predictions
        # correct_labels = [idx_to_labels_list[label_id] for label_id in label_ids]  # get the labels of the ground truth
        # create the submission csv file
        df = pd.DataFrame(list(zip(ids, predicted_labels)),
                          columns=['Id', 'Predicted'])
        df.to_csv("submission6.csv", index=False)
        # look at five random examples from the test set (fewer if the set is smaller)
        examples = random.sample(range(data['test'].num_rows), min(5, data['test'].num_rows))
        for i in examples:
            print(f"\nId: {ids[i]}")
            print(f"Subject: {subjects[i]}")
            print(f"Text: {textBody[i]}")
            print(f"Predicted Label: {predicted_labels[i]}")
            # print(f"Ground Truth Relation: {correct_labels[i]}")
if __name__ == '__main__':
    fire.Fire(run)