-
Notifications
You must be signed in to change notification settings - Fork 0
/
Minitagger.py
577 lines (501 loc) · 25.9 KB
/
Minitagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
import collections
import copy
import datetime
import math
import os
import pickle
import random
import shutil
import subprocess
import sys
import time
import warnings

import numpy as np
from sklearn.metrics import confusion_matrix

from model_evaluation import report_fscore
from SequenceData import SequenceData
from visualize_utils import plot_confusion_matrix
LIBLINEAR_PATH = os.path.join(os.path.dirname(__file__), "liblinear-1.96/python")
sys.path.append(os.path.abspath(LIBLINEAR_PATH))
import liblinearutil
class Minitagger(object):
    """
    Represents the Minitagger model and can be used to train a classifier and make predictions.
    Also it includes the active learning feature
    """

    def __init__(self):
        # feature extractor that is used (it is a SequenceDataFeatureExtractor object)
        self.__feature_extractor = None
        # it stores a trained liblinearutil model
        self.__liblinear_model = None
        # flag in order to print more/less log messages
        self.quiet = False
        # path to output directory for active learning
        self.active_output_path = ""
        # when True, extra debug artifacts (wrong/correct predictions, confusion matrix) are produced
        self.debug = False
        # path to output the predictions
        self.prediction_path = ""
        # path to save/load the trained model (set via set_model_path)
        self.model_path = ""
        # number of seed examples for active learning
        self.active_seed_size = 0
        # number of examples for labeling at each iteration in active learning
        self.active_step_size = 0
        # output actively selected examples every time this value divides their number
        self.active_output_interval = 0
        # path of the project root (used as prefix for predictions/ and models_path/)
        self.project_dir = "."
        # language of the model (must be non-empty before building prediction/model paths)
        self.language = ""
        # wikiner dataset flag, forwarded to report_fscore -- TODO confirm expected values
        self.wikiner = ""
def equip_feature_extractor(self, feature_extractor):
"""
Equips the Minitagger with a feature extractor
@type feature_extractor: SequenceDataFeatureExtractor
@param feature_extractor: contains the feature extraction object
"""
self.__feature_extractor = feature_extractor
def set_prediction_path(self, prediction_path, embedding_size=None):
assert self.language != "", "you must set the language before using this function"
self.prediction_path = os.path.join(self.project_dir, "predictions", self.language, prediction_path+ ("_"+str(embedding_size) if embedding_size else ""))
if not os.path.isdir(self.prediction_path):
os.mkdir(self.prediction_path)
def set_model_path(self, model_path, embedding_size):
assert self.language != "", "you must set the language before using this function"
self.model_path = os.path.join(self.project_dir, "models_path", self.language, model_path + ("_"+str(embedding_size) if embedding_size else ""))
if not os.path.isdir(self.model_path):
os.mkdir(self.model_path)
def train(self, data_train, data_test):
    """
    Trains Minitagger on the given train data. If test data is given, it reports the accuracy of the trained model
    and the F1_score (macro average of f1_score of each label)

    @type data_train: SequenceData
    @param data_train: the training data set
    @type data_test: SequenceData
    @param data_test: the test data set (may be None, e.g. when called from __train_silently)
    @return: (f1score, precision, recall) when data_test is not None; otherwise None (implicit)
    """
    # keep the training start timestamp
    start_time = time.time()
    assert (self.__feature_extractor.is_training), "In order to train, is_training flag should be True"
    # Extract features only for labeled instances from data_train
    [label_list, features_list, _] = self.__feature_extractor.extract_features(data_train, False, [])
    # print some useful information about the data
    if not self.quiet:
        print("{0} labeled words (out of {1})".format(len(label_list), data_train.num_of_words))
        print("{0} label types".format(len(data_train.label_count)))
        print("{0} word types".format(len(data_train.word_count)))
        print("\"{0}\" feature template".format(self.__feature_extractor.feature_template))
        print("{0} feature types".format(self.__feature_extractor.num_feature_types()))
    # define problem to be trained using the parameters received from the feature_extractor
    problem = liblinearutil.problem(label_list, features_list)
    # train the model (-q stands for quiet = True in the liblinearutil)
    self.__liblinear_model = liblinearutil.train(problem, liblinearutil.parameter("-q"))
    # training is done, set is_training to False, so that prediction can be done
    self.__feature_extractor.is_training = False
    # print some useful information
    if not self.quiet:
        num_seconds = int(math.ceil(time.time() - start_time))
        # how much did the training last
        print("Training time: {0}".format(str(datetime.timedelta(seconds=num_seconds))))
    # perform prediction on the data_test and report accuracy
    if data_test is not None:
        # temporarily silence logging while predicting on the test set,
        # then restore the caller's verbosity
        quiet_value = self.quiet
        self.quiet = True
        pred_labels, acc = self.predict(data_test)
        self.quiet = quiet_value
        # write predictions to <prediction_path>/predictions.txt (plus wrong/correct variants)
        self.__save_prediction_to_file(data_test, pred_labels)
        f1score, precision, recall = report_fscore(self.prediction_path + "/predictions.txt", wikiner=self.wikiner)
        print("Accuracy: ", acc)
        # create some files useful for debugging
        if self.debug:
            self.__debug(data_test, pred_labels)
        return f1score, precision, recall
def cross_validation(self, data_train, data_test=None, n_fold=2):
    """
    compute the cross validation on the data_train.
    It report the f1_score of each fold and the average with the standard deviation

    If data_test is given, it reports the accuracy on the test set and
    the F1 score (macro average f1_score of each label)

    :type data_train: SequenceData
    :param data_train: the training data_set
    :type data_test: SequenceData
    :param data_test: the test data_set
    :param n_fold: int
    :type n_fold: number of fold for the CV
    """
    # NOTE(review): start_time is captured but never used afterwards
    start_time = time.time()
    print("======== Cross Validation =========")
    training_size = len(data_train.sequence_pairs)
    assert (n_fold >= 2), "n_fold must be at least 2"
    if n_fold > training_size:
        n_fold = training_size
        warnings.warn("n_fold can not be bigger than the size of the training set. n-fold is replace by the size "
                      "of the training set")
    # for cross validation we want to train in quiet mode to not have x times the same information
    # but we need to restore the quiet status at the end of the cross validation
    quiet_status_backup = self.quiet
    self.quiet = True
    # data_train.sequence_pairs: [[[tokens sentence1], [labels sentence1]], [[tokens sentence2], [labels sentence2]], ...]
    # 1) permute randomly all sentences (fixed seed keeps the folds reproducible across runs)
    np.random.seed(123456)
    data_train.sequence_pairs = np.random.permutation(data_train.sequence_pairs)
    # 2) do cross validation
    f_score_list = []
    precision_list = []
    recall_list = []
    test_size = int(training_size / n_fold)
    if data_test is not None:
        # NOTE(review): test data is appended AFTER test_size was computed from training_size,
        # so the appended sentences only ever land in the training side of each fold -- TODO confirm intended
        data_train.sequence_pairs = np.append(data_train.sequence_pairs, data_test.sequence_pairs, axis=0)
    for k in range(n_fold):
        print("--- Fold: {} ---".format(k))
        # fold k holds out sentences [k*test_size, (k+1)*test_size) as the test set
        test_set = copy.deepcopy(data_train)
        test_set.sequence_pairs = test_set.sequence_pairs[k * test_size: (k + 1) * test_size]
        # everything outside that window becomes the training set
        train_set = copy.deepcopy(data_train)
        train_set.sequence_pairs = np.append(train_set.sequence_pairs[0:k * test_size],
                                             train_set.sequence_pairs[(k + 1) * test_size:], axis=0)
        # automatically set to False after one training session, so re-arm it before each fold
        self.__feature_extractor.is_training = True
        print('Total:', len(data_train.sequence_pairs))
        print('total test: ', len(test_set.sequence_pairs))
        print('total train: ', len(train_set.sequence_pairs))
        fscore, precision, recall = self.train(train_set, test_set)
        # keep per-fold metrics rounded to three decimals
        f_score_list.append(float("{:.3f}".format(fscore)))
        precision_list.append(float("{:.3f}".format(precision)))
        recall_list.append(float("{:.3f}".format(recall)))
        report_fscore(self.prediction_path + "/predictions.txt", wikiner=self.wikiner)
    # summary statistics over all folds
    print("F1-Score: ", f_score_list)
    print("F1-score Mean: {:.3f}".format(np.mean(f_score_list)))
    print("F1-score Standard deviation: {:.3f}".format(np.std(f_score_list)))
    print("---")
    print("Precision: ", precision_list)
    print("Precision Mean: {:.3f}".format(np.mean(precision_list)))
    print("Precision Standard deviation: {:.3f}".format(np.std(precision_list)))
    print("---")
    print("Recall: ", recall_list)
    print("Recall Mean: {:.3f}".format(np.mean(recall_list)))
    print("Recall Standard deviation: {:.3f}".format(np.std(recall_list)))
    print("=================================")
    # restore the verbosity the caller had configured
    self.quiet = quiet_status_backup
def __debug(self, data_test, pred_labels):
    """
    Creates log files useful for debugging and prints a confusion matrix

    @type data_test: SequenceData object
    @param data_test: contains the testing data set
    @type pred_labels: list
    @param pred_labels: contains the prediction labels as they result from the classifier
    """
    # dump the predictions to file and collect the gold labels
    true_labels = self.__save_prediction_to_file(data_test, pred_labels)
    print()
    # find number of each label in the test set
    max_count = 0
    # initialized so an empty label list cannot leave the majority label unbound
    majority_label = None
    labels_list = list(data_test.label_count.keys())
    # build the array once instead of once per label
    true_labels_array = np.array(true_labels)
    for label in labels_list:  # e.g. ["B", "I", "O"]
        count = (true_labels_array == label).sum()
        print("Number of " + label + " in the test set:", count)
        # track the most frequent class in the test data set
        if count > max_count:
            max_count = count
            majority_label = label
    print()
    # print accuracy of a naive baseline model that always predicts the majority class
    baseline_accuracy = "{0:.3f}".format(float(max_count) / len(true_labels) * 100)
    print("A naive model could predict always \'" + majority_label + "\' with an accuracy of " + baseline_accuracy + "%")
    print()
    # create confusion matrix; labels is passed by keyword because positional
    # use of this parameter was removed in modern scikit-learn
    cm = confusion_matrix(true_labels, pred_labels, labels=labels_list)
    print(labels_list)
    for row in cm:
        print(row)
    # plot_confusion_matrix(cm, labels_list)
def save(self, model_path):
    """
    Saves the model as a directory at the given path

    NOTE(review): the model_path argument is currently ignored; the instance
    attribute self.model_path (set via set_model_path) is used instead -- TODO confirm intended.

    @type model_path: str
    @param model_path: path to save the trained model
    """
    ## if-else statement added on 06.02.2017
    if (self.__feature_extractor.feature_template == "relational") and (
            self.__feature_extractor.parser_type == "spacy"):
        print("Relational model with spaCy parser cannot be saved")
    else:
        # context manager guarantees the pickle file handle is closed
        # (the original open(...) handle was never closed)
        with open(os.path.join(self.model_path, "feature_extractor"), "wb") as extractor_file:
            pickle.dump(self.__feature_extractor, extractor_file, protocol=pickle.HIGHEST_PROTOCOL)
        # save trained model in the model_path directory
        liblinearutil.save_model(os.path.join(self.model_path, "liblinear_model"), self.__liblinear_model)
def __save_prediction_to_file(self, data_test, pred_labels):
# file to print all predictions
file_name = os.path.join(self.prediction_path, "predictions.txt")
f1 = open(file_name, "w")
# file to print only sentences that contain at least one wrong label after classification
file_name = os.path.join(self.prediction_path, "predictions_wrong.txt")
f2 = open(file_name, "w")
# file to print only sentences whose labels are predicted 100% correctly
file_name = os.path.join(self.prediction_path, "predictions_correct.txt")
f3 = open(file_name, "w")
# index for prediction label
pred_idx = 0
# list to store all true labels
true_labels = []
true_pos_tags = []
# iterate through the test set
# labels_pos: [[labels]. [pos tag]] => labels = labels_pos[0] / pos_tag = labels_pos[1]
for words, *labels_pos in data_test.sequence_pairs:
# prediction sequence for each sentence
pred_sequence = []
for i in range(len(words)):
# append label to the prediction sequence
pred_sequence.append(pred_labels[pred_idx])
# append label to the list of true labels
true_labels.append(labels_pos[0][i])
# append pos tag (if exist) to the list of true pos tag
if len(labels_pos) == 2: true_pos_tags.append(labels_pos[1][i])
# create line to print in the file
line = words[i] + " " + labels_pos[0][i] + " " + pred_labels[pred_idx] + "\n"
# write to file
f1.write(line)
pred_idx += 1
# separate sentences with empty lines
f1.write("\n")
# check if classification error occurred
if labels_pos[0] != pred_sequence:
for i in range(len(labels_pos[0])):
# create line to print to file
line = words[i] + " " + labels_pos[0][i] + " " + pred_sequence[i] + "\n"
f2.write(line)
# separate sentences with empty lines
f2.write("\n")
else:
for i in range(len(labels_pos[0])):
# create line to print to file
line = words[i] + " " + labels_pos[0][i] + " " + pred_sequence[i] + "\n"
f3.write(line)
# separate sentences with empty lines
f3.write("\n")
# close files
f1.close()
f2.close()
f3.close()
return true_labels
def load(self, model_path):
"""
Loads the model from the directory at the given path
@type model_path: str
@param model_path: path to load the trained model
"""
# load feature_extractor object (used to extract features for the test set)
## if-else statement added on 06.02.2017
# if (self.__feature_extractor.feature_template == "relational") and (self.__feature_extractor.parser_type == "spacy"):
# print("Relational model with spaCy parser cannot be loaded")
# else:
# self.__feature_extractor = pickle.load(open(os.path.join(model_path, "feature_extractor"), "rb"))
# # load trained model
# self.__liblinear_model = liblinearutil.load_model(os.path.join(model_path, "liblinear_model"))
try:
print(self.model_path)
self.__feature_extractor = pickle.load(open(os.path.join(self.model_path, "feature_extractor"), "rb"))
# load trained model
self.__liblinear_model = liblinearutil.load_model(os.path.join(self.model_path, "liblinear_model"))
except:
raise Exception("No files found in the model path " + self.model_path)
def predict(self, data_test):
    """
    Predicts tags in the given data and reports accuracy when the data is fully labeled.

    @type data_test: SequenceData
    @param data_test: the test data set
    @return: tuple of (predicted label strings, per-instance accuracy)
    """
    # keep starting timestamp
    begin = time.time()
    assert (not self.__feature_extractor.is_training), "In order to predict, is_training should be False"
    # extract features on every instance (labeled or unlabeled) of the test set
    [label_list, features_list, _] = self.__feature_extractor.extract_features(data_test, True, [])
    # delegate the actual prediction to liblinear ("-q" silences its output)
    pred_labels, (acc, _, _), _ = liblinearutil.predict(label_list, features_list, self.__liblinear_model, "-q")
    if not self.quiet:
        # report how long prediction took
        elapsed = int(math.ceil(time.time() - begin))
        print("Prediction time: {0}".format(str(datetime.timedelta(seconds=elapsed))))
        if not data_test.is_partially_labeled:
            # accuracy is only meaningful when every word carries a gold label
            print("Per-instance accuracy: {0:.3f}%".format(acc))
        else:
            print("Not reporting accuracy: test data is not fully labeled")
    # map the integer label IDs returned by liblinear back to label strings
    pred_labels = [self.__feature_extractor.get_label_string(label_id) for label_id in pred_labels]
    return pred_labels, acc
def __find_most_frequent_words(self, data_train):
"""
Computes the (active_seed_size) most frequent word types in data_train
@type data_train: SequenceData
@param data_train: t# Train for the first time.he training data set
@return: the X most frequent words (X = active_seed_size) in train set
"""
# data_train.word_count is a dictionary with key = word , value = frequency of word in the train set
# sort dictionary in descending order
sorted_wordcount_pairs = sorted(data_train.word_count.items(),
key=lambda type_count: type_count[1], reverse=True)
# take the X most frequent words (X = active_seed_size)
seed_wordtypes = [wordtype for wordtype, _ in sorted_wordcount_pairs[:self.active_seed_size]]
return seed_wordtypes
def __find_frequent_word_locations(self, data_train, seed_wordtypes):
"""
Finds random locations of the most frequent words
@type data_train: SequenceData
@param data_train: train data set
@type seed_wordtypes: list
@param seed_wordtypes: list of the X most frequent words (X = active_seed_size) in train set
@return: a random location for each word in the seed_wordtypes
each location is a tuple of the form (sequence number, position of the word in the sequence)
"""
occurring_locations = collections.defaultdict(list)
# iterate through all sequences and words in the train set
for sequence_num, (word_sequence, _) in enumerate(data_train.sequence_pairs):
for position, word in enumerate(word_sequence):
# append location is the current word is one of the seed_wordtypes
if word in seed_wordtypes:
occurring_locations[word].append((sequence_num, position))
# take one random position for each word
locations = [random.sample(occurring_locations[wordtype], 1)[0] for wordtype in seed_wordtypes]
return locations
def __make_data_from_locations(self, data_train, locations, skip_extraction):
    """
    Builds a SequenceData subset of data_train from (sequence_num, position) pairs.

    Every sequence touched by a location is included in full: words at the given
    locations keep their true label, every other word gets a None label. Each used
    location is also flagged in skip_extraction so it cannot be selected again.

    @type data_train: SequenceData
    @param data_train: the train data set
    @type locations: list
    @param locations: (sequence_num, position) tuples of the selected words
    @param skip_extraction: per-sequence lists of booleans, mutated in place
    @return: a SequenceData object with partially labeled sequences
    """
    # group the selected positions by the sequence they belong to
    positions_by_sequence = collections.defaultdict(list)
    for sequence_num, position in locations:
        positions_by_sequence[sequence_num].append(position)
    sequence_list = []
    for sequence_num, positions in positions_by_sequence.items():
        word_sequence, label_sequence = data_train.sequence_pairs[sequence_num]
        # start with every label unknown ...
        selected_labels = [None] * len(word_sequence)
        # ... then reveal the true label at each selected position
        for position in positions:
            selected_labels[position] = label_sequence[position]
            # mark the example as consumed so it is not selected again
            skip_extraction[sequence_num][position] = True
        sequence_list.append((word_sequence, selected_labels))
    # wrap the partially labeled sequences in a SequenceData object
    selected_data = SequenceData(sequence_list)
    return selected_data
def __train_silently(self, data_selected):
    """
    Trains on the selected data without emitting any log output.

    @type data_selected: SequenceData
    @param data_selected: selected subset of the training set; some words carry
        their true label, the rest are labeled None
    """
    # training resets this flag, so re-arm it before every (re)training run
    self.__feature_extractor.is_training = True
    previous_quiet = self.quiet
    self.quiet = True
    # no test set is needed here
    self.train(data_selected, None)
    self.quiet = previous_quiet
def __interval_report(self, data_selected, data_test, logfile):
"""
Reports accuracy in the specified interval
@type data_selected: SequenceData
@param data_selected: selected data based on locations
@type data_test: SequenceData
@param data_test: test data set
@type logfile: file
@param logfile: file used for log messages
@return: None if it is not time to report output yet
"""
# report only at each interval
if data_selected.num_labeled_words % self.active_output_interval != 0:
return
# test on the development data if any available
if data_test is not None:
quiet_value = self.quiet
self.quiet = True
# make prediction and return accuracy
_, acc = self.predict(data_test)
self.quiet = quiet_value
message = "{0} labels: {1:.3f}%".format(data_selected.num_labeled_words, acc)
print(message)
logfile.write(message + "\n")
logfile.flush()
# Output the selected labeled examples so far.
file_name = os.path.join(self.active_output_path, "example" + str(data_selected.num_labeled_words))
with open(file_name, "w") as outfile:
outfile.write(data_selected.__str__())
def __find_confidence_index_pairs(self, confidence_index_pairs, scores_list):
"""
Estimates the confidence index pairs
@type confidence_index_pairs:list
@param confidence_index_pairs: list of tuples like (confidence, index)
@type: list
@param scores_list: list of scores for each word
"""
for index, scores in enumerate(scores_list):
sorted_scores = sorted(scores, reverse=True)
# handle the binary case
# liblinear gives only 1 score whose sign indicates the class (+ versus -)
confidence = sorted_scores[0] - sorted_scores[1] if len(scores) > 1 else abs(scores[0])
confidence_index_pairs.append((confidence, index))
def train_actively(self, data_train, data_test):
    """
    Does margin-based active learning on the given data

    @type data_train: SequenceData
    @param data_train: training data set; must be fully labeled
    @type data_test: SequenceData
    @param data_test: test data set
    """
    # for active learning, every data point (word) in the data_train should be labeled
    assert (not data_train.is_partially_labeled), "for active learning, every data point (word) " \
                                                  "in the data_train should be labeled"
    # keep track of which examples can still be selected for labeling;
    # nothing is skipped at the beginning
    __skip_extraction = []
    for _, label_sequence in data_train.sequence_pairs:
        __skip_extraction.append([False for _ in label_sequence])
    # recreate the output directory from scratch; shutil.rmtree replaces the
    # original subprocess "rm -rf" shell-out (portable, no external process)
    if os.path.exists(self.active_output_path):
        shutil.rmtree(self.active_output_path)
    os.makedirs(self.active_output_path)
    # the context manager guarantees the log file is closed even if training fails
    with open(os.path.join(self.active_output_path, "log"), "w") as logfile:
        # take the X most frequent words (X = active_seed_size)
        seed_wordtypes = self.__find_most_frequent_words(data_train)
        # select a random location (sequence number, position in the sequence)
        # of each selected type for a seed example
        locations = self.__find_frequent_word_locations(data_train, seed_wordtypes)
        # build a SequenceData object from the selected locations
        data_selected = self.__make_data_from_locations(data_train, locations, __skip_extraction)
        # train for the first time
        self.__train_silently(data_selected)
        self.__interval_report(data_selected, data_test, logfile)
        while len(locations) < data_train.num_labeled_words:
            # extract features for the remaining (i.e. not in the skip list) labeled examples
            [label_list, features_list, location_list] = self.__feature_extractor.extract_features(
                data_train, False, __skip_extraction)
            # make predictions on the remaining (i.e. not in the skip list) labeled examples
            _, _, scores_list = liblinearutil.predict(label_list, features_list, self.__liblinear_model, "-q")
            # Compute "confidence" of each prediction:
            # max_{y} score(x,y) - max_{y'!=argmax_{y} score(x,y)} score(x,y')
            confidence_index_pairs = []
            self.__find_confidence_index_pairs(confidence_index_pairs, scores_list)
            # Select least confident examples for next labeling.
            confidence_index_pairs.sort()
            for _, index in confidence_index_pairs[:self.active_step_size]:
                locations.append(location_list[index])
            data_selected = self.__make_data_from_locations(data_train, locations, __skip_extraction)
            self.__train_silently(data_selected)  # Train from scratch.
            self.__interval_report(data_selected, data_test, logfile)