"""
You have to write the perc_train function that trains the feature weights using the perceptron algorithm for the CoNLL 2000 chunking task.
Each element of train_data is a (labeled_list, feat_list) pair.
Inside the perceptron training loop:
- Call perc_test to get the tagging based on the current feat_vec and compare it with the true output from the labeled_list
- If the output is incorrect then we have to update feat_vec (the weight vector)
- In the notation used in the paper we have w = w_0, w_1, ..., w_n corresponding to \phi_0(x,y), \phi_1(x,y), ..., \phi_n(x,y)
- Instead of indexing each feature with an integer we index each feature using a string we call feature_id
- The feature_id is constructed using the elements of feat_list (which correspond to x above) combined with the output tag (which corresponds to y above)
- The function perc_test shows how the feature_id is constructed for each word in the input, including the bigram feature "B:" which is a special case
- feat_vec[feature_id] is the weight associated with feature_id
- This dictionary lookup lets us implement a sparse vector dot product where any feature_id not used in a particular example does not participate in the dot product (see the _score_word sketch just below the imports)
- To save space and time, make sure you do not store zero values in the feat_vec dictionary, which can happen if \phi(x_i,y_i) - \phi(x_i,y_{perc_test}) results in a zero value
- If you are checking word by word whether the predicted tag equals the true tag, there is a corner case where the bigram 'T_{i-1} T_i' is incorrect even though T_i is correct.
"""
import perc
import sys, optparse, os, random
from collections import defaultdict
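
# A minimal sketch of the sparse dot product described in the docstring
# (illustration only; this helper is hypothetical and is not called anywhere
# in the script, and `feats` and `tag` stand for example values): the score of
# tagging a word with `tag` is the sum of the weights of its (feature, tag)
# pairs, and any feature_id missing from feat_vec contributes zero, so it
# simply does not participate in the dot product.
def _score_word(feat_vec, feats, tag):
    # dict lookup with a default of 0 implements the sparse dot product
    return sum(feat_vec.get((f, tag), 0) for f in feats)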
def perc_train(train_data, tagset, numepochs):
    # perceptron training
    T = float(len(train_data))
    step = numepochs * T
    feat_vec_cache = defaultdict(int)
    # feat_vec stores the weights for the features; initially all weights are 0
    feat_vec = defaultdict(int)
    # default_tag = 'B-NP'
    default_tag = tagset[0]
    # for each epoch/iteration
    for i in range(0, numepochs):
        # each item in train_data is a tuple:
        # ([labeled words of a sentence], [features for those words])
        for (label_list, feat_list) in train_data:
            # cur = list of the best tag for each word in the sentence, found with the Viterbi algorithm
            cur = perc.perc_test(feat_vec, label_list, feat_list, tagset, default_tag)
            # gold = list of the reference/true tag for each word in the sentence,
            # e.g. entry = "the DT B-NP" gives the gold tag "B-NP"
            gold = [entry.split()[2] for entry in label_list]
            if cur != gold:
                # pad both sequences with boundary tags so bigram features line up
                cur.insert(0, 'B_-1')
                gold.insert(0, 'B_-1')
                cur.append('B_+1')
                gold.append('B_+1')
                cur_len = len(cur)
                gold_len = len(gold)
                if cur_len != gold_len:
                    raise ValueError("output length does not match the input sentence length")
                feat_index = 0
                # perceptron update, for each tag/word of the sentence
                # (j is used here so the epoch counter i is not shadowed)
                for j in range(1, cur_len):
                    # (feat_index, features) is a tuple, where feat_index is the end index of
                    # the list of features for this word and features is that list of features
                    (feat_index, features) = perc.feats_for_word(feat_index, feat_list)
                    # update the weights of the features for this word by rewarding the features
                    # seen in the reference, while penalizing the ones not seen in the reference
                    # but returned by Viterbi
                    for f in features:
                        feat_vec[(f, cur[j])] = feat_vec[(f, cur[j])] - 1
                        feat_vec[(f, gold[j])] = feat_vec[(f, gold[j])] + 1
                        # averaged perceptron
                        # the usual way of averaging over all intermediate weight vectors is:
                        #   w = (w_1 + w_2 + ... + w_t) / (numepochs * T)
                        # but we can compute the same average efficiently by weighting each
                        # update by the fraction of examples remaining when it is made:
                        #   w = u_1*(step_1/(numepochs*T)) + u_2*(step_2/(numepochs*T)) + ... + u_t*(1/(numepochs*T))
                        # because an update made with `step` examples left appears in exactly
                        # `step` of the intermediate weight vectors
                        feat_vec_cache[(f, cur[j])] = feat_vec_cache[(f, cur[j])] - 1 * (step / (numepochs * T))
                        feat_vec_cache[(f, gold[j])] = feat_vec_cache[(f, gold[j])] + 1 * (step / (numepochs * T))
            step -= 1
        print("iteration %d done." % i, file=sys.stderr)
    return feat_vec_cache
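
# A minimal sanity sketch of the efficient averaging used above (hypothetical toy
# numbers, not part of the assignment, and this helper is never called): with 4
# total steps and a +1 update to one feature at countdown steps 4 and 2, the
# intermediate weights are 1, 1, 2, 2, whose plain average (1+1+2+2)/4 = 1.5
# matches the weighted-update form 1*(4/4) + 1*(2/4).
def _check_averaging_trick():
    total = 4.0
    updates = {4: 1.0, 2: 1.0}  # step value at update time -> delta applied
    # efficient form: weight each update by (remaining steps / total steps)
    efficient = sum(delta * (s / total) for (s, delta) in updates.items())
    # naive form: replay the run, record every intermediate weight, then average
    w, history = 0.0, []
    for s in range(4, 0, -1):  # step counts down 4, 3, 2, 1
        w += updates.get(s, 0.0)
        history.append(w)
    naive = sum(history) / total
    assert abs(efficient - naive) < 1e-12  # both equal 1.5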
if __name__ == '__main__':
    optparser = optparse.OptionParser()
    optparser.add_option("-t", "--tagsetfile", dest="tagsetfile", default=os.path.join("data", "tagset.txt"), help="tagset that contains all the labels produced in the output, i.e. the y in \phi(x,y)")
    optparser.add_option("-i", "--trainfile", dest="trainfile", default=os.path.join("data", "train.txt.gz"), help="input data, i.e. the x in \phi(x,y)")
    optparser.add_option("-f", "--featfile", dest="featfile", default=os.path.join("data", "train.feats.gz"), help="precomputed features for the input data, i.e. the values of \phi(x,_) without y")
    optparser.add_option("-e", "--numepochs", dest="numepochs", default=int(1), help="number of epochs of training; in each epoch we iterate over all the training examples")
    optparser.add_option("-m", "--modelfile", dest="modelfile", default=os.path.join("data", "default.model"), help="weights for all features stored on disk")
    (opts, _) = optparser.parse_args()
    # each element in the feat_vec dictionary is:
    #   key = feature_id, value = weight
    feat_vec = {}
    tagset = []
    train_data = []
    # tagset contains the list of tags in tagset.txt:
    # ['B-NP', 'I-NP', 'O', 'B-VP', 'B-PP', 'I-VP', 'B-ADVP', 'B-SBAR', 'B-ADJP', 'I-ADJP', 'B-PRT', 'I-ADVP', 'I-PP', 'I-CONJP', 'I-SBAR', 'B-CONJP', 'B-INTJ', 'B-LST', 'I-INTJ', 'I-UCP', 'I-PRT', 'I-LST', 'B-UCP']
    tagset = perc.read_tagset(opts.tagsetfile)
    print("reading data ...", file=sys.stderr)
    # opts.trainfile contains the labeled training data with word, POS, and chunk tag on each line
    # opts.featfile contains unigram and bigram features with their feature id
    # train_data is a list of 8936 tuples, where each tuple is
    # ([list of labeled words making up a sentence], [list of features for those words])
    train_data = perc.read_labeled_data(opts.trainfile, opts.featfile)
    print("done.", file=sys.stderr)
    feat_vec = perc_train(train_data, tagset, int(opts.numepochs))
    perc.perc_write_to_file(feat_vec, opts.modelfile)
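
# Example invocation (a sketch; it assumes the course scaffold's data/ directory
# and the perc module sit alongside this script, and avg_perceptron.model is a
# hypothetical output path):
#
#   python AvgPerceptronChunker.py -e 5 -m data/avg_perceptron.model
#
# This trains for 5 epochs on data/train.txt.gz with the precomputed features
# from data/train.feats.gz and writes the averaged weights to the model file.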