def perc_train(train_data, tagset, numepochs):
    """Train an averaged structured-perceptron chunker.

    Runs `numepochs` passes over `train_data`.  On every mispredicted sentence
    the weights move toward the gold global feature vector and away from the
    predicted one.  After each epoch the running per-epoch average is written
    to disk, and the final epoch-averaged weights are returned.
    """
    weights = defaultdict(int)
    summed_weights = defaultdict(int)
    start_tag = tagset[0]
    for epoch in range(numepochs):
        mistakes = 0
        for labeled_list, feat_list in train_data:
            gold = get_labels(labeled_list)
            predicted = perc.perc_test(weights, labeled_list, feat_list, tagset, start_tag)
            if gold != predicted:
                mistakes += 1
                # Standard structured-perceptron update: +phi(x, gold), -phi(x, predicted).
                add_vector(weights, get_global_vector(gold, feat_list), 1)
                add_vector(weights, get_global_vector(predicted, feat_list), -1)
        sys.stderr.write(" ".join(["Epoch", str(epoch + 1), "done. # of incorrect sentences: ", str(mistakes)]) + "\n")
        # Supposedly we should average over all epoch * len(train_data) feature vectors,
        # but that would lead to too many long-vector additions and is painfully slow,
        # so we only fold in one snapshot of the weights per epoch.
        add_vector(summed_weights, weights, 1)
        snapshot = {key: float(summed_weights[key]) / (epoch + 1) for key in summed_weights}
        perc.perc_write_to_file(snapshot, opts.modelfile + str(epoch))
    return {key: float(summed_weights[key]) / numepochs for key in summed_weights}
def perc_train(train_data, tagset, numepochs, pos_dict):
    """Train a structured-perceptron chunker with unigram and bigram features.

    Decodes each sentence with the current weights via perc.perc_test and,
    wherever the predicted tag sequence disagrees with the gold one, rewards
    the gold (feature, tag) pairs and penalizes the predicted ones.  A model
    snapshot is written to disk after every epoch.

    :param train_data: iterable of (labeled_list, feat_list) sentence pairs
    :param tagset: list of output tags; tagset[0] is used as the default tag
    :param numepochs: number of passes over train_data
    :param pos_dict: extra POS lookup forwarded to perc.perc_test
    :return: feat_vec, dict mapping (feature, tag) -> weight
    :raises ValueError: on an empty tagset or misaligned features
    """
    feat_vec = defaultdict(int)
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    # BUG FIX: the original immediately overwrote `numepochs` with a
    # hard-coded 20, silently ignoring the caller's argument.
    default_tag = tagset[0]
    for t in range(numepochs):
        sys.stdout.write(" ".join(['Iteration#', str(t), ' is processing now.']) + "\n")
        for (labeled_list, feat_list) in train_data:
            labels = copy.deepcopy(labeled_list)
            # Decode the sentence with the current weight vector.
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag, pos_dict)
            feat_index = 0
            # Check word by word whether the predicted tag equals the true tag.
            for i, v in enumerate(output):
                # Retrieve the features for this word.
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                if len(feats) == 0:
                    sys.stderr.write(" ".join([" ".join(labels), " ".join(feat_list), "\n"]) + "\n")
                    raise ValueError("features do not align with input sentence")
                label = labels[i].split()[2]
                if i > 0:
                    label_pre = labels[i - 1].split()[2]
                    # BUG FIX: the original compared strings with `is not`
                    # (object identity), which is almost always true for
                    # distinct string objects, so the update fired even for
                    # fully correct words.  Use value inequality instead.
                    if output[i - 1] != label_pre or output[i] != label:
                        for feat in feats:
                            if feat[0] == 'B':
                                # Bigram feature: penalize/reward conditioned on
                                # both the predicted and the gold previous tag.
                                feat_out = feat + ":" + output[i - 1]  # "B:<previous output>"
                                feat_lab = feat + ":" + label_pre      # "B:<previous label>"
                                feat_vec[feat_out, output[i]] -= 1
                                feat_vec[feat_out, label] += 1
                                feat_vec[feat_lab, output[i]] -= 1
                                feat_vec[feat_lab, label] += 1
                            else:
                                # Unigram features U00..U22.
                                feat_vec[feat, output[i]] -= 1
                                feat_vec[feat, label] += 1
                else:
                    # First word of the sentence: previous label is the sentinel.
                    label_pre = 'B_-1'
                    for feat in feats:
                        if feat[0] == 'B':
                            # NOTE(review): reconstructed indentation — updates below
                            # apply to every feature, with bigram features first
                            # rebound to include the sentinel; confirm layout.
                            feat = feat + ":" + label_pre
                        feat_vec[feat, output[i]] -= 1
                        feat_vec[feat, label] += 1
        # Snapshot the weights after each epoch.
        perc.perc_write_to_file(feat_vec, 'model_' + str(t))
    # please limit the number of iterations of training to n iterations
    return feat_vec
def perc_train(train_data, tagset, n):
    """Train an averaged structured perceptron for phrase chunking.

    `feat_vec` holds the raw perceptron weights; `feat_avg_vec` accumulates
    updates scaled by `step` (the fraction of training steps remaining), so
    it directly approximates the averaged weight vector without a second
    accumulation pass.  The averaged vector is written to disk after every
    iteration and returned.

    NOTE(review): this function was reformatted from whitespace-mangled
    source; statement nesting was reconstructed and should be confirmed
    against the original layout.
    """
    feat_vec = defaultdict(int)
    feat_avg_vec = defaultdict(int)
    # insert your code here
    # please limit the number of iterations of training to n iterations
    default_tag = tagset[0]  # tag any word with 'B-NP' in the beginning
    num_sentence = len(train_data)
    num_words = 0  # running count of words seen (bookkeeping only; not used in updates)
    count = 0  # sentences processed so far across all iterations; drives `step`
    for iteration in range(n):
        sent_index = 0
        for sentence in train_data:  # sentence = (labeled_list, feat_list) for each sentence
            sent_index += 1
            # One-line progress indicator (trailing comma suppresses the newline).
            print '{0}\r'.format("\rIteration: %d/%d. Sentence: %d/%d\t" % (iteration + 1, n, sent_index, num_sentence)),
            (labeled_list, feat_list) = sentence
            num_words += len(labeled_list)
            # compute tags based on current weights
            estimated_tags = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            # the target 'right' tag list (gold chunk tag is the third column)
            standard_tags = [item.split()[2] for item in labeled_list]
            if estimated_tags != standard_tags:
                st_prev = es_prev = 'B_-1'  # sentinel "previous tag" before the first word
                index = 0
                # reference: http://gul.gu.se/public/pp/public_courses/course38351/published/1360057354030/resourceId/19456476/content/9adb1f1e-52e4-48b4-8001-ada93be18089/9adb1f1e-52e4-48b4-8001-ada93be18089.html
                # Fraction of training steps remaining; scaling each update by this
                # factor makes feat_avg_vec the running average of the weights.
                step = (n * num_sentence - count) * 1.0 / (n * num_sentence)
                for (st_tag, es_tag) in zip(standard_tags, estimated_tags):
                    (index, feats) = perc.feats_for_word(index, feat_list)
                    for feat in feats:
                        # deal with feat B: according to the given output example.
                        if feat == 'B':
                            if st_prev != es_prev or st_tag != es_tag:
                                feat_vec[('B:' + es_prev, es_tag)] -= 1
                                feat_vec[('B:' + st_prev, st_tag)] += 1
                                feat_avg_vec[('B:' + es_prev, es_tag)] -= step
                                feat_avg_vec[('B:' + st_prev, st_tag)] += step
                            # Track previous tags for the next word's bigram feature.
                            # NOTE(review): reconstructed indentation places these two
                            # assignments under `if feat == 'B'` but outside the
                            # mismatch guard — confirm against the original.
                            es_prev = es_tag
                            st_prev = st_tag
                        else:
                            if st_tag != es_tag:
                                feat_vec[(feat, es_tag)] -= 1
                                feat_vec[(feat, st_tag)] += 1
                                feat_avg_vec[(feat, es_tag)] -= step
                                feat_avg_vec[(feat, st_tag)] += step
            count += 1  # NOTE(review): assumed per-sentence, even when the prediction is correct
        perc.perc_write_to_file(
            feat_avg_vec, 'models/n' + str(iteration) + 'avg_params.model')
    return feat_avg_vec
def perc_train(train_data, tagset, numepochs):
    """Train a (non-averaged) structured perceptron chunker.

    :current_global_vector: a dict of features for the predicted labels
    :gold_global_vector: a dict of features for the standard

    For each sentence, decode with the current weights and, when the predicted
    label sequence differs from the gold one, add the gold global feature
    vector and subtract the predicted one.  A model snapshot is written after
    every epoch; the raw (unaveraged) weights are returned.
    """
    feat_vec = defaultdict(int)
    default_tag = tagset[0]
    for t in range(numepochs):
        for (labeled_list, feat_list) in train_data:
            std_labels = get_labels(labeled_list)
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            # IMPROVEMENT: skip the update when the prediction is already
            # correct — the +gold / -predicted vectors would cancel exactly,
            # so the original unconditional update only wasted work and
            # polluted feat_vec with zero-weight keys.
            if std_labels != output:
                gold_global_vector = get_global_vector(std_labels, feat_list)
                current_global_vector = get_global_vector(output, feat_list)
                add_vector(feat_vec, gold_global_vector, 1)
                add_vector(feat_vec, current_global_vector, -1)
        # Snapshot the weights after each epoch.
        perc.perc_write_to_file(feat_vec, opts.modelfile + str(t))
    return feat_vec
help= "precomputed features for the input data, i.e. the values of \phi(x,_) without y" ) optparser.add_option( "-e", "--numepochs", dest="numepochs", default=int(10), help= "number of epochs of training; in each epoch we iterate over over all the training examples" ) optparser.add_option("-m", "--modelfile", dest="modelfile", default=os.path.join("data", "default.model"), help="weights for all features stored on disk") (opts, _) = optparser.parse_args() # each element in the feat_vec dictionary is: # key=feature_id value=weight feat_vec = {} tagset = [] train_data = [] tagset = perc.read_tagset(opts.tagsetfile) print >> sys.stderr, "reading data ..." train_data = perc.read_labeled_data(opts.trainfile, opts.featfile) print >> sys.stderr, "done." feat_vec = perc_train(train_data, tagset, int(opts.numepochs)) perc.perc_write_to_file(feat_vec, opts.modelfile)
from collections import defaultdict


def perc_train(train_data, tagset, numepochs):
    """Placeholder trainer: returns an all-zero (empty) weight vector.

    The perceptron update loop is meant to be implemented here; training
    should be limited to `numepochs` passes over `train_data`.
    """
    weights = defaultdict(int)
    return weights


if __name__ == '__main__':
    parser = optparse.OptionParser()
    parser.add_option("-t", "--tagsetfile", dest="tagsetfile",
                      default=os.path.join("data", "tagset.txt"),
                      help="tagset that contains all the labels produced in the output, i.e. the y in \phi(x,y)")
    parser.add_option("-i", "--trainfile", dest="trainfile",
                      default=os.path.join("data", "train.txt.gz"),
                      help="input data, i.e. the x in \phi(x,y)")
    parser.add_option("-f", "--featfile", dest="featfile",
                      default=os.path.join("data", "train.feats.gz"),
                      help="precomputed features for the input data, i.e. the values of \phi(x,_) without y")
    parser.add_option("-e", "--numepochs", dest="numepochs", default=int(10),
                      help="number of epochs of training; in each epoch we iterate over over all the training examples")
    parser.add_option("-m", "--modelfile", dest="modelfile",
                      default=os.path.join("data", "default.model"),
                      help="weights for all features stored on disk")
    opts, _ = parser.parse_args()
    # Each element in the learned feat_vec dictionary is:
    #   key=feature_id, value=weight
    tagset = perc.read_tagset(opts.tagsetfile)
    sys.stderr.write("reading data ...\n")
    train_data = perc.read_labeled_data(opts.trainfile, opts.featfile)
    sys.stderr.write("done.\n")
    feat_vec = perc_train(train_data, tagset, int(opts.numepochs))
    perc.perc_write_to_file(feat_vec, opts.modelfile)
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron chunker with lazy ("tau") averaging.

    `feat_vec` holds the raw weights; `avg_feat_vec` accumulates the sums used
    for averaging; `tau_feat_vec` maps each (feature, tag) key to the
    (sentence j, epoch t) at which that key was last updated.  The helpers
    `lazy_update_vect` / `final_lazy_update_vect` / `update_bigram_vect` /
    `update_unigram_vect` are defined elsewhere in this file; presumably the
    lazy helpers bring a key's accumulated average up to date just before it
    is modified — confirm against their definitions.

    NOTE(review): reconstructed from whitespace-mangled source; statement
    nesting should be confirmed against the original layout.

    :return: avg_feat_vec, the accumulated sums divided by numepochs * m
    :raises ValueError: on an empty tagset or misaligned features
    """
    feat_vec = defaultdict(float)
    avg_feat_vec = defaultdict(float)
    tau_feat_vec = dict()
    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    m = len(train_data) # length of training data
    for t in range(numepochs):
        print 'Iteration#',t,' is processing now.'
        for j, (labeled_list, feat_list) in enumerate(train_data):
            labels = copy.deepcopy(labeled_list)
            # print 'sentence[',j,']'
            # add in the start and end buffers for the context
            # for every sentence in the training set, iterate numepochs times
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            # compare current output and true result
            if j != m - 1 or t != numepochs - 1:
                # Normal case: every sentence except the last one of the last epoch.
                feat_index = 0
                # check word by word if the predicted tag is equal to the true tag
                for i, v in enumerate(output):
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list) # retrieve the feature for a word
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")
                    label = labels[i].split()[2]
                    if i > 0:
                        label_pre = labels[i-1].split()[2]
                        for feat in feats:
                            if feat[0] == 'B': # for bigram feature
                                feat_out = feat + ":" + output[i-1] # feat_out is the "B:<previous output>"
                                feat_lab = feat + ":" + label_pre # feat_lab is the "B:<previous label>"
                                if output[i] != label or feat_out != feat_lab:
                                    # lazily bring both keys' averages up to date
                                    lazy_update_vect(feat_out, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                    lazy_update_vect(feat_lab, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                    # update original feature vector; if feat_out == feat_lab perform 2nd type updating
                                    update_bigram_vect(feat_vec, avg_feat_vec, feat_out, feat_lab, output[i], label)
                                    # if feat_out == feat_lab then update twice for the same tau
                                    tau_feat_vec[feat_out, output[i]] = (j, t)
                                    tau_feat_vec[feat_lab, label] = (j, t)
                            elif output[i] != label:
                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                # for U00 to U22 feature
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)
                                # record when these keys were last touched
                                tau_feat_vec[feat, output[i]] = (j, t)
                                tau_feat_vec[feat, label] = (j, t)
                    else:
                        # for i==0 case, all the first word in each sentence
                        label_pre = '_B-2' # previous label will be denoted by _B-2
                        for feat in feats:
                            if feat[0] == 'B' and output[i] != label: # bigram feature case
                                feat = feat + ":" + label_pre
                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                update_bigram_vect(feat_vec, avg_feat_vec, feat, feat, output[i], label)
                                tau_feat_vec[feat, label] = (j, t)
                                tau_feat_vec[feat, output[i]] = (j, t)
                            elif output[i] != label:
                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                # for U00 to U22 feature
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)
                                tau_feat_vec[feat, output[i]] = (j, t)
                                tau_feat_vec[feat, label] = (j, t)
            else:
                # Flush every pending lazy average before the final updates.
                final_lazy_update_vect(tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                # special case for the last sentence
                feat_index = 0
                # check word by word if the predicted tag is equal to the true tag
                for i, v in enumerate(output):
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list) # retrieve the feature for a word
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")
                    label = labels[i].split()[2]
                    if i > 0:
                        label_pre = labels[i-1].split()[2]
                        for feat in feats:
                            if feat[0] == 'B': # for bigram feature
                                feat_out = feat + ":" + output[i-1] # feat_out is the "B:<previous output>"
                                feat_lab = feat + ":" + label_pre # feat_lab is the "B:<previous label>"
                                if output[i] != label:
                                    # update original feature vector
                                    update_bigram_vect(feat_vec, avg_feat_vec, feat_out, feat_lab, output[i], label)
                            elif output[i] != label:
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)
                    else:
                        # for i==0 case, all the first word in each sentence
                        # NOTE(review): this branch uses sentinel '_B-1' while the
                        # non-final branch above uses '_B-2' — confirm which is intended.
                        label_pre = '_B-1' # previous label will be denoted by _B-2
                        for feat in feats:
                            if feat[0] == 'B' and output[i] != label: # bigram feature case
                                feat = feat + ":" + label_pre
                                update_bigram_vect(feat_vec, avg_feat_vec, feat, feat, output[i], label)
                            elif output[i] != label:
                                # for U00 to U22 feature
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)
    # end of iteration
    # averaging perceptron: divide the accumulated sums by the total update count
    for key in avg_feat_vec.keys():
        avg_feat_vec[key] = avg_feat_vec[key]/float(numepochs*m)
    # please limit the number of iterations of training to n iterations
    perc.perc_write_to_file(feat_vec, 'model_feat_vec')
    return avg_feat_vec
def perc_train(train_data, tagset, numepochs, word_set):
    """Train a structured-perceptron chunker (unigram + bigram features).

    Decodes each sentence with perc.perc_test using the current weights and,
    word by word, rewards gold (feature, tag) pairs and penalizes predicted
    ones.  Bigram 'B' features receive case-dependent updates (see the case
    analysis below); unigram U00..U22 features are updated for every word.

    NOTE(review): reconstructed from whitespace-mangled source; statement
    nesting should be confirmed against the original layout.

    :param word_set: extra lexicon forwarded to perc.perc_test
    :raises ValueError: on an empty tagset or misaligned features
    """
    feat_vec = defaultdict(int)
    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    # numepochs = int(50)
    default_tag = tagset[0]
    for t in range(numepochs):
        tmp = 0 # Count sentence
        print 'Iteration#',t,' is processing now.'
        cnt = 0
        for (labeled_list, feat_list) in train_data:
            cnt = cnt + 1
            if cnt % 1000 == 0:
                # Progress report; 9000.0 presumably approximates the number of
                # training sentences — verify against the data set size.
                print 'current status: ', str(round(100*cnt/9000.0,2)),'%'
            labels = copy.deepcopy(labeled_list)
            # add in the start and end buffers for the context
            # for every sentence in the training set, iterate numepochs times
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag, word_set)
            feat_index = 0
            # check word by word if the predicted tag is equal to the true tag
            for i, v in enumerate(output):
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list) # retrieve the feature for a word
                if len(feats) == 0:
                    print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                    raise ValueError("features do not align with input sentence")
                fields = labels[i].split()
                label = fields[2]  # gold chunk tag is the third column
                if i > 0:
                    label_pre = labels[i-1].split()[2]
                    for feat in feats:
                        if feat[0] == 'B':
                            # for bigram feature
                            feat_out = feat + ":" + output[i-1] # feat_out is the "B:<previous output>"
                            feat_lab = feat + ":" + label_pre # feat_lab is the "B:<previous label>"
                            # Case analysis on which parts of the bigram were wrong:
                            if output[i-1] != label_pre and output[i] != label:
                                feat_vec[feat_out, output[i]] -= 1
                                feat_vec[feat_lab, output[i]] -= 1
                                feat_vec[feat_out, label] += 1
                                feat_vec[feat_lab, label] += 1
                            elif output[i-1] == label_pre and output[i] != label:
                                # Previous tag correct, so feat_out == feat_lab and the
                                # combined update is applied with weight 2.
                                feat_vec[feat_lab, output[i]] -= 2
                                feat_vec[feat_lab, label] += 2
                            elif output[i-1] != label_pre and output[i] == label:
                                pass
                            elif output[i-1] == label_pre and output[i] == label:
                                pass
                        else:
                            # for U00 to U22 feature
                            # if the prediction is already right, the -1/+1 below hit
                            # the same key and cancel, so there is no net penalty/reward
                            feat_vec[feat, output[i]] -= 1
                            feat_vec[feat, label] += 1
                else:
                    # for i==0 case, all the first word in each sentence
                    label_pre = '_B-1' # previous label will be denoted by _B-1
                    for feat in feats:
                        if feat[0] == 'B':
                            # bigram feature case
                            feat = feat + ":" + label_pre
                        # NOTE(review): reconstructed indentation applies this update to
                        # every feature at i == 0 — confirm against the original.
                        feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                        feat_vec[feat, label] = feat_vec[feat, label] + 1
        # (A large commented-out earlier variant of the update rules was removed here.)
        if t % 3 == 0:
            perc.perc_write_to_file(feat_vec, 'model_' + str(t))
        # Write the current model and score it on the dev set after each epoch.
        perc.perc_write_to_file(feat_vec, 'model')
        os.system('python perc.py -m model | python score-chunks.py')
    # please limit the number of iterations of training to n iterations
    return feat_vec
if __name__ == '__main__':
    parser = optparse.OptionParser()
    parser.add_option("-t", "--tagsetfile", dest="tagsetfile",
                      default=os.path.join("data", "tagset.txt"),
                      help="tagset that contains all the labels produced in the output, i.e. the y in \phi(x,y)")
    parser.add_option("-i", "--trainfile", dest="trainfile",
                      default=os.path.join("data", "train.txt.gz"),
                      help="input data, i.e. the x in \phi(x,y)")
    parser.add_option("-f", "--featfile", dest="featfile",
                      default=os.path.join("data", "train.feats.gz"),
                      help="precomputed features for the input data, i.e. the values of \phi(x,_) without y")
    # Alternate dev-set defaults (data/train.dev, data/train.feats.dev) were
    # kept commented out in the original source.
    parser.add_option("-e", "--numepochs", dest="numepochs", default=int(10),
                      help="number of epochs of training; in each epoch we iterate over over all the training examples")
    parser.add_option("-m", "--modelfile", dest="modelfile",
                      default=os.path.join("data", "default.model"),
                      help="weights for all features stored on disk")
    parser.add_option("-w", "--wordsetfile", dest="wordsetfile",
                      default=os.path.join("data", "word_set"),
                      help="the word set write to disk")
    opts, _ = parser.parse_args()
    # Learned feat_vec format: {('U14:VBG','B-VP'): w1, ...}
    tagset = perc.read_tagset(opts.tagsetfile)
    sys.stderr.write("reading data ...\n")
    data = perc.read_labeled_data(opts.trainfile, opts.featfile)
    word_set = data[0]
    perc.perc_write_to_file(word_set, opts.wordsetfile)
    train_data = data[1]
    sys.stderr.write("done.\n")
    feat_vec = perc_train(train_data, tagset, int(opts.numepochs), word_set)
    perc.perc_write_to_file(feat_vec, opts.modelfile)
def perc_train(train_data, tagset, numepochs):
    """Train a structured-perceptron chunker with unigram ('U'), bigram ('B')
    and trigram ('T') features.

    Decodes each sentence with the current weights; for every word whose
    predicted tag is wrong, rewards the gold (feature, tag) pairs and
    penalizes the predicted ones, conditioning 'B'/'T' features on the one/two
    previous tags (with sentinel labels at the sentence start).  A model
    snapshot is written after each epoch and zero-weight entries are pruned.

    :param train_data: iterable of (labeled_list, feat_list) sentence pairs
    :param tagset: list of output tags; tagset[0] is used as the default tag
    :param numepochs: number of passes over train_data
    :return: feat_vec, dict mapping (feature, tag) -> weight
    :raises ValueError: on an empty tagset or misaligned features
    """
    feat_vec = defaultdict(int)
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    # BUG FIX: the original immediately overwrote `numepochs` with a
    # hard-coded 1, silently ignoring the caller's argument.
    default_tag = tagset[0]
    for t in range(numepochs):
        sys.stdout.write(" ".join(['Iteration#', str(t), ' is processing now.']) + "\n")
        cnt = 0
        for (labeled_list, feat_list) in train_data:
            cnt = cnt + 1
            sys.stdout.write(" ".join(['Sentence[', str(cnt), '] is now processing...']) + "\n")
            labels = copy.deepcopy(labeled_list)
            # Decode the sentence with the current weight vector.
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            feat_index = 0
            # Check word by word whether the predicted tag equals the true tag.
            for i, v in enumerate(output):
                # Retrieve the features for this word.
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                if len(feats) == 0:
                    sys.stderr.write(" ".join([" ".join(labels), " ".join(feat_list), "\n"]) + "\n")
                    raise ValueError("features do not align with input sentence")
                label = labels[i].split()[2]
                if i > 1:
                    label_i_1 = labels[i-1].split()[2]
                    label_i_2 = labels[i-2].split()[2]
                    if output[i] != label:
                        for feat in feats:
                            if feat[0] == 'T' and output[i-2] != label_i_2 and output[i-1] != label_i_1:
                                # Trigram case: condition on the two previous tags.
                                feat_out = feat + ":" + output[i-2] + "," + output[i-1]
                                feat_lab = feat + ":" + label_i_2 + "," + label_i_1
                                feat_vec[feat_lab, label] += 1      # reward gold context
                                feat_vec[feat_out, output[i]] -= 1  # penalize predicted context
                            elif feat[0] == 'B' and output[i-1] != label_i_1:
                                # Bigram case: condition on the previous tag.
                                feat_out = feat + ":" + output[i-1]
                                feat_lab = feat + ":" + label_i_1
                                feat_vec[feat_lab, label] += 1
                                feat_vec[feat_out, output[i]] -= 1
                            else:
                                # for U00 to U22 feature
                                # NOTE(review): 'T'/'B' features whose context guard
                                # fails fall through here and are updated without a
                                # context suffix — confirm this is intended.
                                feat_vec[feat, output[i]] -= 1
                                feat_vec[feat, label] += 1
                elif i == 1:
                    # Second word: only one real previous tag; the one before it
                    # is the sentinel '_-1'.
                    label_i_2 = '_-1'
                    label_i_1 = labels[i-1].split()[2]
                    if output[i] != label:
                        for feat in feats:
                            if feat[0] == 'T' and output[i-1] != label_i_1:
                                feat_out = feat + ":" + label_i_2 + "," + output[i-1]
                                feat_lab = feat + ":" + label_i_2 + "," + label_i_1
                                feat_vec[feat_lab, label] += 1
                                feat_vec[feat_out, output[i]] -= 1
                            elif feat[0] == 'B':
                                feat_out = feat + ":" + output[i-1]
                                feat_lab = feat + ":" + label_i_1
                                feat_vec[feat_lab, label] += 1
                                feat_vec[feat_out, output[i]] -= 1
                            else:
                                # for U00 to U22 feature
                                feat_vec[feat, output[i]] -= 1
                                feat_vec[feat, label] += 1
                elif i == 0:
                    # First word: both previous tags are sentinels.
                    label_i_2 = '_B-2'
                    label_i_1 = '_B-1'
                    if output[i] != label:
                        for feat in feats:
                            if feat[0] == 'T':
                                feat = feat + ":" + label_i_2 + "," + label_i_1
                            elif feat[0] == 'B':
                                feat = feat + ":" + label_i_1
                            feat_vec[feat, output[i]] -= 1
                            feat_vec[feat, label] += 1
        filename = 'mid_model_iter' + str(t)
        perc.perc_write_to_file(feat_vec, filename)
        # Prune zero-weight entries.  BUG FIX: iterate over a materialized
        # list so deleting during iteration is safe (dict.items() is a view
        # in Python 3 and mutation during iteration raises RuntimeError).
        for (k1, k2), v in list(feat_vec.items()):
            if v == 0:
                del feat_vec[k1, k2]
    # please limit the number of iterations of training to n iterations
    return feat_vec