def get_feature_ids(feat_list, gold_tags, output_tags):
    """Build per-word ``(feature, tag)`` id lists for gold and predicted tags.

    The flat ``feat_list`` is cut into one chunk per entry of ``gold_tags``
    by repeated calls to ``perc.feats_for_word``.  Every feature of word
    ``j`` is paired once with ``gold_tags[j]`` and once with
    ``output_tags[j]``.  A feature that is exactly ``'B'`` on any word but
    the first is expanded to ``'B:<previous tag>'`` (previous gold tag for
    the gold list, previous output tag for the output list); on the first
    word it is left as plain ``'B'``.

    Returns:
        A pair ``(feat_ids_gold, feat_ids_output)``; each is a list holding
        one sub-list of ``(feature, tag)`` tuples per word.
    """
    # Split the flat feature list into one chunk per word.
    per_word_feats = []
    cursor = 0
    for _ in gold_tags:
        cursor, word_feats = perc.feats_for_word(cursor, feat_list)
        per_word_feats.append(word_feats)

    feat_ids_gold, feat_ids_output = [], []
    for pos, word_feats in enumerate(per_word_feats):
        gold_ids, out_ids = [], []
        for name in word_feats:
            if name == 'B' and pos > 0:
                # Bigram feature: qualify it with the preceding tag.
                gold_name = '{0}:{1}'.format(name, gold_tags[pos - 1])
                out_name = '{0}:{1}'.format(name, output_tags[pos - 1])
            else:
                gold_name = out_name = name
            gold_ids.append((gold_name, gold_tags[pos]))
            out_ids.append((out_name, output_tags[pos]))
        feat_ids_gold.append(gold_ids)
        feat_ids_output.append(out_ids)
    return feat_ids_gold, feat_ids_output
def perc_train(train_data, tagset, numepochs):
    """Train perceptron weights, delegating comparison and update to helpers.

    Args:
        train_data: sequence of ``(labeled_list, feat_list)`` pairs.
        tagset: non-empty sequence of tags; ``tagset[0]`` is handed to
            ``perc.perc_test`` as the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        The ``defaultdict(int)`` that was passed to ``perc.perc_test`` and
        ``updateWeights`` during training.

    Progress is printed to stdout: the size of the training set, each epoch
    number, and every 100th sentence index.
    """
    # print() calls: the file also uses f-strings, so it can only be loaded
    # by Python 3, where the former print statements are syntax errors.
    print(len(train_data))
    feat_vec = defaultdict(int)
    defaultTag = tagset[0]
    for epoch in range(numepochs):
        print(epoch)
        # NOTE(review): feat_index is reset once per epoch, not per sentence,
        # so the offset returned for one sentence is reused as the start
        # offset into the next sentence's feat_list -- confirm this is
        # intended.
        feat_index = 0
        for k, (labeled_list, feat_list) in enumerate(train_data):
            if k % 100 == 0:
                print(" ", k)
            z = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                               defaultTag)
            # Work on a copy so the caller's label list is not modified.
            labels = copy.deepcopy(labeled_list)
            (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
            # Prepend a start-of-sentence marker to both sequences so that
            # word positions start at index 1.
            labels.insert(0, '_B-1 _B-1 _B-1')
            z.insert(0, '_B-1')
            # NOTE(review): range(1, N - 1) stops one short of the last
            # element of `labels`; presumably x()/updateWeights look ahead --
            # verify against those helpers.
            N = len(labels)
            for j in range(1, N - 1):
                if x(labels, j, 2) != z[j]:
                    updateWeights(feat_vec, labels, z, j, feats)
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train a plain (non-averaged) perceptron over per-word features.

    For every sentence whose predicted tag sequence (from
    ``perc.perc_test``) differs from the gold sequence, each feature of each
    word has its weight decremented for the predicted tag and incremented
    for the gold tag.

    Args:
        train_data: iterable of ``(label_list, feat_list)`` pairs; the gold
            tag is the third whitespace-separated field of each label entry.
        tagset: non-empty sequence of tags; ``tagset[0]`` is the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` to its weight.

    Raises:
        ValueError: if the predicted and gold sequences differ in length.
    """
    feat_vec = defaultdict(int)
    default_tag = tagset[0]
    # The epoch counter has its own name: the word loop below used to reuse
    # `i`, so the progress message reported a word index instead.
    for epoch in range(numepochs):
        for (label_list, feat_list) in train_data:
            cur = perc.perc_test(feat_vec, label_list, feat_list, tagset,
                                 default_tag)
            gold = [entry.split()[2] for entry in label_list]
            if cur != gold:
                # Pad both sequences with sentence-boundary markers.
                cur.insert(0, 'B_-1')
                gold.insert(0, 'B_-1')
                cur.append('B_+1')
                gold.append('B_+1')
                cur_len = len(cur)
                if cur_len != len(gold):
                    raise ValueError(
                        "output length is not the same with the input sentence"
                    )
                feat_index = 0
                # Perceptron update.  The range also covers the trailing
                # 'B_+1' marker, where both tags are equal and the update
                # cancels out.
                for pos in range(1, cur_len):
                    (feat_index, features) = perc.feats_for_word(
                        feat_index, feat_list)
                    for f in features:
                        feat_vec[(f, cur[pos])] -= 1
                        feat_vec[(f, gold[pos])] += 1
        print("iteration %d done." % epoch, file=sys.stderr)
    return feat_vec
def perc_train(train_data, tagset, numepochs, pos_dict):
    """Train a perceptron tagger and write a model file after every epoch.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs; the gold
            tag is the third whitespace-separated field of each label entry.
        tagset: non-empty sequence of tags; ``tagset[0]`` is the default tag.
        numepochs: number of passes over ``train_data``.  This value is now
            honoured; it used to be overwritten with a hard-coded 20.
        pos_dict: forwarded unchanged to ``perc.perc_test``.

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` to its weight.

    Raises:
        ValueError: if ``tagset`` is empty, or if a word has no features.

    Side effects:
        Prints a progress line per epoch and calls
        ``perc.perc_write_to_file(feat_vec, 'model_<epoch>')`` after each
        epoch.
    """
    feat_vec = defaultdict(int)
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    for t in range(numepochs):
        print('Iteration#', t, ' is processing now.')
        for (labeled_list, feat_list) in train_data:
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag, pos_dict)
            feat_index = 0
            # Compare predicted and gold tags word by word.
            for i, predicted in enumerate(output):
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                if len(feats) == 0:
                    print(" ".join(labeled_list), " ".join(feat_list), "\n",
                          file=sys.stderr)
                    raise ValueError(
                        "features do not align with input sentence")
                label = labeled_list[i].split()[2]
                if i > 0:
                    label_pre = labeled_list[i - 1].split()[2]
                    output_pre = output[i - 1]
                    # Value comparison: the former `is not` compared string
                    # identity, which is almost always true for strings
                    # produced by split().
                    if output_pre != label_pre or predicted != label:
                        for feat in feats:
                            if feat[0] == 'B':
                                # "B:<previous output>" / "B:<previous label>"
                                feat_out = feat + ":" + output_pre
                                feat_lab = feat + ":" + label_pre
                                # NOTE(review): all four context/tag
                                # combinations are touched, not only
                                # (feat_out, predicted) and (feat_lab, label)
                                # -- kept as written; confirm it is intended.
                                feat_vec[feat_out, predicted] -= 1
                                feat_vec[feat_out, label] += 1
                                feat_vec[feat_lab, predicted] -= 1
                                feat_vec[feat_lab, label] += 1
                            else:
                                feat_vec[feat, predicted] -= 1
                                feat_vec[feat, label] += 1
                else:
                    # First word: the previous tag is the start marker.
                    for feat in feats:
                        if feat[0] == 'B':
                            feat = feat + ":" + 'B_-1'
                        feat_vec[feat, predicted] -= 1
                        feat_vec[feat, label] += 1
        perc.perc_write_to_file(feat_vec, 'model_' + str(t))
    return feat_vec
def global_feature_vector(feat_list, tag_list):
    """Count ``(feature, tag)`` occurrences for one tagged sentence.

    ``perc.feats_for_word`` is called once per tag, walking through the flat
    ``feat_list``; every feature it yields for a word is paired with that
    word's tag and counted.

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` to its count.
    """
    counts = defaultdict(int)
    cursor = 0
    for tag in tag_list:
        cursor, word_feats = perc.feats_for_word(cursor, feat_list)
        for name in word_feats:
            counts[(name, tag)] += 1
    return counts
def perc_train(train_data, tagset, n):
    """Train a perceptron and return step-weighted (averaged) parameters.

    Two weight tables are maintained.  ``feat_vec`` receives the ordinary
    +/-1 perceptron updates and is used for decoding.  ``feat_avg_vec``
    receives the same updates scaled by ``step``, the fraction of training
    steps still remaining, so earlier updates count more.

    Args:
        train_data: sequence of ``(labeled_list, feat_list)`` pairs; the gold
            tag is the third whitespace-separated field of each label entry.
        tagset: non-empty sequence of tags; ``tagset[0]`` is the default tag.
        n: number of passes over ``train_data``.

    Returns:
        The step-weighted table, mapping ``(feature, tag)`` to its weight.

    Side effects:
        Prints a progress line per sentence and calls
        ``perc.perc_write_to_file`` with
        ``'models/n<iteration>avg_params.model'`` after every pass.
    """
    feat_vec = defaultdict(int)
    feat_avg_vec = defaultdict(int)
    default_tag = tagset[0]
    num_sentence = len(train_data)
    total_steps = n * num_sentence
    count = 0  # sentences processed so far, across all passes
    for iteration in range(n):
        for sent_index, sentence in enumerate(train_data, 1):
            print('{0}\r'.format(
                "\rIteration: %d/%d. Sentence: %d/%d\t"
                % (iteration + 1, n, sent_index, num_sentence)), end='')
            (labeled_list, feat_list) = sentence
            # Tags predicted with the current weights vs. the gold tags.
            estimated_tags = perc.perc_test(feat_vec, labeled_list,
                                            feat_list, tagset, default_tag)
            standard_tags = [item.split()[2] for item in labeled_list]
            if estimated_tags != standard_tags:
                step = (total_steps - count) * 1.0 / total_steps
                st_prev = es_prev = 'B_-1'
                index = 0
                for (st_tag, es_tag) in zip(standard_tags, estimated_tags):
                    (index, feats) = perc.feats_for_word(index, feat_list)
                    tag_differs = st_tag != es_tag
                    for feat in feats:
                        if feat == 'B':
                            # Bigram feature: keyed on the previous tag, so
                            # it changes when either tag of the pair differs.
                            if not (tag_differs or st_prev != es_prev):
                                continue
                            wrong_key = ('B:' + es_prev, es_tag)
                            right_key = ('B:' + st_prev, st_tag)
                        elif tag_differs:
                            wrong_key = (feat, es_tag)
                            right_key = (feat, st_tag)
                        else:
                            continue
                        feat_vec[wrong_key] -= 1
                        feat_vec[right_key] += 1
                        feat_avg_vec[wrong_key] -= step
                        feat_avg_vec[right_key] += step
                    # Advance the context once per word, so it stays aligned
                    # even for a word whose feature list has no 'B' entry.
                    es_prev = es_tag
                    st_prev = st_tag
            count += 1
        perc.perc_write_to_file(
            feat_avg_vec, 'models/n' + str(iteration) + 'avg_params.model')
    return feat_avg_vec
def perc_train(train_data, tagset, n):
    """Train a plain perceptron tagger with unigram and bigram features.

    For each sentence whose predicted tags (``perc.perc_test``) differ from
    the gold tags (``get_truth``), every feature of every word is decremented
    for the predicted tag and incremented for the gold tag.  The bigram
    feature ``'B'`` is expanded to ``'B:<previous tag>'`` using the previous
    predicted tag and previous gold tag respectively, so predicted and gold
    bigram features can differ even when the current tags agree.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs.
        tagset: non-empty sequence of tags; ``tagset[0]`` is the default tag.
        n: number of passes over ``train_data``.

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` to its weight.

    Raises:
        ValueError: if ``tagset`` is empty, or if a word has no features.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(int)
    for _epoch in range(n):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s", " ".join(output))
            logging.info("truth: %s", " ".join(true_output))
            if output != true_output:
                num_mistakes += 1
                # Pad with boundary markers so index i - 1 is valid for the
                # first word and words occupy positions 1 .. len - 2.
                output.insert(0, 'B_-1')
                output.append('B_+1')
                true_output.insert(0, 'B_-1')
                true_output.append('B_+1')
                feat_index = 0
                for i in range(1, len(output) - 1):
                    (feat_index, feats) = perc.feats_for_word(feat_index,
                                                              feat_list)
                    if len(feats) == 0:
                        print(" ".join(labeled_list), " ".join(feat_list),
                              "\n", file=sys.stderr)
                        raise ValueError(
                            "features do not align with input sentence")
                    # Collect the net change per key first, so that keys
                    # whose -1 and +1 cancel are not touched at all.
                    feat_vec_update = defaultdict(int)
                    for feat in feats:
                        if feat == 'B':
                            output_feat = 'B:' + output[i - 1]
                            truth_feat = 'B:' + true_output[i - 1]
                        else:
                            output_feat = truth_feat = feat
                        feat_vec_update[output_feat, output[i]] -= 1
                        feat_vec_update[truth_feat, true_output[i]] += 1
                    for (upd_feat, upd_tag), change in feat_vec_update.items():
                        if change != 0:
                            feat_vec[upd_feat, upd_tag] += change
                            logging.info(
                                "updating feat_vec with feature_id: "
                                "(%s, %s) value: %d",
                                upd_feat, upd_tag, change)
        print("number of mistakes:", num_mistakes, file=sys.stderr)
        logging.info("current number of mistakes: %d", num_mistakes)
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron using a decreasing update weight.

    Instead of summing every intermediate weight vector, each +/-1 update is
    added to the averaged table scaled by ``step / (numepochs * T)``, where
    ``T`` is the number of training sentences and ``step`` counts down from
    ``numepochs * T`` by one per sentence.  An update made at a given step
    therefore carries the fraction of training steps during which it is in
    effect.

    Args:
        train_data: sequence of ``(label_list, feat_list)`` pairs; the gold
            tag is the third whitespace-separated field of each label entry.
        tagset: non-empty sequence of tags; ``tagset[0]`` is the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        The averaged table, mapping ``(feature, tag)`` to a float weight.

    Raises:
        ValueError: if the predicted and gold sequences differ in length.
    """
    total_steps = numepochs * float(len(train_data))
    step = total_steps
    feat_vec_cache = defaultdict(int)  # averaged weights (returned)
    feat_vec = defaultdict(int)        # raw weights used for decoding
    default_tag = tagset[0]
    # The epoch counter has its own name: the word loop below used to reuse
    # `i`, so the progress message reported a word index instead.
    for epoch in range(numepochs):
        for (label_list, feat_list) in train_data:
            # Tag sequence produced with the current raw weights.
            cur = perc.perc_test(feat_vec, label_list, feat_list, tagset,
                                 default_tag)
            gold = [entry.split()[2] for entry in label_list]
            if cur != gold:
                cur.insert(0, 'B_-1')
                gold.insert(0, 'B_-1')
                cur.append('B_+1')
                gold.append('B_+1')
                cur_len = len(cur)
                if cur_len != len(gold):
                    raise ValueError(
                        "output length is not the same with the input sentence"
                    )
                # Parenthesised denominator: `step / numepochs * T` parses
                # as (step / numepochs) * T, which is not the documented
                # step / (numepochs * T).
                weight = step / total_steps
                feat_index = 0
                for pos in range(1, cur_len):
                    (feat_index, features) = perc.feats_for_word(
                        feat_index, feat_list)
                    # Reward features under the gold tag and penalise them
                    # under the predicted tag.
                    for f in features:
                        feat_vec[(f, cur[pos])] -= 1
                        feat_vec[(f, gold[pos])] += 1
                        feat_vec_cache[(f, cur[pos])] -= weight
                        feat_vec_cache[(f, gold[pos])] += weight
            step -= 1
        print("iteration %d done." % epoch, file=sys.stderr)
    return feat_vec_cache
def retrieve_feature(output, feat_list):
    """Count the ``(feature, tag)`` pairs produced by a tag sequence.

    The first and last entries of ``output`` are skipped; for every other
    entry the next chunk of features is pulled from ``feat_list`` with
    ``perc.feats_for_word`` and each feature is counted together with that
    entry's tag.

    Returns:
        A ``FeatureVector`` indexed by ``(feature, tag)``.

    Raises:
        ValueError: if ``perc.feats_for_word`` yields no features for a word.
    """
    counts = FeatureVector()
    cursor = 0
    for tag in output[1:-1]:
        cursor, word_feats = perc.feats_for_word(cursor, feat_list)
        if not len(word_feats):
            raise ValueError("Returned empty feature")
        for name in word_feats:
            counts[name, tag] += 1
    return counts
def perc_avg_train(train_data, tagset, numepochs):
    """Train an averaged perceptron by summing the weights after each sentence.

    The raw weights are updated whenever the tags from ``perc.perc_test``
    differ from the gold tags.  After every sentence the current raw weights
    are added into a running sum, which is finally divided by
    ``numepochs * len(train_data)``.

    A ``'B'`` feature on any word but the first is keyed as
    ``'B:<previous tag>'`` and is updated when either the previous or the
    current tag differs; every other feature (including ``'B'`` on the first
    word) is updated only when the current tag differs.

    Returns:
        ``defaultdict(float)`` mapping ``(feature, tag)`` to averaged weight.
    """
    weights = defaultdict(int)
    weight_sums = defaultdict(float)
    default_tag = tagset[0]

    for epoch in range(numepochs):
        count_mistake = 0
        print(f"Running on epoch {epoch+1}......")
        tic = time.time()
        for labeled_list, feat_list in train_data:
            pred_output = perc.perc_test(weights, labeled_list, feat_list,
                                         tagset, default_tag)
            true_output = [entry.split()[2] for entry in labeled_list]
            if pred_output != true_output:
                count_mistake += 1
                cursor = 0
                for pos, pred_tag in enumerate(pred_output):
                    true_tag = true_output[pos]
                    cursor, word_feats = perc.feats_for_word(cursor,
                                                             feat_list)
                    for name in word_feats:
                        if name == 'B' and pos > 0:
                            prev_true = true_output[pos - 1]
                            prev_pred = pred_output[pos - 1]
                            if prev_true != prev_pred or pred_tag != true_tag:
                                weights['B:' + prev_true, true_tag] += 1
                                weights['B:' + prev_pred, pred_tag] -= 1
                        elif pred_tag != true_tag:
                            weights[name, true_tag] += 1
                            weights[name, pred_tag] -= 1
            # Running sum of the weight vector: gamma = sigma / (m * T).
            for key, value in weights.items():
                weight_sums[key] += value
        toc = time.time()
        print(
            f'Epoch {epoch+1} finished. Time cost on this epoch: {toc-tic}. Number of mistakes: {count_mistake}.'
        )

    for key in weight_sums:
        weight_sums[key] /= (numepochs * len(train_data))
    return weight_sums
def perc_train(train_data, tagset, n):
    """Train a plain perceptron tagger with unigram and bigram features.

    For each sentence whose predicted tags (``perc.perc_test``) differ from
    the gold tags (``get_truth``), every feature of every word is decremented
    for the predicted tag and incremented for the gold tag.  The bigram
    feature ``'B'`` is expanded to ``'B:<previous tag>'`` using the previous
    predicted tag and previous gold tag respectively, so predicted and gold
    bigram features can differ even when the current tags agree.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs.
        tagset: non-empty sequence of tags; ``tagset[0]`` is the default tag.
        n: number of passes over ``train_data``.

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` to its weight.

    Raises:
        ValueError: if ``tagset`` is empty, or if a word has no features.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(int)
    for _epoch in range(n):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s", " ".join(output))
            logging.info("truth: %s", " ".join(true_output))
            if output != true_output:
                num_mistakes += 1
                # Pad with boundary markers so index i - 1 is valid for the
                # first word and words occupy positions 1 .. len - 2.
                output.insert(0, 'B_-1')
                output.append('B_+1')
                true_output.insert(0, 'B_-1')
                true_output.append('B_+1')
                feat_index = 0
                for i in range(1, len(output) - 1):
                    (feat_index, feats) = perc.feats_for_word(feat_index,
                                                              feat_list)
                    if len(feats) == 0:
                        print(" ".join(labeled_list), " ".join(feat_list),
                              "\n", file=sys.stderr)
                        raise ValueError(
                            "features do not align with input sentence")
                    # Collect the net change per key first, so that keys
                    # whose -1 and +1 cancel are not touched at all.
                    feat_vec_update = defaultdict(int)
                    for feat in feats:
                        if feat == 'B':
                            output_feat = 'B:' + output[i - 1]
                            truth_feat = 'B:' + true_output[i - 1]
                        else:
                            output_feat = truth_feat = feat
                        feat_vec_update[output_feat, output[i]] -= 1
                        feat_vec_update[truth_feat, true_output[i]] += 1
                    for (upd_feat, upd_tag), change in feat_vec_update.items():
                        if change != 0:
                            feat_vec[upd_feat, upd_tag] += change
                            logging.info(
                                "updating feat_vec with feature_id: "
                                "(%s, %s) value: %d",
                                upd_feat, upd_tag, change)
        print("number of mistakes:", num_mistakes, file=sys.stderr)
        logging.info("current number of mistakes: %d", num_mistakes)
    return feat_vec
def perc_train(train_data, tagset, n, default_tag='B-NP'):
    """Train a plain perceptron tagger with unigram and bigram features.

    Args:
        train_data: sequence of ``(labeled_list, feat_list)`` entries; the
            gold tag is the third space-separated field of each label entry.
        tagset: tags handed to ``perc.perc_test``.
        n: number of passes over ``train_data``.
        default_tag: default tag handed to ``perc.perc_test``.  Defaults to
            ``'B-NP'``, the value that used to be hard-coded.

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` to its weight.

    A progress line is printed to stdout for every sentence.
    """
    feat_vec = defaultdict(int)
    n_sentences = len(train_data)
    for i in range(n):
        for j, sentence in enumerate(train_data):
            print('{0}\r'.format(
                "\rIteration: %d/%d. Sentence: %d/%d\t"
                % (i + 1, n, j + 1, n_sentences)), end='')
            labeled_list = sentence[0]
            feat_list = sentence[1]
            # Gold tags from the training data vs. the tags that
            # perc.perc_test returns for the current weights.
            toutput = [tags.split(' ')[2] for tags in labeled_list]
            zoutput = perc.perc_test(feat_vec, labeled_list, feat_list,
                                     tagset, default_tag)
            if toutput == zoutput:
                continue
            index = 0
            for k, predicted_tag in enumerate(zoutput):
                true_tag = toutput[k]
                (index, feats) = perc.feats_for_word(index, feat_list)
                # Previous-tag context; the first word uses the start marker.
                if k >= 1:
                    zprevtag = zoutput[k - 1]
                    tprevtag = toutput[k - 1]
                else:
                    zprevtag = tprevtag = 'B_-1'
                for feat in feats:
                    if feat == 'B':
                        s1 = (feat + ':' + zprevtag, predicted_tag)
                        s2 = (feat + ':' + tprevtag, true_tag)
                    else:
                        s1 = (feat, predicted_tag)
                        s2 = (feat, true_tag)
                    # Identical keys would cancel out; skip them so no
                    # zero-valued entries are created.
                    if s1 != s2:
                        feat_vec[s1] -= 1
                        feat_vec[s2] += 1
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train a plain perceptron; only mis-tagged words trigger updates.

    For every word whose predicted tag (from ``perc.perc_test``) differs
    from its gold tag:

    * each feature of the word is decremented for the predicted tag and
      incremented for the gold tag, and
    * the bigram key ``'B:<previous tag>'`` is decremented for the predicted
      pair and incremented for the gold pair, with ``'B_-1'`` as the
      previous tag of the first word.

    Args:
        train_data: iterable of ``(label_list, feat_list)`` entries; every
            label entry must consist of exactly three space-separated fields
            (word, POS tag, chunk tag).
        tagset: non-empty sequence of tags; ``tagset[0]`` is the default tag.
            It is now used as passed in; previously it was discarded and
            re-read from ``opts.tagsetfile`` for every sentence.
        numepochs: number of passes over ``train_data``.

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` to its weight.
    """
    feat_vec = defaultdict(int)
    for _epoch in range(numepochs):
        for sentence_data in train_data:
            label_list = sentence_data[0]
            feat_list = sentence_data[1]
            truetags = []
            for label in label_list:
                # Strict three-way unpack keeps the original failure mode
                # (ValueError) for malformed label entries.
                (_word, _postag, chunktag) = label.split(" ")
                truetags.append(chunktag)
            default_tag = tagset[0]
            argmaxtags = perc.perc_test(feat_vec, label_list, feat_list,
                                        tagset, default_tag)
            feat_index = 0
            for i, tru in enumerate(truetags):
                (feat_index, feats_for_this_word) = perc.feats_for_word(
                    feat_index, feat_list)
                argmax = argmaxtags[i]
                if argmax == tru:
                    continue
                # Per-feature update for the mis-tagged word.
                for f in feats_for_this_word:
                    feat_vec[f, argmax] -= 1
                    feat_vec[f, tru] += 1
                # Bigram update keyed on the previous tag.
                if i == 0:
                    argmaxprev = "B:" + "B_-1"
                    truprev = "B:" + "B_-1"
                else:
                    argmaxprev = "B:" + argmaxtags[i - 1]
                    truprev = "B:" + truetags[i - 1]
                feat_vec[argmaxprev, argmax] -= 1
                feat_vec[truprev, tru] += 1
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train a perceptron with unigram, bigram ("B:") and trigram ("T:") keys.

    For every word, the predicted tag from ``perc.perc_test`` is compared
    with the gold tag (third whitespace-separated field of the label entry)
    and ``feat_vec`` is adjusted.  Words at position 0, position 1 and later
    positions are handled by separate branches because they have zero, one
    or two real predecessors; the missing predecessors are written as the
    literal markers ``B_-1`` / ``B_-2`` inside the feature names.

    NOTE(review): the source of this function was flattened onto a single
    line; the nesting below is a reconstruction from syntax.  Places where
    more than one nesting is plausible are marked individually -- confirm
    against the original file before relying on them.

    Raises:
        ValueError: if ``tagset`` is empty, or if a word has no features.
    """
    # feat_vec = perc.perc_read_from_file(opts.modelfile)
    # print len(feat_vec)
    feat_vec = defaultdict(int)
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    # NOTE(review): this overwrites the caller's `numepochs`, so training
    # always runs exactly one pass -- confirm this is intended.
    numepochs = int(1)
    default_tag = tagset[0]
    for t in range(numepochs):
        print "Iteration#", t, " is processing now."
        counter = 0
        for (labeled_list, feat_list) in train_data:
            # Progress output: running sentence number.
            counter += 1
            print counter
            labels = copy.deepcopy(labeled_list)
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            feat_index = 0
            # Check word by word whether the predicted tag equals the gold tag.
            for i, v in enumerate(output):
                # Retrieve the features for this word.
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                if len(feats) == 0:
                    print >> sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                    raise ValueError("features do not align with input sentence")
                fields = labels[i].split()
                label = fields[2]
                if i > 1:
                    # Two real predecessors are available.
                    pre_label = labels[i - 1].split()[2]
                    pre_pre_label = labels[i - 2].split()[2]
                    # Update only if any tag of the trigram window differs.
                    if output[i - 2] != pre_pre_label or output[i - 1] != pre_label or output[i] != label:
                        for feat in feats:
                            if feat[0] == "B":
                                # Bigram keys: "B:<previous output>" and
                                # "B:<previous label>".
                                feat_out = "B:" + output[i - 1]
                                feat_lab = "B:" + pre_label
                                if output[i - 1] != pre_label and output[i] != label:
                                    # Both tags differ: all four context/tag
                                    # combinations are adjusted.
                                    feat_vec[feat_out, output[i]] -= 1
                                    feat_vec[feat_lab, output[i]] -= 1
                                    feat_vec[feat_out, label] += 1
                                    feat_vec[feat_lab, label] += 1
                                elif output[i - 1] == pre_label and output[i] != label:
                                    # Same context, wrong tag: feat_out equals
                                    # feat_lab here, hence the step of 2.
                                    feat_vec[feat_lab, output[i]] -= 2
                                    feat_vec[feat_lab, label] += 2
                                elif output[i - 1] != pre_label and output[i] == label:
                                    pass
                                elif output[i - 1] == pre_label and output[i] == label:
                                    pass
                                # Trigram keys: "T:<tag i-2>/<tag i-1>".
                                feat_out = "T:" + output[i - 2] + "/" + output[i - 1]
                                feat_lab = "T:" + pre_pre_label + "/" + pre_label
                                # NOTE(review): ambiguous nesting -- the second
                                # -=/+= pair may originally have been outside
                                # this `if` (i.e. unconditional).  Confirm.
                                if output[i - 2] == pre_pre_label and output[i - 1] == pre_label and output[i] != label:
                                    feat_vec[feat_out, output[i]] -= 1
                                    feat_vec[feat_lab, label] += 1
                                    feat_vec[feat_out, output[i]] -= 1
                                    feat_vec[feat_lab, label] += 1
                            else:
                                # Any feature that does not start with "B".
                                feat_vec[feat, output[i]] -= 1
                                feat_vec[feat, label] += 1
                elif i == 1:
                    # Exactly one real predecessor.
                    pre_label = labels[i - 1].split()[2]
                    if output[i - 1] != pre_label or output[i] != label:
                        for feat in feats:
                            if feat[0] == "B":
                                # Bigram keys: "B:<previous output>" and
                                # "B:<previous label>".
                                feat_out = "B:" + output[i - 1]
                                feat_lab = "B:" + pre_label
                                feat_vec[feat_out, output[i]] -= 1
                                feat_vec[feat_lab, label] += 1
                            else:
                                # Any feature that does not start with "B".
                                feat_vec[feat, output[i]] -= 1
                                feat_vec[feat, label] += 1
                        # NOTE(review): ambiguous nesting -- placed once per
                        # word, after the feature loop; it may originally have
                        # been inside the `else` branch above.  Confirm.
                        feat_out = "T:B_-1/" + output[i - 1]
                        feat_lab = "T:B_-1/" + pre_label
                        feat_vec[feat_out, output[i]] -= 1
                        feat_vec[feat_lab, label] += 1
                else:
                    # i == 0: first word of the sentence, no real predecessor.
                    for feat in feats:
                        if feat[0] == "B":
                            # Bigram feature with the start marker as context.
                            feat = "B:B_-1"
                        feat_vec[feat, output[i]] -= 1
                        feat_vec[feat, label] += 1
                    # NOTE(review): ambiguous nesting -- placed once per word,
                    # after the feature loop.  Confirm.
                    feat = "T:B_-2/B_-1"
                    feat_vec[feat, output[i]] -= 1
                    feat_vec[feat, label] += 1
    return feat_vec
def perc_train(train_data, tagset, n, default_tag='B-NP'):
    """Train an averaged perceptron tagger with unigram and bigram features.

    The raw weights are updated whenever the tags from ``perc.perc_test``
    differ from the gold tags.  After every sentence the raw weights are
    added into a running sum; the sum is finally divided by
    ``n * len(train_data)`` and entries whose sum is exactly zero are
    dropped.

    Args:
        train_data: sequence of ``(labeled_list, feat_list)`` entries; the
            gold tag is the third space-separated field of each label entry.
        tagset: tags handed to ``perc.perc_test``.
        n: number of passes over ``train_data``.
        default_tag: default tag handed to ``perc.perc_test``.  Defaults to
            ``'B-NP'``, the value that used to be hard-coded.

    Returns:
        ``defaultdict(float)`` mapping ``(feature, tag)`` to its averaged
        weight.

    Progress messages are written to stderr.
    """
    feat_vec = defaultdict(int)
    sigma_feat_vec = defaultdict(float)
    print("training data ...", file=sys.stderr)
    n_sentences = len(train_data)
    for i in range(n):
        for j, sentence in enumerate(train_data):
            print('\r{0}'.format(
                "Iteration: %d/%d. Sentence: %d/%d\t"
                % (i + 1, n, j + 1, n_sentences)), end='', file=sys.stderr)
            labeled_list = sentence[0]
            feat_list = sentence[1]
            # Gold tags from the training data vs. the tags that
            # perc.perc_test returns for the current weights.
            toutput = [tags.split(' ')[2] for tags in labeled_list]
            zoutput = perc.perc_test(feat_vec, labeled_list, feat_list,
                                     tagset, default_tag)
            if toutput != zoutput:
                index = 0
                for p, predicted_tag in enumerate(zoutput):
                    true_tag = toutput[p]
                    (index, feats) = perc.feats_for_word(index, feat_list)
                    # Previous-tag context; the first word uses the start
                    # marker.
                    if p >= 1:
                        zprevtag = zoutput[p - 1]
                        tprevtag = toutput[p - 1]
                    else:
                        zprevtag = tprevtag = 'B_-1'
                    for feat in feats:
                        if feat == 'B':
                            s1 = (feat + ':' + zprevtag, predicted_tag)
                            s2 = (feat + ':' + tprevtag, true_tag)
                        else:
                            s1 = (feat, predicted_tag)
                            s2 = (feat, true_tag)
                        if s1 != s2:
                            feat_vec[s1] -= 1
                            feat_vec[s2] += 1
            # Accumulate the weight vector after every sentence.
            for key, weight in feat_vec.items():
                sigma_feat_vec[key] += weight
    print("\ndone", file=sys.stderr)

    # Drop exact zeros, then turn the sums into averages.
    for key in [k for k, total in sigma_feat_vec.items() if total == 0]:
        del sigma_feat_vec[key]
    for key in sigma_feat_vec:
        sigma_feat_vec[key] = sigma_feat_vec[key] / (n * n_sentences)
    return sigma_feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron with lazily updated averages.

    Keeps three tables: ``feat_vec`` (raw weights, used for decoding),
    ``avg_feat_vec`` (accumulated weights, returned after division by
    ``numepochs * len(train_data)``) and ``tau_feat_vec``, which records for
    each ``(feature, tag)`` key the ``(sentence index, epoch)`` at which it
    was last touched.

    The actual arithmetic lives in ``lazy_update_vect``,
    ``update_bigram_vect``, ``update_unigram_vect`` and
    ``final_lazy_update_vect``, which are defined outside this block.
    NOTE(review): presumably the lazy helpers bring ``avg_feat_vec`` up to
    date for the steps during which a key was idle -- verify against their
    definitions.

    The very last sentence of the very last epoch is handled by a separate
    branch that first calls ``final_lazy_update_vect`` and then applies the
    plain updates without touching ``tau_feat_vec``.

    NOTE(review): the source of this function was flattened onto a few long
    lines; the nesting below is a reconstruction from syntax.

    Side effects:
        Prints a progress line per epoch and writes the raw weights with
        ``perc.perc_write_to_file(feat_vec, 'model_feat_vec')``.

    Raises:
        ValueError: if ``tagset`` is empty, or if a word has no features.
    """
    feat_vec = defaultdict(float)
    avg_feat_vec = defaultdict(float)
    tau_feat_vec = dict()
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    m = len(train_data)  # number of training sentences
    for t in range(numepochs):
        print 'Iteration#',t,' is processing now.'
        for j, (labeled_list, feat_list) in enumerate(train_data):
            labels = copy.deepcopy(labeled_list)
            # print 'sentence[',j,']'
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            # Every sentence except the last one of the last epoch.
            if j != m - 1 or t != numepochs - 1:
                feat_index = 0
                # Check word by word whether the predicted tag equals the
                # gold tag.
                for i, v in enumerate(output):
                    # Retrieve the features for this word.
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")
                    label = labels[i].split()[2]
                    if i > 0:
                        label_pre = labels[i-1].split()[2]
                        for feat in feats:
                            if feat[0] == 'B':
                                # Bigram keys: "B:<previous output>" and
                                # "B:<previous label>".
                                feat_out = feat + ":" + output[i-1]
                                feat_lab = feat + ":" + label_pre
                                if output[i] != label or feat_out != feat_lab:
                                    # Lazy update of both keys before their
                                    # raw weights change.
                                    lazy_update_vect(feat_out, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                    lazy_update_vect(feat_lab, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                    # Update the raw feature vector.
                                    update_bigram_vect(feat_vec, avg_feat_vec, feat_out, feat_lab, output[i], label)
                                    # Remember when both keys were last
                                    # touched.
                                    tau_feat_vec[feat_out, output[i]] = (j, t)
                                    tau_feat_vec[feat_lab, label] = (j, t)
                            elif output[i] != label:
                                # Any feature that does not start with 'B'.
                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)
                                tau_feat_vec[feat, output[i]] = (j, t)
                                tau_feat_vec[feat, label] = (j, t)
                    else:
                        # i == 0: first word of the sentence.
                        # NOTE(review): the start marker here is '_B-2' while
                        # the last-sentence branch below uses '_B-1' --
                        # confirm which one the decoder expects.
                        label_pre = '_B-2'
                        for feat in feats:
                            if feat[0] == 'B' and output[i] != label:
                                # Bigram feature with the start marker as
                                # context.
                                feat = feat + ":" + label_pre
                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                update_bigram_vect(feat_vec, avg_feat_vec, feat, feat, output[i], label)
                                tau_feat_vec[feat, label] = (j, t)
                                tau_feat_vec[feat, output[i]] = (j, t)
                            elif output[i] != label:
                                # Any feature that does not start with 'B'.
                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)
                                tau_feat_vec[feat, output[i]] = (j, t)
                                tau_feat_vec[feat, label] = (j, t)
            else:
                # Special case: last sentence of the last epoch.
                final_lazy_update_vect(tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                feat_index = 0
                # Check word by word whether the predicted tag equals the
                # gold tag.
                for i, v in enumerate(output):
                    # Retrieve the features for this word.
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")
                    label = labels[i].split()[2]
                    if i > 0:
                        label_pre = labels[i-1].split()[2]
                        for feat in feats:
                            if feat[0] == 'B':
                                # Bigram keys: "B:<previous output>" and
                                # "B:<previous label>".
                                feat_out = feat + ":" + output[i-1]
                                feat_lab = feat + ":" + label_pre
                                # NOTE(review): unlike the main branch, a
                                # context-only difference (feat_out !=
                                # feat_lab) does not trigger an update here.
                                if output[i] != label:
                                    update_bigram_vect(feat_vec, avg_feat_vec, feat_out, feat_lab, output[i], label)
                            elif output[i] != label:
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)
                    else:
                        # i == 0: first word of the sentence.
                        label_pre = '_B-1'
                        for feat in feats:
                            if feat[0] == 'B' and output[i] != label:
                                # Bigram feature with the start marker as
                                # context.
                                feat = feat + ":" + label_pre
                                update_bigram_vect(feat_vec, avg_feat_vec, feat, feat, output[i], label)
                            elif output[i] != label:
                                # Any feature that does not start with 'B'.
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)
    # End of training: turn the accumulated weights into averages.
    for key in avg_feat_vec.keys():
        avg_feat_vec[key] = avg_feat_vec[key]/float(numepochs*m)
    perc.perc_write_to_file(feat_vec, 'model_feat_vec')
    return avg_feat_vec
def avg_perc_train(train_data, tagset, n):
    """Train an averaged perceptron tagger using lazy averaging.

    The average of the weight vector over all training steps (one step per
    sentence) is computed without summing the whole vector after every
    sentence.  For each key, ``last_iter`` stores the step from which its
    current weight has been in effect.  Just before a weight changes at step
    ``s``, the old weight is credited to ``avg_vec`` for the steps
    ``last_iter[key] .. s - 1``; the final flush credits the remaining steps.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs.
        tagset: non-empty sequence of tags; ``tagset[0]`` is the default tag.
        n: number of passes over ``train_data``.

    Returns:
        The weight table, with every weight replaced by its average over all
        training steps (floats), keyed by ``(feature, tag)``.

    Raises:
        ValueError: if ``tagset`` is empty, or if a word has no features.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(int)
    avg_vec = defaultdict(int)
    last_iter = {}
    num_updates = 0  # training steps so far (one per sentence)
    for _epoch in range(n):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            num_updates += 1
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s", " ".join(output))
            logging.info("truth: %s", " ".join(true_output))
            if output != true_output:
                num_mistakes += 1
                # Pad with boundary markers so index i - 1 is valid for the
                # first word and words occupy positions 1 .. len - 2.
                output.insert(0, 'B_-1')
                output.append('B_+1')
                true_output.insert(0, 'B_-1')
                true_output.append('B_+1')
                feat_index = 0
                for i in range(1, len(output) - 1):
                    (feat_index, feats) = perc.feats_for_word(feat_index,
                                                              feat_list)
                    if len(feats) == 0:
                        print(" ".join(labeled_list), " ".join(feat_list),
                              "\n", file=sys.stderr)
                        raise ValueError(
                            "features do not align with input sentence")
                    # Net change per key for this word; pairs that cancel
                    # (same feature, same tag) end up as 0 and are skipped.
                    feat_vec_update = defaultdict(int)
                    for feat in feats:
                        if feat == 'B':
                            output_feat = 'B:' + output[i - 1]
                            truth_feat = 'B:' + true_output[i - 1]
                        else:
                            output_feat = truth_feat = feat
                        feat_vec_update[output_feat, output[i]] -= 1
                        feat_vec_update[truth_feat, true_output[i]] += 1
                    for key, change in feat_vec_update.items():
                        if change == 0:
                            continue
                        # Credit the OLD weight for the steps it was in
                        # effect.  Crediting after the change would apply
                        # the new weight to the whole idle interval.
                        idle = num_updates - last_iter.get(key, num_updates)
                        avg_vec[key] += idle * feat_vec[key]
                        feat_vec[key] += change
                        last_iter[key] = num_updates
                        logging.info(
                            "updating feat_vec with feature_id: "
                            "(%s, %s) value: %d", key[0], key[1], change)
        print("number of mistakes:", num_mistakes, file=sys.stderr)

    # Final flush: credit each current weight for the steps from its last
    # change up to and including the final step, then normalise.  A key that
    # was never changed held its weight during every step.
    for key in feat_vec:
        remaining = num_updates + 1 - last_iter.get(key, 1)
        total = avg_vec[key] + remaining * feat_vec[key]
        feat_vec[key] = total / float(num_updates)
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron chunker and return averaged weights.

    For every word whose predicted tag differs from the true tag, each
    feature returned by perc.feats_for_word is penalised for the predicted
    tag and rewarded for the true tag, and the bigram feature
    "B:<previous tag>" is updated the same way ("B_-1" stands in for the
    previous tag of the first word).  Weights are averaged lazily over all
    processed sentences.

    Args:
        train_data: sequence of (label_list, feat_list) pairs; each label
            is a "word postag chunktag" string separated by single spaces.
        tagset: non-empty sequence of tags; tagset[0] is the default tag
            passed to perc.perc_test.
        numepochs: number of passes over train_data.

    Returns:
        defaultdict(float) mapping (feature, tag) to the averaged weight.

    Raises:
        ValueError: if tagset is empty or a label does not have exactly
            three space-separated fields.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(int)
    cumulative_feat_vec = defaultdict(float)
    # last_step[key]: global sentence index at which key last changed;
    # cumulative_feat_vec[key] covers every sentence before that index.
    last_step = {}
    numsen = len(train_data)
    count = 0  # global sentence index, also the number processed so far

    def bump(key, delta, now):
        # Credit the weight held since the last change, then update it.
        cumulative_feat_vec[key] += feat_vec[key] * (
            now - last_step.get(key, now))
        last_step[key] = now
        feat_vec[key] += delta

    for epoch in range(numepochs):
        print(epoch)
        for sen, (label_list, feat_list) in enumerate(train_data):
            truetags = []
            for label in label_list:
                (_word, _postag, chunktag) = label.split(" ")
                truetags.append(chunktag)
            argmaxtags = perc.perc_test(feat_vec, label_list, feat_list,
                                        tagset, default_tag)
            feat_index = 0
            for i, tru in enumerate(truetags):
                # Always consume this word's features to stay aligned.
                (feat_index, feats_for_this_word) = perc.feats_for_word(
                    feat_index, feat_list)
                argmax = argmaxtags[i]
                if argmax == tru:
                    continue
                for f in feats_for_this_word:
                    bump((f, argmax), -1, count)
                    bump((f, tru), 1, count)
                if i == 0:
                    argmaxprev = truprev = "B_-1"
                else:
                    argmaxprev = argmaxtags[i - 1]
                    truprev = truetags[i - 1]
                bump(("B:" + argmaxprev, argmax), -1, count)
                bump(("B:" + truprev, tru), 1, count)
            count += 1
            if sen % 1000 == 0:
                print(str(sen) + "/" + str(numsen))

    # Flush the interval from each weight's last change to the end of
    # training, then turn the sums into averages.
    for key, changed_at in last_step.items():
        cumulative_feat_vec[key] += feat_vec[key] * (count - changed_at)
        cumulative_feat_vec[key] = cumulative_feat_vec[key] / float(count)
    return cumulative_feat_vec
def perc_train(train_data, tagset, T):
    """Train an averaged perceptron and return the averaged weights.

    Time is measured in processed sentences: sentence i of epoch t (both
    0-based) has time (t + 1) * M + (i + 1), where M is the number of
    training sentences.  sigma_feat_vec[key] always holds the sum of the
    weight over every time step up to and including tau[key]; the gap to
    the present is filled in just before the weight changes, and once more
    for all keys at the final sentence.

    Args:
        train_data: indexable sequence of (labeled_list, feat_list) pairs;
            each label is a space-separated string whose third field is
            the true tag.
        tagset: tag collection forwarded to perc.perc_test.
        T: number of passes over train_data.

    Returns:
        defaultdict(float) mapping (feature, tag) to the average weight;
        entries whose sum is zero are removed.
    """
    feat_vec = defaultdict(int)
    sigma_feat_vec = defaultdict(float)
    tau = {}

    def catch_up(key, now):
        # Add the unchanged weight for every step since it was last touched.
        if key in tau:
            sigma_feat_vec[key] += feat_vec[key] * (now - tau[key])
        tau[key] = now

    sys.stderr.write("training data ...\n")
    M = len(train_data)
    for t in range(T):
        for i in range(M):
            sys.stderr.write('\r{0}'.format(
                "Iteration: %d/%d. Sentence: %d/%d\t" % (t + 1, T, i + 1, M)))
            labeled_list = train_data[i][0]
            feat_list = train_data[i][1]
            toutput = [tags.split(' ')[2] for tags in labeled_list]
            # NOTE(review): the default tag is hard-coded rather than taken
            # from tagset -- confirm 'B-NP' is always a member.
            zoutput = perc.perc_test(feat_vec, labeled_list, feat_list,
                                     tagset, 'B-NP')
            now = (t + 1) * M + (i + 1)

            if t == T - 1 and i == M - 1:
                # Final sentence: bring every weight's sum up to date.
                for key in list(tau):
                    catch_up(key, now)

            if toutput == zoutput:
                continue

            index = 0
            for p in range(len(zoutput)):
                predicted_tag = zoutput[p]
                true_tag = toutput[p]
                (index, feats) = perc.feats_for_word(index, feat_list)
                for feat in feats:
                    if feat == 'B':
                        if p >= 1:
                            zprevtag = zoutput[p - 1]
                            tprevtag = toutput[p - 1]
                        else:
                            zprevtag = tprevtag = 'B_-1'
                        s1 = (feat + ':' + zprevtag, predicted_tag)
                        s2 = (feat + ':' + tprevtag, true_tag)
                    else:
                        s1 = (feat, predicted_tag)
                        s2 = (feat, true_tag)
                    if s1 == s2:
                        # Penalty and reward cancel; nothing changes.
                        continue
                    catch_up(s1, now)
                    catch_up(s2, now)
                    feat_vec[s1] -= 1
                    feat_vec[s2] += 1
                    sigma_feat_vec[s1] -= 1
                    sigma_feat_vec[s2] += 1
    sys.stderr.write("\ndone\n")

    sys.stderr.write("computing average vector ...\n")
    zerokeys = [f for f in sigma_feat_vec if sigma_feat_vec[f] == 0]
    for k in zerokeys:
        del sigma_feat_vec[k]
    for f in sigma_feat_vec:
        sigma_feat_vec[f] = sigma_feat_vec[f] / (T * M)
    sys.stderr.write("done\n")
    return sigma_feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train a perceptron tagger with unigram, bigram and trigram features.

    For every word whose predicted tag differs from the true tag, each
    feature is rewarded for the true tag and penalised for the predicted
    tag.  Features whose name starts with 'T' or 'B' may be extended with
    the previous two / previous one tags (see _context_keys).  After each
    epoch the current weights are written with perc.perc_write_to_file to
    'mid_model_iter<epoch>'.

    Args:
        train_data: re-iterable collection of (labeled_list, feat_list)
            pairs; the third whitespace-separated field of each label is
            the true tag.
        tagset: non-empty sequence of tags; tagset[0] is the default tag
            passed to perc.perc_test.
        numepochs: number of passes over train_data.

    Returns:
        defaultdict(int) mapping (feature, tag) to weight, with zero
        weights removed.

    Raises:
        ValueError: if tagset is empty, or if perc.feats_for_word yields no
            features for a word.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(int)

    def _context_keys(feat, i, output, gold):
        # Return (predicted-side name, true-side name) for feature `feat`
        # at word i.
        # NOTE(review): when the previous tags agree (i > 1, or 'T' at
        # i == 1) the bare feature name is used with no tag context;
        # this mirrors the original logic -- confirm it matches the
        # feature ids perc.perc_test looks up.
        kind = feat[0]
        if i > 1:
            if (kind == 'T' and output[i - 2] != gold[i - 2]
                    and output[i - 1] != gold[i - 1]):
                return (feat + ":" + output[i - 2] + "," + output[i - 1],
                        feat + ":" + gold[i - 2] + "," + gold[i - 1])
            if kind == 'B' and output[i - 1] != gold[i - 1]:
                return (feat + ":" + output[i - 1],
                        feat + ":" + gold[i - 1])
            return (feat, feat)
        if i == 1:
            if kind == 'T' and output[0] != gold[0]:
                return (feat + ":" + '_-1' + "," + output[0],
                        feat + ":" + '_-1' + "," + gold[0])
            if kind == 'B':
                return (feat + ":" + output[0], feat + ":" + gold[0])
            return (feat, feat)
        # First word of the sentence: fixed boundary context on both sides.
        if kind == 'T':
            feat = feat + ":" + '_B-2' + "," + '_B-1'
        elif kind == 'B':
            feat = feat + ":" + '_B-1'
        return (feat, feat)

    for t in range(numepochs):
        print('Iteration# %s  is processing now.' % t)
        for cnt, (labeled_list, feat_list) in enumerate(train_data, 1):
            print('Sentence[ %s ] is now processing...' % cnt)
            gold = [entry.split()[2] for entry in labeled_list]
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag)
            feat_index = 0
            for i in range(len(output)):
                # Always consume this word's features to stay aligned.
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                if len(feats) == 0:
                    sys.stderr.write("%s %s \n\n" % (" ".join(labeled_list),
                                                     " ".join(feat_list)))
                    raise ValueError(
                        "features do not align with input sentence")
                label = gold[i]
                if output[i] == label:
                    continue
                for feat in feats:
                    feat_out, feat_lab = _context_keys(feat, i, output, gold)
                    feat_vec[feat_lab, label] += 1
                    feat_vec[feat_out, output[i]] -= 1
        perc.perc_write_to_file(feat_vec, 'mid_model_iter' + str(t))

    # Collect the keys first: deleting while iterating a dict view raises
    # RuntimeError on Python 3.
    for key in [k for k, v in feat_vec.items() if v == 0]:
        del feat_vec[key]
    return feat_vec
def perc_train(train_data, tagset, numepochs, word_set):
    """Train a perceptron tagger, scoring the model after every epoch.

    Update rules per word i (prediction `output`, truth `label`):
      * feature starting with 'B', i > 0: extended with the previous
        predicted tag (feat_out) and previous true tag (feat_lab).  If
        both the previous and current tags are wrong, both extended
        features are penalised for the predicted tag and rewarded for the
        true tag; if only the current tag is wrong, feat_lab is moved by 2;
        otherwise nothing changes.
      * any other feature, and every feature of the first word (where a
        'B' feature is extended with '_B-1'): -1 for the predicted tag and
        +1 for the true tag, unconditionally (a net zero when they agree).

    After each epoch the weights are written to 'model' (and to
    'model_<epoch>' every third epoch) and an external scoring pipeline is
    run through os.system.

    Args:
        train_data: sequence of (labeled_list, feat_list) pairs; the third
            whitespace-separated field of each label is the true tag.
        tagset: non-empty sequence of tags; tagset[0] is the default tag.
        numepochs: number of passes over train_data.
        word_set: forwarded unchanged to perc.perc_test.

    Returns:
        defaultdict(int) mapping (feature, tag) to weight.

    Raises:
        ValueError: if tagset is empty, or if perc.feats_for_word yields no
            features for a word.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(int)
    total = len(train_data)

    for t in range(numepochs):
        print('Iteration# %s  is processing now.' % t)
        for cnt, (labeled_list, feat_list) in enumerate(train_data, 1):
            if cnt % 1000 == 0:
                # Progress relative to the actual corpus size.
                print('current status:  %s %%'
                      % round(100.0 * cnt / total, 2))
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag, word_set)
            feat_index = 0
            for i in range(len(output)):
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                if len(feats) == 0:
                    sys.stderr.write("%s %s \n\n" % (" ".join(labeled_list),
                                                     " ".join(feat_list)))
                    raise ValueError(
                        "features do not align with input sentence")
                label = labeled_list[i].split()[2]
                guess = output[i]

                if i == 0:
                    # First word: the previous label is the marker '_B-1'.
                    for feat in feats:
                        if feat[0] == 'B':
                            feat = feat + ":" + '_B-1'
                        feat_vec[feat, guess] -= 1
                        feat_vec[feat, label] += 1
                    continue

                label_pre = labeled_list[i - 1].split()[2]
                prev_wrong = output[i - 1] != label_pre
                cur_wrong = guess != label
                for feat in feats:
                    if feat[0] != 'B':
                        feat_vec[feat, guess] -= 1
                        feat_vec[feat, label] += 1
                        continue
                    if not cur_wrong:
                        continue
                    feat_out = feat + ":" + output[i - 1]
                    feat_lab = feat + ":" + label_pre
                    if prev_wrong:
                        feat_vec[feat_out, guess] -= 1
                        feat_vec[feat_lab, guess] -= 1
                        feat_vec[feat_out, label] += 1
                        feat_vec[feat_lab, label] += 1
                    else:
                        feat_vec[feat_lab, guess] -= 2
                        feat_vec[feat_lab, label] += 2

        if t % 3 == 0:
            perc.perc_write_to_file(feat_vec, 'model_' + str(t))
        perc.perc_write_to_file(feat_vec, 'model')
        # Fixed command string (no external input) that scores the model
        # just written; requires perc.py and score-chunks.py in the cwd.
        os.system('python perc.py -m model | python score-chunks.py')
    return feat_vec
def avg_perc_train(train_data, tagset, n):
    """Train an averaged perceptron and return the averaged weights.

    The average is taken over the weight vector as it stands after each
    processed sentence.  It is maintained lazily: a weight's contribution
    for the sentences during which it did not change is added only when the
    weight is next touched (or at the very end).

    Args:
        train_data: re-iterable collection of (labeled_list, feat_list)
            pairs, one per sentence.
        tagset: non-empty sequence of tags; tagset[0] is handed to
            perc.perc_test as the default tag.
        n: number of passes over train_data.

    Returns:
        The weight dictionary keyed by (feature, tag), holding averaged
        (float) weights.

    Raises:
        ValueError: if tagset is empty, or if perc.feats_for_word yields no
            features for a word.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(int)
    avg_vec = defaultdict(int)
    # last_iter[key] is the sentence number at which key last changed;
    # avg_vec[key] holds the sum of that weight over all earlier sentences.
    last_iter = {}
    num_updates = 0  # number of sentences processed so far (1-based clock)

    for _epoch in range(n):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            num_updates += 1
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s", " ".join(output))
            logging.info("truth: %s", " ".join(true_output))
            if output == true_output:
                continue

            num_mistakes += 1
            # Pad both sequences so position i-1 exists for the first word.
            output.insert(0, 'B_-1')
            output.append('B_+1')
            true_output.insert(0, 'B_-1')
            true_output.append('B_+1')

            feat_index = 0
            for i in range(1, len(output) - 1):
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                if len(feats) == 0:
                    sys.stderr.write("%s %s \n\n" % (" ".join(labeled_list),
                                                     " ".join(feat_list)))
                    raise ValueError(
                        "features do not align with input sentence")

                # Net change per feature id for this word; identical
                # predicted/true ids cancel out to zero.
                feat_vec_update = defaultdict(int)
                for feat in feats:
                    if feat == 'B':
                        output_feat = 'B:' + output[i - 1]
                        truth_feat = 'B:' + true_output[i - 1]
                    else:
                        output_feat = truth_feat = feat
                    feat_vec_update[output_feat, output[i]] -= 1
                    feat_vec_update[truth_feat, true_output[i]] += 1

                for key, delta in feat_vec_update.items():
                    if delta == 0:
                        continue
                    # Credit the OLD weight for every sentence since the
                    # last change, then apply the update.
                    idle = num_updates - last_iter.get(key, num_updates)
                    avg_vec[key] += idle * feat_vec[key]
                    last_iter[key] = num_updates
                    feat_vec[key] += delta
                    logging.info(
                        "updating feat_vec with feature_id: (%s, %s) "
                        "value: %d", key[0], key[1], delta)
        sys.stderr.write("number of mistakes: %d\n" % num_mistakes)

    # Flush the remaining interval (last change .. final sentence inclusive)
    # and divide with true division so Python 2 does not truncate.
    total = float(num_updates)
    for key in feat_vec:
        remaining = num_updates - last_iter.get(key, 1) + 1
        avg_vec[key] += remaining * feat_vec[key]
        feat_vec[key] = avg_vec[key] / total
    return feat_vec
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron tagger and return averaged weights.

    Update rules per word i (prediction `output`, truth `label`):
      * feature starting with 'B', i > 0: extended with the previous
        predicted tag (feat_out) and previous true tag (feat_lab).  If
        both the previous and current tags are wrong, both extended
        features are penalised for the predicted tag and rewarded for the
        true tag; if only the current tag is wrong, feat_lab is moved by 2;
        otherwise nothing changes.
      * any other feature, and every feature of the first word (where a
        'B' feature is extended with 'B_-1'): -1 for the predicted tag and
        +1 for the true tag, unconditionally (a net zero when they agree).

    Averaging is lazy.  Sentence j of epoch t has time t * m + j, and
    avg_feat_vec[key] always holds the sum of that weight over every time
    step up to and including tau_feat_vec[key]; the gap is filled in just
    before the weight changes and once more after training.

    Args:
        train_data: sequence of (labeled_list, feat_list) pairs; the third
            whitespace-separated field of each label is the true tag.
        tagset: non-empty sequence of tags; tagset[0] is the default tag
            passed to perc.perc_test.
        numepochs: number of passes over train_data.

    Returns:
        defaultdict(float) mapping (feature, tag) to the averaged weight.

    Raises:
        ValueError: if tagset is empty, or if perc.feats_for_word yields no
            features for a word.
    """
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(float)
    avg_feat_vec = defaultdict(float)
    tau_feat_vec = {}
    m = len(train_data)
    total_steps = numepochs * m

    def bump(key, delta, now):
        # Credit the unchanged weight for the steps since its last change,
        # then apply the update to both the weight and its running sum.
        avg_feat_vec[key] += feat_vec[key] * (
            now - tau_feat_vec.get(key, now))
        feat_vec[key] += delta
        avg_feat_vec[key] += delta
        tau_feat_vec[key] = now

    for t in range(numepochs):
        print('Iteration# %s  is processing now.' % t)
        for j, (labeled_list, feat_list) in enumerate(train_data):
            now = t * m + j
            output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                    tagset, default_tag)
            feat_index = 0
            for i in range(len(output)):
                (feat_index, feats) = perc.feats_for_word(feat_index,
                                                          feat_list)
                if len(feats) == 0:
                    sys.stderr.write("%s %s \n\n" % (" ".join(labeled_list),
                                                     " ".join(feat_list)))
                    raise ValueError(
                        "features do not align with input sentence")
                label = labeled_list[i].split()[2]
                guess = output[i]

                if i == 0:
                    # First word: the previous label is the marker 'B_-1'.
                    for feat in feats:
                        if feat[0] == 'B':
                            feat = feat + ":" + 'B_-1'
                        bump((feat, guess), -1.0, now)
                        bump((feat, label), 1.0, now)
                    continue

                label_pre = labeled_list[i - 1].split()[2]
                prev_wrong = output[i - 1] != label_pre
                cur_wrong = guess != label
                for feat in feats:
                    if feat[0] != 'B':
                        bump((feat, guess), -1.0, now)
                        bump((feat, label), 1.0, now)
                        continue
                    if not cur_wrong:
                        continue
                    feat_out = feat + ":" + output[i - 1]
                    feat_lab = feat + ":" + label_pre
                    if prev_wrong:
                        bump((feat_out, guess), -1.0, now)
                        bump((feat_lab, guess), -1.0, now)
                        bump((feat_out, label), 1.0, now)
                        bump((feat_lab, label), 1.0, now)
                    else:
                        bump((feat_lab, guess), -2.0, now)
                        bump((feat_lab, label), 2.0, now)

    # Extend every sum to the final time step, then average.
    for key, changed_at in tau_feat_vec.items():
        avg_feat_vec[key] += feat_vec[key] * (total_steps - 1 - changed_at)
        avg_feat_vec[key] = avg_feat_vec[key] / float(total_steps)
    return avg_feat_vec