                print(arcState)
                break

            ## Go through each node in the graph and find the predicted head
            for id in graph.nodes():
                if id == 0:
                    continue
                head = 0
                for edge in arcState.relations:
                    if edge[1] == id:
                        head = edge[0]
                        break

                ## Write the result to the output file
                word = graph.node[id]['word']
                lemma = graph.node[id]['lemma']
                cpos = graph.node[id]['cpos']
                pos = graph.node[id]['pos']
                feats = graph.node[id]['feats']
                writer.writerow([id, word, lemma, cpos, pos, feats,
                                 head, '_', '_', '_'])
        except Exception as ex:
            print(ex)
        writer.writerow([])

    ## Evaluate the resulting output file
    depeval.eval(file_test, file_out)
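
# For reference, the rows written above follow the 10-column CoNLL-X layout
# (an assumption based on the fields used; only HEAD is predicted here, so the
# remaining dependency columns are written as '_'):
#
#   ID  FORM  LEMMA  CPOSTAG  POSTAG  FEATS  HEAD  DEPREL  PHEAD  PDEPREL
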
def perceptron(training_configs, dev_configs, testfile, testout, eval_dev):
    transitions = [ArcState.ARC_LEFT, ArcState.ARC_RIGHT, ArcState.SHIFT]

    # Create beginning structure for weights
    theta = dict()
    m = dict()
    m_last_updated = dict()
    for t in transitions:
        theta[t] = defaultdict(float)
        m[t] = defaultdict(float)
        m_last_updated[t] = defaultdict(float)

    training_vec = make_feature_vec(training_configs)
    dev_vec = make_feature_vec(dev_configs)

    training_labels = []
    for config in training_configs:
        training_labels.append(config[2])
    dev_labels = []
    for config in dev_configs:
        dev_labels.append(config[2])

    # List of indices used for randomizing training instance order
    indices = list(range(len(training_configs)))

    gen = iterCoNLL("en.dev")
    # Holds {initial config, gold relation set} dicts for each dev sentence
    dev_golds = []
    for s in gen:
        state = ArcState(s['buffer'], [ArcNode(0, "*ROOT*")], [], s['graph'], [])
        p = state
        while not p.done():
            p = p.do_action(p.get_next_action())
        dev_golds.append({'config': state, 'relations': p.relations})

    # Variables holding the best-performing dev relations seen so far
    best_dev_results = 0.
    best_dev_iteration = 0
    best_dev_relations = []
    best_theta = dict()

    # Main perceptron loop
    counter = 0
    iteration = 0
    while True:
        # Evaluate dev attachment accuracy after each pass through the whole
        # training set
        if counter % len(training_configs) == 0 and counter > 0:
            m_temp = dict()
            theta_temp = dict()
            for t in transitions:
                m_temp[t] = defaultdict(float)
                theta_temp[t] = defaultdict(float)
                # Obtain weights from the running average before evaluating
                for feature in m[t]:
                    m_temp[t][feature] = m[t][feature] + theta[t][feature] * (
                        counter - m_last_updated[t][feature])
                    theta_temp[t][feature] = m_temp[t][feature] / counter

            # print("Training set config accuracy: " + str(eval(
            #     training_labels,
            #     predict_labels(transitions, theta_temp, training_vec))))
            # print("Dev set config accuracy: " + str(eval(
            #     dev_labels, predict_labels(transitions, theta_temp, dev_vec))))

            correct = 0
            total = 0
            current_dev_relations = []
            for gold in dev_golds:
                p = score_with_features(gold['config'], theta_temp, transitions)
                for relation in gold['relations']:
                    total += 1
                    if relation in p.relations:
                        correct += 1
                current_dev_relations.append(p.relations)
            current_dev_results = float(correct) / total * 100

            if current_dev_results > best_dev_results:
                best_dev_results = current_dev_results
                best_dev_relations = current_dev_relations
                best_dev_iteration = iteration
                best_theta = theta_temp
            elif iteration - best_dev_iteration >= 5:
                # print("Stopping; will use best dev result of " +
                #       str(best_dev_results) + " from iteration " +
                #       str(best_dev_iteration))
                # No dev improvement for 5 iterations: stop, and either write
                # the best-scoring dev relations to file and evaluate them...
                if eval_dev:
                    sentences = read_sentences("en.dev")
                    for i in range(len(sentences)):
                        for j in range(len(sentences[i])):
                            sentences[i][j][6] = ""
                        for relation in best_dev_relations[i]:
                            # Writes the head number for the words that have
                            # relations. Ugly indexing though...
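                            # (Each relation is a (head, dependent) pair; the
                            # dependent's ID is 1-based in CoNLL while the
                            # sentence list is 0-based, hence the "- 1" below.
                            # Column 6 is the HEAD field.)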
                            sentences[i][relation[1] - 1][6] = str(relation[0])
                    with open("en.dev.out", 'w', newline='') as csvfile:
                        writer = csv.writer(csvfile, delimiter='\t',
                                            quotechar='|',
                                            quoting=csv.QUOTE_MINIMAL)
                        for sentence in sentences:
                            for word in sentence:
                                writer.writerow(word)
                            writer.writerow([])
                    depeval.eval("en.dev", "en.dev.out")
                # ...or parse the test sentences with the best weights and
                # write the results to testout
                else:
                    gen = iterCoNLL(testfile)
                    test_relations = []
                    for s in gen:
                        state = ArcState(s['buffer'], [ArcNode(0, "*ROOT*")],
                                         [], s['graph'], [])
                        p = score_with_features(state, best_theta, transitions)
                        test_relations.append(p.relations)

                    sentences = read_sentences(testfile)
                    for i in range(len(sentences)):
                        # Clear the gold heads first so only predicted
                        # attachments remain (mirrors the dev branch above)
                        for j in range(len(sentences[i])):
                            sentences[i][j][6] = ""
                        for relation in test_relations[i]:
                            sentences[i][relation[1] - 1][6] = str(relation[0])
                    with open(testout, 'w', newline='') as csvfile:
                        writer = csv.writer(csvfile, delimiter='\t',
                                            quotechar='|',
                                            quoting=csv.QUOTE_MINIMAL)
                        for sentence in sentences:
                            for word in sentence:
                                writer.writerow(word)
                            writer.writerow([])
                return

            # print("Relation attachment score on dev set: " +
            #       str(current_dev_results))
            shuffle(indices)
            iteration += 1

        index = indices[counter % len(indices)]

        # Obtain the prediction as the argmax of the score for each transition
        scores = defaultdict(float)
        for t in transitions:
            for feature in training_vec[index]:
                scores[t] += training_vec[index][feature] * theta[t][feature]
        v = list(scores.values())
        k = list(scores.keys())
        yhat = k[v.index(max(v))]

        # If the prediction is wrong, update the weight vectors
        correct_label = training_configs[index][2]
        if yhat != correct_label:
            for feature in training_vec[index]:
                # Updates for scores of the predicted class: bring its running
                # sum up to date, then penalize the feature weight
                m[yhat][feature] += theta[yhat][feature] * (
                    counter - m_last_updated[yhat][feature])
                m_last_updated[yhat][feature] = counter
                theta[yhat][feature] -= training_vec[index][feature]
                m[yhat][feature] -= training_vec[index][feature]

                # Updates for scores of the actual class: same catch-up, then
                # reward the feature weight
                m[correct_label][feature] += theta[correct_label][feature] * (
                    counter - m_last_updated[correct_label][feature])
                m_last_updated[correct_label][feature] = counter
                theta[correct_label][feature] += training_vec[index][feature]
                m[correct_label][feature] += training_vec[index][feature]

        counter += 1
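

# A minimal, self-contained sketch (illustration only; nothing above calls it)
# of the lazy weight averaging that perceptron() uses. Instead of adding every
# feature's current weight into the running sum m at every step, each feature
# records the step at which it was last touched (m_last_updated) and catches
# up with theta * (steps elapsed) at its next update, plus once more at
# evaluation time. The two loops below compute the same average; perceptron()
# also folds the fresh increment into m at update time, a minor variant of the
# same identity. All numbers here are made up for illustration.
def _lazy_averaging_demo():
    updates = [(0, 1.0), (3, -2.0), (7, 0.5)]  # (step, increment) pairs
    total_steps = 10

    # Eager averaging: add the current weight into the sum at every step
    theta, m = 0.0, 0.0
    increments = dict(updates)
    for step in range(total_steps):
        theta += increments.get(step, 0.0)
        m += theta
    eager_avg = m / total_steps

    # Lazy averaging: only touch the sum when the weight actually changes
    theta, m, last = 0.0, 0.0, 0
    for step, increment in updates:
        m += theta * (step - last)  # catch up for steps theta was constant
        last = step
        theta += increment
    m += theta * (total_steps - last)  # final catch-up, as before evaluating
    lazy_avg = m / total_steps

    assert abs(eager_avg - lazy_avg) < 1e-12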