Example #1
                        print(arcState)
                        break

                ## Go through each node in the graph and find the predicted head
                for id in graph.nodes():
                    if id == 0:
                        continue
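                    ## Default to the root (0) unless a predicted relation assigns a head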
                    head = 0
                    for edge in arcState.relations:
                        if edge[1] == id:
                            head = edge[0]
                            break

                    ## Write the result to the output file
                    word = graph.node[id]['word']
                    lemma = graph.node[id]['lemma']
                    cpos = graph.node[id]['cpos']
                    pos = graph.node[id]['pos']
                    feats = graph.node[id]['feats']
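                    ## One CoNLL-style row per token: ID, FORM, LEMMA, CPOSTAG, POSTAG,
                    ## FEATS, predicted HEAD, and '_' placeholders for the remaining fields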
                    writer.writerow([
                        id, word, lemma, cpos, pos, feats, head, '_', '_', '_'
                    ])

            except Exception as ex:
                ## Report the error for this sentence and continue with the next one
                print(ex)
            writer.writerow([])

    ## Evaluate the resulting output file
    depeval.eval(file_test, file_out)
Example #2
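# Standard-library imports used by this example; ArcState, ArcNode, iterCoNLL,
# depeval, read_sentences, make_feature_vec and score_with_features are assumed
# to be provided by the surrounding project module.
import csv
from collections import defaultdict
from random import shuffle
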
def perceptron(training_configs, dev_configs, testfile, testout, eval_dev):
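    # Averaged perceptron for a transition-based dependency parser: learn to
    # choose the next parser action (ARC_LEFT, ARC_RIGHT or SHIFT) for each
    # configuration, keeping a running average of the weights for evaluation.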

    transitions = [ArcState.ARC_LEFT, ArcState.ARC_RIGHT, ArcState.SHIFT]

    # Create beginning structure for weights
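    # theta holds the current weights per transition; m accumulates
    # weight * elapsed-updates for the running average, and m_last_updated
    # records when each feature was last touched so the average can be
    # computed lazily.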
    theta = dict()
    m = dict()
    m_last_updated = dict()
    for t in transitions:
        theta[t] = defaultdict(float)
        m[t] = defaultdict(float)
        m_last_updated[t] = defaultdict(float)

    training_vec = make_feature_vec(training_configs)
    dev_vec = make_feature_vec(dev_configs)
    # The gold transition label is the third element of each configuration
    training_labels = [config[2] for config in training_configs]
    dev_labels = [config[2] for config in dev_configs]

    # Indices used for randomizing the training instance order on each pass
    indices = list(range(len(training_configs)))

    gen = iterCoNLL("en.dev")
    # Holds the initial config and gold relation set for each dev sentence
    dev_golds = []
    for s in gen:
        state = ArcState(s['buffer'], [ArcNode(0, "*ROOT*")], [], s['graph'],
                         [])
        p = state
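        # Replay the oracle's action sequence to completion to collect the
        # gold dependency relations for this sentence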
        while not p.done():
            p = p.do_action(p.get_next_action())
        dev_golds.append({'config': state, 'relations': p.relations})

    # Initialize variables for holding best-performing dev relations
    best_dev_results = 0.
    best_dev_iteration = 0
    best_dev_relations = []
    best_theta = dict()

    # Main perceptron loop
    counter = 0
    iteration = 0
    while True:

        # After each full pass through the training set, build the averaged
        # weights and evaluate the relation attachment score on the dev set
        if (counter % len(training_configs) == 0 and counter > 0):
            m_temp = dict()
            theta_temp = dict()
            for t in transitions:
                m_temp[t] = defaultdict(float)
                theta_temp[t] = defaultdict(float)
                # Obtain weights from running average before evaluating
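                # (fold theta * steps-since-last-update into m for each
                # feature, then divide by the total update count)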
                for feature in m[t]:
                    m_temp[t][feature] = m[t][feature] + theta[t][feature] * (
                        counter - m_last_updated[t][feature])
                    theta_temp[t][feature] = m_temp[t][feature] / counter
            #print "Training set config accuracy: " + str(eval(training_labels, predict_labels(transitions, theta_temp, training_vec)))
            #print "Dev set config accuracy: " + str(eval(dev_labels, predict_labels(transitions, theta_temp, dev_vec)))

            correct = 0
            total = 0
            current_dev_relations = []
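            # Parse each dev sentence with the averaged weights and count how
            # many gold relations are recovered (relation attachment score)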
            for gold in dev_golds:
                p = score_with_features(gold['config'], theta_temp,
                                        transitions)
                for relation in gold['relations']:
                    total += 1
                    if relation in p.relations:
                        correct += 1
                current_dev_relations.append(p.relations)
            current_dev_results = float(correct) / total * 100
            if (current_dev_results > best_dev_results):
                best_dev_results = current_dev_results
                best_dev_relations = current_dev_relations
                best_dev_iteration = iteration
                best_theta = theta_temp
            elif (iteration - best_dev_iteration >= 5):
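                # Early stopping: no dev improvement for 5 consecutive
                # iterations, so write out results and return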
                #print "Stopping; will use best dev result of " + str(best_dev_results) + " from iteration " + str(best_dev_iteration)
                # If evaluating on dev, write the dev predictions to en.dev.out and score them
                if eval_dev:
                    sentences = read_sentences("en.dev")
                    for i in range(0, len(sentences)):
                        for j in range(0, len(sentences[i])):
                            sentences[i][j][6] = ""
                        for relation in best_dev_relations[i]:
                            # Writes the head number for the words that have relations
                            # Ugly indexing though...
                            sentences[i][relation[1] - 1][6] = str(relation[0])
                    with open("en.dev.out", 'wb') as csvfile:
                        writer = csv.writer(csvfile,
                                            delimiter='\t',
                                            quotechar='|',
                                            quoting=csv.QUOTE_MINIMAL)
                        for sentence in sentences:
                            for word in sentence:
                                writer.writerow(word)
                            writer.writerow([])
                    depeval.eval("en.dev", "en.dev.out")
                # Otherwise parse the test sentences with the best weights and write predictions to testout
                else:
                    gen = iterCoNLL(testfile)
                    test_relations = []
                    for s in gen:
                        state = ArcState(s['buffer'], [ArcNode(0, "*ROOT*")],
                                         [], s['graph'], [])
                        p = score_with_features(state, best_theta, transitions)
                        test_relations.append(p.relations)
                    sentences = read_sentences(testfile)
                    for i in range(0, len(sentences)):
                        for relation in test_relations[i]:
                            sentences[i][relation[1] - 1][6] = str(relation[0])
                    with open(testout, 'wb') as csvfile:
                        writer = csv.writer(csvfile,
                                            delimiter='\t',
                                            quotechar='|',
                                            quoting=csv.QUOTE_MINIMAL)
                        for sentence in sentences:
                            for word in sentence:
                                writer.writerow(word)
                            writer.writerow([])
                return
            #print "Relation attachment score on dev set: " + str(current_dev_results)

            shuffle(indices)
            iteration += 1

        index = indices[counter % len(indices)]

        # Predict the transition whose weights give the highest score (argmax over classes)
        scores = defaultdict(float)
        for t in transitions:
            for feature in training_vec[index]:
                scores[t] += training_vec[index][feature] * theta[t][feature]
        v = list(scores.values())
        k = list(scores.keys())
        yhat = k[v.index(max(v))]

        # If prediction is wrong, update weight vector
        correct_label = training_configs[index][2]
        if (yhat != correct_label):
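            # Averaged-perceptron update: bring each feature's running
            # accumulator m up to date, then move theta away from the wrongly
            # predicted transition and toward the gold transition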
            for feature in training_vec[index]:
                # Updates for scores of predicted class
                m[yhat][feature] += theta[yhat][feature] * (
                    counter - m_last_updated[yhat][feature])
                m_last_updated[yhat][feature] = counter
                theta[yhat][feature] -= training_vec[index][feature]
                m[yhat][feature] -= training_vec[index][feature]

                # Updates for scores of actual class
                m[correct_label][feature] += theta[correct_label][feature] * (
                    counter - m_last_updated[correct_label][feature])
                m_last_updated[correct_label][feature] = counter
                theta[correct_label][feature] += training_vec[index][feature]
                m[correct_label][feature] += training_vec[index][feature]

        counter += 1