def evaluate(args):
    """Compare predicted dependency graphs against gold graphs.

    Reads the gold file (args.gold) and the prediction file (args.in_file),
    counts exact whole-graph matches and per-arc matches, and writes the
    scores to args.out_file.
    """
    sparse_graphs_gold = {}
    sparse_graphs_predicted = {}

    total = 0.0        # sentences compared
    correct = 0.0      # sentences whose whole graph matches gold exactly

    total_arcs = 0.0
    correct_arcs = 0.0

    # Load gold and predicted graphs, keyed by running sentence index.
    # with-blocks added so the input files are closed deterministically
    # (the original never closed them).
    with codecs.open(args.gold, encoding='utf-8') as gold_file:
        for sentence in sentences(gold_file):
            sparse_graph_gold = Graph(sentence, "sparse").heads
            sparse_graphs_gold[len(sparse_graphs_gold)] = sparse_graph_gold
    with codecs.open(args.in_file, encoding='utf-8') as in_file:
        for sentence in sentences(in_file):
            sparse_graph_predicted = Graph(sentence, "sparse").heads
            sparse_graphs_predicted[len(sparse_graphs_predicted)] = sparse_graph_predicted

    if len(sparse_graphs_gold) == len(sparse_graphs_predicted):
        for gold_graph in sorted(sparse_graphs_gold.keys()):
            total += 1
            if make_graph_compareable(sparse_graphs_gold[gold_graph]) == make_graph_compareable(
                    sparse_graphs_predicted[gold_graph]):
                correct += 1
    else:
        print "Error in file length, Gold: " + str(len(sparse_graphs_gold)) + ", Predicted: " + str(
            len(sparse_graphs_predicted))

    # Arc-level comparison: for each dependent, compare the predicted head
    # against the (first) gold head of the same dependent.
    for predicted_graph in sorted(sparse_graphs_predicted.keys()):
        rev_predicted = reverse_head_graph(sparse_graphs_predicted[predicted_graph])
        rev_gold = reverse_head_graph(sparse_graphs_gold[predicted_graph])
        for dependent in rev_predicted:
            for arc in rev_predicted[dependent]:
                if arc.head == rev_gold[dependent][0].head:
                    correct_arcs += 1
                total_arcs += 1

    # BUGFIX: the original divided unconditionally (ZeroDivisionError on
    # empty input) and computed round(x, 2) * 100, which rounds *before*
    # scaling and so always reports a whole-number percentage.
    sentence_pct = round(correct / total * 100, 2) if total else 0.0
    arc_pct = round(correct_arcs / total_arcs * 100, 2) if total_arcs else 0.0

    with open(args.out_file, "w") as out:
        print >> out, "Total: " + str(total)
        print >> out, "Correct: " + str(correct)
        print >> out, "%: " + str(sentence_pct)
        print >> out, ""
        print >> out, "Total Arcs: " + str(total_arcs)
        print >> out, "Correct: " + str(correct_arcs)
        print >> out, "%: " + str(arc_pct)
def find_affixes(file_in, len_list):
    top_x = [2, 3, 4, 5] # all the affix lenghts that should be computed

    # creates lists for suffixes and prefixes, containing as many dictionaries as top_x elements:
    suffixes = {}
    prefixes = {}
    letter_combs = {}

    pos_tags = {}

    for i in top_x:
        suffixes[i] = {}
        prefixes[i] = {}
        letter_combs[i] = {}

    print "\tReading prefixes and suffixes"
    t0 = time.time()

    # after the following loop, every dictionary in both lists contains all affixes
    # that fit in that list as a key, and the respective frequency as it's value:
    for sentence in tk.sentences(codecs.open(file_in, encoding='utf-8')):
        for token in sentence:

            if token.gold_pos in pos_tags:
                pos_tags[token.gold_pos] += 1
            else:
                pos_tags[token.gold_pos] = 1

            for i in top_x: # for every desired affix length
                if len(token.form) > i: # word must be longer than suffix length

                    # token.form[-i:] is the suffix with length i

                    # suffixes[i-2] is the dictionary for suffixes with length i
                    # in the list 'suffixes'

                    if token.form[-i:] in suffixes[i]:
                        if token.gold_pos in suffixes[i][token.form[-i:]]:
                            suffixes[i][token.form[-i:]][token.gold_pos] += 1
                        else:
                            suffixes[i][token.form[-i:]][token.gold_pos] = 1
                    else:
                        suffixes[i][token.form[-i:]] = {token.gold_pos: 1}
                if len(token.form) > i: # word must be longer than prefix length

                    # the same as for suffixes

                    if token.form[:i] in prefixes[i]:
                        if token.gold_pos in prefixes[i][token.form[:i]]:
                            prefixes[i][token.form[:i]][token.gold_pos] += 1
                        else:
                            prefixes[i][token.form[:i]][token.gold_pos] = 1
                    else:
                        prefixes[i][token.form[:i]] = {token.gold_pos: 1}

                if len(token.form) > i+1 and i > 2:

                    # letter combinations in the word
                    # if they don't overlap with pre- or suffixes
                    for j in range(i, len(token.form)-(i*2-1)):
                        if token.form[j:j+i] in letter_combs[i]:
                            if token.gold_pos in letter_combs[i][token.form[j:j+i]]:
                                letter_combs[i][token.form[j:j+i]][token.gold_pos] += 1
                            else:
                                letter_combs[i][token.form[j:j+i]][token.gold_pos] = 1
                        else:
                            letter_combs[i][token.form[j:j+i]] = {token.gold_pos: 1}

    t1 = time.time()
    print "\t\t"+str(t1-t0)+" sec."
    return [suffixes, prefixes, letter_combs]
def evaluate(file_in, out_file):

    t0 = time.time()

    print "\tEvaluate predictions"

    pos_dict = {}
    counter = 0

    prediction_count = 0

    # unique_tags will contain every existing POS tag as key, whether it exists
    # only in gold, predicted, or both. The value is the dict {'TP':0,'FN':0,'FP':0}
    unique_tags = {}

    unique_tags_scores = {}
    correct_predictions = 0
    false_predictions = 0

    TP = 0.0
    FN = 0.0
    FP = 0.0

    for sentence in tk.sentences(codecs.open(file_in, encoding='utf-8')):
        for tid, token in enumerate(sentence):

            prediction_count += 1

            # add POS tags to dictionary:
            if token.gold_pos not in unique_tags:
                unique_tags[token.gold_pos] = {'TP': 0, 'FN': 0, 'FP': 0}
            if token.predicted_pos not in unique_tags:
                unique_tags[token.predicted_pos] = {'TP': 0, 'FN': 0, 'FP': 0}

            # if the prediction was correct, TP of the gold POS is increased by 1
            # otherwise, the FN of the gold POS and FP of the predicted pos are increased by 1
            if token.gold_pos == token.predicted_pos:
                correct_predictions += 1
                unique_tags[token.gold_pos]['TP'] += 1
            else:
                false_predictions += 1
                unique_tags[token.gold_pos]['FN'] += 1
                unique_tags[token.predicted_pos]['FP'] += 1

            # computes precision, recall, accuracy and f-score for each tag based on TP, FN, FP:
    for pos in unique_tags:

        TP += unique_tags[pos]['TP']
        FN += unique_tags[pos]['FN']
        FP += unique_tags[pos]['FP']

        unique_tags_scores[pos] = {'Precision': 0.00, 'Recall': 0.00, 'Accuracy': 0.00, 'F-Score': 0.00}

        if unique_tags[pos]['TP'] + unique_tags[pos]['FP'] == 0:
            unique_tags_scores[pos]['precision'] = 0.00
        else:
            unique_tags_scores[pos]['precision'] = (float(unique_tags[pos]['TP'])) / (float(unique_tags[pos]['TP']) + \
                                                                                      float(unique_tags[pos][
                                                                                          'FP'])) * 100.00

        if unique_tags[pos]['TP'] + unique_tags[pos]['FN'] == 0:
            unique_tags_scores[pos]['recall'] = 0.00
        else:
            unique_tags_scores[pos]['recall'] = (float(unique_tags[pos]['TP'])) / (float(unique_tags[pos]['TP']) + \
                                                                                   float(
                                                                                       unique_tags[pos]['FN'])) * 100.00

        if unique_tags[pos]['TP'] + unique_tags[pos]['FP'] + unique_tags[pos]['FN'] == 0:
            unique_tags_scores[pos]['accuracy'] = 0.00
        else:
            unique_tags_scores[pos]['accuracy'] = float(unique_tags[pos]['TP']) / (float(unique_tags[pos]['TP']) + \
                                                                                   float(unique_tags[pos]['FN']) + \
                                                                                   float(
                                                                                       unique_tags[pos]['FP'])) * 100.00

        if unique_tags_scores[pos]['precision'] + unique_tags_scores[pos]['recall'] == 0.00:
            unique_tags_scores[pos]['f-score'] = 0.00
        else:
            unique_tags_scores[pos]['f-score'] = (2 * float(unique_tags_scores[pos]['precision']) * \
                                                  float(unique_tags_scores[pos]['recall'])) / \
                                                 (float(unique_tags_scores[pos]['precision']) + \
                                                  float(unique_tags_scores[pos]['recall']))

    # computes overall values, then writes results to file:

    precision_sum = 0.0
    recall_sum = 0.0
    f_score_sum = 0.0

    false_tags = prediction_count - correct_predictions

    for pos in unique_tags_scores:
        precision_sum += unique_tags_scores[pos]['precision']
        recall_sum += unique_tags_scores[pos]['recall']
        f_score_sum += unique_tags_scores[pos]['f-score']

    macro_averaged_precision = precision_sum / float(len(unique_tags_scores))
    macro_averaged_recall = recall_sum / float(len(unique_tags_scores))
    macro_averaged_f_score = f_score_sum / float(len(unique_tags_scores))

    if TP+FP != 0:
        micro_averaged_precision = TP/(TP+FP)*100
    else:
        micro_averaged_precision = 0.0
    if TP+FN != 0:
        micro_averaged_recall = TP/(TP+FN)*100
    else:
        micro_averaged_recall = 0.0
    if micro_averaged_precision+micro_averaged_recall != 0:
        micro_averaged_f_score = (2*micro_averaged_precision*micro_averaged_recall)/(micro_averaged_precision+micro_averaged_recall)
    else:
        micro_averaged_f_score = 0.0

    accuracy = (float(correct_predictions) / float(prediction_count)) * 100
    error_rate = (float(false_predictions) / float(prediction_count)) * 100

    t1 = time.time()
    print "\t\t" + str(t1 - t0) + " sec."

    print "\tWrite evaluation results to file"
    z0 = time.time()

    print >> out_file, "Total Predictions:\t" + str(prediction_count)
    print >> out_file, "Correct Predictions:\t" + str(correct_predictions)
    print >> out_file, "False Predictions:\t" + str(false_tags)
    print >> out_file, ""
    print >> out_file, "Accuracy:\t" + str(round(accuracy, 2))
    print >> out_file, "Error rate:\t" + str(round(error_rate, 2))
    print >> out_file, ""
    print >> out_file, "Overall Precision (mac-av):\t" + str(round(macro_averaged_precision, 2))
    print >> out_file, "Overall Recall (mac-av):\t" + str(round(macro_averaged_recall, 2))
    print >> out_file, "Overall F-Score (mac-av):\t" + str(round(macro_averaged_f_score, 2))
    print >> out_file, ""
    print >> out_file, "Overall Precision (mic-av):\t" + str(round(micro_averaged_precision, 2))
    print >> out_file, "Overall Recall (mic-av):\t" + str(round(micro_averaged_recall, 2))
    print >> out_file, "Overall F-Score (mic-av):\t" + str(round(micro_averaged_f_score, 2))
    print ""

    print >> out_file, "Tagwise Accuracy, Precision, Recall and F-Score:\n"
    for pos in unique_tags_scores.keys():
        print >> out_file, pos + "\tAccuracy: " + str(round(unique_tags_scores[pos]['accuracy'], 2)) + "\tPrecision: " + \
                           str(round(unique_tags_scores[pos]['precision'], 2)) + "\tRecall: " + \
                           str(round(unique_tags_scores[pos]['recall'], 2)) + "\tF-Score: " + \
                           str(round(unique_tags_scores[pos]['f-score'], 2))

    print "\t\tTotal Predictions:\t" + str(prediction_count)
    print "\t\tCorrect Predictions:\t" + str(correct_predictions)
    print "\t\tFalse Predictions:\t" + str(false_tags)
    print ""
    print "\t\tAccuracy:\t" + str(round(accuracy, 2))
    print "\t\tError rate:\t" + str(round(error_rate, 2))
    print ""
    print "\t\tOverall Precision (mac-av):\t" + str(round(macro_averaged_precision, 2))
    print "\t\tOverall Recall (mac-av):\t" + str(round(macro_averaged_recall, 2))
    print "\t\tOverall F-Score (mac-av):\t" + str(round(macro_averaged_f_score, 2))
    print ""
    print "\t\tOverall Precision (mic-av):\t" + str(round(micro_averaged_precision, 2))
    print "\t\tOverall Recall (mic-av):\t" + str(round(micro_averaged_recall, 2))
    print "\t\tOverall F-Score (mic-av):\t" + str(round(micro_averaged_f_score, 2))
    print ""
    print "\t\tFor details see the output file."

    z1 = time.time()
    print "\t\t" + str(z1 - z0) + " sec."
def evaluate(file_in, out_file):

    t0 = time.time()

    print "\tEvaluate predictions"

    pos_dict = {}
    counter = 0

    prediction_count = 0

    # unique_tags will contain every existing POS tag as key, whether it exists
    # only in gold, predicted, or both. The value is the dict {'TP':0,'FN':0,'FP':0}
    unique_tags = {}

    unique_tags_scores = {}
    correct_predictions = 0
    false_predictions = 0

    TP = 0.0
    FN = 0.0
    FP = 0.0

    for sentence in tk.sentences(codecs.open(file_in, encoding='utf-8')):
        gold_targets = []
        predicted_targets = []
        
        gold_target = []
        predicted_target = []
        
        i_found_predicted = False
        i_found_gold = False
        
        for tid, token in enumerate(sentence):         
            
            if token.predicted_tag_2 == "I":
                i_found_predicted = True
                predicted_target.append(token.t_id_2)
            else:
                if len(predicted_target) > 0:
                    i_found_predicted = False
                    predicted_targets.append(predicted_target)
                    predicted_target = []
            
            if i_found_predicted:
                predicted_target.append(token.form_2)

            if token.gold_tag_2 == "I":
                i_found_gold = True
                gold_target.append(token.t_id_2)
            else:
                if len(gold_target) > 0:
                    i_found_gold = False
                    gold_targets.append(gold_target)
                    gold_target = []
            
            if i_found_gold:
                gold_target.append(token.form_2)

        for prediction in predicted_targets:
            if prediction in gold_targets:
                TP += 1.0
                del gold_targets[gold_targets.index(prediction)]
            else:
                FP += 1.0
        FN += len(gold_targets)
        """
        for tid, token in enumerate(sentence):

            prediction_count += 1

            # add POS tags to dictionary:
            if token.gold_tag_2 not in unique_tags:
                unique_tags[token.gold_tag_2] = {'TP': 0, 'FN': 0, 'FP': 0}
            if token.predicted_tag_2 not in unique_tags:
                unique_tags[token.predicted_tag_2] = {'TP': 0, 'FN': 0, 'FP': 0}

            # if the prediction was correct, TP of the gold POS is increased by 1
            # otherwise, the FN of the gold POS and FP of the predicted pos are increased by 1
            if token.gold_tag_2 == token.predicted_tag_2:
                correct_predictions += 1
                unique_tags[token.gold_tag_2]['TP'] += 1
            else:
                false_predictions += 1
                unique_tags[token.gold_tag_2]['FN'] += 1
                unique_tags[token.predicted_tag_2]['FP'] += 1

            # computes precision, recall, accuracy and f-score for each tag based on TP, FN, FP:
    for pos in unique_tags:

        TP += unique_tags[pos]['TP']
        FN += unique_tags[pos]['FN']
        FP += unique_tags[pos]['FP']

        unique_tags_scores[pos] = {'Precision': 0.00, 'Recall': 0.00, 'Accuracy': 0.00, 'F-Score': 0.00}

        if unique_tags[pos]['TP'] + unique_tags[pos]['FP'] == 0:
            unique_tags_scores[pos]['precision'] = 0.00
        else:
            unique_tags_scores[pos]['precision'] = (float(unique_tags[pos]['TP'])) / (float(unique_tags[pos]['TP']) + \
                                                                                      float(unique_tags[pos][
                                                                                          'FP'])) * 100.00

        if unique_tags[pos]['TP'] + unique_tags[pos]['FN'] == 0:
            unique_tags_scores[pos]['recall'] = 0.00
        else:
            unique_tags_scores[pos]['recall'] = (float(unique_tags[pos]['TP'])) / (float(unique_tags[pos]['TP']) + \
                                                                                   float(
                                                                                       unique_tags[pos]['FN'])) * 100.00

        if unique_tags[pos]['TP'] + unique_tags[pos]['FP'] + unique_tags[pos]['FN'] == 0:
            unique_tags_scores[pos]['accuracy'] = 0.00
        else:
            unique_tags_scores[pos]['accuracy'] = float(unique_tags[pos]['TP']) / (float(unique_tags[pos]['TP']) + \
                                                                                   float(unique_tags[pos]['FN']) + \
                                                                                   float(
                                                                                       unique_tags[pos]['FP'])) * 100.00

        if unique_tags_scores[pos]['precision'] + unique_tags_scores[pos]['recall'] == 0.00:
            unique_tags_scores[pos]['f-score'] = 0.00
        else:
            unique_tags_scores[pos]['f-score'] = (2 * float(unique_tags_scores[pos]['precision']) * \
                                                  float(unique_tags_scores[pos]['recall'])) / \
                                                 (float(unique_tags_scores[pos]['precision']) + \
                                                  float(unique_tags_scores[pos]['recall']))
    """
    # computes overall values, then writes results to file:

    if TP+FP != 0:
        micro_averaged_precision = TP/(TP+FP)*100
    else:
        micro_averaged_precision = 0.0
    if TP+FN != 0:
        micro_averaged_recall = TP/(TP+FN)*100
    else:
        micro_averaged_recall = 0.0
    if micro_averaged_precision+micro_averaged_recall != 0:
        micro_averaged_f_score = (2*micro_averaged_precision*micro_averaged_recall)/(micro_averaged_precision+micro_averaged_recall)
    else:
        micro_averaged_f_score = 0.0


    t1 = time.time()
    print "\t\t" + str(t1 - t0) + " sec."

    print "\tWrite evaluation results to file"
    z0 = time.time()

    print >> out_file, "Overall Precision (mic-av):\t" + str(round(micro_averaged_precision, 2))
    print >> out_file, "Overall Recall (mic-av):\t" + str(round(micro_averaged_recall, 2))
    print >> out_file, "Overall F-Score (mic-av):\t" + str(round(micro_averaged_f_score, 2))
    print ""

    print "\t\tOverall Precision (mic-av):\t" + str(round(micro_averaged_precision, 2))
    print "\t\tOverall Recall (mic-av):\t" + str(round(micro_averaged_recall, 2))
    print "\t\tOverall F-Score (mic-av):\t" + str(round(micro_averaged_f_score, 2))
    print ""
    print "\t\tFor details see the output file."

    z1 = time.time()
    print "\t\t" + str(z1 - z0) + " sec."