def main(ptb_file, results_dir, annotations_dir):
    """Filter a treebank file down to the trees accepted by ``tree_is_good``.

    Loads the head annotations found under ``annotations_dir``, parses the
    trees in ``ptb_file``, keeps those for which ``tree_is_good`` returns a
    true value, prints summary counts, and writes two files in the current
    directory:

    * ``good_trees.mrg`` -- every accepted tree, in input order;
    * ``marked_example_trees.mrg`` -- up to 100 accepted trees, picked by
      shuffling, after ``mark_heads`` has been applied to them.

    Args:
        ptb_file: Path of the tree file.  A line that does not start with a
            space begins a new tree.
        results_dir: Directory walked recursively; every file whose name
            contains ``correct_`` contributes one pattern per line.
        annotations_dir: Directory walked recursively; every file is read as
            tab-separated ``index<TAB>pattern<TAB>head`` lines.

    The ``print`` calls are written so they produce the same output on
    Python 2 and Python 3.
    """
    # Patterns judged correct in earlier runs.  Only needed by the alternative
    # filter mentioned below, but still loaded so that switching is one line.
    correct_patterns = set()
    for dirpath, _dirs, filenames in os.walk(results_dir):
        for filename in filenames:
            if 'correct_' not in filename:
                continue
            with open(os.path.join(dirpath, filename)) as result_lines:
                for line in result_lines:
                    # Drop the first and last character of the stripped line.
                    correct_patterns.add(line.strip()[1:-1])

    # Maps a pattern (first/last character removed) to its 0-based head index.
    pattern_heads = dict()
    for dirpath, _dirs, filenames in os.walk(annotations_dir):
        for filename in filenames:
            with open(os.path.join(dirpath, filename)) as annotation_lines:
                for line in annotation_lines:
                    if line.startswith('i'):
                        continue  # skip the header
                    index, pattern, head = line.strip().split('\t')
                    pattern_heads[pattern[1:-1]] = int(head) - 1  # 0-index the head

    trees = []

    def parse_chunk(chunk):
        """Parse one tree's text into `trees`; return 1 if it was skipped."""
        try:
            parsed = Tree.read(chunk)
        except AttributeError:
            # Tree.read raising AttributeError is treated as "unparseable".
            return 1
        trees.append(parsed)
        return 0

    text = ''
    skipped = 0
    with open(ptb_file) as ptb_lines:
        for line in ptb_lines:
            if text and line[0] != ' ':
                skipped += parse_chunk(text)
                text = ''
            text += line
    if text:
        # The final tree gets the same error handling as all the others.
        skipped += parse_chunk(text)

    # To filter on previously verified patterns instead, pass correct_patterns
    # as the second argument here.
    good_trees = [tree for tree in trees
                  if tree_is_good(tree.root.children[0], pattern_heads)]

    print('Number of trees: %d' % len(trees))
    print('Number of good trees: %d' % len(good_trees))
    print('Skipped: %d' % skipped)

    # error_category_counts / error_counts are module-level mappings that are
    # not assigned in this function -- presumably filled in by tree_is_good;
    # verify against its definition.
    for cat in sorted(error_category_counts, key=error_category_counts.get):
        print('%s: %d' % (cat, error_category_counts[cat]))
    print('')
    most_frequent = sorted(error_counts, key=error_counts.get, reverse=True)
    for error in most_frequent[:30]:
        print('%s: %d' % (error, error_counts[error]))

    with open('good_trees.mrg', 'w') as out:
        for tree in good_trees:
            out.write(tree.pretty())
            out.write('\n\n')

    # Shuffle in place (after good_trees.mrg is written) to sample examples.
    shuffle(good_trees)
    num_examples = 100
    with open('marked_example_trees.mrg', 'w') as out:
        for tree in good_trees[:num_examples]:
            mark_heads(tree.root.children[0], pattern_heads)
            out.write(tree.pretty())
            out.write('\n\n')
def main():
    """Store a ``head_index`` on every annotation whose head is marked correct.

    For each ``Annotation`` with ``head_correct=True``:

    1. find the index of the line containing ``ROOT`` in the expansion's
       ``supa_example`` text;
    2. parse the expansion's ``penn_example`` with ``Tree.read``;
    3. find the first child of the tree root whose ``terminal_indices()``
       contains that line index, and save its 1-based position as
       ``annotation.head_index``.

    Annotations without a ``ROOT`` line are skipped silently; annotations for
    which no child matches only produce a message.  ``transaction.commit()``
    is called once at the end.

    The ``print`` calls are written so they produce the same output on
    Python 2 and Python 3.
    """
    annotations = Annotation.objects.select_related().filter(head_correct=True)
    for annotation in annotations:
        expansion = annotation.expansion

        root = None
        for i, line in enumerate(expansion.supa_example.split('\n')):
            if 'ROOT' not in line:
                continue
            if root is not None:
                # NOTE(review): this only skips the extra ROOT line -- the
                # first ROOT found is kept and the annotation is still
                # processed.  Confirm that is the intended meaning of
                # "skipping".
                print('Two roots found; skipping')
                continue
            root = i
        if root is None:
            # Sometimes the SUPA is empty; testing on 3/20/2013 showed that was
            # the only time this happened
            continue

        tree = Tree.read(expansion.penn_example)
        head_index = None
        for i, child in enumerate(tree.root.children):
            # The ROOT line number is compared directly with the child's
            # terminal indices.
            if root in child.terminal_indices():
                head_index = i + 1  # stored 1-based
                break

        if head_index is None:
            print('Head index not found...')
        else:
            annotation.head_index = head_index
            annotation.save()
    transaction.commit()
def simplify_trees(suite_file, outfile):
    """Convert every tree in ``suite_file`` and write the results to ``outfile``.

    The input is read line by line; each blank line (exactly ``'\\n'``) ends
    the current chunk of text, which is parsed with ``Tree.read`` and passed
    through ``convert_tree``.  The blank line itself becomes the first line of
    the next chunk.  Text left over after the last blank line is parsed too,
    provided it is not just whitespace, so a file whose final tree is not
    followed by a blank line does not lose that tree.

    Each converted tree is written as ``tree.pretty()`` followed by a blank
    line.  Both files are closed before the function returns.

    Args:
        suite_file: Path of the file with blank-line-separated trees.
        outfile: Path of the file to (over)write with the converted trees.
    """
    new_trees = []
    tree = ''
    with open(suite_file) as suite:
        for line in suite:
            if line == '\n':
                new_trees.append(convert_tree(Tree.read(tree)))
                tree = ''
            tree += line
    if tree.strip():
        # Final tree without a terminating blank line.
        new_trees.append(convert_tree(Tree.read(tree)))

    with open(outfile, 'w') as out:
        for new_tree in new_trees:
            out.write(new_tree.pretty())
            out.write('\n\n')
def main(category, suites_dir):
    """Extract the trees from the ``sierra_postop`` file into a ``.mrg`` file.

    Searches the current directory tree for a file whose name contains
    ``sierra_postop`` (the last match found wins), reads its trees, takes the
    second child of each parsed tree's root, runs ``clear_extra_labels`` on
    it, and writes the results to
    ``../<suites_dir>/<category>/<category>_PTBtrees_intermediate.mrg``.

    Input format, as read here: a line ``null`` stands for a missing tree, a
    blank line ends the text of the current tree.  Text after the last blank
    line is not parsed.  Missing trees are written as a single empty line so
    that the output stays aligned with the input.

    Args:
        category: Category name, used for the output directory and file name.
        suites_dir: Name of the suites directory, relative to ``..``.

    Raises:
        SystemExit: with code -1 if no ``sierra_postop`` file is found.

    The ``print`` calls are written so they produce the same output on
    Python 2 and Python 3.
    """
    tree_file = None
    for dirpath, _dirs, filenames in os.walk('.'):
        for filename in filenames:
            if 'sierra_postop' in filename:
                # Keep the directory part: os.walk yields bare file names, and
                # the match may live in a subdirectory.
                tree_file = os.path.join(dirpath, filename)
    if not tree_file:
        print('Could not find sierra_postop file!  Exiting...')
        raise SystemExit(-1)

    trees = []
    text = ''
    with open(tree_file) as tree_lines:
        for line in tree_lines:
            if line == 'null\n':
                print('Null found!')
                trees.append(None)
                continue
            if line == '\n':
                trees.append(Tree.read(text))
                text = ''
                continue
            text += line

    new_trees = []
    for tree in trees:
        # Clear off some of the extra processing that the SUPA pipeline adds
        if tree is None:
            new_trees.append('')  # placeholder, written as an empty line
            continue
        subtree = tree.root.children[1]
        clear_extra_labels(subtree)
        new_trees.append(subtree)

    outfile = '../' + suites_dir + '/' + category + '/' + category
    outfile += '_PTBtrees_intermediate.mrg'
    with open(outfile, 'w') as out:
        for tree in new_trees:
            if tree:
                out.write(tree.pretty())
                out.write('\n\n')
            else:
                out.write('\n')
def main(annotation_file, category):
    """Score the heads found in the ``sierra_postop`` trees against annotations.

    Searches the current directory tree for a file whose name contains
    ``sierra_postop`` (the last match found wins) and reads its trees, reads
    the annotated head index for each pattern index from ``annotation_file``,
    and reads per-pattern counts from
    ``../test_suites_v2/<category>/<category>_tagAsParent_rules_grouped.txt``.
    Tree ``i`` (0-based) is paired with count line ``i`` and annotation index
    ``i + 1``.

    Output:

    * one tab-separated summary row appended to ``results/results.tsv``:
      category, number of patterns, number annotated, fraction annotated,
      number correct, fraction correct, and the same five figures weighted
      by the pattern counts;
    * ``results/errors_<category>.tsv`` with the pattern, the 1-based head
      position found in the tree (-1 if none) and the annotated 1-based head;
    * ``results/correct_<category>.tsv`` with one correct pattern per line.

    Fractions are computed with true division and are 0.0 when the
    denominator is zero.

    Args:
        annotation_file: Path of a tab-separated file with
            ``index<TAB>pattern<TAB>head_index`` lines; lines starting with
            ``index`` are treated as the header.
        category: Category name used to locate the counts file and to name
            the output files.

    Raises:
        SystemExit: with code -1 if no ``sierra_postop`` file is found or if
            the number of trees differs from the number of count lines.

    The ``print`` calls are written so they produce the same output on
    Python 2 and Python 3.
    """
    def ratio(numerator, denominator):
        """Return numerator / denominator as a float; 0.0 if denominator is 0."""
        if not denominator:
            return 0.0
        return float(numerator) / denominator

    outfile = 'results/results.tsv'
    tree_file = None
    for dirpath, _dirs, filenames in os.walk('.'):
        for filename in filenames:
            if 'sierra_postop' in filename:
                # Keep the directory part: os.walk yields bare file names, and
                # the match may live in a subdirectory.
                tree_file = os.path.join(dirpath, filename)
    if not tree_file:
        print('Could not find sierra_postop file!  Exiting...')
        raise SystemExit(-1)

    # Maps the 1-based pattern index to the 0-based annotated head position.
    annotations = {}
    with open(annotation_file) as annotation_lines:
        for line in annotation_lines:
            if line.startswith('index'):
                continue
            index, _pattern, head_index = line.strip().split('\t')
            annotations[int(index)] = int(head_index) - 1

    # A 'null' line stands for a missing tree; a blank line ends a tree.
    trees = []
    text = ''
    with open(tree_file) as tree_lines:
        for line in tree_lines:
            if line == 'null\n':
                trees.append(None)
                continue
            if line == '\n':
                trees.append(Tree.read(text))
                text = ''
                continue
            text += line

    count_file = '../test_suites_v2/%s/%s_tagAsParent_rules_grouped.txt' % (
            category, category)
    counts = []
    patterns = []
    with open(count_file) as count_lines:
        for line in count_lines:
            count, pattern, _ = line.split('\t')
            counts.append(int(count))
            # TODO: this could be better - like check the annotation file to be
            # sure that the patterns match
            patterns.append(pattern)
    if len(counts) != len(trees):
        print('Error! Incorrect alignment between trees and counts:')
        print('%d %d' % (len(counts), len(trees)))
        raise SystemExit(-1)

    # 'count' is token count, 'num' is type count
    total_count = 0
    count_annotated = 0
    count_correct = 0
    num_patterns = 0
    num_annotated = 0
    num_correct = 0
    errors = []
    correct = []
    for i, tree in enumerate(trees):
        pattern = patterns[i]
        num_patterns += 1
        total_count += counts[i]
        index = i + 1
        if index not in annotations:
            continue
        num_annotated += 1
        count_annotated += counts[i]

        # A missing tree still counts as annotated, but is recorded neither as
        # correct nor as an error.
        if tree is None:
            continue
        # Clear off some of the extra processing that the SUPA pipeline adds
        root = tree.root.children[1]
        # Labels are split on the first '__'; the part after it is compared
        # between the parent and each child to find the head child.
        head = root.label.split('__', 1)[1]
        head_index = -1

        # The labels are on trees that haven't had WH-movement undone - we have
        # to correct the annotations for that.  This isn't perfect, but it will
        # do for now.
        annotated_children = len(pattern.split()) - 1
        actual_children = len(root.children)
        is_conjpp = 'CONJPP' in [x.label.split('__')[0] for x in root.children]
        if (not is_conjpp
                and actual_children == annotated_children - 1
                and annotations[index] != 0):
            annotations[index] = annotations[index] - 1

        # Now to actually check to see what was labeled as the head
        for j, child in enumerate(root.children):
            child_head = child.label.split('__', 1)[1]
            if child_head == head:
                head_index = j + 1
                if j == annotations[index]:
                    correct.append(pattern)
                    num_correct += 1
                    count_correct += counts[i]
                    break
        else:
            # No child matched at the annotated position; head_index holds the
            # last matching child's 1-based position, or -1 if none matched.
            errors.append((pattern, head_index, annotations[index] + 1))

    percent_tested = ratio(num_annotated, num_patterns)
    percent_correct = ratio(num_correct, num_annotated)
    count_percent_annotated = ratio(count_annotated, total_count)
    count_percent_correct = ratio(count_correct, count_annotated)

    with open(outfile, 'a') as out:
        out.write('%s\t%d\t%d\t%.3f\t%d\t%.3f\t%d\t%d\t%.3f\t%d\t%.3f\n' % (
                category, num_patterns, num_annotated, percent_tested,
                num_correct, percent_correct, total_count, count_annotated,
                count_percent_annotated, count_correct, count_percent_correct))
    with open('results/errors_%s.tsv' % category, 'w') as error_file:
        error_file.write('pattern\tpredicted\tactual\n')
        for error in errors:
            error_file.write('%s\t%d\t%d\n' % error)
    with open('results/correct_%s.tsv' % category, 'w') as correct_file:
        for pattern in correct:
            correct_file.write('%s\n' % pattern)