Example #1
def pcfg_demo():
    """
    A demonstration showing how C{WeightedGrammar}s can be created and used.
    """

    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.parse import pchart

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print 'A PCFG production:', ` pcfg_prod `
    print '    pcfg_prod.lhs()  =>', ` pcfg_prod.lhs() `
    print '    pcfg_prod.rhs()  =>', ` pcfg_prod.rhs() `
    print '    pcfg_prod.prob() =>', ` pcfg_prod.prob() `
    print

    grammar = toy_pcfg2
    print 'A PCFG grammar:', ` grammar `
    print '    grammar.start()       =>', ` grammar.start() `
    print '    grammar.productions() =>',
    # Use str.replace(...) to line-wrap the output.
    print ` grammar.productions() `.replace(',', ',\n' + ' ' * 26)
    print

    print 'Coverage of input words by a grammar:'
    print grammar.covers(['a', 'boy'])
    print grammar.covers(['a', 'girl'])

    # extract productions from three trees and induce the PCFG
    print "Induce PCFG grammar from treebank data:"

    productions = []
    for item in treebank.items[:2]:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print grammar
    print

    print "Parse sentence using induced grammar:"

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    #sent = treebank.tokenized('wsj_0001.mrg')[0]

    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    print sent
    for parse in parser.nbest_parse(sent):
        print parse
Example #2
def pcfg_demo():
    """
    A demonstration showing how C{WeightedGrammar}s can be created and used.
    """

    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.parse import pchart

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', repr(pcfg_prod))
    print('    pcfg_prod.lhs()  =>', repr(pcfg_prod.lhs()))
    print('    pcfg_prod.rhs()  =>', repr(pcfg_prod.rhs()))
    print('    pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
    print()

    grammar = toy_pcfg2
    print('A PCFG grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # Use str.replace(...) to line-wrap the output.
    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 26))
    print()

    print('Coverage of input words by a grammar:')
    print(grammar.covers(['a', 'boy']))
    print(grammar.covers(['a', 'girl']))

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for item in treebank.items[:2]:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print(grammar)
    print()

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    #sent = treebank.tokenized('wsj_0001.mrg')[0]

    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    print(sent)
    for parse in parser.nbest_parse(sent):
        print(parse)
Example #3
def grammar_development_with_treebank():
    from nltk.corpus import treebank
    t = treebank.parsed_sents("wsj_0001.mrg")[0]
    print t
    print "identify verbs for SV in VP -> SV S", [
        subtree for tree in treebank.parsed_sents()
        for subtree in tree.subtrees(_grammar_filter)
    ]
Example #4
def grammar_development_with_treebank():
    from nltk.corpus import treebank

    t = treebank.parsed_sents("wsj_0001.mrg")[0]
    print t
    print "identify verbs for SV in VP -> SV S", [
        subtree for tree in treebank.parsed_sents() for subtree in tree.subtrees(_grammar_filter)
    ]
Example #5
def pcfg_demo():
    """
    A demonstration showing how a ``PCFG`` can be created and used.
    """

    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg, Nonterminal
    from nltk.parse import pchart

    # pcfg_prods = toy_pcfg1.productions()
    #
    # pcfg_prod = pcfg_prods[2]
    # print('A PCFG production:', repr(pcfg_prod))
    # print('    pcfg_prod.lhs()  =>', repr(pcfg_prod.lhs()))
    # print('    pcfg_prod.rhs()  =>', repr(pcfg_prod.rhs()))
    # print('    pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
    # print()
    #
    # grammar = toy_pcfg2
    # print('A PCFG grammar:', repr(grammar))
    # print('    grammar.start()       =>', repr(grammar.start()))
    # print '    grammar.productions() =>',
    # # Use .replace(...) is to line-wrap the output.
    # print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 26))
    # print()

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    item = treebank._fileids[0]
    for tree in treebank.parsed_sents(item)[:3]:
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS=False)
        tree.chomsky_normal_form(horzMarkov=2)

        productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print(productions)
    print()

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    # sent = treebank.tokenized('wsj_0001.mrg')[0]

    sent = treebank.parsed_sents(item)[0].leaves()
    print(sent)
    for parse in parser.parse(sent):
        print(parse)
Example #6
def learn_treebank(files=None, markov_order=None):
    """
    Learn a PCFG from the Penn Treebank, and return it.
    
    By default, this learns from NLTK's 10% sample of the Penn Treebank.
    You can give the filename of a Treebank file; 'wsj-02-21.mrg' will
    learn from the entire training section of Treebank.
    """
    if files is None: bank = treebank.parsed_sents()
    else: bank = treebank.parsed_sents(files)
    return learn_trees(bank, collapse=True, markov_order=markov_order)
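The helper learn_trees is not shown in this snippet. A minimal sketch of what it might look like, assuming it binarizes the trees and induces a PCFG with NLTK's induce_pcfg (the helper name, signature, and transformation choices are assumptions):

from nltk import Nonterminal, induce_pcfg

def learn_trees(trees, collapse=True, markov_order=None):
    # Hypothetical helper: binarize each tree and estimate rule probabilities
    # by relative frequency over the collected productions.
    productions = []
    for tree in trees:
        tree = tree.copy(deep=True)  # don't mutate the corpus trees
        if collapse:
            tree.collapse_unary(collapsePOS=False)
        if markov_order is None:
            tree.chomsky_normal_form()
        else:
            tree.chomsky_normal_form(horzMarkov=markov_order)
        productions += tree.productions()
    return induce_pcfg(Nonterminal('S'), productions)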
Example #7
def learn_treebank(files=None, markov_order=None):
    """
    Learn a PCFG from the Penn Treebank, and return it.
    
    By default, this learns from NLTK's 10% sample of the Penn Treebank.
    You can give the filename of a Treebank file; 'wsj-02-21.mrg' will
    learn from the entire training section of Treebank.
    """
    if files is None: bank = treebank.parsed_sents()
    else: bank = treebank.parsed_sents(files)
    return learn_trees(bank, collapse=True, markov_order=markov_order)
Example #8
def grammarDevelopment():
    print "page 315 8.6 Grammar Development"
    print "=============== Treebanks and Grammars ==============="
    from nltk.corpus import treebank
    t = treebank.parsed_sents('wsj_0001.mrg')[0]
    print t

    def filter(tree):
        child_nodes = [child.node for child in tree if isinstance(child, nltk.Tree)]
        return  (tree.node == 'VP') and ('S' in child_nodes)

    print [subtree for tree in treebank.parsed_sents() for subtree in tree.subtrees(filter)]
Example #9
def sentences():
    for f in treebank.fileids():
        for t in treebank.parsed_sents(f):
            t.chomsky_normal_form(horzMarkov=1)
            t.collapse_unary(collapsePOS=True)

            yield (t, t.leaves())
def gen_corpus(path, threshold):
    """
    src: http://www.nltk.org/_modules/nltk/tree.html
    corpora from wsj_0001.mrg to wsj_0199.mrg
    e.g.: t = treebank.parsed_sents('wsj_0001.mrg')[0]
    to visualize a tree: t.draw()
    :param path: save to path
    :param threshold: minimum length of a sentence to keep
    :return: none
    """
    boundaries = []
    sentences = []
    for t in treebank.parsed_sents(treebank.fileids()):
        flat = _flatten_tree(t, threshold)
        if flat:
            boundaries.append(flat)
            sentence = ' '.join(t.leaves()).translate(PUNC_TRANS).lower()
            sentence = re.sub(r' +', ' ', sentence)
            # replace digit(s) as 'x'(s)
            sentences.append(re.sub(r'\d', 'x', sentence).strip())
    _check_length_match(boundaries, sentences)
    with open(path + "/boundaries.txt", 'w') as f:
        f.write('1'.join(boundaries))
    with open(path + "/sentences.txt", 'w') as f:
        f.write(' '.join(sentences))
Example #11
def CKY_parser():
    '''
    Given the PCFG, we use the built-in CKY parser
    to get a sentence's most probable parse
    '''
    PCFG_grammar = make_PCFG_grammar()
    # Use the ViterbiParser given the PCFG grammar induction rules
    parser = ViterbiParser(PCFG_grammar)

    # Sample sentence parse
    sentences = treebank.parsed_sents('wsj_1964.mrg')

    skipped_sentences = 0

    # A for loop to print out the full parse
    for sentence in sentences:
        sentence = sentence.leaves()
        try:
            PCFG_grammar.check_coverage(sentence)
            for parse in parser.parse(sentence):
                print(parse)
        except:
            skipped_sentences += 1
            continue

    print("Total skipped sentences:", skipped_sentences)
Example #12
def ex6(symbol='S', display=5):
    """
    PCFG: Probabilistic CFGs

    Generating the probability distribution of a given symbol in a CFG.

    For a condensed visual display, expansions with fewer than five (or
    any other given number of) instances are removed from the returned
    results, although the probability distribution of the symbol is still
    computed over all available productions.
    """
    prob_dist = dict()
    productions = [
        p for tree in treebank.parsed_sents() for p in tree.productions()
    ]
    all_sym_prd = [p for p in productions if p.lhs().symbol() == symbol]
    sym_count = len(all_sym_prd)
    unique_rhs = set([p.rhs() for p in all_sym_prd])
    all_rhs = [p.rhs() for p in all_sym_prd]
    for rhs in unique_rhs:
        count = all_rhs.count(rhs)
        if count >= display:  # condense the display
            prob_dist[rhs] = count / sym_count
    return prob_dist
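A quick usage sketch for the function above (the exact values depend on the treebank sample shipped with NLTK):

# Relative frequency of each VP expansion that occurs at least 10 times
vp_dist = ex6('VP', display=10)
for rhs, p in sorted(vp_dist.items(), key=lambda kv: -kv[1]):
    print(rhs, round(p, 3))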
Example #13
def sentences():
    for f in treebank.fileids():
        for t in treebank.parsed_sents(f):
            t.chomsky_normal_form(horzMarkov=1)
            t.collapse_unary(collapsePOS=True)

            yield (t, t.leaves())
Example #14
def sequence_matching(input):
    sents = treebank.tagged_sents()
    parses = treebank.parsed_sents()
    for s in range(len(sents)):  # look through every sentence in treebank to find a sequence match with input
        sent = sents[s]
        pars = parses[s]
        k = 0  # k will track how far into the sequence has been matched
        matches = []  # log position in sent that there was a match to help build tree later
        for i in range(len(input)):
            match = False  # flag to cut down on time if a word doesn't match anything in the sent
            for j in range(k, len(sent)):  # loop through every word in sentence starting from last match

                if sent[j][1] == input[i][1]:  # labels (pos) match
                    k = j
                    UpdateTree(pars, j, input[i][1])
                    match = True  # if this line is never reached, then don't waste more time on this sentence
                    if i == len(input) - 1:  # made it through the entire input, so sent was a match
                        return pars # pars will have words replaced where there is a match
                    break

            if match == False:
                print("Sentence does not match")
                break  # program has looked through whole sentence without matching a word so move onto the next sentence

    return None  # no sentence was found to match the input sequence, print error message
Example #15
def test_GrammarParser():
    import nltk
    from nltk.corpus import treebank
    grammar = r"""NP:
    {<DT>*(<NN>|<NNP>|<NNS>)+}          # Chunk everything
    }<VBD|IN>+{      # Chink sequences of VBD and IN
    """
    #     tree=treebank.parsed_sents('wsj_0001.mrg')[0]
    #     print tree
    grammar_VP = r"""VP:
    {<VBZ><VP>}
    """
    #     tree=nltk.RegexpParser(grammar).parse(treebank.parsed_sents('wsj_0001.mrg')[0].pos())
    #     print tree
    fileids = treebank.fileids()

    #     for fileid in fileids:
    for i in range(len(fileids)):
        if i > 10:
            break
        #         trees = treebank.parsed_sents(fileid)
        trees = treebank.parsed_sents(fileids[i])
        for tree in trees:
            tree_Gram = nltk.RegexpParser(grammar).parse(tree)
            for subtree in tree_Gram.subtrees():
                if subtree.label() == "VP":
                    print subtree
Example #16
def test():
    """Do some tree drawing tests."""
    def print_tree(n, tree, sentence=None, ansi=True, **xargs):
        print()
        print('{0}: "{1}"'.format(n, ' '.join(sentence or tree.leaves())))
        print(tree)
        print()
        drawtree = TreePrettyPrinter(tree, sentence)
        try:
            print(drawtree.text(unicodelines=ansi, ansi=ansi, **xargs))
        except (UnicodeDecodeError, UnicodeEncodeError):
            print(drawtree.text(unicodelines=False, ansi=False, **xargs))

    from nltk.corpus import treebank
    for n in [0, 1440, 1591, 2771, 2170]:
        tree = treebank.parsed_sents()[n]
        print_tree(n, tree, nodedist=2, maxwidth=8)
    print()
    print('ASCII version:')
    print(TreePrettyPrinter(tree).text(nodedist=2))

    tree = Tree.fromstring(
        '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) '
        '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) '
        '(vg 10) (inf (verb 11)))))) (punct 12))',
        read_leaf=int)
    sentence = ('Ze had met haar moeder kunnen gaan winkelen ,'
                ' zwemmen of terrassen .'.split())
    print_tree('Discontinuous tree', tree, sentence, nodedist=2)
def main(transform_func = None, n = 10):
    parser=StanfordParser(
        path_to_jar = "/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser.jar",
        path_to_models_jar = "/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser-3.5.1-models.jar",
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )

    test_sents = treebank.sents()[-n:]

    print "len(test_sents) = %d" %(len(test_sents))

    if transform_func and callable(transform_func):
        print "transforming it using ", transform_func
        test_sents = [[transform_func(w) for w in s] 
                      for s in test_sents] # transform it

    print test_sents[:10]

    print "predicting"
    pred_parses = parser.parse_sents(test_sents)
    
    gold_parses = treebank.parsed_sents()[-n:]  # align gold parses with the test sentences
    
    print "evaluating"

    correct_n = gold_n = predicted_n = 0.0
    
    for gparse, pparse in zip(gold_parses, pred_parses):
        cn, gn, pn = precision_and_recall_stat(get_nodes_with_range(gparse), 
                                               get_nodes_with_range(pparse))
        correct_n += cn
        gold_n += gn
        predicted_n += pn
        
    print "Prediction: %f, Recall: %f" %(correct_n / predicted_n, correct_n / gold_n)
Example #18
def pcfg(train_idx=None, smoothing=None):
    """
    productions = []
    item = treebank._fileids[0]
    print("ITEM\n\n",item,"\n\n")
    for tree in treebank.parsed_sents(item)[:3]:
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS = False)
        tree.chomsky_normal_form(horzMarkov = 2)
        productions += tree.productions()
"""
    if train_idx is None:
        train_idx = (len(treebank.fileids()) * 3) // 4
    productions = []
    for item in treebank.fileids()[0:train_idx]:
        for tree in treebank.parsed_sents(item):
            tree.collapse_unary(
                collapsePOS=False)  # Remove unary production rule
            tree.chomsky_normal_form(
                horzMarkov=2
            )  # Convert into chomsky normal form i.e., A->(B,C,D) into A->(B,E) E->(C,D)
            productions += tree.productions()

    S = Nonterminal('S')
    if smoothing is None:
        grammar = learn_pcfg(S, productions)
    elif smoothing == 'L1':
        grammar = smoothing_pcfg(S, productions)

    with open('grammar.pkl', 'wb') as f:
        pickle.dump(grammar, f)

    return grammar
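learn_pcfg and smoothing_pcfg come from elsewhere in that project. For the unsmoothed path, a stand-in that simply wraps NLTK's relative-frequency estimator (an assumption about what learn_pcfg does) would be:

from nltk import induce_pcfg

def learn_pcfg(start, productions):
    # Hypothetical stand-in: plain relative-frequency PCFG estimation.
    return induce_pcfg(start, productions)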
Example #19
def train_grammar(unknown_words=[], nb_reduced_production=6000):

    productions = []

    for item in train:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)  # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
            #tree_prods = tree.productions()


            productions += tree.productions()


    counter = collections.Counter(productions)
    n_comms = [item for item, count in counter.most_common(nb_reduced_production) for i in range(count)]

    # Add unknown words and terminal rules back into the reduced production set
    unknown_words_prods = []
    for p in productions:
        if isinstance(p.rhs()[0], str):
            unknown_words_prods.append(p)
            for u in unknown_words:
                new_prod = Production(p.lhs(), [u])
                unknown_words_prods.append(new_prod)


    n_comms += unknown_words_prods
    S = Nonterminal('S')
    grammar = induce_pcfg(S, n_comms)

    return grammar
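A hedged usage sketch for the function above (the sample token list, and treating every input token as potentially unknown, are assumptions; it also relies on the module-level train file list being defined):

from nltk.parse import ViterbiParser

tokens = "Pierre Vinken will join the board".split()
# Hypothetical usage: pass the input tokens as unknown_words so the grammar
# gains terminal rules for any word missing from the reduced production set.
grammar = train_grammar(unknown_words=tokens)
parser = ViterbiParser(grammar)
for tree in parser.parse(tokens):
    print(tree)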
Example #20
def convert_wsj(file_obj):
    from nltk.corpus import treebank
    sys.stderr.write("Converting Penn Treebank sampler...\n")
    tb = TreebankConverter()
    for sentence in treebank.parsed_sents():
        tb.add_sentence(sentence)
    tb.write(file_obj)
Example #21
def test():
    model = torch.load('./ckpt/model0.pt')
    leafmodel = LeafNet()
    x = treebank.sents('wsj_0003.mrg')[0]
    y = treebank.parsed_sents('wsj_0003.mrg')[0]
    preprocess(y)
    # embed_x is the list of embedding vectors of x
    embed_x = []
    x_list = []
    l = int(len(x))

    for i in range(0, l):
        txlist = []
        x[i] = x[i].lower()
        txlist.append(x[i])
        tembed = torch.Tensor(get_embed(x[i]))
        embed_x.append(tembed)

        pred = leafmodel(embed_x[i])
        gt = (torch.argmax(pred)).item()
        txlist.append(gt)
        x_list.append(txlist)

    # we got the (sentence,gt) list, embedding vector list for the leafs
    xscore = 0.0
    while (len(x_list) != 1):
        x_list, embed_x, tscore = calculate_score(x_list, embed_x, model)
        xscore = xscore + tscore
    x_list = str(x_list).replace('[', '(').replace(']', ')').replace(
        '\'', '').replace(',', '')
    x_list_tree = Tree.fromstring((x_list))

    draw_trees(x_list_tree)
    draw_trees(y)
Example #22
def test():
    """Do some tree drawing tests."""
    def print_tree(n, tree, sentence=None, ansi=True, **xargs):
        print()
        print('{0}: "{1}"'.format(n, ' '.join(sentence or tree.leaves())))
        print(tree)
        print()
        drawtree = TreePrettyPrinter(tree, sentence)
        try:
            print(drawtree.text(unicodelines=ansi, ansi=ansi, **xargs))
        except (UnicodeDecodeError, UnicodeEncodeError):
            print(drawtree.text(unicodelines=False, ansi=False, **xargs))

    from nltk.corpus import treebank
    for n in [0, 1440, 1591, 2771, 2170]:
        tree = treebank.parsed_sents()[n]
        print_tree(n, tree, nodedist=2, maxwidth=8)
    print()
    print('ASCII version:')
    print(TreePrettyPrinter(tree).text(nodedist=2))

    tree = Tree.fromstring(
        '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) '
        '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) '
        '(vg 10) (inf (verb 11)))))) (punct 12))', read_leaf=int)
    sentence = ('Ze had met haar moeder kunnen gaan winkelen ,'
                ' zwemmen of terrassen .'.split())
    print_tree('Discontinuous tree', tree, sentence, nodedist=2)
def PCFG_Section():
    toy_pcfg1 = PCFG.fromstring("""
        S -> NP VP [1.0]
        NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
        Det -> 'the' [0.8] | 'my' [0.2]
        N -> 'man' [0.5] | 'telescope' [0.5]
        VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
        V -> 'ate' [0.35] | 'saw' [0.65]
        PP -> P NP [1.0]
        P -> 'with' [0.61] | 'under' [0.39]
    """)

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', pcfg_prod)
    print('pcfg_prod.lhs()  =>', pcfg_prod.lhs())
    print('pcfg_prod.rhs()  =>', pcfg_prod.rhs())
    print('pcfg_prod.prob() =>', pcfg_prod.prob())

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for item in treebank.fileids()[:2]:
      for tree in treebank.parsed_sents(item):
        # print(" ".join(tree.leaves()))
        # perform optional tree transformations, e.g.:
        # tree.collapse_unary(collapsePOS = False)# Remove branches A-B-C into A-B+C
        # tree.chomsky_normal_form(horzMarkov = 2)# Remove A->(B,C,D) into A->B,C+D->D
        prods = tree.productions()
        # print(prods[0].prob())
        productions += prods

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    # print(grammar)    # This is a PCFG

    ### Parsing section below ###

    print("\nParse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(1)

    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    for parse in parser.parse(sent):
        print(parse.prob())
def train():
    print("Collecting sub-corpus from Penn Treebank (nltk.corpus)")
    
    # prepare parsing trees, extrated from treebank
    tbank_trees = []
    for sent in treebank.parsed_sents():
        sent.chomsky_normal_form()
        tbank_trees.append(sent)
    
    # build vocabulary list, extracted from treebank
    vocab_size = 10000 # set vocabulary size to 10000
    words = [wrd.lower() for wrd in treebank.words()]
    vocab = [wrd for wrd,freq in Counter(treebank.words()).most_common(vocab_size)]
    
    # generate the grammar rule list extracted from the treebank, and calculate each rule's probability based on its frequency
    tbank_productions = set(production for tree in tbank_trees for production in tree.productions())
    tbank_grammar = CFG(Nonterminal('S'), list(tbank_productions))
    production_rules = tbank_grammar.productions()
    rules_to_prob = defaultdict(int)
    nonterm_occurrence = defaultdict(int)
    
    # calculate probability for each rule
    for sent in tbank_trees:
        for production in sent.productions():
            if len(production.rhs()) == 1 and not isinstance(production.rhs()[0], Nonterminal):
                production = Production(production.lhs(), [production.rhs()[0].lower()])
            nonterm_occurrence[production.lhs()] += 1
            rules_to_prob[production] += 1
    for rule in rules_to_prob:
        rules_to_prob[rule] /= nonterm_occurrence[rule.lhs()]

    # use Katz smoothing
    rules_to_prob, vocab = katz_smooth(rules_to_prob, vocab)
    rules = list(rules_to_prob.keys())
    rules_reverse_dict = dict((j,i) for i, j in enumerate(rules))
    left_rules = defaultdict(set)
    right_rules = defaultdict(set)
    unary_rules = defaultdict(set)
    
    # classify left, right rules
    for rule in rules:
        if len(rule.rhs()) > 1:
            left_rules[rule.rhs()[0]].add(rule)
            right_rules[rule.rhs()[1]].add(rule)
        else:
            unary_rules[rule.rhs()[0]].add(rule)
    terminal_nonterms_rules = set(rule for rule in rules_to_prob if len(rule.rhs()) == 1 and isinstance(rule.rhs()[0], str))
    terminal_nonterms = defaultdict(int)
    for rule in terminal_nonterms_rules:
        terminal_nonterms[rule.lhs()] += 1
    pcfg_parser = {
        'vocab': vocab,
        'left_rules': left_rules,
        'right_rules': right_rules,
        'unary_rules': unary_rules,
        'rules_to_prob': rules_to_prob,
        'terminal_nonterms': terminal_nonterms
    }
    return pcfg_parser
    def add_words(self, file_ids):
        for id in file_ids:
            sentences = ptb.parsed_sents(id)
            for sen_tree in sentences:
                words = Corpus._filter_words(sen_tree)
                words = ['<eos>'] + words + ['<eos>']
                for word in words:
                    self.dict.add(word)
Example #26
def convert_wsj(file_obj):
    from nltk.corpus import treebank

    sys.stderr.write("Converting Penn Treebank sampler...\n")
    tb = TreebankConverter()
    for sentence in treebank.parsed_sents():
        tb.add_sentence(sentence)
    tb.write(file_obj)
    def _induce_grammar(self):
        self.productions = []
        for tree in treebank.parsed_sents(treebank.fileids()):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)   # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)   # Remove A->(B,C,D) into A->B,C+D->D
            self.productions += tree.productions()
Example #28
def nltk_parse(s):
    tokens = nltk.word_tokenize(s)
    print(tokens)
    tagged = nltk.pos_tag(tokens)
    print(tagged[0:6])
    entities = nltk.chunk.ne_chunk(tagged)
    print(entities)
    t = treebank.parsed_sents('wsj_0001.mrg')[0]
    t.draw()
Example #29
def read_data():
    treebank_tagged_sents = list(
        chain(*[[tree.pos() for tree in treebank.parsed_sents(pf)] for pf in treebank.fileids()]))

    words_list = [[tag[0] for tag in sent] for sent in treebank_tagged_sents]
    labels = [[tag[1] for tag in sent] for sent in treebank_tagged_sents]

    words = []
    max_words = 0
    for sent in words_list:
        words.extend(sent)
        max_words = max(max_words, len(sent))

    print("Max. Words:", max_words)

    seq_length = 100

    print("Seq. Length:", seq_length)

    words = list(set(words))

    print("Number of Words:", len(words))

    unique_labels = []
    for sent in labels:
        unique_labels.extend(sent)

    unique_labels = list(set(unique_labels))

    print("Number of Unique Labels:", len(unique_labels))

    word2id = {word: i + 1 for i, word in enumerate(words)}
    id2word = {i + 1: word for i, word in enumerate(words)}

    X_data = []
    Y_data = []

    for i in range(len(treebank_tagged_sents)):
        for j in range(len(words_list[i])):
            _x = [0] * max_words

            for k in range(j + 1):
                _x[j - k] = word2id[words_list[i][k]]

            _x = _x[:seq_length]
            _x.reverse()

            X_data.append(_x)
            Y_data.append(one_hot(labels[i][j], unique_labels))

    X_data = np.array(X_data, dtype=np.int32)
    Y_data = np.array(Y_data, dtype=np.float32)

    print(X_data.shape)
    print(Y_data.shape)

    return X_data, Y_data, unique_labels, words, word2id, id2word
Example #30
    def read_wsj_from_treebank(self, index):
        from nltk.corpus import treebank
        self.__reset()
        self.__input_text = 'wsj_000' + str(index) + '.mrg'
        self.__sents = treebank.sents(self.__input_text)
        self.__tagged_sents = treebank.parsed_sents(self.__input_text)
        if self.__verbose:
            self.__print_all()
        return self.__tagged_sents
Example #31
def get_processed_data():

    bank = treebank.parsed_sents()
    train_bank, test_bank = train_test_split(bank, test_size=0.2)
    train_bank = list(train_bank)
    test_bank = list(test_bank)
    train_bank = convert_to_base_category(train_bank)
    test_bank = convert_to_base_category(test_bank)

    return train_bank, test_bank
Example #32
def extract_simple_productions(n):
    rules = []
    new_rules = []
    for t in treebank.parsed_sents()[:n]:
        rules = rules + t.productions()
    for r in rules:
        r = simple_rule(r)
        if not "EMPTY" in str(r):
            new_rules.append(r)
    return new_rules
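simple_rule() is not shown. A guess at a minimal version, assuming it strips function tags and numeric indices from nonterminals and renames trace categories so they can be filtered out as "EMPTY":

from nltk.grammar import Nonterminal, Production

def simple_rule(rule):
    # Hypothetical helper: NP-SBJ-1 -> NP, -NONE- -> EMPTY, leave words alone.
    def simplify(sym):
        if isinstance(sym, Nonterminal):
            label = sym.symbol().split('-')[0].split('=')[0] or 'EMPTY'
            return Nonterminal(label)
        return sym
    return Production(simplify(rule.lhs()), [simplify(s) for s in rule.rhs()])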
Example #33
def TreebankNoTraces():
    tb = []
    for t in treebank.parsed_sents():
        if t.label() != "S": continue
        RemoveFunctionTags(t)
        RemoveTraces(t)
        t.collapse_unary(collapsePOS=True, collapseRoot=True)
        t.chomsky_normal_form()
        tb.append(t)
    return tb
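RemoveFunctionTags and RemoveTraces are assumed to be defined alongside this function; minimal in-place sketches under that assumption:

from nltk import Tree

def RemoveFunctionTags(t):
    # Hypothetical helper: strip function tags and indices, e.g. NP-SBJ-1 -> NP.
    for sub in t.subtrees():
        label = sub.label()
        if '-' in label and not label.startswith('-'):
            sub.set_label(label.split('-')[0])

def RemoveTraces(t):
    # Hypothetical helper: prune -NONE- traces and any nodes left empty.
    def prune(node):
        if not isinstance(node, Tree):
            return True                      # keep terminal words
        node[:] = [child for child in node if prune(child)]
        return node.label() != '-NONE-' and len(node) > 0
    prune(t)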
Example #34
def getTrees(source, size):
    '''Load the trees from source, return first SIZE trees'''
    if source == 'treebank':
        from nltk.corpus import treebank
        trees = treebank.parsed_sents()
        #inds = random.permutation(range(0,len(trees)))[0:size]
        trees = trees[:size]
        return trees
    else:
        return list()
Example #35
def getTrees(source,size):
    '''Load the trees from source, return first SIZE trees'''
    if source=='treebank':
        from nltk.corpus import treebank
        trees = treebank.parsed_sents()
        #inds = random.permutation(range(0,len(trees)))[0:size]
        trees = trees[:size]
        return trees
    else:
        return list()
Example #36
def learn_treebank(trees=None):
    """
    Learn a PCFG from the Penn Treebank, and return it.
    
    By default, this learns from NLTK's 10% sample of the Penn Treebank.
    You can also pass a set of trees.
    """
    if trees is None: bank = treebank.parsed_sents()
    else: bank = trees
    return learn_trees(bank, collapse=True)
Example #37
def main(phrase_level, sanitize):
    for n in range(1, 200):
        tree_file = "wsj_{}.mrg".format(str(n).zfill(4))
        sentences = treebank.parsed_sents(tree_file)
        for s in sentences:
            for subtree in s.subtrees(lambda t: t.label() == phrase_level):
                if sanitize:
                    sanitize_tree(subtree)
                print(subtree.pformat(100000))
                break
Example #38
def learn_treebank(trees=None):
    """
    Learn a PCFG from the Penn Treebank, and return it.
    
    By default, this learns from NLTK's 10% sample of the Penn Treebank.
    You can also pass a set of trees.
    """
    if trees is None: bank = treebank.parsed_sents()
    else: bank = trees
    return learn_trees(bank, collapse=True)
Example #39
def parse_treebank(parser: ViterbiParser, sentences):
    start_time = time.time()
    parser.trace(trace=1)
    for sentence in treebank.parsed_sents(sentences[:3]):
        tokens = sentence.leaves()
        for tree in parser.parse(tokens):
            print(tree)
            print(
                f"Time elapsed for sentence of length {len(tokens)}: {time.time() - start_time}"
            )
Example #40
def TreebankNoTraces():
    tb = []
    for t in treebank.parsed_sents():
        if t.label() != "S": continue
        RemoveFunctionTags(t)
        RemoveTraces(t)
        t.collapse_unary(collapsePOS = True, collapseRoot = True)
        t.chomsky_normal_form()
        tb.append(t)
    return tb
Example #41
def main():
    sentence = """I saw a man with a telescope.
    Colorless green ideas sleep furiously.
    The horse raced past the barn fell."""
    tokens = nltk.word_tokenize(sentence)
    print(tokens)
    tagged = nltk.pos_tag(tokens)
    print(tagged[0:6])
    entities = nltk.chunk.ne_chunk(tagged)
    print(entities)
    t = treebank.parsed_sents('wsj_0001.mrg')[0]
def write_example_tree(features, f):
    filename = features['_filename']
    sen = features['_sentence_id']
    phr = features['_phrase_id']
    tree = treebank.parsed_sents(filename)[sen]
    phrase = tree[tree.treepositions('preorder')[phr]]
    l = treebank_helper.get_label(phrase)
    treebank_helper.set_label(phrase, '***' + l + '***')
    f.write(str(tree))
    f.write('\n')
    treebank_helper.set_label(phrase, l)
def treebank_accessor():
  '''
  Function that reads the Penn treebank and returns all the trees 
  for each sentence in the corpus.
  '''
  trees = []

  for i in range(1, TREEBANK_FILES + 1):
    file_number = "%03d" % (i,)
    t = treebank.parsed_sents('wsj_0' + file_number + '.mrg')

    for sentence in range(len(t)):
      # For each sentence in the file, convert to a tree and add it to trees[]
      trees.append(t[sentence])

  return trees
def get_treebank_rules(cutoff=0, include_counts=False):
    all_rules = cache_utils.cache_get('treebank_rules', 'rules')
    if not all_rules:
        log('Generating lexical rules from Penn Treebank', 4)
        from nltk.corpus import treebank
        all_rules = dict()
        for tree in treebank.parsed_sents():
            for rule, count in lexical_rules(tree).items():
                all_rules[rule] = all_rules.get(rule, 0) + count

        cache_utils.cache_set('treebank_rules', 'rules', all_rules)

    if include_counts:
        return {k: v for (k, v) in all_rules.items() if v > cutoff}
    else:
        rules_set = set([rule for rule, count in all_rules.items() if count > cutoff])
        return rules_set
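lexical_rules() is project-specific; a minimal sketch consistent with how it is used above (returning per-tree counts for lexical productions) might be:

def lexical_rules(tree):
    # Hypothetical helper: count lexical productions (POS tag -> word) in one tree.
    counts = {}
    for prod in tree.productions():
        if prod.is_lexical():
            counts[prod] = counts.get(prod, 0) + 1
    return counts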
def read_treebank_files(files, extractor,fe):
    """Read the listed treebank files and collect function tagging examples
    from each tree.

    The user-provided feature extractor is applied to each phrase in each 
    tree. The extracted feature dicts and the true function tags for each
    phrase are stored in two separate lists, which are returned.
    """
    X = []
    Y = []
    for filename in files:
        scount = 0
        for tree in treebank.parsed_sents(filename):
            tree = ParentedTree.convert(tree)
            treebank_helper.postprocess(tree)
            find_examples_in_tree(tree, X, Y, extractor,fe, filename, scount, 0)
            scount += 1
    return X, Y
Example #46
def get_trees(fileids=None, verbose=False):
	""" 
	Get the CNF trees for the treebank fileids given, or for the entire treebank
	"""
	if not fileids:
		# Get the Penn Treebank corpus
		fileids = treebank.fileids()

	# Get the sentence-trees in each file
	tree_lists = [treebank.parsed_sents(file_id) for file_id in fileids]
	trees = [sent for sent_list in tree_lists for sent in sent_list]
	if verbose:
		print("obtained", len(trees), "trees from the corpus.")

	cnf_trees = [ctc.convert_tree(t) for t in trees]
	if verbose:
		print("converted", len(trees), "trees to cnf.")

	return cnf_trees
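ctc.convert_tree is project-specific; a stand-in built on NLTK's own transforms (an assumption about what the conversion does) could be:

def convert_tree(tree):
    # Hypothetical stand-in: return a CNF copy of the tree.
    cnf = tree.copy(deep=True)
    cnf.collapse_unary(collapsePOS=True)
    cnf.chomsky_normal_form()
    return cnf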
Example #47
    def create_forests(self, filename=None, treelist=None, clear=False):
        """ This will read sentences to parse. One sentence per line, no periods etc.

        :param filename: file to read input trees or sentences from (defaults to the document's tree set file)
        :param treelist: optional list of input trees to use instead of reading a file
        :param clear: start with empty
        """
        filename = filename or Document.get_default_treeset_file()

        forests = []
        input_trees = []

        shared_lexicon = load_lexicon(Document.get_default_lexicon_file())
        print('loaded shared_lexicon: ', shared_lexicon)
        if treelist:
            input_trees = treelist
        elif has_nltk:
            print(f"reading trees {NLTK_TREE_RANGE[0]}-{NLTK_TREE_RANGE[1]} from NLTK's treebank")
            for i in range(*NLTK_TREE_RANGE):  # 199
                trees = treebank.parsed_sents(f'wsj_0{str(i).rjust(3, "0")}.mrg')
                for j, tree in enumerate(trees):
                    tree.chomsky_normal_form()
                    tree.collapse_unary()
                    input_trees.append(as_list(tree))
        else:
            readfile = open(filename, 'r')
            for line in readfile:
                line = line.strip()
                if line:
                    if line.startswith('[') and line.endswith(']'):
                        input_trees.append(ast.literal_eval(line))
                    else:
                        input_trees.append(line)

        for input_tree in input_trees:
            syn = classes.SyntaxAPI()
            syn.lexicon = shared_lexicon
            if isinstance(input_tree, list):
                syn.input_tree = input_tree
            else:
                syn.input_text = input_tree
            forest = Forest(heading_text=str(input_tree), syntax=syn)
            forests.append(forest)
        return forests
Example #48
def train_pcfg():
    print 'training grammar'
    productions = []
    # print len(treebank.fileids())
    trees = []
    # up to 199 less for shorter grammar for quicker training
    for fileid in treebank.fileids()[0:20]:
        for tree in treebank.parsed_sents(fileid):
            # perform optional tree transformations, e.g.:
            # Remove branches A->B->C into A->B+C so we can avoid infinite
            # productions
            tree.collapse_unary(collapsePOS=False)
            # Remove A->(B,C,D) into A->B,C+D->D (binarization req'd by CKY parser)
            # horizontal and vertical Markovization: remember parents and siblings in tree
            #     This gives a performance boost, but makes the grammar HUGE
            #     If we use these we would need to implement a tag forgetting method
            #tree.chomsky_normal_form(horzMarkov = 0, vertMarkov=0)
            tree.chomsky_normal_form()
            productions += tree.productions()
    S = nltk.Nonterminal('S')
    grammar = nltk.induce_pcfg(S, productions)
    print "grammar trained!"
    return grammar
Example #49
#! /usr/bin/python
# -*- coding: utf-8 -*-

__author__ = "Osman Baskaya"

from nltk.corpus import treebank

files = "cl23.mrg wsj_1695.mrg wsj_1778.mrg".split()

for f in files:
    for sentence in treebank.parsed_sents(f):
        s = []
        for word, p in sentence.pos():
            if p != '-NONE-':
                s.append(word)
        print ' '.join(s)


#f = '../data/senseval3/english-all-words.xml'

#soup = BeautifulSoup(open(f), 'xml')
#texts = soup.find_all('text')
#sentences = []
#quot_set = set(['"', ])
#quot = False
#sentence = []
#for t in texts:
    #tokens = t.text.split()
    #for token in tokens:
        #if token in quot_set:
            #quot = not quot
Example #50
    rules = []
    results = re.findall("(\({0}\ {1}\))".format(rule,word), sent)
    for res in results:
        x = res.split(" ")
        if len(x) == 2:
            p,c = x
            rules.append("{0} -> '{1}'".format(p[1:], c[:-1]))
    return rules

def check(productions, rules):
    i = 0
    for x in productions:
        if str(x) in rules: i += 1
        else: print x
    return (i,len(productions))

if __name__=="__main__":
   total, recall = 0,0
   for s in treebank.parsed_sents():
        sent = "".join(str(s).split("\n"))
        unaries = find_unary(sent)
        nonunaries = find_nonunary(sent)
        rules = unaries + nonunaries
        r, t = check(s.productions(), rules)
        recall+=r
        total+=t
   print "{0} out of {1}: {2}".format(recall,total, float(recall)/total)
   


def find_pronouns(tree):
    pronouns = []
    for child in tree:
        if type(child) in [unicode, str] and child.lower() in PRONOUNS:
            pronouns.append((child.lower(), None))

        if isinstance(child, ParentedTree):
            pronouns = pronouns + find_pronouns(child)

    return pronouns

total = 0
for file in treebank.fileids():
    stats['name'] = file
    for tree in treebank.parsed_sents(file):
        tree = ParentedTree.convert(tree)
        for pronoun, np_node in find_pronouns(tree):
            if pronoun in gendered:
                stats['gendered'] += 1
            if pronoun in itits:
                stats['itits'] += 1
            stats['total'] += 1
            total += 1
            stats['pct_gendered'] = stats['gendered']/float(stats['total'])
    print file, total


    files.append(stats.copy())
    stats = dict.fromkeys(stats, 0)
Example #52
import nltk
from nltk.corpus import treebank

# show samples of treebank
t = treebank.parsed_sents('wsj_0001.mrg')[0]
# print(t)

# filter sentential complements
def filter(tree):
    child_nodes = [child.label() for child in tree if isinstance(child, nltk.Tree)]
    return (tree.label() == 'VP') and ('S' in child_nodes)


subtrees = [subtree for tree in treebank.parsed_sents()
            for subtree in tree.subtrees(filter)]
for st in subtrees:
    print(st)
# Extracts Penn Treebank from NLTK.
from nltk.corpus import treebank
from operator import itemgetter
import codecs
words = treebank.sents()
tagged_words = [map(itemgetter(1), sent) for sent in treebank.tagged_sents()]
parsed_sents = treebank.parsed_sents()

total_sents = len(parsed_sents)

f = codecs.open('../data/penn_treebank','w','utf-8')
assert (len(words) == len(tagged_words) and len(words) == len(parsed_sents)), ' '.join(map(str, [len(words), len(tagged_words), len(parsed_sents)]))
f.write(str(total_sents) + '\n')
for i in xrange(total_sents):
	sent_len = len(words[i])
	f.write(str(sent_len) + '\n')
	
	sent = ' '.join(words[i])
	pos = ' '.join(tagged_words[i])
	assert(sent.count('\n') == 0 and pos.count('\n') == 0 and len(sent.split(' ')) == sent_len and len(pos.split(' ')) == sent_len)
	f.write(sent + '\n')
	f.write(pos + '\n')
	
	tree = str(parsed_sents[i]).split('\n')
	f.write(str(len(tree)) + '\n')
	f.write('\n'.join(tree) + '\n')
Example #54
from nltk.corpus import treebank
from nltk import Tree, Nonterminal
from nltk.parse.viterbi import ViterbiParser
from nltk.grammar import induce_pcfg
from os import getcwd, walk
from pickle import dump

###############################
# 2) Remove numerical indices #
###############################

print "Loading treebank."
sentenceStrings = map(lambda x: x.pprint(), treebank.parsed_sents())

#these are various things we want to remove (indices) or replace
indexStrings = map(str, range(166,0,-1))

#things that come before values and what should be left behind after
#the index is removed
indexPrefixes = [("-",""), ("=","")]
indiceRemedy = lambda n: map(lambda x: (n[0]+x,n[1]), indexStrings)
fixingTuples = reduce(lambda x,y: x+indiceRemedy(y), [[]]+indexPrefixes)

#this is where the removing takes place
print "Cleaning POS tags."
removeTargets = lambda x: reduce(lambda y,z: y.replace(z[0],z[1]), \
		[x]+fixingTuples)
sentenceStrings = map(removeTargets, sentenceStrings)
sentenceTrees = map(lambda x: Tree(x), sentenceStrings)

###################################################
import nltk
from nltk.corpus import treebank
print(treebank.parsed_sents('wsj_0007.mrg')[2])
from nltk.corpus import treebank
from nltk.grammar import ContextFreeGrammar, Nonterminal
from nltk.treetransforms import chomsky_normal_form


'''
tbank_productions = set(production for sent in treebank.parsed_sents()
                        for production in sent.productions())

'''

treebank_prods = []
for i in range(199): # for all found sets of fileids
    tbstuff = treebank._fileids[i] # get a bunch of 'em
    for tree in treebank.parsed_sents(tbstuff):
        tree.chomsky_normal_form()

        treebank_prods += tree.productions()



tTCpcfg = nltk.induce_pcfg(Nonterminal('S'), list(treebank_prods))

# induce pcfg

# PTCpcfg = nltk.induce_pcfg(tbank_grammar)

# treetransforms: chomsky_normal_form

print("done! You have your WeightedGrammar")
master_path = "./Data/"
train_filepath = master_path + "train.csv"
train_data = pd.read_csv(train_filepath)

dup_prob = []

row_count = 0

for row in train_data.iterrows():
    row_count += 1
    q1 = row[1]['question1']
    q2 = row[1]['question2']

    if row_count < 19:
        print(treebank.parsed_sents(q1)[0])
        print(treebank.parsed_sents(q2)[0])

    if pd.isnull(q1):
        q1_words = []
    else:
        q1_words = q1.split(' ')

    if pd.isnull(q2):
        q2_words = []
    else:
        q2_words = q2.split(' ')

    wd_counter = 0
    sim_counter = 0
import nltk
from nltk.corpus import treebank
from nltk.probability import *
from nltk.grammar import *

### RETRIEVE ALL TREES AND THEN SELECT THE FIRST 100.
all_trees = treebank.parsed_sents()
trees_100 = all_trees[0:100]

### FUNCTION EXTRACTING LEAVES OF NODES WITH LABEL AS A PARAMETER OF getAvgNodeLength().
def getAvgNodeLength(label):

    l_leaves = list()
    for tree in trees_100:
        for node in tree:
            if node.label() == label:
                l_leaves.append(node.leaves())

### CREATED OWN LIST OF PUNCTUATION TO EXCLUDE SINCE USING string.punctuation WOULD
### HAVE DELETED WORDS SUCH AS "Dr.", "World-Wide", "U.S.", etc. WHICH ARE OF INTEREST.
    punct = [u"*", u",", u"&", u"'"]

    for wordlist in l_leaves:
        for word in wordlist:
            for i in punct:
                if i in word:
                    wordlist.remove(word)

### CREATE LIST OF LENGTHS (IN WORDS) OF NODES.
    l_len = list()
    for wordlist in l_leaves:
count = {}
for sentence in sentences:
    for word in sentence:
        if word in count:
            count[word] += 1
        else:
            count[word] = 1


## 3. Word-transition statistics (2-gram model)
from sklearn.feature_extraction.text import CountVectorizer


sentences = texto.strip().split('.')[:-1]
bigram_vectorizer = CountVectorizer(ngram_range=(1,2), min_df=1)
X_2 = bigram_vectorizer.fit_transform(sentences).toarray()


## 4. Use NLTK to do part-of-speech (POS) tagging
tokens = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(tokens)

## 5. Draw a lexico-grammatical tree with the help of NLTK
from nltk.corpus import treebank

t = treebank.parsed_sents('wsj_0001.mrg')[0]
t.draw()



def main():
    answers = open('coref_key.txt', 'r')
    this_correct = 0
    correct = 0
    total = 0
    prev_sentences = deque()
    for file in FILENAMES:
        this_correct = 0
        this_total = 0
        prev_sentences.clear()
        for tree in treebank.parsed_sents(file):


            tree = ParentedTree.convert(tree)

            for pronoun, np_node in find_pronouns(tree):

                # i = 0
                # for t in list(prev_sentences)[-3:]:
                #     t.pretty_print()
                #     print("-"*25)
                #     i = i + 1
                #     if i == 3: break
                proposed = hobbs_to_string(hobbs(np_node, pronoun.lower(), prev_sentences))
                tree.pretty_print()

                actual = answers.readline()

                if  proposed == actual[:-1]:
                    update_pronoun_results(pronoun, 1)
                    correct += 1
                    this_correct += 1

                update_pronoun_results(pronoun, 0)
                total += 1
                this_total += 1

                print "Pronoun: '" + pronoun + "'   Proposed: '" + proposed + "'   Actual: '" + actual + "'"

                if total: print "Overall:\tCorrect:", correct, "\tTotal:", total, "\tPercentage:", correct/float(total), "\n"


                print("*"*100)
                print("*"*100)
            prev_sentences.append(tree)
        print("-"*50)
        if this_correct: print file,":\tCorrect:", this_correct, "\tTotal:", this_total, "\tPercentage:", this_correct/float(this_total), "\n"
        if total: print "Overall:\tCorrect:", correct, "\tTotal:", total, "\tPercentage:", correct/float(total), "\n"
        print("-"*50)

    print "Male correct:", PRONOUN_RESULTS['male'], "\tMale total:", PRONOUN_RESULTS['male_total'], "\tPercent correct:", PRONOUN_RESULTS['male_pct']
    print "Female correct:", PRONOUN_RESULTS['female'], "\tFemale total:", PRONOUN_RESULTS['female_total'], "\tPercent correct:", PRONOUN_RESULTS['female_pct']
    print "Neutral correct:", PRONOUN_RESULTS['neutral'], "\tNeutral total:", PRONOUN_RESULTS['neutral_total'], "\tPercent correct:", PRONOUN_RESULTS['neutral_pct']
    print "Plural correct:", PRONOUN_RESULTS['they'], "\tPlural total:", PRONOUN_RESULTS['they_total'], "\tPercent correct:", PRONOUN_RESULTS['they_pct']
    print "Reflexive correct:", PRONOUN_RESULTS['reflexive'], "\tReflexive total:", PRONOUN_RESULTS['reflexive_total'], "\tPercent correct:", PRONOUN_RESULTS['reflexive_pct']
    print "Total correct:", correct, "\tTotal:", total, "\tPercent correct:", correct/float(total)