Example no. 1
def CKY_parser():
    '''
    Given the PCFG, use the built-in CKY parser
    (ViterbiParser) to get a sentence's most probable parse.
    '''
    PCFG_grammar = make_PCFG_grammar()
    # Use the ViterbiParser with the induced PCFG grammar rules
    parser = ViterbiParser(PCFG_grammar)

    # Sample sentence parse
    sentences = treebank.parsed_sents('wsj_1964.mrg')

    skipped_sentences = 0

    # A for loop to print out the full parse
    for sentence in sentences:
        sentence = sentence.leaves()
        try:
            PCFG_grammar.check_coverage(sentence)
            for parse in parser.parse(sentence):
                print(parse)
        except ValueError:  # the sentence contains words the grammar does not cover
            skipped_sentences += 1
            continue

    print("Total skipped sentences:", skipped_sentences)
Example no. 2
def test_sentences(grammar):

    for t in test:
        print("Processing: " + str(t))
        reference = list(treebank.tagged_words(t))

        tokens = list(treebank.words(t))

        print("fixing grammar.....")
        # Checks if grammar covers all words in the sentence and adds them to the grammar if necessary
        fixed_grammar = get_fixed_grammer(grammar, tokens)

        print("fixed grammar")
        print("Building Parser....")
        parser = ViterbiParser(fixed_grammar)

        print("Parsing...")
        #Gets list of all possible trees, the most likely tree is at index 0
        start = time.time()
        parses = parser.parse_all(tokens)
        print("Time")
        print(start - time.time())

        # Get POS tags from the parse tree
        leafs = parses[0].pos()

        # Calculate the tagging accuracy of the parser output
        correct_tags = 0.0
        for i in range(len(leafs)):
            if leafs[i] == reference[i]:
                correct_tags += 1.0

        print(str(correct_tags / len(leafs)))
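get_fixed_grammer is not shown; going by the comment above, it extends the grammar so that every input token is covered. A minimal sketch under that assumption (the NN fallback tag and the tiny probability are arbitrary choices, and the added mass stays within PCFG's tolerance for rule probabilities summing to 1):

from nltk.grammar import PCFG, Nonterminal, ProbabilisticProduction

def get_fixed_grammer(grammar, tokens):
    # Collect tokens the grammar does not cover
    missing = set()
    for tok in tokens:
        try:
            grammar.check_coverage([tok])
        except ValueError:
            missing.add(tok)
    if not missing:
        return grammar
    # Give each unseen word a tiny-probability NN reading
    new_prods = list(grammar.productions())
    for tok in missing:
        new_prods.append(ProbabilisticProduction(Nonterminal('NN'), [tok], prob=1e-6))
    return PCFG(grammar.start(), new_prods)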
Example no. 3
def evaluate_sentence(sentence: str, grammar: PCFG):
    sentence = sentence.split()
    print(sentence, flush=True)
    pos = [pos for word, pos in pos_tag(sentence)]
    print(pos, flush=True)
    parser = ViterbiParser(grammar, trace=0)
    # itertools.accumulate yields successive space-joined prefixes of the POS sequence
    for line in accumulate(pos, lambda total, token: total + ' ' + token):
        line = line.split()
        print(line)
        print([tree.prob() for tree in list(parser.parse(line))], flush=True)
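evaluate_sentence parses the POS sequence rather than the words, so the grammar's terminals must be POS tags. A minimal usage sketch with an invented tag-level PCFG (and assuming NLTK's pos_tag models are installed):

from nltk.grammar import PCFG

tag_pcfg = PCFG.fromstring("""
S -> NP VP [1.0]
NP -> 'DT' 'NN' [0.6] | 'PRP' [0.4]
VP -> 'VBD' NP [0.7] | 'VBD' [0.3]
""")

evaluate_sentence("I saw the man", tag_pcfg)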
Example no. 4
def test():
    """
	A test to check if the changes I made have the intended
	effect
	"""
    import nltk
    from nltk.parse import ViterbiParser
    sent = 'I saw the man with my telescope'
    tokens = sent.split()
    grammar = nltk.toy_pcfg1
    parser = ViterbiParser(grammar)
    parser.trace(3)
    parses = parser.nbest_parse(tokens)
    print(parses)
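Note that nbest_parse was removed in NLTK 3; against a current NLTK, an equivalent check would use the parse_all API instead (a sketch, not the author's code):

def test_nltk3():
    import nltk
    from nltk.parse import ViterbiParser
    tokens = 'I saw the man with my telescope'.split()
    parser = ViterbiParser(nltk.grammar.toy_pcfg1)
    parser.trace(3)
    parses = parser.parse_all(tokens)  # replaces the old nbest_parse
    print(parses)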
Example no. 5
def main():
    # print(nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0])
    # nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0].draw()

    # print("Induce PCFG grammar from treebank data:")
    #
    productions = []
    print(len(treebank.fileids()))
    for item in treebank.fileids(): # Goes through all trees
      for tree in treebank.parsed_sents(item):
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS=False)    # collapse unary chains A-B-C into A+B over C
        tree.chomsky_normal_form(horzMarkov=2)    # binarize: A -> B C D becomes A -> B A|<C-D>, A|<C-D> -> C D
        productions += tree.productions()
    # #
    # # print(type(productions[0]))
    # #
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    # # # print(grammar)    # This is a PCFG
    # pickle.dump(grammar, open("tbank-grammar.p", "wb"))
    # t = time.time()
    # grammar = pickle.load(open("tbank-grammar.p", "rb"))
    # textf = open("lexicon.txt", "w")
    # n = textf.write(str(reduce(lambda a, b: a + "\n" + b, list(filter(lambda x: "'" in x, str(grammar).split("\n"))))))
    # textf.close()
    # print(time.time()-t)
    parser = ViterbiParser(grammar)
    # pickle.dump(parser, open("cky-parser.p", "wb"))
    # parser = pickle.load(open("cky-parser.p", "rb"))
    parser.trace(0)
    sent = "John will join the board"
    tokens = sent.split()

    try:
        grammar.check_coverage(tokens)
        print("All words covered")
        parses = parser.parse_all(tokens)
        if parses:
            lp = len(parses)
            print(lp)
            print(parses[0].label())
            # parses[0].draw()
            p = reduce(lambda a,b:a+b.prob(), list(filter(lambda x: x.label() == 'S', parses)), 0.0)
        else:
            p = 0

        print("Probability:", p)
    except ValueError:
        print("Some words not covered")
Example no. 6
def main():
    data = pd.read_csv(data_file_path)
    data = data.drop(columns=["Unnamed: 0"])

    (sentence, sentence_tokens) = readsentence()  # take input from user and save text, tokenized text

    if os.path.exists('mytagger.pkl'):
        # try to open a previously saved tagger
        with open('mytagger.pkl', 'rb') as f:
            mytagger = load(f)
    else:
        # no such tagger is found, so train/save it
        mytagger = traintagger()
        with open('mytagger.pkl', 'wb') as f:
            dump(mytagger, f, -1)
    tagged_tokens = mytagger.tag(sentence_tokens)
    print(tagged_tokens)

    if os.path.exists('mypcfg.pickle'):
        # try to open a previously saved PCFG
        with open('mypcfg.pickle', 'rb') as f:
            mypcfg = load(f)
    else:
        # no such PCFG exists, so induce/save it
        mypcfg = buildpcfg()
        with open('mypcfg.pickle', 'wb') as f:
            dump(mypcfg, f)
    try:
        tree = sequence_matching(tagged_tokens)
        print("Sequence matching was used")

    except Exception:
        parser = ViterbiParser(mypcfg)
        tree = parser.parse(tagged_tokens)
        print("Viterbi parser was used")
    finally:
        if not isinstance(tree, Tree):
            # parser.parse returns an iterator; print its first tree,
            # or fail if the input couldn't be parsed
            next(tree).pretty_print()
        else:
            print(tree)

    new_row = {'sentence': sentence, 'sentence tokens': sentence_tokens, 'tagged tokens': tagged_tokens}
    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
    data = pd.concat([data, pd.DataFrame([new_row])], ignore_index=True)
    data.to_csv(data_file_path)

    print("Previous data:")
    print(data)
Example no. 7
def parseCKY(sentence, grammar):
    # Tokenize the sentence.
    tokens = sentence.split()

    #print('Coverage of input words by a grammar:')
    change_words = []
    for i, ele in enumerate(tokens):
        try:
            grammar.check_coverage([ele])
        except ValueError:
            # print("%s is not covered by the grammar. Replacing it with 'UNK'" % ele)
            change_words.append(tokens[i])
            tokens[i] = 'UNK'
    parsers = [ViterbiParser(grammar)]
    # Run the parsers on the tokenized sentence.
    from functools import reduce
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print('\nsentence: %s\n ' % (sentence))
        t = time.time()
        parses = parser.parse_all(tokens)
        times.append(time.time() - t)
        if parses:
            lp = len(parses)
            p = reduce(lambda a, b: a + b.prob(), parses, 0.0)
        else:
            p = 0
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses:
            all_parses[p.freeze()] = 1

    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0


#    for parse in parses:
#        print(parse)
    return parses
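A usage sketch for parseCKY, assuming a grammar induced from the treebank as in the other examples (the slice size is arbitrary):

from nltk import Nonterminal, induce_pcfg
from nltk.corpus import treebank

productions = []
for tree in treebank.parsed_sents(treebank.fileids()[:50]):
    productions += tree.productions()
grammar = induce_pcfg(Nonterminal('S'), productions)

for parse in parseCKY("John will join the board", grammar):
    print(parse)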
Example no. 8
    def parse_sentence(self, sent):
        """
        Parse sent using induced grammar
        Visualize the most likely parse tree for sent
        :return: None. Save parsing results to pcfg.txt
        """
        if self.grammar is None:
            raise ValueError("PCFG hasn't been induced yet.")
        # other parser option(s): e.g., parser = pchart.InsideChartParser(self.grammar)
        parser = ViterbiParser(self.grammar)
        parser.trace(3)

        # http://www.nltk.org/api/nltk.parse.html
        sys.stdout = open('pcfg.txt', 'w')
        parses = parser.parse(sent)
        for parse in parses:
            print(parse)
            # visualize the tree; draw() opens a window and returns None
            parse.draw()
Example no. 9
def Parser_Section():
    demos = [('I saw John through the telescope', toy_pcfg1)]
    sent, grammar = demos[0]
    # print(grammar)

    # Tokenize the sentence.
    tokens = sent.split()
    parser = ViterbiParser(grammar)

    parser.trace(0) # Use this to change verbosity
    t = time.time()
    parses = parser.parse_all(tokens)
    print("Time:", time.time()-t)

    if parses:
        lp = len(parses)
        p = reduce(lambda a,b:a+b.prob(), parses, 0.0)
    else:
        p = 0

    print("Probability:", p)
Example no. 10
def perplexity():
    '''
    Given the PCFG and the parser used, run the parser on
    the rest of the treebank and calculate the perplexity
    of the model given the testing sentences.
    '''

    PCFG_grammar = make_PCFG_grammar()
    parser = ViterbiParser(PCFG_grammar)
    all_p = []
    skipped_sentence = 0

    for item in treebank.fileids()[1964:]:
        trees = treebank.parsed_sents(item)
        for tree in trees:
            tree = tree.leaves()
            try:
                PCFG_grammar.check_coverage(tree)
                for parse in parser.parse(tree):
                    # pull the probability out of the tree's string form, e.g. "(p=1.2e-05)"
                    parse_string = str(parse)
                    p = re.search(r"p=([^/]+)", parse_string).group(1)
                    p = p[:-1]  # strip the trailing ')'
                    all_p.append(float(p))
            except ValueError:
                skipped_sentence += 1
                continue

    perplexity = 1
    N = float(len(all_p))
    for p in all_p:
        perplexity = perplexity * (1/p)
    perplexity = pow(perplexity, 1/float(N))

    print("Perplexity:", perplexity)
    print("All parse probabilities:", all_p)
    print("Skipped sentences:", skipped_sentence)
    print("PCFG grammar:", PCFG_grammar)
Example no. 11
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys
    import time
    from functools import reduce

    from nltk import tokenize
    from nltk.grammar import PCFG
    from nltk.parse import ViterbiParser

    toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    toy_pcfg2 = PCFG.fromstring("""
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """)

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope",
         toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print("     %r" % demos[i][1])
        print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) /
               len(parses) if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (time, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)
Example no. 12
    def init_viterbi(self):
        return ViterbiParser(self.grammar)
Example no. 13
from nltk import induce_pcfg
from nltk.corpus import treebank
from nltk.parse import pchart
from nltk.parse import ViterbiParser
from nltk.treetransforms import *
from nltk import *

productions = []

for item in treebank.fileids():
    sents = treebank.parsed_sents(item)
    length = int(len(sents) * 0.9)  # first 90% of each file's sentences for training
    for tree in sents[:length]:
        tree.collapse_unary(collapsePOS = False)
        tree.chomsky_normal_form(horzMarkov = 2)
        productions += tree.productions()

S = Nonterminal('S')
grammar = induce_pcfg(S, productions)
parser = pchart.InsideChartParser(grammar)
parserv = ViterbiParser(grammar)


for item in treebank.fileids():
    sents = treebank.parsed_sents(item)
    start = int(len(sents) * 0.9)  # remaining 10% held out for testing
    for tree in sents[start:]:
        sent = tree.leaves()
        print(tree.pos())
        for parse in parser.parse(sent):
            print(parse) 
        for parse in parserv.parse(sent):
            print(parse) 
Example no. 14
def create_viterbi_parser(grammar, pickle_it=False, filename="viterbi"):
    parser = ViterbiParser(grammar)
    parser.trace(0)
    if pickle_it:
        pickle.dump(parser, open("%s%s-parser.p" % (var_dir, filename), "wb"))
    return parser
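A matching loader for the pickled parser might look like this (var_dir and the filename scheme come from the snippet above):

def load_viterbi_parser(filename="viterbi"):
    # Counterpart to create_viterbi_parser: read the pickled parser back
    with open("%s%s-parser.p" % (var_dir, filename), "rb") as f:
        return pickle.load(f)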
Example no. 15
# print(productions)

############ create PCFG from the productions #######
from nltk import Nonterminal
from nltk import induce_pcfg

S = Nonterminal('SENT')
grammar = induce_pcfg(S, productions)
print(grammar)

######### Parser with CYK dynamic algorithm ########
from nltk.parse import pchart
from nltk.parse import ViterbiParser
from nltk.treetransforms import un_chomsky_normal_form

parser = ViterbiParser(grammar)
parser.trace(2)

parses_bank = []

test_file = open(args.test_dir, 'wb')
test_output_file = open(args.output_dir, 'wb')

for i in range(valid_idx, test_idx + 1):
    # take the leaves of each tree of testset and store
    # them in the test file
    tokens = treebank[i][0].leaves()
    sentence = u" ".join(tokens)
    test_file.write((sentence + u"\n").encode('utf-8'))

    print('parsing:', sentence)
Example no. 16
    def __init__(self):
        self.wordToTags = defaultdict(set)
        convertedTaggedWords = [(w,nltk.tag.mapping.map_tag('en-ptb', 'universal', t)) for w,t in treebank.tagged_words()]
        for word, tag in convertedTaggedWords:
            self.wordToTags[word].add(tag)

        productions = list()
        S = nltk.Nonterminal('S')
        for tree in treebank.parsed_sents():
            productions += tree.productions()
        # create the grammar
        pcfg = nltk.induce_pcfg(S, productions)
        # print(pcfg)
        self.viterb = ViterbiParser(pcfg)
        self.mostRecentTree = None
        self.validPosTags = set()
        self.validChunkTags = set()
        self.validIOBTags = set()
        self.relationTags = set()
        self.anchorTags = set()

        # POS tags (Penn Treebank tag set)
        self.validPosTags.update({
            "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS",
            "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$",
            "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH",
            "VB", "VBZ", "VBP", "VBD", "VBG",
            "WDT", "WP", "WP$", "WRB",
            ".", ",", ":", "(", ")",
        })

        # chunk tags
        self.validChunkTags.update({
            "NP", "PP", "VP", "ADVP", "ADJP", "SBAR", "PRT", "INTJ", "PNP",
        })

        # IOB prefixes
        self.validIOBTags.update({"I-", "O-", "B-"})

        # relation tags
        self.relationTags.update({
            "SBJ", "OBJ", "PRD", "TMP", "CLR", "LOC", "DIR", "EXT", "PRP",
        })

        # anchor tags
        self.anchorTags.update({"A1", "P1"})
Example no. 17
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from functools import reduce

    from nltk import tokenize
    from nltk.parse import ViterbiParser
    from nltk.grammar import toy_pcfg1, toy_pcfg2

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ('I saw the man with my telescope', toy_pcfg1),
        ('the boy saw Jack with Bob under the table with a telescope',
         toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print('%3s: %s' % (i + 1, demos[i][0]))
        print('     %r' % demos[i][1])
        print()
    print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) /
               len(parses) if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print('Time (secs)   # Parses   Average P(parse)')
    print('-----------------------------------------')
    print('%11.4f%11d%19.14f' % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print('------------------------------------------')
    print('%11s%11d%19.14f' % ('n/a', len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print('Draw parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk.draw.tree import draw_trees

        print('  please wait...')
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print('Print parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print(parse)
Example no. 18
# the ones in the test_set
for word, tag in treebank.tagged_words():
    t = Tree.fromstring("(" + tag + " " + word + ")")
    for production in t.productions():
        tbank_productions.append(production)

print(tbank_productions[2])

# Automatically build the grammar (in particular, estimate the rule
# probabilities) from the list of production rules tbank_productions
tbank_grammar = induce_pcfg(Nonterminal('S'), tbank_productions)

print(tbank_grammar)

# PARSING
parser = ViterbiParser(tbank_grammar)
s = time.time()
# parse the second raw sentence
for t in parser.parse(raw_test_set[1]):
    print(t)

# measure the parsing time
s = time.time() - s

# gold standard for the second sentence
print(test_set[1])
'''Your task is to build a function to measure the accuracy of the
parser you have built. Accuracy comes in two kinds: exact match, and
partial match (average recall and precision). Work out for yourself how
to compute recall and precision for parsing, using a valid reference.'''
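A sketch of the requested accuracy function, under the usual definitions: exact match compares whole trees, and partial match scores labeled bracket precision and recall (labeled_spans and evaluate_parse are names invented here):

from nltk import Tree

def labeled_spans(tree):
    # Collect (label, start, end) for every constituent in the tree
    spans = set()
    def walk(t, start):
        end = start
        for child in t:
            if isinstance(child, Tree):
                end = walk(child, end)
            else:
                end += 1
        spans.add((t.label(), start, end))
        return end
    walk(tree, 0)
    return spans

def evaluate_parse(predicted, gold):
    exact = int(predicted == gold)  # exact match
    p_spans, g_spans = labeled_spans(predicted), labeled_spans(gold)
    hits = len(p_spans & g_spans)
    precision = hits / len(p_spans)
    recall = hits / len(g_spans)
    return exact, precision, recall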
Example no. 19
def do_cky(grammar):
    global test
    global posx

    viterbi = ViterbiParser(grammar)  # initialize the parser with the PCFG
    resultados = []
    for t in test[:1]:  # for each sentence in the test set
        try:
            sent = t.leaves()  # get the sentence

            if len(sent) <= 18:  # keep only sentences of up to 18 tokens (including punctuation)
                ssent = []
                for s in sent:  # check grammar coverage for each word
                    try:
                        grammar.check_coverage([s])
                        ssent.append(s)
                    except ValueError:  # handle unknown words
                        ssent.append("UNK")

                saida = []
                for i in viterbi.parse(ssent):  # run the parser on the test sentence
                    saida.append(i)

                # lists for the two evaluation strands: the non-terminals over spans
                # (constituents) and the non-terminals that derive the words (tags)
                list_eval_val = []
                list_eval_test = []
                list_tag_val = []
                list_tag_test = []

                posx = 0
                make_tree_evaluation(saida[0][0], list_eval_test, list_tag_test, 0)  # evaluate the parser output
                posx = 0
                make_tree_evaluation(t, list_eval_val, list_tag_val, 0)  # evaluate the gold tree from the test set

                # sort by visiting order
                list_eval_test.sort(key=lambda tup: tup[3])
                list_eval_val.sort(key=lambda tup: tup[3])

                # number of hits
                acertos = len(set(list_eval_test).intersection(set(list_eval_val)))
                # labeled precision
                lp = acertos / len(list_eval_test)
                # labeled recall
                lr = acertos / len(list_eval_val)
                # F1
                f1 = 0
                if lp > 0 and lr > 0:
                    f1 = 2 * lp * lr / (lp + lr)
                # tagging accuracy
                ta = len([i for i, j in zip(list_tag_test, list_tag_val) if i == j])
                ta /= len(list_tag_val)

                # store the result
                r = {'lp': lp, 'lr': lr, 'f1': f1, 'ta': ta}
                resultados.append(r)
            else:
                print("Sentence longer than 18 words.")
        except Exception:
            print("Malformed tree.")

    # compute the average of each metric
    media_lp = sum(item['lp'] for item in resultados) / len(resultados)
    media_lr = sum(item['lr'] for item in resultados) / len(resultados)
    media_f1 = sum(item['f1'] for item in resultados) / len(resultados)
    media_ta = sum(item['ta'] for item in resultados) / len(resultados)
    print("media_lp", media_lp, "media_lr", media_lr, "media_f1", media_f1, "media_ta", media_ta)
Example no. 20
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys
    import time
    from functools import reduce

    from nltk import tokenize
    from nltk.grammar import toy_pcfg1, toy_pcfg2
    from nltk.parse import ViterbiParser

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope",
         toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print("     %r" % demos[i][1])
        print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) /
               len(parses) if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (time, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)