Example No. 1
def demo():
    # Uses the old NLTK 1.x API; GrammarFile is assumed to be in scope.
    from nltk.token import Token
    from nltk.tokenizer import WhitespaceTokenizer

    # Build an Earley parser from the grammar file, tokenize the sentence,
    # and print every parse tree found.
    gfile = GrammarFile.read_file('test.cfg')
    cp = gfile.earley_parser()
    sent = Token(TEXT='the police read the solutions that Poirot sent')
    WhitespaceTokenizer().tokenize(sent)
    cp.parse_n(sent)
    for tree in sent['TREES']:
        print tree
Example No. 2
def calculaEntropia(documento):
    # Shannon entropy (in bits) of the word distribution in a text file:
    # H = -sum over words w of p(w) * log2 p(w)
    from math import log
    from nltk.token import Token
    from nltk.tokenizer import WhitespaceTokenizer
    from nltk.probability import FreqDist

    # Count the frequency of each whitespace-separated word.
    freq_dist = FreqDist()
    corpus = Token(TEXT=open(documento).read())
    WhitespaceTokenizer().tokenize(corpus)
    for token in corpus['SUBTOKENS']:
        freq_dist.inc(token['TEXT'])

    entropia = 0
    for i in freq_dist.samples():
        entropia = entropia + (freq_dist.freq(i) * log(freq_dist.freq(i), 2))
    return -entropia
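
# A minimal usage sketch, assuming a plain-text corpus file at this path
# (the same file is read in a later example):
print calculaEntropia('dados/may2001_pdf.torto')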
Example No. 3
def text_parse(grammar, sentence, trace=2, drawtrees=False, latex=False):
    # Parse `sentence` with the grammar's Earley parser and either draw the
    # trees or print them (optionally as LaTeX qtree markup).
    from nltk.token import Token
    from nltk.tokenizer import WhitespaceTokenizer

    parser = grammar.earley_parser(trace=trace)
    print parser._grammar
    sent = Token(TEXT=sentence)
    WhitespaceTokenizer().tokenize(sent)
    parser.parse_n(sent)
    if drawtrees:
        from treeview import TreeView
        TreeView(sent['TREES'])
    else:
        for tree in sent['TREES']:
            if latex: print tree.latex_qtree()
            else: print tree
def _demo_stemmer(stemmer):
    # Tokenize a sample text.
    from nltk.token import Token
    from nltk.tokenizer import WhitespaceTokenizer
    text = Token(TEXT='John was eating icecream')
    WhitespaceTokenizer().tokenize(text)

    # Use the stemmer to stem it.
    for word in text['SUBTOKENS']:
        stemmer.stem(word)

    # Print the results.
    print stemmer
    for word in text['SUBTOKENS']:
        print '%20s => %s' % (word['TEXT'], word['STEM'])
    print
Example No. 5
def demo():
    """
    Create a shift reduce parser demo, using a simple grammar and
    text. 
    """

    from nltk.cfg import Nonterminal, CFGProduction, CFG
    from nltk.token import Token
    from nltk.tokenizer import WhitespaceTokenizer
    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V,
     Det) = [Nonterminal(s) for s in nonterminals.split()]

    productions = (
        # Syntactic Productions
        CFGProduction(S, [NP, VP]),
        CFGProduction(NP, [Det, N]),
        CFGProduction(NP, [NP, PP]),
        CFGProduction(VP, [VP, PP]),
        CFGProduction(VP, [V, NP, PP]),
        CFGProduction(VP, [V, NP]),
        CFGProduction(PP, [P, NP]),

        # Lexical Productions
        CFGProduction(NP, ['I']),
        CFGProduction(Det, ['the']),
        CFGProduction(Det, ['a']),
        CFGProduction(N, ['man']),
        CFGProduction(V, ['saw']),
        CFGProduction(P, ['in']),
        CFGProduction(P, ['with']),
        CFGProduction(N, ['park']),
        CFGProduction(N, ['dog']),
        CFGProduction(N, ['statue']),
        CFGProduction(Det, ['my']),
    )

    grammar = CFG(S, productions)

    # tokenize the sentence
    sent = Token(TEXT='my dog saw a man in the park with a statue')
    WhitespaceTokenizer().tokenize(sent)

    ShiftReduceParserDemo(grammar, sent).mainloop()
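
# One of the parses this grammar admits for the sentence above (the PP
# attachments are ambiguous, so the demo can derive several trees):
#
#   (S (NP (Det my) (N dog))
#      (VP (V saw)
#          (NP (NP (Det a) (N man))
#              (PP (P in) (NP (Det the) (N park))))
#          (PP (P with) (NP (Det a) (N statue)))))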
 def __init__(self, **property_names):
     from nltk.tokenizer import WhitespaceTokenizer
     tokenizer = WhitespaceTokenizer(**property_names)
     TokenizerBasedTokenReader.__init__(self, tokenizer)
from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.draw.plot import Plot

# Count the frequency of each whitespace-separated word in the corpus.
freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
print corpus
WhitespaceTokenizer().tokenize(corpus)
print corpus

for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])

# How many times did "the" occur?
print freq_dist.count('the')

# What was the frequency of the word "the"?
print freq_dist.freq('the')

# How many word tokens were counted?
print freq_dist.N()

# What word types were encountered?
print freq_dist.samples()

# What was the most common word?
print freq_dist.max()

# What is the distribution of word lengths in a corpus?
freq_dist = FreqDist()
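# The snippet breaks off here; a minimal sketch of the word-length count it
# sets up (reusing the corpus tokenized above) might look like this:
for token in corpus['SUBTOKENS']:
    freq_dist.inc(len(token['TEXT']))

# e.g. how many 3-letter word tokens were seen?
print freq_dist.count(3)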
def demo():
    # GrammarCategory, CFGProduction, CFG, and FeatureEarleyChartParser are
    # assumed to be imported from the old NLTK feature-grammar modules.
    import sys, time
    from nltk.token import Token

    S = GrammarCategory.parse('S')
    VP = GrammarCategory.parse('VP')
    NP = GrammarCategory.parse('NP')
    PP = GrammarCategory.parse('PP')
    V = GrammarCategory.parse('V')
    N = GrammarCategory.parse('N')
    P = GrammarCategory.parse('P')
    Name = GrammarCategory.parse('Name')
    Det = GrammarCategory.parse('Det')
    DetSg = GrammarCategory.parse('Det[-pl]')
    DetPl = GrammarCategory.parse('Det[+pl]')
    NSg = GrammarCategory.parse('N[-pl]')
    NPl = GrammarCategory.parse('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        CFGProduction(S, (NP, VP)),
        CFGProduction(PP, (P, NP)),
        CFGProduction(NP, (NP, PP)),
        CFGProduction(VP, (VP, PP)),
        CFGProduction(VP, (V, NP)),
        CFGProduction(VP, (V, )),
        CFGProduction(NP, (DetPl, NPl)),
        CFGProduction(NP, (DetSg, NSg))
    ]

    # Define some lexical productions.
    lexical_productions = [
        CFGProduction(NP, ('John', )),
        CFGProduction(NP, ('I', )),
        CFGProduction(Det, ('the', )),
        CFGProduction(Det, ('my', )),
        CFGProduction(Det, ('a', )),
        CFGProduction(NSg, ('dog', )),
        CFGProduction(NSg, ('cookie', )),
        CFGProduction(V, ('ate', )),
        CFGProduction(V, ('saw', )),
        CFGProduction(P, ('with', )),
        CFGProduction(P, ('under', )),
    ]

    earley_grammar = CFG(S, grammatical_productions)
    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())

    sent = Token(TEXT='I saw John with a dog with my cookie')
    print "Sentence:\n", sent
    from nltk.tokenizer import WhitespaceTokenizer
    WhitespaceTokenizer().tokenize(sent)
    t = time.time()
    cp = FeatureEarleyChartParser(earley_grammar,
                                  earley_lexicon,
                                  LEAF='TEXT',
                                  trace=1)
    cp.parse_n(sent)
    print "Time: %s" % (time.time() - t)
    for tree in sent['TREES']:
        print tree
Example No. 9
 def set_sentence(self, sentence):
     self._token = Token(TEXT=sentence)
     WhitespaceTokenizer().tokenize(self._token)  #[XX] use tagged?
     self.reset()
def demo(labeled_tokens, n_words=5, n_lens=20, debug=1):
    import time, types
    assert _chktype(1, labeled_tokens, [Token], (Token,))
    assert _chktype(2, n_words, types.IntType)
    assert _chktype(3, n_lens, types.IntType)
    assert _chktype(4, debug, types.IntType)
    _resettime()
    
    if debug: print _timestamp(), 'getting a list of labels...'
    labelmap = {}
    for ltok in labeled_tokens:
        labelmap[ltok.type().label()] = 1
    labels = labelmap.keys()
    if debug: print _timestamp(), '  got %d labels.' % len(labels)
    
    if debug: print _timestamp(), 'constructing feature list...'
    f_range = [chr(i) for i in (range(ord('a'), ord('z') + 1) + [ord("'")])]
    feature_detectors = [
        FunctionFeatureDetector(lambda w:ord(w[0:1])),
        FunctionFeatureDetector(lambda w:ord(w[-2:-1] or ' ')),
        FunctionFeatureDetector(lambda w:ord(w[-1:])),
        FunctionFeatureDetector(lambda w:len(w)),
        ]
    fd_list = SimpleFDList(feature_detectors)
    #fd_list += TextFunctionFDList(lambda w:w[0:1], f_range, labels)
    #fd_list = TextFunctionFDList(lambda w:w[0:1], f_range, labels)
    #fd_list += TextFunctionFDList(lambda w:w[-2:-1], f_range, labels)
    #fd_list += TextFunctionFDList(lambda w:w[-1:], f_range, labels)
    #fd_list += TextFunctionFDList(lambda w:w, ["atlanta's"], labels)
    #fd_list += TextFunctionFDList(lambda w:len(w), range(n_lens), labels)

    if debug: print _timestamp(), '  got %d features' % len(fd_list)

    if debug: print _timestamp(), 'training on %d samples...' % len(labeled_tokens)

    trainer = NBClassifierTrainer(fd_list)
    classifier = trainer.train(labeled_tokens, estimator='ELE')

    #trainer = MultinomialNBClassifierTrainer(fd_list)
    #classifier = trainer.train(labeled_tokens)
    
    if debug: print _timestamp(), '  done training'
    
    if debug: print _timestamp(), ('%d tokens, %d labels' % (len(labeled_tokens), 
                                     len(classifier.labels())))
    toks = WhitespaceTokenizer().tokenize("jury the reports aweerdr "+
                                  "atlanta's atlanta_s moowerp's")
    
    #import time
    #for i in range(20):
    #    for word in toks:
    #        classifier.classify(word)
    #if debug: print _timestamp(), '100 classifications: %0.4f secs' % (time.time()-t)

    toks = toks * (1+((n_words-1)/len(toks)))
    if debug: print _timestamp(), 'Testing on %d tokens' % len(toks)
    t = time.time()
    for word in toks:
        if debug: print _timestamp(), word
        if 1:
            items = classifier.distribution_dictionary(word).items()
            items.sort(lambda x,y:cmp(y[1], x[1]))
            for (label,p) in items:
                if p > 0.01:
                    print _timestamp(), '    %3.5f %s' % (p, label)
        label = classifier.classify(word)
        if debug: print _timestamp(), '  =>', label

    return time.time()-t