Example no. 1
def load_pos():
    from nltk_lite.corpora import brown
    from itertools import islice
    import re

    sentences = list(islice(brown.tagged(), 100))

    tag_set = ["'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl',
        'abn', 'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem',
        'ben', 'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz',
        'dt', 'dt$', 'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg',
        'hvn', 'hvz', 'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$',
        'nns', 'nns$', 'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn',
        'pn$', 'pp$', 'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb',
        'rb$', 'rbr', 'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn',
        'vbz', 'wdt', 'wp$', 'wpo', 'wps', 'wql', 'wrb']
        
    sequences = []
    sequence = []
    symbols = set()
    start_re = re.compile(r'[^-*+]*')
    for sentence in sentences:
        for i in range(len(sentence)):
            word, tag = sentence[i]
            word = word.lower()  # normalize
            symbols.add(word)    # log this word
            m = start_re.match(tag)
            # cleanup the tag
            tag = m.group(0)
            if tag not in tag_set:
                tag = '*'
            sentence[i] = (word, tag)  # store cleaned-up tagged token

    return sentences, tag_set, list(symbols)
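
A quick way to inspect what load_pos() returns (a hypothetical usage sketch, assuming nltk_lite and its Brown corpus data are installed):

    sentences, tag_set, symbols = load_pos()
    print len(sentences), 'tagged sentences'
    print len(tag_set), 'tags in the tag set,', len(symbols), 'distinct word forms'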
def demo():
    from nltk_lite.corpora import brown
    from itertools import islice
    from pprint import pprint

    pprint(list(islice(brown.raw('a'), 0, 5)))

    pprint(list(islice(brown.tagged('a'), 0, 5)))
def demo():
    """
    A simple demonstration function for the C{Tagger} classes.  It
    constructs a backoff tagger using a trigram tagger, a bigram tagger,
    a unigram tagger and a default tagger.  It trains and tests the
    tagger using the Brown corpus.
    """
    from nltk_lite.corpora import brown
    import sys

    print 'Training taggers.'

    # Create a default tagger
    t0 = Default('nn')

#    t1a = Affix(length=-3, minlength=5, backoff=t0)
#    t1b = Unigram(cutoff=2, backoff=t1a)
    t1 = Unigram(cutoff=1, backoff=t0)
    t2 = Bigram(cutoff=1, backoff=t1)
    t3 = Trigram(backoff=t2)

    t1.train(brown.tagged('a'), verbose=True)
    t2.train(brown.tagged('a'), verbose=True)
    t3.train(brown.tagged('a'), verbose=True)

    # Tokenize the testing files
    test_tokens = []
    num_words = 0

    # Run the taggers.  t1, t2, and t3 all ultimately back off to the
    # default tagger.  This is especially important for t2 and t3, which
    # count on having known tags as contexts; if they get a context
    # containing None, they will generate an output of None, and so all
    # words will get tagged as None.

    print '='*75
    print 'Running the taggers on test data...'
    print '  Default (nn) tagger: ',
    sys.stdout.flush()
    _demo_tagger(t0, brown.tagged('b'))

    print '  Unigram tagger:      ',
    sys.stdout.flush()
    _demo_tagger(t1, list(brown.tagged('b'))[:1000])

    print '  Bigram tagger:       ',
    sys.stdout.flush()
    _demo_tagger(t2, list(brown.tagged('b'))[:1000])

    print '  Trigram tagger:      ',
    sys.stdout.flush()
    _demo_tagger(t3, list(brown.tagged('b'))[:1000])
def demo():
    """
    Demonstrates how to use IndexConcordance and Aggregator.
    """
    print "Reading Brown Corpus into memory..."
    corpus = list(brown.tagged(('a','j')))
    print "Generating index..."
    ic = IndexConcordance(corpus)
    print "Showing all occurences of 'plasma' in the Brown Corpus..."
    ic.formatted(middleRegexp="^plasma/.*", verbose=True)

    print "Investigating the collocates of 'deal' and derivatives..."
    agg = Aggregator()
    agg.add(ic.raw(middleRegexp="^deal", leftContextLength=1, rightContextLength=0,
                   leftRegexp=r"^(\w|\s|/)*$"), "Brown Corpus 'deal' left collocates")
    agg.add(ic.raw(middleRegexp="^deal", leftContextLength=0, rightContextLength=1,
                   rightRegexp=r"^(\w|\s|/)*$"), "Brown Corpus 'deal' right collocates")
    agg.formatted(showFirstX=5, usePOS=False)
Example no. 5
def demo():
    import tnt
    from nltk_lite.corpora import brown
    sents = list(brown.tagged())
    test = list(brown.raw())

    # create and train the tagger
    tagger = tnt.Tnt()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test[100:120])

    # print results
    for j in range(len(tagged_data)):
        s = tagged_data[j]
        t = sents[j + 100]
        for i in range(len(s)):
            print s[i], '--', t[i]
        print
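
The loop above prints TnT output next to the gold-standard tags; for a single summary number, the tagger can also be scored with tag.accuracy, the same helper used in demo3 below (a minimal sketch, assumed to run inside demo() after the code above):

    # hedged sketch: overall accuracy of the TnT tagger on the slice it just tagged
    from nltk_lite import tag
    print 'accuracy:', tag.accuracy(tagger, sents[100:120])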
Example no. 7
def demo3():
    from nltk_lite import tag
    from nltk_lite.corpora import treebank
    from nltk_lite.corpora import brown
    import tnt

    d = list(treebank.tagged())
    e = list(brown.tagged())

    d = d[:1000]
    e = e[:1000]

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):

        t = tnt.Tnt(N=1000, C=False)
        s = tnt.Tnt(N=1000, C=False)

        dtest = d[(i * d10):((i + 1) * d10)]
        etest = e[(i * e10):((i + 1) * e10)]

        dtrain = d[:(i * d10)] + d[((i + 1) * d10):]
        etrain = e[:(i * e10)] + e[((i + 1) * e10):]

        t.train(dtrain)
        s.train(etrain)

        tacc = tag.accuracy(t, dtest)
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = tag.accuracy(s, etest)
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += (tacc / tp_kn)
        sknacc += (sacc / sp_kn)
        tallacc += tacc
        sallacc += sacc

        #print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc

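    # Each accumulator above holds a sum over the ten folds, so multiplying
    # by 10 below is (sum / 10) * 100, i.e. the mean accuracy reported as a
    # percentage.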
    print "brown: acc over words known:", 10 * tknacc
    print "     : overall accuracy:", 10 * tallacc
    print "     : words known:", 10 * tknown
    print "treebank: acc over words known:", 10 * sknacc
    print "        : overall accuracy:", 10 * sallacc
    print "        : words known:", 10 * sknown
Example no. 8
		      fictionGeneral = 'k',
		      fictionMystery = 'l',
		      fictionScience = 'm',
		      fictionAdventure = 'n',
		      fictionRomance = 'p',
		      humour = 'r')

# set corpus basedir
set_basedir('./topicalizer/corpora')

# create tokenizer
tokenizer = analyser.Tokenizer()

# train tagging model
model = tag.Bigram()
model.train(
    brown.tagged([
        textCategories['pressReportage'], textCategories['pressEditorial'],
        textCategories['pressReviews'], textCategories['skillsAndHobbies'],
        textCategories['popularLore']
    ]))

# tag text
text = 'I want to buy a camera'
tokens = list(tokenizer.processWhitespacesWithoutStopWords(text, 1))
taggedTokens = list(model.tag(tokens))
print tokens
print taggedTokens

# get WordNet information for each noun
for taggedToken in taggedTokens:
    if taggedToken[1] == 'nn' or taggedToken[1] is None:
        # get synsets
        synsets = impl.lookupSynsetsByForm(taggedToken[0])

        # print gloss
        # 'context_pattern' is built based on the context's size (self._n),
        # for example:
        #   self._n = 2 -> r'^(.+?)$', like 'tag1'
        #   self._n = 3 -> r'^(.+?):(.+?)$', like 'tag1:tag2'
        #   self._n = 4 -> r'^(.+?):(.+?):(.+?)$', like 'tag1:tag2:tag3'
        context_pattern_str = r'^(.+?)%s$' % ( r':(.+?)' * (self._n-2) )
        context_pattern = re.compile(context_pattern_str, re.UNICODE)
        
        for line in lines[1:]:
            m = re.match(pattern, line)
            context, text, tag = m.groups()
            
            c_m = re.match(context_pattern, context)
            key = (c_m.groups(), text)
            self._model[key] = tag
        
        handler.close()

# load train corpus
train_sents = list(islice(brown.tagged(), 500))

# create taggers
tagger = MarshalNgram(3)

#tagger.train(train_sents)
#tagger.marshal("ngram.test")

tagger.unmarshal("ngram.test")
print tagger._model
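
The context_pattern logic above can be exercised on its own; a minimal standalone sketch for a trigram model (self._n == 3, with hypothetical tag values):

    import re

    n = 3  # trigram model: the context key stores the two preceding tags
    context_pattern = re.compile(r'^(.+?)%s$' % (r':(.+?)' * (n - 2)), re.UNICODE)
    print context_pattern.match('nn:vbd').groups()   # -> ('nn', 'vbd')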