Example #1
def demo_pos_supervised():
    from nltk.corpus import brown
    from sys import stdout
    print 'Loading data from Brown corpus...'
    tagged_tokens = []
    for item in brown.items()[:5]:
        tagged_tokens.append(brown.tokenize(item))

    tag_set = ["'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl',
        'abn', 'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem',
        'ben', 'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz',
        'dt', 'dt$', 'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg',
        'hvn', 'hvz', 'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$',
        'nns', 'nns$', 'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn',
        'pn$', 'pp$', 'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb',
        'rb$', 'rbr', 'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn',
        'vbz', 'wdt', 'wp$', 'wpo', 'wps', 'wql', 'wrb']
        
    annul_nonmatching_tags(tagged_tokens, tag_set, '*')
    words, word_set, tags, tag_set = _split_tagged_tokens(tagged_tokens)

    word_set.sort()
    tag_set.sort()

    print 'output alphabet', `word_set`[:50], '...'
    print 'state labels   ', `tag_set`[:50], '...'
    print tag_set

    print 'Training HMM...'

    #print 'training data:'
    #print zip(words[1:], tags[1:])

    trainer = HMMTrainer(tag_set, word_set)
    hmm = trainer.train_supervised(words[100:], tags[100:],
                    lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

    print hmm
    print 'Testing...'
    
    for ws, ts in zip(words[:3], tags[:3]):
        print ws
        print 'HMM >>>'
        print hmm.best_path(ws)
        print 'CORRECT >>>'
        print ts
        print '-' * 60

    count = correct = 0
    for ws, ts in zip(words[:100], tags[:100]):
        print '.',
        stdout.flush()
        pts = hmm.best_path(ws)
        for t, pt in zip(ts, pts):
            count += 1
            if t == pt:
                correct += 1

    print 'accuracy over first', count, 'tokens %.1f' % (100.0 * correct / count)
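This example relies on the old NLTK 1.x interface (brown.tokenize, HMMTrainer, annul_nonmatching_tags), which no longer ships with NLTK. For comparison, a minimal sketch of the same supervised HMM training with the current nltk.tag.hmm API might look as follows; the corpus slice, train/test split, and Lidstone gamma are illustrative choices, not values taken from the original code.

# sketch only: a modern-NLTK analogue of the supervised HMM demo above
from nltk.corpus import brown
from nltk.probability import LidstoneProbDist
from nltk.tag.hmm import HiddenMarkovModelTrainer

sents = list(brown.tagged_sents(categories='news')[:500])   # illustrative slice
split = int(len(sents) * 0.9)
train, test = sents[:split], sents[split:]

trainer = HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(
    train, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

print('accuracy: %.1f%%' % (100.0 * tagger.evaluate(test)))  # evaluate() is named accuracy() in newer NLTK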
Example #2
def testLengthOfText(brownSection):
    items = brown.items(brownSection)
    textToken = brown.read(items[0])
    
    # stop one short so that items[i+1] below never runs off the end
    for i in range(len(items) - 1):
        print "\n\nWeightedTaggedTScoreModel\n"
        model = WeightedTaggedTScoreModel(textToken, SUBTOKENS='WORDS')
        TextGenerator(model).generateWords(('the','at'),100)
        textToken['WORDS'] = textToken['WORDS'] + brown.read(items[i+1])['WORDS']
Example #3
    def buildTrainTokens(self, texts=10):
        '''
        Read and tokenize the first `texts` items from the Brown Corpus
        and store them as self.train_tokens.
        '''
        from nltk.corpus import brown

        train_tokens = []
        for item in brown.items()[:texts]:
            train_tokens.append(brown.read(item))
        self.train_tokens = train_tokens
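brown.items() / brown.read() is the old NLTK 1.x corpus interface. If you want the same data through today's corpus readers, a rough equivalent (a sketch, not part of the original code) is brown.fileids() together with brown.tagged_sents():

# sketch: loading the first few Brown files with the current corpus reader
from nltk.corpus import brown

fileids = brown.fileids()[:10]
train_sents = brown.tagged_sents(fileids=fileids)   # sentences of (word, tag) pairs
print('%d tagged sentences loaded' % len(train_sents))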
Example #4
def main():
    print "\n\nWeightedTaggedTScoreModel\n"
    items = brown.items('fiction: general')
    textToken = brown.read(items[0])
    for item in items[1:]:
        textToken['WORDS'] = textToken['WORDS'] + brown.read(item)['WORDS']

    model = WeightedTaggedTScoreModel(textToken, SUBTOKENS='WORDS')
    TextGenerator(model).generateWords(('the', 'at'), 200)
    print "\n\nWeightedTScoreModel...\n"
    model = WeightedTScoreModel(textToken, SUBTOKENS='WORDS')
    TextGenerator(model).generateWords('the', 200)
Example #5
def load_pos():
    from nltk.corpus import brown
    from nltk.tagger import TaggedTokenizer
    import re

    tagged_tokens = []
    for item in brown.items()[:5]:
        tagged_tokens.append(brown.tokenize(item))

    tag_set = [
        "'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl', 'abn',
        'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem', 'ben',
        'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz', 'dt', 'dt$',
        'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg', 'hvn', 'hvz',
        'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$',
        'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$',
        'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr',
        'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz', 'wdt',
        'wp$', 'wpo', 'wps', 'wql', 'wrb'
    ]

    sequences = []
    sequence = []
    start_re = re.compile(r'[^-*+]*')
    for token in tagged_tokens:
        # the multi-output allows us to treat each word as a
        # tuple of features
        for sub_token in token['SUBTOKENS']:
            sequence.append(sub_token)
            # a feature for words as lower case
            features = [sub_token['TEXT'].lower()]
            # a feature for word suffixes of length 3
            features.append(sub_token['TEXT'][-3:])
            # a feature for the length of words
            features.append(len(sub_token['TEXT']))
            # store the observation as a tuple of features
            sub_token['TEXT'] = tuple(features)
            m = start_re.match(sub_token['TAG'])
            # cleanup the tag
            tag = m.group(0)
            if tag in tag_set:
                sub_token['TAG'] = tag
            else:
                sub_token['TAG'] = '*'
            # split on the period tag
            if sub_token['TAG'] == '.':
                sequences.append(Token(SUBTOKENS=sequence))
                sequence = []

    return sequences, tag_set, 3  # 3 feature dimensions per observation
Example #6
def load_pos():
    from nltk.corpus import brown
    from nltk.tagger import TaggedTokenizer
    import re

    tagged_tokens = []
    for item in brown.items()[:5]:
        tagged_tokens.append(brown.tokenize(item))

    tag_set = [
        "'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl', 'abn',
        'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem', 'ben',
        'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz', 'dt', 'dt$',
        'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg', 'hvn', 'hvz',
        'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$',
        'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$',
        'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr',
        'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz', 'wdt',
        'wp$', 'wpo', 'wps', 'wql', 'wrb'
    ]

    sequences = []
    sequence = []
    start_re = re.compile(r'[^-*+]*')
    for token in tagged_tokens:
        # the multi-output allows us to treat each word as a
        # tuple of features
        for sub_token in token['SUBTOKENS']:
            sequence.append(sub_token)
            # a feature for words as lower case
            features = [sub_token['TEXT'].lower()]
            # a feature for word suffixes of length 3
            features.append(sub_token['TEXT'][-3:])
            # a feature for the length of words
            features.append(len(sub_token['TEXT']))
            # store the observation as a tuple of features
            sub_token['TEXT'] = tuple(features)
            m = start_re.match(sub_token['TAG'])
            # cleanup the tag
            tag = m.group(0)
            if tag in tag_set:
                sub_token['TAG'] = tag
            else:
                sub_token['TAG'] = '*'
            # split on the period tag
            if sub_token['TAG'] == '.':
                sequences.append(Token(SUBTOKENS=sequence))
                sequence = []

    return sequences, tag_set, 3  # 3 feature dimensions per observation
Example #7
def load_pos():
    from nltk.corpus import brown
    import re

    tagged_tokens = []
    for item in brown.items()[:5]:
        tagged_tokens.append(brown.read(item))

    tag_set = [
        "'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl', 'abn',
        'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem', 'ben',
        'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz', 'dt', 'dt$',
        'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg', 'hvn', 'hvz',
        'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$',
        'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$',
        'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr',
        'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz', 'wdt',
        'wp$', 'wpo', 'wps', 'wql', 'wrb'
    ]

    sequences = []
    sequence = []
    symbols = {}
    start_re = re.compile(r'[^-*+]*')
    for token in tagged_tokens:
        for sub_token in token['WORDS']:
            sequence.append(sub_token)
            # make words lower case
            sub_token['TEXT'] = sub_token['TEXT'].lower()
            symbols[sub_token['TEXT']] = 1
            m = start_re.match(sub_token['TAG'])
            # cleanup the tag
            tag = m.group(0)
            if tag in tag_set:
                sub_token['TAG'] = tag
            else:
                sub_token['TAG'] = '*'
            # split on the period tag
            if sub_token['TAG'] == '.':
                sequences.append(Token(SUBTOKENS=sequence))
                sequence = []

    return sequences, tag_set, symbols.keys()
Example #8
def load_pos():
    from nltk.corpus import brown
    import re

    tagged_tokens = []
    for item in brown.items()[:5]:
        tagged_tokens.append(brown.read(item))

    tag_set = [
        "'", "''", "(", ")", "*", ",", ".", ":", "--", "``", "abl", "abn",
        "abx", "ap", "ap$", "at", "be", "bed", "bedz", "beg", "bem", "ben",
        "ber", "bez", "cc", "cd", "cd$", "cs", "do", "dod", "doz", "dt", "dt$",
        "dti", "dts", "dtx", "ex", "fw", "hv", "hvd", "hvg", "hvn", "hvz",
        "in", "jj", "jjr", "jjs", "jjt", "md", "nn", "nn$", "nns", "nns$",
        "np", "np$", "nps", "nps$", "nr", "nr$", "od", "pn", "pn$", "pp$",
        "ppl", "ppls", "ppo", "pps", "ppss", "ql", "qlp", "rb", "rb$", "rbr",
        "rbt", "rp", "to", "uh", "vb", "vbd", "vbg", "vbn", "vbz", "wdt",
        "wp$", "wpo", "wps", "wql", "wrb",
    ]

    sequences = []
    sequence = []
    symbols = {}
    start_re = re.compile(r"[^-*+]*")
    for token in tagged_tokens:
        for sub_token in token["WORDS"]:
            sequence.append(sub_token)
            # make words lower case
            sub_token["TEXT"] = sub_token["TEXT"].lower()
            symbols[sub_token["TEXT"]] = 1
            m = start_re.match(sub_token["TAG"])
            # cleanup the tag
            tag = m.group(0)
            if tag in tag_set:
                sub_token["TAG"] = tag
            else:
                sub_token["TAG"] = "*"
            # split on the period tag
            if sub_token["TAG"] == ".":
                sequences.append(Token(SUBTOKENS=sequence))
                sequence = []

    return sequences, tag_set, symbols.keys()
Example #9
def demo_pos_supervised():
    from nltk.corpus import brown
    from sys import stdout
    print 'Loading data from Brown corpus...'
    tagged_tokens = []
    for item in brown.items()[:5]:
        tagged_tokens.append(brown.tokenize(item))

    tag_set = [
        "'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl', 'abn',
        'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem', 'ben',
        'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz', 'dt', 'dt$',
        'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg', 'hvn', 'hvz',
        'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$',
        'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$',
        'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr',
        'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz', 'wdt',
        'wp$', 'wpo', 'wps', 'wql', 'wrb'
    ]

    annul_nonmatching_tags(tagged_tokens, tag_set, '*')
    words, word_set, tags, tag_set = _split_tagged_tokens(tagged_tokens)

    word_set.sort()
    tag_set.sort()

    print 'output alphabet', `word_set`[:50], '...'
    print 'state labels   ', `tag_set`[:50], '...'
    print tag_set

    print 'Training HMM...'

    #print 'training data:'
    #print zip(words[1:], tags[1:])

    trainer = HMMTrainer(tag_set, word_set)
    hmm = trainer.train_supervised(
        words[100:], tags[100:],
        lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

    print hmm
    print 'Testing...'

    for ws, ts in zip(words[:3], tags[:3]):
        print ws
        print 'HMM >>>'
        print hmm.best_path(ws)
        print 'CORRECT >>>'
        print ts
        print '-' * 60

    count = correct = 0
    for ws, ts in zip(words[:100], tags[:100]):
        print '.',
        stdout.flush()
        pts = hmm.best_path(ws)
        for t, pt in zip(ts, pts):
            count += 1
            if t == pt:
                correct += 1

    print 'accuracy over first', count, 'tokens %.1f' % (100.0 * correct /
                                                         count)
Example #10
# Fortunately, the tokens of the corpora used for the learning task
# are already marked up with their tags.  These corpora can therefore
# also be used to evaluate tagging algorithms.
#

# a simple example

from nltk.tokenizer import *
from nltk.tagger import *
from nltk.corpus import brown

# training phase

train_tokens = []
for item in brown.items()[:10]:
    train_tokens.append(brown.read(item))

mytagger = UnigramTagger(SUBTOKENS='WORDS')
for tok in train_tokens:
    mytagger.train(tok)

# usage

text_token = Token(TEXT="John saw the book on the table")
WhitespaceTokenizer(SUBTOKENS='WORDS').tokenize(text_token)
mytagger.tag(text_token)
print text_token
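The opening comment of this example notes that tagged corpora can also serve as evaluation data. With the current NLTK API (the class names below are today's nltk.tag identifiers, not the old Token-based interface used in this example), a held-out evaluation of a unigram tagger could look like this sketch:

# sketch: train on one slice of the tagged corpus, score on another
from nltk.corpus import brown
from nltk.tag import UnigramTagger

sents = brown.tagged_sents(categories='news')
train, test = sents[:4000], sents[4000:4500]          # illustrative split
uni = UnigramTagger(train)
print('unigram accuracy: %.3f' % uni.evaluate(test))  # evaluate() is named accuracy() in newer NLTK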

#
# once mytagger has been built we can test it
#
# Previously, we tried to infer a token's tag based only on the
# token's own text.
# Now we will use information about a token's context (the words
# surrounding it) to infer its tag.
#

#
# Bigram taggers: always use the word closest to the token,
# usually the preceding one.
#
# N-gram taggers: always use the n words closest to the token,
# usually the n preceding words.
#

from nltk.tokenizer import *
from nltk.tagger import *
from nltk.corpus import brown

tagger = NthOrderTagger(3, SUBTOKENS='WORDS')  # 3rd order tagger
for item in brown.items()[:10]:
    tok = brown.read(item)
    tagger.train(tok)

# usage

text_token = Token(TEXT="John saw the book on the table")
WhitespaceTokenizer(SUBTOKENS='WORDS').tokenize(text_token)
tagger.tag(text_token)
print text_token
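NthOrderTagger also comes from the old NLTK tagger module; the closest counterpart in current NLTK is nltk.tag.NgramTagger (with UnigramTagger, BigramTagger and TrigramTagger as the usual special cases). A rough sketch, assuming the modern API; note that an n-gram tagger without backoff returns None for any unseen context, which is exactly what the demo() function below works around with a backoff chain.

# sketch: a higher-order tagger with the current nltk.tag classes
from nltk.corpus import brown
from nltk.tag import NgramTagger

train = brown.tagged_sents(categories='news')[:4000]
# NgramTagger(4, ...) conditions on the three preceding tags, roughly
# what NthOrderTagger(3, ...) did above
quadgram = NgramTagger(4, train)
print(quadgram.tag('John saw the book on the table'.split()))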
def demo(num_files=20):
    """
    A simple demonstration function for the C{Tagger} classes.  It
    constructs a C{BackoffTagger} using a 2nd order C{NthOrderTagger},
    a 1st order C{NthOrderTagger}, a 0th order C{NthOrderTagger}, and
    a C{DefaultTagger}.  It trains and tests the tagger using the
    Brown corpus.

    @type num_files: C{int}
    @param num_files: The number of files that should be used for
        training and for testing.  Two thirds of these files will be
        used for training.  All files are randomly selected
        (I{without} replacement) from the brown corpus.  If
        C{num_files>=500}, then all 500 files will be used.
    @rtype: None
    """
    from nltk.corpus import brown
    import sys, random
    num_files = max(min(num_files, 500), 3)

    # Get a randomly sorted list of files in the brown corpus.
    items = list(brown.items())
    random.shuffle(items)

    # Tokenize the training files.
    print '='*75
    sys.stdout.write('Reading training data'); sys.stdout.flush()
    train_tokens = []
    num_words = 0
    for item in items[:num_files*2/3]:
        sys.stdout.write('.'); sys.stdout.flush()
        train_tokens.append(brown.read(item))
        num_words += len(train_tokens[-1]['WORDS'])
    print '\n  Read in %d words for training' % num_words

    print 'Training taggers.'

    # Create a default tagger
    default_tagger = DefaultTagger('nn', SUBTOKENS='WORDS')

    print '  Training unigram tagger...'
    t0 = UnigramTagger(SUBTOKENS='WORDS')
    for tok in train_tokens: t0.train(tok)
        
    print '  Training bigram tagger...'
    t1 = NthOrderTagger(1, SUBTOKENS='WORDS')
    for tok in train_tokens: t1.train(tok)

    print '  Training trigram tagger...'
    t2 = NthOrderTagger(2, SUBTOKENS='WORDS')
    for tok in train_tokens: t2.train(tok)

    # Delete train_tokens, because it takes up lots of memory.
    del train_tokens
    
    # Tokenize the testing files
    test_tokens = []
    num_words = 0
    sys.stdout.write('Reading testing data'); sys.stdout.flush()
    for item in items[num_files*2/3:num_files]:
        sys.stdout.write('.'); sys.stdout.flush()
        test_tok = brown.read(item)
        num_words += len(test_tok['WORDS'])
        test_tokens.append(test_tok)
    print '\n  Read in %d words for testing' % num_words

    # Run the taggers.  For t0, t1, and t2, back-off to DefaultTagger.
    # This is especially important for t1 and t2, which count on
    # having known tags as contexts; if they get a context containing
    # None, then they will generate an output of None, and so all
    # words will get tagged as None.
    print '='*75
    print 'Running the taggers on test data...'
    print '  Default (nn) tagger: ',
    sys.stdout.flush()
    _demo_tagger(test_tokens, default_tagger)
    print '  Unigram tagger:      ',
    sys.stdout.flush()
    _demo_tagger(test_tokens, BackoffTagger([t0, default_tagger], SUBTOKENS='WORDS'))
    print '  Bigram tagger:       ',
    sys.stdout.flush()
    _demo_tagger(test_tokens, BackoffTagger([t1, t0, default_tagger], SUBTOKENS='WORDS'))
    print '  Trigram tagger:      ',
    sys.stdout.flush()
    trigram = BackoffTagger([t2, t1, t0, default_tagger], SUBTOKENS='WORDS')
    _demo_tagger(test_tokens, trigram)

    print '\nUsage statistics for the trigram tagger:\n'
    trigram.print_usage_stats()
    print '='*75
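The BackoffTagger class used above has no direct equivalent in current NLTK; the same cascade is now expressed through the backoff= argument of each tagger. A sketch of demo()'s default → unigram → bigram → trigram chain in today's API; the corpus slice is illustrative, and since the Brown corpus distributed with current NLTK uses upper-case tags, the default tag becomes 'NN' rather than 'nn'.

# sketch: demo()'s backoff chain with the current nltk.tag classes
from nltk.corpus import brown
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

sents = brown.tagged_sents(categories='news')
train, test = sents[:4000], sents[4000:4500]

t_default = DefaultTagger('NN')               # demo() used lower-case 'nn'
t0 = UnigramTagger(train, backoff=t_default)
t1 = BigramTagger(train, backoff=t0)
t2 = TrigramTagger(train, backoff=t1)

print('trigram + backoff accuracy: %.3f' % t2.evaluate(test))  # evaluate() is named accuracy() in newer NLTK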