Example #1
def load_pos():
    from en.parser.nltk_lite.corpora import brown
    from itertools import islice
    import re  # needed for the tag-cleanup regexp below

    sentences = list(islice(brown.tagged(), 100))

    tag_set = [
        "'", "''", '(', ')', '*', ',', '.', ':', '--', '``', 'abl', 'abn',
        'abx', 'ap', 'ap$', 'at', 'be', 'bed', 'bedz', 'beg', 'bem', 'ben',
        'ber', 'bez', 'cc', 'cd', 'cd$', 'cs', 'do', 'dod', 'doz', 'dt', 'dt$',
        'dti', 'dts', 'dtx', 'ex', 'fw', 'hv', 'hvd', 'hvg', 'hvn', 'hvz',
        'in', 'jj', 'jjr', 'jjs', 'jjt', 'md', 'nn', 'nn$', 'nns', 'nns$',
        'np', 'np$', 'nps', 'nps$', 'nr', 'nr$', 'od', 'pn', 'pn$', 'pp$',
        'ppl', 'ppls', 'ppo', 'pps', 'ppss', 'ql', 'qlp', 'rb', 'rb$', 'rbr',
        'rbt', 'rp', 'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbz', 'wdt',
        'wp$', 'wpo', 'wps', 'wql', 'wrb'
    ]

    symbols = set()
    # keep only the leading part of a tag, before any '-', '*' or '+'
    start_re = re.compile(r'[^-*+]*')
    for sentence in sentences:
        for i in range(len(sentence)):
            word, tag = sentence[i]
            word = word.lower()  # normalize
            symbols.add(word)  # log this word
            m = start_re.match(tag)
            tag = m.group(0)  # clean up the tag
            if tag not in tag_set:
                tag = '*'
            sentence[i] = (word, tag)  # store cleaned-up tagged token

    return sentences, tag_set, list(symbols)
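
A minimal usage sketch of the function above (only names that load_pos itself returns are used):

sentences, tag_set, symbols = load_pos()
print(sentences[0])  # first cleaned (word, tag) sentence
print(len(tag_set), "tags,", len(symbols), "distinct words")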
Example #2
def demo():
    """
    Demonstrates how to use IndexConcordance and Aggregator.
    """
    print "Reading Brown Corpus into memory..."
    corpus = list(brown.tagged(('a', 'j')))
    print "Generating index..."
    ic = IndexConcordance(corpus)
    print "Showing all occurences of 'plasma' in the Brown Corpus..."
    ic.formatted(middleRegexp="^plasma/.*", verbose=True)

    print "Investigating the collocates of 'deal' and derivatives..."
    agg = Aggregator()
    agg.add(
        ic.raw(middleRegexp="^deal",
               leftContextLength=1,
               rightContextLength=0,
               leftRegexp="^(\w|\s|/)*$"),
        "Brown Corpus 'deal' left collocates")
    agg.add(
        ic.raw(middleRegexp="^deal",
               leftContextLength=0,
               rightContextLength=1,
               rightRegexp="^(\w|\s|/)*$"),
        "Brown Corpus 'deal' right collocates")
    agg.formatted(showFirstX=5, usePOS=False)
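
The same index can serve further queries; a hypothetical follow-up using the API shown above (the word 'radiation' is chosen purely for illustration):

# Hypothetical extra query against the same IndexConcordance instance.
ic.formatted(middleRegexp="^radiation/.*", verbose=False)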
Example #3
def demo():
    from en.parser.nltk_lite.corpora import brown
    from itertools import islice
    from pprint import pprint

    pprint(list(islice(brown.raw('a'), 0, 5)))

    pprint(list(islice(brown.tagged('a'), 0, 5)))
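
brown.raw yields each sentence as a plain list of words, while brown.tagged yields lists of (word, tag) pairs. Both are generators, so islice can page through a section without materializing it; a small sketch fetching the next five sentences:

# Page through section 'a': sentences 5-9 this time.
pprint(list(islice(brown.raw('a'), 5, 10)))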
Example #4
def demo():
    from en.parser.nltk_lite.corpora import brown
    from itertools import islice

    # load the training corpus
    train_sents = list(islice(brown.tagged(), 500))

    # create a marshalling trigram tagger
    tagger = MarshalNgram(3)

    #tagger.train(train_sents)
    #tagger.marshal("ngram.test")

    tagger.unmarshal("ngram.test")
    print(tagger._model)
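
The commented-out calls show the save half of the round trip. A sketch of the full flow, assuming marshal/unmarshal write and read the model under the given filename as the calls above suggest:

# Hypothetical round trip: train, save, then reload into a fresh tagger.
tagger = MarshalNgram(3)
tagger.train(train_sents)
tagger.marshal("ngram.test")     # write the model to disk

tagger2 = MarshalNgram(3)
tagger2.unmarshal("ngram.test")  # read it back
assert tagger2._model == tagger._model  # assumes _model supports ==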
Example #5
def demo():
    """
    A simple demonstration function for the C{Tagger} classes.  It
    constructs a backoff chain from a trigram tagger, a bigram tagger,
    a unigram tagger, and a default tagger, then trains and tests the
    taggers on the Brown corpus.
    """
    from en.parser.nltk_lite.corpora import brown
    import sys

    print('Training taggers.')

    # Create a default tagger
    t0 = Default('nn')

    #    t1a = Affix(length=-3, minlength=5, backoff=t0)
    #    t1b = Unigram(cutoff=2, backoff=t1a)
    t1 = Unigram(cutoff=1, backoff=t0)
    t2 = Bigram(cutoff=1, backoff=t1)
    t3 = Trigram(backoff=t2)

    t1.train(brown.tagged('a'), verbose=True)
    t2.train(brown.tagged('a'), verbose=True)
    t3.train(brown.tagged('a'), verbose=True)

    # Run the taggers.  For t0, t1, and t2, back-off to the default tagger.
    # This is especially important for t1 and t2, which count on
    # having known tags as contexts; if they get a context containing
    # None, then they will generate an output of None, and so all
    # words will get tagged as None.

    print('=' * 75)
    print('Running the taggers on test data...')
    print('  Default (nn) tagger: ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t0, brown.tagged('b'))

    print('  Unigram tagger:      ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t1, list(brown.tagged('b'))[:1000])

    print('  Bigram tagger:       ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t2, list(brown.tagged('b'))[:1000])

    print('  Trigram tagger:      ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t3, list(brown.tagged('b'))[:1000])
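
The helper _demo_tagger is not shown in this listing. A minimal sketch consistent with how it is called here, assuming the nltk_lite convention that a tagger's tag() maps a sequence of words to (word, tag) pairs:

def _demo_tagger(tagger, gold):
    # Hypothetical helper: strip the gold tags, re-tag the words,
    # and report per-token accuracy.
    correct = total = 0
    for sentence in gold:
        words = [word for (word, tag) in sentence]
        for guess, answer in zip(tagger.tag(words), sentence):
            total += 1
            correct += (guess == answer)
    print('Accuracy: %.1f%%' % (100.0 * correct / total))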
Example #6
def demo():
    """
    A simple demonstration function for the C{Tagger} classes.  It
    constructs several simple taggers (unigram, affix, regexp, and
    lookup), each backing off to a default tagger, and tests them
    on the Brown corpus.
    """
    from en.parser.nltk_lite.corpora import brown
    from en.parser.nltk_lite import tag
    import sys

    print('Training taggers.')

    # Create a default tagger
    t0 = tag.Default('nn')

    t1 = tag.Unigram(cutoff=1, backoff=t0)
    t1.train(brown.tagged('a'), verbose=True)

    t2 = tag.Affix(-3, 5, cutoff=2, backoff=t0)
    t2.train(brown.tagged('a'), verbose=True)

    t3 = tag.Regexp([(r'.*ed', 'vbd')], backoff=t0)  # no training

    t4 = tag.Lookup({'the': 'dt'}, backoff=t0)

    print('='*75)
    print('Running the taggers on test data...')
    print('  Default (nn) tagger: ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t0, brown.tagged('b'))

    print('  Unigram tagger:      ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t1, list(brown.tagged('b'))[:1000])

    print('  Affix tagger:        ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t2, list(brown.tagged('b'))[:1000])

    print('  Regexp tagger:       ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t3, list(brown.tagged('b'))[:1000])

    print('  Lookup tagger:       ', end=' ')
    sys.stdout.flush()
    _demo_tagger(t4, list(brown.tagged('b'))[:1000])
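
The single '.*ed' rule above extends naturally; a hypothetical larger rule list in the same style (patterns are illustrative, not taken from the source project):

# Hypothetical extension of the Regexp tagger with more suffix rules,
# still backing off to the default tagger.
t3 = tag.Regexp([
    (r'.*ing$', 'vbg'),  # gerunds
    (r'.*ed$', 'vbd'),   # simple past
    (r'.*es$', 'vbz'),   # 3rd-person singular present
    (r'.*ly$', 'rb'),    # adverbs
], backoff=t0)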