Example #1
0
def crossValidate(corpus, test_precent):
    """Cross-validate a bigram backoff tagger chain over `corpus`.

    `test_precent` is the size of each test fold as a percentage of the
    corpus (e.g. 10 -> ten folds of 10% each).  Returns a tuple
    (per-fold (index, accuracy) list, mean accuracy across folds).
    """
    summarize = []
    corpus_len = len(corpus)
    cut = int((test_precent / 100.0) * corpus_len)
    # BUG FIX: on Python 3 `corpus_len / cut` is a float, which is invalid
    # both for range() and as the final divisor; use integer division.
    n_folds = corpus_len // cut
    mean = 0.0
    for i in range(n_folds):
        test = corpus[i * cut:cut * (i + 1)]
        train = corpus[:i * cut] + corpus[cut * (i + 1):]

        # Backoff chain: bigram -> unigram -> affix -> regexp -> default NN.
        nn_tagger = nltk.DefaultTagger('NN')
        regexp_tagger = nltk.RegexpTagger(
            [
                (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
                (r'(The|the|A|a|An|an)$', 'AT'),  # articles
                (r'.*able$', 'JJ'),  # adjectives
                (r'.*ness$', 'NN'),  # nouns formed from adjectives
                (r'.*ly$', 'RB'),  # adverbs
                (r'.*s$', 'NNS'),  # plural nouns
                (r'.*ing$', 'VBG'),  # gerunds
                (r'.*ed$', 'VBD'),  # past tense verbs
                (r'.*', 'NN')  # nouns (default)
            ],
            backoff=nn_tagger)
        at2 = nltk.AffixTagger(train, backoff=regexp_tagger)
        ut3 = nltk.UnigramTagger(train, backoff=at2)
        ct2 = nltk.NgramTagger(2, train, backoff=ut3)

        accu = float(ct2.evaluate(test))
        summarize.append((i, accu))
        mean += accu
    return (summarize, mean / n_folds)
Example #2
0
def _build_tagger():
    """Lazily initialise the global POS `tagger`.

    No-op when `tagger` is already set.  Otherwise unpickles it from
    `tagger_path` when that file exists, or trains a 4-gram backoff chain
    on the floresta + mac_morpho corpora and saves it for future runs.
    """
    global tagger

    # Already initialised: nothing to do (checked before touching the
    # filesystem; the original built the Path object first for no reason).
    if tagger is not None:
        return

    tagger_file = Path(tagger_path)  # renamed: `file` shadows a builtin
    if tagger_file.is_file():
        tagger = object_io.read_object(tagger_path)
    else:
        print('{} - Building train data...'.format(datetime.now()))

        dataset = nltk.corpus.floresta.tagged_sents() + \
                  nltk.corpus.mac_morpho.tagged_sents()
        traindata = [[(w, _simplify_tag(t)) for (w, t) in sent]
                     for sent in dataset]

        print('{} - Training POS tagging model...'.format(datetime.now()))

        # 4-gram tagger backing off trigram -> bigram -> unigram -> 'NOUN'.
        tagger = nltk.NgramTagger(
            4,
            traindata,
            backoff=nltk.TrigramTagger(
                traindata,
                backoff=nltk.BigramTagger(
                    traindata,
                    backoff=nltk.UnigramTagger(
                        traindata, backoff=nltk.DefaultTagger('NOUN')))))

        print('{} - Saving tagger object...'.format(datetime.now()))

        object_io.save_object(tagger, tagger_path)
Example #3
0
def main():
    """Train the Brown-news bigram backoff chain and print its confusion
    matrix on a 100-sentence held-out set."""
    nltk.TaggerI.ConfusionMatrix = ConfusionMatrix

    brown_news_tagged = brown.tagged_sents(categories='news')
    brown_train = brown_news_tagged[100:]
    brown_test = brown_news_tagged[:100]

    # Backoff chain: bigram -> unigram -> affix -> regexp -> default NN.
    nn_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger(
        [
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),  # adjectives
            (r'.*ness$', 'NN'),  # nouns formed from adjectives
            (r'.*ly$', 'RB'),  # adverbs
            (r'.*s$', 'NNS'),  # plural nouns
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # past tense verbs
            (r'.*', 'NN')  # nouns (default)
        ],
        backoff=nn_tagger)
    at2 = nltk.AffixTagger(brown_train, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(brown_train, backoff=at2)
    ct2 = nltk.NgramTagger(2, brown_train, backoff=ut3)

    # BUG FIX: the Python 2 print statement is a syntax error on Python 3.
    print(ct2.ConfusionMatrix(brown_test))
Example #4
0
def getTrainedTagger():
    """Train and return a bigram backoff tagger (bigram -> unigram ->
    affix -> regexp -> default 'NN') on the lower-cased, simplified-tag
    Brown corpus."""
    train = brown.tagged_sents(simplify_tags=True)
    # Lower-case every word while keeping its tag.
    newTrain = [[(word.lower(), tag) for word, tag in sen] for sen in train]
    nn_tagger = nltk.DefaultTagger('NN')
    patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'.*', 'NN')  # nouns (default)
    ]
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=nn_tagger)
    at2 = nltk.AffixTagger(newTrain, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(newTrain, backoff=at2)
    return nltk.NgramTagger(2, newTrain, backoff=ut3)
Example #5
0
def question5():
    """Evaluate NgramTaggers of order 1..6 on a 90/10 Brown 'news' split,
    printing one `order: accuracy` line per tagger."""
    a = brown.tagged_sents(categories='news')
    split = int(len(a) * 0.9)
    train = a[:split]
    test = a[split:]
    n = 6
    # BUG FIX: the original looped `range(n)`, so the first pass asked for
    # an invalid 0-gram tagger, and each printed label (i + 1) disagreed
    # with the order actually used (i).
    for order in range(1, n + 1):
        x = nltk.NgramTagger(order, train)
        print(str(order) + ': ' + str(x.evaluate(test)))
Example #6
0
def trainTagger(pos_tagged):
    """Train a 4-gram backoff tagger chain on 90% of `pos_tagged`, print
    its accuracy on the remaining 10%, and return the trained tagger."""
    size = int(len(pos_tagged) * 0.9)
    train_sents = pos_tagged[:size]
    test_sents = pos_tagged[size:]

    # BUG FIX: the original passed train=pos_tagged (the *full* corpus) to
    # every tagger and then evaluated on test_sents — a slice of the same
    # data — so the reported accuracy was inflated by data leakage, and
    # train_sents was computed but never used.  Train on train_sents only.
    tagger = nltk.UnigramTagger(train=train_sents,
                                verbose=True,
                                backoff=nltk.DefaultTagger('None'))
    tagger = nltk.BigramTagger(train=train_sents, verbose=True, backoff=tagger)
    tagger = nltk.NgramTagger(3,
                              train=train_sents,
                              verbose=True,
                              backoff=tagger)
    tagger = nltk.NgramTagger(4,
                              train=train_sents,
                              verbose=True,
                              backoff=tagger)
    print('(train={} , test={} , evaluate={})'.format(
        size,
        len(pos_tagged) - size, tagger.evaluate(test_sents)))

    return tagger
Example #7
0
 def getTagger(self):
     """Build and return a bigram tagger trained on the Brown 'news'
     tagged sentences, backing off unigram -> affix -> regexp -> 'NN'."""
     tagged_sents = brown.tagged_sents(categories='news')
     fallback_nn = nltk.DefaultTagger('NN')
     word_shape_patterns = [
         (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),  # articles
         (r'.*able$', 'JJ'),  # adjectives
         (r'.*ness$', 'NN'),  # nouns formed from adjectives
         (r'.*ly$', 'RB'),  # adverbs
         (r'.*s$', 'NNS'),  # plural nouns
         (r'.*ing$', 'VBG'),  # gerunds
         (r'.*ed$', 'VBD'),  # past tense verbs
         (r'.*', 'NN')  # nouns (default)
     ]
     shape_tagger = nltk.RegexpTagger(word_shape_patterns,
                                      backoff=fallback_nn)
     affix_level = nltk.AffixTagger(tagged_sents, backoff=shape_tagger)
     unigram_level = nltk.UnigramTagger(tagged_sents, backoff=affix_level)
     return nltk.NgramTagger(2, tagged_sents, backoff=unigram_level)
Example #8
0
def getTaggerAndTestSetInSimplifiedMode(taggerName):
    """Build the simplified-tagset Brown backoff chain and return the
    requested link of it together with the 100-sentence test set.

    `taggerName` is one of "DefaultTagger", "RegExpTagger", "AffixTagger",
    "UnigramTagger" or "BigramTagger".  As in the original, any other name
    falls through and returns None implicitly.
    """
    brown_news_taggedS = brown.tagged_sents(categories='news',
                                            simplify_tags=True)
    brown_trainS = brown_news_taggedS[100:]
    brown_testS = brown_news_taggedS[:100]

    nn_taggerS = nltk.DefaultTagger('NN')
    regexp_taggerS = nltk.RegexpTagger(
        [
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),  # adjectives
            (r'.*ness$', 'NN'),  # nouns formed from adjectives
            (r'.*ly$', 'RB'),  # adverbs
            (r'.*s$', 'NNS'),  # plural nouns
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # past tense verbs
            (r'.*', 'NN')  # nouns (default)
        ],
        backoff=nn_taggerS)
    at2S = nltk.AffixTagger(brown_trainS, backoff=regexp_taggerS)
    ut3S = nltk.UnigramTagger(brown_trainS, backoff=at2S)
    ct2S = nltk.NgramTagger(2, brown_trainS, backoff=ut3S)

    # Flattened from a five-level if/else pyramid into a lookup table.
    taggers = {
        "DefaultTagger": nn_taggerS,
        "RegExpTagger": regexp_taggerS,
        "AffixTagger": at2S,
        "UnigramTagger": ut3S,
        "BigramTagger": ct2S,
    }
    if taggerName in taggers:
        return taggers[taggerName], brown_testS
Example #9
0
def main():
    """Train the Brown-news backoff chain (regexp defaulting to 'UNKNOWN')
    and print evaluate2's (unknown-word, known-word) accuracy pair for
    each link of the chain."""
    nltk.TaggerI.evaluate2 = evaluate2

    brown_news_tagged = brown.tagged_sents(categories='news')
    brown_train = brown_news_tagged[100:]
    brown_test = brown_news_tagged[:100]

    regexp_tagger = nltk.RegexpTagger(
        [
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),  # adjectives
            (r'.*ness$', 'NN'),  # nouns formed from adjectives
            (r'.*ly$', 'RB'),  # adverbs
            (r'.*s$', 'NNS'),  # plural nouns
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # past tense verbs
            (r'.*', 'UNKNOWN')  # unkonwn (default)
        ],
        backoff=None)
    at2 = nltk.AffixTagger(brown_train, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(brown_train, backoff=at2)
    ct2 = nltk.NgramTagger(2, brown_train, backoff=ut3)

    # BUG FIX: Python 2 print statements are syntax errors on Python 3.
    # The format string consumes e[0]; e[1] is passed as a second print
    # argument (space-separated), matching the original output.
    e = regexp_tagger.evaluate2(brown_test)
    print("evaluate2 regExp(default unknown) = accoracy unkown words: %f ,accuracy known words: " % e[0], e[1])
    e = at2.evaluate2(brown_test)
    print("evaluate2 affix(regExp(default unknown)) = accoracy unkown words: %f ,accuracy known words: " % e[0], e[1])
    e = ut3.evaluate2(brown_test)
    print("evaluate2 unigram(affix(regExp(default unknown))) = accoracy unkown words: %f ,accuracy known words: " % e[0], e[1])
    e = ct2.evaluate2(brown_test)
    print("evaluate2 bigram(unigram(affix(regExp(default unknown)))) = accoracy unkown words: %f ,accuracy known words: " % e[0], e[1])
Example #10
0
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import brown

# 90/10 train/test split of the Brown 'news' tagged sentences.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

print('Ngram Tagger Ecaluate Score')
print('    train_sents test_sents')

# Train NgramTaggers of order 1..6 and print accuracy on the training data
# next to accuracy on the held-out data (higher orders typically score far
# better on data they were trained on than on unseen sentences).
for i in range(1, 7):
    ngram_tagger = nltk.NgramTagger(i, train_sents)
    print('i=%d      %.4f     %.4f' % (i, ngram_tagger.evaluate(train_sents),
                                       ngram_tagger.evaluate(test_sents)))
Example #11
0
def process_text(text):
    """Lower-case, tokenize, stop-word-filter and stem `text`.

    Returns a list of processed tokens (possibly empty).  Relies on
    module-level `tokenizer`, `stopwords` and `stemmer` objects.
    """
    text = text.lower()
    # Tokenizing
    tokens = [
        token for token in tokenizer.tokenize(text) if token not in stopwords
    ]
    # Stemming.  BUG FIX: on Python 3 `map` returns a lazy iterator, which
    # is always truthy — so the caller's `if tokens:` guard never filtered
    # out empty results.  Materialise a real list instead.
    tokens = [stemmer.stem(token) for token in tokens]
    # # Lemmatizing
    # tokens = map(lemmatizer.lemmatize, tokens)

    return tokens


# Script entry point: train a bigram tagger that "tags" tweet tokens with
# the tweet's sentiment label, and print its accuracy on a held-out 25%.
if __name__ == '__main__':

    df = pd.read_csv('dataset.csv', nrows=80000, error_bad_lines=False)

    # One "sentence" per tweet: every token paired with the tweet sentiment.
    tagged_tokens = []
    for tweet, sentiment in df[['SentimentText', 'Sentiment']].values:
        tokens = process_text(tweet)
        if tokens:
            tagged_tokens.append([(token, sentiment) for token in tokens])

    # 75/25 split.  BUG FIX: `len(...) * 3 / 4` is a float on Python 3 and
    # floats are not valid slice indices; use integer division.
    split = len(tagged_tokens) * 3 // 4
    training_data = tagged_tokens[:split]
    test_data = tagged_tokens[split:]

    tagger = nltk.NgramTagger(2, train=training_data)
    # BUG FIX: Python 2 print statement -> print() call.
    print(tagger.evaluate(test_data))
Example #12
0
def _train_backoff_bigram(train):
    """Train the bigram -> unigram -> affix -> regexp -> default-NN chain
    on `train` (shared by all three experiments in main())."""
    nn_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
                                       (r'(The|the|A|a|An|an)$', 'AT'),   # articles
                                       (r'.*able$', 'JJ'),                # adjectives
                                       (r'.*ness$', 'NN'),                # nouns formed from adjectives
                                       (r'.*ly$', 'RB'),                  # adverbs
                                       (r'.*s$', 'NNS'),                  # plural nouns
                                       (r'.*ing$', 'VBG'),                # gerunds
                                       (r'.*ed$', 'VBD'),                 # past tense verbs
                                       (r'.*', 'NN')                      # nouns (default)
                                       ], backoff=nn_tagger)
    at2 = nltk.AffixTagger(train, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(train, backoff=at2)
    return nltk.NgramTagger(2, train, backoff=ut3)


def main():
    """Compare three 90/10 train/test split strategies (random, stratified
    by sentence length, stratified by genre) for the same bigram backoff
    tagger.  The identical chain construction, previously copy-pasted
    three times, is factored into _train_backoff_bigram; the Python-2
    print statements are converted to print() calls."""
    #ploting the distribution graph
#    getDistSentByLength()
    #############################################################
    #cycle of training-testing First case - Random split 90%-10%#
    #############################################################
    train, test = stratifiedSamples([getAllTaggedCorpus()], 10)
    ct2 = _train_backoff_bigram(train)
    print("evaluate bigram(unigram(affix(regExp(default nn)))) Random Split= ", ct2.evaluate(test))

    ###############################################################################################
    #cycle of training-testing second case - Stratified split 90%-10% according to sentence length#
    ###############################################################################################
    classes = divideToLengthClasses()
    train, test = stratifiedSamples(classes, 10)
    ct2 = _train_backoff_bigram(train)
    print("evaluate bigram(unigram(affix(regExp(default nn)))) Length split = ", ct2.evaluate(test))

    #################################################################################################
    #cycle of training-testing Third case - Stratified split 90%-10% according to the sentence genre#
    #################################################################################################
    classes = divideToGenereClasses()
    train, test = stratifiedSamples(classes, 10)
    ct2 = _train_backoff_bigram(train)
    print("evaluate bigram(unigram(affix(regExp(default nn)))) Genere split = ", ct2.evaluate(test))
Example #13
0
    (r"^[nN]est[ae]s?$", "ADP"),
    (r"^[nN]um$", "ADP"),
    (r"^[nN]ess[ae]s?$", "ADP"),
    (r"^[nN]aquel[ae]s?$", "ADP"),
    (r"^\xe0$", "ADP"),
]


# Build a deep backoff chain: the regex patterns above win first (outermost
# tagger), then 10-gram down through unigram, then a suffix (affix_length=-4)
# tagger, and finally everything left is tagged 'NOUN'.
# NOTE(review): `traindata` is defined earlier in the file, outside this
# fragment.
tagger = nltk.RegexpTagger(regex_patterns,
        backoff = nltk.NgramTagger(10, traindata,
        backoff = nltk.NgramTagger(9, traindata,
        backoff = nltk.NgramTagger(8, traindata,
        backoff = nltk.NgramTagger(7, traindata,
        backoff = nltk.NgramTagger(6, traindata,
        backoff = nltk.NgramTagger(5, traindata,
        backoff = nltk.NgramTagger(4, traindata,
        backoff = nltk.NgramTagger(3, traindata,
        backoff = nltk.NgramTagger(2, traindata,
        backoff=nltk.UnigramTagger(traindata,
        backoff=nltk.AffixTagger(traindata, affix_length=-4,
        backoff=nltk.DefaultTagger("NOUN")
        ))))))))))))

# Refine the chain with transformation-based (Brill) learning, capped at
# 100 learned rules.
templates = nltk.brill.fntbl37()
tagger = nltk.BrillTaggerTrainer(tagger, templates)
tagger = tagger.train(traindata, max_rules=100)

# Persist the trained tagger to disk.
with open("tagger_2.pkl", "wb") as f:
    pickle.dump(tagger, f)

Example #14
0
# Concept of N-Gram tagging

from nltk.corpus import brown
import nltk

brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# NOTE(review): the n-gram order here is len(brown_tagged_sents) — i.e.
# thousands — which looks like a mistake for a small constant n; confirm
# the intended order.
tagger = nltk.NgramTagger(len(brown_tagged_sents), train=brown_tagged_sents)
print(
    tagger.tag(
        nltk.word_tokenize('We are using the programming language Python')))

# Evaluating against the same sentences the tagger was trained on.
print(tagger.evaluate(brown_tagged_sents))
# NOTE(review): DefaultTagger, most_common_tag and tokenised_barack are not
# defined in this fragment — presumably defined earlier in the file.
default_tagger = DefaultTagger(most_common_tag)
def_tagged_barack = default_tagger.tag(tokenised_barack)
print(def_tagged_barack)

print(
    "____________________Lookup Taggers_____________________________________")
#lookup taggers
#Ngarm Taggers Context dependent taggers

sent1 = "the quick brown fox jumps over the lazy dog"
training_tags = nltk.pos_tag(nltk.word_tokenize(sent1))
print(training_tags)
print(list(nltk.ngrams(nltk.word_tokenize(sent1), 2)))
#now use these tags to train Ngarms tagger
# Train a bigram tagger from this single tagged sentence.
ngarm_tagger = nltk.NgramTagger(n=2, train=[training_tags])
print(ngarm_tagger)

sent2 = "the lazy dog was jumped over by the quick brown fox"
training_tags_sent2 = nltk.pos_tag(nltk.word_tokenize(sent2))
print(list(nltk.ngrams(nltk.word_tokenize(sent2), 2)))
#print(training_tags_sent2)
# Tag a different sentence with the tagger trained on sent1.
sent2_taggers = ngarm_tagger.tag(nltk.word_tokenize(sent2))
print(sent2_taggers)

print("________________unigrams Taggers_____________________________________")
#unigrams tagger

bush ="George Walker Bush (born July 6, 1946) is an American politician who served as the 43rd President of the United States" \
      " from 2001 to 2009. He had previously served as the 46th Governor of Texas from 1995 to 2000.Bush was born in New Haven, " \
      "Connecticut, and grew up in Texas. After graduating from Yale University in 1968 and Harvard Business School in 1975, he" \
Example #16
0
# NOTE(review): fragment — `unigram_tagger` and `brown_tagged_sents` are
# used before any definition visible here; presumably defined earlier in
# the file.
print(unigram_tagger.tag(nltk.word_tokenize('I am studying NLP')))
#None for unseen word

print(unigram_tagger.evaluate(brown_tagged_sents))

#Separating Training and Testing

# 90/10 train/test split.
size = int(len(brown_tagged_sents) * 0.9)

train = brown_tagged_sents[:size]
test = brown_tagged_sents[size:]

unigram_tagger = nltk.UnigramTagger(train)

print(unigram_tagger.evaluate(test))

########################################################
###############    NGram Tagger        ###############
########################################################

#Judges the tag based on the other N-1 tags, analyzes word and context
brown_tagged_sents = brown.tagged_sents(categories='news')

brown_sents = brown.sents(categories='news')

#Ngram tagger - expects a value of N - num of tokes to judge the tag
ngram_tagger = nltk.NgramTagger(4, train=brown_tagged_sents)

print(ngram_tagger.tag(nltk.word_tokenize('We are studying NLP')))
Example #17
0
    # Fragment: continues building `data` begun earlier in this function —
    # lower-case each word and simplify its tag for every non-empty sentence.
    data += [[(w.lower(), simplificarTag(t)) for (w, t) in sentenca]
             for sentenca in sentencas_floresta if sentenca]

    sentencas_mac_morpho = nltk.corpus.mac_morpho.tagged_sents()
    data += [[(w.lower(), simplificarTag(t)) for (w, t) in sentenca]
             for sentenca in sentencas_mac_morpho if sentenca]

    # NOTE(review): train and test sets are the same data, so the accuracy
    # printed below is training accuracy, not generalisation.
    base = data
    teste = data

    print('Treinando tagger. Isso pode demorar...')
    # 4-gram tagger backing off trigram -> bigram -> unigram -> default 'n'.
    _tagger = nltk.NgramTagger(4,
                               base,
                               backoff=nltk.TrigramTagger(
                                   base,
                                   backoff=nltk.BigramTagger(
                                       base,
                                       backoff=nltk.UnigramTagger(
                                           base,
                                           backoff=nltk.DefaultTagger('n')))))

    print('Tagger treinado com sucesso! Precisão de %.1f!' %
          (_tagger.evaluate(teste) * 100))

    try:
        print('Salvando tagger...')

        # Pickle the trained tagger (protocol -1 = highest available).
        # NOTE(review): the except/finally for this try lies outside the
        # visible fragment.
        output = open(CAMINHO_DUMP, 'wb')
        dump(_tagger, output, -1)
        output.close()
 def __init__(self, n, train_sents):
     """Train an order-`n` NgramTagger over the (POS-tag, IOB-chunk-tag)
     sequences extracted from the chunk trees in `train_sents`."""
     tag_chunk_seqs = []
     for tree in train_sents:
         triples = nltk.chunk.tree2conlltags(tree)
         tag_chunk_seqs.append([(pos, iob) for _word, pos, iob in triples])
     self.tagger = nltk.NgramTagger(n, tag_chunk_seqs)
Example #19
0
        # Fragment: interior of a loop begun earlier in this function —
        # rebuild each sentence as a list of (word, tag) pairs and collect
        # the converted sentences into conv_data.
        conv_sent = []
        for word, tag in sentence:
            conv_sent.append((word, tag))
        conv_data.append(conv_sent)
    return conv_data


# BUG FIX: raw string for the Windows path — '\C' and '\T' are invalid
# escape sequences in a normal string literal.
os.chdir(r'C:\CourseWork\Term5')

# Load the tagged train/test sets.  BUG FIX: json.load(open(...)) leaked
# the file handles; close them deterministically with `with`.
with open('train.txt') as train_file:
    train = json.load(train_file)
with open('test.txt') as test_file:
    test = json.load(test_file)

train_data = convert_data(train)
test_data = convert_data(test)

default_tagger = nltk.DefaultTagger('NN')

# n-gram taggers of increasing order, each backing off to the previous one.
# The evaluate() return values are discarded, as in the original
# (presumably run interactively, where each result would be displayed).
unigram_tagger = nltk.NgramTagger(1, train_data, backoff=default_tagger)
unigram_tagger.evaluate(test_data)

bigram_tagger = nltk.NgramTagger(2, train_data, backoff=unigram_tagger)
bigram_tagger.evaluate(test_data)

trigram_tagger = nltk.NgramTagger(3, train_data, backoff=bigram_tagger)
trigram_tagger.evaluate(test_data)

fourgram_tagger = nltk.NgramTagger(4, train_data, backoff=trigram_tagger)
fourgram_tagger.evaluate(test_data)

fivegram_tagger = nltk.NgramTagger(5, train_data, backoff=fourgram_tagger)
fivegram_tagger.evaluate(test_data)
Example #20
0
def main():
    """Train the Brown-news backoff chain and report macro and micro
    accuracy for each link, per-tag precision/recall for the default
    tagger, and the hardest tags for the full bigram chain.

    All Python 2 print statements (syntax errors on Python 3) are
    converted to print() calls; output is unchanged.
    """
    nltk.TaggerI.MicroEvaluate = MicroEvaluate

    brown_news_tagged = brown.tagged_sents(categories='news')
    brown_train = brown_news_tagged[100:]
    brown_test = brown_news_tagged[:100]

    # Backoff chain: bigram -> unigram -> affix -> regexp -> default NN.
    nn_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger(
        [
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),  # adjectives
            (r'.*ness$', 'NN'),  # nouns formed from adjectives
            (r'.*ly$', 'RB'),  # adverbs
            (r'.*s$', 'NNS'),  # plural nouns
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # past tense verbs
            (r'.*', 'NN')  # nouns (default)
        ],
        backoff=nn_tagger)
    at2 = nltk.AffixTagger(brown_train, backoff=regexp_tagger)
    ut3 = nltk.UnigramTagger(brown_train, backoff=at2)
    ct2 = nltk.NgramTagger(2, brown_train, backoff=ut3)

    print("evaluate default nn = ", nn_tagger.evaluate(brown_test))
    print("evaluate regExp(default nn) = ", regexp_tagger.evaluate(brown_test))
    print("evaluate affix(regExp(default nn)) = ", at2.evaluate(brown_test))
    print("evaluate unigram(affix(regExp(default nn))) = ",
          ut3.evaluate(brown_test))
    print("evaluate bigram(unigram(affix(regExp(default nn)))) = ",
          ct2.evaluate(brown_test))
    print("")

    print("micro-evaluate default nn = ", nn_tagger.MicroEvaluate(brown_test))
    print("micro-evaluate regExp(default nn) = ",
          regexp_tagger.MicroEvaluate(brown_test))
    print("micro-evaluate affix(regExp(default nn)) = ",
          at2.MicroEvaluate(brown_test))
    print("micro-evaluate unigram(affix(regExp(default nn))) = ",
          ut3.MicroEvaluate(brown_test))
    print("micro-evaluate bigram(unigram(affix(regExp(default nn)))) = ",
          ct2.MicroEvaluate(brown_test))
    print("")

    print("default nn prec tag = AT => ",
          checkTaggerPrecForTag(nn_tagger, 'AT', brown_test))
    print("default nn recall tag = AT => ",
          checkTaggerRecallForTag(nn_tagger, 'AT', brown_test))
    print("")

    print("default nn prec tag = NN => ",
          checkTaggerPrecForTag(nn_tagger, 'NN', brown_test))
    print("default nn recall tag = NN => ",
          checkTaggerRecallForTag(nn_tagger, 'NN', brown_test))
    print("")

    print("4 most difficult tags in simplified tagsSet - bigramTagger with all the backoffs:",
          checkSimplifiedDifficultTags("BigramTagger", 4))
    print("4 most difficult tags in full tagsSet - bigramTagger with all the backoffs: ",
          checkFullDifficultTags(ct2, brown_test, 4))
    print("")
Example #21
0
who served as the 44th President of the United States from January 20, 2009, to January 20, 2017.
A member of the Democratic Party, he was the first African American to assume the presidency 
and previously served as a United States Senator from Illinois (2005–2008)."""

# NOTE(review): fragment — word_tokenize, DefaultTagger, most_common_tag
# and pos_tag are not defined here; presumably imported earlier in the
# file.  `barack` was defined above (and is redefined again below).
tokenized_barack = word_tokenize(barack)
default_tagger = DefaultTagger(most_common_tag)
def_tagged_barack = default_tagger.tag(tokenized_barack)
print(def_tagged_barack)

#Lookup Tagger
#Ngram tagger
message = "the quick brown fox jumped over the lazy dog"
training_tag = pos_tag(word_tokenize(message))
print(training_tag)
#training the ngram tagger
# Train a bigram tagger from this single tagged sentence.
ngram_tagger = nltk.NgramTagger(n=2, train=[training_tag])

# Tag a different sentence with the tagger trained on `message`.
message2 = "the lazy dog jumped over the quick brown fox"
message2_tags = ngram_tagger.tag(word_tokenize(message2))
print(message2_tags)

print(list(nltk.ngrams(pos_tag(word_tokenize(message)), n=2)))

#Unigram tagger
barack = """Barack Hussein Obama II born August 4, 1961) is an American politician
who served as the 44th President of 
the United States from January 20, 2009, to January 20, 2017.
A member of the Democratic Party, he was the 
first African American to assume the presidency and previously
served as a United States Senator from Illinois (2005–2008)."""
bush = """George Walker Bush (born July 6, 1946) is an American politician who served as the 43rd President