Example #1
0
def train_tagger():
    """Train a classifier-based POS tagger on the treebank corpus and pickle it.

    The tagged sentences are split 90/10: the first 90% train the tagger and
    the remaining 10% are held out for evaluation. The evaluation score is
    printed, and the trained tagger is written to
    ``MODEL_PATH + "tag_model.pkl"``.

    Returns:
        The trained ``ClassifierBasedPOSTagger``.
    """
    print("Training POS tagger...")
    # https://github.com/japerk/nltk3-cookbook/blob/master/chapter4.py

    tagged_sentences = treebank.tagged_sents()
    size = int(len(tagged_sentences) * 0.9)
    train_sents = tagged_sentences[:size]
    # Hold out exactly the sentences that were not used for training. Slicing
    # from a fixed index (e.g. 3000) would overlap the training slice and
    # inflate the reported accuracy.
    test_sents = tagged_sentences[size:]

    default = DefaultTagger("NN")
    tagger = ClassifierBasedPOSTagger(
        train=train_sents, backoff=default, cutoff_prob=0.3
    )
    print(tagger.evaluate(test_sents))

    # save model to pickle file as binary
    file_name = MODEL_PATH + "tag_model.pkl"
    with open(file_name, "wb") as fout:
        pickle.dump(tagger, fout)

    print("model written to: " + file_name)
    print("")

    return tagger
Example #2
0
def NER_HINDINBC():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print "The Files of Corpus are:", f1
    sents = reader.tagged_sents()
    sentn = reader.sents()
    #words=sentn.split()
    ls = len(sents)
    #lw=len(words)
    print "Length of Corpus Is:", ls
    #print "The Words are:",lw
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    nbc_tagger = ClassifierBasedPOSTagger(train=train_sents)
    test = nbc_tagger.evaluate(test_sents)
    print "The Test Result is:", test
    #THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्‍य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode(
        'utf-8')
    gsw = given_sent.split()
    tag_gs = nbc_tagger.tag(gsw)
    print "GIVEN SENT TAG:", tag_gs
    ftag_gs = " ".join(list(itertools.chain(*tag_gs)))
    print "And its flattened Version is:", ftag_gs
Example #3
0
def nbc_tagger():
    """Train a classifier-based POS tagger on Brown 'news' and demo it.

    Trains on the first 3230 tagged sentences of the Brown corpus 'news'
    category, evaluates on sentences 3230-4599, prints the score, then
    lower-cases and whitespace-splits a sample sentence, tags it and prints
    the result. Returns nothing.
    """
    news_text = brown.tagged_sents(categories='news')
    train_sents = news_text[:3230]
    test_sents = news_text[3230:4600]

    # Use a distinct local name so the function's own name is not shadowed.
    tagger = ClassifierBasedPOSTagger(train=train_sents)
    accuracy = tagger.evaluate(test_sents)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("The Test Results Is: {0}".format(accuracy))

    sent3 = "Narendra Modi won Lok Sabha election with massive majority after long years"
    sent_w = sent3.lower().split()
    print(sent_w)
    tag = tagger.tag(sent_w)
    print("The Tag Is: {0}".format(tag))
Example #4
0
 def load_tagger(self, name='backup/tagger.pickle'):
     """Return a POS tagger, loading it from ``name`` or training a new one.

     If the pickle file ``name`` can be opened, the unpickled object is
     returned as-is. If opening or reading it raises ``IOError``, the error
     is printed, a ``ClassifierBasedPOSTagger`` is trained from this
     instance's training sentences and default backoff tagger, its accuracy
     on the test sentences is printed, and the new tagger is pickled to
     ``name`` before being returned.

     Args:
         name: Path of the pickle file to read from / write to.

     Returns:
         The loaded or freshly trained tagger.
     """
     try:
         # NOTE: unpickling can execute arbitrary code; only point ``name``
         # at model files from a trusted source.
         with open(name, "rb") as f:
             return pickle.load(f)
     except IOError as e:
         print("I/O error: {0}".format(e))
     # No usable cached model: train, report accuracy and cache the result.
     tagger = ClassifierBasedPOSTagger(train=self.__train_sents, backoff=self.__default, cutoff_prob=0.3)
     print('Tagger accuracy : {}'.format(tagger.evaluate(self.__test_sents)))
     with open(name, 'wb') as f:
         pickle.dump(tagger, f)
     return tagger
Example #5
0
# Report the score of the previously built tagger `tt` and show its output on
# the sample tokens. Single-argument print(...) prints the same text on
# Python 2 and Python 3.
print(tt.evaluate(test_data))
print(tt.tag(tokens))

def combined_tagger(train_data, taggers, backoff=None):
    """Build a backoff chain by instantiating each tagger class in order.

    Every entry of ``taggers`` is called as ``entry(train_data, backoff=...)``
    and receives the previously built object as its backoff, so the last
    entry ends up at the head of the chain.

    Args:
        train_data: Training data handed unchanged to every tagger factory.
        taggers: Iterable of callables accepting ``(train_data, backoff=...)``.
        backoff: Initial backoff passed to the first factory.

    Returns:
        The object built by the last factory, or ``backoff`` itself when
        ``taggers`` is empty.
    """
    chain_head = backoff
    for make_tagger in taggers:
        chain_head = make_tagger(train_data, backoff=chain_head)
    return chain_head

# Chain unigram -> bigram -> trigram taggers, each backing off to the one
# built before it, with `rt` as the last resort.
ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)

# Single-argument print(...) prints the same text on Python 2 and Python 3.
print(ct.evaluate(test_data))
print(ct.tag(tokens))

from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

# Classifier-based tagger backed by a Naive Bayes classifier.
nbt = ClassifierBasedPOSTagger(train=train_data,
                               classifier_builder=NaiveBayesClassifier.train)

# Single-argument print(...) prints the same text on Python 2 and Python 3.
print(nbt.evaluate(test_data))
print(nbt.tag(tokens))


# try this out for fun!
# Same tagger, but trained with a maximum-entropy classifier instead.
met = ClassifierBasedPOSTagger(train=train_data,
                               classifier_builder=MaxentClassifier.train)
print(met.evaluate(test_data))
print(met.tag(tokens))
Example #6
0
import nltk
from nltk.corpus import treebank
from nltk.tag import DefaultTagger

from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

# Split the tagged treebank sentences: first 3500 for training, rest for testing.
data = treebank.tagged_sents()
train_data, test_data = data[:3500], data[3500:]
#print(train_data[0])

# Baseline: tag every token as a noun.
dt = DefaultTagger('NN')
print(dt.evaluate(test_data))

# Classifier-based tagger trained with Naive Bayes, scored on the same test set.
nt = ClassifierBasedPOSTagger(
    train=train_data,
    classifier_builder=NaiveBayesClassifier.train,
)
print(nt.evaluate(test_data))
Example #7
0
from nltk.tag import DefaultTagger

from nltk.tag.sequential import ClassifierBasedPOSTagger

# Fallback tagger: labels every token 'NN'.
default = DefaultTagger('NN')

# First 3000 tagged treebank sentences train the tagger; the rest test it.
train_sents = treebank.tagged_sents()[:3000]

test_sents = treebank.tagged_sents()[3000:]

# cutoff_prob=0.3: when the classifier's best tag has probability below 0.3,
# the decision is delegated to the backoff tagger.
tagger = ClassifierBasedPOSTagger(train=train_sents,
                                  backoff=default,
                                  cutoff_prob=0.3)

# Print the score; as a bare expression the result was silently discarded
# when this file is run as a script.
print(tagger.evaluate(test_sents))

#token = nltk.word_tokenize(title)  #title string tokenized

#removing all the punctuation  marks

#punctuation = re.compile(r'[-.?,":;()`~!@#$%^*()_=+{}]')

#tword = [punctuation.sub("", word) for word in token]

#print(tword) #without punctuation

#removing all the MS smart quotes

#smart_quotes = re.compile(r'[\x80-\x9f]')
Example #8
0
# Tail of the TnT benchmark: store the value returned by toc() as the training
# time. NOTE(review): tic()/toc() look like a start/stop timer pair, with the
# matching tic() presumably issued before this excerpt — confirm.
tnt_eval['train_time'] = toc()
# test: evaluate the TnT tagger on the validation sentences between tic()/toc()
tic()
tnt_eval['test_accuracy'] = tnt_tagger.evaluate(val_sents)
tnt_eval['test_time'] = toc()
# display results
display_training_metrics(tnt_eval)
""" 2. Naive Bayes classifier tagger """
# Fresh metrics dict for this tagger; filled with the same three keys as above.
nb_eval = dict()
# train: only the tagger construction sits between tic() and toc()
tic()
nb_tagger = ClassifierBasedPOSTagger(train=train_sents)
nb_eval['train_time'] = toc()
# test
tic()
nb_eval['test_accuracy'] = nb_tagger.evaluate(val_sents)
nb_eval['test_time'] = toc()
# display results
display_training_metrics(nb_eval)
""" 3. Naive Bayes classifier tagger with features """
# nb_eval and nb_tagger are rebound here, so the section-2 objects are no
# longer reachable through these names.
nb_eval = dict()
# train: ClassifierBasedTagger with `add_features` as its feature detector
tic()
nb_tagger = ClassifierBasedTagger(train=train_sents,
                                  feature_detector=add_features)
nb_eval['train_time'] = toc()
# test
tic()
nb_eval['test_accuracy'] = nb_tagger.evaluate(val_sents)
nb_eval['test_time'] = toc()
Example #9
0
# print( 'Training TnT...' )
# tnt_tagger = tnt.TnT()
# tnt_tagger.train(train_corpus)
# print( 'Testing...' )
# acc = tnt_tagger.evaluate(test_corpus)
# print( 'TnT accuracy={0}\n'.format(acc) )
#
# # ----------------------------------------------------------------------
#
# print( 'Training UnigramTagger...' )
# unigram_tagger = UnigramTagger(train_corpus)
# with open( 'unigram.pos_tagger.pickle', 'wb' ) as f:
#     pickle.dump( unigram_tagger, f )
#
# print( 'Testing...' )
# acc = unigram_tagger.evaluate(test_corpus)
# print( 'UnigramTagger accuracy={0}\n'.format(acc) )

# ----------------------------------------------------------------------

# Fit the tagger on the training corpus.
print('Training ClassifierBasedPOSTagger...')
cbt = ClassifierBasedPOSTagger(train=train_corpus)

# Score it on the test corpus and report the result.
print('Testing...')
acc = cbt.evaluate(test_corpus)
print('accuracy={0}\n'.format(acc))

# Persist the trained tagger inside the model folder.
print('Storing...')
with open(
    os.path.join(model_folder, 'ClassifierBasedPOSTagger.pickle'), 'wb'
) as f:
    pickle.dump(cbt, f)
Example #10
0
from nltk.tag.sequential import ClassifierBasedPOSTagger
from tag_util import train_sents, test_sents

# Train on the shared training sentences, then print the score obtained on the
# shared test sentences.
tagger = ClassifierBasedPOSTagger(
    train=train_sents,
)
print(
    tagger.evaluate(test_sents)
)
Example #11
0
            'a': 'JJ',
            'r': 'RB',
            'v': 'VB'
        }
        self.fd = FreqDist(treebank.words())

    def choose_tag(self, tokens, index, history):
        """
            Chooses a POS tag for ``tokens[index]`` from its WordNet synsets.

            Counts the WordNet part of speech of every synset of the word
            and maps the most frequent one through ``self.wordnet_tag_map``.

            Args:
                tokens: The sentence being tagged, as a list of words.
                index: Position in ``tokens`` of the word to tag.
                history: Tags already assigned to earlier tokens (unused).

            Returns:
                The mapped tag, or ``None`` when the word has no synsets or
                its dominant WordNet POS has no entry in the map.
        """

        word = tokens[index]
        # Count per call. Accumulating into the shared ``self.fd`` mixes these
        # POS counts with the counts it already holds and with counts from
        # earlier words, so ``max()`` would not reflect this word.
        pos_counts = FreqDist()
        for synset in wordnet.synsets(word):
            pos_counts[synset.pos()] += 1
        # FreqDist.max() raises ValueError on an empty distribution.
        if not pos_counts:
            return None
        return self.wordnet_tag_map.get(pos_counts.max())


# WordNet-based tagger: build it and report its score on the test sentences.
wn_tagger = WordNetTagger()
accuracy = wn_tagger.evaluate(test_sents)
print(f"Accuracy of the wordnet tagger: {accuracy}\n")

# Classifier-based tagger: train on the training sentences, score on the same
# test sentences.
cl_tagger = ClassifierBasedPOSTagger(
    train=train_sents,
)
accuracy = cl_tagger.evaluate(test_sents)
print(f"Accuracy of the classifier tagger: {accuracy}\n")

# Persist the trained classifier tagger (the original author notes this
# pickle is a heavy one).
with open(
    'pickles/pos-taggers/classifier_tagger.pickle', 'wb'
) as file:
    pickle.dump(cl_tagger, file)
Example #12
0
from nltk.corpus import treebank

from nltk.tag import DefaultTagger

from nltk.tag.sequential import ClassifierBasedPOSTagger

# Fallback tagger: labels every token 'NN'.
default = DefaultTagger('NN')

# First 3000 tagged treebank sentences train the tagger; the rest test it.
train_sents = treebank.tagged_sents()[:3000]

test_sents = treebank.tagged_sents()[3000:]

# cutoff_prob=0.3: when the classifier's best tag has probability below 0.3,
# the decision is delegated to the backoff tagger.
tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default, cutoff_prob=0.3)


# Print the score; as a bare expression the result was silently discarded
# when this file is run as a script.
print(tagger.evaluate(test_sents))

#token = nltk.word_tokenize(title)  #title string tokenized

#removing all the punctuation  marks

#punctuation = re.compile(r'[-.?,":;()`~!@#$%^*()_=+{}]')

#tword = [punctuation.sub("", word) for word in token]

#print(tword) #without punctuation

#removing all the MS smart quotes

#smart_quotes = re.compile(r'[\x80-\x9f]')