def lexical(tokens):
    """POS-tag *tokens* with a bigram tagger backing off to a unigram tagger
    and then a default 'NN' tagger.

    Relies on module-level ``train_sent`` / ``test_sents`` (tagged training and
    evaluation sentences) and the NLTK tagger classes being in scope.

    Args:
        tokens: list of word tokens to tag.

    Returns:
        list of (token, tag) pairs produced by the trained tagger.
    """
    print("\n")
    print("Step 2: Lexical Analysis\n")
    print("Essentially refers to dictionary and obtains the properties of the word")
    print("Part-Of-Speech tagging")
    print("The tagset is:\n")
    # Backoff chain: bigram -> unigram -> default 'NN'.
    default_tagger = DefaultTagger('NN')
    unigram_tagger = UnigramTagger(train_sent, backoff=default_tagger)
    tagger = BigramTagger(train_sent, backoff=unigram_tagger)
    tagtokens = tagger.tag(tokens)
    # Distinct loop-variable names: the original reused `tag` as the loop
    # variable, clobbering the DefaultTagger bound to `tag` above.
    for token, pos in tagtokens:
        print(token + "->" + pos)
    print("\n")
    print("The acurracy of the trained pos tagger is:")
    print(tagger.evaluate(test_sents))
    return tagtokens
class Tagger:
    """POS tagger wrapper that loads pickled unigram/bigram taggers for an idiom (language).

    The constructor builds a default backoff chain, then immediately replaces
    it with the pickled taggers found under ``<abspath>/FriggAnswer/pickle/``.
    """

    def __init__(self, idiom):
        # Fallback chain (default 'N' -> unigram -> bigram) used until the
        # pickled taggers are loaded below.
        self.tagger0 = DefaultTagger('N')
        self.tagger1 = UnigramTagger(None, self.tagger0)
        self.tagger2 = BigramTagger(None, self.tagger1)
        self.lang = os.path.abspath('FriggAnswer') + '/pickle/'
        #self.lang = os.path.abspath('pickle')+'\\'
        self.loadIdiom(idiom)

    def loadIdiom(self, idiom):
        """Load the pickled unigram ('<idiom>1.pkl') and bigram ('<idiom>2.pkl') taggers.

        Uses ``with`` so the files are closed even if unpickling raises; the
        original opened into a variable named ``input`` (shadowing the builtin)
        and leaked the handle on error.
        """
        with open(self.lang + idiom + '1.pkl', 'rb') as pkl_file:
            self.tagger = load(pkl_file)
        with open(self.lang + idiom + '2.pkl', 'rb') as pkl_file:
            self.tagger2 = load(pkl_file)

    def classify(self, question):
        """Return the (word, tag) pairs for *question* (a token sequence)."""
        return self.tagger2.tag(question)
#end init
#pre Operate
# Scan the current sentence for a ':' token; if found, the words after it are
# the actual sentence content (the prefix is presumably a speaker label —
# TODO confirm against the corpus format). `wordcount` and `flag1` are
# initialized earlier in the file (out of view here).
for word in my_corp[sent_count]:
    if my_corp[sent_count][wordcount] == ':':
        flag1 = 1
        break
    wordcount = wordcount + 1
if flag1 == 1:
    #print(my_corp[sent_count][wordcount+1:-1])
    curr_sent = my_corp[sent_count][wordcount + 1:-1]
else:
    #print(my_corp[sent_count])
    curr_sent = my_corp[sent_count]
#curr_sent=rep.replace(curr_sent)
#end pre Operate
tag_curr_sent = t2.tag(curr_sent)
#print(tag_curr_sent)
# Count word tags for the sentence. BUG FIX: the original tested
# `words_tup[1] != (',' or '?' or '!' or '.')`, and `(',' or ...)` evaluates
# to just ',' — so only commas were excluded from All_count. Use a proper
# membership test over all four punctuation tags.
for words_tup in tag_curr_sent:
    if words_tup[1] not in (',', '?', '!', '.'):
        All_count = All_count + 1
    if words_tup[1] == 'NN':
        NN_count = NN_count + 1
        #NN_Num=NN_Num+1
    if words_tup[1] == 'NNP':
        NNP_count = NNP_count + 1
        NNP_Num = NNP_Num + 1
    if words_tup[1] == 'JJ':
        JJ_count = JJ_count + 1
        JJ_Num = JJ_Num + 1
    #if words_tup[1]=='VB':
    #    VB_count=VB_count+1
## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# Train one tagger per n-gram order on the shared pre-tagged training data.
ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Python 3 print() calls — the original used Python 2 print statements,
# inconsistent with the rest of this file.
print(ut.evaluate(test_data))
print(ut.tag(tokens))
print(bt.evaluate(test_data))
print(bt.tag(tokens))
print(tt.evaluate(test_data))
print(tt.tag(tokens))


def combined_tagger(train_data, taggers, backoff=None):
    """Train each class in *taggers* in order, chaining each as backoff for the next.

    Args:
        train_data: tagged sentences every tagger is trained on.
        taggers: tagger classes, applied in order; the last trained becomes outermost.
        backoff: tagger of last resort for the whole chain.

    Returns:
        The outermost trained tagger heading the backoff chain.
    """
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff


# `rt` is presumably a RegexpTagger defined earlier in the file — TODO confirm.
ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)
print(ct.evaluate(test_data))
print(ct.tag(tokens))
# Search state for a best-match scan: sentinel edit distance, matching line
# index, and match rate (consumed later in the file — out of view here).
editdis = 1000
line_num = 0
rate = 0
## 3.The part of Standardize and NER
# Runs only when `tag == 0` — presumably the "statement input" branch; confirm
# against the code that sets `tag`.
if tag == 0:
    stopwords = nltk.corpus.stopwords.words('english')
    cont = nltk.word_tokenize(rep_22)
    # Name Standardize: substitute pronouns with the most recently seen
    # character names (lc_male_name / lc_female_name persist across inputs).
    # NOTE(review): 'his'/'her' are possessives but get replaced by the bare
    # name — looks intentional for downstream matching; confirm.
    cont = [lc_male_name if x == 'he' else x for x in cont]
    cont = [lc_male_name if x == 'his' else x for x in cont]
    cont = [lc_female_name if x == 'she' else x for x in cont]
    cont = [lc_female_name if x == 'her' else x for x in cont]
    # Place Standardize
    # Two taggings: full token list (cont_tag) for the filters below, and a
    # stopword-free copy (cont_tagged).
    cont_tag = t2.tag(cont)
    cont_unstop = [word for word in cont if word not in stopwords]
    cont_tagged = t2.tag(cont_unstop)
    # Final result of NER

    ## 4.Answer Filter
    # First Filter: Find names in the input — remember the latest male/female
    # name seen so later pronouns can be resolved.
    for tupl_1 in cont_tag:
        if tupl_1[1] == 'NN':
            if tupl_1[0] in male_names:
                lc_male_name = tupl_1[0]
            elif tupl_1[0] in female_names:
                lc_female_name = tupl_1[0]
    # Second Filter: Find places in the input (body continues past this
    # chunk — truncated here).
    for tupl_2 in cont_tag:
        if tupl_2[1] == 'NN':
import nltk
from nltk.tag import BigramTagger
from nltk.corpus import treebank

# Train/test must be disjoint: the original trained on tagged_sents()[:7000]
# and evaluated on [2000:], which overlap (treebank ships ~3.9k tagged
# sentences, so [:7000] is effectively the whole corpus and the reported
# accuracy was measured on training data). Split at 3000 instead.
training_1 = treebank.tagged_sents()[:3000]
bigramtagger = BigramTagger(training_1)
print(treebank.sents()[0])
print(bigramtagger.tag(treebank.sents()[0]))
testing_1 = treebank.tagged_sents()[3000:]
print(bigramtagger.evaluate(testing_1))
# Train the N-Gram taggers using the training_data (pre-tagged tokens, i.e. labeled observations) ut = UnigramTagger(train=train_data) bt = BigramTagger(train_data) tt = TrigramTagger(train_data) # Test the performance of each N-Gram tagger print("1-Gram Tagger Accuracy: {}".format(ut.evaluate(test_data))) print("2-Gram Tagger Accuracy: {}".format(bt.evaluate(test_data))) print("3-Gram Tagger Accuracy: {}".format(tt.evaluate(test_data))) print("\n1-Gram tags:") print(ut.tag(tokens)) print("\n2-Gram tags:") print(bt.tag(tokens)) print("\n3-Gram tags:") print(tt.tag(tokens)) # Note that the best accuracy is provided by the 1-Gram tagger, as it isn't always the case that the same bigrams # and trigrams observed in the training data will be present in the same way in the testing data (e.g. pairs of words # do not always appear paired in the same way) # 4. TAGGER CHAINING WITH BACKOFF TAGGERS: # Function to chain a set of taggers, with a backoff tagger as last resource def combined_tagger(training_data, taggers, backoff=None): for tagger in taggers:
# Search state for a best-match scan: sentinel edit distance, matching line
# index, and match rate (consumed later in the file — out of view here).
editdis = 1000
line_num = 0
rate = 0
## 3.The part of Standardize and NER
# Runs only when `tag == 0` — presumably the "statement input" branch; confirm
# against the code that sets `tag`.
if tag == 0:
    stopwords = nltk.corpus.stopwords.words('english')
    cont = nltk.word_tokenize(rep_22)
    # Name Standardize: substitute pronouns with the most recently seen
    # character names (lc_male_name / lc_female_name persist across inputs).
    cont = [lc_male_name if x == 'he' else x for x in cont]
    cont = [lc_male_name if x == 'his' else x for x in cont]
    cont = [lc_female_name if x == 'she' else x for x in cont]
    cont = [lc_female_name if x == 'her' else x for x in cont]
    # Place Standardize
    # Two taggings: full token list (cont_tag) for the filters below, and a
    # stopword-free copy (cont_tagged) printed as the NER result.
    cont_tag = t2.tag(cont)
    cont_unstop = [word for word in cont if word not in stopwords]
    cont_tagged = t2.tag(cont_unstop)
    # Final result of NER
    print(cont_tagged)

    ## 4.Answer Filter
    # First Filter: Find names in the input — accepts both common-noun (NN)
    # and proper-noun (NNP) tags, unlike the earlier NN-only variant.
    for tupl_1 in cont_tag:
        if tupl_1[1] == 'NN' or tupl_1[1] == 'NNP':
            if tupl_1[0] in male_names:
                lc_male_name = tupl_1[0]
            elif tupl_1[0] in female_names:
                lc_female_name = tupl_1[0]
    # Second Filter: Find places in the input (body continues past this
    # chunk — truncated here).
    for tupl_2 in cont_tag:
import nltk
from nltk.tag import BigramTagger
from nltk.corpus import treebank

# Train/test must be disjoint: the original trained on tagged_sents()[:7000]
# and evaluated on [2000:], which overlap (treebank ships ~3.9k tagged
# sentences, so [:7000] is effectively the whole corpus and the reported
# accuracy was measured on training data). Split at 3000 instead.
training_1 = treebank.tagged_sents()[:3000]
bigramtagger = BigramTagger(training_1)
print(treebank.sents()[0])
print(bigramtagger.tag(treebank.sents()[0]))
testing_1 = treebank.tagged_sents()[3000:]
print(bigramtagger.evaluate(testing_1))
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# One tagger per n-gram order, all trained on the same labeled sentences.
# (UnigramTagger is presumably imported earlier in the file.)
ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# For each model: header, held-out accuracy, then the tagging of the demo
# sentence. Same output, same order as printing each block by hand.
for header, model in (('unigram tagger: ', ut),
                      ('\nbigram tagger:', bt),
                      ('\ntrigram tagger:', tt)):
    print(header)
    print(model.evaluate(test_data))
    print(model.tag(nltk.word_tokenize(sentence)))

#%%
# combined tagger with a list of taggers and use a backoff tagger
def combined_tagger(train_data, taggers, backoff=None):
    """Train each tagger class in order; each newly trained tagger uses the
    previous one as its backoff. Returns the outermost trained tagger."""
    chain = backoff
    for tagger_cls in taggers:
        chain = tagger_cls(train_data, backoff=chain)
    return chain
# Demo: a UnigramTagger whose model is overridden by hand instead of trained —
# only 'Pierre' gets a tag; every other word maps to None.
print("------------Unigram Tagger Overrode------------")
unigramTagger = UnigramTagger(model={'Pierre': 'NN'})
print(unigramTagger.tag(sent))

# Demo: the same tagger trained on Brown sentences (brown_train_sents and
# `sent` are defined earlier in the file).
print("------------Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.tag(sent))

#cutoff: The number of instances of training data the tagger must see in order not to use the backoff tagger
print("------------Unigram Tagger Trained with cutoff=3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff=3)
print(unigramTagger.tag(sent))

# Pre-built taggers from earlier in the file, applied to the same sentence.
print("------------Bigram Tagger------------")
print(bigramTagger.tag(sent))
print("------------Trigram Tagger------------")
print(trigramTagger.tag(sent))
print("------------Brill Tagger------------")
print(brillTagger.tag(sent))

# Accuracy comparison: the cutoff=3 tagger abstains on rare words, which
# changes its score relative to the unrestricted tagger.
print("------------Accuracy: Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.evaluate(brown_test_sents))
print("------------Accuracy: Unigram Tagger Trained with cutoff = 3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff = 3)
print(unigramTagger.evaluate(brown_test_sents))
# Labeled data: POS-tagged news sentences from the Brown corpus.
brown_tagged_sents = brown.tagged_sents(categories='news')
print(brown_tagged_sents)

# we are dividing the data into a test and train to evaluate our taggers:
# first 90% for training, last 10% for evaluation.
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]  # [start : end]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

# Baseline: unigram tagger with no backoff.
unigram_tagger = UnigramTagger(train_data)
print (unigram_tagger.evaluate(test_data))

# Backoff chain built bottom-up: default 'NN' <- unigram <- bigram <- trigram.
# Each evaluate() shows the accuracy gained by the extra context.
default_tagger = DefaultTagger('NN')
unigram_tagger = UnigramTagger(train_data, backoff = default_tagger)
print (unigram_tagger.evaluate(test_data))
bigram_tagger = BigramTagger(train_data, backoff = unigram_tagger)
print (bigram_tagger.evaluate(test_data))
trigram_tagger = TrigramTagger(train_data, backoff = bigram_tagger)
print (trigram_tagger.evaluate(test_data))

my_string = "Many text corpora contain linguistic annotations, representing POS tags, named entities, syntactic structures, semantic roles, and so forth. NLTK provides convenient ways to access several of these corpora, and has data packages containing corpora and corpus samples, freely downloadable for use in teaching and research."
my_sent = nltk.sent_tokenize(my_string)  # tokenize sentences
mytoken = []  # create empty list
for sent in my_sent:  # for each sentence, tokenize words
    mytoken.extend(word_tokenize(sent))
print(mytoken)
print('\n')
print(bigram_tagger.tag(mytoken))  # tag each word using the bigram_tagger (backoff-chain) model