def wordTagger(self, wordlist, number):
    train_sents = treebank.tagged_sents()[:3000]
    if number == 1:
        taglist = nltk.pos_tag(wordlist)
    elif number == 2:
        tagger = DefaultTagger('NN')
        taglist = tagger.tag(wordlist)
    elif number == 3:
        tagger = UnigramTagger(train_sents)
        taglist = tagger.tag(wordlist)
    elif number == 4:
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        taglist = tnt_tagger.tag(wordlist)
    elif number == 5:
        tagger = ClassifierBasedPOSTagger(train=train_sents)
        taglist = tagger.tag(wordlist)
    return taglist
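# A hypothetical usage sketch of wordTagger above. It assumes the names used inside
# (nltk, treebank, tnt, DefaultTagger, UnigramTagger, ClassifierBasedPOSTagger) are
# already imported; self is unused here, so None is passed purely for illustration.
words = ['The', 'quick', 'brown', 'fox']
print(wordTagger(None, words, 2))  # DefaultTagger: every token tagged 'NN'
print(wordTagger(None, words, 3))  # UnigramTagger trained on the treebank slice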
def tag_words(words, tag):
    """
    Associates a tag with words.

    Parameters
    ----------
    words: A list of strings.
    tag: A str.

    Returns
    -------
    A list of tuples of (str, str)
    """
    default_tagger = DefaultTagger(tag)
    tags = default_tagger.tag(words)
    return tags
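# A short usage example of tag_words, assuming DefaultTagger has been imported from nltk.tag.
print(tag_words(['Hello', 'World'], 'NN'))  # [('Hello', 'NN'), ('World', 'NN')]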
# Every tagger has a tag() method.
# DefaultTagger is a subclass of SequentialBackoffTagger, which has a choose_tag() method.
from nltk.tag import DefaultTagger
from nltk.corpus import treebank

tagger = DefaultTagger('NN')
print(tagger.tag(['Hello', 'World']))

# Though it's too simple, we can try to evaluate it
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))

# For multiple sentences
print(tagger.tag_sents([['Hello', 'World', '.'], ['How', 'are', 'you', '?']]))

# Untagging
from nltk.tag import untag
print(untag([('Hello', 'NN'), ('World', 'NN')]))
from nltk.tokenize import word_tokenize
from nltk.corpus import treebank, wordnet
from nltk.probability import FreqDist
from nltk.tag.sequential import ClassifierBasedPOSTagger
from nltk.tag import brill, brill_trainer, tnt, SequentialBackoffTagger
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger, AffixTagger
from samples import sample

# Test and training variables
test_sents = treebank.tagged_sents()[3000:]
train_sents = treebank.tagged_sents()[:3000]
tk_sample = word_tokenize(sample)

# Default tagger - nouns
df_tagger = DefaultTagger('NN')
tagged = df_tagger.tag(tk_sample)
accuracy = df_tagger.evaluate(test_sents)
print(f"Tagged text: {tagged}; acc = {accuracy}\n")

# Unigram tagger
ug_tagger = UnigramTagger(train_sents)
tagged = ug_tagger.tag(tk_sample)
accuracy = ug_tagger.evaluate(test_sents)
print(f"Tagged text: {tagged}; acc = {accuracy}\n")

# Backoff tagger: falls back to another tagger (the backoff) when the current one cannot tag a token
ugb_tagger = UnigramTagger(train_sents, backoff=df_tagger)
accuracy = ugb_tagger.evaluate(test_sents)
print(f"Accuracy of backoff: {accuracy}\n")

# Saving pickle and testing it.
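# A minimal sketch of the saving step announced in the comment above, assuming the
# trained ugb_tagger from this snippet; the file name 'tagger.pickle' is arbitrary.
import pickle

with open('tagger.pickle', 'wb') as f:
    pickle.dump(ugb_tagger, f)

with open('tagger.pickle', 'rb') as f:
    loaded_tagger = pickle.load(f)

# The reloaded tagger should reproduce the same accuracy as the original.
print(loaded_tagger.evaluate(test_sents))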
# Build a custom tagger by extending the TaggerI class from the nltk.tag package and
# implementing the tag() method.
# Use the evaluate() method to assess the performance of the tagger.

# --- Tagger ---
# INPUT: sentence tokens
# OUTPUT: list of pairs where each item corresponds to a token of the input with its POS tag

# 1. DEFAULT TAGGER (commonly used as a backoff tagger, i.e. a tagger that is consulted
#    by another tagger when the latter is not able to tag a token):
#    assigns the same tag to all tokens (the tag is given as an argument, NN in this case)
dt = DefaultTagger("NN")

# Measure accuracy on test data (i.e. the gold standard). The test data must already be
# tagged so those tags can be compared against the ones produced by the evaluated tagger
# (dt in this case).
print(dt.evaluate(gold=test_data))
print(dt.tag(tokens=tokens))

# 2. REGEX TAGGER:
#    assigns tags to tokens by comparing their word strings to a series of regular expressions.
#    Note that patterns are tried top down and the first match wins, so the catch-all
#    pattern at the end supplies the default tag.
patterns = [
    (r".*ing$", "VBG"),               # gerunds
    (r".*ed$", "VBD"),                # simple past
    (r".*es$", "VBZ"),                # 3rd singular present
    (r".*ould$", "MD"),               # modals
    (r".*'s$", "NN$"),                # possessive nouns
    (r".*s$", "NNS"),                 # plural nouns
    (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers
    (r".*", "NN"),                    # nouns (default)
]
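# A minimal sketch of the custom tagger described at the top of this snippet: extend
# TaggerI and implement tag(); evaluate() is inherited from the base class. The class
# name ConstantTagger is hypothetical.
from nltk.tag.api import TaggerI

class ConstantTagger(TaggerI):
    def __init__(self, tag):
        self._tag = tag

    def tag(self, tokens):
        # Assign the same tag to every token, like DefaultTagger does.
        return [(token, self._tag) for token in tokens]

ct = ConstantTagger('NN')
print(ct.tag(['Hello', 'World']))  # [('Hello', 'NN'), ('World', 'NN')]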
defaultTagger = DefaultTagger('NN')
initialTagger = backoff_tagger(brown_train_sents,
                               [UnigramTagger, BigramTagger, TrigramTagger],
                               backoff=defaultTagger)
brillTagger = train_brill_tagger(initialTagger, brown_train_sents)

tnt_tagger = tnt.TnT(N=100)
tnt_tagger.train(brown_train_sents)

bigramTagger = BigramTagger(brown_train_sents)
trigramTagger = TrigramTagger(brown_train_sents)

print("------------Recommended Tagger------------")
print(nltk.pos_tag(sent))

print("------------Default Tagger------------")
print(defaultTagger.tag(sent))

print("------------Unigram Tagger Overridden------------")
unigramTagger = UnigramTagger(model={'Pierre': 'NN'})
print(unigramTagger.tag(sent))

print("------------Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.tag(sent))

# cutoff: the minimum number of occurrences of a word the tagger must see in the training
# data; below that threshold it defers to the backoff tagger
print("------------Unigram Tagger Trained with cutoff=3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff=3)
print(unigramTagger.tag(sent))

print("------------Bigram Tagger------------")
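# backoff_tagger and train_brill_tagger above are helper functions not shown in this
# snippet; the following is a plausible sketch of both. The brill template set
# fntbl37 is one common choice, not necessarily the one the original used.
from nltk.tag import brill, brill_trainer

def backoff_tagger(train_sents, tagger_classes, backoff=None):
    # Chain the taggers so each newly built tagger falls back to the previous one.
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff

def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    templates = brill.fntbl37()
    trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates, deterministic=True)
    return trainer.train(train_sents, **kwargs)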
# In[28]:
import numpy as np
from nltk.tag import DefaultTagger

default_tagger = DefaultTagger("NOUN")

# In[29]:
true_pred = 0
num_pred = 0
for sent in test_sents:
    tags = np.array([tag for (word, tag) in sent])
    words = np.array([word for (word, tag) in sent])
    tagged_sent = default_tagger.tag(words)
    outputs = [tag for token, tag in tagged_sent]
    true_pred += np.sum(outputs == tags)
    num_pred += len(words)

print(f"{true_pred / num_pred * 100:.1f}")

# As we can see, this approach performs quite poorly overall; the roughly 20% it does
# get right (about a fifth) says more about how frequent the most popular tag is than
# about the tagger itself.

# ## NLTK, Rnnmorph

# Recall the first [seminar](https://colab.research.google.com/drive/1FHZVU6yJT61J8w1hALno0stD4VU36rit?usp=sharing)
# of our course. In that seminar we worked with several libraries.
#
# Don't forget to convert the tag set from `'en-ptb'` to `'universal'` with the
# `map_tag` function, or use `tagset='universal'`.
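# A minimal sketch of the tag-set conversion mentioned above, assuming nltk.pos_tag
# output in the 'en-ptb' tag set.
from nltk import pos_tag
from nltk.tag import map_tag

tokens = ['Hello', 'world']
ptb_tagged = pos_tag(tokens)  # Penn Treebank tags, e.g. 'NNP'
print([(w, map_tag('en-ptb', 'universal', t)) for w, t in ptb_tagged])

# Equivalently, request universal tags directly:
print(pos_tag(tokens, tagset='universal'))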
# Without part-of-speech tags, a chunker cannot know how to extract phrases from a
# sentence. But with part-of-speech tags, you can tell a chunker how to identify
# phrases based on tag patterns.
# Part-of-speech tags are also used for grammar analysis and word sense disambiguation.

# All taggers in NLTK are in the nltk.tag package and inherit from the TaggerI base class.
# TaggerI requires all subclasses to implement a tag() method, which takes a list of
# words as input and returns a list of tagged words as output.
# TaggerI also provides an evaluate() method for evaluating the accuracy of the tagger.
from nltk.tag import DefaultTagger

tagger = DefaultTagger('NN')
print(tagger.tag(['Hello', 'World']))
print(tagger.tag(['we', 'are', 'going']))  # WRONG: verbs are tagged NN too

# SequentialBackoffTagger implements the tag() method, which calls the choose_tag()
# method of the subclass for each index in the tokens list while accumulating a
# history of the previously tagged tokens.
"""DefaultTagger is a subclass of SequentialBackoffTagger. Every subclass of
SequentialBackoffTagger must implement the choose_tag() method, which takes three arguments:

* The list of tokens
* The index of the current token whose tag we want to choose
* The history, which is a list of the previous tags
"""
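# A minimal sketch of the choose_tag() contract described in the docstring above;
# the class name CapitalTagger is hypothetical.
from nltk.tag import SequentialBackoffTagger

class CapitalTagger(SequentialBackoffTagger):
    """Tags capitalized tokens as NNP and defers to the backoff tagger otherwise."""
    def choose_tag(self, tokens, index, history):
        if tokens[index][:1].isupper():
            return 'NNP'
        return None  # returning None hands the token over to the backoff tagger

cap_tagger = CapitalTagger(backoff=DefaultTagger('NN'))
print(cap_tagger.tag(['Hello', 'world']))  # [('Hello', 'NNP'), ('world', 'NN')]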
rc4 = []

def PhraseExtractor(tree):
    rc4 = []
    for subtree in tree.subtrees():
        if subtree.label() == 'Phrase':
            rc4.append(str(untag(subtree)))
    return rc4

'''-----------------------------------------------------------------------------------------'''
'''----------------------Customized POS Tagging----------------------------------------------'''
'''-----------------------------------------------------------------------------------------'''
ptagger = DefaultTagger('PWD')
ntagger = DefaultTagger('NWD')
etagger = DefaultTagger('EMP')

tag_pos = ptagger.tag(PosWords)  # changed to list version of dictionary
tag_neg = ntagger.tag(NegWords)  # changed to list version of dictionary
tag_emp = etagger.tag(EmpWords)  # changed to list version of dictionary

tag_wrd = tag_pos + tag_neg + tag_emp
tag_wrd_dict = dict(tag_wrd)
tagger5 = UnigramTagger(model=tag_wrd_dict, backoff=tagger2)

'''-----------------------------------------------------------------------------------------'''
'''------------------- Chunking with POS Tagging---------------------------------------------'''
'''-----------------------------------------------------------------------------------------'''
chunker1 = RegexpParser(r'''
    PWD:
# building your own tagger

# preparing the data
from nltk.corpus import treebank

data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
print(train_data[0])

# default tagger
from nltk.tag import DefaultTagger

dt = DefaultTagger('NN')
print(dt.evaluate(test_data))
print(dt.tag(tokens))

# regex tagger
from nltk.tag import RegexpTagger

# define regex tag patterns
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN'),                     # nouns (default)
]
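# A sketch completing the step above (not in the original): build the RegexpTagger from
# the patterns and score it on the same test data; `tokens` is assumed from the
# surrounding script, as in the default-tagger example.
rt = RegexpTagger(patterns)
print(rt.evaluate(test_data))
print(rt.tag(tokens))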
from nltk.tag import DefaultTagger

tagger = DefaultTagger('NN')
result = tagger.tag(['Hello', 'World', 'want'])
print(result)
# a
splits = [[90, 10], [50, 50]]
correct_brown = brown.tagged_sents()
correct_chat = chat.tagged_posts()
default_tagger = DefaultTagger("NN")

for split in splits:  # make this a function for use in b
    # train_test_split returns (train, test), in that order
    train_brown, test_brown = train_test_split(correct_brown, test_size=split[1] / 100, shuffle=False)
    train_chat, test_chat = train_test_split(correct_chat, test_size=split[1] / 100, shuffle=False)
    # DefaultTagger needs no training, so it is evaluated directly on the test sets
    print(
        f"The DefaultTagger accuracy for the Brown Corpus is {default_tagger.evaluate(test_brown)} using a {split[0]}/{split[1]} split."
    )
    print(
        f"The DefaultTagger accuracy for the NPS Chat Corpus is {default_tagger.evaluate(test_chat)} using a {split[0]}/{split[1]} split.\n"
    )

# 50/50 is better because the tagger doesn't "learn", so when the test data is increased
# (from 10%) there's a bigger chance that some words are going to be NN?

# b
patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),   # simple past
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
word_list = tokenizer.tokenize(sentence)

#--------------------------------------------------------------------------------
# Parts of Speech
#--------------------------------------------------------------------------------

# Default tagging
from nltk.tag import DefaultTagger

# if all else fails, make an unknown word a noun ("NN")
default_tagger = DefaultTagger('NN')

# try it.
tagged_sentence = default_tagger.tag(word_list)

# Can also batch tag, but need a list of sentences, each already tokenized.
# (batch_tag was renamed tag_sents in NLTK 3.)
#default_tagger.tag_sents([['Hello', 'world', '.'], ['How', 'are', 'you', '?']])

#--------------------------------------------------------------------------------
# Training taggers
#--------------------------------------------------------------------------------

# So far so good. Next we have to train taggers.

# Unigram, training on the Treebank corpus
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
unigram_tagger = UnigramTagger(train_sents)
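# A short continuation sketch (not in the original): tag the tokenized sentence with
# the trained unigram tagger and score it on a held-out treebank slice.
print(unigram_tagger.tag(word_list))  # words unseen in training come back tagged None

test_sents = treebank.tagged_sents()[3000:]
print(unigram_tagger.evaluate(test_sents))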
import nltk
from nltk.tag import DefaultTagger

tag = DefaultTagger('NN')
print(tag.tag(['Beautiful', 'morning']))
import nltk
from nltk.corpus import treebank     # import treebank corpus
from nltk.tag import DefaultTagger   # import DefaultTagger

tagger = DefaultTagger('NN')  # default tagger that assigns the NN tag
treebank_tagged_sents = treebank.tagged_sents()  # gold-standard tagged sentences

# tag() expects a plain token list, so tag an untagged sentence rather than the tagged corpus
tagger.tag(treebank.sents()[0])

# calculate and print accuracy against the gold-standard tags
print('Accuracy %4.1f%%' % (100.0 * tagger.evaluate(treebank_tagged_sents)))
from nltk.corpus import treebank

data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
print(train_data[0])

#%%
# default tagger
from nltk.tag import DefaultTagger

dt = DefaultTagger('NN')

# accuracy on test data
print(dt.evaluate(test_data))

# tagging our sample headline
print(dt.tag(nltk.word_tokenize(sentence)))

#%%
# regex tagger
from nltk.tag import RegexpTagger

# define regex tag patterns
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN'),                     # nouns (default)
]
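# A sketch completing this cell (not in the original): build the RegexpTagger from the
# patterns, score it on the test data, and tag the sample headline; the variable
# `sentence` is assumed from earlier in the notebook.
rt = RegexpTagger(patterns)
print(rt.evaluate(test_data))
print(rt.tag(nltk.word_tokenize(sentence)))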
######### DEFAULT TAGGER ###############
# Assigning the default tag
from nltk.tag import DefaultTagger, untag

tagger = DefaultTagger('NN')
tokens = ['Hello', 'World']
sents = [['Hello', 'World'], ['How', 'are', 'you', '?']]

# tag() works on a single token list; tag_sents() on a list of tokenized sentences
print(tagger.tag(tokens))
print(tagger.tag_sents(sents))

# Untagging
tagged = tagger.tag(tokens)
print(untag(tagged))

# Evaluating the tagger accuracy
from nltk.corpus import treebank

test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
import nltk
from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

brown.tagged_words(tagset='universal')

train_sents = treebank.tagged_sents()[:3500]
test_sents = treebank.tagged_sents()[3500:]

tagger = DefaultTagger('NN')

def back_off_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff

tagger = back_off_tagger(train_sents,
                         [UnigramTagger, BigramTagger, TrigramTagger],
                         backoff=DefaultTagger('NN'))

tagged = tagger.tag(["The", "lecture", "will", "be", "in", "Wean", "4623"])
print(tagged)
print(tagger.evaluate(test_sents))