def tag_sents(self, sents):
    '''
    Tag a list of sentences. NB: before using this function, the user should
    specify the model file either by
    - training a new model using the ``train`` function, or
    - using a pre-trained model set via the ``set_model_file`` function.

    :param sentences: list of sentences needed to tag.
    :type sentences: list(list(str))
    :return: list of tagged sentences.
    :rtype: list(list(tuple(str, str)))
    '''
    if self._model_file == '':
        raise Exception('No model file is found! Please use the train or set_model_file function')

    # We need the list of sentences instead of the list generator for matching the input and output
    ################ added by Kathrin #########################################
    default = DefaultTagger('None')
    sents = default.tag_sents(sents)
    ###########################################################################
    result = []
    for tokens in sents:
        features = [self._feature_func(tokens, i) for i in range(len(tokens))]
        labels = self._tagger.tag(features)

        if len(labels) != len(tokens):
            raise Exception('Predicted length does not match; expect errors!')

        ############### added by Kathrin ############################################
        tokens = [i[0] for i in tokens]
        #############################################################################
        tagged_sent = list(zip(tokens, labels))
        result.append(tagged_sent)

    return result
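# Usage sketch (assumption: the tag_sents above patches an nltk.tag.CRFTagger,
# as its _model_file / _feature_func / _tagger attributes suggest; requires
# python-crfsuite):
#
#   from nltk.tag import CRFTagger
#   ct = CRFTagger()
#   ct.train([[('dog', 'NN'), ('barks', 'VBZ')]], 'model.crf.tagger')
#   print(ct.tag_sents([['dog', 'barks']]))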
class Chunker:
    _tagger = DefaultTagger

    def __init__(self, words, sents):
        self._tagger = DefaultTagger('NN')
        self.tag_words(words, sents)

    def tag_words(self, words, sents):
        train_sents = treebank.tagged_sents()
        tagger = UnigramTagger(train_sents)
        # NB: this evaluates the tagger against its own output, so the
        # accuracy printed below is trivially 1.0
        test_sents = tagger.tag(sents[0])
        # test_sents = treebank.tagged_sents()[3000:]
        # print(treebank.tagged_sents()[1:])
        # print("accuracy: " + str(self._tagger.evaluate(test_sents)))
        # print(self._tagger.tag(words))
        # print(test_sents)
        print(tagger.evaluate(test_sents))

    def get_accuracy(self, sentences=None):
        if not sentences:
            test_sents = treebank.tagged_sents()[6000:]
        else:
            test_sents = sentences
        print(self._tagger.evaluate(test_sents))
def getDefaultTaggerAccuracy(testingSet):
    # gets the accuracy of the DefaultTagger

    # get untagged sentences and gold POS tags
    untaggedSentences = [[taggedWord[0] for taggedWord in sentence] for sentence in testingSet]
    goldPOSTags = [[taggedWord[1] for taggedWord in sentence] for sentence in testingSet]

    # declare tagger; honestly this is unnecessary, as every tag is going to be 'NN'
    # so we could really just skip this altogether.
    # I went with 'NN' as it was the default value shown in the nltk DefaultTagger
    # documentation; completely arbitrary.
    defaultTagger = DefaultTagger('NN')
    defaultTaggedSentences = defaultTagger.tag_sents(untaggedSentences)

    # calculate accuracy
    totalTags = 0
    matches = 0
    # iterate through sentences
    for sentencePOSTags in goldPOSTags:
        # iterate through tags
        for individualPOSTag in sentencePOSTags:
            totalTags += 1
            # if the gold tag is NN, then match
            if individualPOSTag == 'NN':
                matches += 1
    accuracy = (matches / totalTags) * 100
    return accuracy
def test_default_tagger(self):
    test_list = make_sentence_list(path.join(self.test_dir, 'test.tsv'))
    tagger = DefaultTagger('N')
    split = int(len(test_list) * .90)
    train_data = test_list[:split]
    test_data = test_list[split:]
    print(tagger.evaluate(train_data))
    print(tagger.evaluate(test_data))
def find_accuracy(train_set, test_set):  # should everything here be the test set?
    train_words = [word for sent in train_set for word in sent]
    train_set_tags = [tag for (word, tag) in train_words]
    train_set_most_frequent_tag = FreqDist(train_set_tags).max()
    default_tagger = DefaultTagger(train_set_most_frequent_tag)
    accuracy_result = default_tagger.evaluate(test_set)
    return accuracy_result
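# Usage sketch (assumes FreqDist and DefaultTagger are imported from nltk and
# a tagged corpus such as Brown is available; the 90/10 split is illustrative):
#
#   from nltk.corpus import brown
#   sents = brown.tagged_sents()
#   cut = int(len(sents) * 0.9)
#   print(find_accuracy(sents[:cut], sents[cut:]))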
def evaluate_nltk_pos_taggers(gold_standard_filename, num_folds=10, loo=False):
    """
    Evaluates the NLTK backoff taggers on the corpus data. Uses cross-validation.
    :param gold_standard_filename: tsv file of format: word \t POS \n
    :param num_folds: int: number of folds for cross-validation
    :param loo: bool: whether to use Leave One Out cross-validation
    :return: dict mapping tagger names to mean accuracy across folds
    """
    tagged_sents = make_sentence_list(gold_standard_filename)
    backoff = DefaultTagger('N')
    tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]
    scores = {
        'DefaultTagger': [],
        'UnigramTagger': [],
        'BigramTagger': [],
        'TrigramTagger': [],
        'BrillTagger': [],
    }

    # k-fold cross-validation
    if loo:
        # Leave One Out cross-validation
        num_folds = len(tagged_sents) - 1
    subset_size = int(len(tagged_sents) / num_folds)
    for i in range(num_folds):
        # training and testing data for this round
        X_test = tagged_sents[i * subset_size:][:subset_size]
        X_train = tagged_sents[:i * subset_size] + tagged_sents[(i + 1) * subset_size:]

        # compute score for taggers
        default_score = backoff.evaluate(X_train)
        trigram, tagger_scores = backoff_tagger(X_train, X_test, tagger_classes, backoff=backoff)
        uni_score, bi_score, tri_score = tagger_scores
        brill_tagger = train_brill_tagger(trigram, X_train)
        brill_score = brill_tagger.evaluate(X_test)
        brill_tagger.print_template_statistics(printunused=False)

        # save scores
        scores['DefaultTagger'].append(default_score)
        scores['UnigramTagger'].append(uni_score)
        scores['BigramTagger'].append(bi_score)
        scores['TrigramTagger'].append(tri_score)
        scores['BrillTagger'].append(brill_score)

    for k, v in scores.items():
        # average scores across folds
        if v:
            scores[k] = sum(v) / len(v)
            print(k, ": {:2.2%}".format(scores[k]))
    return scores
def find_combined_taggers_accuracy(train_set, test_set):
    # finding most used tag
    train_words = [word for sent in train_set for word in sent]
    train_set_tags = [tag for (word, tag) in train_words]
    most_frequent_tag = FreqDist(train_set_tags).max()
    default_tagger = DefaultTagger(most_frequent_tag)

    # default tagger
    default_tagger_result = default_tagger.evaluate(test_set)
    print("Default Tagger accuracy: ", default_tagger_result)

    # regex tagger
    patterns = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                      # nouns (default)
    ]
    regex_tagger = RegexpTagger(patterns)
    regex_tagger_result = regex_tagger.evaluate(test_set)
    print("Regex Tagger Accuracy: ", regex_tagger_result)

    # unigram tagger with default tagger as backoff
    unigram_tagger = UnigramTagger(train_set, backoff=default_tagger)
    unigram_tagger_result = unigram_tagger.evaluate(test_set)
    print("Unigram Tagger accuracy (Backoff = Default Tagger): ", unigram_tagger_result)

    # bigram tagger with different backoffs
    bigram_tagger = BigramTagger(train_set)
    bigram_tagger_backoff_unigram = BigramTagger(train_set, backoff=unigram_tagger)
    bigram_tagger_backoff_regex = BigramTagger(train_set, backoff=regex_tagger)
    bigram_tagger_result = bigram_tagger.evaluate(test_set)
    bigram_tagger_backoff_regex_result = bigram_tagger_backoff_regex.evaluate(test_set)
    bigram_tagger_backoff_unigram_result = bigram_tagger_backoff_unigram.evaluate(test_set)
    print("Bigram Tagger Accuracy: ", bigram_tagger_result)
    print("Bigram Tagger Accuracy (Backoff = Regex Tagger): ", bigram_tagger_backoff_regex_result)
    print("Bigram Tagger Accuracy (Backoff = Unigram Tagger): ", bigram_tagger_backoff_unigram_result)
def train_tagger():
    """
    This function trains the tagger
    """
    print("Training POS tagger...")
    # https://github.com/japerk/nltk3-cookbook/blob/master/chapter4.py
    tagged_sentences = treebank.tagged_sents()
    size = int(len(tagged_sentences) * 0.9)
    train_sents = tagged_sentences[:size]
    # NB: this test slice overlaps the training slice above
    test_sents = tagged_sentences[3000:]

    default = DefaultTagger("NN")
    tagger = ClassifierBasedPOSTagger(
        train=train_sents, backoff=default, cutoff_prob=0.3
    )
    print(tagger.evaluate(test_sents))  # 0.9613641269156055

    # save model to pickle file as binary
    file_name = MODEL_PATH + "tag_model.pkl"
    with open(file_name, "wb") as fout:
        pickle.dump(tagger, fout)

    print("model written to: " + file_name)
    print("")
    return tagger
def train(self, sentence_list):
    """Trains the tagger from the tagged sentences provided"""
    noun_fallback = DefaultTagger('NN')
    affix_fallback = AffixTagger(sentence_list, backoff=noun_fallback)
    unigram_fallback = UnigramTagger(sentence_list, backoff=affix_fallback)
    bigram_fallback = BigramTagger(sentence_list, backoff=unigram_fallback)
    trigram_fallback = TrigramTagger(sentence_list, backoff=bigram_fallback)
    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1))
    ]
    trainer = brill.FastBrillTaggerTrainer(trigram_fallback, templates)
    self.tagger = trainer.train(sentence_list, max_rules=100, min_score=3)
def train(train_sentences):
    print("- Default Tagger")
    default_tagger = DefaultTagger('NC')

    print("- Unigram Tagger")
    unigram_tagger = UnigramTagger(train_sentences, backoff=default_tagger)

    print("- Templates")
    # These templates define the features to be used for the brill tagger,
    # relative to the word position.
    Template._cleartemplates()
    templates = [
        Template(Pos([-1])),
        Template(Pos([-1]), Word([0])),
        Template(Pos([-2])),
        Template(Pos([-2]), Word([0])),
        Template(Pos([1])),
    ]

    print("- Brill Tagger")
    tt = BrillTaggerTrainer(unigram_tagger, templates, trace=1)
    tagger = tt.train(train_sentences, max_rules=1000)

    print("- Done.")
    return tagger
def tag(self, sent, tagregex=True, deftag='XX', verbose=False):
    kalimat = sent.encode('utf-8')
    text = self.regexTokenizer(kalimat.lower().strip())

    # :> --___<<IMPORTANT>>___--
    # In some cases the default tagger must be left as 'XX'
    # for the purpose of entity identification.
    backoff_tagger = DefaultTagger(deftag)

    if tagregex:
        regexp_tagger = RegexpTagger(patterns.regex_patterns, backoff=backoff_tagger)
        unigram_tagger = UnigramTagger(self.reader_train, backoff=regexp_tagger)
    else:
        unigram_tagger = UnigramTagger(self.reader_train, backoff=backoff_tagger)

    bigram_tagger = BigramTagger(self.reader_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(self.reader_train, backoff=bigram_tagger)

    # Using the Indonesian PAN Localization dataset "UI-1M-tagged.txt",
    # the tagging pipeline above reaches the following accuracy:
    #   with the regex tagger:    77%
    #   without the regex tagger: > 90%
    if verbose:
        # The larger the document, the longer the accuracy computation takes;
        # recommended for testing only.
        print("Calculating Tagger Accuracy...")
        self.tagAccuracy = trigram_tagger.evaluate(self.test_sent)
        print("Accuracy is: %s" % (self.tagAccuracy))

    return trigram_tagger.tag(text)
def train():
    try:
        input = open('tagger.pkl', 'rb')
        print("Found tagger")
        tagger = load(input)
        input.close()
    except IOError:
        print('Training:')
        train_sents = brown.tagged_sents()[:50000]
        test_sents = brown.tagged_sents()[50000:]
        tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]
        tagger = backoff_tagger(train_sents, tagger_classes,
                                backoff=DefaultTagger('unseen'))
        print('Finished training, tagger accuracy:')
        print(tagger.evaluate(test_sents))
        output = open('tagger.pkl', 'wb')
        dump(tagger, output, -1)
        output.close()
    return tagger
def __init__(self):
    self.backoff = self.backoff_tagger(backoff=DefaultTagger('NN'))
    self.st = StanfordNERTagger(
        'stanfordNERJars/classifiers/english.all.3class.distsim.crf.ser.gz',
        'stanfordNERJars/stanford-ner.jar',
        encoding='utf-8')
    if os.path.exists("out/"):
        shutil.rmtree('out/')
def ngramtagging(train):
    # POS tagging process
    train_data = []
    train_data.append(train)
    backoff_tagger = DefaultTagger('nn')
    unigram_tagger = UnigramTagger(train_data, backoff=backoff_tagger)
    bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
    return trigram_tagger
def tag_words(words, tag):
    """
    Associates a tag with words.

    Parameters
    ----------
    words: A list of strings.
    tag: A str.

    Returns
    -------
    A list of tuples of (str, str)
    """
    default_tagger = DefaultTagger(tag)
    tags = default_tagger.tag(words)
    return tags
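# Usage sketch: DefaultTagger assigns the same tag to every token, so
#
#   >>> tag_words(['Beautiful', 'morning'], 'NN')
#   [('Beautiful', 'NN'), ('morning', 'NN')]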
def train_tagger(tagger_name):
    train_sents = treebank.tagged_sents()[:5000]
    if tagger_name == "TnT" or tagger_name == 'tagger':
        trained_tagger = tnt.TnT()
        trained_tagger.train(train_sents)
    else:
        tagger1 = DefaultTagger('NN')
        tagger2 = TrigramTagger(train_sents, backoff=tagger1)
        tagger3 = BigramTagger(train_sents, backoff=tagger2)
        trained_tagger = UnigramTagger(train_sents, backoff=tagger3)
    return trained_tagger
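# Note: the else-branch above consults the UnigramTagger first and only falls
# back to the bigram/trigram taggers (and finally 'NN'). The more common
# arrangement is the reverse, so that the widest context is tried first, e.g.:
#
#   t = DefaultTagger('NN')
#   t = UnigramTagger(train_sents, backoff=t)
#   t = BigramTagger(train_sents, backoff=t)
#   t = TrigramTagger(train_sents, backoff=t)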
def baseline(tagged_sentences):
    from nltk.tag import UnigramTagger
    from nltk.tag import DefaultTagger
    from collections import Counter

    # lowercase everything
    # remove all instances of non-universal tags for proper comparison with
    # the other methods
    new_tagged_sentences = []
    for sent in tagged_sentences:
        sent = [(x[0].lower(), x[1]) for x in sent]
        sent = [x for x in sent if x[1] in _UNI]
        new_tagged_sentences.append(sent)
    tagged_sentences = new_tagged_sentences

    # size of corpus
    corpus_size = sum([len(sent) for sent in tagged_sentences])
    print('Corpus size: {} docs'.format(len(tagged_sentences)))
    print('Corpus size: {} tokens'.format(corpus_size))

    # train/test split
    test_pct = 0.3
    test_len = int(len(tagged_sentences) * test_pct)
    test_idx = len(tagged_sentences) - test_len
    train_set = tagged_sentences[:test_idx]
    test_set = tagged_sentences[test_idx:]
    print('Train set: {} docs'.format(len(train_set)))
    print('Test set: {} docs'.format(len(test_set)))

    # calculate test set size in tokens
    test_size = sum([len(sent) for sent in test_set])
    print('Test set: {} tokens'.format(test_size))

    # calculate most common tag in the train set
    # this should be 'NOUN'
    tag_dist = []
    for sent in train_set:
        tag_dist += [x[1] for x in sent]
    counts = Counter()
    counts.update(tag_dist)
    most_common = counts.most_common(1)[0][0]
    print('Most common tag: {}'.format(most_common))

    # Create model
    backoff = DefaultTagger(most_common)
    tagger = UnigramTagger(train=train_set, backoff=backoff, cutoff=5)

    # Evaluate
    acc = tagger.evaluate(test_set)
    print('Baseline: {}'.format(acc))
def lexical(tokens):
    print("\n")
    print("Step 2: Lexical Analysis\n")
    print("Essentially refers to the dictionary and obtains the properties of the word")
    print("Part-Of-Speech tagging")
    print("The tagset is:\n")
    tag = DefaultTagger('NN')
    tagg = UnigramTagger(train_sent, backoff=tag)
    tagger = BigramTagger(train_sent, backoff=tagg)
    tagtokens = tagger.tag(tokens)
    for token, tag in tagtokens:
        print(token + "->" + tag)
    print("\n")
    print("The accuracy of the trained POS tagger is:")
    print(tagger.evaluate(test_sents))
    return tagtokens
def wordTagger(self, wordlist, number):
    train_sents = treebank.tagged_sents()[:3000]
    if number == 1:
        taglist = nltk.pos_tag(wordlist)
    elif number == 2:
        tagger = DefaultTagger('NN')
        taglist = tagger.tag(wordlist)
    elif number == 3:
        tagger = UnigramTagger(train_sents)
        taglist = tagger.tag(wordlist)
    elif number == 4:
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        taglist = tnt_tagger.tag(wordlist)
    elif number == 5:
        tagger = ClassifierBasedPOSTagger(train=train_sents)
        taglist = tagger.tag(wordlist)
    return taglist
def tag_linked(words, default_tag='INFO'):
    """
    Tokenizes text by using a Penn Treebank tagged sentence and word tokenizers.
    Uses DefaultTagger to assign "default_tag" to any element missed by the
    Penn Treebank tagger.

    Parameters
    ----------
    words: A list of strings.
    default_tag: The tag assigned to any token the unigram tagger cannot tag.

    Returns
    -------
    A list of tuples of (str, str)
    """
    default_tagger = DefaultTagger(default_tag)
    pt_tagger = UnigramTagger(treebank.tagged_sents())
    pt_tagger._taggers = [pt_tagger, default_tagger]
    tags = pt_tagger.tag(words)
    return tags
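# Note: assigning to the private _taggers list mimics wiring up the backoff
# after training. The equivalent public API is to pass the backoff at
# construction time, which also lets NLTK skip storing entries the backoff
# already gets right:
#
#   pt_tagger = UnigramTagger(treebank.tagged_sents(), backoff=default_tagger)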
def train_tagger():
    '''
    An example of training a POS tagger using a probability-based
    trigram model. A POS tagger identifies the syntactic class of each word.
    E.g.: Isso é um teste = Isso-PROSUB é-V um-ART teste-N
    (substantive pronoun, verb, article, noun)
    '''
    # Load a Portuguese dataset whose sentences were manually tagged
    data = [
        [(w, re.split('[|-]', tag)[0]) for w, tag in sent]
        for sent in mac_morpho.tagged_sents()]

    # Default syntactic class; 'N' means noun
    tagger0 = DefaultTagger('N')
    print('training unigram')
    tagger1 = UnigramTagger(data, backoff=tagger0)
    print('training bigram')
    tagger2 = BigramTagger(data, backoff=tagger1)
    print('training trigram')
    return TrigramTagger(data, backoff=tagger2)
        brill.Template(brill.Pos([-1]), brill.Pos([1])),
        brill.Template(brill.Word([-1])),
        brill.Template(brill.Word([1])),
        brill.Template(brill.Word([-2])),
        brill.Template(brill.Word([2])),
        # you can look at the combination of the previous two words to learn a transformation rule
        brill.Template(brill.Word([-2, -1])),
        brill.Template(brill.Word([1, 2])),
        brill.Template(brill.Word([-3, -2, -1])),
        brill.Template(brill.Word([1, 2, 3])),
        brill.Template(brill.Word([-1]), brill.Word([1])),
    ]
    trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates, deterministic=True)
    return trainer.train(train_sents, **kwargs)


defaultTagger = DefaultTagger('NN')
initialTagger = backoff_tagger(brown_train_sents,
                               [UnigramTagger, BigramTagger, TrigramTagger],
                               backoff=defaultTagger)
brillTagger = train_brill_tagger(initialTagger, brown_train_sents)

tnt_tagger = tnt.TnT(N=100)
tnt_tagger.train(brown_train_sents)

bigramTagger = BigramTagger(brown_train_sents)
trigramTagger = TrigramTagger(brown_train_sents)

print("------------Recommended Tagger------------")
print(nltk.pos_tag(sent))
print("------------Default Tagger------------")
print(defaultTagger.tag(sent))
def backoff_tagger(train_sents, tagger_classes):
    backoff = DefaultTagger('NN')
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff
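# Usage sketch (assumes the treebank corpus and the n-gram tagger classes are
# importable; each class in the list wraps the previous one as its backoff):
#
#   from nltk.corpus import treebank
#   from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
#   tagger = backoff_tagger(treebank.tagged_sents()[:3000],
#                           [UnigramTagger, BigramTagger, TrigramTagger])
#   print(tagger.tag(['A', 'quick', 'test']))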
def indivDefault(bambara):
    default = DefaultTagger('n')
    print(default.evaluate(bambara.test_sents))
    return default
import nltk
from nltk.tag import DefaultTagger

tag = DefaultTagger('NN')
print(tag.tag(['Beautiful', 'morning']))
from nltk.tag import tnt, DefaultTagger
import pickle

datas = open('Indonesian_Manually_Tagged_Corpus.tsv', 'r').read()
datas = datas.split('\n\n')

train_sents = []
for data in datas:
    train_sents.append(list(tuple(i.split('\t')) for i in data.split('\n')))

unk = DefaultTagger('NN')
tnt_tagger = tnt.TnT(unk=unk, Trained=True)
tnt_tagger.train(train_sents)

tagger_file = open("indonesian_tnt_pos_tag.pickle", "wb")
pickle.dump(tnt_tagger, tagger_file)
tagger_file.close()
    ('BASIS', 'BASIS'),
    ('CORPORATE', 'CORPORATE'),
    ('OTHER|other', 'OTHER'),
    ('LAST|last', 'LAST'),
    (r'POS', 'POINT_OF_SALE'),
    (r'AFTERCOMPANY|CORPORATE|COMPANY', 'CORPORATE'),
    (r'DISCOUNT|COMMISSIO|COMMISSIONS|COMISSION|discount|discounts|Commissionable|commissionable', 'DISCOUNT'),
    (r'ASIA|Asia|asia|AISA|亚洲', 'ASIA'),
    (r'NORTH|North|north', 'NORTH'),
    (r'SOUTH|South|south', 'SOUTH')
    # ('TICKET','VALIDITY')
]

# add learning loop here for tags
def_tagger = DefaultTagger('NN')
prelim_def_tagger = DefaultTagger(None)
backoff = RegexpTagger([
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'(The|the|A|a|An|an)$', 'AT'),  # articles
    (r'.*able$', 'JJ'),               # adjectives
    (r'.*ness$', 'NN'),               # nouns formed from adjectives
    (r'.*ly$', 'RB'),                 # adverbs
    (r'.*s$', 'NNS'),                 # plural nouns
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # past tense verbs
    (r'is|was|are|were', 'VBZ'),      # verb to be
    (r'"', 'QT'),                     # quote
    (r'.*', 'NN')                     # nouns (default)
def setUp(self):
    self.corpus = brown.tagged_sents()[:35]
    self.decoder = JSONTaggedDecoder()
    self.encoder = JSONTaggedEncoder()
    self.default_tagger = DefaultTagger("NN")
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import treebank
from nltk.corpus import wordnet as wn
from os.path import isfile, join
from os import listdir
from pprint import pprint
import gensim.downloader as api
import re
import nltk
import os

TEST_PATH = '../test/untagged'
COMMON_WORDS_PATH = '../resources/1-1000.txt'
TRAINING_SENTS = treebank.tagged_sents()
UNIGRAM = UnigramTagger(TRAINING_SENTS, backoff=DefaultTagger('NN'))
BIGRAM = BigramTagger(TRAINING_SENTS, backoff=UNIGRAM)
TRIGRAM = TrigramTagger(TRAINING_SENTS, backoff=BIGRAM)
STOPWORDS = set(nltk.corpus.stopwords.words('english'))
WORD_VECTORS = api.load("glove-wiki-gigaword-100")
TEST_FILES = [f for f in listdir(TEST_PATH) if isfile(join(TEST_PATH, f))]

# Manual list of words to be considered "irrelevant"
IRRELEVANT_WORDS = ["talk", "seminar", "lecture"]

# manually created ontology tree, which is later extended
TREE = {"science": {}, "maths": {}, "engineering": {}, "medicine": {}}

# code to convert POS tags into the right form for lemmatization
# https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word
# word_tokenize('Hello World.')  # loads the pickled tokenizer

from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
word_list = tokenizer.tokenize(sentence)

#--------------------------------------------------------------------------------
# Parts of Speech
#--------------------------------------------------------------------------------

# Default tagging
from nltk.tag import DefaultTagger

# if all else fails, make an unknown word a noun ("NN")
default_tagger = DefaultTagger('NN')

# try it.
tagged_sentence = default_tagger.tag(word_list)

# Can also batch tag, but need a list of sentences, each already tokenized.
# tagger.batch_tag([['Hello', 'world', '.'], ['How', 'are', 'you', '?']])

#--------------------------------------------------------------------------------
# Training taggers
#--------------------------------------------------------------------------------

# so far so good. Next we have to train taggers.

# Unigram, training on the Treebank corpus
from nltk.tag import UnigramTagger
# every tagger has a tag() method.
# DefaultTagger is a subclass of SequentialBackoffTagger, which has a choose_tag() method.
from nltk.tag import DefaultTagger
from nltk.corpus import treebank

tagger = DefaultTagger('NN')
print(tagger.tag(['Hello', 'World']))

# though it's too simple, we can still evaluate it
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))

# for sentences
print(tagger.tag_sents([['Hello', 'World', '.'], ['How', 'are', 'you', '?']]))

# untagging
from nltk.tag import untag
print(untag([('Hello', 'NN'), ('World', 'NN')]))
import nltk
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
# print(train_data[0])

dt = DefaultTagger('NN')
print(dt.evaluate(test_data))

nt = ClassifierBasedPOSTagger(train=train_data,
                              classifier_builder=NaiveBayesClassifier.train)
print(nt.evaluate(test_data))
for page in list(root):
    l = []
    # ElementTree .text values are already str in Python 3, so no decoding is needed
    text = page.find('text').text
    language = page.find('language').text
    pos = page.find('pos_tags').text
    splitText = text.split(" ")[1:-1]
    posText = pos.split(" ")[1:-1]
    for i in range(len(splitText)):
        l.append((splitText[i], posText[i]))
    data.append(l)
    count = count + 1

shuffle(data)

# Divide data into train and test sets (a 90/10 split, despite the variable name)
eightyPercent = count * 0.9
training_set = data[0:int(eightyPercent)]
test_set = data[int(eightyPercent):]

# Train
train_data = training_set
tag1 = DefaultTagger('NN')
tag2 = UnigramTagger(train_data, backoff=tag1)
tag3 = BigramTagger(train_data, backoff=tag2)
tag4 = TrigramTagger(train_data, backoff=tag3)

# Accuracy
# print(tag4.tag('open a start up'.split()))
# print(tag4.tag('OUT nahi KARDO ISSE BAHUT HOGAYA aaj Salman'.split()))
gold_sentences = test_set
print(tag4.evaluate(gold_sentences))
def Tagger():
    # Tagger
    etiq1 = DefaultTagger('N')
    sentencas_treinadoras = mac_morpho.tagged_sents()[::]
    etiq2 = UnigramTagger(sentencas_treinadoras, backoff=etiq1)
    return etiq2
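# Usage sketch (assumes mac_morpho and the taggers are imported at module
# level; the example sentence is Portuguese, matching the mac_morpho corpus):
#
#   from nltk.corpus import mac_morpho
#   from nltk.tag import DefaultTagger, UnigramTagger
#   tagger = Tagger()
#   print(tagger.tag(['Isso', 'é', 'um', 'teste']))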
f.write("\n".join(caps))
f.close()

# adding the tagger
import nltk
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.tag.sequential import ClassifierBasedPOSTagger

default = DefaultTagger('NN')
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]
tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default, cutoff_prob=0.3)
tagger.evaluate(test_sents)

# token = nltk.word_tokenize(title)  # title string tokenized
# removing all the punctuation marks
tagged_sent = tag(sentence)
print(tagged_sent)

# building your own tagger
# preparing the data
from nltk.corpus import treebank
data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
print(train_data[0])

# default tagger
from nltk.tag import DefaultTagger
dt = DefaultTagger('NN')
print(dt.evaluate(test_data))
print(dt.tag(tokens))

# regex tagger
from nltk.tag import RegexpTagger
# define regex tag patterns
patterns = [
    (r'.*ing$', 'VBG'),   # gerunds
    (r'.*ed$', 'VBD'),    # simple past
    (r'.*es$', 'VBZ'),    # 3rd singular present
    (r'.*ould$', 'MD'),   # modals
    (r'.*\'s$', 'NN$'),   # possessive nouns
######### DEFAULT TAGGER ###############
# Assigning the default tag
from nltk.tag import DefaultTagger, untag

tagger = DefaultTagger('NN')
tokens = [['Hello', 'World'], ['How', 'are', 'you', '?']]
print(tagger.tag(tokens))
print(tagger.tag_sents(tokens))

# Untagging
tagged = tagger.tag(tokens)
print(untag(tagged))

# Evaluating the tagger accuracy
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
from nltk.metrics import *
import string
'''import replacer
from replacer import RegexpReplacer
from replacer import RepeatReplacer'''
import linecache
import matplotlib.pyplot as plt

''' Train Tagger '''
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.corpus import treebank

train = treebank.tagged_sents()[:10000]
t0 = DefaultTagger('NN')
t1 = UnigramTagger(train, backoff=t0)
t2 = BigramTagger(train, backoff=t1)

''' Initialize '''
my_corp = web.sents(fileids='firefox.txt')
sent_count = 0
ques_count = 0
All_count = 1
NN_count = 0
NNS_count = 0
NNP_count = 0
VB_count = 0
VBN_count = 0
VBG_count = 0