def train(train_sentences):
    """Train a Brill tagger layered on a unigram tagger with an 'NC' default.

    Fix: converted Python 2 print statements to print() calls, consistent
    with the Python 3 code in the rest of the file (the calls behave the
    same under Python 2 for a single string argument).

    :param train_sentences: list of tagged sentences [[(word, tag), ...], ...]
    :return: the trained BrillTagger
    """
    print("- Default Tagger")
    default_tagger = DefaultTagger('NC')

    print("- Unigram Tagger")
    unigram_tagger = UnigramTagger(train_sentences, backoff=default_tagger)

    print("- Templates")
    # These templates define the features to be used for the brill tagger
    # relatively to the word position.
    Template._cleartemplates()
    templates = [
        Template(Pos([-1])),
        Template(Pos([-1]), Word([0])),
        Template(Pos([-2])),
        Template(Pos([-2]), Word([0])),
        Template(Pos([1])),
    ]

    print("- Brill Tagger")
    tt = BrillTaggerTrainer(unigram_tagger, templates, trace=1)
    tagger = tt.train(train_sentences, max_rules=1000)
    print("- Done.")
    return tagger
def train(self, sentence_list):
    """Trains the tagger from the tagged sentences provided """
    # Sequential backoff chain, innermost first:
    # 'NN' default -> affix -> unigram -> bigram -> trigram.
    fallback = DefaultTagger('NN')
    for tagger_cls in (AffixTagger, UnigramTagger, BigramTagger, TrigramTagger):
        fallback = tagger_cls(sentence_list, backoff=fallback)

    # Brill transformation templates: symmetric tag contexts, then
    # symmetric word contexts (same four widths each), then one fixed
    # before/after pair per rule kind.
    widths = [(1, 1), (2, 2), (1, 2), (1, 3)]
    templates = [
        brill.SymmetricProximateTokensTemplate(rule, span)
        for rule in (brill.ProximateTagsRule, brill.ProximateWordsRule)
        for span in widths
    ]
    templates.append(
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)))
    templates.append(
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)))

    trainer = brill.FastBrillTaggerTrainer(fallback, templates)
    self.tagger = trainer.train(sentence_list, max_rules=100, min_score=3)
def train_tagger():
    """
    Train a ClassifierBasedPOSTagger on the Penn Treebank and pickle it.

    Fix: the test split was previously the hard-coded slice [3000:], which
    overlaps the 90% training slice [:size] (treebank has more than 3000
    tagged sentences), so the printed accuracy was inflated by evaluating
    on training data.  The tagger is now evaluated on the held-out 10%.

    :return: the trained tagger (also written to MODEL_PATH/tag_model.pkl)
    """
    print("Training POS tagger...")
    # https://github.com/japerk/nltk3-cookbook/blob/master/chapter4.py
    tagged_sentences = treebank.tagged_sents()
    size = int(len(tagged_sentences) * 0.9)
    train_sents = tagged_sentences[:size]
    test_sents = tagged_sentences[size:]  # held-out 10%, no overlap with train

    # Classifier-based tagger falling back to 'NN' when its confidence
    # is below cutoff_prob.
    default = DefaultTagger("NN")
    tagger = ClassifierBasedPOSTagger(
        train=train_sents, backoff=default, cutoff_prob=0.3
    )
    print(tagger.evaluate(test_sents))

    # save model to pickle file as binary
    file_name = MODEL_PATH + "tag_model.pkl"
    with open(file_name, "wb") as fout:
        pickle.dump(tagger, fout)
    print("model written to: " + file_name)
    print("")
    return tagger
def tag(self, sent, tagregex=True, deftag='XX', verbose=False):
    """Tokenize *sent* and tag it with a trigram->bigram->unigram backoff chain.

    When *tagregex* is true a RegexpTagger is inserted between the default
    tagger and the unigram tagger.
    """
    kalimat = sent.encode('utf-8')
    text = self.regexTokenizer(kalimat.lower().strip())

    # :> --___<<IMPORTANT>>___--
    # For some use cases the default tag must be left as 'XX',
    # so entities can be identified later.
    chain = DefaultTagger(deftag)
    if tagregex:
        chain = RegexpTagger(patterns.regex_patterns, backoff=chain)
    chain = UnigramTagger(self.reader_train, backoff=chain)
    chain = BigramTagger(self.reader_train, backoff=chain)
    trigram_tagger = TrigramTagger(self.reader_train, backoff=chain)

    # Using the PAN localization Indonesian dataset "UI-1M-tagged.txt",
    # the tagging pipeline above reaches roughly:
    #   with the regex tagger:    77% accuracy
    #   without the regex tagger: > 90% accuracy
    if verbose:
        # The larger the document, the longer the accuracy computation
        # takes; recommended for testing only.
        print("Calculating Tagger Accuracy...")
        self.tagAccuracy = trigram_tagger.evaluate(self.test_sent)
        print("Accuracy is: %s" % (self.tagAccuracy))
    return trigram_tagger.tag(text)
def train():
    """Load a pickled backoff tagger from 'tagger.pkl', or train one on the
    Brown corpus and cache it if the pickle does not exist.

    Fixes: the output file handle was never closed (the final statement was
    a bare ``output`` expression instead of ``output.close()``), and the
    input handle shadowed the builtin ``input``.  Both files now use
    context managers.

    :return: the loaded or freshly trained tagger
    """
    try:
        with open('tagger.pkl', 'rb') as infile:
            print("Found tagger")
            tagger = load(infile)
    except IOError:
        print('Training:')
        train_sents = brown.tagged_sents()[:50000]
        test_sents = brown.tagged_sents()[50000:]
        tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]
        tagger = backoff_tagger(train_sents, tagger_classes,
                                backoff=DefaultTagger('unseen'))
        print('Finished training, tagger accuracy:')
        print(tagger.evaluate(test_sents))
        # Cache the trained tagger with the highest pickle protocol (-1).
        with open('tagger.pkl', 'wb') as output:
            dump(tagger, output, -1)
    return tagger
def getDefaultTaggerAccuracy(testingSet):
    """Return the accuracy (as a percentage) of a DefaultTagger('NN') on
    *testingSet*, a list of sentences of (word, tag) pairs.

    A DefaultTagger assigns 'NN' to every token, so its accuracy is simply
    the share of gold tags equal to 'NN'.  The original code also built the
    tagger and tagged every sentence, then discarded the result (its own
    comment called this unnecessary) — that dead work is removed.  An empty
    testing set now returns 0.0 instead of raising ZeroDivisionError.
    """
    # Flatten to the gold POS tags only; the words are irrelevant here.
    goldPOSTags = [taggedWord[1] for sentence in testingSet for taggedWord in sentence]
    totalTags = len(goldPOSTags)
    if totalTags == 0:
        # no tags to score — avoid ZeroDivisionError
        return 0.0
    # 'NN' was chosen as it is the default shown in the nltk DefaultTagger
    # documentation, completely arbitrary.
    matches = sum(1 for tag in goldPOSTags if tag == 'NN')
    accuracy = (matches / totalTags) * 100
    return accuracy
def find_accuracy(train_set, test_set):
    """Baseline accuracy: tag everything with the training set's most
    frequent tag and evaluate on *test_set*."""
    # skal alt her være test-set? (original note: should this use the test set?)
    tags_in_train = [tag for sentence in train_set for (_, tag) in sentence]
    majority_tag = FreqDist(tags_in_train).max()
    baseline_tagger = DefaultTagger(majority_tag)
    return baseline_tagger.evaluate(test_set)
def ngramtagging(train):
    """POS-tagging step: build a trigram tagger over the single training
    document *train*, backing off through bigram and unigram models down
    to a constant 'nn' tag."""
    training_corpus = [train]
    tagger = DefaultTagger('nn')
    for ngram_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        tagger = ngram_cls(training_corpus, backoff=tagger)
    return tagger
def test_default_tagger(self):
    """Baseline check: print DefaultTagger('N') accuracy on a 90/10 split
    of the sentences in test.tsv."""
    sentences = make_sentence_list(path.join(self.test_dir, 'test.tsv'))
    tagger = DefaultTagger('N')
    cutoff = int(len(sentences) * .90)
    print(tagger.evaluate(sentences[:cutoff]))
    print(tagger.evaluate(sentences[cutoff:]))
def __init__(self):
    # Backoff chain built by self.backoff_tagger, terminating in a
    # DefaultTagger that labels any unknown token 'NN'.
    self.backoff = self.backoff_tagger(backoff=DefaultTagger('NN'))
    # Stanford NER tagger loaded from the bundled 3-class English CRF
    # model (local jar + classifier paths).
    self.st = StanfordNERTagger(
        'stanfordNERJars/classifiers/english.all.3class.distsim.crf.ser.gz',
        'stanfordNERJars/stanford-ner.jar',
        encoding='utf-8')
    # Start from a clean output directory on every run.
    if os.path.exists("out/"):
        shutil.rmtree('out/')
def train_tagger(tagger_name):
    """Return a tagger trained on the first 5000 Treebank sentences.

    "TnT" or "tagger" selects an nltk TnT tagger; any other name builds a
    unigram tagger whose backoff chain is trigram -> bigram -> 'NN'.
    NOTE(review): that backoff order is unusual (unigram outermost) but is
    preserved exactly as written.
    """
    train_sents = treebank.tagged_sents()[:5000]
    if tagger_name in ("TnT", "tagger"):
        trained_tagger = tnt.TnT()
        trained_tagger.train(train_sents)
        return trained_tagger
    fallback = DefaultTagger('NN')
    fallback = TrigramTagger(train_sents, backoff=fallback)
    fallback = BigramTagger(train_sents, backoff=fallback)
    return UnigramTagger(train_sents, backoff=fallback)
def baseline(tagged_sentences):
    """Majority-tag + unigram baseline: lowercase the corpus, keep only
    universal (_UNI) tags, train a UnigramTagger backed by the most common
    training tag, and print the evaluation on a 30% test split."""
    from nltk.tag import UnigramTagger
    from nltk.tag import DefaultTagger
    from collections import Counter

    # Lowercase every word and drop non-universal tags so the comparison
    # with the other methods is fair.
    tagged_sentences = [
        [(word.lower(), tag) for (word, tag) in sent if tag in _UNI]
        for sent in tagged_sentences
    ]

    # corpus size
    corpus_size = sum(len(sent) for sent in tagged_sentences)
    print('Corpus size: {} docs'.format(len(tagged_sentences)))
    print('Corpus size: {} tokens'.format(corpus_size))

    # 70/30 train/test split
    test_pct = 0.3
    test_len = int(len(tagged_sentences) * test_pct)
    test_idx = len(tagged_sentences) - test_len
    train_set = tagged_sentences[:test_idx]
    test_set = tagged_sentences[test_idx:]
    print('Train set: {} docs'.format(len(train_set)))
    print('Test set: {} docs'.format(len(test_set)))
    test_size = sum(len(sent) for sent in test_set)
    print('Test set: {} tokens'.format(test_size))

    # most common tag in the train set — expected to be 'NOUN'
    counts = Counter(tag for sent in train_set for (_, tag) in sent)
    most_common = counts.most_common(1)[0][0]
    print('Most common tag: {}'.format(most_common))

    # model: unigram tagger falling back to the majority tag
    backoff = DefaultTagger(most_common)
    tagger = UnigramTagger(train=train_set, backoff=backoff, cutoff=5)

    # evaluate
    acc = tagger.evaluate(test_set)
    print('Baseline: {}'.format(acc))
def evaluate_nltk_pos_taggers(gold_standard_filename, num_folds=10, loo=False):
    """
    Evaluates the NLTK backoff taggers on the corpus data. Uses cross-validation.
    :param gold_standard_filename: tsv file of format: word \t POS \n
    :param num_folds: int: number of folds for cross-validation
    :param loo: bool: whether to use Leave One Out cross-validation
    :return:
    """
    tagged_sents = make_sentence_list(gold_standard_filename)
    backoff = DefaultTagger('N')
    tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]
    # Per-tagger fold scores, averaged at the end.
    scores = {
        'DefaultTagger': [],
        'UnigramTagger': [],
        'BigramTagger': [],
        'TrigramTagger': [],
        'BrillTagger': [],
    }
    # k-fold cross-validation
    if loo:
        # Leave One Out cross-validation
        num_folds = len(tagged_sents)-1
    subset_size = int(len(tagged_sents) / num_folds)
    for i in range(num_folds):
        # training and testing data for this round: fold i is the test
        # slice, everything around it is training data.
        X_test = tagged_sents[i * subset_size:][:subset_size]
        X_train = tagged_sents[:i * subset_size] + tagged_sents[(i + 1) * subset_size:]
        # compute score for taggers
        # NOTE(review): the default tagger is evaluated on X_train, not
        # X_test — looks unintentional; confirm before relying on it.
        default_score = backoff.evaluate(X_train)
        # backoff_tagger here is a project helper that also takes the test
        # fold and returns (trained trigram tagger, per-level scores).
        trigram, tagger_scores = backoff_tagger(X_train, X_test, tagger_classes, backoff=backoff)
        uni_score, bi_score, tri_score = tagger_scores
        # Brill tagger trained on top of the trigram chain.
        brill_tagger = train_brill_tagger(trigram, X_train)
        brill_score = brill_tagger.evaluate(X_test)
        brill_tagger.print_template_statistics(printunused=False)
        # save scores
        scores['DefaultTagger'].append(default_score)
        scores['UnigramTagger'].append(uni_score)
        scores['BigramTagger'].append(bi_score)
        scores['TrigramTagger'].append(tri_score)
        scores['BrillTagger'].append(brill_score)
    for k, v in scores.items():
        # average scores across folds
        if v:
            scores[k] = sum(v)/len(v)
            print(k, ": {:2.2%}".format(scores[k]))
    return scores
def find_combined_taggers_accuracy(train_set, test_set):
    """Print the accuracies of default, regex, unigram, and bigram taggers
    (with several backoff combinations) evaluated on *test_set*."""
    # majority tag across all training tokens
    flat_tags = [t for sentence in train_set for (_, t) in sentence]
    most_frequent_tag = FreqDist(flat_tags).max()
    default_tagger = DefaultTagger(most_frequent_tag)

    # default tagger
    print("Default Tagger accuracy: ", default_tagger.evaluate(test_set))

    # regex tagger
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]
    regex_tagger = RegexpTagger(patterns)
    print("Regex Tagger Accuracy: ", regex_tagger.evaluate(test_set))

    # unigram tagger with default tagger as backoff
    unigram_tagger = UnigramTagger(train_set, backoff=default_tagger)
    print("Unigram Tagger accuracy (Backoff = Default Tagger): ",
          unigram_tagger.evaluate(test_set))

    # bigram taggers with different backoffs
    plain_bigram = BigramTagger(train_set)
    bigram_over_unigram = BigramTagger(train_set, backoff=unigram_tagger)
    bigram_over_regex = BigramTagger(train_set, backoff=regex_tagger)
    print("Bigram Tagger Accuracy: ", plain_bigram.evaluate(test_set))
    print("Bigram Tagger Accuracy (Backoff = Regex Tagger): ",
          bigram_over_regex.evaluate(test_set))
    print("Bigram Tagger Accuracy (Backoff = Unigram Tagger): ",
          bigram_over_unigram.evaluate(test_set))
def tag_words(words, tag):
    """
    Associates a tag with words.

    Parameters
    ----------
    words: A list of strings.

    tag: A str.

    Returns
    -------
    A list of tuples of (str, str)
    """
    return DefaultTagger(tag).tag(words)
def wordTagger(self, wordlist, number):
    """Tag *wordlist* with a strategy selected by *number*:
    1 = nltk.pos_tag, 2 = DefaultTagger('NN'), 3 = UnigramTagger,
    4 = TnT, 5 = ClassifierBasedPOSTagger.  Any other value fails with
    UnboundLocalError, matching the original behaviour."""
    train_sents = treebank.tagged_sents()[:3000]
    if number == 1:
        tagged = nltk.pos_tag(wordlist)
    elif number == 2:
        tagged = DefaultTagger('NN').tag(wordlist)
    elif number == 3:
        tagged = UnigramTagger(train_sents).tag(wordlist)
    elif number == 4:
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        tagged = tnt_tagger.tag(wordlist)
    elif number == 5:
        tagged = ClassifierBasedPOSTagger(train=train_sents).tag(wordlist)
    return tagged
def lexical(tokens):
    """Step 2 of the pipeline: POS-tag *tokens* using a bigram tagger that
    backs off to a unigram tagger and finally to a constant 'NN' tag.

    Fix: converted Python 2 print statements to print() calls, consistent
    with the Python 3 code elsewhere in the file; all printed strings are
    unchanged.  Relies on the module-level train_sent / test_sents corpora.
    """
    print("\n")
    print("Step 2: Lexical Analysis\n")
    print("Essentially refers to dictionary and obtains the properties of the word")
    print("Part-Of-Speech tagging")
    print("The tagset is:\n")
    tag = DefaultTagger('NN')
    tagg = UnigramTagger(train_sent, backoff=tag)
    tagger = BigramTagger(train_sent, backoff=tagg)
    tagtokens = tagger.tag(tokens)
    for token, tag in tagtokens:
        print(token + "->" + tag)
    print("\n")
    print("The acurracy of the trained pos tagger is:")
    print(tagger.evaluate(test_sents))
    return tagtokens
def tag_linked(words, default_tag='INFO'):
    """
    Tokenizes text by using a Penn Treebank tagged sentence and word tokenizers.
    Uses DefaultTagger to assign "default_tag" to any element missed by Penn
    Treebank tagger.

    Parameters
    ----------
    words: A list of strings.
    default_tag: str, tag assigned to tokens unknown to the unigram model.

    Returns
    -------
    A list of tuples of (str, str)
    """
    # Fix: pass the fallback through the public `backoff` parameter instead
    # of assigning the private `_taggers` attribute by hand; the resulting
    # backoff chain [unigram, default] is identical.
    default_tagger = DefaultTagger(default_tag)
    pt_tagger = UnigramTagger(treebank.tagged_sents(), backoff=default_tagger)

    tags = pt_tagger.tag(words)
    return tags
def train_tagger():
    '''
    Example of training a syntactic (POS) tagger using a probability-based
    trigram model.

    A POS tagger identifies the grammatical class of each word, e.g.:
        Isso é um teste = Isso-PROSUB é-V um-ART teste-N
        (pronoun, verb, article, noun)
    '''
    # Load a Portuguese dataset of manually tagged sentences, keeping only
    # the primary tag (the text before the first '|' or '-').
    data = [
        [(w, re.split('[|-]', tag)[0]) for w, tag in sent]
        for sent in mac_morpho.tagged_sents()
    ]

    # Default syntactic class: N means noun.
    chain = DefaultTagger('N')
    print('train unigram')
    chain = UnigramTagger(data, backoff=chain)
    print('training bigram')
    chain = BigramTagger(data, backoff=chain)
    print('training trigram')
    return TrigramTagger(data, backoff=chain)
from nltk.metrics import *
import string
'''import replacer
from replacer import RegexpReplacer
from replacer import RepeatReplacer'''
import linecache
import matplotlib.pyplot as plt
'''
Train Tagger
'''
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.corpus import treebank

# Bigram -> unigram -> 'NN' backoff chain trained on the first 10000
# Treebank sentences.
train = treebank.tagged_sents()[:10000]
t0 = DefaultTagger('NN')
t1 = UnigramTagger(train, backoff=t0)
t2 = BigramTagger(train, backoff=t1)
'''
Initialize
'''
# NOTE(review): `web` is not imported in this chunk — presumably an alias
# for nltk.corpus.webtext defined elsewhere in the file; confirm.
my_corp = web.sents(fileids='firefox.txt')
# Counters for the sentence/question/POS statistics accumulated later.
sent_count = 0
ques_count = 0
All_count = 1
NN_count = 0
NNS_count = 0
NNP_count = 0
VB_count = 0
VBN_count = 0
VBG_count = 0
('BASIS', 'BASIS'), ('CORPORATE', 'CORPORATE'), ('OTHER|other', 'OTHER'), ('LAST|last', 'LAST'), (r'POS', 'POINT_OF_SALE'), (r'AFTERCOMPANY|CORPORATE|COMPANY', 'CORPORATE'), (r'DISCOUNT|COMMISSIO|COMMISSIONS|COMISSION|discount|discounts|Commissionable|commissionable', 'DISCOUNT'), (r'ASIA|Asia|asia|AISA|亚洲', 'ASIA'), (r'NORTH|North|north', 'NORTH'), (r'SOUTH|South|south', 'SOUTH') # ('TICKET','VALIDITY') ] # add learning loop here for tags def_tagger = DefaultTagger('NN') prelim_def_tagger = DefaultTagger(None) backoff = RegexpTagger( [ (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'AT'), # articles (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'is|was|are|were', 'VBZ'), # verb to be (r'"', 'QT'), # quote (r'.*', 'NN') # nouns (default)
def Tagger():
    """Build a unigram tagger over the full mac_morpho corpus, falling back
    to a constant 'N' (noun) tag for unknown words."""
    fallback = DefaultTagger('N')
    sentencas_treinadoras = mac_morpho.tagged_sents()[::]
    return UnigramTagger(sentencas_treinadoras, backoff=fallback)
# Penn Treebank split: first 3000 tagged sentences for training.
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]
# Brown corpus split for a second TnT tagger.
train_brown = nltk.corpus.brown.tagged_sents()[0:5000]
test_brown = nltk.corpus.brown.tagged_sents()[5000:]

from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger


def backoff_tagger(train_sents, tagger_classes, backoff=None):
    """Chain each class in *tagger_classes* over *train_sents*, each level
    using the previous tagger as backoff; returns the outermost tagger."""
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff


# Unigram -> bigram -> trigram chain seeded with a constant-'NN' tagger.
backoff = DefaultTagger('NN')
btagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=backoff)
#print tagger.evaluate(test_sents)

# TnT taggers trained on Treebank and Brown respectively.
tnt_tagger = nltk.tag.tnt.TnT()
tnt_tagger.train(train_sents)
t_tagger_brown = nltk.tag.tnt.TnT()
t_tagger_brown.train(train_brown)


def readEssays(filename):
    # NOTE(review): this definition appears truncated in this chunk — only
    # the file-open line is visible; the rest of the body is elsewhere.
    infile = open(filename, 'r')
def backoff_tagger(train_sents, tagger_classes):
    """Train each tagger class in *tagger_classes* on *train_sents*, each
    level backing off to the previous one; the chain is seeded with a
    constant-'NN' DefaultTagger.  Returns the outermost tagger."""
    chained = DefaultTagger('NN')
    for tagger_cls in tagger_classes:
        chained = tagger_cls(train_sents, backoff=chained)
    return chained
# Normalise the tag sets of all three splits (update_tags is defined
# elsewhere in the file).
train_sents = update_tags(train_sents)
val_sents = update_tags(val_sents)
test_sents = update_tags(test_sents)
"""
# =============================================================================
# finalise a sequential tagger
# =============================================================================
"""
"""
1. run tagger with different corpus size (50% and 100%)
"""
# backoff tagger
tag1_eval = dict()
# train with backoff and Brill
tic()
# Sequential backoff chain, innermost first: constant 'NO' tag, then
# suffix (AffixTagger with negative affix_length) models from 1 to 5
# characters, then unigram (cutoff=3), bigram, and trigram models.
tag1_tagger = DefaultTagger('NO')
tag1_tagger = AffixTagger(train_sents, affix_length=-1, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-2, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-3, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-4, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-5, backoff=tag1_tagger)
tag1_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag1_tagger)
tag1_tagger = BigramTagger(train_sents, backoff=tag1_tagger)
tag1_tagger = TrigramTagger(train_sents, backoff=tag1_tagger)
# Brill tagger layered on top of the sequential chain (train_brill_tagger
# is a project helper; third argument presumably a trace/verbose flag —
# confirm against its definition).
tag1b_tagger = train_brill_tagger(tag1_tagger, train_sents, True, max_rules=100)
tag1_eval['train_time'] = toc()
# test
tic()
from nltk.tag import tnt, DefaultTagger
import pickle

# Read the corpus: blank-line-separated sentences, one "word<TAB>tag" pair
# per line.  Fix: the corpus file was opened and never closed; both files
# now use context managers so handles are released deterministically.
with open('Indonesian_Manually_Tagged_Corpus.tsv', 'r') as corpus_file:
    datas = corpus_file.read()
datas = datas.split('\n\n')

train_sents = []
for data in datas:
    train_sents.append(list(tuple(i.split('\t')) for i in data.split('\n')))

# Unknown words fall back to a constant 'NN' tag; Trained=True tells TnT
# the unk tagger is already trained.
unk = DefaultTagger('NN')
tnt_tagger = tnt.TnT(unk=unk, Trained=True)
tnt_tagger.train(train_sents)

with open("indonesian_tnt_pos_tag.pickle", "wb") as tagger_file:
    pickle.dump(tnt_tagger, tagger_file)
# Flush the collected lines (`f` and `caps` are defined earlier in the
# file) and close the output file.
f.write("\n".join(caps))
f.close()

#adding the tagger
import nltk
from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.tag.sequential import ClassifierBasedPOSTagger

# Classifier-based POS tagger trained on the first 3000 Treebank
# sentences; falls back to a constant 'NN' tag when the classifier's
# confidence is below cutoff_prob.
default = DefaultTagger('NN')
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]
tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default, cutoff_prob=0.3)
tagger.evaluate(test_sents)  # NOTE(review): evaluation result is discarded

#token = nltk.word_tokenize(title) #title string tokenized
#removing all the punctuation marks
# Python 2 code: build (word, tag) pairs per <page> element from the
# parallel space-separated word and POS streams.
for page in list(root):
    l = []
    text = page.find('text').text.decode('utf8')
    language = page.find('language').text.decode('utf8')
    pos = page.find('pos_tags').text.decode('utf8')
    # Drop the first and last tokens (presumably boundary markers from the
    # export format — confirm against the corpus).
    splitText = text.split(" ")[1:-1]
    posText = pos.split(" ")[1:-1]
    for i in range(len(splitText)):
        l.append((splitText[i], posText[i]))
    data.append(l)
    count = count + 1
shuffle(data)

# Divide data into train and test sets
# NOTE(review): the name says eighty percent but the factor is 0.9.
eightyPercent = count*0.9
training_set = data[0:int(eightyPercent)]
test_set = data[int(eightyPercent):]

# Train: 'NN' default -> unigram -> bigram -> trigram backoff chain.
train_data = training_set
tag1 = DefaultTagger('NN')
tag2 = UnigramTagger(train_data, backoff = tag1)
tag3 = BigramTagger(train_data, backoff = tag2)
tag4 = TrigramTagger(train_data, backoff = tag3)

# Accuracy
# print tag4.tag('open a start up'.encode('utf-8').decode('utf-8').split())
# print tag4.tag('OUT nahi KARDO ISSE BAHUT HOGAYA aaj Salman'.encode('utf-8').decode('utf-8').split())
gold_sentences = test_set
print tag4.evaluate(gold_sentences)
def setUp(self):
    """Build the test fixtures: a constant-'NN' tagger, the JSON tagged
    codec pair, and a 35-sentence Brown corpus sample."""
    self.default_tagger = DefaultTagger("NN")
    self.encoder = JSONTaggedEncoder()
    self.decoder = JSONTaggedDecoder()
    self.corpus = brown.tagged_sents()[:35]
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import treebank
from nltk.corpus import wordnet as wn
from os.path import isfile, join
from os import listdir
from pprint import pprint
import gensim.downloader as api
import re
import nltk
import os

TEST_PATH = '../test/untagged'
COMMON_WORDS_PATH = '../resources/1-1000.txt'

# Trigram -> bigram -> unigram -> 'NN' backoff chain trained on the full
# Treebank sample; built once at import time.
TRAINING_SENTS = treebank.tagged_sents()
UNIGRAM = UnigramTagger(TRAINING_SENTS, backoff=DefaultTagger('NN'))
BIGRAM = BigramTagger(TRAINING_SENTS, backoff=UNIGRAM)
TRIGRAM = TrigramTagger(TRAINING_SENTS, backoff=BIGRAM)

STOPWORDS = set(nltk.corpus.stopwords.words('english'))
# 100-dimensional GloVe word vectors (downloaded by gensim on first use).
WORD_VECTORS = api.load("glove-wiki-gigaword-100")
# Every regular file found in the untagged-test directory.
TEST_FILES = [f for f in listdir(TEST_PATH) if isfile(join(TEST_PATH, f))]

# Manual list of words to be considered "irrelevant"
IRRELEVANT_WORDS = ["talk", "seminar", "lecture"]

# manually created ontology tree, which is later extended
TREE = {"science": {}, "maths": {}, "engineering": {}, "medicine": {}}

# code to convert POS tags into the right form for lemmatization
# https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word