def tag_words(self, words, sents):
    """Train a unigram tagger on the treebank corpus and print a score.

    The tagger is trained on every tagged sentence returned by
    ``treebank.tagged_sents()``, used to tag ``sents[0]``, and then scored
    against that tagged sentence.

    :param words: not used by this method; kept so existing callers still work.
    :param sents: sequence of tokenised sentences; only ``sents[0]`` is tagged.
    :raises IndexError: if ``sents`` is empty.
    """
    train_sents = treebank.tagged_sents()
    tagger = UnigramTagger(train_sents)
    test_sents = tagger.tag(sents[0])
    # evaluate() takes a list of tagged sentences; test_sents is a single
    # tagged sentence, so it has to be wrapped in a list.
    # NOTE(review): the reference tags here are the tagger's own output, so
    # this score cannot reveal tagging errors. Scoring against a held-out,
    # hand-tagged slice was probably intended -- confirm.
    print(tagger.evaluate([test_sents]))
def baseline(tagged_sentences):
    """Print the accuracy of a unigram baseline tagger on a 70/30 split.

    Every token is lower-cased and tokens whose tag is not in the module-level
    ``_UNI`` collection are dropped, so the result is comparable with the other
    methods. The last 30% of the sentences form the test set. The model is a
    ``UnigramTagger`` (``cutoff=5``) backing off to a ``DefaultTagger`` that
    always answers with the most frequent tag of the training set.

    :param tagged_sentences: iterable of sentences, each a sequence of items
        whose first two elements are the word and its tag.
    """
    from nltk.tag import UnigramTagger
    from nltk.tag import DefaultTagger
    from collections import Counter

    # Normalise: lower-case the words, then keep only the universal tags.
    cleaned = []
    for sentence in tagged_sentences:
        lowered = [(token[0].lower(), token[1]) for token in sentence]
        cleaned.append([token for token in lowered if token[1] in _UNI])
    tagged_sentences = cleaned

    n_docs = len(tagged_sentences)
    n_tokens = sum(len(sentence) for sentence in tagged_sentences)
    print('Corpus size: {} docs'.format(n_docs))
    print('Corpus size: {} tokens'.format(n_tokens))

    # Train/test split: the final 30% of the documents are held out.
    held_out_fraction = 0.3
    split_at = n_docs - int(n_docs * held_out_fraction)
    train_set, test_set = tagged_sentences[:split_at], tagged_sentences[split_at:]
    print('Train set: {} docs'.format(len(train_set)))
    print('Test set: {} docs'.format(len(test_set)))
    print('Test set: {} tokens'.format(sum(len(sentence) for sentence in test_set)))

    # Most frequent tag in the training data (the original author expected 'NOUN').
    tag_counts = Counter(token[1] for sentence in train_set for token in sentence)
    most_common = tag_counts.most_common(1)[0][0]
    print('Most common tag: {}'.format(most_common))

    # Build the model and report its accuracy on the held-out documents.
    tagger = UnigramTagger(
        train=train_set,
        backoff=DefaultTagger(most_common),
        cutoff=5,
    )
    print('Baseline: {}'.format(tagger.evaluate(test_set)))
######## UNIGRAM TAGGER ##########
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

# A UnigramTagger is trained by handing it a list of tagged sentences at
# construction time. The first 3000 treebank sentences are the training set.
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)

# Show the first raw sentence and the tags the trained tagger assigns to it.
print(treebank.sents()[0])
print(tagger.tag(treebank.sents()[0]))

# Score the tagger on the sentences that were not used for training.
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))

# A tagger can also be built from an explicit word -> tag model instead of
# training data. tag() takes ONE tokenised sentence, so the index goes on
# sents(), not on the result of tag().
tagger = UnigramTagger(model={'Pierre': 'NN'})
print(tagger.tag(treebank.sents()[0]))
# Accuracy and sample output of the regexp tagger `rt` built earlier in the script.
print(rt.evaluate(test_data))
print(rt.tag(tokens))

# 3. N-GRAM TAGGERS
# An n-gram is a contiguous run of n items taken from text or speech; the items
# may be words, phonemes, letters, characters or syllables. N-grams whose items
# are words are also called shingles.
# Class hierarchy:
#   UnigramTagger -> NGramTagger -> ContextTagger -> SequentialBackoffTagger

# Train one tagger per n-gram order on the pre-tagged (labelled) training data.
ut, bt, tt = (
    UnigramTagger(train=train_data),
    BigramTagger(train_data),
    TrigramTagger(train_data),
)

# Accuracy of each tagger on the held-out data.
for _template, _tagger in (
    ("1-Gram Tagger Accuracy: {}", ut),
    ("2-Gram Tagger Accuracy: {}", bt),
    ("3-Gram Tagger Accuracy: {}", tt),
):
    print(_template.format(_tagger.evaluate(test_data)))

# Tags each tagger assigns to the sample tokens.
for _heading, _tagger in (
    ("\n1-Gram tags:", ut),
    ("\n2-Gram tags:", bt),
    ("\n3-Gram tags:", tt),
):
    print(_heading)
    print(_tagger.tag(tokens))

# Author's observation: the unigram tagger gives the best accuracy here,
# because the bigrams and trigrams seen in training do not always recur in the
# same form in the test data (e.g. word pairs are not always paired the same way).
# Training set training_data = tagged_data_list[:cutoff] # Evaluation set evaulation_data = tagged_data_list[cutoff:development_size] # print "Data is splitted!" # Regular expression tagger nn_cd_tagger = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'PUNC'), (r'.*', 'NOUN_NOM')]) # Unigram tagger unigram_tagger = UnigramTagger(training_data, backoff=nn_cd_tagger) print "Unigram accuracy: " print unigram_tagger.evaluate(evaulation_data) # Bigram tagger bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger) print "Bigram accuracy: " print bigram_tagger.evaluate(evaulation_data) # Trigram tagger trigram_tagger = TrigramTagger(training_data, backoff=bigram_tagger) print "Trigram accuracy: " print trigram_tagger.evaluate(evaulation_data) # Brill tagger templates templates = [ Template(brill.Pos([1, 1])), Template(brill.Pos([2, 2])),
def postag(
        templates=None,
        tagged_data=None,
        num_sents=1000,
        max_rules=300,
        min_score=3,
        min_acc=None,
        train=0.8,
        trace=3,
        randomize=False,
        ruleformat="str",
        incremental_stats=False,
        template_stats=False,
        error_output=None,
        serialize_output=None,
        learning_curve_output=None,
        learning_curve_take=300,
        baseline_backoff_tagger=None,
        separate_baseline_data=False,
        cache_baseline_tagger=None):
    """
    Brill Tagger Demonstration

    Trains a baseline unigram tagger, trains a Brill (transformation-based)
    tagger on top of it, and prints accuracies, rules and statistics as
    requested by the arguments.

    :param templates: the templates to be used in rule learning; when None,
        the pre-built ``brill24()`` template set is used
    :type templates: list of Template

    :param tagged_data: the tagged corpus handed to ``_demo_prepare_data``
        (presumably a list of tagged sentences; None leaves the choice to that
        helper -- confirm against ``_demo_prepare_data``)
    :type tagged_data: list

    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}

    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}

    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}

    :param min_acc: the minimum accuracy for a rule in order for it to be
        considered; forwarded unchanged to the trainer (None by default)
    :type min_acc: C{float}

    :param train: the fraction of the corpus to be used for training (1=all)
    :type train: C{float}

    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}

    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}

    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}

    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}

    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}

    :param error_output: the file where errors will be saved
    :type error_output: C{string}

    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}

    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}

    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}

    :param baseline_backoff_tagger: the backoff tagger of the baseline unigram
        tagger; when None (or otherwise falsy), ``REGEXP_TAGGER`` is used
    :type baseline_backoff_tagger: tagger

    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
                                  deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}

    Note on separate_baseline_data: if False (the default), the training data
    is reused both for the baseline and the rule learner. This is fast and fine
    for a demo, but is likely to generalize worse on unseen data. Also cannot
    be sensibly used for learning curves on training data (the baseline will
    be artificially high).
    """

    # defaults
    baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
    if templates is None:
        from nltk.tag.brill import brill24
        # some pre-built template sets taken from typical systems or publications are
        # available. Print a list with nltk.tag.brill.describe_template_sets()
        # for instance:
        templates = brill24()
    (training_data, baseline_data, gold_data, testing_data) = \
        _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data)

    # creating (or reloading from cache) a baseline tagger (unigram tagger)
    # this is just a mechanism for getting deterministic output from the baseline between
    # python versions
    if cache_baseline_tagger:
        if not os.path.exists(cache_baseline_tagger):
            baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
            # pickle streams are binary: the file must be opened in 'wb'/'rb'
            with open(cache_baseline_tagger, 'wb') as print_rules:
                pickle.dump(baseline_tagger, print_rules)
            print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger))
        # NOTE: pickle.load executes code from the file; only point
        # cache_baseline_tagger at a cache file you created yourself.
        with open(cache_baseline_tagger, "rb") as print_rules:
            baseline_tagger = pickle.load(print_rules)
            print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger))
    else:
        baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
        print("Trained baseline tagger")
    if gold_data:
        print(" Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data)))

    # creating a Brill tagger
    tbrill = time.time()
    trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat)
    print("Training tbl tagger...")
    brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
    print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill))
    if gold_data:
        print(" Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data))

    # printing the learned rules, if learned silently
    if trace == 1:
        print("\nLearned rules: ")
        for (ruleno, rule) in enumerate(brill_tagger.rules(), 1):
            print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat)))

    # printing template statistics (optionally including comparison with the training data)
    # note: if not separate_baseline_data, then baseline accuracy will be artificially high
    if incremental_stats:
        print("Incrementally tagging the test data, collecting individual rule statistics")
        (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data)
        print(" Rule statistics collected")
        if not separate_baseline_data:
            print("WARNING: train_stats asked for separate_baseline_data=True; the baseline "
                  "will be artificially high")
        trainstats = brill_tagger.train_stats()
        if template_stats:
            brill_tagger.print_template_statistics(teststats)
        if learning_curve_output:
            _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take)
            print("Wrote plot of learning curve to {0}".format(learning_curve_output))
    else:
        print("Tagging the test data")
        taggedtest = brill_tagger.tag_sents(testing_data)
        if template_stats:
            brill_tagger.print_template_statistics()

    # writing error analysis to file
    if error_output is not None:
        import io
        # Write text through a UTF-8 text stream; encoding by hand and then
        # appending a str newline mixes bytes and text and fails on Python 3.
        with io.open(error_output, 'w', encoding='utf-8') as f:
            f.write(u'Errors for Brill Tagger %r\n\n' % serialize_output)
            f.write(u'\n'.join(error_list(gold_data, taggedtest)) + u'\n')
        print("Wrote tagger errors including context to {0}".format(error_output))

    # serializing the tagger to a pickle file and reloading (just to see it works)
    if serialize_output is not None:
        taggedtest = brill_tagger.tag_sents(testing_data)
        with open(serialize_output, 'wb') as print_rules:
            pickle.dump(brill_tagger, print_rules)
        print("Wrote pickled tagger to {0}".format(serialize_output))
        with open(serialize_output, "rb") as print_rules:
            brill_tagger_reloaded = pickle.load(print_rules)
        print("Reloaded pickled tagger from {0}".format(serialize_output))
        # The point of the check is the RELOADED tagger; tagging again with the
        # original one would compare a tagger with itself and always succeed.
        taggedtest_reloaded = brill_tagger_reloaded.tag_sents(testing_data)
        if taggedtest == taggedtest_reloaded:
            print("Reloaded tagger tried on test set, results identical")
        else:
            print("PROBLEM: Reloaded tagger gave different results on test set")
# default tagger >>>brown_tagged_sents = brown.tagged_sents(categories='news') >>>default_tagger = nltk.DefaultTagger('NN') >>>print default_tagger.evaluate(brown_tagged_sents) # N-gram taggers >>>from nltk.tag import UnigramTagger >>>from nltk.tag import DefaultTagger >>>from nltk.tag import BigramTagger >>>from nltk.tag import TrigramTagger # we are dividing the data into a test and train to evaluate our taggers. >>>train_data= brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)] >>>test_data= brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):] >>>unigram_tagger = UnigramTagger(train_data,backoff=default_tagger) >>>print unigram_tagger.evaluate(test_data) >>>bigram_tagger= BigramTagger(train_data, backoff=unigram_tagger) >>>print bigram_tagger.evaluate(test_data) >>>trigram_tagger=TrigramTagger(train_data,backoff=bigram_tagger) >>>print trigram_tagger.evaluate(test_data) # Regex tagger >>>from nltk.tag.sequential import RegexpTagger >>>regexp_tagger = RegexpTagger( [( r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers ( r'(The|the|A|a|An|an)$', 'AT'), # articles ( r'.*able$', 'JJ'), # adjectives ( r'.*ness$', 'NN'), # nouns formed from adj ( r'.*ly$', 'RB'), # adverbs ( r'.*s$', 'NNS'), # plural nouns
import nltk
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.corpus import treebank

# Train on the sentences before index 2000 and evaluate on the rest.
# The training slice used to be [:7000], which contains every sentence of the
# [2000:] test slice, so the reported accuracy was measured on training data.
training = treebank.tagged_sents()[:2000]
testing = treebank.tagged_sents()[2000:]

# Words the unigram tagger has never seen fall back to the default tag 'NN'.
tag1 = DefaultTagger('NN')
tag2 = UnigramTagger(training, backoff=tag1)
print(tag2.evaluate(testing))
def train_speech_recognition():
    """Train the module-level ``unigram_tagger`` and hand it to ``cache_model()``.

    The corpus is read from the module-level ``corpus_path`` (UTF-8). Each
    non-blank line is tab-separated with the word in column 0 and its tag in
    column 1; both are lower-cased. A blank line ends a sentence. A sentence
    containing a line with fewer than two columns is discarded as bad. Reading
    stops once ``max_sent`` sentences have been collected.

    A random 10% of the sentences is held out. The model is a ``UnigramTagger``
    backing off through ``AffixTagger`` instances for suffixes of length 4, 3,
    2 and 1 to a ``DefaultTagger``.
    """
    global unigram_tagger

    print('Loading "{0}...'.format(corpus_path))
    corpus = []
    ntoken = 0
    n_bad = 0
    with codecs.open(corpus_path, 'r', 'utf-8') as rdr:
        sent = []
        good = True
        for line0 in rdr:
            line = line0.strip()
            if not line:
                # A blank line closes the current sentence.
                if good:
                    corpus.append(sent)
                    ntoken += len(sent)
                    if len(corpus) >= max_sent:
                        break
                else:
                    n_bad += 1
                good = True
                sent = []
            else:
                tx = line.split('\t')
                if len(tx) < 2:
                    good = False
                else:
                    sent.append((tx[0].lower(), tx[1].lower()))
        else:
            # Reached only when the file was read to the end (no break): keep
            # a last sentence that is not followed by a blank line instead of
            # silently dropping it.
            if sent:
                if good:
                    corpus.append(sent)
                    ntoken += len(sent)
                else:
                    n_bad += 1

    print('done, {0} good sentences, {1} ntoken'.format(len(corpus), ntoken))

    # ----------------------------------------------------------------------
    # Random 90/10 train/test split over sentence indices.
    n_patterns = len(corpus)
    n_test = int(n_patterns * 0.1)
    n_train = n_patterns - n_test
    print('n_test={0} n_train={1}'.format(n_test, n_train))
    data_indeces = list(range(n_patterns))
    np.random.shuffle(data_indeces)
    test_corpus = [corpus[i] for i in data_indeces[:n_test]]
    train_corpus = [corpus[i] for i in data_indeces[n_test:]]

    # ----------------------------------------------------------------------
    # NOTE(review): the default tag is upper-case while the corpus tags are
    # lower-cased above, and its spelling looks like a typo of the Russian
    # word for "noun" -- confirm that this is the intended tag.
    backoff = DefaultTagger(u'СУЩЕСТВРТЕЛЬНОЕ')

    # Build the suffix chain 1 -> 2 -> 3 -> 4: each longer-suffix tagger backs
    # off to the shorter one. The accuracy of every stage is still computed,
    # as before, but is only useful while debugging and is not reported.
    for suffix_len in (1, 2, 3, 4):
        backoff = AffixTagger(train_corpus, affix_length=-suffix_len, backoff=backoff)
        acc = backoff.evaluate(test_corpus)

    # ----------------------------------------------------------------------
    # Final model: whole-word lookup first, then the suffix chain.
    unigram_tagger = UnigramTagger(train_corpus, backoff=backoff)
    acc = unigram_tagger.evaluate(test_corpus)
    cache_model()
from nltk.tag import UnigramTagger, DefaultTagger
from nltk.corpus import treebank
from tag_util import train_sents, test_sents
import pickle

# Train: unigram lookup, with 'NN' for every word the lookup cannot resolve.
default_tagger = DefaultTagger('NN')
tagger = UnigramTagger(train_sents, backoff=default_tagger)

# Test the freshly trained tagger.
print(tagger.evaluate(test_sents))

# Save the tagger to a pickle file ...
with open('unitagger.pkl', 'wb') as output:
    pickle.dump(tagger, output)

# ... and load it back (nltk.data.load('unitagger.pkl') is an alternative).
with open('unitagger.pkl', 'rb') as data_file:
    tagger2 = pickle.load(data_file)

# The reloaded tagger is scored on the same test sentences.
print(tagger2.evaluate(test_sents))
Aquí el "taggeador" coge las N palabras previas para clasificar correctamente la nueva palabra ''' from nltk.tag import UnigramTagger from nltk.tag import DefaultTagger from nltk.tag import BigramTagger from nltk.tag import TrigramTagger # we are dividing the data into a test and train to evaluate our taggers. train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)] test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):] #Unigram selecciona la clasificación + probable #https://www.nltk.org/api/nltk.tag.html?highlight=postagger#nltk.tag.sequential.UnigramTagger unigram_tagger = UnigramTagger(train_data,backoff=default_tagger) print("Unigram Tagger: {}".format(unigram_tagger.evaluate(test_data))) #Bigram se basa en la palabra actual y la anterior para clasificar #https://www.nltk.org/api/nltk.tag.html?highlight=postagger#nltk.tag.sequential.BigramTagger bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger) print("Bigram Tagger: {}".format(bigram_tagger.evaluate(test_data))) #Trigram se basa en la actual, anterior y anterior a la anterior #https://www.nltk.org/api/nltk.tag.html?highlight=postagger#nltk.tag.sequential.TrigramTagger trigram_tagger = TrigramTagger(train_data,backoff=bigram_tagger) print("Trigram Tagger: {}".format(trigram_tagger.evaluate(test_data))) ''' Aquí lo que se ha hecho ha sido crear 3 "taggeadores" N-Gram con un conjunto de datos de entrenamiento del corpus brown, que ya estaba clasificado. Además, se han podido combinar para que cuando un "taggeador" no sepa que hacer pruebe con su "taggeador" N-1 hasta llegar al por defecto de clasificarlo como NN.
# 0.13089484257215028

# N-gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import TrigramTagger
from nltk.tag import BigramTagger
from nltk.tag import DefaultTagger

# Split the data 90/10 into train and test sets to evaluate the taggers.
_split = int(len(brown_tagged_sents) * 0.9)
train_data = brown_tagged_sents[:_split]
test_data = brown_tagged_sents[_split:]

# Each tagger backs off to the next simpler one. The trailing numbers are the
# outputs recorded by the author. (regexp_tagger could serve as the unigram
# backoff instead of default_tagger.)
unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print(unigram_tagger.evaluate(test_data))  # 0.8361407355726104

bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(bigram_tagger.evaluate(test_data))  # 0.8452108043456593

trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print(trigram_tagger.evaluate(test_data))  # 0.843317053722715

# Named-entity recognition (NER tagger)
from nltk import ne_chunk
from nltk import word_tokenize

sent = "Mark is studying at Stanford University in California"
import nltk
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.corpus import treebank

# Train on the sentences before index 2000 and evaluate on the rest.
# The training slice used to be [:7000], which contains every sentence of the
# [2000:] test slice, so the reported accuracy was measured on training data.
training = treebank.tagged_sents()[:2000]
testing = treebank.tagged_sents()[2000:]

# Words the unigram tagger has never seen fall back to the default tag 'NN'.
tag1 = DefaultTagger('NN')
tag2 = UnigramTagger(training, backoff=tag1)
print(tag2.evaluate(testing))
import nltk
import pickle
from os import listdir
from os.path import isfile, join
from nltk.tag import UnigramTagger
from nltk import word_tokenize

# Read the tagged corpus from every regular file in the corpus directory.
# NOTE(review): this also picks up any non-corpus file stored there (index or
# readme files, for example) -- confirm the directory holds only tagged text.
corpusRoot = 'nltk_data/corpora/brown/'
onlyfiles = [f for f in listdir(corpusRoot) if isfile(join(corpusRoot, f))]
corpus = nltk.corpus.reader.tagged.TaggedCorpusReader(corpusRoot, onlyfiles)

# First 3000 sentences for training, the remainder for testing.
trainSents = corpus.tagged_sents()[:3000]
testSents = corpus.tagged_sents()[3000:]

unigramTagger = UnigramTagger(trainSents)
print("Test accuracy = " + str(unigramTagger.evaluate(testSents)))

# Persist the trained tagger; the with-block closes the file even when
# pickle.dump raises.
fileName = 'tagfile'
with open(fileName, 'wb') as fileObj:
    pickle.dump(unigramTagger, fileObj)
## Run function process_tagging()

### BIGRAM AND TRIGRAM TAGGERS #######################################
from nltk.corpus import treebank
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

## Data: 3000 sentences for training, the following (up to) 3000 for testing
reader = treebank
train_sents = reader.tagged_sents()[:3000]
test_sents = reader.tagged_sents()[3000:6000]

## Unigram tagger on its own
tagger = UnigramTagger(train_sents)
print("tagger accuracy:", tagger.evaluate(test_sents))

## Chaining taggers / backoff tagging
## Author's conclusion: chaining decreases accuracy - too much context is not good.
## NOTE(review): the chain below is a UnigramTagger that backs off to a
## BigramTagger; the usual order is the reverse, so the conclusion may not hold.
tagger2 = BigramTagger(train_sents)
tagger3 = UnigramTagger(train_sents, backoff=tagger2)
print("tagger accuracy:", tagger3.evaluate(test_sents))


## Chaining taggers - advanced mode
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    """Build a backoff chain from ``tagger_classes`` and return its last tagger.

    Each class is called as ``cls(train_sents, backoff=previous)``, where
    ``previous`` is the tagger built just before it (``backoff`` for the first
    class). With no classes, ``backoff`` is returned unchanged.
    """
    chain = backoff
    for tagger_cls in tagger_classes:
        chain = tagger_cls(train_sents, backoff=chain)
    return chain
def cltk_pos_cv(full_training_set, local_dir_rel):
    """Cross-validate five POS taggers over a tagged corpus file.

    The file is split into sentences on blank lines, shuffled with a fixed
    seed and cut into folds of ``ceil(sentence_count / 10)`` sentences. For
    every fold, the fold is written to ``test.pos`` and all other folds to
    ``train.pos`` inside ``local_dir_rel`` (NLTK's corpus reader works on
    files); unigram, bigram, trigram, 1-2-3-gram backoff and TnT taggers are
    then trained on the training file and scored on the test file.

    :param full_training_set: path of the tagged corpus file.
    :param local_dir_rel: directory for the temporary ``train.pos`` and
        ``test.pos`` files; ``~`` is expanded and the directory is created if
        missing.
    :return: dict mapping each tagger label to ``{'mean': ..., 'sd': ...}``
        computed over the per-fold accuracies.
    """
    print("full_training_set", full_training_set)

    # Per-fold accuracies, one list per tagger, in reporting order.
    uni_scores, bi_scores, tri_scores, backoff_scores, tnt_scores = [], [], [], [], []
    report_order = [
        ('unigram', uni_scores),
        ('bigram', bi_scores),
        ('trigram', tri_scores),
        ('1, 2, 3-gram backoff', backoff_scores),
        ('tnt', tnt_scores),
    ]

    with open(full_training_set) as f:
        pos_set = f.read().split('\n\n')  # one entry per sentence

    fold_size = math.ceil(len(pos_set) / 10)

    # Fixed seed so that the folds are reproducible.
    random.seed(0)
    random.shuffle(pos_set)

    # Consecutive slices of fold_size sentences (the last one may be shorter).
    folds = [pos_set[start:start + fold_size]
             for start in range(0, len(pos_set), fold_size)]

    local_dir = os.path.expanduser(local_dir_rel)
    if not os.path.isdir(local_dir):
        os.makedirs(local_dir)
    test_path = os.path.join(local_dir, 'test.pos')
    train_path = os.path.join(local_dir, 'train.pos')

    for counter, test_set in enumerate(folds):
        # Everything except the current fold is training material.
        training_set = [
            sentence
            for idx, fold in enumerate(folds) if idx != counter
            for sentence in fold
        ]

        # Save this round's split to disk for the corpus readers.
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # Read the POS corpora back as tagged sentences.
        print("local_dir", local_dir)
        train_sents = TaggedCorpusReader(local_dir, 'train.pos').tagged_sents()
        test_sents = TaggedCorpusReader(local_dir, 'test.pos').tagged_sents()

        print('Loop #' + str(counter))

        # Plain unigram tagger.
        score = UnigramTagger(train_sents).evaluate(test_sents)
        uni_scores.append(score)
        print('Unigram:', score)

        # Plain bigram tagger.
        score = BigramTagger(train_sents).evaluate(test_sents)
        bi_scores.append(score)
        print('Bigram:', score)

        # Plain trigram tagger.
        score = TrigramTagger(train_sents).evaluate(test_sents)
        tri_scores.append(score)
        print('Trigram:', score)

        # Trigram backing off to bigram backing off to unigram.
        chained = TrigramTagger(
            train_sents,
            backoff=BigramTagger(train_sents, backoff=UnigramTagger(train_sents)),
        )
        score = chained.evaluate(test_sents)
        backoff_scores.append(score)
        print('1, 2, 3-gram backoff:', score)

        # TnT tagger, which is trained in a separate step.
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        score = tnt_tagger.evaluate(test_sents)
        tnt_scores.append(score)
        print('TnT:', score)

    # Summarise every tagger by the mean and standard deviation of its scores.
    final_dict = {}
    for label, scores in report_order:
        final_dict[label] = {'mean': mean(scores), 'sd': stdev(scores)}
    return final_dict
(r'.*ould$', 'MD'), (r'.*\'s$', 'NN$'), (r'.*s$', 'NNS'), (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')] rt = RegexpTagger(patterns) # accuracy on test data print(rt.evaluate(test_data)) from nltk.tag import UnigramTagger from nltk.tag import BigramTagger from nltk.tag import TrigramTagger ut = UnigramTagger(train_data) bt = BigramTagger(train_data) tt = TrigramTagger(train_data) #testing perfomence of unigram tagger print(ut.evaluate(test_data)) print(ut.tag(tokens)) #testing perfomence of bigram tagger print(bt.evaluate(test_data)) print(bt.tag(tokens)) #testing perfomence of trigram tagger print(tt.evaluate(test_data)) print(tt.tag(tokens)) def combined_tagger(train_data, taggers, backoff=None): for tagger in taggers: backoff = tagger(train_data, backoff=backoff) return backoff
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# First 3500 tagged sentences for training, the remainder for testing.
training = treebank.tagged_sents()[0:3500]
testing = treebank.tagged_sents()[3500:]
print(len(treebank.tagged_sents()))

#-----------------------------------------------------
# Passing the training data to the constructor trains the tagger.
# cutoff=2 is forwarded to UnigramTagger unchanged.
unigramTagger = UnigramTagger(training, cutoff=2)
print('Unigram tagger accuracy:')
print(unigramTagger.evaluate(testing))

#-----------------------------------------------------
print('Bigram tagger accuracy:')
bigramTagger = BigramTagger(training)
print(bigramTagger.evaluate(testing))

#-----------------------------------------------------
print('Trigram tagger accuracy:')
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

# Train on the first 3000 tagged sentences of the treebank corpus.
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)

# Show the first raw sentence, then the tags the tagger assigns to it.
_first_sentence = treebank.sents()[0]
print(_first_sentence)
print(tagger.tag(treebank.sents()[0]))

# Score the tagger on the sentences that were not used for training.
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
''' all_sentences = brown.sents() len_sent = len(all_sentences) X_train = tagged_sentences[:int(len(tagged_sentences) * 0.8)] X_test = tagged_sentences[int(len(tagged_sentences) * 0.8):] ''' Question 2 - Performance of 0.13, 0.9 and 0.91 ''' # using only the default - NN - 0.1308 default_tagger = nltk.DefaultTagger('NN') print(default_tagger.evaluate(tagged_sentences)) # Unigrams - 0.902 unigram_tagger = UnigramTagger(X_train) print(unigram_tagger.evaluate(X_test)) # Bigrams with backoff of unigrams - 0.911 bigram_tagger = BigramTagger(X_train, backoff=unigram_tagger) print(bigram_tagger.evaluate(X_test)) ''' Question 3 Performace of 0.77 and 0.79 ''' treebank_tagged_sents = nltk.corpus.treebank.tagged_sents(tagset='universal') print(default_tagger.evaluate(treebank_tagged_sents)) print(unigram_tagger.evaluate(treebank_tagged_sents)) # 0.77 print(bigram_tagger.evaluate(treebank_tagged_sents)) # 0.79 ''' Question 4-5 - F1 of 0.972 for brown dataset. Better performance '''
def postag(
        templates=None,
        tagged_data=None,
        num_sents=1000,
        max_rules=300,
        min_score=3,
        min_acc=None,
        train=0.8,
        trace=3,
        randomize=False,
        ruleformat="str",
        incremental_stats=False,
        template_stats=False,
        error_output=None,
        serialize_output=None,
        learning_curve_output=None,
        learning_curve_take=300,
        baseline_backoff_tagger=None,
        separate_baseline_data=False,
        cache_baseline_tagger=None):
    """
    Brill Tagger Demonstration

    Trains a baseline unigram tagger, trains a Brill (transformation-based)
    tagger on top of it, and prints accuracies, rules and statistics as
    requested by the arguments.

    :param templates: the templates to be used in rule learning; when None,
        the pre-built ``brill24()`` template set is used
    :type templates: list of Template

    :param tagged_data: the tagged corpus handed to ``_demo_prepare_data``
        (presumably a list of tagged sentences; None leaves the choice to that
        helper -- confirm against ``_demo_prepare_data``)
    :type tagged_data: list

    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}

    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}

    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}

    :param min_acc: the minimum accuracy for a rule in order for it to be
        considered; forwarded unchanged to the trainer (None by default)
    :type min_acc: C{float}

    :param train: the fraction of the corpus to be used for training (1=all)
    :type train: C{float}

    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}

    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}

    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}

    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}

    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}

    :param error_output: the file where errors will be saved
    :type error_output: C{string}

    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}

    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}

    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}

    :param baseline_backoff_tagger: the backoff tagger of the baseline unigram
        tagger; when None (or otherwise falsy), ``REGEXP_TAGGER`` is used
    :type baseline_backoff_tagger: tagger

    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
                                  deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}

    Note on separate_baseline_data: if False (the default), the training data
    is reused both for the baseline and the rule learner. This is fast and fine
    for a demo, but is likely to generalize worse on unseen data. Also cannot
    be sensibly used for learning curves on training data (the baseline will
    be artificially high).
    """

    # defaults
    baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
    if templates is None:
        from nltk.tag.brill import brill24
        # some pre-built template sets taken from typical systems or publications are
        # available. Print a list with nltk.tag.brill.describe_template_sets()
        # for instance:
        templates = brill24()
    (training_data, baseline_data, gold_data, testing_data) = \
        _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data)

    # creating (or reloading from cache) a baseline tagger (unigram tagger)
    # this is just a mechanism for getting deterministic output from the baseline between
    # python versions
    if cache_baseline_tagger:
        if not os.path.exists(cache_baseline_tagger):
            baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
            # pickle streams are binary: the file must be opened in 'wb'/'rb'
            with open(cache_baseline_tagger, 'wb') as print_rules:
                pickle.dump(baseline_tagger, print_rules)
            print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger))
        # NOTE: pickle.load executes code from the file; only point
        # cache_baseline_tagger at a cache file you created yourself.
        with open(cache_baseline_tagger, "rb") as print_rules:
            baseline_tagger = pickle.load(print_rules)
            print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger))
    else:
        baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
        print("Trained baseline tagger")
    if gold_data:
        print(" Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data)))

    # creating a Brill tagger
    tbrill = time.time()
    trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat)
    print("Training tbl tagger...")
    brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
    print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill))
    if gold_data:
        print(" Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data))

    # printing the learned rules, if learned silently
    if trace == 1:
        print("\nLearned rules: ")
        for (ruleno, rule) in enumerate(brill_tagger.rules(), 1):
            print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat)))

    # printing template statistics (optionally including comparison with the training data)
    # note: if not separate_baseline_data, then baseline accuracy will be artificially high
    if incremental_stats:
        print("Incrementally tagging the test data, collecting individual rule statistics")
        (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data)
        print(" Rule statistics collected")
        if not separate_baseline_data:
            print("WARNING: train_stats asked for separate_baseline_data=True; the baseline "
                  "will be artificially high")
        trainstats = brill_tagger.train_stats()
        if template_stats:
            brill_tagger.print_template_statistics(teststats)
        if learning_curve_output:
            _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take)
            print("Wrote plot of learning curve to {0}".format(learning_curve_output))
    else:
        print("Tagging the test data")
        taggedtest = brill_tagger.batch_tag(testing_data)
        if template_stats:
            brill_tagger.print_template_statistics()

    # writing error analysis to file
    if error_output is not None:
        with open(error_output, 'w') as f:
            f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
            for e in error_list(gold_data, taggedtest):
                f.write(e + '\n')
        print("Wrote tagger errors including context to {0}".format(error_output))

    # serializing the tagger to a pickle file and reloading (just to see it works)
    if serialize_output is not None:
        taggedtest = brill_tagger.batch_tag(testing_data)
        with open(serialize_output, 'wb') as print_rules:
            pickle.dump(brill_tagger, print_rules)
        print("Wrote pickled tagger to {0}".format(serialize_output))
        with open(serialize_output, "rb") as print_rules:
            brill_tagger_reloaded = pickle.load(print_rules)
        print("Reloaded pickled tagger from {0}".format(serialize_output))
        # The point of the check is the RELOADED tagger; tagging again with the
        # original one would compare a tagger with itself and always succeed.
        taggedtest_reloaded = brill_tagger_reloaded.batch_tag(testing_data)
        if taggedtest == taggedtest_reloaded:
            print("Reloaded tagger tried on test set, results identical")
        else:
            print("PROBLEM: Reloaded tagger gave different results on test set")
import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger

# Evaluate a UnigramTagger trained on the treebank sample.
#
# The training and test slices must be disjoint: scoring the tagger on
# sentences it has already seen during training inflates the reported
# accuracy. Everything before _SPLIT is used for training, everything from
# _SPLIT onwards is held out for evaluation.
_SPLIT = 3000
_tagged_sents = treebank.tagged_sents()

training = _tagged_sents[:_SPLIT]
unitagger = UnigramTagger(training)

testing = _tagged_sents[_SPLIT:]
print(unitagger.evaluate(testing))
# --- Part 1: tag the sample sentence with each tagger -----------------------

print("------------Unigram Tagger Trained with cutoff=3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff=3)
print(unigramTagger.tag(sent))

# These taggers are built elsewhere; here they are only applied to `sent`.
for _banner, _tagger in (
    ("------------Bigram Tagger------------", bigramTagger),
    ("------------Trigram Tagger------------", trigramTagger),
    ("------------Brill Tagger------------", brillTagger),
):
    print(_banner)
    print(_tagger.tag(sent))

# --- Part 2: accuracy of each tagger on the held-out brown sentences --------

print("------------Accuracy: Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.evaluate(brown_test_sents))

print("------------Accuracy: Unigram Tagger Trained with cutoff = 3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff=3)
print(unigramTagger.evaluate(brown_test_sents))

for _banner, _tagger in (
    ("------------Accuracy: Bigram Tagger Trained------------", bigramTagger),
    ("------------Accuracy: Trigram Tagger Trained------------", trigramTagger),
):
    print(_banner)
    print(_tagger.evaluate(brown_test_sents))

# `unigramTagger` is left bound to the backoff variant, as in the original
# statement order.
print("------------Accuracy: Unigram Tagger with backoff enabled. Backoff Chain: UnigramTagger -> DefaultTagger------------")
unigramTagger = UnigramTagger(brown_train_sents, backoff=defaultTagger)
print(unigramTagger.evaluate(brown_test_sents))
brown_tagged_sents = brown.tagged_sents(categories='news') #print(brown_tagged_sents) # [[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL')], ...] default_tagger = nltk.DefaultTagger('NN') print(default_tagger.evaluate(brown_tagged_sents)) # 0.13089484257215028 brown_tagged_sents2 = [[('The', 'AT'), ('Fulton', 'NP-TL'), ('manner', 'NN')]] print(default_tagger.evaluate(brown_tagged_sents2)) # 0.3333333333333333 train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)] test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):] unigram_tagger = UnigramTagger(train_data, backoff=default_tagger) print(unigram_tagger.evaluate(test_data)) # 0.835841722316356 bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger) print(bigram_tagger.evaluate(test_data)) # 0.8454101465164956 trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger) print(trigram_tagger.evaluate(test_data)) # 0.8427190272102063 regexp_tagger = RegexpTagger( [( r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers ( r'(The|the|A|a|An|an)$', 'AT'), # articles ( r'.*able$', 'JJ'), # adjectives ( r'.*ness$', 'NN'), # nouns formed from adj
def indivUnigram(bambara, backoff):
    """Train a unigram tagger and print its accuracy.

    The tagger is built from ``bambara.train_sents`` with ``backoff`` passed
    through as its backoff tagger, then scored against
    ``bambara.test_sents``.

    Args:
        bambara: object exposing ``train_sents`` and ``test_sents``.
        backoff: tagger handed to ``UnigramTagger`` as ``backoff``.

    Returns:
        The trained ``UnigramTagger`` instance.
    """
    tagger = UnigramTagger(bambara.train_sents, backoff=backoff)
    accuracy = tagger.evaluate(bambara.test_sents)
    print("Unigram accuracy: ", accuracy)
    return tagger
rt = RegexpTagger(patterns) print rt.evaluate(test_data) print rt.tag(tokens) ## N gram taggers from nltk.tag import UnigramTagger from nltk.tag import BigramTagger from nltk.tag import TrigramTagger ut = UnigramTagger(train_data) bt = BigramTagger(train_data) tt = TrigramTagger(train_data) print ut.evaluate(test_data) print ut.tag(tokens) print bt.evaluate(test_data) print bt.tag(tokens) print tt.evaluate(test_data) print tt.tag(tokens) def combined_tagger(train_data, taggers, backoff=None): for tagger in taggers: backoff = tagger(train_data, backoff=backoff) return backoff ct = combined_tagger(train_data=train_data, taggers=[UnigramTagger, BigramTagger, TrigramTagger],
# Train/test split over the treebank sample: the first 3000 tagged sentences
# are used for training, the remainder for evaluation.
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]
tk_sample = word_tokenize(sample)

# Baseline: a DefaultTagger constructed with the 'NN' tag.
df_tagger = DefaultTagger('NN')
tagged = df_tagger.tag(tk_sample)
accuracy = df_tagger.evaluate(test_sents)
print(f"Tagged text: {tagged}; acc = {accuracy}\n")

# Unigram tagger trained on the training slice, without any backoff.
ug_tagger = UnigramTagger(train_sents)
tagged = ug_tagger.tag(tk_sample)
accuracy = ug_tagger.evaluate(test_sents)
print(f"Tagged text: {tagged}; acc = {accuracy}\n")

# Same unigram tagger, but with the default tagger supplied as its backoff.
ugb_tagger = UnigramTagger(train_sents, backoff=df_tagger)
accuracy = ugb_tagger.evaluate(test_sents)
print(f"Accuracy of backoff: {accuracy}\n")

# Round-trip the backoff tagger through a pickle file, then re-evaluate the
# reloaded copy on the same test sentences.
_PICKLE_PATH = 'pickles/pos-taggers/unigram_backoff_tagger.pickle'

with open(_PICKLE_PATH, 'wb') as pickle_file:
    pickle.dump(ugb_tagger, pickle_file)

with open(_PICKLE_PATH, 'rb') as pickle_file:
    pk_tagger = pickle.load(pickle_file)

accuracy = pk_tagger.evaluate(test_sents)