def load_data(self, percentage):
    print("Started Loading the Data")
    # Get the complete data
    data_set = treebank.fileids()
    # Partition the data into train and test data sets
    training_data_fileIds = [file for file in data_set if "wsj_00" in str(file)]
    testing_data_fileIds = [file for file in data_set if "wsj_01" in str(file)]
    # What percentage of the files should be used for training?
    index = int(percentage * len(training_data_fileIds))
    training_data_fileIds = training_data_fileIds[:index]
    tagged_training_data = treebank.tagged_sents(fileids=training_data_fileIds)
    tagged_testing_data = treebank.tagged_sents(fileids=testing_data_fileIds)
    tagged_training_words = treebank.tagged_words(fileids=training_data_fileIds)
    tagged_testing_words = treebank.tagged_words(fileids=testing_data_fileIds)
    # Untag the data for other uses
    untagged_training_data = [untag(item) for item in tagged_training_data]
    untagged_testing_data = [untag(item) for item in tagged_testing_data]
    print("Data Loaded Successfully. Stats are")
    print("Training Data Sentences: ", len(tagged_training_data))
    print("Testing Data Sentences: ", len(tagged_testing_data))
    return (tagged_training_data, tagged_testing_data, tagged_training_words,
            tagged_testing_words, untagged_training_data, untagged_testing_data)
def _untag_sequence(tagged):
    try:
        if isinstance(tagged[0][0], str):
            # A single tagged sentence: a list of (word, tag) tuples
            return tuple(untag(tagged))
        else:
            # A list of tagged sentences
            return [tuple(untag(t)) for t in tagged]
    except IndexError:
        return []
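# A minimal sketch of how _untag_sequence behaves on both input shapes;
# the sentences here are made-up examples, not from the original code.
from nltk.tag import untag

sent = [('Hello', 'NN'), ('World', 'NN')]
print(_untag_sequence(sent))          # ('Hello', 'World')
print(_untag_sequence([sent, sent]))  # [('Hello', 'World'), ('Hello', 'World')]
print(_untag_sequence([]))            # []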
def trainUniTnT(self):
    """train unigram and tnt separately without DefaultTagger"""
    self.split_into_folds()
    for k in range(1, (self.folds + 1)):
        train_sents = sum(self.foldlist[: (self.folds - 1)], [])
        tnt_tagger = tnt.TnT(N=100)
        tnt_tagger.train(train_sents)
        print(str(k) + " fold: tnt evaluated")
        unigram = UnigramTagger(train_sents)
        print(str(k) + " fold: unigram evaluated")
        to_tag = [untag(i) for i in self.foldlist[self.folds - 1]]
        self.tnt_tagged += tnt_tagger.tag_sents(to_tag)
        self.uni_tagged += unigram.tag_sents(to_tag)
        self.org_tagged += self.foldlist[self.folds - 1]
        self.foldlist = [self.foldlist[self.folds - 1]] + self.foldlist[: (self.folds - 1)]
    self.tnt = tnt_tagger
    self.unigram = unigram
    self.tnt_avg_acc = accuracy(sum(self.org_tagged, []), sum(self.tnt_tagged, []))
    self.uni_avg_acc = accuracy(sum(self.org_tagged, []), sum(self.uni_tagged, []))
    print("Accuracy of concatenated tnt-tagged sentences: ", self.tnt_avg_acc)
    print("Accuracy of concatenated unigram-tagged sentences: ", self.uni_avg_acc)
    (self.tnt_tagprecision, self.tnt_tagrecall) = self.tagprecision_recall(
        tnt_tagger, self.tnt_tagged, self.org_tagged
    )
    (self.unigram_tagprecision, self.unigram_tagrecall) = self.tagprecision_recall(
        unigram, self.uni_tagged, self.org_tagged
    )
    # reset the following values so that trainRegexp starts from the initial state
    self.org_tagged = []
    self.foldlist = []
    for i in range(1, self.folds + 1):
        self.foldlist.append(self.create_fold(i))
def update_category_by_pos():
    from nltk.corpus import brown
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.tag import untag
    from nltk import DecisionTreeClassifier

    def pos_features(sentence, i):
        features = {'suffix(1)': sentence[i][-1:],
                    'suffix(2)': sentence[i][-2:],
                    'suffix(3)': sentence[i][-3:]}
        features['prev-word'] = '<start>' if i == 0 else sentence[i - 1]
        return features

    print(pos_features(brown.sents()[0], 8))

    tagged_sents = brown.tagged_sents(categories='news')
    featuresets = []
    for tagged_sent in tagged_sents:
        untagged_sent = untag(tagged_sent)
        for i, (word, tag) in enumerate(tagged_sent):
            featuresets.append((pos_features(untagged_sent, i), tag))

    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    # classifier = NaiveBayesClassifier.train(train_set)
    classifier = DecisionTreeClassifier.train(train_set)
    print('DecisionTree accuracy: %f' % classify.accuracy(classifier, test_set))
def entropy_of_words(self, tagged_data):
    """
    Takes tagged words as input and returns the tag entropy of each word.
    """
    tagged_words_fdist = Counter(tagged_data)
    total_no_of_tagged_words = sum(tagged_words_fdist.values())
    untagged_data = untag(tagged_data)
    untagged_words_fdist = Counter(untagged_data)
    total_no_of_words = sum(untagged_words_fdist.values())
    # Map each word to the set of (word, tag) pairs it occurs with:
    # {word: {(word, tag1), ..., (word, tagN)}, ...}
    word_tags = dict()
    for tagged_word in tagged_words_fdist.keys():
        if tagged_word[0] in word_tags:
            word_tags[tagged_word[0]].add(tagged_word)
        else:
            word_tags[tagged_word[0]] = {tagged_word}
    # Compute the entropy of each word
    entropies = dict()
    for word in untagged_words_fdist.keys():
        entropies[word] = 0
        tagged_words = word_tags[word]
        for tagged_word in tagged_words:
            yi = tagged_words_fdist[tagged_word] / untagged_words_fdist[word]
            entropies[word] += -(yi * log2(yi))
    entropies = sorted(entropies.items(), key=operator.itemgetter(1), reverse=True)
    return entropies, word_tags, tagged_words_fdist
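# A minimal worked example of the entropy computation above, on made-up data:
# "permit" occurs 3 times as NN and once as VB, so
# H = -(0.75*log2(0.75) + 0.25*log2(0.25)) ~= 0.811 bits.
from collections import Counter
from math import log2

tagged = [('permit', 'NN')] * 3 + [('permit', 'VB')]
fdist = Counter(tagged)
total = sum(fdist.values())
entropy = -sum((c / total) * log2(c / total) for c in fdist.values())
print(round(entropy, 3))  # 0.811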
def trainRegexp(self, backoff):
    self.split_into_folds()
    for k in range(1, (self.folds + 1)):
        train_sents = sum(self.foldlist[: (self.folds - 1)], [])
        if self.option_tone == "tonal" and self.option_tag == "Affixes":
            regex = RegexpTonalSA(backoff)
        if self.option_tone == "tonal" and self.option_tag == "POS":
            regex = RegexpTonal(backoff)
        if self.option_tone == "nontonal" and self.option_tag == "Affixes":
            regex = RegexpSA(backoff)
        if self.option_tone == "nontonal" and self.option_tag == "POS":
            regex = Regexp(backoff)
        to_tag = [untag(i) for i in self.foldlist[self.folds - 1]]
        self.regex_tagged += regex.tag_sents(to_tag)
        self.org_tagged += self.foldlist[self.folds - 1]
        self.foldlist = [self.foldlist[self.folds - 1]] + self.foldlist[: (self.folds - 1)]
    self.regex = regex
    self.regex_avg_acc = accuracy(sum(self.org_tagged, []), sum(self.regex_tagged, []))
    print("Accuracy of concatenated regexp-tagged sentences: ", self.regex_avg_acc)
    (self.regex_tagprecision, self.regex_tagrecall) = self.tagprecision_recall(
        regex, self.regex_tagged, self.org_tagged
    )
    self.org_tagged = []
    self.foldlist = []
    for i in range(1, self.folds + 1):
        self.foldlist.append(self.create_fold(i))
def demo(self, test_sents):
    tagger = CRFTagger(feature_func=self.feature_detector)
    tagger.set_model_file(self.modelpath)
    for sent in test_sents:
        tagged = tagger.tag(untag(sent))
        for s in self._to_sentence(tagged):
            print(s)
    print(tagger.evaluate(test_sents))
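# A minimal sketch of training the CRFTagger used above (requires the
# python-crfsuite package); the treebank slice sizes are arbitrary choices.
from nltk.corpus import treebank
from nltk.tag import CRFTagger, untag

train_sents = treebank.tagged_sents()[:200]
test_sents = treebank.tagged_sents()[200:220]
ct = CRFTagger()
ct.train(train_sents, 'model.crf.tagger')  # writes the model file to disk
print(ct.tag(untag(test_sents[0])))
print(ct.evaluate(test_sents))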
def calc_switched_words(bambara, tagger, tagpairs):
    '''Iterates over the switched tag-pairs to find all the words
    which are responsible for these switches.'''
    untag_testsents = [untag(i) for i in bambara.test_sents]
    tagger_tagged_sents = tagger.tag_sents(untag_testsents)
    compareTags = list(zip(sum(tagger_tagged_sents, []), sum(bambara.test_sents, [])))
    word_tag_list = sum(bambara.reader.tagged_sents, [])
    switch_list = [(i[0], i[1]) for i in tagpairs]
    for i in switch_list:
        calc_one_switched_word(i[0], i[1], compareTags, word_tag_list)
def test_MLT(self, model, tagged_testing_data):
    """
    Takes the model and the tagged testing data SENTENCES, tags the
    words using the most-likely-tag tagger, and scores the result.
    :param model: dictionary mapping each word to its most likely tag
    :param tagged_testing_data: tagged testing sentences
    :return: accuracy
    """
    untagged_testing_data = [untag(item) for item in tagged_testing_data]
    # Score each token's most-likely tag from the model against the gold tag
    correct, total = 0, 0
    for tagged_sent, sent in zip(tagged_testing_data, untagged_testing_data):
        for (word, gold_tag), token in zip(tagged_sent, sent):
            if model.get(token) == gold_tag:
                correct += 1
            total += 1
    return correct / total if total else 0.0
def train(cls, train_sents, feature_extractor, classifier_cls, **kwargs):
    train_set = []
    for tagged_sent in train_sents:
        untagged_sent = untag(tagged_sent)
        history = []
        for i, (word, tag) in enumerate(tagged_sent):
            featureset = feature_extractor(untagged_sent, i, history)
            train_set.append((featureset, tag))
            history.append(tag)
    classifier = classifier_cls.train(train_set, **kwargs)
    return cls(feature_extractor, classifier)
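# A minimal sketch of calling the classmethod above; the feature extractor
# is a made-up example, and SomeTagger is a hypothetical stand-in for the
# class the method lives on.
from nltk import NaiveBayesClassifier
from nltk.corpus import treebank

def simple_features(tokens, index, history):
    return {'word': tokens[index].lower(),
            'suffix(2)': tokens[index][-2:],
            'prev-tag': history[-1] if history else '<start>'}

# tagger = SomeTagger.train(treebank.tagged_sents()[:100],
#                           simple_features, NaiveBayesClassifier)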
def __init__(self):
    boundary = int(len(brown.tagged_sents()) * 0.8)
    # Note: newer NLTK versions replace simplify_tags=True with tagset='universal'
    train_naive = brown.tagged_sents(simplify_tags=True)[:boundary]
    temp_train_data = []
    for sentence in train_naive:
        untagged_sent = untag(sentence)
        history = []
        for i, (word, tag) in enumerate(sentence):
            temp_train_data.append((self.featextract(untagged_sent, i, history), tag))
            history.append(tag)
    self.bayes = naivebayes.NaiveBayesClassifier.train(temp_train_data)
def evaluate(self, gold):
    """
    Score the accuracy of the tagger against the gold standard.
    Strip the tags from the gold standard text, retag it using
    the tagger, then compute the accuracy score.

    :type gold: list(list(tuple(str, str)))
    :param gold: The list of tagged sentences to score the tagger on.
    :rtype: float
    """
    from nltk.tag import untag

    tagged_sents = self.tag_sents(untag(sent) for sent in gold)
    gold_tokens = sum(gold, [])
    test_tokens = sum(tagged_sents, [])
    return self.accuracy(gold_tokens, test_tokens)
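# A short usage sketch of the evaluate() pattern above, using stock NLTK
# taggers; the 90/10 treebank split is an arbitrary choice.
from nltk.corpus import treebank
from nltk.tag import UnigramTagger

sents = treebank.tagged_sents()
cut = int(len(sents) * 0.9)
tagger = UnigramTagger(sents[:cut])
print(tagger.evaluate(sents[cut:]))  # accuracy on the held-out 10%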
def demo():
    myTagger = Tagger()
    from nltk.tag import untag
    # Load the brown corpus.
    from nltk.corpus import brown
    # brown_train = brown.tagged_sents()[100:]
    brown_test = brown.tagged_sents()[:100]
    test_sent = untag(brown_test[1])
    score = myTagger.evaluate(brown_test)
    print("Score: ", score)
    result = myTagger.tag(test_sent)
    print(result)
    print(myTagger.tag(['drink', 'some', 'water']))
def read_and_write():
    flist = []
    path = "C:\\Users\\gsree\\OneDrive\\Desktop\\Book2.xlsx"
    wb_obj = openpyxl.load_workbook(path)
    sheet_obj = wb_obj.active
    m_row = sheet_obj.max_row
    for i in range(2, m_row + 1):
        cell_obj = sheet_obj.cell(row=i, column=1)
        result = summarization(cell_obj.value)
        ci = sheet_obj.cell(row=i, column=8)
        ci.value = result
        tok = word_tokenize(result)
        tagged = pos_tag(tok)
        nn_vb_tagged = [(word, tag) for word, tag in tagged
                        if tag in ('NN', 'JJ', 'NNS', 'VBN', 'RB', 'VBG', 'VB')]
        mylist = untag(nn_vb_tagged)
        cj = sheet_obj.cell(row=i, column=9)
        words = ""
        for w in mylist:
            words = words + w + ","
        cj.value = words
        mylist = list(dict.fromkeys(mylist))  # de-duplicate, preserving order
        ck = sheet_obj.cell(row=i, column=10)
        string_story = create_user_story(mylist)
        st = ""
        for part in string_story:
            st = st + part
        ck.value = st
        flist.append(string_story)
    wb_obj.save("C:\\Users\\gsree\\OneDrive\\Desktop\\Book2.xlsx")
    generate_html(final_user_story(flist))
def load_data(self, data_set):
    """
    Loads the given data set, makes it case insensitive, and removes
    words that appear fewer than 5 times.
    :return: updated data set
    """
    print("Started Loading the Data")
    tagged_tokens = data_set.tagged_words()
    tokens = untag(tagged_tokens)
    # Get the list of words that appear fewer than 5 times in the corpus
    print("Get LT5's")
    tokens = [token.lower() for token in tokens]  # Convert to lower case
    freq_dist = FreqDist(tokens)  # Compute the frequency distribution
    tokens_lt_5 = [word for word, count in freq_dist.items() if count < 5]
    # Delete words occurring fewer than 5 times and lower-case the corpus
    print("Making data case insensitive")
    token_range = range(len(tagged_tokens))
    indexed_tokens = OrderedDict(zip(token_range, tagged_tokens))
    updated_tagged_tokens = OrderedDict()
    tokens_lt_5_set = set(tokens_lt_5)  # set membership; also avoids O(n) lookups
    # Iterate over a copy: deleting from the dict while iterating it would fail
    for tagged_token_id, tagged_token in list(indexed_tokens.items()):
        if tagged_token[0].lower() in tokens_lt_5_set:
            del indexed_tokens[tagged_token_id]
        else:
            updated_tagged_tokens[tagged_token_id] = (tagged_token[0].lower(), tagged_token[1])
    tagged_tokens = list(updated_tagged_tokens.values())
    # Pickle the data for future use
    print("Pickling the Updated Corpus")
    if data_set == brown:
        file_name = "q5_brown_updated.pkl"
    else:
        file_name = "q5_treebank_updated.pkl"
    pkl.dump((tagged_tokens, tokens_lt_5), open(file_name, 'wb'))
    return tagged_tokens, tokens_lt_5
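# A small sketch of reading the pickle written above back in; the file name
# matches the treebank branch of load_data.
import pickle as pkl

with open("q5_treebank_updated.pkl", "rb") as f:
    tagged_tokens, tokens_lt_5 = pkl.load(f)
print(len(tagged_tokens), len(tokens_lt_5))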
def get_statistics_per_tag(corpus_test, tagger):
    from nltk.tag import untag
    possible_tags = get_all_tags(corpus_test)
    untagged_test = [untag(x) for x in corpus_test]
    tagged_sents = tagger.tag_sents(untagged_test)
    ref_words = sum(corpus_test, [])
    test_words = sum(tagged_sents, [])
    best_fmeasure = 0
    best_tag = "Chtulhu"   # sentinel value
    worst_tag_fmeasure = 100
    worst_tag = "cthulhu"  # sentinel value
    for tag in possible_tags:
        f_measure = evaluate_tag(ref_words, test_words, tag)
        if f_measure > best_fmeasure:
            best_tag = tag
            best_fmeasure = f_measure
        if f_measure < worst_tag_fmeasure:
            worst_tag = tag
            worst_tag_fmeasure = f_measure
    print("best tag: ", best_tag)
    print("worst tag: ", worst_tag)
def calculate_contingenz_with_sets(self, tagger):
    """
    Compares the original tags with the tags created by the tagger.
    """
    tagger_tagged = tagger.tag_sents([untag(i) for i in self.test_sents])
    tagger_words = sum(tagger_tagged, [])
    original_tagged = self.test_sents
    original_words = sum(original_tagged, [])
    tagged_org_zip = zip([i[1] for i in original_words], [i[1] for i in tagger_words])
    contingenzliste = []
    orig_tags = []
    tag_tags = []
    for i in tagged_org_zip:
        if i[0] != i[1]:
            if i[1] is None:
                i = (i[0], "None")
            contingenzliste.append(i[1] + " : " + i[0] + "\n")
            orig_tags.append(i[0])
            tag_tags.append(i[1])
    self.contingenzliste = self.contingenzliste + contingenzliste
    self.reference_tags = self.reference_tags + orig_tags
    self.test_tags = self.test_tags + tag_tags
print "unique words that are None: ", len(self.distinct_nones) print "precentage of none tokesns to overall tokes: ", float(self.overall_nones) / float( self.overall_tokens_tagged) print "precentage of unique nones to overall unique ", float(len(self.distinct_nones)) / float( self.get_overall_distinct()) if __name__ == "__main__": # split the brown corpus to test, dev, and test set all_words = corpus.brown.tagged_sents(tagset='universal') ds_length = len(all_words) train = all_words[int(0.2 * ds_length):] dev = all_words[:int(0.1 * ds_length)] test = all_words[int(0.1 * ds_length):int(0.2 * ds_length)] untagged_dev = [untag(item) for item in dev] words_in_dev = 0 for item in untagged_dev: words_in_dev += len(item) print "overall words in dev : ", words_in_dev u1 = SimpleUnigramTagger(train) tagged_dev = u1.tag_sents(untagged_dev) print len(tagged_dev) none_count = 0 for sent in tagged_dev: for tagged_word in sent: if tagged_word[1] is None: none_count += 1 print "Number of Nones in dev is: ", none_count print "number of options per (token, word) is:"
# Every tagger has a tag() method.
# DefaultTagger is a subclass of SequentialBackoffTagger, which has a choose_tag() method.
from nltk.tag import DefaultTagger
from nltk.corpus import treebank

tagger = DefaultTagger('NN')
print(tagger.tag(['Hello', 'World']))

# Though it's too simple, we can still try to evaluate it
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))

# For multiple sentences
print(tagger.tag_sents([['Hello', 'World', '.'], ['How', 'are', 'you', '?']]))

# Untagging
from nltk.tag import untag
print(untag([('Hello', 'NN'), ('World', 'NN')]))
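# A short sketch of the usual next step: using DefaultTagger as the backoff
# for a trained tagger, so words unseen in training still receive a tag.
from nltk.tag import DefaultTagger, UnigramTagger
from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
backoff = DefaultTagger('NN')
tagger = UnigramTagger(train_sents, backoff=backoff)
print(tagger.evaluate(treebank.tagged_sents()[3000:]))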
def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
    """
    Trains the Brill tagger on the corpus *train_sents*,
    producing at most *max_rules* transformations, each of which
    reduces the net number of errors in the corpus by at least
    *min_score*, and each of which has accuracy not lower than
    *min_acc*.

    #imports
    >>> from nltk.tbl.template import Template
    >>> from nltk.tag.brill import Pos, Word
    >>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer

    #some data
    >>> from nltk.corpus import treebank
    >>> training_data = treebank.tagged_sents()[:100]
    >>> baseline_data = treebank.tagged_sents()[100:200]
    >>> gold_data = treebank.tagged_sents()[200:300]
    >>> testing_data = [untag(s) for s in gold_data]

    >>> backoff = RegexpTagger([
    ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
    ... (r'(The|the|A|a|An|an)$', 'AT'),  # articles
    ... (r'.*able$', 'JJ'),               # adjectives
    ... (r'.*ness$', 'NN'),               # nouns formed from adjectives
    ... (r'.*ly$', 'RB'),                 # adverbs
    ... (r'.*s$', 'NNS'),                 # plural nouns
    ... (r'.*ing$', 'VBG'),               # gerunds
    ... (r'.*ed$', 'VBD'),                # past tense verbs
    ... (r'.*', 'NN')                     # nouns (default)
    ... ])

    >>> baseline = backoff #see NOTE1
    >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
    0.2450142...

    #templates
    >>> Template._cleartemplates() #clear any templates created in earlier tests
    >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

    #construct a BrillTaggerTrainer
    >>> tt = BrillTaggerTrainer(baseline, templates, trace=3)
    >>> tagger1 = tt.train(training_data, max_rules=10)
    TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
    Finding initial useful rules...
        Found 845 useful rules.
    <BLANKLINE>
               B      |
       S   F   r   O  |        Score = Fixed - Broken
       c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
       o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
       r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
       e   d   n   r  |  e
    ------------------+-------------------------------------------------------
     132 132   0   0  | AT->DT if Pos:NN@[-1]
      85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
      69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
      51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
      47  63  16 161  | NN->IN if Pos:NNS@[-1]
      33  33   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
      26  26   0   0  | IN->. if Pos:NNS@[-1] & Word:.@[0]
      24  24   0   0  | IN->, if Pos:NNS@[-1] & Word:,@[0]
      22  27   5  24  | NN->-NONE- if Pos:VBD@[-1]
      17  17   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

    >>> tagger1.rules()[1:3]
    (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]))

    >>> train_stats = tagger1.train_stats()
    >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
    [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]

    >>> tagger1.print_template_statistics(printunused=False)
    TEMPLATE STATISTICS (TRAIN)  2 templates, 10 rules)
    TRAIN (   2417 tokens) initial  1775 0.2656 final:  1269 0.4750
    #ID | Score (train) |  #Rules     | Template
    --------------------------------------------
    001 |   305   0.603 |   7   0.700 | Template(Pos([-1]),Word([0]))
    000 |   201   0.397 |   3   0.300 | Template(Pos([-1]))
    <BLANKLINE>
    <BLANKLINE>

    >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
    0.43996...

    >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)
    >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'),
    ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'),
    ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')]
    True

    >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
    [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]

    # a high-accuracy tagger
    >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
    TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
    Finding initial useful rules...
        Found 845 useful rules.
    <BLANKLINE>
               B      |
       S   F   r   O  |        Score = Fixed - Broken
       c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
       o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
       r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
       e   d   n   r  |  e
    ------------------+-------------------------------------------------------
     132 132   0   0  | AT->DT if Pos:NN@[-1]
      85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
      69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
      51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
      36  36   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
      26  26   0   0  | NN->. if Pos:NNS@[-1] & Word:.@[0]
      24  24   0   0  | NN->, if Pos:NNS@[-1] & Word:,@[0]
      19  19   0   6  | NN->VB if Pos:TO@[-1]
      18  18   0   0  | CD->-NONE- if Pos:NN@[-1] & Word:0@[0]
      18  18   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

    >>> tagger2.evaluate(gold_data) # doctest: +ELLIPSIS
    0.44159544...
    >>> tagger2.rules()[2:4]
    (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))

    # NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger,
    # with a RegexpTagger only as backoff. For instance,
    # >>> baseline = UnigramTagger(baseline_data, backoff=backoff)
    # However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results
    # between python versions. The simplistic backoff above is a workaround to make doctests
    # get consistent input.

    :param train_sents: training data
    :type train_sents: list(list(tuple))
    :param max_rules: output at most max_rules rules
    :type max_rules: int
    :param min_score: stop training when no rules better than min_score can be found
    :type min_score: int
    :param min_acc: discard any rule with lower accuracy than min_acc
    :type min_acc: float or None
    :return: the learned tagger
    :rtype: BrillTagger
    """
    # FIXME: several tests are a bit too dependent on tracing format
    # FIXME: tests in trainer.fast and trainer.brillorig are exact duplicates

    # Basic idea: Keep track of the rules that apply at each position.
    # And keep track of the positions to which each rule applies.

    # Create a new copy of the training corpus, and run the
    # initial tagger on it. We will progressively update this
    # test corpus to look more like the training corpus.
    test_sents = [list(self._initial_tagger.tag(untag(sent)))
                  for sent in train_sents]

    # Collect some statistics on the training process
    trainstats = {}
    trainstats['min_acc'] = min_acc
    trainstats['min_score'] = min_score
    trainstats['tokencount'] = sum(len(t) for t in test_sents)
    trainstats['sequencecount'] = len(test_sents)
    trainstats['templatecount'] = len(self._templates)
    trainstats['rulescores'] = []
    trainstats['initialerrors'] = sum(
        tag[1] != truth[1]
        for paired in zip(test_sents, train_sents)
        for (tag, truth) in zip(*paired)
    )
    trainstats['initialacc'] = 1 - trainstats['initialerrors'] / trainstats['tokencount']
    if self._trace > 0:
        print("TBL train (fast) (seqs: {sequencecount}; tokens: {tokencount}; "
              "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format(**trainstats))

    # Initialize our mappings. This will find any errors made
    # by the initial tagger, and use those to generate repair
    # rules, which are added to the rule mappings.
    if self._trace:
        print("Finding initial useful rules...")
    self._init_mappings(test_sents, train_sents)
    if self._trace:
        print("    Found %d useful rules." % len(self._rule_scores))

    # Let the user know what we're up to.
    if self._trace > 2:
        self._trace_header()
    elif self._trace == 1:
        print("Selecting rules...")

    # Repeatedly select the best rule, and add it to `rules`.
    rules = []
    try:
        while len(rules) < max_rules:
            # Find the best rule, and add it to our rule list.
            rule = self._best_rule(train_sents, test_sents, min_score, min_acc)
            if rule:
                rules.append(rule)
                score = self._rule_scores[rule]
                trainstats['rulescores'].append(score)
            else:
                break  # No more good rules left!

            # Report the rule that we found.
            if self._trace > 1:
                self._trace_rule(rule)

            # Apply the new rule at the relevant sites
            self._apply_rule(rule, test_sents)

            # Update _tag_positions[rule.original_tag] and
            # _tag_positions[rule.replacement_tag] for the affected
            # positions (i.e., self._positions_by_rule[rule]).
            self._update_tag_positions(rule)

            # Update rules that were affected by the change.
            self._update_rules(rule, train_sents, test_sents)

    # The user can cancel training manually:
    except KeyboardInterrupt:
        print("Training stopped manually -- %d rules found" % len(rules))

    # Discard our tag position mapping & rule mappings.
    self._clean()
    trainstats['finalerrors'] = trainstats['initialerrors'] - sum(trainstats['rulescores'])
    trainstats['finalacc'] = 1 - trainstats['finalerrors'] / trainstats['tokencount']
    # Create and return a tagger from the rules we found.
    return BrillTagger(self._initial_tagger, rules, trainstats)
from nltk.tag import pos_tag, untag
from nltk.tokenize import word_tokenize

sentence = 'Emma refused to permit us to obtain the refuse permit"'

# Split the sentence into words and tag each with its part of speech
tagged_list = pos_tag(word_tokenize(sentence))
print(tagged_list)
# [('Emma', 'NNP'), ('refused', 'VBD'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN'), ("''", "''")]

# Keep only the nouns, i.e. items tagged 'NN'
noun_list = [t[0] for t in tagged_list if t[1] == 'NN']
print(noun_list)

# Remove the POS tags
print(untag(tagged_list))

# Join each (word, tag) pair into a single 'word/tag' string
def tokenizer(doc):
    return ['/'.join(p) for p in doc]

print(tokenizer(tagged_list))
# ['Emma/NNP', 'refused/VBD', 'to/TO', 'permit/VB', 'us/PRP', 'to/TO', 'obtain/VB', 'the/DT', 'refuse/NN', 'permit/NN', "''/''"]
def FeatureExtractor(tree):
    rc = []
    for subtree in tree.subtrees():
        if subtree.label() == 'NP':
            rc.append(str(untag(subtree)))
    return rc
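# A hedged sketch of the kind of tree FeatureExtractor expects: a chunk tree
# whose NP subtrees hold (word, tag) leaves; the grammar here is illustrative.
from nltk import RegexpParser, pos_tag, word_tokenize
from nltk.tag import untag

grammar = 'NP: {<DT>?<JJ>*<NN.*>+}'
tree = RegexpParser(grammar).parse(pos_tag(word_tokenize("the quick brown fox")))
print(FeatureExtractor(tree))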
print("wrong input") last = input("Unigram or Bigram+Affix+Dictionary+Regexp+Default? UNIGRAM/BIGRAM-> ") print("\nCalculating Brill & Wu Complementarity\n\nAfter the programme finished, please look for \ 'Results\CompareBrillWu_"+last+tone+tag+".txt' and 'Results\Disagreement_BrillWuHtml_"+last+tone+tag+".txt' in your current working directory.\n") bambara = create_reader(tone, tag) if last == "BIGRAM": lasttagger = backoff_tagger([5,6,8,1],bambara,option_tones=tone, option_tag=tag) lasttaggertagged = lasttagger.tag_sents([untag(i) for i in bambara.test_sents]) lasttagger_acc = lasttagger.evaluate(bambara.test_sents) lasttagger_err = round((1-(lasttagger_acc)),4) if last == "UNIGRAM": lasttagger = indivUnigram(bambara, backoff) lasttaggertagged = lasttagger.tag_sents([untag(i) for i in bambara.test_sents]) lasttagger_acc = lasttagger.evaluate(bambara.test_sents) lasttagger_err = round((1-(lasttagger_acc)),4) hmm = indivHMM(bambara) hmmtagged = hmm.tag_sents([untag(i) for i in bambara.test_sents]) hmm_acc = hmm.evaluate(bambara.test_sents) hmm_err = round((1 - (hmm_acc)),4) crf = indivCRF(bambara, tone, tag) crftagged = crf.tag_sents([untag(i) for i in bambara.test_sents])
def retag_trees(trees, sents):
    # `tagger` is assumed to be defined at module level
    tagged_sents = tagger.tag_sents([untag(sent) for sent in sents])
    for tree, sentence in zip(trees, tagged_sents):
        for (n, word) in zip(tree.treepositions('leaves'), sentence):
            tree[n] = word
def NounExtractor(tree):
    rc1 = []
    for subtree in tree.subtrees():
        if subtree.label() == 'Noun':
            rc1.append(str(untag(subtree)))
    return rc1
import nltk
nltk.download('brown')
from nltk.corpus import brown

brown_news_tagged = brown.tagged_sents(categories='news', tagset='universal')
brown_news_words = brown.tagged_words(categories='news', tagset='universal')

brown_train = brown_news_tagged[100:]
brown_test = brown_news_tagged[:100]

from nltk.tag import untag
test_sent = untag(brown_test[0])
print("Tagged: ", brown_test[0])
print()
print("Untagged: ", test_sent)

# A default tagger assigns the same tag to all words
from nltk import DefaultTagger
default_tagger = DefaultTagger('NOUN')
default_tagger.tag('This is a test'.split())
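# A natural follow-up sketch: score the default tagger on the held-out
# sentences; with the universal tagset, 'NOUN' is the most frequent tag.
print(default_tagger.evaluate(brown_test))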
def apply_tagger(tagger, corpus):
    return [tagger.tag(untag(sent)) for sent in corpus]
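# A minimal usage sketch for apply_tagger; the tagger and the corpus slice
# are arbitrary choices for illustration.
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, untag

retagged = apply_tagger(DefaultTagger('NN'), treebank.tagged_sents()[:5])
print(retagged[0][:3])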
[lm.lemmatize(w) for w in words]
lm.lemmatize("dying", pos="v")

nltk.help.upenn_tagset('VB')

from nltk.tag import pos_tag
sentence = "Emma refused to permit us to obtain the refuse permit"
tagged_list = pos_tag(word_tokenize(sentence))
tagged_list

nouns_list = [t[0] for t in tagged_list if t[1] == "NN"]
nouns_list

from nltk.tag import untag
untag(tagged_list)

def tokenizer(doc):
    return ["/".join(p) for p in doc]

tokenizer(tagged_list)

from nltk import Text
text = Text(retokenize.tokenize(emma_raw), name="Emma")
text.plot(20)
plt.show()

text.dispersion_plot(["Emma", "Knightley", "Frank", "Jane", "Harriet", "Robert"])
tag = input("POS/Affixes? -> ") backoff = input("DefaultTagger as backoff? J/N-> ") if backoff =="J": backoff = DefaultTagger('n') else: backoff = None print("\nCalculating Brill & Wu Complementarity\n\nAfter the programme finished, please look for \ 'Results\CompareBrillWu_Regex"+tone+tag+".txt' and 'Results\Disagreement_BrillWuHtml_Regex"+tone+tag+".txt' in your current working directory.\n") bambara = create_reader(tone, tag) hmm = indivHMM(bambara) hmmtagged = hmm.tag_sents([untag(i) for i in bambara.test_sents]) hmm_acc = hmm.evaluate(bambara.test_sents) hmm_err = round((1 - (hmm_acc)),4) crf = indivCRF(bambara, tone, tag) crftagged = crf.tag_sents([untag(i) for i in bambara.test_sents]) crf_acc = crf.evaluate(bambara.test_sents) crf_err = round((1-(crf_acc)),4) tnt = indivTnT(bambara, backoff) tnttagged = tnt.tag_sents([untag(i) for i in bambara.test_sents]) tnt_acc = tnt.evaluate(bambara.test_sents) tnt_err = round((1-(tnt_acc)),4) unigram = indivUnigram(bambara, backoff) unitagged = unigram.tag_sents([untag(i) for i in bambara.test_sents])
# print(affix_ugram_backoff.evaluate(test))
# print(unigram_affix_backoff.evaluate(test))
# cutoffs = [x * 0.1 for x in range(20)]
# for c in cutoffs:
#     tagger = EntropyVotingTagger(taggers, c)
#     print("Accuracy of entropy voting = ", tagger.evaluate(test))

affix_tagger = EntropyAffixTagger(train)
unigram_tagger = EntropyUnigramTagger(train)
taggers = [unigram_tagger, affix_tagger]
tagger = EntropyVotingTagger(taggers, max_entropy=80)

from nltk.tag import untag
untagged_test = [untag(x) for x in dev]
tagged_sents_uni_affix = unigram_affix_backoff.tag_sents(untagged_test)
tagged_sents_entr = tagger.tag_sents(untagged_test)

affix_mistake = 0
unigram_mistake = 0
overall_mistakes = 0
print("len of dev: ", len(dev))
for tagged_reference_sent, tagged_uni_affix_sent, tagged_entropy_sent in zip(
        dev, tagged_sents_uni_affix, tagged_sents_entr):
    for tagged_reference, tagged_uni_affix, tagged_entropy in zip(
            tagged_reference_sent, tagged_uni_affix_sent, tagged_entropy_sent):
        if tagged_uni_affix[1] != tagged_entropy[1]:
            overall_mistakes += 1
            print("WE GOT MATCH!")
def VerbExtractor(tree):
    rc2 = []
    for subtree in tree.subtrees():
        if subtree.label() == 'Verb':
            rc2.append(str(untag(subtree)))
    return rc2
def AdjExtractor(tree):
    rc3 = []
    for subtree in tree.subtrees():
        if subtree.label() == 'Adj':
            rc3.append(str(untag(subtree)))
    return rc3
import nltk
from nltk.tag import untag

print(untag([('beautiful', 'NN'), ('morning', 'NN')]))
def PhraseExtractor(tree):
    rc4 = []
    for subtree in tree.subtrees():
        if subtree.label() == 'Phrase':
            rc4.append(str(untag(subtree)))
    return rc4
def PosExtractor(tree):
    ps = []
    for subtree in tree.subtrees():
        if subtree.label() == 'PWD':
            ps.append(str(untag(subtree)))
    return ps
def NegExtractor(tree):
    ns = []
    for subtree in tree.subtrees():
        if subtree.label() == 'NWD':
            ns.append(str(untag(subtree)))
    return ns
######### DEFAULT TAGGER ###############
# Assigning the default tag
from nltk.tag import DefaultTagger, untag

tagger = DefaultTagger('NN')
sents = [['Hello', 'World'], ['How', 'are', 'you', '?']]
print(tagger.tag(sents[0]))   # tag() expects a single sentence (a flat token list)
print(tagger.tag_sents(sents))

# Untagging
tagged = tagger.tag(sents[0])
print(untag(tagged))

# Evaluating the tagger accuracy
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
def trainALL(self, last):
    self.split_into_folds()
    for k in range(1, (self.folds + 1)):
        train_sents = sum(self.foldlist[: (self.folds - 1)], [])
        crf = CRFTagger(training_opt={"max_iterations": 100, "max_linesearch": 10, "c1": 0.0001, "c2": 1.0})
        crf_trained = crf.train(
            train_sents,
            "Models/model.crfCrossValidation1" + str(k) + self.option_tone + self.option_tag + ".tagger",
        )
        print(str(k) + " fold: crf")
        tnt_tagger = tnt.TnT(unk=DefaultTagger("n"), Trained=True, N=100)
        tnt_tagger.train(train_sents)
        print(str(k) + " fold: tnt")
        tag_set = set()
        symbols = set()
        for i in train_sents:
            for j in i:
                tag_set.add(j[1])
                symbols.add(j[0])
        trainer = HiddenMarkovModelTrainer(list(tag_set), list(symbols))
        hmm = trainer.train_supervised(train_sents, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
        print(str(k) + " fold: hmm")
        if last == "U":
            lasttagger = UnigramTagger(train_sents, backoff=DefaultTagger("n"))
            print(str(k) + " fold: unigram")
        if last == "B":
            if self.option_tone == "tonal" and self.option_tag == "Affixes":
                regex = RegexpTonalSA(DefaultTagger("n"))
            if self.option_tone == "tonal" and self.option_tag == "POS":
                regex = RegexpTonal(DefaultTagger("n"))
            if self.option_tone == "nontonal" and self.option_tag == "Affixes":
                regex = RegexpSA(DefaultTagger("n"))
            if self.option_tone == "nontonal" and self.option_tag == "POS":
                regex = Regexp(DefaultTagger("n"))
            dic = dictionary_backoff(self.option_tone, regex)
            affix = AffixTagger(train_sents, min_stem_length=0, affix_length=-4, backoff=dic)
            lasttagger = BigramTagger(train_sents, backoff=affix)
            print(str(k) + " fold: bigram")
        to_tag = [untag(i) for i in self.foldlist[self.folds - 1]]
        self.crf_tagged += crf.tag_sents(to_tag)
        self.tnt_tagged += tnt_tagger.tag_sents(to_tag)
        self.hmm_tagged += hmm.tag_sents(to_tag)
        self.lasttagger_tagged += lasttagger.tag_sents(to_tag)
        self.org_tagged += self.foldlist[self.folds - 1]
        self.foldlist = [self.foldlist[self.folds - 1]] + self.foldlist[: (self.folds - 1)]
    self.crf = crf
    self.tnt = tnt_tagger
    self.hmm = hmm
    self.lasttagger = lasttagger
    org_words = sum(self.org_tagged, [])
    self.crf_avg_acc = accuracy(org_words, sum(self.crf_tagged, []))
    self.tnt_avg_acc = accuracy(org_words, sum(self.tnt_tagged, []))
    self.hmm_avg_acc = accuracy(org_words, sum(self.hmm_tagged, []))
    self.lasttagger_avg_acc = accuracy(org_words, sum(self.lasttagger_tagged, []))
    print("Accuracy of concatenated crf-tagged sentences: ", self.crf_avg_acc)
    print("Accuracy of concatenated tnt-tagged sentences: ", self.tnt_avg_acc)
    print("Accuracy of concatenated hmm-tagged sentences: ", self.hmm_avg_acc)
    print("Accuracy of concatenated " + last + "-tagged sentences: ", self.lasttagger_avg_acc)
    (self.crf_tagprecision, self.crf_tagrecall) = self.tagprecision_recall(crf, self.crf_tagged, self.org_tagged)
    (self.tnt_tagprecision, self.tnt_tagrecall) = self.tagprecision_recall(
        tnt_tagger, self.tnt_tagged, self.org_tagged
    )
    (self.hmm_tagprecision, self.hmm_tagrecall) = self.tagprecision_recall(hmm, self.hmm_tagged, self.org_tagged)
    (self.lasttagger_tagprecision, self.lasttagger_tagrecall) = self.tagprecision_recall(
        lasttagger, self.lasttagger_tagged, self.org_tagged
    )
    self.org_tagged = []
    self.foldlist = []
    for i in range(1, self.folds + 1):
        self.foldlist.append(self.create_fold(i))