def no_backoff_taggers(test, train, corpus='floresta'):
    """Train and score standalone n-gram taggers (no backoff chain).

    The corpus default tagger is scored first. Unigram, bigram and trigram
    taggers are then each trained on ``train`` by themselves and scored on
    ``test``. Every tagger is passed to ``info`` and its accuracy is printed.

    Args:
        test: Tagged sentences handed to each tagger's ``evaluate``.
        train: Tagged sentences used to train the n-gram taggers.
        corpus: Corpus name given to ``default_tagger_corpus`` and used in
            the progress message.
    """
    score_line = 'accuracy score: {}\n'

    baseline = default_tagger_corpus(corpus)
    info('training {} taggers without backoff'.format(corpus))
    info('this may take a while...\n')
    info(baseline)
    print(score_line.format(baseline.evaluate(test)))

    # All three models are trained before any of them is evaluated.
    trained = [
        tagger_cls(train)
        for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger)
    ]
    for tagger in trained:
        info(tagger)
        print(score_line.format(tagger.evaluate(test)))
def get_lookup_tagger_accuracy(test_set, lookup_tagger_basis, corpus):
    """Score a lookup tagger built from the 200 most frequent basis items.

    Args:
        test_set: Tagged sentences the lookup tagger is evaluated on.
        lookup_tagger_basis: Sentences whose items are counted; the first
            element of each item is used as the word (presumably these are
            ``(word, tag)`` pairs -- confirm against the caller).
        corpus: Object whose ``tagged_words()`` supplies the word -> tag
            distribution.

    Returns:
        The value returned by the tagger's ``evaluate(test_set)``.
    """
    basis_items = []
    for sentence in lookup_tagger_basis:
        basis_items.extend(sentence)

    item_counts = FreqDist(basis_items)
    tags_by_word = ConditionalFreqDist(corpus.tagged_words())

    # Map each frequent word to the tag it carries most often in `corpus`.
    tag_lookup = {}
    for item, _count in item_counts.most_common(200):
        token = item[0]
        tag_lookup[token] = tags_by_word[token].max()

    lookup_tagger = UnigramTagger(model=tag_lookup)
    return lookup_tagger.evaluate(test_set)
def pos_tag(self):
    """Tokenize with ``NLTKTokenize`` and tag the tokens with the chosen tagger.

    ``self.options['tagger']`` selects the strategy:

    * ``'unigram'`` / ``'bigram'`` / ``'regex'`` -- taggers built from
      ``brown.tagged_sents(categories=trainer)``, where ``trainer`` is
      ``self.options['train']`` when that value is in ``TRAINERS`` and
      ``DEFAULT_TRAIN`` otherwise. Trained n-gram taggers are cached as
      pickle files under ``trained/`` next to this module. The unigram
      tagger is loaded or trained for all three choices.
    * ``'pos'`` (or a missing key when ``DEFAULT_TAGGER`` is ``'pos'``) --
      the module-level ``pos_tag`` function.

    Any other value leaves the tag list empty.

    Returns:
        Whatever ``self._dump`` returns for the resulting tags.
    """
    tokens = NLTKTokenize(self.options).tokenize()['result']

    if self.options.get('tagger') not in ['unigram', 'bigram', 'regex']:
        # Not a trainable tagger: either the stock tagger or no tags at all.
        if self.options.get('tagger', DEFAULT_TAGGER) == 'pos':
            return self._dump(pos_tag(tokens))
        return self._dump([])

    def load_or_train(pkl_path, build):
        # Reuse the pickled tagger when present, otherwise train and cache it.
        if os.path.isfile(pkl_path):
            with open(pkl_path, 'rb') as handle:
                return load(handle)
        model = build()
        with open(pkl_path, 'wb') as handle:
            dump(model, handle, -1)
        return model

    if self.options.get('train') in TRAINERS:
        trainer = self.options['train']
    else:
        trainer = DEFAULT_TRAIN
    train = brown.tagged_sents(categories=trainer)

    # Create your custom regex tagging pattern here
    regex_tag = RegexpTagger([
        (r'^[-\:]?[0-9]+(.[0-9]+)?$', 'CD'),
        (r'.*able$', 'JJ'),
        (r'^[A-Z].*$', 'NNP'),
        (r'.*ly$', 'RB'),
        (r'.*s$', 'NNS'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*', 'NN'),
    ])

    current = os.path.dirname(os.path.abspath(__file__))

    unigram_tag = load_or_train(
        current + '/trained/unigram_' + trainer + '.pkl',
        lambda: UnigramTagger(train, backoff=regex_tag),
    )

    choice = self.options['tagger']
    if choice == 'bigram':
        bigram_tag = load_or_train(
            current + '/trained/bigram_' + trainer + '.pkl',
            lambda: BigramTagger(train, backoff=unigram_tag),
        )
        tags = bigram_tag.tag(tokens)
    elif choice == 'unigram':
        tags = unigram_tag.tag(tokens)
    else:
        tags = regex_tag.tag(tokens)

    return self._dump(tags)
def backoff_taggers(test, train, save, corpus='floresta'):
    """Train, score and optionally save a chain of backoff n-gram taggers.

    The chain is default -> unigram -> bigram -> trigram, each tagger using
    the previous one as its backoff. Every tagger is passed to ``info`` and
    its accuracy on ``test`` is printed. When ``save`` is true, the tagger
    with the highest accuracy is pickled to
    ``<corpus>_<unigram|bigram|trigram>_tagger_backoff.pkl`` in the current
    directory; on a tie the lower-order tagger wins.

    Args:
        test: Tagged sentences handed to each tagger's ``evaluate``.
        train: Tagged sentences used to train the n-gram taggers.
        save: When false, return after scoring without writing a file.
        corpus: Corpus name given to ``default_tagger_corpus`` and used in
            the log message and the output file name.
    """
    score_line = 'accuracy score: {}\n'

    default_tagger = default_tagger_corpus(corpus)
    info('training {} taggers with backoff'.format(corpus))
    info('this may take a while...\n')
    info(default_tagger)
    print(score_line.format(default_tagger.evaluate(test)))

    # Train the whole chain first; evaluation follows in n-gram order.
    uni_tagger_backoff = UnigramTagger(train, backoff=default_tagger)
    bi_tagger_backoff = BigramTagger(train, backoff=uni_tagger_backoff)
    tri_tagger_backoff = TrigramTagger(train, backoff=bi_tagger_backoff)

    scored = []
    for label, tagger in (('unigram', uni_tagger_backoff),
                          ('bigram', bi_tagger_backoff),
                          ('trigram', tri_tagger_backoff)):
        info(tagger)
        score = tagger.evaluate(test)
        print(score_line.format(score))
        scored.append((score, label, tagger))

    if not save:
        return

    # max() keeps the first of equal maxima, so ties go to the lower-order
    # tagger, the same preference as the original if/elif chain.
    _, best_label, best_tagger = max(scored, key=lambda entry: entry[0])
    tagger_file = '{}_{}_tagger_backoff.pkl'.format(corpus, best_label)

    # The original never opened the output file in the trigram branch and so
    # raised NameError whenever the trigram tagger scored best.
    with open(tagger_file, 'wb') as output:
        dump(best_tagger, output, -1)
    info('saving %s...\n', tagger_file)
def find_combined_taggers_accuracy(train_set, test_set):
    """Print the accuracy of several tagger configurations on ``test_set``.

    Configurations, in print order: a default tagger using the most frequent
    tag of ``train_set``, a regex tagger, a unigram tagger backed by the
    default tagger, and a bigram tagger with no backoff, with the regex
    tagger as backoff, and with the unigram tagger as backoff.

    Args:
        train_set: Sentences of ``(word, tag)`` pairs used for training.
        test_set: Tagged sentences handed to each tagger's ``evaluate``.
    """
    # Most frequent tag in the training data becomes the default tag.
    tag_counts = FreqDist(
        tag for sentence in train_set for (_word, tag) in sentence
    )
    default_tagger = DefaultTagger(tag_counts.max())
    print("Default Tagger accuracy: ", default_tagger.evaluate(test_set))

    suffix_rules = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                      # nouns (default)
    ]
    regex_tagger = RegexpTagger(suffix_rules)
    print("Regex Tagger Accuracy: ", regex_tagger.evaluate(test_set))

    unigram_tagger = UnigramTagger(train_set, backoff=default_tagger)
    print("Unigram Tagger accuracy (Backoff = Default Tagger): ",
          unigram_tagger.evaluate(test_set))

    # Three bigram variants: trained in one order, evaluated in another,
    # exactly as before.
    plain_bigram = BigramTagger(train_set)
    bigram_over_unigram = BigramTagger(train_set, backoff=unigram_tagger)
    bigram_over_regex = BigramTagger(train_set, backoff=regex_tagger)

    plain_score = plain_bigram.evaluate(test_set)
    over_regex_score = bigram_over_regex.evaluate(test_set)
    over_unigram_score = bigram_over_unigram.evaluate(test_set)

    print("Bigram Tagger Accuracy: ", plain_score)
    print("Bigram Tagger Accuracy (Backoff = Regex Tagger): ",
          over_regex_score)
    print("Bigram Tagger Accuracy (Backoff = Unigram Tagger): ",
          over_unigram_score)
def __init__(self, mode, train_sents):
    """Train the tagger selected by ``mode``.

    Args:
        mode: ``TRIGRAM`` for a unigram -> bigram -> trigram backoff chain,
            ``HDM`` for ``HiddenMarkovModelTagger.train``.
        train_sents: Tagged sentences used for training.

    Raises:
        ValueError: If ``mode`` is neither ``TRIGRAM`` nor ``HDM``.
    """
    if mode == TRIGRAM:
        tagger = UnigramTagger(train_sents)
        tagger = BigramTagger(train_sents, backoff=tagger)
        self.tagger = TrigramTagger(train_sents, backoff=tagger)
    elif mode == HDM:
        # The original tested the constant itself (`elif HDM:`), so the
        # branch depended on HDM's truthiness instead of the requested mode.
        self.tagger = HiddenMarkovModelTagger.train(train_sents)
    else:
        raise ValueError('unknown tagger mode: {!r}'.format(mode))
def train_tagger(corpus_name, corpus):
    """
    Train the taggers and saves them

    A unigram, a bigram and a trigram tagger are each trained on ``corpus``
    (without backoff) and stored through ``utilities.save_pickle`` under the
    name ``<corpus_name>_<N_GRAM_NAMES[i]>``.

    Args:
        corpus_name: name of the corpus used to create the tagger
        corpus: corpus for creating the tagger
    """
    # List of n-gram taggers names
    complete_names = [corpus_name + '_' + x for x in N_GRAM_NAMES]

    tagger_classes = (
        (UnigramTagger, 'UnigramTagger'),
        (BigramTagger, 'BigramTagger'),
        (TrigramTagger, 'TrigramTagger'),
    )
    for position, (tagger_cls, label) in enumerate(tagger_classes):
        tagger = tagger_cls(corpus)
        utilities.save_pickle(tagger, complete_names[position],
                              TAGGER_EXTENSION, TAGGER_PATH)
        # Single pre-formatted argument: prints the same text on Python 2
        # and Python 3 (the original print statement was Python 2 only).
        print("%s trained with %s" % (label, corpus_name))
class Tagger(object):
    """Unigram part-of-speech tagger trained on an NLTK corpus.

    Tags are reduced to their upper-cased first character; tokens the
    unigram tagger leaves untagged get ``"X"``.
    """

    def __init__(self, cess_name="cess_esp"):
        """
        Tagger object.
        Allows to specify a cess.

        Args:
            cess_name: Attribute name looked up on ``nltk.corpus``; its
                ``tagged_sents()`` train the unigram tagger.
        """
        cess = getattr(nltk.corpus, cess_name)
        self.wnl = WordNetLemmatizer()
        self.ut = UnigramTagger(cess.tagged_sents())

    def pos_tag(self, tokens, lemmatize=False):
        """Tag ``tokens`` and reduce each tag to a one-letter type.

        Args:
            tokens: A list of tokens, or a string that is split on
                whitespace first.
            lemmatize: When true, each word is replaced by
                ``self.wnl.lemmatize(word)``.

        Returns:
            A list of ``(word, type)`` tuples, where ``type`` is the
            upper-cased first character of the tag, or ``"X"`` when the
            tagger produced no tag.
        """
        # isinstance also accepts str subclasses, which the original
        # `type(tokens) == str` comparison treated as token sequences.
        if isinstance(tokens, str):
            tokens = tokens.split()

        simplified = []
        for word, tag in self.ut.tag(tokens):
            coarse = tag[0].upper() if tag else "X"
            if lemmatize:
                word = self.wnl.lemmatize(word)
            simplified.append((word, coarse))
        return simplified

    def get_main_words(self, tokens, lemmatize=True, type_w=False):
        """Filter tagged tokens by type prefix.

        Args:
            tokens: Forwarded to ``pos_tag``.
            lemmatize: Forwarded to ``pos_tag``.
            type_w: Iterable of type prefixes (compared case-insensitively).
                When falsy, every token is kept.

        Returns:
            The result of ``filter`` over the ``(word, type)`` tuples (a
            list on Python 2, a lazy iterator on Python 3).
        """
        def cond(t):
            if not type_w:
                return True
            kind = t[1].lower()
            return any(kind.startswith(prefix.lower()) for prefix in type_w)

        return filter(cond, self.pos_tag(tokens, lemmatize=lemmatize))
def get_pos_tagger():
    """Build a regex-over-trigram backoff tagger trained on the Brown corpus.

    Lookup order for a token: the quantifier overrides, then trigram,
    bigram and unigram taggers trained on ``brown.tagged_sents()``, and
    finally the suffix/shape regex rules.

    Returns:
        The outermost ``nltk.RegexpTagger``.
    """
    from nltk.corpus import brown

    fallback_rules = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        (r'.*able$', 'JJ'),                # adjectives
        (r'.*ness$', 'NN'),                # nouns formed from adjectives
        (r'.*ly$', 'RB'),                  # adverbs
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # past tense verbs
        (r'.*', 'NN')                      # nouns (default)
    ]
    # Quantifier words that must win over whatever the corpus says.
    override_rules = [
        (r'(A|a|An|an)$', 'ex_quant'),
        (r'(Every|every|All|all)$', 'univ_quant'),
    ]

    tagger = nltk.RegexpTagger(fallback_rules)
    brown_train = brown.tagged_sents()
    for ngram_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        tagger = ngram_cls(brown_train, backoff=tagger)

    return nltk.RegexpTagger(override_rules, backoff=tagger)
def __init__(self, cess_name="cess_esp"):
    """Set up the lemmatizer and a unigram tagger.

    Args:
        cess_name: Attribute name looked up on ``nltk.corpus``; the
            ``tagged_sents()`` of that corpus train the unigram tagger.
    """
    corpus_reader = getattr(nltk.corpus, cess_name)
    self.wnl = WordNetLemmatizer()
    training_sents = corpus_reader.tagged_sents()
    self.ut = UnigramTagger(training_sents)
def lookupTagger(r, c):
    """Build a lookup tagger from the ``r`` most common words of a corpus.

    Args:
        r: Number of most common words to include in the lookup model.
        c: Corpus selector, ``"brown"`` or ``"chat"``.

    Returns:
        A ``UnigramTagger`` whose model maps each selected word to its most
        frequent tag, backed by ``DefaultTagger("NN")``; ``None`` for any
        other value of ``c``.
    """
    if c == "brown":
        tagged_words, source = brownTW, brown
    elif c == "chat":
        tagged_words, source = chatTW, chat
    else:
        return None

    tags_by_word = ConditionalFreqDist(tagged_words)
    word_counts = FreqDist(source.words())

    model = {}
    for word, _count in word_counts.most_common(r):
        model[word] = tags_by_word[word].max()

    return UnigramTagger(model=model, backoff=nltk.DefaultTagger("NN"))
def _model_definition(self) -> UnigramTagger:
    """Create the initial tagger.

    The tagger is trained on a single one-token sentence that tags ``"."``
    as ``PUNCT`` and backs off to a ``DefaultTagger('NOUN')``.

    Returns:
        Model object.
    """
    fallback = DefaultTagger('NOUN')
    seed_sentences = [[(".", "PUNCT")]]
    return UnigramTagger(seed_sentences, backoff=fallback)
def TrainTaggers(training, testing):
    """Train a unigram -> bigram -> trigram chain and record its accuracy.

    The unigram tagger backs off to the module-level ``default``. The
    trigram tagger's ``evaluate(testing)`` result is appended to the
    module-level ``results``.

    Args:
        training: Tagged sentences used to train every tagger in the chain.
        testing: Tagged sentences the trigram tagger is evaluated on.
    """
    global results

    stages = (
        (UnigramTagger, 'unigram trained'),
        (BigramTagger, 'bigram trained'),
        (TrigramTagger, 'trigram trained'),
    )
    tagger = default
    for tagger_cls, done_message in stages:
        tagger = tagger_cls(training, backoff=tagger)
        print(done_message)

    results += [tagger.evaluate(testing)]
def __init__(self, train_sents, to_detect_list, n_gram=1):
    """Train an n-gram tagger on the last two fields of each training item.

    Args:
        train_sents: Sentences of 3-tuples; the first field is dropped and
            the remaining two are used as the training pair.
        to_detect_list: Stored unchanged on ``self.to_detect_list``.
        n_gram: Highest n-gram order to train. Values above 1 add a bigram
            tagger, values above 2 also add a trigram tagger, each backing
            off to the previous one.
    """
    train_data = []
    for sent in train_sents:
        train_data.append([(second, third) for _first, second, third in sent])

    tagger = UnigramTagger(train_data)
    for threshold, ngram_cls in ((1, BigramTagger), (2, TrigramTagger)):
        if n_gram > threshold:
            tagger = ngram_cls(train_data, backoff=tagger)

    self.tagger = tagger
    self.to_detect_list = to_detect_list
def train(self, model_path):
    """Train a bigram tagger on the lower-cased ``CORPUS`` and pickle it.

    Backoff chain: bigram -> unigram -> ``DefaultTagger('UNK')``.

    Args:
        model_path: File path the pickled bigram tagger is written to.
    """
    lowered = []
    for sent in CORPUS:
        lowered.append([(token.lower(), tag) for token, tag in sent])

    fallback = DefaultTagger('UNK')
    unigram = UnigramTagger(lowered, backoff=fallback)
    bigram = BigramTagger(lowered, backoff=unigram)

    with open(model_path, "wb") as model_file:
        pickle.dump(bigram, model_file)
def lookup_tag(num_sampling):
    """Build a lookup tagger from Brown 'news' and print a demo and a score.

    The ``num_sampling`` most common words of the Brown 'news' category are
    mapped to their most frequent tag. The resulting tagger (no backoff)
    tags a fixed sample sentence, which is printed, followed by its
    ``evaluate`` score on the module-level ``brown_tagged_sents``.

    Args:
        num_sampling: Number of most common words put into the lookup model.
    """
    raw = 'I am applying for AIT because I can be with my parents here and I am already granted a scholarship'

    # How often each word occurs, and which tags each word carries.
    word_counts = FreqDist(brown.words(categories='news'))
    tags_by_word = ConditionalFreqDist(brown.tagged_words(categories='news'))

    # word -> most likely tag, restricted to the most common words.
    likely_tags = {}
    for word, _count in word_counts.most_common(num_sampling):
        likely_tags[word] = tags_by_word[word].max()

    # A unigram tagger needs no context: it only looks the word up.
    lookup_tagger = UnigramTagger(model=likely_tags)

    print(lookup_tagger.tag(word_tokenize(raw)))
    print(lookup_tagger.evaluate(brown_tagged_sents))
def __init__(self, train_sents):
    """Train a trigram tagger with a full backoff chain.

    Backoff chain: trigram -> bigram -> unigram -> ``DefaultTagger('NN')``.

    Args:
        train_sents: Already-tagged training sentences. The original note
            mentions the Brown, conll2000 and TreeBank corpora.
    """
    tagger = DefaultTagger('NN')
    for ngram_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        tagger = ngram_cls(train_sents, backoff=tagger)
    self.tagger = tagger
def train_tagger(corpus_name, corpus):
    """Train a unigram and a bigram tagger and store both via ``save_tagger``.

    The unigram tagger is saved as ``<corpus_name>_unigram.tagger``; the
    bigram tagger, which backs off to it, as ``<corpus_name>_bigram.tagger``.
    A confirmation line is written to stderr afterwards.

    Args:
        corpus_name: Name used in the saved file names and the message.
        corpus: Tagged sentences used to train both taggers.
    """
    unigram = UnigramTagger(corpus)
    unigram_name = '{}_unigram.tagger'.format(corpus_name)
    save_tagger(unigram_name, unigram)

    bigram = BigramTagger(corpus, backoff=unigram)
    bigram_name = '{}_bigram.tagger'.format(corpus_name)
    save_tagger(bigram_name, bigram)

    template = str("Tagger trained with {} using "
                   "UnigramTagger and BigramTagger.")
    print(template.format(corpus_name), file=sys.stderr)
def create_tagger(sents,patterns=PATTERNS,maxngram=4):
    """Train a backoff tagger on a corpus of tagged sentences.

    Backoff chain, outermost first: n-gram of order ``maxngram`` ->
    trigram -> bigram -> unigram -> regex (``patterns``) ->
    ``DefaultTagger('NN')``.

    Args:
        sents: Tagged sentences used to train every n-gram tagger.
        patterns: Regex/tag pairs for the ``RegexpTagger``.
        maxngram: Order of the outermost ``NgramTagger``.

    Returns:
        The outermost ``NgramTagger``.
    """
    tagger = RegexpTagger(patterns, backoff=DefaultTagger('NN'))
    for ngram_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        tagger = ngram_cls(sents, backoff=tagger)
    return NgramTagger(maxngram, sents, backoff=tagger)
def get_tagger(type="StandfordPOSTagger"):
    """Return a part-of-speech tagger.

    Args:
        type: ``"Custom"`` selects a bigram tagger trained on the Brown
            'news' category with the universal tagset (backoff: unigram ->
            ``DefaultTagger('NOUN')``). Any other value selects a
            ``StanfordPOSTagger`` built from the bundled model and jar paths.

    Returns:
        The selected tagger.
    """
    if type != "Custom":
        return StanfordPOSTagger(
            'data/./models/wsj-0-18-bidirectional-distsim.tagger',
            '3rdparty_libs/stanford-postagger.jar')

    news_sents = brown.tagged_sents(categories='news', tagset='universal')
    fallback = DefaultTagger('NOUN')
    unigram = UnigramTagger(news_sents, backoff=fallback)
    return BigramTagger(news_sents, backoff=unigram)
def __init__(self, modelpath, candidates):
    """Load candidate and reference JSON files and set up counters.

    Args:
        modelpath: Path prefix (concatenated directly, so it is expected to
            end with a separator) under which the JSON files live.
        candidates: Value inserted into the candidates file name,
            ``all_highest_probs_<candidates>.json``.
    """
    self.modelpath = modelpath
    self.bus_counter = 0

    candidates_file = modelpath + 'all_highest_probs_' + str(candidates) + '.json'
    with open(candidates_file, 'r') as f:
        self.candidates = json.load(f)

    # A leftover note in the original mentions 'restoredmodel_refs_greedy.json'
    # as an alternative source for the references.
    refs_file = modelpath + 'inject_refcoco_refrnn_compositional_3_512_1/4eval_greedy.json'
    with open(refs_file, 'r') as f:
        self.refs = json.load(f)

    # One entry per line of the noun list, surrounding whitespace removed.
    self.words_that_are_names = list()
    with open("./noun_list_long.txt", 'r') as f:
        self.words_that_are_names.extend(row.strip() for row in f)

    self.unigram_tagger = UnigramTagger(brown.tagged_sents())

    # defaultdict() without a factory behaves like a plain dict on lookup.
    self.zero_shot_refs = defaultdict()
    self.non_noun_counter = 0
    self.baseline_top_1 = defaultdict()
    self.baseline_top_5 = defaultdict()
    self.baseline_top_10 = defaultdict()
def task3(data, corpus):
    """Test ``CombinedTagger`` on four training splits with a lookup default.

    A lookup tagger is built from the 200 most frequent words of ``corpus``
    (each mapped to its most frequent tag) and used as the ``default`` of a
    ``CombinedTagger`` trained on each of ``data["train_brown50"]``,
    ``data["train_brown90"]``, ``data["train_nps50"]`` and
    ``data["train_nps90"]``. Each tagger is passed to ``test_tagger``.

    Args:
        data: Mapping holding the ``train_*`` entries; also forwarded to
            ``test_tagger``.
        corpus: Object providing ``words()`` and ``tagged_words()``.
    """
    word_counts = FreqDist(corpus.words())
    tags_by_word = ConditionalFreqDist(corpus.tagged_words())

    # Stable sort by descending count, then keep the first 200 words.
    ranked = sorted(word_counts.items(), key=lambda pair: pair[1], reverse=True)
    top_words = [word for word, _count in ranked[:200]]

    likely_tags = {word: tags_by_word[word].max() for word in top_words}
    lookup_tagger = UnigramTagger(model=likely_tags)

    for split_name in ("brown50", "brown90", "nps50", "nps90"):
        tagger = CombinedTagger(train=data["train_" + split_name],
                                default=lookup_tagger,
                                name=split_name)
        test_tagger(tagger, data)
def generateTagger():
    """Build a Spanish unigram tagger with regex and default fallbacks.

    Backoff chain: unigram tagger trained on ``cess_esp.tagged_sents()`` ->
    word-ending regex rules -> ``DefaultTagger('V')``.

    Returns:
        The combined ``UnigramTagger``.
    """
    ending_rules = [
        (r'.*o$', 'NMS'),   # noun masculine singular
        (r'.*os$', 'NMP'),  # noun masculine plural
        (r'.*a$', 'NFS'),   # noun feminine singular
        (r'.*as$', 'NFP')   # noun feminine plural
    ]
    fallback = RegexpTagger(ending_rules, backoff=DefaultTagger('V'))

    # Train nltk.UnigramTagger on the tagged sentences from cess_esp.
    return UnigramTagger(cess_esp.tagged_sents(), backoff=fallback)
def train(self, corpus: Corpus, evaluate: bool = True, config: dict = None) -> Union[None, Dict[str, Dict[str, float]]]:
    """Train method.

    When ``self.model`` is ``None``, ``self._model_definition()`` is called
    first (its return value is not used here). ``self.model`` is then
    replaced by a ``UnigramTagger`` trained on ``corpus.train.sentences``
    with a ``DefaultTagger('NOUN')`` backoff.

    Args:
        corpus: Corpus to train model.
        evaluate: Flag to return evaluation of the model.
        config: Training config dict (not used for this model).

    Returns:
        Model evaluation metrics when ``evaluate`` is true, else ``None``.
    """
    if self.model is None:
        self._model_definition()

    sentences = corpus.train.sentences
    fallback = DefaultTagger('NOUN')
    self.model = UnigramTagger(sentences, backoff=fallback)

    return self.evaluate(corpus) if evaluate else None
def __init__(self):
    """Load the pickled Spanish tagger, training and caching it if absent.

    The tagger is read from ``tagger_spanish.pickle`` in the current working
    directory. When that file does not exist, a ``UnigramTagger`` is trained
    on ``cess_esp.tagged_sents()``, stored on ``self.tagger`` and pickled to
    the same path.
    """
    pickle_path = 'tagger_spanish.pickle'

    if os.path.exists(pickle_path):
        # Pickle data is bytes: the original text-mode open fails on
        # Python 3 and can corrupt the stream on Windows.
        # NOTE: pickle.load executes arbitrary code from the file; only
        # load a pickle this program wrote itself.
        with open(pickle_path, 'rb') as file_obj:
            self.tagger = pickle.load(file_obj)
        return

    print('tagger_spanish.pickle not found. Training tagger... may take a few minutes...')
    from nltk import UnigramTagger
    from nltk.corpus import cess_esp

    sents = cess_esp.tagged_sents()
    # NOTE(review): the original also trained a BigramTagger with this
    # unigram tagger as backoff but then stored the unigram tagger, so the
    # bigram model was never used. The stored tagger is unchanged here; if
    # the bigram tagger was intended, train and assign it instead.
    self.tagger = UnigramTagger(sents)

    with open(pickle_path, 'wb') as file_obj:
        pickle.dump(self.tagger, file_obj)  # Dump trained tagger
def __init__(self, train=None, default=None, name=None):
    """Build a bigram -> unigram -> regex -> ``default`` backoff chain.

    Args:
        train: Tagged sentences used to train the unigram and bigram taggers.
        default: Tagger used as the backoff of the regex tagger.
        name: Label stored on ``self.name``.
    """
    self.name = name
    self.default = default

    # As found on page 199 of the nltk book
    suffix_rules = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
    ]

    self.regex = RegexpTagger(suffix_rules, backoff=default)
    self.unigram = UnigramTagger(train=train, backoff=self.regex)
    self.bigram = BigramTagger(train=train, backoff=self.unigram)
def trained_tagger():
    """Train a trigram backoff tagger, pickle it and return it.

    Training data is the concatenation of the tagged sentences of the Brown,
    conll2000 and treebank corpora. Backoff chain: trigram -> bigram ->
    unigram -> ``DefaultTagger('NN')``. The trained tagger is written to
    ``DataBase/trained_tagger.pkl`` (relative to the working directory).

    Returns:
        The trained ``TrigramTagger``.
    """
    # Aggregate trained sentences for N-Gram Taggers
    train_sents = nltk.corpus.brown.tagged_sents()
    train_sents += nltk.corpus.conll2000.tagged_sents()
    train_sents += nltk.corpus.treebank.tagged_sents()

    tagger = DefaultTagger('NN')
    for ngram_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        tagger = ngram_cls(train_sents, backoff=tagger)

    # Context manager guarantees the pickle is flushed and closed; the
    # original passed an anonymous open() handle that was never closed.
    with open(r'DataBase/trained_tagger.pkl', 'wb') as model_file:
        pickle.dump(tagger, model_file)
    return tagger
def ngram_tag_with_backoff():
    """Score a bigram tagger that backs off to a Brown 'news' lookup tagger.

    The lookup model maps up to 1,000,000 of the most common Brown 'news'
    words to their most frequent tag. The bigram tagger is trained on the
    first 90% of the module-level ``brown_tagged_sents`` and evaluated on
    the rest. The held-out sentences and the score are printed.
    """
    word_counts = FreqDist(brown.words(categories='news'))
    # Tag distribution per word, used to pick each word's most frequent tag.
    tags_by_word = ConditionalFreqDist(brown.tagged_words(categories='news'))

    likely_tags = {}
    for word, _count in word_counts.most_common(1000000):
        likely_tags[word] = tags_by_word[word].max()

    # Context-free lookup: every known word always gets its most likely tag.
    lookup_tagger = UnigramTagger(model=likely_tags)

    # 90/10 split by position.
    train_len = int(len(brown_tagged_sents) * 0.9)
    held_out = brown_tagged_sents[train_len:]
    print(held_out)

    bigram_tagger = BigramTagger(brown_tagged_sents[:train_len],
                                 backoff=lookup_tagger)
    print(bigram_tagger.evaluate(brown_tagged_sents[train_len:]))
def train_and_save_unigram_tagger():
    """Train a Brown unigram tagger with a regex backoff and pickle it.

    The tagger is trained on ``brown.tagged_sents()`` and written to
    ``../taggers/unigram_tagger.pkl`` with ``dump(..., -1)``.
    """
    train_text = brown.tagged_sents()
    regexp_tagger = RegexpTagger([
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        (r'.*able$', 'JJ'),                # adjectives
        (r'.*ness$', 'NN'),                # nouns formed from adjectives
        (r'.*ly$', 'RB'),                  # adverbs
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # past tense verbs
        (r'.*', 'NN')                      # nouns (default)
    ])
    unigram_tagger = UnigramTagger(train_text, backoff=regexp_tagger)

    # `with` closes the file even when dump() raises; the original manual
    # open()/close() pair leaked the handle on error.
    with open('../taggers/unigram_tagger.pkl', 'wb') as output:
        dump(unigram_tagger, output, -1)
def trained_tagger():
    """Train and pickle a trigram backoff tagger unless one already exists.

    When ``DataBase/trained_tagger.pkl`` exists under the current working
    directory, a notice is printed and nothing else happens. Otherwise a
    trigram -> bigram -> unigram -> ``DefaultTagger('NN')`` chain is trained
    on the concatenated tagged sentences of the Brown, conll2000 and
    treebank corpora and pickled to that path.

    Returns:
        None in both cases; the tagger is only written to disk.
    """
    if os.path.exists(os.path.join(os.getcwd(), r"DataBase/trained_tagger.pkl")):
        print("Trained Tagger File already Exists..")
        return

    # Aggregate trained sentences for N-Gram Taggers
    train_sents = nltk.corpus.brown.tagged_sents()
    train_sents += nltk.corpus.conll2000.tagged_sents()
    train_sents += nltk.corpus.treebank.tagged_sents()

    tagger = DefaultTagger('NN')
    for ngram_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        tagger = ngram_cls(train_sents, backoff=tagger)

    # Context manager guarantees the pickle is flushed and closed; the
    # original passed an anonymous open() handle that was never closed.
    with open(r'DataBase/trained_tagger.pkl', 'wb') as model_file:
        pickle.dump(tagger, model_file)
def __init__(self, train_sents, load=False):
    """Load a saved tagger or train a new backoff chain.

    Args:
        train_sents: Sentences of ``(word, tag)`` pairs used for training;
            ignored when ``load`` is true.
        load: When true, call ``self.load()`` instead of training.

    When training, the chain is trigram -> bigram -> unigram ->
    ``AffixTagger(affix_length=-3)`` -> ``DefaultTagger``. The default tag is
    ``argmax`` of the tag counts (presumably the most frequent tag --
    confirm against the ``argmax`` helper). The result is stored on
    ``self.tgr`` and the training time is printed.
    """
    if load:
        # Progress text without a newline so 'done.' lands on the same line.
        # sys.stdout.write works on Python 2 and 3, unlike the original
        # trailing-comma print statement.
        import sys
        sys.stdout.write('Loading saved tagger... ')
        self.load()
        print('done.')
        return

    time_start = time.time()
    print('Training the tagger...')

    tag_counts = Counter(t for s in train_sents for w, t in s)
    default_tag = argmax(tag_counts)

    def_tgr = DefaultTagger(default_tag)
    tagger = AffixTagger(train_sents, affix_length=-3, backoff=def_tgr)
    for ngram_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        tagger = ngram_cls(train_sents, backoff=tagger)
    self.tgr = tagger

    print('Done.')
    time_stop = time.time()
    print('Training time: {0:.2f}s'.format(time_stop - time_start))
def prepare_toolset():
    """Assemble the tagging and lemmatizing tools into a dictionary.

    Keys of the returned dict:
        ``'tgr'``: ``RegexpTagger`` over the shape patterns, backing off to
            trigram -> bigram -> unigram taggers trained on the Brown
            'learned', 'news' and 'reviews' categories (universal tagset),
            and finally ``DefaultTagger('NN')``.
        ``'sw'``: ``stopwords.words('english')``.
        ``'lr'``: a ``WordNetLemmatizer``.
        ``'wntg'``: mapping from universal tag names to WordNet constants.

    Returns:
        The toolset dictionary. ``'Tools Ready'`` is printed before returning.
    """
    shape_patterns = [
        (r'^[\.1-9]+$', 'NUM'),
        (r'^[^a-zA-Z]+$', '.'),
        (r'^[^a-zA-Z]*[a-zA-Z]+[-\'][a-zA-Z]+[^a-zA-Z]*$', 'NOUN'),
        (r'^.*[a-zA-Z]+[^-a-zA-Z]+[a-zA-Z]+.*$', '.'),
    ]

    learned = brown.tagged_sents(categories='learned', tagset='universal')
    news = brown.tagged_sents(categories='news', tagset='universal')
    reviews = brown.tagged_sents(categories='reviews', tagset='universal')
    train_set = learned + news + reviews

    tagger = DefaultTagger('NN')
    for ngram_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        tagger = ngram_cls(train=train_set, backoff=tagger)

    toolset = {
        'tgr': RegexpTagger(regexps=shape_patterns, backoff=tagger),
        'sw': stopwords.words('english'),
        'lr': WordNetLemmatizer(),
        'wntg': {
            'NOUN': wordnet.NOUN,
            'VERB': wordnet.VERB,
            'ADJ': wordnet.ADJ,
            'ADV': wordnet.ADV,
            'X': wordnet.NOUN,
        },
    }
    print('Tools Ready')
    return toolset
""" return [u"%s/%s" % (t, p) for t, p in sent.pos() if not t in ["-LRB-", "-RRB-"]] if __name__ == "__main__": if len(sys.argv) < 3: print "Usage:\n\t%s <corpus>" % sys.argv[0] sys.exit(-1) # Prepare corpus tagged_sents = build_tagged_sents(sys.argv[1:]) random.shuffle(tagged_sents) tagged_train = tagged_sents[: len(tagged_sents) / 2] tagged_test = tagged_sents[len(tagged_sents) / 2 :] # Train unigram tagger print "Training unigram tagger..." unigram_tagger = UnigramTagger(tagged_train) print "\taccuracy: %f" % unigram_tagger.evaluate(tagged_test) # Train brill tagger print "Training Brill tagger..." templates = [ # Context tag in a 1, 2 and 3 word window SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 1)), SymmetricProximateTokensTemplate(ProximateTagsRule, (2, 2)), SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 2)), SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 3)), # Context word in a 1, 2 and 3 word window SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 1)), SymmetricProximateTokensTemplate(ProximateWordsRule, (2, 2)), SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 2)), SymmetricProximateTokensTemplate(ProximateWordsRule, (1, 3)), # Closest tag
# import sys
# sys.exit(1)

# Universal-tagset Brown sentences, split by position:
# first 10% -> dev, next 10% -> test, remaining 80% -> train.
all_words = corpus.brown.tagged_sents(tagset='universal')
# random.shuffle(all_words) # we shuffle it so we don't get a specific category as the test set!
ds_length = len(all_words)
_dev_end = int(0.1 * ds_length)
_test_end = int(0.2 * ds_length)
dev = all_words[:_dev_end]
test = all_words[_dev_end:_test_end]
train = all_words[_test_end:]

from nltk import UnigramTagger, AffixTagger

# Plain NLTK taggers, each standalone and as the other's backoff.
unigram = UnigramTagger(train)
affix_ugram_backoff = AffixTagger(train, backoff=unigram)
affix = AffixTagger(train)
unigram_affix_backoff = UnigramTagger(train, backoff=affix)
# print "testing"
# print affix_ugram_backoff.evaluate(test)
# print unigram_affix_backoff.evaluate(test)

# cutoffs = [x*0.1 for x in range(20)]
# for c in cutoffs:
#     tagger = EntropyVotingTagger(taggers, c)
#     print "Accuracy of entropy voting = ", tagger.evaluate(test)

# Entropy-based taggers combined by a voting tagger.
affix_tagger = EntropyAffixTagger(train)
unigram_tagger = EntropyUnigramTagger(train)
taggers = [unigram_tagger, affix_tagger]
tagger = EntropyVotingTagger(taggers, max_entropy=80)

from nltk.tag import untag
def performance(wordList):
    """Score a lookup tagger built from ``wordList`` on ``taggedSents``.

    Args:
        wordList: Iterable of ``(item, freq)`` pairs; ``item[0]`` is used as
            the word. Words with an empty entry in the module-level ``cfd``
            are skipped.

    Returns:
        ``0`` when no word qualifies, otherwise the ``evaluate`` result of a
        ``UnigramTagger`` over the lookup model with a
        ``DefaultTagger("NN")`` backoff.
    """
    model = {}
    for item, _freq in wordList:
        token = item[0]
        if len(cfd[token]):
            model[token] = cfd[token].max()

    if not model:
        return 0

    lookup_tagger = UnigramTagger(model=model, backoff=DefaultTagger("NN"))
    return lookup_tagger.evaluate(taggedSents)
from nltk import UnigramTagger
from nltk.corpus import treebank
from tag_util import word_tag_model

# Lookup model computed by word_tag_model from the treebank words and
# tagged words.
model = word_tag_model(treebank.words(), treebank.tagged_words())

# Model-only unigram tagger: no training sentences, no backoff.
tagger = UnigramTagger(model=model)

# Evaluate on the treebank sentences from index 3000 onward.
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
def createModel(self):
    """Train a chain of backoff taggers and pickle the best-scoring one.

    Steps visible in this method:

    1. Split ``self.corpusSents`` into training and test parts at
       ``round(self.training_portion * N)``.
    2. Collect regex/tag pairs from ``self.regex_list`` and from the
       ``'postaggers.regex'`` section of ``self.config``.
    3. Build regex -> unigram -> bigram -> trigram -> n-gram
       (``self.max_ngrams``) taggers, each backing off to the previous one.
       The unigram tagger is additionally trained on the result of
       ``self.buildUnigrams()``.
    4. When ``self.training_portion != 1``, score the four n-gram taggers on
       the test part and pickle the best one under ``self.taggers_path``.

    Any exception is caught, printed, and followed by ``pdb.set_trace()``.
    ``model_name`` is assigned but no ``return`` statement is visible, so
    the method returns ``None`` as far as this view shows.

    NOTE(review): this is Python 2 code (print statements, ``str.decode``,
    ``except Exception,e``). The original indentation was lost; the nesting
    of the scoring/saving section under ``if self.training_portion!=1`` is a
    reconstruction -- confirm against the original file.
    """
    model_name=None
    try:
        unigrams=self.buildUnigrams()
        N=len(self.corpusSents)
        # NOTE(review): round() returns a float on Python 2; confirm that
        # self.corpusSents accepts it as a slice bound.
        toTraining=round(self.training_portion*N)
        # (disabled) logging.info of the total sentence count N
        training=self.corpusSents[:toTraining]
        test=self.corpusSents[toTraining:]
        post_patterns=[]
        for regex,post in self.regex_list:
            # Best-effort decode: patterns that cannot be decoded are kept
            # unchanged (the bare except swallows every error).
            try:
                regex=regex.decode('utf-8')
            except:
                pass
            post_patterns.append((regex,post))
        # Patterns from the config file are decoded unconditionally.
        for regex,post in self.config.items('postaggers.regex'):
            post_patterns.append((regex.decode('utf-8'),post))
        # Backoff chain, innermost first.
        regexpTagger = RegexpTagger(post_patterns)
        unigramTagger = UnigramTagger(unigrams+training,backoff=regexpTagger)
        bigramTagger= BigramTagger(training, backoff=unigramTagger)
        trigramTagger = TrigramTagger(training, backoff=bigramTagger)
        NTagger=NgramTagger(self.max_ngrams,training,backoff=trigramTagger)
        print("Sentencias de entrenamiento para n-taggers:" + str(len(training)))
        print("Sentencias de entrenamiento para unitaggers:" + str(len(unigrams)))
        print("Cantidad de palabras ADICIONALES de DICCIONARIOS para el unitagger:" + str(len(unigrams)))
        print("Sentencias para testing:" + str(len(test)))
        print("Expresiones regulares para el Tagger:")
        for post_regex in post_patterns:
            print post_regex
        if self.training_portion!=1:
            score_ut=unigramTagger.evaluate(test)
            # The bigram score is reduced by 0.002 before comparison; the
            # reason is not visible here (presumably a tie-breaking bias
            # against the bigram tagger -- confirm).
            score_bt=bigramTagger.evaluate(test)-0.002
            score_tt=trigramTagger.evaluate(test)
            score_nt=NTagger.evaluate(test)
            scores=[score_ut,score_bt,score_tt,score_nt]
            tagger_names=["uTagger","biTagger","triTagger","NTagger"]
            taggers=[unigramTagger,bigramTagger,trigramTagger,NTagger]
            # index(max(...)) picks the first tagger with the top score.
            bestTagger_index= scores.index(max(scores))
            best_msg=max(scores),tagger_names[bestTagger_index]
            fname=self.taggers_path + tagger_names[bestTagger_index]
            # Avoid overwriting an existing file: append the number of
            # entries in the taggers directory to the base name.
            if os.path.isfile(fname+self.tagger_extension_file):
                fname=fname+str(len(listdir(self.taggers_path)))+self.tagger_extension_file
            else:
                fname=self.taggers_path + tagger_names[bestTagger_index]+self.tagger_extension_file
            model=taggers[bestTagger_index]
            f = open(fname,'wb')
            pickle.dump(model, f)
            f.close()
            print ("Guardando el tagger :" + fname)
            # (disabled) logging.info of the saved tagger path
            model_name=fname
    except Exception,e:
        print "ERRPR EN POS TAGGER GENERATOR:",str(e)
        # Drops into the interactive debugger on any failure.
        pdb.set_trace()
def treeSentenceToTuples(sent):
    """Render a parsed sentence as ``token/tag`` strings, without brackets.

    :param sent: a Tree representing a sentence
    :type sent: nltk.tree.Tree
    :return: one ``u"token/tag"`` string per ``sent.pos()`` pair, skipping
        the bracket tokens ``-LRB-`` and ``-RRB-``
    """
    bracket_tokens = ("-LRB-", "-RRB-")
    return [u"%s/%s" % (t, p) for t, p in sent.pos() if t not in bracket_tokens]


if __name__ == "__main__":
    # One corpus file is enough: the usage text documents a single argument
    # and the loop below handles any number of files. The original `< 3`
    # check rejected the single-file invocation it advertised.
    if len(sys.argv) < 2:
        print("Usage:\n\t%s <corpus>" % sys.argv[0])
        sys.exit(-1)

    training = []
    testing = []
    lineIdx = 0
    for fname in sys.argv[1:]:
        # `with` closes the file even when a line fails to parse.
        with codecs.open(fname, "r", "utf-8") as fin:
            for line in fin:
                lineIdx += 1
                # NOTE(review): Tree.parse is the NLTK 2 spelling; NLTK 3
                # names it Tree.fromstring -- confirm the target version.
                t = Tree.parse(line)
                # Even-numbered lines train, odd-numbered lines test; the
                # counter runs on across files.
                if lineIdx % 2 == 0:
                    training.append(t.pos())
                else:
                    testing.append(t.pos())

    # Train tagger
    unigram_tagger = UnigramTagger(training)

    # Evaluate
    print("Accuracy: %f" % unigram_tagger.evaluate(testing))