def pos_tag(self):
    """Tokenize the configured input and part-of-speech tag the tokens.

    The tagger is selected by ``self.options['tagger']``:

    * ``'regex'``   -- a ``RegexpTagger`` built from the rules below.
    * ``'unigram'`` -- a ``UnigramTagger`` trained on the Brown corpus,
      backing off to the regex tagger.
    * ``'bigram'``  -- a ``BigramTagger`` backing off to the unigram tagger.
    * ``'pos'``     -- the module-level ``pos_tag`` function; this is also
      used when the option is absent and ``DEFAULT_TAGGER`` is ``'pos'``.

    Any other value leaves the tag list empty.

    Trained unigram/bigram taggers are cached as pickles in a ``trained``
    directory next to this file.  The directory is created on demand, and
    nothing is trained or loaded when only the regex tagger is requested.

    Returns:
        The result of ``self._dump`` applied to the list of tags.
    """
    tokens = NLTKTokenize(self.options).tokenize()['result']
    tagger_name = self.options.get('tagger')
    tags = []

    # Performs Bigram / Unigram / Regex Tagging
    if tagger_name in ('unigram', 'bigram', 'regex'):
        # Create your custom regex tagging pattern here
        # NOTE(review): the '.' in the CD rule is unescaped, so it matches any
        # character (e.g. "1,000" but also "1a2") -- confirm this is intended.
        regex_tag = RegexpTagger([(r'^[-\:]?[0-9]+(.[0-9]+)?$', 'CD'),
                                  (r'.*able$', 'JJ'),
                                  (r'^[A-Z].*$', 'NNP'),
                                  (r'.*ly$', 'RB'),
                                  (r'.*s$', 'NNS'),
                                  (r'.*ing$', 'VBG'),
                                  (r'.*ed$', 'VBD'),
                                  (r'.*', 'NN')])

        if tagger_name == 'regex':
            tags = regex_tag.tag(tokens)  # Regex tagging performed here
        else:
            trainer = self.options['train'] if self.options.get(
                'train') in TRAINERS else DEFAULT_TRAIN
            trained_dir = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), 'trained')

            def cached_tagger(kind, tagger_cls, backoff):
                """Load the pickled tagger for ``kind``, or train and cache it."""
                pkl_name = os.path.join(
                    trained_dir, kind + '_' + trainer + '.pkl')
                if os.path.isfile(pkl_name):
                    # NOTE: pickle executes code on load; these files must
                    # only ever be ones this method wrote itself.
                    with open(pkl_name, 'rb') as pkl:
                        return load(pkl)
                tagger = tagger_cls(
                    brown.tagged_sents(categories=trainer), backoff=backoff)
                if not os.path.isdir(trained_dir):
                    os.makedirs(trained_dir)
                with open(pkl_name, 'wb') as pkl:
                    dump(tagger, pkl, -1)
                return tagger

            # The bigram tagger backs off to the unigram tagger, so the
            # unigram tagger is always prepared first.
            tagger = cached_tagger('unigram', UnigramTagger, regex_tag)
            if tagger_name == 'bigram':
                tagger = cached_tagger('bigram', BigramTagger, tagger)
            tags = tagger.tag(tokens)

    # Performs default pos_tag
    elif self.options.get('tagger', DEFAULT_TAGGER) == 'pos':
        tags = pos_tag(tokens)

    return self._dump(tags)
def verifygrammar(label, codestring, varname): regexp_tagger = RegexpTagger([ (r"^[0-9]+$", "decimal"), (r"^0x[0-9A-Fa-f]+$", "hexadecimal"), ]) # VARIABLE LINE GENERATION - Assumption - Complex numbers data types are ignored for data mining algorithms if label.tag == 'var': varGrammar = CFG.fromstring(""" S -> VN "=" VV VN -> """ + varname + """ VV -> I | D | ST | B B -> True | False I -> I N | N D -> I"."F F -> F N | N ST -> "'"STI"'" STI -> S N | S C | N | C N -> 0|1|2|3|4|5|6|7|8|9 C -> a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z """) elif label.tag == 'array': arrayGrammar = CFG.fromstring(""" S -> AN "= [" AE "]" AN -> """ + varname + """ AE -> VV AE | VV VV -> I | D | ST | B B -> True | False I -> I N | N D -> I"."F F -> F N | N ST -> "'"STI"'" STI -> S N | S C | N | C N -> 0|1|2|3|4|5|6|7|8|9 C -> a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z """)
def _model_definition(self) -> RegexpTagger:
    """Build the rule-based tagging model.

    Returns:
        A ``RegexpTagger`` over ``Model.RULES`` that backs off to a
        ``DefaultTagger('NOUN')`` when no rule matches.
    """
    fallback = DefaultTagger('NOUN')
    model = RegexpTagger(Model.RULES, backoff=fallback)
    return model
def create_tagger(sents,patterns=PATTERNS,maxngram=4):
    """Train a backoff tagger on a corpus of tagged sentences.

    The chain, from most to least specific, is: n-gram of order
    ``maxngram`` -> trigram -> bigram -> unigram -> regex rules
    (``patterns``) -> default tag ``'NN'``.

    Args:
        sents: tagged sentences used to train every statistical tagger.
        patterns: ``(regex, tag)`` rules for the ``RegexpTagger``.
        maxngram: order of the outermost ``NgramTagger``.

    Returns:
        The outermost ``NgramTagger``.
    """
    # Start from the rule-based layers, then wrap each statistical tagger
    # around the previous one.
    tagger = RegexpTagger(patterns, backoff=DefaultTagger('NN'))
    for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        tagger = tagger_cls(sents, backoff=tagger)
    return NgramTagger(maxngram, sents, backoff=tagger)
def regex_tag():
    """Tag two sample sentences with a suffix-based regex tagger and score it.

    Prints the tagged form of each sample sentence, then the result of
    evaluating the tagger against ``brown_tagged_sents``.
    """
    # Rules are tried in order; the catch-all noun rule must stay last.
    suffix_rules = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        (r'.*', 'NN'),                     # nouns (default)
    ]
    tagger = RegexpTagger(suffix_rules)

    samples = (
        'I am applying for AIT because I can be with my parents here and I am already granted a scholarship',
        'I love AIT because AIT is interesting and professors here give a lot of challenging assignment',
    )
    for sentence in samples:
        print(tagger.tag(word_tokenize(sentence)))

    print(tagger.evaluate(brown_tagged_sents))
def generateTagger():
    """Build a unigram tagger trained on ``cess_esp`` with rule-based backoff.

    Words the unigram tagger cannot tag fall through to word-ending rules,
    and finally to the default tag ``'V'``.

    Returns:
        The ``UnigramTagger`` at the head of the backoff chain.
    """
    ending_rules = [
        (r'.*o$', 'NMS'),   # noun masculine singular
        (r'.*os$', 'NMP'),  # noun masculine plural
        (r'.*a$', 'NFS'),   # noun feminine singular
        (r'.*as$', 'NFP'),  # noun feminine plural
    ]
    rule_tagger = RegexpTagger(ending_rules, backoff=DefaultTagger('V'))
    # Train on every tagged sentence in cess_esp.
    return UnigramTagger(cess_esp.tagged_sents(), backoff=rule_tagger)
def __init__(self, train=None, default=None, name=None):
    """Build a bigram -> unigram -> regex -> ``default`` backoff chain.

    Args:
        train: training data passed to both the unigram and bigram taggers.
        default: tagger used as the backoff of the regex tagger.
        name: stored unchanged on ``self.name``.
    """
    self.name = name
    self.default = default

    # As found on page 199 of the nltk book
    suffix_rules = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
    ]

    # Each tagger backs off to the one built just before it.
    self.regex = RegexpTagger(suffix_rules, backoff=self.default)
    self.unigram = UnigramTagger(train=train, backoff=self.regex)
    self.bigram = BigramTagger(train=train, backoff=self.unigram)
def train_and_save_unigram_tagger():
    """Train a unigram tagger on the Brown corpus and pickle it.

    The tagger backs off to a ``RegexpTagger`` built from the rules below
    and is written to ``../taggers/unigram_tagger.pkl`` (relative to the
    current working directory) with the highest pickle protocol.
    """
    train_text = brown.tagged_sents()
    regexp_tagger = RegexpTagger(
        [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')                      # nouns (default)
         ])
    unigram_tagger = UnigramTagger(train_text, backoff=regexp_tagger)

    # The context manager closes the file even if pickling fails.
    with open('../taggers/unigram_tagger.pkl', 'wb') as output:
        dump(unigram_tagger, output, -1)
def prepare_toolset():
    """Assemble the tagging and lemmatizing tools used downstream.

    Returns:
        dict with these keys:

        * ``'tgr'``  -- ``RegexpTagger`` backing off to trigram, bigram and
          unigram taggers trained on Brown (``learned``, ``news`` and
          ``reviews`` categories, universal tagset), then a default tagger.
        * ``'sw'``   -- English stop words.
        * ``'lr'``   -- a ``WordNetLemmatizer``.
        * ``'wntg'`` -- map from universal tags to WordNet POS constants.
    """
    toolset = {}
    patterns = [
        # digits and dots only; '0' is included so that e.g. "2010" or "0.5"
        # is tagged NUM instead of falling through to the punctuation rule
        (r'^[\.0-9]+$', 'NUM'),
        # no letters at all
        (r'^[^a-zA-Z]+$', '.'),
        # two letter runs joined by a hyphen or apostrophe
        (r'^[^a-zA-Z]*[a-zA-Z]+[-\'][a-zA-Z]+[^a-zA-Z]*$', 'NOUN'),
        # letters separated by other non-letter, non-hyphen characters
        (r'^.*[a-zA-Z]+[^-a-zA-Z]+[a-zA-Z]+.*$', '.'),
    ]
    train_set = brown.tagged_sents(
        categories='learned', tagset='universal') + brown.tagged_sents(
            categories='news', tagset='universal') + brown.tagged_sents(
                categories='reviews', tagset='universal')

    # The training data uses the universal tagset, so the last-resort tag
    # must be a universal tag too ('NOUN'); 'NN' is not part of that tagset
    # and has no entry in the 'wntg' map below.
    utgr = UnigramTagger(train=train_set, backoff=DefaultTagger('NOUN'))
    btgr = BigramTagger(train=train_set, backoff=utgr)
    ttgr = TrigramTagger(train=train_set, backoff=btgr)
    toolset['tgr'] = RegexpTagger(regexps=patterns, backoff=ttgr)

    toolset['sw'] = stopwords.words('english')
    toolset['lr'] = WordNetLemmatizer()
    toolset['wntg'] = {
        'NOUN': wordnet.NOUN,
        'VERB': wordnet.VERB,
        'ADJ': wordnet.ADJ,
        'ADV': wordnet.ADV,
        'X': wordnet.NOUN
    }
    print('Tools Ready')
    return toolset
(r'.*ould$', 'MD'), # modals (r'.*\'s$', 'NN$'), # possessive nouns (r'.*s$', 'NNS'), # plural nouns (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'the', 'DT'), # Determiner (r'in','IN'), # preposition (r'.*', 'NN') # nouns (default) ] # Generate a *RegexpTagger*-object with the defined rules. # In[3]: regexp_tagger = RegexpTagger(patterns) # Apply the RegexpTagger for tagging a single sentence: # In[4]: regexp_tagger.tag("5 friends have been singing in the rain".split()) # Apply the *RegexpTagger* for tagging the first 3 sentences of the brown corpus. # In[5]:
@author: ibews """ import codecs from nltk import RegexpTagger f = codecs.open("./staedte.txt", "r", encoding="utf8") names = [line.strip() for line in f.readlines()] # https://de.wikipedia.org/wiki/Liste_der_St%C3%A4dte_in_Deutschland # grab url # parse html # find all links of tho form: # <dd><a href="/wiki/.*" title=".*">$1</a>.*</dd> # combing all $1 into a list. # look through tho complete list of cities looking for similarities # compile similaritiesinto patterns # apply patterns = [ (ur'.+(Hall|bach|b[eu]rg|dorf|feld|hausen|heim|leben|r[ao]de|st[ae]dt|tah?l)$', 'city'), (ur'^Bad .+', 'city'), (ur'^Burg', 'city'), (ur'^.*$', 'DEFAULT') ] ret = RegexpTagger(patterns) tagged = ret.tag(names) print len(filter(lambda (_, tag): tag == 'city', tagged))*100.0/len(names)
def __init__(self):
    """Create the regex token tagger and the chunk parser.

    ``patterns``, ``grammar`` and ``COPYRIGHT_TRACE`` are taken from the
    enclosing module.
    """
    # nltk is imported here, at construction time, not at module import
    from nltk import RegexpParser, RegexpTagger

    self.tagger = RegexpTagger(patterns)
    self.chunker = RegexpParser(grammar, trace=COPYRIGHT_TRACE)
class CopyrightDetector(object):
    """
    Detect copyright statements, authors, years and holders in lines of text.

    Tokens are tagged with a regex-based tagger, chunked with a regex
    grammar, and the resulting parse tree is walked to collect the results.
    """

    def __init__(self):
        # nltk is imported here, at construction time, not at module import
        from nltk import RegexpTagger
        from nltk import RegexpParser
        self.tagger = RegexpTagger(patterns)
        self.chunker = RegexpParser(grammar, trace=COPYRIGHT_TRACE)

    @staticmethod
    def as_str(node):
        """
        Return a parse tree node as a space-normalized string.
        """
        node_string = ' '.join(k for k, _ in node.leaves())
        return u' '.join(node_string.split())

    def detect(self, numbered_lines):
        """
        Return a tuple of (copyrights, authors, years, holders, start_line,
        end_line) detected in an iterable of (line number, line) tuples.

        The first four items are lists. `start_line` and `end_line` are the
        smallest and largest line numbers seen; both are None when
        `numbered_lines` is empty or yields no tokens.
        """
        numbered_lines = list(numbered_lines)

        # we accumulate detected items in these synchronized lists
        # this could be a single list of namedtuples
        # or a list of dicts instead
        copyrights, authors, years, holders = [], [], [], []

        # nothing to scan: avoid calling min()/max() on an empty sequence
        if not numbered_lines:
            return copyrights, authors, years, holders, None, None

        numbers = [n for n, _l in numbered_lines]
        start_line = min(numbers)
        end_line = max(numbers)
        # logger.debug('CopyrightDetector:detect:lines numbers: %(start_line)d->%(end_line)d' % locals())
        tokens = self.get_tokens(numbered_lines)

        if not tokens:
            return copyrights, authors, years, holders, None, None

        # imported only once there is something to parse
        from nltk.tree import Tree

        # OPTIMIZED
        copyrights_append = copyrights.append
        authors_append = authors.append
        years_append = years.append
        holders_append = holders.append

        # first, POS tag each token using token regexes
        tagged_text = self.tagger.tag(tokens)
        # lazy %-formatting: the string is only built if debug is enabled
        logger.debug('CopyrightDetector:tagged_text: %s', tagged_text)

        # then build a parse tree based on tagged tokens
        tree = self.chunker.parse(tagged_text)
        logger.debug('CopyrightDetector:parse tree: %s', tree)

        CopyrightDetector_as_str = CopyrightDetector.as_str

        def collect_year_and_holder(detected_copyright):
            """
            Walk the parse sub-tree starting with the `detected_copyright`
            node, collecting all years and holders.
            """
            for copyr in detected_copyright:
                if isinstance(copyr, Tree):
                    # logger.debug('n: ' + str(copyr))
                    node_text = CopyrightDetector_as_str(copyr)
                    copyr_label = copyr.label()
                    if 'YR-RANGE' in copyr_label:
                        years_append(refine_date(node_text))
                    elif 'NAME' == copyr_label or 'COMPANY' in copyr_label:
                        # FIXME : this would wreck things like 23andme
                        # where a company name contains numbers
                        holders_append(refine_author(node_text))
                        # logger.debug('CopyrightDetector: node_text: ' + node_text)
                    else:
                        # any other sub-tree may still contain years or holders
                        collect_year_and_holder(copyr)

        # then walk the parse tree, collecting copyrights, years and authors
        for tree_node in tree:
            if isinstance(tree_node, Tree):
                node_text = CopyrightDetector_as_str(tree_node)
                tree_node_label = tree_node.label()
                if 'COPYRIGHT' in tree_node_label:
                    if node_text and node_text.strip():
                        refined = refine_copyright(node_text)
                        if not is_junk(refined):
                            copyrights_append(refined)
                            collect_year_and_holder(tree_node)
                elif tree_node_label == 'AUTHOR':
                    authors_append(refine_author(node_text))

        return copyrights, authors, years, holders, start_line, end_line

    def get_tokens(self, numbered_lines):
        """
        Return a list of tokens from an iterable of (line number, line)
        tuples. Line numbers are ignored.
        """
        tokens = []
        tokens_append = tokens.append

        # simple tokenization: spaces and some punctuation
        splitter = re.compile('[\\t =;]+').split

        for _line_number, line in numbered_lines:
            line = line.strip()
            if line:
                line = prepare_text_line(line)
            if line:
                line = strip_markup(line)
            if line and line.strip():
                for tok in splitter(line):
                    # strip trailing quotes and ignore empties
                    tok = tok.strip("' ")
                    if not tok:
                        continue
                    # strip trailing colons: why?
                    tok = tok.rstrip(':').strip()
                    # strip leading @: : why?
                    tok = tok.lstrip('@').strip()
                    if tok and tok not in (':', ):
                        tokens_append(tok)
        logger.debug('CopyrightDetector:tokens: %r', tokens)
        return tokens
(ur'.*schaft$', 'NN'), (ur'.*mus$', 'NN'), (ur'.*er$', 'NN'), (ur'.*chen$', 'NN'), (ur'.*lein$', 'NN'), (ur'.*lich$', 'ADJ'), (ur'.*ig$', 'ADJ'), (ur'.*isch$', 'ADJ'), (ur'.*haft$', 'ADJ'), (ur'.*bar$', 'ADJ'), (ur'.*los$', 'ADJ'), (ur'.*sam$', 'ADJ'), (ur'.*', None) ] ret = RegexpTagger(patterns) tokens = nltk.word_tokenize(text) tagged = ret.tag(tokens) diff = [(w, lt, rt) for ((w, lt),(_, rt)) in zip(tagged, words) if lt != rt ] print diff # [(u'der', 'NN', None), (u'Brahmane', None, 'NN'), (u'stumm', None, 'ADJ'), # (u'gekreuzten', None, 'ADJ'), (u'der', 'NN', None), (u'Vater.', None, 'NN'), # (u'der', 'NN', None), (u'der', 'NN', None), (u'er', 'NN', None), (u'einer', 'NN', None), # (u'Stunde', None, 'NN'), (u'Schlaf', None, 'NN'), (u'Augen', None, 'NN'), # (u'der', 'NN', None), (u'Brahmane', None, 'NN'), (u'Schritte', None, 'NN'), # (u'her', 'NN', None), (u'Haus', None, 'NN'), (u'Mond', None, 'NN'), (u'der', 'NN', None),
# Load the previously trained tagger, or train a new one and cache it.
if taggerTrained:
    # NOTE: unpickling runs code from the file; only load pickles that this
    # script wrote itself.
    with open('data/bestTagger.pkl', 'rb') as pkl_file:
        tagger = load(pkl_file)
else:
    train_set = make_train_set([brown, treebank])
    backoffTagger = backoff_tagger(
        train_set, [UnigramTagger, BigramTagger, TrigramTagger],
        backoff=DefaultTagger('NN'))
    brillTagger = brill_tagger_wrapper.train_brill_tagger(
        backoffTagger, train_set)
    # The pickle is opened only after training succeeded, so a failed run
    # no longer truncates an existing 'data/bestTagger.pkl'.
    with open('data/bestTagger.pkl', 'wb') as pkl_file:
        dump(brillTagger, pkl_file, -1)
    tagger = brillTagger

print(tagger.evaluate(test_set))
print(tagger.evaluate(brown.tagged_sents()))
print(tagger.evaluate(treebank.tagged_sents()))

# cpos = ClassifierBasedPOSTagger(train=train_set)
# print(cpos.evaluate(test_set))

##################
#  Regex tagger  #
##################
# 'T' marks digit groups joined by ':' or '-'; 'L' marks tokens containing
# "road". Tokens matching neither rule get no tag from this tagger.
patterns = [(r'[0-9]+:[0-9]+|[0-9]+-[0-9]+', 'T'), (r'.*road', 'L')]
regexTagger = RegexpTagger(patterns)

# try out the regex tagger
x = nltk.word_tokenize("its at 9:00 till 5-00 on bigroad")
print(regexTagger.tag(x))
(r'.*\'s$', 'NN$'), # possessive nouns (r'.*s$', 'NNS'), # plural nouns (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'.*', 'NN') # nouns (default) ] additions = [ (r'[.?;!:]', '.'), ('\\($', '('), (r'.*ly$', 'ADV'), ('n[o\']t$', '*'), (r'^,$', ',') ] ret = RegexpTagger(patterns) print ret.evaluate(brown.tagged_sents(categories='news')) for pattern in additions: patterns.insert(-1, pattern) print "added pattern {}".format(pattern) ret = RegexpTagger(patterns) print ret.evaluate(brown.tagged_sents(categories='news')) # 0.203263917895 # added pattern ('[.?;!:]', '.') # 0.247538635957 # added pattern ('\\($', '(') # 0.24901048193 # added pattern ('.*ly$', 'ADV') # 0.248314338564
from nltk import NgramTagger
from nltk import RegexpTagger

# In[5]:

# Prepare training
# Brown 'news' sentences with universal tags; the first 100 sentences are
# held out for testing and the rest are used for training.
brown_news_tagged = brown.tagged_sents(categories='news', tagset='universal')
train = brown_news_tagged[100:]
test = brown_news_tagged[:100]

# Last-resort tagger: rules are tried in order, ending with a catch-all NOUN.
regexp_tagger = RegexpTagger([
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'),   # cardinal numbers
    (r'(The|the|A|a|An|an)$', 'DET'),   # articles
    (r'.*able$', 'ADJ'),                # adjectives
    (r'.*ness$', 'NOUN'),               # nouns formed from adjectives
    (r'.*ly$', 'ADV'),                  # adverbs
    (r'.*s$', 'NOUN'),                  # plural nouns
    (r'.*ing$', 'VERB'),                # gerunds
    (r'.*ed$', 'VERB'),                 # past tense verbs
    (r'.*', 'NOUN')                     # nouns (default)
])
# Backoff chain, most specific first: trigram -> unigram -> affix -> regexp.
#Affix tagger
at2 = AffixTagger(train, backoff=regexp_tagger)
#Unigram tagger
ut3 = UnigramTagger(train, backoff=at2)
# The score is not stored or printed here.
ut3.evaluate(test)
# Ngram tagger
ct3 = NgramTagger(3, train, backoff=ut3)
google3.EnsureDir("tagged/")
# The body of this loop lies outside the visible part of the file.
for i in range(0, 12):
defaultTChat90 = nltk.DefaultTagger(freqChat90)
#print(defaultTChat90.evaluate(chatT90))

#b)
#using regex from nltk.org/book/chp05.html, 4.2
patterns = [
    (r'.*ing$', 'VBG'),                 #gerunds
    (r'.*ed$', 'VBD'),                  #simple past
    (r'.*es$', 'VBZ'),                  # 3rd singular present
    (r'.*ould$', 'MD'),                 #modal
    (r'.*\'s$', 'NN$'),                 # possessive nouns
    (r'.*s$', 'NNS'),                   #plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
    (r'.*', 'NN')                       #nouns (default)
]
regexp_tagger = RegexpTagger(patterns)

# Brown chain: trigram -> bigram -> unigram -> default.
uniB = UnigramTagger(brownT90, backoff=defaultTB90)
biB = BigramTagger(brownT90, backoff=uniB)
triB = TrigramTagger(brownT90, backoff=biB)

# Chat chain, built the same way. The trigram tagger now backs off to the
# bigram tagger (it previously skipped it and went straight to the unigram
# tagger, leaving biC unused).
uniC = UnigramTagger(chatT50, backoff=defaultTChat50)
biC = BigramTagger(chatT50, backoff=uniC)
triC = TrigramTagger(chatT50, backoff=biC)

print("Regextag50/50: ", regexp_tagger.evaluate(brownT50))
print("Default: ", defaultTB90.evaluate(brownT50))
# NOTE(review): this bigram tagger is trained and evaluated on the same
# data (brownT50) -- confirm that is intended.
print("Bigram Brown 50/50: ", BigramTagger(brownT50, backoff=defaultTB50).evaluate(brownT50))
print("Default: ", defaultTB50.evaluate(brownT50))
from pythainlp.tokenize import word_tokenize


def open_dict(name):
    """Return the stripped lines of the word list file ``<name>.dict``.

    The file is read as UTF-8 explicitly, so the result does not depend on
    the platform's default encoding.
    """
    with open(name + ".dict", "r", encoding="utf-8") as f:
        return [line.strip() for line in f]


# Tag -> list of regex alternatives. Entries are tried in insertion order,
# so the catch-all "NOTKNOW" entry must stay last.
dict_word = {
    "NUM": open_dict("NUM"),
    "PART": open_dict("part"),
    "DET": open_dict("det"),
    "PROPN": open_dict("PROPN"),
    "ADJ": open_dict("ADJ"),
    "NOUN": open_dict("NOUN"),
    "NOTKNOW": [".*"]
}

# One rule per tag: "(w1|w2|...)$". The dictionary entries are inserted
# into the pattern unescaped, i.e. they are treated as regex source.
regexp_tagger = RegexpTagger([('(' + '|'.join(dict_word[a]) + ')$', a)
                              for a in dict_word])

# Interactive loop: tag each entered text until the user types "exit".
while True:
    text = input("input : ")
    if text == "exit":
        break
    print(regexp_tagger.tag(word_tokenize(text)))
    print("\n")
"""
https://stackoverflow.com/questions/14802442/how-to-use-a-regex-backoff-tagger-in-python-nltk-to-override-nns
"""
#print('Regexp accuracy %4.1f%%' % (100.0 * regexp_tagger.evaluate(brown_test)))
class CopyrightDetector(object):
    """
    Detect copyright statements, authors, years and holders in lines of text.

    Tokens are tagged with a regex-based tagger, chunked with a regex
    grammar, and the resulting parse tree is walked to collect the results.
    """

    def __init__(self):
        # nltk is imported here, at construction time, not at module import
        from nltk import RegexpTagger
        from nltk import RegexpParser
        self.tagger = RegexpTagger(patterns)
        self.chunker = RegexpParser(grammar, trace=COPYRIGHT_TRACE)

    @staticmethod
    def as_str(node):
        """
        Return a parse tree node as a space-normalized string.
        """
        node_string = ' '.join(k for k, _ in node.leaves())
        return u' '.join(node_string.split())

    def detect(self, numbered_lines):
        """
        Return a tuple of (copyrights, authors, years, holders, start_line,
        end_line) detected in an iterable of (line number, line) tuples.

        The first four items are lists. `start_line` and `end_line` are the
        smallest and largest line numbers seen; both are None when
        `numbered_lines` is empty or yields no tokens.
        """
        numbered_lines = list(numbered_lines)

        # we accumulate detected items in these synchronized lists
        # this could be a single list of namedtuples
        # or a list of dicts instead
        copyrights, authors, years, holders = [], [], [], []

        # nothing to scan: avoid calling min()/max() on an empty sequence
        if not numbered_lines:
            return copyrights, authors, years, holders, None, None

        numbers = [n for n, _l in numbered_lines]
        start_line = min(numbers)
        end_line = max(numbers)
        # logger.debug('CopyrightDetector:detect:lines numbers: %(start_line)d->%(end_line)d' % locals())
        tokens = self.get_tokens(numbered_lines)

        if not tokens:
            return copyrights, authors, years, holders, None, None

        # imported only once there is something to parse
        from nltk.tree import Tree

        # OPTIMIZED
        copyrights_append = copyrights.append
        authors_append = authors.append
        years_append = years.append
        holders_append = holders.append

        # first, POS tag each token using token regexes
        tagged_text = self.tagger.tag(tokens)
        # lazy %-formatting: the string is only built if debug is enabled
        logger.debug('CopyrightDetector:tagged_text: %s', tagged_text)

        # then build a parse tree based on tagged tokens
        tree = self.chunker.parse(tagged_text)
        logger.debug('CopyrightDetector:parse tree: %s', tree)

        CopyrightDetector_as_str = CopyrightDetector.as_str

        def collect_year_and_holder(detected_copyright):
            """
            Walk the parse sub-tree starting with the `detected_copyright`
            node, collecting all years and holders.
            """
            for copyr in detected_copyright:
                if isinstance(copyr, Tree):
                    # logger.debug('n: ' + str(copyr))
                    node_text = CopyrightDetector_as_str(copyr)
                    copyr_label = copyr.label()
                    if 'YR-RANGE' in copyr_label:
                        years_append(refine_date(node_text))
                    elif 'NAME' == copyr_label or 'COMPANY' in copyr_label:
                        # FIXME : this would wreck things like 23andme
                        # where a company name contains numbers
                        holders_append(refine_author(node_text))
                        # logger.debug('CopyrightDetector: node_text: ' + node_text)
                    else:
                        # any other sub-tree may still contain years or holders
                        collect_year_and_holder(copyr)

        # then walk the parse tree, collecting copyrights, years and authors
        for tree_node in tree:
            if isinstance(tree_node, Tree):
                node_text = CopyrightDetector_as_str(tree_node)
                tree_node_label = tree_node.label()
                if 'COPYRIGHT' in tree_node_label:
                    if node_text and node_text.strip():
                        refined = refine_copyright(node_text)
                        if not is_junk(refined):
                            copyrights_append(refined)
                            collect_year_and_holder(tree_node)
                elif tree_node_label == 'AUTHOR':
                    authors_append(refine_author(node_text))

        return copyrights, authors, years, holders, start_line, end_line

    def get_tokens(self, numbered_lines):
        """
        Return a list of tokens from an iterable of (line number, line)
        tuples. Line numbers are ignored.
        """
        tokens = []
        tokens_append = tokens.append

        # simple tokenization: spaces and some punctuation
        splitter = re.compile('[\\t =;]+').split

        for _line_number, line in numbered_lines:
            line = line.strip()
            if line:
                line = prepare_text_line(line)
            if line:
                line = strip_markup(line)
            if line and line.strip():
                for tok in splitter(line):
                    # strip trailing quotes and ignore empties
                    tok = tok.strip("' ")
                    if not tok:
                        continue
                    # strip trailing colons: why?
                    tok = tok.rstrip(':').strip()
                    # strip leading @: : why?
                    tok = tok.lstrip('@').strip()
                    if tok and tok not in (':',):
                        tokens_append(tok)
        logger.debug('CopyrightDetector:tokens: %r', tokens)
        return tokens
def createModel(self):
    """Train a chain of backoff taggers and pickle the best-scoring one.

    The chain is regex -> unigram -> bigram -> trigram -> n-gram of order
    ``self.max_ngrams``.  ``self.corpusSents`` is split at
    ``self.training_portion``; the leading part trains the taggers and the
    rest is used to score them.  Unless ``self.training_portion`` is 1, the
    tagger with the highest score is written under ``self.taggers_path``.

    Any exception is printed and then drops into the debugger.

    NOTE(review): ``model_name`` is assigned but not returned in the visible
    code -- confirm whether callers expect the saved file name.
    """
    model_name = None
    try:
        unigrams = self.buildUnigrams()
        N = len(self.corpusSents)
        # round() returns a float on Python 2; slice indices must be ints.
        toTraining = int(round(self.training_portion * N))
        #logging.info("Sentencias totales:" + str(N))
        training = self.corpusSents[:toTraining]
        test = self.corpusSents[toTraining:]

        post_patterns = []
        for regex, post in self.regex_list:
            try:
                regex = regex.decode('utf-8')
            except (UnicodeError, AttributeError):
                # Already decoded, or not a byte string: keep it as it is.
                pass
            post_patterns.append((regex, post))

        for regex, post in self.config.items('postaggers.regex'):
            post_patterns.append((regex.decode('utf-8'), post))

        regexpTagger = RegexpTagger(post_patterns)
        unigramTagger = UnigramTagger(unigrams + training,
                                      backoff=regexpTagger)
        bigramTagger = BigramTagger(training, backoff=unigramTagger)
        trigramTagger = TrigramTagger(training, backoff=bigramTagger)
        NTagger = NgramTagger(self.max_ngrams, training,
                              backoff=trigramTagger)

        print("Sentencias de entrenamiento para n-taggers:" +
              str(len(training)))
        print("Sentencias de entrenamiento para unitaggers:" +
              str(len(unigrams)))
        print(
            "Cantidad de palabras ADICIONALES de DICCIONARIOS para el unitagger:"
            + str(len(unigrams)))
        print("Sentencias para testing:" + str(len(test)))
        print("Expresiones regulares para el Tagger:")

        for post_regex in post_patterns:
            print(post_regex)

        if self.training_portion != 1:
            score_ut = unigramTagger.evaluate(test)
            # NOTE(review): the bigram score is reduced by 0.002 before the
            # comparison -- presumably a deliberate handicap; confirm.
            score_bt = bigramTagger.evaluate(test) - 0.002
            score_tt = trigramTagger.evaluate(test)
            score_nt = NTagger.evaluate(test)

            scores = [score_ut, score_bt, score_tt, score_nt]
            tagger_names = ["uTagger", "biTagger", "triTagger", "NTagger"]
            taggers = [unigramTagger, bigramTagger, trigramTagger, NTagger]

            bestTagger_index = scores.index(max(scores))

            # If a file with the plain name exists, add the number of entries
            # in the taggers directory to the name to avoid overwriting it.
            base_name = self.taggers_path + tagger_names[bestTagger_index]
            if os.path.isfile(base_name + self.tagger_extension_file):
                fname = base_name + str(len(listdir(
                    self.taggers_path))) + self.tagger_extension_file
            else:
                fname = base_name + self.tagger_extension_file

            model = taggers[bestTagger_index]

            # The context manager closes the file even if pickling fails.
            with open(fname, 'wb') as f:
                pickle.dump(model, f)

            print("Guardando el tagger :" + fname)
            #logging.info("Guardando el mejor tagger :" + fname)

            model_name = fname

    except Exception as e:
        print("ERRPR EN POS TAGGER GENERATOR: " + str(e))
        # NOTE(review): this stops in the interactive debugger on any
        # failure -- confirm it should stay outside of development runs.
        pdb.set_trace()
# Baseline: tag every token 'NN'.
dt = DefaultTagger('NN')
dt.evaluate(test_data)

# Rules are tried in order; the catch-all noun rule must stay last.
# (A stray "..." interpreter prompt that had been left inside this list
# was removed.)
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
    (r'.*', 'NN')                      # nouns (default)
]
rt = RegexpTagger(patterns)
rt.evaluate(test_data)

# Stand-alone n-gram taggers without any backoff.
ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)
ut.evaluate(test_data)


def combined_tagger(train_data, taggers, backoff=None):
    """Chain tagger classes into a single backoff tagger.

    Each class in ``taggers`` is instantiated on ``train_data`` with the
    previously built tagger as its backoff, so the last class in the list
    ends up outermost.

    Args:
        train_data: training data passed to every tagger class.
        taggers: iterable of tagger classes, innermost first.
        backoff: tagger used as the backoff of the first class.

    Returns:
        The outermost tagger, or ``backoff`` if ``taggers`` is empty.
    """
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff