# A unigram tagger is just an NgramTagger of order 1.
def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
    NgramTagger.__init__(self, 1, train, model, backoff, cutoff, verbose)
# An order-n tagger initializer: records smoothing-related attributes
# (Lidstone, gamma=1) and delegates to NgramTagger with the chosen order.
def __init__(self, train=None, model=None, backoff=None, cutoff=0, n=2, verbose=False):
    self._method = 'Lidstone'
    self._gamma = 1
    self._n = n
    self._bins = None
    self._backoff = backoff
    NgramTagger.__init__(self, self._n, train, model, backoff, cutoff, verbose)
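A minimal usage sketch (not from the source): because the wrappers above delegate to NgramTagger with a fixed order, UnigramTagger(train) and NgramTagger(1, train) should tag identically.

from nltk.corpus import treebank
from nltk.tag import NgramTagger, UnigramTagger

train_sents = treebank.tagged_sents()[:500]
uni = UnigramTagger(train_sents)
ngram1 = NgramTagger(1, train_sents)

sample = treebank.sents()[600]  # a sentence outside the training slice
assert uni.tag(sample) == ngram1.tag(sample)
print(uni.tag(sample)[:5])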
def _train(self, ngrams, backoff, save=True):
    log.debug(" - train tagger (ngram={!r})".format(ngrams))
    tagger = NgramTagger(ngrams, self.tagged_sentences, backoff=backoff)
    if save:
        filename = self.get_ngram_tagger_filename(self.id, self.use_mwe, ngrams)
        log.debug(" - save tagger (ngram={!r}) to filename {!r}".format(ngrams, filename))
        self._save(filename, tagger)
    return tagger
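_train above depends on its host class; a self-contained sketch of the same pattern (names here are illustrative, not the source's) trains taggers of increasing order, each backing off to the previous one:

from nltk.corpus import treebank
from nltk.tag import NgramTagger

tagged_sentences = treebank.tagged_sents()[:1000]
tagger = None
for order in (1, 2, 3):
    # Each new tagger falls back to the lower-order one for unseen contexts.
    tagger = NgramTagger(order, tagged_sentences, backoff=tagger)
print(tagger.tag(['This', 'is', 'a', 'test']))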
from nltk.tag import (DefaultTagger, RegexpTagger, UnigramTagger,
                      BigramTagger, TrigramTagger, NgramTagger)

def create_tagger(sents, patterns=PATTERNS, maxngram=4):
    """Train a backoff tagger chain on some corpus of tagged sentences."""
    train = sents
    def_tagger = DefaultTagger('NN')
    re_tagger = RegexpTagger(patterns, backoff=def_tagger)
    uni_tagger = UnigramTagger(train, backoff=re_tagger)
    bi_tagger = BigramTagger(train, backoff=uni_tagger)
    tri_tagger = TrigramTagger(train, backoff=bi_tagger)
    ngram_tagger = NgramTagger(maxngram, train, backoff=tri_tagger)
    return ngram_tagger
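create_tagger expects a module-level PATTERNS to exist before it is defined; a hypothetical definition and call (both assumptions, not from the source) might look like this:

from nltk.corpus import treebank

PATTERNS = [               # illustrative pattern list, not the source's
    (r'.*ing$', 'VBG'),    # gerunds
    (r'.*ed$', 'VBD'),     # simple past
    (r'.*s$', 'NNS'),      # plural nouns
    (r'.*', 'NN'),         # default: noun
]

tagger = create_tagger(treebank.tagged_sents()[:1000], patterns=PATTERNS)
print(tagger.tag('The cat sat on the mat'.split()))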
def createModel(self):
    model_name = None
    try:
        unigrams = self.buildUnigrams()

        N = len(self.corpusSents)
        toTraining = round(self.training_portion * N)
        # logging.info("Total sentences: " + str(N))
        training = self.corpusSents[:toTraining]
        test = self.corpusSents[toTraining:]

        # Collect (regex, tag) patterns from the hard-coded list and the config file.
        post_patterns = []
        for regex, post in self.regex_list:
            if isinstance(regex, bytes):  # tolerate byte strings (Python 2 legacy)
                regex = regex.decode('utf-8')
            post_patterns.append((regex, post))
        for regex, post in self.config.items('postaggers.regex'):
            if isinstance(regex, bytes):
                regex = regex.decode('utf-8')
            post_patterns.append((regex, post))

        # Backoff chain: regexp -> unigram -> bigram -> trigram -> n-gram.
        regexpTagger = RegexpTagger(post_patterns)
        unigramTagger = UnigramTagger(unigrams + training, backoff=regexpTagger)
        bigramTagger = BigramTagger(training, backoff=unigramTagger)
        trigramTagger = TrigramTagger(training, backoff=bigramTagger)
        NTagger = NgramTagger(self.max_ngrams, training, backoff=trigramTagger)

        print("Training sentences for the n-gram taggers: " + str(len(training)))
        print("Training sentences for the unigram tagger: " + str(len(unigrams)))
        print("ADDITIONAL dictionary words for the unigram tagger: " + str(len(unigrams)))
        print("Sentences for testing: " + str(len(test)))
        print("Regular expressions for the tagger:")
        for post_regex in post_patterns:
            print(post_regex)

        if self.training_portion != 1:
            score_ut = unigramTagger.evaluate(test)
            score_bt = bigramTagger.evaluate(test) - 0.002  # small penalty to break ties
            score_tt = trigramTagger.evaluate(test)
            score_nt = NTagger.evaluate(test)

            # Keep the best-scoring tagger and pickle it to disk.
            scores = [score_ut, score_bt, score_tt, score_nt]
            tagger_names = ["uTagger", "biTagger", "triTagger", "NTagger"]
            taggers = [unigramTagger, bigramTagger, trigramTagger, NTagger]
            bestTagger_index = scores.index(max(scores))
            best_msg = max(scores), tagger_names[bestTagger_index]

            fname = self.taggers_path + tagger_names[bestTagger_index]
            if os.path.isfile(fname + self.tagger_extension_file):
                fname = fname + str(len(listdir(self.taggers_path))) + self.tagger_extension_file
            else:
                fname = self.taggers_path + tagger_names[bestTagger_index] + self.tagger_extension_file

            model = taggers[bestTagger_index]
            with open(fname, 'wb') as f:
                pickle.dump(model, f)
            print("Saving the tagger: " + fname)
            # logging.info("Saving the best tagger: " + fname)
            model_name = fname
    except Exception as e:
        print("ERROR IN POS TAGGER GENERATOR:", str(e))
        pdb.set_trace()
import nltk
from nltk.corpus import treebank
from nltk import NgramTagger

# NB: the NLTK treebank sample has fewer than 7,000 tagged sentences, so
# the [:7000] slice is the whole corpus and the test slice overlaps the
# training data, which inflates the score.
testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:7000]
quadgramtag = NgramTagger(4, training)  # 4-gram tagger
print(quadgramtag.evaluate(testing))
import nltk
from nltk.corpus import treebank
from nltk import NgramTagger

testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:7000]
quadgramtag = NgramTagger(8, training)  # an 8-gram tagger, despite the variable name
print(quadgramtag.evaluate(testing))
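The two snippets above differ only in the order n. A sweep on a disjoint split (a sketch, not from the source) shows why a plain high-order tagger is nearly useless without backoff: most test contexts never occur in training, so the tagger returns None for them.

from nltk.corpus import treebank
from nltk.tag import NgramTagger

training = treebank.tagged_sents()[:2000]
testing = treebank.tagged_sents()[2000:]
for n in (1, 2, 4, 8):
    tagger = NgramTagger(n, training)  # no backoff
    print(n, tagger.evaluate(testing))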
"""
Long-distance dependencies affect the effectiveness of n-grams.
"""
import random

from nltk import DefaultTagger, NgramTagger
from nltk.corpus import brown

tagged_sents = list(brown.tagged_sents(categories='news'))
# Flattening is not necessary, since NgramTagger expects sentences,
# i.e. lists of (str, str) pairs:
# tagged_sents = [e for sublist in tagged_sents for e in sublist]
random.shuffle(tagged_sents)
test_size = len(tagged_sents) // 10

# 10-fold cross-validation of a plain bigram tagger.
evaluation = 0
for i in range(10):
    test_sents = tagged_sents[i * test_size : (i + 1) * test_size]
    train_sents = tagged_sents[: i * test_size] + tagged_sents[(i + 1) * test_size :]
    # Tagger that chooses the tag based on the word string and the
    # preceding n-1 words' tags.
    tagger = NgramTagger(2, train=train_sents)
    evaluation += tagger.evaluate(test_sents)
print('evaluation with 2-gram model')
print(evaluation / 10)

# Backoff chain: each order-n tagger falls back to the order-(n-1) tagger
# when it encounters an unknown context. (The source passed a constant
# order of 1 here, which makes the chain pointless; using n matches the
# surrounding logic.)
tagger = [0, 0, 0, 0, 0, 0, 0]
evaluation = [0, 0, 0, 0, 0, 0]
tagger[0] = DefaultTagger('NN')
for n in range(1, 7):
    for i in range(10):
        test_sents = tagged_sents[i * test_size : (i + 1) * test_size]
        train_sents = tagged_sents[: i * test_size] + tagged_sents[(i + 1) * test_size :]
        tagger[n] = NgramTagger(n, train=train_sents, backoff=tagger[n - 1])
        evaluation[n - 1] += tagger[n].evaluate(test_sents)
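The loop above accumulates per-order scores but never reports them; a small follow-up (an addition, not in the source) prints the averages:

for n in range(1, 7):
    print('order', n, 'backoff-chain average accuracy:', evaluation[n - 1] / 10)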
import codecs
import os

from nltk.tag import AffixTagger, NgramTagger, RegexpTagger, UnigramTagger

regexp_tagger = RegexpTagger([
    (r'(The|the|A|a|An|an)$', 'DET'),  # articles
    (r'.*able$', 'ADJ'),               # adjectives
    (r'.*ness$', 'NOUN'),              # nouns formed from adjectives
    (r'.*ly$', 'ADV'),                 # adverbs
    (r'.*s$', 'NOUN'),                 # plural nouns
    (r'.*ing$', 'VERB'),               # gerunds
    (r'.*ed$', 'VERB'),                # past tense verbs
    (r'.*', 'NOUN'),                   # nouns (default)
])

# Affix tagger
at2 = AffixTagger(train, backoff=regexp_tagger)

# Unigram tagger
ut3 = UnigramTagger(train, backoff=at2)
ut3.evaluate(test)

# Ngram tagger
ct3 = NgramTagger(3, train, backoff=ut3)

google3.EnsureDir("tagged/")
for i in range(0, 12):
    try:
        tokenFile = codecs.open(os.path.join("clean", "Tokens-%s.txt" % (i)), "r", encoding="utf-8")
        taggedFile = codecs.open(os.path.join("tagged", "Tagged-%s.txt" % (i)), "a", encoding="utf-8")
        tokenList = tokenFile.read().splitlines()
        # taggedTokens = nltk.pos_tag(tokenList, tagset='universal')
        taggedTokens = ct3.tag(tokenList)
        # print(taggedTokens)
        for pair in taggedTokens:
            # Assumed completion -- the original snippet is truncated here.
            taggedFile.write("%s\t%s\n" % pair)
        taggedFile.close()
    except IOError:
        continue