Ejemplo n.º 1
0
 def __init__(self,
              train=None,
              model=None,
              backoff=None,
              cutoff=0,
              verbose=False):
     """Initialise the tagger by delegating to ``NgramTagger.__init__``.

     Every argument is forwarded unchanged and in the order received; the
     only value fixed here is the literal ``1`` passed right after ``self``
     (presumably the n-gram order, i.e. a unigram tagger -- confirm against
     the ``NgramTagger`` signature in use).
     """
     NgramTagger.__init__(self, 1, train, model, backoff, cutoff, verbose)
Ejemplo n.º 2
0
 def __init__(self, train=None, model=None,
              backoff=None, cutoff=0, n=2, verbose=False):
     """Record smoothing-related state, then run the ``NgramTagger`` initialiser.

     The order ``n`` (default 2) and the backoff tagger are stored on the
     instance, the method name is set to ``'Lidstone'`` with a gamma of 1,
     and the bin count is left unset.  ``train``, ``model``, ``backoff``,
     ``cutoff`` and ``verbose`` are then forwarded, together with the stored
     order, to ``NgramTagger.__init__``.
     """
     # Instance attributes are assigned before the base initialiser runs.
     self._n = n
     self._backoff = backoff
     self._method = 'Lidstone'
     self._gamma = 1
     self._bins = None

     NgramTagger.__init__(
         self, self._n, train, model, backoff, cutoff, verbose)
Ejemplo n.º 3
0
    def _train(self, ngrams, backoff, save=True):
        """Build an ``NgramTagger`` from ``self.tagged_sentences`` and return it.

        ``ngrams`` is passed as the first argument of ``NgramTagger`` and
        ``backoff`` as its ``backoff`` keyword.  When ``save`` is true the new
        tagger is also handed to ``self._save`` under the file name produced
        by ``self.get_ngram_tagger_filename(self.id, self.use_mwe, ngrams)``.
        """
        log.debug(" - train tagger (ngram={!r})".format(ngrams))
        trained = NgramTagger(ngrams, self.tagged_sentences, backoff=backoff)

        # Nothing to persist: hand the tagger straight back.
        if not save:
            return trained

        target = self.get_ngram_tagger_filename(self.id, self.use_mwe, ngrams)
        message = " - save tagger (ngram={!r}) to filename {!r}".format(
            ngrams, target)
        log.debug(message)
        self._save(target, trained)
        return trained
Ejemplo n.º 4
0
def create_tagger(sents, patterns=PATTERNS, maxngram=4):
    """Train a backoff tagger chain on a corpus of sentences.

    The chain is built from the bottom up: a ``DefaultTagger('NN')``, a
    ``RegexpTagger`` over ``patterns``, then unigram, bigram and trigram
    taggers trained on ``sents``, and finally an ``NgramTagger`` constructed
    with ``maxngram``.  Each tagger receives the previous one as its
    ``backoff``.  The outermost ``NgramTagger`` is returned.
    """
    fallback = DefaultTagger('NN')
    chain = RegexpTagger(patterns, backoff=fallback)

    # Stack the fixed-order taggers, each one backing off to the one below.
    for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        chain = tagger_cls(sents, backoff=chain)

    return NgramTagger(maxngram, sents, backoff=chain)
Ejemplo n.º 5
0
    def createModel(self):
        """Train a chain of backoff POS taggers and pickle one of them.

        ``self.corpusSents`` is split into a training part (the first
        ``self.training_portion`` fraction, rounded) and a held-out test part.
        A ``RegexpTagger`` is built from ``self.regex_list`` plus the
        ``postaggers.regex`` section of ``self.config``; unigram, bigram,
        trigram and ``self.max_ngrams`` taggers are stacked on top of it, each
        using the previous one as ``backoff``.  The unigram tagger is also
        trained on the output of ``self.buildUnigrams()``.

        When held-out data exists, the four n-gram taggers are scored on it
        and the best-scoring one is saved.  When the whole corpus is used for
        training (``training_portion == 1``) or the test part is empty,
        nothing can be scored, so the highest-order tagger is saved instead.

        The chosen tagger is pickled to ``self.taggers_path`` + tagger name +
        ``self.tagger_extension_file``.  If that file already exists, the
        number of entries in ``self.taggers_path`` is inserted before the
        extension to pick a different file name.

        Any exception is reported on stdout and then drops into ``pdb``.
        Nothing is returned.
        """
        # NOTE(review): model_name is assigned below but never returned or
        # stored on self -- confirm whether callers were meant to receive it.
        model_name = None
        try:
            unigrams = self.buildUnigrams()

            N = len(self.corpusSents)
            # round() yields a float on Python 2; slice bounds must be ints.
            toTraining = int(round(self.training_portion * N))

            training = self.corpusSents[:toTraining]
            test = self.corpusSents[toTraining:]

            post_patterns = []

            for regex, post in self.regex_list:
                try:
                    regex = regex.decode('utf-8')
                except (AttributeError, UnicodeError):
                    # Best effort: keep the pattern as given when it cannot
                    # be decoded.
                    pass
                post_patterns.append((regex, post))

            for regex, post in self.config.items('postaggers.regex'):
                post_patterns.append((regex.decode('utf-8'), post))

            regexpTagger = RegexpTagger(post_patterns)
            unigramTagger = UnigramTagger(unigrams + training,
                                          backoff=regexpTagger)
            bigramTagger = BigramTagger(training, backoff=unigramTagger)
            trigramTagger = TrigramTagger(training, backoff=bigramTagger)
            NTagger = NgramTagger(self.max_ngrams,
                                  training,
                                  backoff=trigramTagger)

            print("Sentencias de entrenamiento para n-taggers:" +
                  str(len(training)))
            print("Sentencias de entrenamiento para unitaggers:" +
                  str(len(unigrams)))
            print(
                "Cantidad de palabras ADICIONALES de DICCIONARIOS para el unitagger:"
                + str(len(unigrams)))
            print("Sentencias para testing:" + str(len(test)))
            print("Expresiones regulares para el Tagger:")

            for post_regex in post_patterns:
                print(post_regex)

            tagger_names = ["uTagger", "biTagger", "triTagger", "NTagger"]
            taggers = [unigramTagger, bigramTagger, trigramTagger, NTagger]

            # Without held-out data there is nothing to rank the taggers by,
            # so default to the last (highest-order) one.  These names used to
            # be defined only inside the branch below, which made the save
            # step fail with a NameError when training_portion == 1.
            bestTagger_index = len(taggers) - 1

            if self.training_portion != 1 and len(test) > 0:
                scores = [
                    unigramTagger.evaluate(test),
                    # The bigram score is deliberately lowered by 0.002
                    # before the comparison.
                    bigramTagger.evaluate(test) - 0.002,
                    trigramTagger.evaluate(test),
                    NTagger.evaluate(test),
                ]
                bestTagger_index = scores.index(max(scores))

            base_name = self.taggers_path + tagger_names[bestTagger_index]
            fname = base_name + self.tagger_extension_file
            if os.path.isfile(fname):
                # Name already taken: add the directory entry count.
                fname = (base_name + str(len(listdir(self.taggers_path))) +
                         self.tagger_extension_file)

            # The with-block closes the file even if pickling fails.
            with open(fname, 'wb') as f:
                pickle.dump(taggers[bestTagger_index], f)

            print("Guardando el tagger :" + fname)

            model_name = fname

        except Exception as e:
            print("ERRPR EN POS TAGGER GENERATOR: " + str(e))
            # Debugging hook kept from the original: opens an interactive
            # debugger on any failure.
            pdb.set_trace()
import nltk
from nltk.corpus import treebank
from nltk import NgramTagger
# Train and test on disjoint parts of the corpus.  Slices such as [2000:] for
# testing and [:7000] for training share every sentence from index 2000
# onwards, so the tagger would be scored on sentences it was trained on.
split = 2000
tagged = treebank.tagged_sents()
training = tagged[:split]
testing = tagged[split:]
# 4 is passed as the first argument of NgramTagger (the n-gram order).
quadgramtag = NgramTagger(4, training)
print(quadgramtag.evaluate(testing))

Ejemplo n.º 7
0
 def __init__(self, train=None, model=None,
              backoff=None, cutoff=0, verbose=False):
     """Initialise the tagger by delegating to ``NgramTagger.__init__``.

     All arguments are forwarded unchanged; the literal ``1`` is passed
     right after ``self`` (presumably the n-gram order, i.e. a unigram
     tagger -- confirm against the ``NgramTagger`` signature in use).
     """
     NgramTagger.__init__(self, 1, train, model,
                          backoff, cutoff, verbose)
import nltk
from nltk.corpus import treebank
from nltk import NgramTagger
# Train and test on disjoint parts of the corpus.  Slices such as [2000:] for
# testing and [:7000] for training share every sentence from index 2000
# onwards, so the tagger would be scored on sentences it was trained on.
split = 2000
tagged = treebank.tagged_sents()
training = tagged[:split]
testing = tagged[split:]
# NOTE: the variable keeps its historical name although 8, not 4, is passed
# as the first argument of NgramTagger here.
quadgramtag = NgramTagger(8, training)
print(quadgramtag.evaluate(testing))
Ejemplo n.º 9
0
Long-distance dependencies affect the effectiveness of n-grams
"""

# Materialise the Brown "news" sentences as a list so they can be shuffled in
# place.  The sentences are kept whole (not flattened into one token list):
# NgramTagger is trained on sentences, each a list of (str, str) pairs.
tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)

# Ten folds of equal size; sentences left over after the tenth fold are never
# used as test data.
test_size = int(len(tagged_sents) / 10)
evaluation = 0

for I in range(10):
  fold_start = I * test_size
  fold_end = fold_start + test_size
  test_sents = tagged_sents[fold_start:fold_end]
  train_sents = tagged_sents[:fold_start] + tagged_sents[fold_end:]
  # Tagger that chooses the tag from the word itself and the tags of the
  # preceding words; no backoff is configured here.
  tagger = NgramTagger(2, train=train_sents)
  evaluation = evaluation + tagger.evaluate(test_sents)

print('evaluation with 2-gram model')
print(evaluation/10)

# Cross-validate backoff chains of increasing order.
#   tagger[0]       default tagger that always answers 'NN'
#   tagger[N]       N-gram tagger that backs off to tagger[N-1]
#   evaluation[N-1] sum over the ten folds of tagger[N]'s score
tagger = [DefaultTagger('NN')] + [None] * 6
evaluation = [0] * 6

# The fold loop is the outer one so that, within a fold, every tagger in the
# chain is trained on that fold's training sentences only.  Reusing a
# lower-order tagger trained in another fold would let the current test
# sentences leak into the chain through the backoff.
for I in range(10):
  test_sents = tagged_sents[I * test_size : (I+1) * test_size]
  train_sents = tagged_sents[: I * test_size] + tagged_sents[(I+1) * test_size :]
  for N in range(1, 7):
    # Order N (not a fixed 1); the backoff is consulted when this tagger
    # meets a context it has not seen.
    tagger[N] = NgramTagger(N, train=train_sents, backoff=tagger[N-1])
    evaluation[N-1] += tagger[N].evaluate(test_sents)
Ejemplo n.º 10
0
	def createModel(self):
		"""Train a chain of backoff POS taggers and pickle one of them.

		``self.corpusSents`` is split into a training part (the first
		``self.training_portion`` fraction, rounded) and a held-out test part.
		A ``RegexpTagger`` is built from ``self.regex_list`` plus the
		``postaggers.regex`` section of ``self.config``; unigram, bigram,
		trigram and ``self.max_ngrams`` taggers are stacked on top of it, each
		using the previous one as ``backoff``.  The unigram tagger is also
		trained on the output of ``self.buildUnigrams()``.

		When held-out data exists, the four n-gram taggers are scored on it
		and the best-scoring one is saved.  When the whole corpus is used for
		training (``training_portion == 1``) or the test part is empty,
		nothing can be scored, so the highest-order tagger is saved instead.

		The chosen tagger is pickled to ``self.taggers_path`` + tagger name +
		``self.tagger_extension_file``.  If that file already exists, the
		number of entries in ``self.taggers_path`` is inserted before the
		extension to pick a different file name.

		Any exception is reported on stdout and then drops into ``pdb``.
		Nothing is returned.
		"""
		# NOTE(review): model_name is assigned below but never returned or
		# stored on self -- confirm whether callers were meant to receive it.
		model_name=None
		try:
			unigrams=self.buildUnigrams()

			N=len(self.corpusSents)
			# round() yields a float on Python 2; slice bounds must be ints.
			toTraining=int(round(self.training_portion*N))

			training=self.corpusSents[:toTraining]
			test=self.corpusSents[toTraining:]

			post_patterns=[]

			for regex,post in self.regex_list:
				try:
					regex=regex.decode('utf-8')
				except (AttributeError, UnicodeError):
					# Best effort: keep the pattern as given when it cannot
					# be decoded.
					pass
				post_patterns.append((regex,post))

			for regex,post in self.config.items('postaggers.regex'):
				post_patterns.append((regex.decode('utf-8'),post))

			regexpTagger  = RegexpTagger(post_patterns)
			unigramTagger = UnigramTagger(unigrams+training,backoff=regexpTagger)
			bigramTagger= BigramTagger(training, backoff=unigramTagger)
			trigramTagger = TrigramTagger(training, backoff=bigramTagger)
			NTagger=NgramTagger(self.max_ngrams,training,backoff=trigramTagger)

			print("Sentencias de entrenamiento para n-taggers:" + str(len(training)))
			print("Sentencias de entrenamiento para unitaggers:" + str(len(unigrams)))
			print("Cantidad de palabras ADICIONALES de DICCIONARIOS para el unitagger:" + str(len(unigrams)))
			print("Sentencias para testing:" + str(len(test)))
			print("Expresiones regulares para el Tagger:")

			for post_regex in post_patterns:
				print(post_regex)

			tagger_names=["uTagger","biTagger","triTagger","NTagger"]
			taggers=[unigramTagger,bigramTagger,trigramTagger,NTagger]

			# Without held-out data there is nothing to rank the taggers by,
			# so default to the last (highest-order) one.  These names used to
			# be defined only inside the branch below, which made the save
			# step fail with a NameError when training_portion == 1.
			bestTagger_index=len(taggers)-1

			if self.training_portion!=1 and len(test)>0:
				scores=[
					unigramTagger.evaluate(test),
					# The bigram score is deliberately lowered by 0.002
					# before the comparison.
					bigramTagger.evaluate(test)-0.002,
					trigramTagger.evaluate(test),
					NTagger.evaluate(test),
				]
				bestTagger_index=scores.index(max(scores))

			base_name=self.taggers_path + tagger_names[bestTagger_index]
			fname=base_name+self.tagger_extension_file
			if os.path.isfile(fname):
				# Name already taken: add the directory entry count.
				fname=base_name+str(len(listdir(self.taggers_path)))+self.tagger_extension_file

			# The with-block closes the file even if pickling fails.
			with open(fname,'wb') as f:
				pickle.dump(taggers[bestTagger_index], f)

			print("Guardando el tagger :" + fname)

			model_name=fname

		except Exception as e:
			print("ERRPR EN POS TAGGER GENERATOR: " + str(e))
			# Debugging hook kept from the original: opens an interactive
			# debugger on any failure.
			pdb.set_trace()
Ejemplo n.º 11
0
    (r'(The|the|A|a|An|an)$', 'DET'),  # articles
    (r'.*able$', 'ADJ'),  # adjectives
    (r'.*ness$', 'NOUN'),  # nouns formed from adjectives
    (r'.*ly$', 'ADV'),  # adverbs
    (r'.*s$', 'NOUN'),  # plural nouns
    (r'.*ing$', 'VERB'),  # gerunds
    (r'.*ed$', 'VERB'),  # past tense verbs
    (r'.*', 'NOUN')  # nouns (default)
])
# Affix tagger built from `train`; uses regexp_tagger as its backoff.
at2 = AffixTagger(train, backoff=regexp_tagger)
# Unigram tagger built from `train`; uses the affix tagger as its backoff.
ut3 = UnigramTagger(train, backoff=at2)
# NOTE(review): the score is computed but neither stored nor printed here --
# presumably meant for interactive inspection; confirm.
ut3.evaluate(test)
# N-gram tagger constructed with 3 as its first argument (presumably the
# n-gram order -- confirm); uses the unigram tagger as its backoff.
ct3 = NgramTagger(3, train, backoff=ut3)

# Presumably makes sure the "tagged/" output directory exists before the
# files below are opened -- confirm what google3.EnsureDir does.
google3.EnsureDir("tagged/")
for i in range(0, 12):
    try:
        tokenFile = codecs.open(os.path.join("clean", "Tokens-%s.txt" % (i)),
                                "r",
                                encoding="utf-8")
        taggedFile = codecs.open(os.path.join("tagged", "Tagged-%s.txt" % (i)),
                                 "a",
                                 encoding="utf-8")
        tokenList = tokenFile.read().splitlines()
        # taggedTokens = nltk.pos_tag(tokenList, tagset='universal')
        taggedTokens = ct3.tag(tokenList)
        # print(taggedTokens)
        for pair in taggedTokens: