Example #1
 @classmethod
 def createInstance(cls, backoff):
     from nltk.corpus import brown
     brown_tagged_sents = brown.tagged_sents()
     return nltk.TrigramTagger(brown_tagged_sents, backoff=backoff)
for item in all_files:
    item = periods.sub(".", item)
    all_text.append(item)

# Make raw text and then tokenize
corpus_raw = "".join(all_text)
corpus_sents = nltk.sent_tokenize(corpus_raw, language="french")

for sentence in corpus_sents:
    corpus_list = sentence.split()
    corpus_tuples = [nltk.tag.str2tuple(item) for item in corpus_list]
    corpus_tagged_sents.append(corpus_tuples)

# Split into training and held-out data
size = int(len(corpus_tagged_sents) * 0.9)
train_sents = corpus_tagged_sents[:size]
test_sents = corpus_tagged_sents[size:]

# Train taggers
tagger_default = nltk.DefaultTagger("NN")
tagger_unigram = nltk.UnigramTagger(train_sents, backoff=tagger_default)
tagger_bigram = nltk.BigramTagger(train_sents, backoff=tagger_unigram)
tagger_trigram = nltk.TrigramTagger(train_sents, backoff=tagger_bigram)

# Evaluate with disfluency chunks and print some stats
stats_dir = "./stats/"
result = tagger_trigram.evaluate(test_sents)

with open(f"{stats_dir}test_dis_ext_result.txt", "w") as file:
    file.write(str(result))
# Unigram tagger
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_rate = unigram_tagger.evaluate(test_sents)
print('unigram_rate', unigram_rate)  # 0.8121200039868434

# Bigram tagger
bigram_tagger = nltk.BigramTagger(train_sents)
print(bigram_tagger.tag(brown_sents[2007]))
unseen_sent = brown_sents[4203]
print(bigram_tagger.tag(unseen_sent))
bigram_rate = bigram_tagger.evaluate(test_sents)
print('bigram_rate', bigram_rate)

# Combine a bigram tagger, a unigram tagger, and a default tagger:
# 1. Try tagging the token with the bigram tagger.
# 2. If the bigram tagger cannot find a tag, try the unigram tagger.
# 3. If the unigram tagger cannot find a tag either, use the default tagger.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t3 = nltk.TrigramTagger(train_sents, cutoff=2, backoff=t2)
combine_rate = t3.evaluate(test_sents)
print('combine_rate', combine_rate)

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
combine_rate_simple = t2.evaluate(test_sents)
print('combine_rate_simple', combine_rate_simple)
Example #4
import pprint

import nltk
from nltk import jsontags
from nltk.corpus import brown
from nltk.tag.sequential import UnigramTagger


tagged_sents = brown.tagged_sents(categories='news')

size = int(len(tagged_sents) * 0.9)
train_sents = tagged_sents[:size]
test_sents = tagged_sents[size:]


@jsontags.register_tag
class PreviousTagger(UnigramTagger):
    json_tag = 'nltk.tag.sequential.PreviousTagger'

    def context(self, tokens, index, history):
        # Condition on the previous *tag* (history), not the previous word.
        if index == 0:
            return None
        else:
            return history[index - 1]


t0 = nltk.DefaultTagger('NN')
t1 = PreviousTagger(train_sents, backoff=t0)
t2 = nltk.UnigramTagger(train_sents, backoff=t1)
t3 = nltk.BigramTagger(train_sents, backoff=t2)
t4 = nltk.TrigramTagger(train_sents, backoff=t3)

pprint.pprint(t4.tag(['I', 'like', 'to', 'blog', 'on', 'Kim\'s', 'blog']))
Example #5
 def __init__(self, train_sents):
     train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                   for sent in train_sents]
     self.tagger = nltk.TrigramTagger(train_data)
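This constructor follows the NLTK book's n-gram chunker pattern: it retrains a TrigramTagger on (POS tag, IOB chunk tag) pairs extracted from chunk trees. A minimal self-contained sketch of the surrounding class (the TrigramChunker name and the parse() body are assumptions, modelled on the book's unigram version):

import nltk

class TrigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        # Train on (POS tag, IOB chunk tag) pairs extracted from chunk trees.
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.TrigramTagger(train_data)

    def parse(self, sentence):
        # sentence is a list of (word, POS tag) pairs.
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag)
                     for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)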
Example #6
import codecs
import nltk

entrada = codecs.open("catalanTagged_0_5000-utf-8.txt", "r", encoding="utf-8")

tagged_words = []
tagged_sents = []
for linia in entrada:
    linia = linia.rstrip()
    if linia.startswith("<") or len(linia) == 0:
        if len(tagged_words) > 0:
            tagged_sents.append(tagged_words)
            tagged_words = []
    else:
        camps = linia.split(" ")
        forma = camps[0]
        lema = camps[1]
        etiqueta = camps[2]
        tupla = (forma, etiqueta)
        tagged_words.append(tupla)

unigram_tagger = nltk.UnigramTagger(tagged_sents)
bigram_tagger = nltk.BigramTagger(tagged_sents, backoff=unigram_tagger)
trigram_tagger = nltk.TrigramTagger(tagged_sents, backoff=bigram_tagger)
oracio = "l'àcid desoxiribonucleic (ADN o DNA) és un àcid nucleic que conté les instruccions genètiques utilitzades en el desenvolupament i funcionament de tots els éssers vius coneguts, així com en alguns virus, des d'un punt de vista químic, l'ADN es compon de dos llargs polímers d'unitats simples anomenades nucleòtids, amb un tronc compost de sucres i grups fosfats units per enllaços èster"
tokenitzador = nltk.tokenize.RegexpTokenizer(r"[ldsmLDSM]'|\w+|[^\w\s]+")
tokens = tokenitzador.tokenize(oracio)
analisi = trigram_tagger.tag(tokens)
print(analisi)
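The Catalan example above trains and tags with no held-out data; a minimal evaluation sketch reusing the same tagged_sents (the 90/10 split mirrors the other examples on this page):

size = int(len(tagged_sents) * 0.9)
train_sents, test_sents = tagged_sents[:size], tagged_sents[size:]
eval_tagger = nltk.TrigramTagger(
    train_sents,
    backoff=nltk.BigramTagger(train_sents,
                              backoff=nltk.UnigramTagger(train_sents)))
print(eval_tagger.evaluate(test_sents))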
Example #7
    sentencas_floresta = nltk.corpus.floresta.tagged_sents()
    data = [[(w.lower(), simplificarTag(t)) for (w, t) in sentenca]
            for sentenca in sentencas_floresta if sentenca]

    sentencas_mac_morpho = nltk.corpus.mac_morpho.tagged_sents()
    data += [[(w.lower(), simplificarTag(t)) for (w, t) in sentenca]
             for sentenca in sentencas_mac_morpho if sentenca]

    base = data
    teste = data

    print('Training tagger. This may take a while...')
    _tagger = nltk.NgramTagger(4,
                               base,
                               backoff=nltk.TrigramTagger(
                                   base,
                                   backoff=nltk.BigramTagger(
                                       base,
                                       backoff=nltk.UnigramTagger(
                                           base,
                                           backoff=nltk.DefaultTagger('n')))))

    print('Tagger trained successfully! Accuracy of %.1f!' %
          (_tagger.evaluate(teste) * 100))  # note: teste == base, so this is training-set accuracy

    try:
        print('Saving tagger...')

        output = open(CAMINHO_DUMP, 'wb')
        dump(_tagger, output, -1)
        output.close()

        print('Tagger saved to "dump_tagger.pkl"!')
    except IOError:  # the matching except clause was lost in this excerpt
        print('Failed to save tagger.')


def get_most_likely_tag(word, cfd):
    try:
        return cfd[word].max()
    except ValueError:
        return "UNK"


cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news", tagset="universal"))
print(get_most_likely_tag("lol", cfd))
for word in brown.sents(categories="science_fiction")[0]:
    print(word, get_most_likely_tag(word, cfd))

# Task 8.3

# a), b)

t0 = nltk.DefaultTagger('NOUN')
train_sents = brown.tagged_sents(tagset='universal', categories='news')
t1 = nltk.UnigramTagger(train_sents)
t2 = nltk.BigramTagger(train_sents)
t3 = nltk.TrigramTagger(train_sents)

sentences = [
    [("The", "DET"), ("only", "ADJ"), ("Conservative", "NOUN"), ("councillor", "NOUN"), ("representing", "VERB"),
     ("Cambridge", "NOUN"), ("resigned", "VERB"), ("from", "ADP"), ("the", "DET"), ("city", "NOUN"),
     ("council", "NOUN"), (".", ".")]]

print(t0.evaluate(sentences))
print(t1.evaluate(sentences))
print(t2.evaluate(sentences))
print(t3.evaluate(sentences))
# In[13]:

# train a UnigramTagger
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)

# In[14]:

# train a BigramTagger
Bigram_tagger = nltk.BigramTagger(train_sents)
Bigram_tagger.evaluate(test_sents)

# In[15]:

# train a TrigramTagger
Trigram_tagger = nltk.TrigramTagger(train_sents)
Trigram_tagger.evaluate(test_sents)

# In[16]:

# Ensemble the uni- and bigram taggers
t1 = nltk.UnigramTagger(train_sents)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)

# In[17]:

# Ensemble the uni-, bi-, and trigram taggers; performance drops slightly, so we stick with the uni- and bigram combination
t3 = nltk.TrigramTagger(train_sents, backoff=t2)
t3.evaluate(test_sents)
# find the distinct bigrams that contain the word 'on'
ngrams = [item for item in set(bigram_tuples) if "on" in item]

# Create a naive DefaultTagger
default_tagger = nltk.DefaultTagger('NN')
tagged_sentence = default_tagger.tag(tokens)

# show the Penn Treebank tagset entry for 'NN' (needs the 'tagsets' resource from nltk.download())
nltk.help.upenn_tagset('NN')

# Regular expression tagger
patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'),
            (r'.*ed$', 'VB')]  # note: the last rule is shadowed by the VBD rule above
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(tokens)

from nltk.corpus import brown
training = brown.tagged_sents(categories='news')

# Create Unigram, Bigram, Trigram taggers based on the training set.
unigram_tagger = nltk.UnigramTagger(training)
bigram_tagger = nltk.BigramTagger(training)
trigram_tagger = nltk.TrigramTagger(training)

# Combination of taggers
default_tagger = nltk.DefaultTagger('NN')
bigram_tagger = nltk.BigramTagger(training, backoff=default_tagger)
trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)

print("done!")
Example #11
 def add_trigram_tagger(self):
     tagger = nltk.TrigramTagger(self.tagged_sents_train,
                                 backoff=self.currentTagger)
     self.currentTagger = tagger
     return self
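The method above returns self so calls can be chained in a fluent-builder style; a hypothetical usage sketch (the TaggerBuilder name and the sibling add_*_tagger() methods are assumptions for illustration):

tagger = (TaggerBuilder(tagged_sents_train)  # hypothetical class name
          .add_unigram_tagger()              # assumed sibling methods
          .add_bigram_tagger()
          .add_trigram_tagger()
          .currentTagger)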
Example #12
import nltk

tagged_sents = nltk.corpus.mac_morpho.tagged_sents()
print(tagged_sents)

t0 = nltk.DefaultTagger('N')
t1 = nltk.UnigramTagger(tagged_sents, backoff=t0)
t2 = nltk.BigramTagger(tagged_sents, backoff=t1)
t3 = nltk.TrigramTagger(tagged_sents, backoff=t2)

tagged = t3.tag(
    nltk.word_tokenize('Ontem, o João Antunes comeu peixe ao almoço.'))
print(tagged)
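nltk.word_tokenize defaults to the English Punkt model; for this Portuguese sentence the bundled Portuguese model can be selected instead (a small variation on the call above):

tagged_pt = t3.tag(nltk.word_tokenize(
    'Ontem, o João Antunes comeu peixe ao almoço.', language='portuguese'))
print(tagged_pt)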
Example #13
tups_to_file('ruso/OUTPUT.txt', russian_tags)

############################################################
################### Training the Models ####################
############################################################

# Gathering the training data from Penn Treebank
penn_sents = nltk.corpus.treebank.tagged_sents()
penn_sents_train, penn_sents_test = train_test_split(penn_sents,
                                                     test_size=0.15)

# TrigramTagger
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(penn_sents_train, backoff=t0)
t2 = nltk.BigramTagger(penn_sents_train, backoff=t1)
t3 = nltk.TrigramTagger(penn_sents_train, backoff=t2)

print("Accuracy of the trigramm tagger in a Penn Treebank random test set: ",
      end="")
print(t3.evaluate(penn_sents_test))

macbeth_trigram_predictions = t3.tag(macbeth_words)
macbeth_trigram_tags = [x[1] for x in macbeth_trigram_predictions]

assert len(macbeth_tags_nltk) == len(macbeth_trigram_tags)
coinc = [
    macbeth_tags_nltk[i] == macbeth_trigram_tags[i]
    for i in range(len(macbeth_trigram_tags))
]
print("Percentage of tags that are coincident with NLTK default predictor: ",
      end="")
    def __init__(self):
        self._UNIGRAM_TRAIN_SETS=[
            [
                ("teaspoon", "QT"),
                ("tablespoon", "QT"),
                ("lbs", "QT"),
                ("g", "QT"), 
                ("grams", "QT"),
                ("pounds", "QT"),
                ("cups", "QT"),
                ("whole", "QT"),
                ("chopped", "QT"),
                ("medium", "QT"),
                ("size", "QT"),
                #ingredients
                ("flour", "ING"),
                ("water", "ING"),
                ("salt", "ING"),
                ("sugar", "ING"),
                ("pepper", "ING"),
                ("oil","ING"),
                ("beef", "ING"),
                ("butter", "ING"),
                ("mushrooms", "ING"), 
                ("onions", "ING"),
                ("wine", "ING"),
                ("stock","ING"),
                ("chives", "ING"),
                ("paneer", "ING"),
                ("capsicum", "ING"),
                ("ghee", "ING"), 
                ("tomatoes", "ING"), 
                ("coriander", "ING"),
                ("chillies", "ING"),
                ("garlic", "ING"),
                ("ginger", "ING"),
                ("fenugreek", "ING"),
                ("red", "ING"),
                ("green", "ING"),
                ("yellow", "ING"),
                ("Avocadoes", "ING"),
                ("Beans", "ING"),
                ("Cheese", "ING"),
                ("chipotles", "ING"),
                ("chocolate", "ING"),
                ("limes", "ING"),
                ("oregano", "ING"),
                ("pickles", "ING"),
                ("limes", "ING"),
                ("lemon", "ING"),
                ("tomatoes", "ING"),
                ("bell pepper", "ING"),
                ("capsicum", "ING"),
                ("eggplant", "ING"),
                ("lentils", "ING"),
                ("basil", "ING"),
                ("thyme", "ING"),
                ("Parsley", "ING"),
                ("Mint", "ING"),
                ("rosemary", "ING"),
                ("sage", "ING"),
                ("chives", "ING"),
                ("dill", "ING"),
                ("cilantro", "ING"),
                ("Tarragon", "ING"),
                ("saffron", "ING"),
                ("cardamom", "ING"),
                ("cinnamon", "ING"),
                ("cloves", "ING"),
                ("cumin", "ING"),
            ]
        ]
        self._BIGRAM_TRAIN_SETS = [
            [("coriander","ING"), ("seeds", "ING")],
            [("garlic","ING" ), ("paste", "ING")],
            [("green", "ING"), ("chillies", "ING")],
            [("chopped", "ING"), ("ginger", "ING")], 
            [("fenugreek", "ING"), ("leaves", "ING")],
            [("size", "ING"), ("tomatoes", "ING")],
            [("red","ING"), ("chillies", "ING")],
        ]
        self._TRIGRAM_TRAIN_SETS = [
            [("whole", "ING"), ("red","ING"), ("chillies", "ING")],
            [("chopped", "ING"), ("green", "ING"), ("chillies", "ING")],
            [("medium", "ING"), ("size", "ING"), ("tomatoes", "ING")],

        ]
        self._default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
        self._uni_tagger = nltk.UnigramTagger(self._UNIGRAM_TRAIN_SETS, backoff=self._default_tagger)
        self._bi_tagger = nltk.BigramTagger(self._BIGRAM_TRAIN_SETS, backoff=self._uni_tagger)
        self._tri_tagger = nltk.TrigramTagger(self._TRIGRAM_TRAIN_SETS, backoff=self._bi_tagger)
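The nltk.tag._POS_TAGGER constant used above was removed in NLTK 3.1, so the nltk.data.load call fails on modern releases. An n-gram tagger's backoff must be a SequentialBackoffTagger, so the simplest modern stand-in is a DefaultTagger (a sketch; the removed pickle was a classifier-based tagger with far better fallback accuracy):

default_tagger = nltk.DefaultTagger('NN')  # stand-in for the removed
                                           # maxent treebank POS tagger pickle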
Example #15
tagmap[""] = "X"
for line in contents.splitlines():
    line = line.strip()
    if line == "": continue
    fine, coarse = line.split("\t")
    tagmap[fine] = coarse


def simplify_tag(t):
    if "+" in t: t = t[t.index("+") + 1:]
    if "|" in t: t = t[t.index("|") + 1:]
    t = t.lower()
    return tagmap[t]
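
# simplify_tag above indexes tagmap directly, so a fine tag missing from the
# map raises KeyError (only the empty tag is pre-seeded with "X"). A defensive
# variant (simplify_tag_safe is a hypothetical name) could fall back to "X":
def simplify_tag_safe(t):
    if "+" in t: t = t[t.index("+") + 1:]
    if "|" in t: t = t[t.index("|") + 1:]
    return tagmap.get(t.lower(), "X")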


print "Training Tagger"
dataset1 = nltk.corpus.floresta.tagged_sents()
dataset2 = nltk.corpus.mac_morpho.tagged_sents()

train = [[(w, simplify_tag(t)) for (w, t) in sent]
         for sent in dataset1 + dataset2]
tagger_fast = nltk.TrigramTagger(train,
                                 backoff=nltk.BigramTagger(
                                     train,
                                     backoff=nltk.UnigramTagger(
                                         train,
                                         backoff=nltk.DefaultTagger('N'))))
print "Done"

with open("Models/tagger.pkl", "wb") as fid:
    pickle.dump(tagger_fast, fid, pickle.HIGHEST_PROTOCOL)
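Loading the pickled tagger back for reuse (a minimal sketch):

import pickle

with open("Models/tagger.pkl", "rb") as fid:
    tagger_fast = pickle.load(fid)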
Example #16
count = len(bigram_tuples)
print(count)

count = {item: bigram_tuples.count(item) for item in set(bigram_tuples)}
print(count)

default_tagger = nl.DefaultTagger('NN')
tagged_sentence = default_tagger.tag(tokens)
print(tagged_sentence)


patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'),
            (r'.*ed$', 'VB')]  # the last rule is shadowed by the VBD rule above
regexp_tagger = nl.RegexpTagger(patterns)
tagged_sentence = regexp_tagger.tag(tokens)
print(tagged_sentence)
"""

training = brown.tagged_sents(categories='news')
#print training
def_tagger = nl.DefaultTagger('NN')
uni_tagger = nl.UnigramTagger(training, backoff=def_tagger)
bi_tagger = nl.BigramTagger(training, backoff=uni_tagger)
tri_tagger = nl.TrigramTagger(training, backoff=bi_tagger)

print(uni_tagger.tag(tokens))
print(bi_tagger.tag(tokens))
print(tri_tagger.tag(tokens))



Example #17
# pylint: disable=C0111
# pylint: disable=C0103
import nltk
import sents
from pickle import dump

defaultTagger = nltk.DefaultTagger('N')
patterns = [(r'(da|do|de|das|dos)$', 'PREP'), (r'.*ndo$', 'V-GER')]
regexTagger = nltk.RegexpTagger(patterns, backoff=defaultTagger)
unigramTagger = nltk.UnigramTagger(sents.sentTreino, backoff=regexTagger)
bigramTagger = nltk.BigramTagger(sents.sentTreino, backoff=unigramTagger)
trigramTagger = nltk.TrigramTagger(sents.sentTreino, backoff=bigramTagger)

FinalTagger = trigramTagger

output = open('mac_morpho.pkl', 'wb')
dump(FinalTagger, output, -1)
output.close()
# resultado = unigramTagger.evaluate(sents.sentTeste)
# print(resultado*100.0)

# Accuracy was 81.521% with regex (prep), default (N) and unigram
# 01/10/2017 14:40

# Accuracy was 81.545% with regex (prep), default (N), unigram and bigram
# 03/10/2017 07:28

# Accuracy was 81.553% with regex (prep), default (N), unigram and bigram
# 01/10/2017 14:33
Example #18
regex_tag = nltk.RegexpTagger([
    #(r'[$][0-9]+\s[MmBbTt]\S+','DV'), #dollar value
    (r'^[-\:]?[0-9]+(\.[0-9]+)?$', 'CD'),  # numbers, incl. decimals
    (r'.*able$', 'JJ'),
    (r'^[A-Z].*$', 'NNP'),
    (r'.*ly$', 'RB'),
    (r'.*s$', 'NNS'),
    (r'.*ing$', 'VBG'),
    (r'.*ed$', 'VBD'),
    (r'.[\/\/]\S+', 'URL'),  #URL / useless
    (r'.*', 'UNK')  # unknown terms
])

unigram_tag = nltk.UnigramTagger(train, backoff=regex_tag)
bigram_tag = nltk.BigramTagger(train, backoff=unigram_tag)
trigram_tag = nltk.TrigramTagger(train, backoff=bigram_tag)

# POS tags follow the Brown Corpus tagset: https://en.wikipedia.org/wiki/Brown_Corpus
# custom-defined Context Free Grammar (CFG) merge table by vipul
cfg = dict()
cfg['NNP+NNP'] = 'NNP'
cfg['NN+NN'] = 'NNI'
cfg['NNP+NNI'] = 'NNI'
cfg['NNI+NN'] = 'NNI'
cfg['NNI+NNI'] = 'NNI'
cfg['NNI+NNP'] = 'NNI'
cfg['JJ+JJ'] = 'JJ'
cfg['JJ+NN'] = 'NNI'
cfg['CD+CD'] = 'CD'
cfg['NPI+NNP'] = 'NNP'  # this is specific for collecting terms with the word deal
cfg['NNI+RP'] = 'NNI'  # collects terms like "heats up" -- RP = adverb particle
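The cfg table above is typically applied by repeatedly merging adjacent tokens whose tag pair occurs as a key; a sketch of that merge loop (the merge_terms name is an assumption):

def merge_terms(tagged):
    # Repeatedly join neighbouring (word, tag) pairs whose combined tag
    # 'T1+T2' appears in the cfg table, keeping the mapped result tag.
    tokens = list(tagged)
    merged = True
    while merged:
        merged = False
        for i in range(len(tokens) - 1):
            key = tokens[i][1] + '+' + tokens[i + 1][1]
            if key in cfg:
                tokens[i:i + 2] = [(tokens[i][0] + ' ' + tokens[i + 1][0],
                                    cfg[key])]
                merged = True
                break
    return tokens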
Example #19
 def __init__(self):
     self.train_tagged_sents = brown.tagged_sents()
     self.default_tagger  = nltk.DefaultTagger('NN')
     self.unigram_tagger  = nltk.UnigramTagger(self.train_tagged_sents, backoff = self.default_tagger )
     self.bigram_tagger   = nltk.BigramTagger(self.train_tagged_sents, backoff = self.unigram_tagger )
     self.trigram_tagger  = nltk.TrigramTagger(self.train_tagged_sents, backoff = self.bigram_tagger) 
Example #20
r = default_tagger.evaluate(brown_tagged_sents)
print(r)

unigram_tagger = nltk.UnigramTagger(brown_tagged_sents, verbose=True)
print(unigram_tagger.tag(brown_sents[2007]))
r = unigram_tagger.evaluate(brown_tagged_sents)
print(r)

train_size = int(len(brown_tagged_sents) * 0.9)
print(train_size)
train_sents = brown_tagged_sents[:train_size]
test_sents = brown_tagged_sents[train_size:]

unigram_tagger = nltk.UnigramTagger(train_sents, verbose=True)
print(unigram_tagger.tag(brown_sents[2007]))
r = unigram_tagger.evaluate(test_sents)
print(r)

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0, verbose=True)
t2 = nltk.BigramTagger(train_sents, backoff=t1, verbose=True)
r = t2.evaluate(test_sents)
print(r)

t22 = nltk.BigramTagger(train_sents, cutoff=2, backoff=t1, verbose=True)
r = t22.evaluate(test_sents)
print(r)

t3 = nltk.TrigramTagger(train_sents, backoff=t2, verbose=True)
r = t3.evaluate(test_sents)
print(r)
#print('tags2 WORD',fdTag3.most_common())

tagTexto(textoBrownTagSentNew)
''' 
1. Extend the taggers example to TrigramTagger and analyse the model's accuracy.
'''

treino = mac_morpho.tagged_sents()[1000:]
teste = mac_morpho.tagged_sents()[:1000]
etiq0 = nltk.DefaultTagger('N')
etiq1 = nltk.UnigramTagger(treino, backoff=etiq0)
print('UnigramTagger', etiq1.evaluate(teste))
etiq2 = nltk.BigramTagger(treino, backoff=etiq1)
print('BigramTagger', etiq2.evaluate(teste))
etiq3 = nltk.TrigramTagger(treino, backoff=etiq2)
print('TrigramTagger', etiq3.evaluate(teste))

doc = open('textoPT.txt', encoding='utf8')
raw = doc.read()

#texto = nltk.word_tokenize('O  mundo atual possui diversos idiomas.')
texto = nltk.word_tokenize(raw)
#print('etiq2', etiq2.tag(texto))
#print('etiq3', etiq3.tag(texto))
''' 
2. Implement the 10-fold cross-validation technique and analyse the accuracy
of the models. Discuss the results.
'''
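A minimal sketch of that 10-fold cross-validation over mac_morpho (manual fold slicing, shown for the unigram tagger; the other models follow the same pattern):

sents = list(mac_morpho.tagged_sents())
fold = len(sents) // 10
scores = []
for i in range(10):
    teste_cv = sents[i * fold:(i + 1) * fold]
    treino_cv = sents[:i * fold] + sents[(i + 1) * fold:]
    etiq = nltk.UnigramTagger(treino_cv, backoff=nltk.DefaultTagger('N'))
    scores.append(etiq.evaluate(teste_cv))
print('mean accuracy', sum(scores) / len(scores))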
Example #22
        backoff_reg_tagger = nltk.RegexpTagger(patterns,
                                               backoff=default_tagger)

        fd = nltk.FreqDist(brown.words(categories='news'))
        cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
        most_freq_words = [w for (w, _) in fd.most_common(1500)]
        likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
        lookup_tagger = nltk.UnigramTagger(model=likely_tags,
                                           backoff=backoff_reg_tagger)
        print("%s:test:%lf" % (method, lookup_tagger.evaluate(test)))
    elif method == 'unigram':
        # unigram backoff tagger
        unigram_tagger = nltk.UnigramTagger(train, backoff=default_tagger)
        print "%s:test:%lf" % (method, unigram_tagger.evaluate(test))
    elif method == 'bigram':
        # bigram backoff tagger
        unigram_tagger = nltk.UnigramTagger(train, backoff=default_tagger)
        bigram_tagger = nltk.BigramTagger(train, backoff=unigram_tagger)
        print "%s:test:%lf" % (method, bigram_tagger.evaluate(test))
    elif method == 'trigram':
        # trigram backoff tagger
        unigram_tagger = nltk.UnigramTagger(train, backoff=default_tagger)
        bigram_tagger = nltk.BigramTagger(train, backoff=unigram_tagger)
        trigram_tagger = nltk.TrigramTagger(train,
                                            cutoff=10,
                                            backoff=bigram_tagger)
        print "%s:test:%lf" % (method, trigram_tagger.evaluate(test))
    else:
        print("unknown method", file=sys.stderr)
        sys.exit(2)
Example #23
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import floresta
import pickle

FILENAME = "txts/floresta_trigram.pos"

def simplify_tag(t):
    if '+' in t:
        return t[t.index('+')+1:]
    else:
        return t

tsents = floresta.tagged_sents()
tsents = [[(w.lower(),simplify_tag(t)) for (w,t) in sent] for sent in tsents if sent]
train = tsents[100:]
test = tsents[:100]

tagger0 = nltk.DefaultTagger('n')
tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
tagger2 = nltk.BigramTagger(train, backoff=tagger1)
tagger = nltk.TrigramTagger(train, backoff=tagger2)

print(tagger.evaluate(test))

with open(FILENAME, 'wb') as outFile:
    pickle.dump(tagger, outFile, -1)
Example #24
        if o in ('-m', '--method'): method = a

    train_tagged_sents = brown.tagged_sents(categories=trainsection)
    test_tagged_sents = brown.tagged_sents(categories=testsection)
    train_tagged_words = brown.tagged_words(categories=trainsection)
    test_tagged_words = brown.tagged_words(categories=testsection)
    train_words = brown.words(categories=trainsection)

    print_to_file("\n\nmethod = " + method + "\n")
    default_tag = default_tag(train_tagged_sents)
    default_tagger = nltk.DefaultTagger(default_tag)

    if method in ['unigram', 'bigram', 'trigram']:
        tu = nltk.UnigramTagger(train_tagged_sents, backoff=default_tagger)
        tb = nltk.BigramTagger(train_tagged_sents, backoff=tu)
        tt = nltk.TrigramTagger(train_tagged_sents, backoff=tb)

    fd = nltk.FreqDist(train_words)
    cfd = nltk.ConditionalFreqDist(train_tagged_words)
    d = {w: cfd[w].max() for (w, _) in fd.most_common(1000)}

    patterns = [
        (r'^the$', 'AT'),
        (r'^,$', ','),
        (r'^\.$', '.'),
        (r'^of$', 'IN'),
        (r'^and$', 'CC'),
        (r'^to$', 'TO'),
        (r'^a$', 'AT'),
        (r'^in$', 'IN'),
        (r'^that$', 'CS'),
    print("{0:.4f} HiddenMarkovModelTagger".format(result))

def createDataFrame():
    df = pd.DataFrame()
    df['word'] = [w for s in result for w in s]
    df['bi_tag'] = [w[1] for s in bi_tagged for w in s]
    df['tri_tag'] = [w[1] for s in tri_tagged for w in s]
    df['hmm_tag'] = [w[1] for s in hmm_tagged for w in s]
    return df


tagged_texts = loopFiles(sys.argv[1])  # read in the training set
test_texts = loopFiles(sys.argv[2])  # read in the test set data

train_sents = tagged_texts
default_tagger = nltk.DefaultTagger("S") #S(nimisona) on koige sagedasem
unigram_tagger_backoff = nltk.UnigramTagger(train_sents, backoff = default_tagger)
bigram_tagger_backoff = nltk.BigramTagger(train_sents, backoff = unigram_tagger_backoff)
trigram_tagger_backoff = nltk.TrigramTagger(train_sents, backoff = bigram_tagger_backoff)
hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_sents)

result = get_tagged_words(os.getcwd() + '/' + sys.argv[3], 2)

bi_tagged = bigram_tagger_backoff.tag_sents(result)
tri_tagged = trigram_tagger_backoff.tag_sents(result)
hmm_tagged = hmm_tagger.tag_sents(result)

# Build the DataFrame
df = createDataFrame()
# Write it to file
df.to_csv("ossip_villem-oskar_4.csv", header=False)
Example #26
import argparse
import os
import pickle as pkl

import nltk
import tensorflow as tf

if __name__ == "__main__":

    parser = argparse.ArgumentParser("Create a part of speech tagger")
    parser.add_argument("--tagger_file", type=str, default="tagger.pkl")
    args = parser.parse_args()

    tf.io.gfile.makedirs(os.path.dirname(args.tagger_file))
    brown_tagged_sentences = nltk.corpus.brown.tagged_sents(tagset='universal')

    training_split = int(len(brown_tagged_sentences) * 0.9)
    train_sentences = brown_tagged_sentences[:training_split]
    test_sentences = brown_tagged_sentences[training_split:]

    t0 = nltk.DefaultTagger('<unk>')
    t1 = nltk.UnigramTagger(train_sentences, backoff=t0)
    t2 = nltk.BigramTagger(train_sentences, backoff=t1)
    t3 = nltk.TrigramTagger(train_sentences, backoff=t2)

    scores = [[t0.evaluate(test_sentences), t0],
              [t1.evaluate(test_sentences), t1],
              [t2.evaluate(test_sentences), t2],
              [t3.evaluate(test_sentences), t3]]

    best_score, best_tagger = max(scores, key=lambda x: x[0])
    print("Finished building tagger with {0:.2f}% accuracy".format(best_score *
                                                                   100))
    with tf.io.gfile.GFile(args.tagger_file, 'wb') as f:
        pkl.dump(best_tagger, f)
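The saved tagger can be loaded back the same way; tf.io.gfile handles local and remote paths alike (a minimal sketch):

with tf.io.gfile.GFile(args.tagger_file, 'rb') as f:
    tagger = pkl.load(f)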
try:
    word_tokenizer
except NameError:
    word_tokenizer = make_word_tokenizer()

try:
    sent_tokenizer
except NameError:
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

try:
    tagger
except NameError:
    brown_a = nltk.corpus.brown.tagged_sents(categories='a')
    t0 = nltk.DefaultTagger('WTF')
    t1 = nltk.UnigramTagger(brown_a, backoff=t0)
    t2 = nltk.BigramTagger(brown_a, backoff=t1)
    tagger = nltk.TrigramTagger(brown_a, backoff=t2)


class SpeechAnalyzer(object):
    NOTATIONS = {
        r'--': "##PAUSE##",
        r'\(sic\)': "##SIC##",
        r'\[mispronunciation\]': '##MISPRONUNCIATION##',
        r'\.\.\.': ' ##PAUSE## '
    }
    PHRASES = [
        "wall street",
        "main street",
        "my friends",
        "middle class",
        "fannie mae",
 def initialize_taggers(self):
     self.t0 = nltk.DefaultTagger('unk')
     self.t1 = nltk.UnigramTagger(self.train, backoff=self.t0)
     self.t2 = nltk.BigramTagger(self.train, backoff=self.t1)
     self.t3 = nltk.TrigramTagger(self.train, backoff=self.t2)
    return output


if __name__ == "__main__":
    train_file = sys.argv[1]
    test_file = sys.argv[2]

    train_data, word_counts = read_train_data(train_file)
    test_data = read_test_data(test_file, word_counts)

    # Brill Tagger https://www.nltk.org/book/ch05.html 5.4
    templates = brill.fntbl37()

    t0 = nltk.DefaultTagger("NN")
    t1 = nltk.UnigramTagger(train_data, backoff=t0)
    t2 = nltk.BigramTagger(train_data, backoff=t1)
    t3 = nltk.TrigramTagger(train_data, backoff=t2)

    trainer = nltk.tag.BrillTaggerTrainer(t3, templates)
    model = t3
    #
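    # Note: the BrillTaggerTrainer above is constructed but never invoked in
    # this excerpt; actually learning transformation rules would look like
    # (a sketch):
    #   model = trainer.train(train_data, max_rules=100)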

    for sent in test_data:
        if sent:
            tagged_sent = model.tag(sent)

            output = []
            for word, tag in tagged_sent:
                output.append(word + "_" + tag)
            print(" ".join(output))
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('TEGN'))
print "LU tagged w/ backoff: ", baseline_tagger.evaluate(tagged_sents)
print " "


"""Unigram tagger"""
unigram_tagger = nltk.UnigramTagger(train)
print "Unigram: ", unigram_tagger.evaluate(test)
print " "


"""Bigram tagger"""
bigram_tagger = nltk.BigramTagger(train)
print "Bigram: ", bigram_tagger.evaluate(test)
print " "


"""Trigram tagger with backoffs"""
t0 = nltk.DefaultTagger('TEGN')
t1 = nltk.UnigramTagger(train, backoff=t0)
t2 = nltk.BigramTagger(train, backoff=t1)
t3 = nltk.TrigramTagger(train, backoff=t2)
print "Trigram with backoffs: ", t3.evaluate(test)
print " "


""" """