Example No. 1
def test_defaultTagger(tokens):

    default_tagger = nltk.DefaultTagger('NN')
    print(default_tagger.tag(tokens))
Example No. 2
# Populate the tagged_sentences list with the corpus data, with the tags
# already filtered down to their part-of-speech classes:
tagged_sentences = []
print('Extracting part-of-speech classes from the corpus tags:')
for sent in tqdm(sentences):
    tagged_sent = []
    for (word, tag) in sent:
        tagged_sent.append((word.lower(), extract_pos(tag)))
    tagged_sentences.append(tagged_sent)

# Split the corpus into a training set and a test set:
train = tagged_sentences[100:]
test = tagged_sentences[:100]

# As a baseline, choose a tagger that always guesses the same class for every
# word: the most frequent part of speech in the corpus, 'n' (noun):
baseline = nltk.DefaultTagger('n')
print('Baseline accuracy:', baseline.evaluate(test))

# Train a unigram tagger using the baseline as backoff:
tagger1 = nltk.UnigramTagger(train, backoff=baseline)
print('Unigram Tagger accuracy:', tagger1.evaluate(test))

# Next, train a bigram tagger, using the unigram tagger as backoff:
tagger2 = nltk.BigramTagger(train, backoff=tagger1)
print('Bigram Tagger accuracy:', tagger2.evaluate(test))

# Example of POS-tagging a tokenized sentence with our best tagger:
print(tagger2.tag(['o', 'pássaro', 'segue', 'feliz', '.']))
Example No. 3
#%%
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
print(unigram_tagger.evaluate(test_sents))

#%% Note that the bigram tagger manages to tag every word in a sentence it saw
# during training, but does badly on an unseen sentence. As soon as it meets a
# new word (such as 13.5), it is unable to assign a tag. It also cannot tag
# the following word (such as million), even one seen during training, simply
# because it never saw it preceded by a None-tagged word in training. As a
# result, the tagger fails on the rest of the sentence too, and its overall
# accuracy score is very low:
bigram_tagger = nltk.BigramTagger(train_sents)
print(bigram_tagger.tag(brown_sents[2007]))
unseen_sent = brown_sents[4203]
print(bigram_tagger.tag(unseen_sent))
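# The note above mentions the overall accuracy score without computing it; a
# quick check on the held-out split defined earlier (the NLTK book reports
# roughly 0.10 for a bare bigram tagger on Brown news):
print(bigram_tagger.evaluate(test_sents))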
#%%
import _pickle as cPickle

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
output = open('t2.pkl', 'wb')
cPickle.dump(t2, output, -1)
output.close()
#%%
from _pickle import load

infile = open('t2.pkl', 'rb')
tagger = load(infile)
infile.close()
#%%
text = """The board's action shows what free enterprise
is up against in our complex maze of regulatory laws ."""
tokens = text.split()
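# The cell stops just short of the final step in the NLTK book: applying the
# unpickled tagger to the tokens.
print(tagger.tag(tokens))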
Example No. 4
                           for (w, t) in sent] for sent in dataset2]

shuffle(traindata)
shuffle(traindata2)

# Regexes for common Portuguese contractions of prepositions with articles and
# demonstratives (no/na, do/da, pelo/pela, neste/nesta, num, nesse/nessa,
# naquele/naquela, à), all tagged as adpositions:
regex_patterns = [
    (r"^[nN][ao]s?$", "ADP"),
    (r"^[dD][ao]s?$", "ADP"),
    (r"^[pP]el[ao]s?$", "ADP"),
    (r"^[nN]est[ae]s?$", "ADP"),
    (r"^[nN]um$", "ADP"),
    (r"^[nN]ess[ae]s?$", "ADP"),
    (r"^[nN]aquel[ae]s?$", "ADP"),
    (r"^\xe0$", "ADP"),
]

tagger = nltk.BigramTagger(traindata,
                           backoff=nltk.RegexpTagger(
                               regex_patterns,
                               backoff=nltk.UnigramTagger(
                                   traindata2,
                                   backoff=nltk.AffixTagger(
                                       traindata2,
                                       backoff=nltk.DefaultTagger('NOUN')))))
templates = nltk.brill.fntbl37()
tagger = nltk.BrillTaggerTrainer(tagger, templates)
tagger = tagger.train(traindata, max_rules=100)

with open("tagger.pkl", "wb") as f:
    pickle.dump(tagger, f)
Example No. 5
#     for words in sent:
#         print(words[0],end=' ')

print(brown_sents)

# 1. The default tagger
# Find the most likely tag
print('\nFinding the most likely tag:')
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
print('tags: ', tags)
tag_max = nltk.FreqDist(tags).max()
print('tag_max: ', tag_max)
# Create a tagger that tags every word as tag_max
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger(tag_max)
tokens_tagged = default_tagger.tag(tokens)
print('tokens_tagged: ', tokens_tagged)

# Evaluation
print('Evaluating on brown_tagged_sents: ', default_tagger.evaluate(brown_tagged_sents))

# 2. The regular-expression tagger
print('\nRegular-expression tagger:')

patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),  # simple past
    (r'.*es$', 'VBZ'),  # 3rd singular present
    (r'.*ould$', 'MD'),  # modals
    (r'.*\'s$', 'NN$'),  # possessive nouns
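# (The patterns list is cut off above. A minimal sketch of how the NLTK book
# completes and uses such a regexp tagger; the catch-all pattern and the
# lines below are assumptions, not part of the original snippet.)
#     (r'.*s$', 'NNS'),  # plural nouns
#     (r'.*', 'NN')      # catch-all: tag everything else as a noun
# ]
# regexp_tagger = nltk.RegexpTagger(patterns)
# print(regexp_tagger.tag(tokens))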
Example No. 6
try:
    word_tokenizer
except NameError:
    word_tokenizer = make_word_tokenizer()

try:
    sent_tokenizer
except NameError:
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

try:
    tagger
except NameError:
    brown_a = nltk.corpus.brown.tagged_sents(categories='a')
    t0 = nltk.DefaultTagger('WTF')
    t1 = nltk.UnigramTagger(brown_a, backoff=t0)
    t2 = nltk.BigramTagger(brown_a, backoff=t1)
    tagger = nltk.TrigramTagger(brown_a, backoff=t2)


class SpeechAnalyzer(object):
    NOTATIONS = {
        r'--': "##PAUSE##",
        r'\(sic\)': "##SIC##",
        r'\[mispronunciation\]': '##MISPRONUNCIATION##',
        r'\.\.\.': ' ##PAUSE## '
    }
    PHRASES = [
        "wall street",
        "main street",
Example No. 7
import pickle
from random import shuffle
from string import punctuation

import nltk


def train_es_tagger(path):
    nltk.download('cess_esp')

    def convert_to_universal_tag(t):
        tagdict = {
            'Fa': '.',
            'Faa': '.',
            'Fat': '.',
            'Fc': '.',
            'Fd': '.',
            'Fe': '.',
            'Fg': '.',
            'Fh': '.',
            'Fi': '.',
            'Fia': '.',
            'Fit': '.',
            'Fp': '.',
            'Fpa': '.',
            'Fpt': '.',
            'Fs': '.',
            'Fx': '.',
            'Fz': '.',
            'X': 'X',
            'Y': 'X',
            'Zm': 'NUM',
            'Zp': 'NUM',
            'ao': 'ADJ',
            'ao0fp0': 'ADJ',
            'ao0fs0': 'ADJ',
            'ao0mp0': 'ADJ',
            'ao0ms0': 'ADJ',
            'aq': 'ADJ',
            'aq00000': 'ADJ',
            'aq0cn0': 'ADJ',
            'aq0cp0': 'ADJ',
            'aq0cs0': 'ADJ',
            'aq0fp0': 'ADJ',
            'aq0fpp': 'ADJ',
            'aq0fs0': 'ADJ',
            'aq0fsp': 'ADJ',
            'aq0mp0': 'ADJ',
            'aq0mpp': 'ADJ',
            'aq0ms0': 'ADJ',
            'aq0msp': 'ADJ',
            'cc': 'CONJ',
            'cs': 'CONJ',
            'da': 'DET',
            'da0fp0': 'DET',
            'da0fs0': 'DET',
            'da0mp0': 'DET',
            'da0ms0': 'DET',
            'da0ns0': 'DET',
            'dd': 'DET',
            'dd0cp0': 'DET',
            'dd0cs0': 'DET',
            'dd0fp0': 'DET',
            'dd0fs0': 'DET',
            'dd0mp0': 'DET',
            'dd0ms0': 'DET',
            'de': 'DET',
            'de0cn0': 'DET',
            'di': 'DET',
            'di0cp0': 'DET',
            'di0cs0': 'DET',
            'di0fp0': 'DET',
            'di0fs0': 'DET',
            'di0mp0': 'DET',
            'di0ms0': 'DET',
            'dn': 'DET',
            'dn0cp0': 'DET',
            'dn0cs0': 'DET',
            'dn0fp0': 'DET',
            'dn0fs0': 'DET',
            'dn0mp0': 'DET',
            'dn0ms0': 'DET',
            'dp': 'DET',
            'dp1cps': 'DET',
            'dp1css': 'DET',
            'dp1fpp': 'DET',
            'dp1fsp': 'DET',
            'dp1mpp': 'DET',
            'dp1msp': 'DET',
            'dp1mss': 'DET',
            'dp2cps': 'DET',
            'dp2css': 'DET',
            'dp3cp0': 'DET',
            'dp3cs0': 'DET',
            'dp3fs0': 'DET',
            'dp3mp0': 'DET',
            'dp3ms0': 'DET',
            'dt': 'DET',
            'dt0cn0': 'DET',
            'dt0fs0': 'DET',
            'dt0ms0': 'DET',
            'i': 'X',
            'nc': 'NOUN',
            'nc00000': 'NOUN',
            'nccn000': 'NOUN',
            'nccp000': 'NOUN',
            'nccs000': 'NOUN',
            'ncfn000': 'NOUN',
            'ncfp000': 'NOUN',
            'ncfs000': 'NOUN',
            'ncmn000': 'NOUN',
            'ncmp000': 'NOUN',
            'ncms000': 'NOUN',
            'np': 'NOUN',
            'np00000': 'NOUN',
            'np0000a': 'NOUN',
            'np0000l': 'NOUN',
            'np0000o': 'NOUN',
            'np0000p': 'NOUN',
            'p0': 'PRON',
            'p0000000': 'PRON',
            'p010p000': 'PRON',
            'p010s000': 'PRON',
            'p020s000': 'PRON',
            'p0300000': 'PRON',
            'pd': 'PRON',
            'pd0cp000': 'PRON',
            'pd0cs000': 'PRON',
            'pd0fp000': 'PRON',
            'pd0fs000': 'PRON',
            'pd0mp000': 'PRON',
            'pd0ms000': 'PRON',
            'pd0ns000': 'PRON',
            'pe': 'PRON',
            'pe000000': 'PRON',
            'pi': 'PRON',
            'pi0cp000': 'PRON',
            'pi0cs000': 'PRON',
            'pi0fp000': 'PRON',
            'pi0fs000': 'PRON',
            'pi0mp000': 'PRON',
            'pi0ms000': 'PRON',
            'pn': 'PRON',
            'pn0cp000': 'PRON',
            'pn0fp000': 'PRON',
            'pn0fs000': 'PRON',
            'pn0mp000': 'PRON',
            'pn0ms000': 'PRON',
            'pp': 'PRON',
            'pp1cp000': 'PRON',
            'pp1cs000': 'PRON',
            'pp1csn00': 'PRON',
            'pp1cso00': 'PRON',
            'pp1mp000': 'PRON',
            'pp2cp000': 'PRON',
            'pp2cp00p': 'PRON',
            'pp2cs000': 'PRON',
            'pp2cs00p': 'PRON',
            'pp2csn00': 'PRON',
            'pp2cso00': 'PRON',
            'pp3cn000': 'PRON',
            'pp3cna00': 'PRON',
            'pp3cno00': 'PRON',
            'pp3cpa00': 'PRON',
            'pp3cpd00': 'PRON',
            'pp3csa00': 'PRON',
            'pp3csd00': 'PRON',
            'pp3fp000': 'PRON',
            'pp3fpa00': 'PRON',
            'pp3fs000': 'PRON',
            'pp3fsa00': 'PRON',
            'pp3mp000': 'PRON',
            'pp3mpa00': 'PRON',
            'pp3ms000': 'PRON',
            'pp3msa00': 'PRON',
            'pp3ns000': 'PRON',
            'pr': 'PRON',
            'pr000000': 'PRON',
            'pr0cn000': 'PRON',
            'pr0cp000': 'PRON',
            'pr0cs000': 'PRON',
            'pr0fp000': 'PRON',
            'pr0fs000': 'PRON',
            'pr0mp000': 'PRON',
            'pr0ms000': 'PRON',
            'pt': 'PRON',
            'pt000000': 'PRON',
            'pt0cp000': 'PRON',
            'pt0cs000': 'PRON',
            'pt0mp000': 'PRON',
            'pt0ms000': 'PRON',
            'px': 'PRON',
            'px1fp0p0': 'PRON',
            'px1fs0p0': 'PRON',
            'px1mp0p0': 'PRON',
            'px1ms0p0': 'PRON',
            'px2fs0s0': 'PRON',
            'px3fs000': 'PRON',
            'px3mp000': 'PRON',
            'px3ms000': 'PRON',
            'px3ns000': 'PRON',
            'rg': 'ADV',
            'rn': 'ADV',
            'sn': 'ADP',
            'sn-SUJ': 'ADP',
            'sn.co-SUJ': 'ADP',
            'sn.e': 'ADP',
            'sn.e-ATR': 'ADP',
            'sn.e-CD': 'ADP',
            'sn.e-SUJ': 'ADP',
            'sn.e.1n-SUJ': 'ADP',
            'sp': 'ADP',
            'spcms': 'ADP',
            'sps00': 'ADP',
            'va': 'VERB',
            'vag0000': 'VERB',
            'vaic1p0': 'VERB',
            'vaic3p0': 'VERB',
            'vaic3s0': 'VERB',
            'vaif1p0': 'VERB',
            'vaif2s0': 'VERB',
            'vaif3p0': 'VERB',
            'vaif3s0': 'VERB',
            'vaii1p0': 'VERB',
            'vaii1s0': 'VERB',
            'vaii2s0': 'VERB',
            'vaii3p0': 'VERB',
            'vaii3s0': 'VERB',
            'vaip1p0': 'VERB',
            'vaip1s0': 'VERB',
            'vaip2p0': 'VERB',
            'vaip2s0': 'VERB',
            'vaip3p0': 'VERB',
            'vaip3s0': 'VERB',
            'vais3s0': 'VERB',
            'vam02s0': 'VERB',
            'vam03s0': 'VERB',
            'van0000': 'VERB',
            'vap00sm': 'VERB',
            'vasi1p0': 'VERB',
            'vasi1s0': 'VERB',
            'vasi3p0': 'VERB',
            'vasi3s0': 'VERB',
            'vasp1s0': 'VERB',
            'vasp3p0': 'VERB',
            'vasp3s0': 'VERB',
            'vm': 'VERB',
            'vmg0000': 'VERB',
            'vmic1p0': 'VERB',
            'vmic1s0': 'VERB',
            'vmic2s0': 'VERB',
            'vmic3p0': 'VERB',
            'vmic3s0': 'VERB',
            'vmif1p0': 'VERB',
            'vmif1s0': 'VERB',
            'vmif2s0': 'VERB',
            'vmif3p0': 'VERB',
            'vmif3s0': 'VERB',
            'vmii1p0': 'VERB',
            'vmii1s0': 'VERB',
            'vmii2p0': 'VERB',
            'vmii2s0': 'VERB',
            'vmii3p0': 'VERB',
            'vmii3s0': 'VERB',
            'vmip1p0': 'VERB',
            'vmip1s0': 'VERB',
            'vmip2p0': 'VERB',
            'vmip2s0': 'VERB',
            'vmip3p0': 'VERB',
            'vmip3s0': 'VERB',
            'vmis1p0': 'VERB',
            'vmis1s0': 'VERB',
            'vmis2s0': 'VERB',
            'vmis3p0': 'VERB',
            'vmis3s0': 'VERB',
            'vmm01p0': 'VERB',
            'vmm02s0': 'VERB',
            'vmm03p0': 'VERB',
            'vmm03s0': 'VERB',
            'vmn0000': 'VERB',
            'vmp00pf': 'VERB',
            'vmp00pm': 'VERB',
            'vmp00sf': 'VERB',
            'vmp00sm': 'VERB',
            'vmsi1p0': 'VERB',
            'vmsi1s0': 'VERB',
            'vmsi3p0': 'VERB',
            'vmsi3s0': 'VERB',
            'vmsp1p0': 'VERB',
            'vmsp1s0': 'VERB',
            'vmsp2p0': 'VERB',
            'vmsp2s0': 'VERB',
            'vmsp3p0': 'VERB',
            'vmsp3s0': 'VERB',
            'vs': 'VERB',
            'vsg0000': 'VERB',
            'vsic1s0': 'VERB',
            'vsic2s0': 'VERB',
            'vsic3p0': 'VERB',
            'vsic3s0': 'VERB',
            'vsif1s0': 'VERB',
            'vsif3p0': 'VERB',
            'vsif3s0': 'VERB',
            'vsii1p0': 'VERB',
            'vsii1s0': 'VERB',
            'vsii3p0': 'VERB',
            'vsii3s0': 'VERB',
            'vsip1p0': 'VERB',
            'vsip1s0': 'VERB',
            'vsip2s0': 'VERB',
            'vsip3p0': 'VERB',
            'vsip3s0': 'VERB',
            'vsis1s0': 'VERB',
            'vsis3p0': 'VERB',
            'vsis3s0': 'VERB',
            'vsm03s0': 'VERB',
            'vsn0000': 'VERB',
            'vsp00sm': 'VERB',
            'vssf3s0': 'VERB',
            'vssi3p0': 'VERB',
            'vssi3s0': 'VERB',
            'vssp1s0': 'VERB',
            'vssp2s0': 'VERB',
            'vssp3p0': 'VERB',
            'vssp3s0': 'VERB',
            'w': 'NOUN',
            'z': 'NUM'
        }
        t = t.lower()
        return tagdict.get(t, "." if all(tt in punctuation for tt in t) else t)

    cess = [[(w, convert_to_universal_tag(t)) for (w, t) in sent]
            for sent in nltk.corpus.cess_esp.tagged_sents()]
    shuffle(cess)
    def_tagger = nltk.DefaultTagger('NOUN')
    affix_tagger = nltk.AffixTagger(cess, backoff=def_tagger)
    unitagger = nltk.UnigramTagger(cess, backoff=affix_tagger)
    tagger = nltk.BigramTagger(cess, backoff=unitagger)
    tagger = nltk.BrillTaggerTrainer(tagger, nltk.brill.fntbl37())
    tagger = tagger.train(cess, max_rules=100)

    with open(path, "wb") as f:
        pickle.dump(tagger, f)

    return tagger
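# A minimal usage sketch for the function above (the pickle path and the
# sample sentence are assumptions; train_es_tagger both saves and returns the
# trained Brill tagger):
# es_tagger = train_es_tagger('es_tagger.pkl')
# print(es_tagger.tag('El gato duerme en la cama'.split()))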
Example No. 8
########################################################
###############    The Default Tagger        ###############
########################################################
from nltk.corpus import brown
import nltk

tags = [tag for (word, tag) in brown.tagged_words(categories='news')]

print(set(tags))

# Finding the most frequently occurring tag
print(nltk.FreqDist(tags).max())

## Default tagger: takes a tag as input and marks every word with that tag

default_tagger = nltk.DefaultTagger('NN')
# Equivalently, use the most frequent tag computed above:
default_tagger = nltk.DefaultTagger(nltk.FreqDist(tags).max())

text = "Gokhale conveyed that he was in Hong-Kong and could reach only past midnight even if he booked himself on the first Beijing-bound flight. He was urged to reach the Chinese capital as fast as he could, in a first clear indication that the quiet and dogged attempt to defuse the Doklam imbroglio may have borne fruit."
tokens = nltk.word_tokenize(text)

print(default_tagger.tag(tokens))

## Performance of a tagger
print(
    default_tagger.evaluate(nltk.corpus.brown.tagged_sents(categories='news')))

########################################################
###############     Regexp Tagger       ###############
########################################################
import nltk
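# (The snippet ends right after the banner. A minimal regexp-tagger sketch in
# the spirit of the NLTK book; the patterns and the evaluation call below are
# assumptions, not part of the original.)
# patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*', 'NN')]
# regexp_tagger = nltk.RegexpTagger(patterns)
# print(regexp_tagger.evaluate(brown.tagged_sents(categories='news')))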
Example No. 9
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import floresta
import pickle as cPickle  # cPickle was merged into pickle in Python 3

FILENAME = "txts/floresta_trigram.pos"

def simplify_tag(t):
    if '+' in t:
        return t[t.index('+')+1:]
    else:
        return t

tsents = floresta.tagged_sents()
tsents = [[(w.lower(),simplify_tag(t)) for (w,t) in sent] for sent in tsents if sent]
train = tsents[100:]
test = tsents[:100]

tagger0 = nltk.DefaultTagger('n')
tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
tagger2 = nltk.BigramTagger(train, backoff=tagger1)
tagger = nltk.TrigramTagger(train, backoff=tagger2)

print(tagger.evaluate(test))

with open(FILENAME, 'wb') as outFile:
    cPickle.dump(tagger, outFile, -1)
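# A sketch of restoring the pickled trigram tagger later (mirrors the dump above):
# with open(FILENAME, 'rb') as inFile:
#     tagger = cPickle.load(inFile)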
Example No. 10
    data += [[(w.lower(), simplificarTag(t)) for (w, t) in sentenca]
             for sentenca in sentencas_mac_morpho if sentenca]

    # Note: the training and test sets are identical here, so the accuracy
    # reported below is measured on the training data itself.
    base = data
    teste = data

    print('Training tagger. This may take a while...')
    _tagger = nltk.NgramTagger(4,
                               base,
                               backoff=nltk.TrigramTagger(
                                   base,
                                   backoff=nltk.BigramTagger(
                                       base,
                                       backoff=nltk.UnigramTagger(
                                           base,
                                           backoff=nltk.DefaultTagger('n')))))

    print('Tagger trained successfully! Accuracy: %.1f!' %
          (_tagger.evaluate(teste) * 100))

    try:
        print('Saving tagger...')

        output = open(CAMINHO_DUMP, 'wb')
        dump(_tagger, output, -1)
        output.close()

        print('Tagger saved to "dump_tagger.pkl"!')

    except Exception:
        print('ERROR: Could not save the tagger.')
Example No. 11
#print('segundo',textoBrownTagSentNew)
fdTag2 = nltk.FreqDist(tag for m in textoBrownTagSentNew for (word, tag) in m)
print('tags2 TAG', fdTag2.most_common())

fdTag3 = nltk.FreqDist(word for m in textoBrownTagSentNew for (word, tag) in m)
#print('tags2 WORD',fdTag3.most_common())

tagTexto(textoBrownTagSentNew)
'''
1. Extend the tagger example to TrigramTagger and analyze the model's accuracy.
'''

treino = mac_morpho.tagged_sents()[1000:]
teste = mac_morpho.tagged_sents()[:1000]
etiq0 = nltk.DefaultTagger('N')
etiq1 = nltk.UnigramTagger(treino, backoff=etiq0)
print('UnigramTagger', etiq1.evaluate(teste))
etiq2 = nltk.BigramTagger(treino, backoff=etiq1)
print('BigramTagger', etiq2.evaluate(teste))
etiq3 = nltk.TrigramTagger(treino, backoff=etiq2)
print('TrigramTagger', etiq3.evaluate(teste))

doc = open('textoPT.txt', encoding='utf8')
raw = doc.read()

#texto = nltk.word_tokenize('O  mundo atual possui diversos idiomas.')
texto = nltk.word_tokenize(raw)
#print('etiq2', etiq2.tag(texto))
#print('etiq3', etiq3.tag(texto))
''' 
Example No. 12
def __init__(self):
    self.train_tagged_sents = brown.tagged_sents()
    self.default_tagger = nltk.DefaultTagger('NN')
    self.unigram_tagger = nltk.UnigramTagger(self.train_tagged_sents, backoff=self.default_tagger)
    self.bigram_tagger = nltk.BigramTagger(self.train_tagged_sents, backoff=self.unigram_tagger)
    self.trigram_tagger = nltk.TrigramTagger(self.train_tagged_sents, backoff=self.bigram_tagger)
Example No. 13
# pylint: disable=C0111
# pylint: disable=C0103
import nltk
import sents

patterns = [
    (r'(da|do|de|das|dos)$', 'PREP'),  # Prepositions
    (r'.*ndo$', 'V-GER')  # Gerunds
]
defaultTagger = nltk.DefaultTagger('N')
regexTagger = nltk.RegexpTagger(patterns, backoff=defaultTagger)
resultado = regexTagger.evaluate(sents.sentTeste)
print(resultado * 100.0)

# Accuracy was 23.130%
# 01/10/2017 14:34
Example No. 14
def learnDefaultTagger(simpleSentence):
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    tagger = nltk.DefaultTagger("NN")
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)
Example No. 15
import nltk
from nltk.corpus import cess_esp


def store_pickle_file(filename, obj):
    from pickle import dump
    output = open("./checkpoints/" + filename + ".pkl", "wb")
    dump(obj, output, -1)
    output.close()


# Creating the default tagger
default_tagger = nltk.DefaultTagger('S')

# Creating a REGEX tagger
patterns = [(r'.*o$', 'NMS'), (r'.*os$', 'NMP'), (r'.*a$', 'NFS'),
            (r'.*as$', 'NFP')]
regex_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)

# Creating and training a UnigramTagger on cess_esp sentences
tagged_sents = cess_esp.tagged_sents()
unigram_tagger = nltk.UnigramTagger(tagged_sents, backoff=regex_tagger)

# Saving the general tagger to disk
store_pickle_file("tagger", unigram_tagger)
print("done")
Example No. 16
    pylab.show()

disp()  #Calling function


#29.
bts = brown.tagged_sents(categories='news')
#Getting brown sentences

size = int(len(bts) * 0.9)
#training data 90%
train_sents = bts[:size]
#testing data
test_sents = bts[size:]
# Noun ('NN') as the default tag
t0 = nltk.DefaultTagger('NN')
# Unigram tagger with the default tagger as backoff
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
# Bigram tagger trained without backoff
bigram_tagger = nltk.BigramTagger(train_sents)
# Bigram tagger on a sentence seen during training
print(bigram_tagger.tag(bts[2007]))
# Bigram tagger on unseen data
unseen_sent = bts[4203]
print(bigram_tagger.tag(unseen_sent))
# A second bigram tagger on the same training data, for comparison
bitag1 = nltk.BigramTagger(train_sents)
# With cutoff=2 the bigram tagger discards contexts seen fewer than two times,
# so contexts that appear only once in the training data are not learned.
bitag2 = nltk.BigramTagger(train_sents, cutoff=2)
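# To make the effect of the cutoff visible, the two bigram taggers can be
# compared on the held-out data (a sketch; exact scores depend on the split):
print('no cutoff:', bitag1.evaluate(test_sents))
print('cutoff=2 :', bitag2.evaluate(test_sents))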
Example No. 17
for item in all_files:
    item = periods.sub(".", item)
    all_text.append(item)

# Make raw text and then tokenize
corpus_raw = "".join(all_text)
corpus_sents = nltk.sent_tokenize(corpus_raw, language="french")

for sentence in corpus_sents:
    corpus_list = sentence.split()
    corpus_tuples = [nltk.tag.str2tuple(item) for item in corpus_list]
    corpus_tagged_sents.append(corpus_tuples)

# Split into training and held out data
size = int(len(corpus_tagged_sents) * 0.9)
train_sents = corpus_tagged_sents[:size]
test_sents = corpus_tagged_sents[size:]

# Train taggers
tagger_default = nltk.DefaultTagger("NN")
tagger_unigram = nltk.UnigramTagger(train_sents, backoff=tagger_default)
tagger_bigram = nltk.BigramTagger(train_sents, backoff=tagger_unigram)
tagger_trigram = nltk.TrigramTagger(train_sents, backoff=tagger_bigram)

# Evaluate with disfluency chunks and print some stats
stats_dir = "./stats/"
result = tagger_trigram.evaluate(test_sents)

with open(f"{stats_dir}test_dis_ext_result.txt", "w") as file:
    file.write(str(result))
Example No. 18
def perf2(cfd, wl):
    # Map each word to its most frequent tag from the conditional frequency distribution
    zz = dict((word, cfd[word].max()) for word in wl)
    bt = nltk.UnigramTagger(model=zz, backoff=nltk.DefaultTagger('NN'))
    return bt.evaluate(brown.tagged_sents(categories='news'))
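# A sketch of building the inputs perf2 expects, following the NLTK book's
# lookup-tagger exercise (the 100-word list size is an assumption):
# fd = nltk.FreqDist(brown.words(categories='news'))
# cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# wl = [w for (w, _) in fd.most_common(100)]
# print(perf2(cfd, wl))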
Example No. 19
def default_backoff_tagger(train_sents):
    tags = [tag for sent in train_sents for (word, tag) in sent]
    return nltk.DefaultTagger(nltk.FreqDist(tags).max())
Example No. 20
# Adaptation for Spanish
from nltk.corpus import cess_esp
nltk.tag.mapping._load_universal_map("es-cast3lb")
mapdict = nltk.tag.mapping._MAPPINGS["es-cast3lb"]["universal"]
alltags = set(t for w, t in cess_esp.tagged_words())
for tag in alltags:
    if len(tag) <= 2:  # These are complete
        continue
    mapdict[tag] = mapdict[tag[:2]]

cess_esp._tagset = "es-cast3lb"
from nltk import UnigramTagger as ut
from nltk import BigramTagger as bt
cess_sents = cess_esp.tagged_sents(tagset='universal')
uni_tag = ut(cess_sents, backoff=nltk.DefaultTagger('X'))
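# A quick sanity check of the universal-tagset unigram tagger (a sketch; note
# that it evaluates on its own training sentences, which inflates the score):
# print(uni_tag.evaluate(cess_sents))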


class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
Example No. 21
def tagging_performance(cfd, wordlist):
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown_tagged_sents)
Example No. 22
    recipe_name = input('What meal are you looking for ? : ')
    get_access_recipes(path, tagged_dataset_path, user, recipe_name)

    #recipe_name = input('What meal are you looking for ? : ')
    #get_access_recipes(path, tagged_dataset_path, user, recipe_name)

    print("Shopping list : ")
    print(*user.shop_list, sep="\n")

    print("Fridge : ")
    print(*user.fridge, sep="\n")

######
# Evaluation of the model: can't work because of the uncleaned dataset
######
tagged_dataset = input(
    "Location of the tagged data set which is going to be created : ")
tagged = open(tagged_dataset, "rb")
tagged_ingredients = pickle.load(tagged)

train_set = tagged_ingredients[:len(tagged_ingredients)]
test_set = tagged_ingredients[len(tagged_ingredients):]

back = nltk.DefaultTagger('OTHER')
unigram_tagger = nltk.UnigramTagger(train_set, backoff=back)
bigram_tagger = nltk.BigramTagger(train_set, backoff=unigram_tagger)

#Problem on the dataset
#unigram_tagger.evaluate(test_set)
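# The slicing above puts every sentence into train_set and leaves test_set
# empty, which is why the evaluation cannot work as written. A sketch of a
# 90/10 split that would make it meaningful:
# cut = int(len(tagged_ingredients) * 0.9)
# train_set, test_set = tagged_ingredients[:cut], tagged_ingredients[cut:]
# print(bigram_tagger.evaluate(test_set))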
Example No. 23
pos2 = nltk.Index((value, key) for (key, value) in pos.items())
pos2['ADJ']

# Automatic Tagging
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

## The default tagger
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
nltk.FreqDist(tags).max()  # 'NN'

raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(tokens)

default_tagger.evaluate(brown_tagged_sents)

## the regular expression tagger
patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*es$', 'VBZ'),
            (r'.*ould$', 'MD'), (r'.*\'s$', 'NN$'), (r'.*s$', 'NNS'),
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), (r'.*', 'NN')]

regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(brown_sents[3])
regexp_tagger.evaluate(brown_tagged_sents)

## the lookup tagger
fd = nltk.FreqDist(brown.words(categories='news'))
Example No. 24
def tag_sentences_default_tagger(sentences, tag):
    default_tagger = nltk.DefaultTagger(tag)
    for s in sentences:
        tokens = nltk.word_tokenize(s)
        s_tagged = default_tagger.tag(tokens)
        print(s_tagged)
Example No. 25
def performance(cfd, wordlist):
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt,
                                         backoff=nltk.DefaultTagger('NN'))
    # Evaluate on new data
    return baseline_tagger.evaluate(brown.tagged_sents(categories='fiction'))
    print("{0:.4f} HiddenMarkovModelTagger".format(result))

def createDataFrame():
    df = pd.DataFrame()
    df['word'] = [w for s in result for w in s]
    df['bi_tag'] = [w[1] for s in bi_tagged for w in s]
    df['tri_tag'] = [w[1] for s in tri_tagged for w in s]
    df['hmm_tag'] = [w[1] for s in hmm_tagged for w in s]
    return df


tagged_texts = loopFiles(sys.argv[1])  # read in the training set
test_texts = loopFiles(sys.argv[2])  # read in the test set data

train_sents = tagged_texts
default_tagger = nltk.DefaultTagger("S") #S(nimisona) on koige sagedasem
unigram_tagger_backoff = nltk.UnigramTagger(train_sents, backoff = default_tagger)
bigram_tagger_backoff = nltk.BigramTagger(train_sents, backoff = unigram_tagger_backoff)
trigram_tagger_backoff = nltk.TrigramTagger(train_sents, backoff = bigram_tagger_backoff)
hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_sents)

result = get_tagged_words(os.getcwd() + '/' + sys.argv[3], 2)

bi_tagged = bigram_tagger_backoff.tag_sents(result)
tri_tagged = trigram_tagger_backoff.tag_sents(result)
hmm_tagged = hmm_tagger.tag_sents(result)

# Build the DataFrame
df = createDataFrame()
# Write it to a CSV file
df.to_csv("ossip_villem-oskar_4.csv", header=False)
>>>from nltk.tag.stanford import POSTagger
>>>import nltk
>>>stan_tagger=POSTagger('models/english-bidirectional-distdim.tagger','standford-postagger.jar')
>>>tokens =nltk.word_tokenize(s)
>>>stan_tagger.tag(tokens)

# POS tag frequency distribution
>>>from nltk.corpus import brown
>>>import nltk
>>>tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
>>>print nltk.FreqDist(tags)

# default tagger
>>>brown_tagged_sents = brown.tagged_sents(categories='news')
>>>default_tagger = nltk.DefaultTagger('NN')
>>>print default_tagger.evaluate(brown_tagged_sents)

# N-gram taggers

>>>from nltk.tag import UnigramTagger
>>>from nltk.tag import DefaultTagger
>>>from nltk.tag import BigramTagger
>>>from nltk.tag import TrigramTagger
# We divide the data into train and test sets to evaluate our taggers.
>>>train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
>>>test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]
>>>unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
>>>print(unigram_tagger.evaluate(test_data))
>>>bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
>>>print(bigram_tagger.evaluate(test_data))
Example No. 28
def combineTagger():
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    return t2.evaluate(test_sents)
Example No. 29
        output.append(line.split())

    return output


if __name__ == "__main__":
    train_file = sys.argv[1]
    test_file = sys.argv[2]

    train_data, word_counts = read_train_data(train_file)
    test_data = read_test_data(test_file, word_counts)

    # Brill Tagger https://www.nltk.org/book/ch05.html 5.4
    templates = brill.fntbl37()

    t0 = nltk.DefaultTagger("NN")
    t1 = nltk.UnigramTagger(train_data, backoff=t0)
    t2 = nltk.BigramTagger(train_data, backoff=t1)
    t3 = nltk.TrigramTagger(train_data, backoff=t2)

    trainer = nltk.tag.BrillTaggerTrainer(t3, templates)
    model = t3
    #
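    # The Brill trainer above is constructed but never run; actually training
    # it would look like this (a sketch; max_rules mirrors the other Brill
    # examples on this page):
    # model = trainer.train(train_data, max_rules=100)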

    for sent in test_data:
        if sent:
            tagged_sent = model.tag(sent)

            output = []
            for word, tag in tagged_sent:
                output.append(word + "_" + tag)
Example No. 30
import nltk
from nltk.corpus import cess_esp
from pickle import dump

patterns = [ (r".*o$","NMS"),
               (r".*os$","NMP"),
               (r".*a$","NFS"),
               (r".*as$","NFP"),
             ]
cesp_tsents = cess_esp.tagged_sents()
td = nltk.DefaultTagger("s")
tr = nltk.RegexpTagger(patterns, backoff = td )
tu = nltk.UnigramTagger(cesp_tsents, backoff = tr )
output = open("tagger.pkl","wb")
dump(tu,output,-1)
output.close()
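# A sketch of loading the pickled tagger back (mirrors the dump above):
# from pickle import load
# with open("tagger.pkl", "rb") as f:
#     tu = load(f)
# print(tu.tag("las casas blancas".split()))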