Code Example #1
def nlpyport_tokenizer(fileinput, TokPort_config_file):
    #define the tagset being used
    floresta.tagged_words(tagset="pt-bosque")
    contractions_path = ""
    clitics_path = ""
    text = " "
    tokens = []
    tokens_after_contractions = []
    tokens_after_clitics = []
    #get the directory of the resources
    contractions_path, clitics_path = load_token_configurations(
        TokPort_config_file)

    # Get tokens from the input file, one per line
    text = get_input_from_file(fileinput)

    # Do the actual tokenization
    tokens = nltk_tokenize(text)
    #for i in range(len(tokens)):
    #	if(tokens[i])=="#":
    #		tokens[i] = "\n"
    tokens_after_contractions = replace_contrations(contractions_path, tokens)

    #Check if tokens contain clitics
    #If so, change them to the most extended form
    tokens_after_clitics = replace_clitics(clitics_path,
                                           tokens_after_contractions)
    final_tokens = []
    for tok in tokens_after_clitics:
        final_tokens.append(tok)
    return final_tokens
Code Example #2
def nlpyport_tokenize_from_string(text, TokPort_config_file):
    #define the tagset being used
    #print(text)
    floresta.tagged_words(tagset="pt-bosque")
    contractions_path = ""
    clitics_path = ""
    tokens = []
    tokens_after_contractions = []
    tokens_after_clitics = []
    #get the directory of the resources
    contractions_path, clitics_path = load_token_configurations(
        TokPort_config_file)
    text_list = []
    if (isinstance(text, str)):
        text = text.replace("\n", " EOS")
        if ("EOS" not in text):
            text += " EOS"
        text_list = [text]

        #print(text)
    else:
        current_text = ""
        for elem in text:
            elem = elem.replace("\n", " EOS\n")
            has_eof = 0
            if ("EOS" not in elem):
                elem += " EOS\n"
            if elem != '\n':
                current_text += elem
        if (current_text != ""):
            text_list.append(current_text)

    text = text_list
    #Do the actual tokenization
    #print(str(text_list))
    tokens = nltk_tokenize(text)

    #print(tokens)
    novos_tokens = []
    for i in range(len(tokens)):
        if (tokens[i] != "\n"):
            novos_tokens.append(tokens[i])
    tokens = novos_tokens
    tokens_after_contractions = replace_contrations(contractions_path, tokens)

    #Check if tokens contain clitics
    #If so, change them to the most extended form
    tokens_after_clitics = replace_clitics(clitics_path,
                                           tokens_after_contractions)
    final_tokens = []
    if not (isinstance(text, str)):
        for tok in tokens_after_clitics[:-1]:
            final_tokens.append(tok)
    else:
        for tok in tokens_after_clitics:
            final_tokens.append(tok)
    return final_tokens
Code Example #3
def nlpyport_tokenizer(text,
                       TokPort_config_file,
                       file=False,
                       sentenceTokenizer=True):
    #define the tagset being used
    floresta.tagged_words(tagset="pt-bosque")
    contractions_path = ""
    clitics_path = ""
    tokens = []
    tokens_after_contractions = []
    tokens_after_clitics = []
    psentences = []

    #get the directory of the resources
    contractions_path, clitics_path = load_token_configurations(
        TokPort_config_file)
    contractions = Contractions(contractions_path)
    cilitics = Cilitics(clitics_path)

    if (file != False):
        text = get_input_from_file(text)
    # Do the actual tokenization
    tokensList = []
    if (sentenceTokenizer):
        sentences = nlpyport_sent_tokenizer(text)
        for i, sentence in enumerate(sentences):
            tokens = []
            tokens = nltk.word_tokenize(sentence)
            for t, token in enumerate(tokens):
                cToken = Token(i, t, token)
                tokensList.append(cToken)
            psentences.append({"tokens": tokens, "ftokens": []})

    else:
        tokens = nltk_tokenize(text)
        for t, token in enumerate(tokens):
            cToken = Token(0, t, token)
            tokensList.append(cToken)
        psentences.append({"tokens": tokens, "ftokens": []})

    tokensList = contractions.replace_contrations(tokensList)
    #print("line pos original token lema entity contractions clitics tag")
    #for token in tokensList:
    #	print ("%s | %s | %s | %s | %s |  %s | %s | %s | %s" % (token.line, token.pos, token.token, token.ptoken, token.lemma, token.entity, token.contractions, token.clitics, token.tag))

    #Check if tokens contain clitics
    #If so, change them to the most extended form

    tokens_after_clitics = cilitics.replace_clitics(
        [tokens for tokens in tokensList if tokens.ptoken != ''])

    return tokens_after_clitics
Code Example #4
    def __init__(self):
        """
		construct of the class, this import all tagger of floresta 
		corpota
		"""
        self._taggers = floresta.tagged_words()
        self._nt = normalizeText()
        self._sp = prebotSupport()
        self._stem = stemming()
Code Example #5
def default_stopwords():
    twords = floresta.tagged_words()
    stopwords = nltk.corpus.stopwords.words('portuguese')
    stopwords += ['srs', 'sr', 'sras', 'sra', 'deputado', 'presidente',
                  'é', 'nº', 's.a.', 'v.exa.', 'v.exa', '#', 'anos', 'º',
                  'exa', 'mesa', 'legislatura', 'sessão', 'maioria',
                  'seguinte', 'mandato', 'bilhões', 'quilômetros', 'maçã',
                  'ª', 'parabéns', 'membros', 'convido', 'usual', 'biênio',
                  'brasil', 'palavra', 'discussão', 'período', 'início',
                  'pronunciamento', 'suplente', 'atividade', 'ação', 'ações',
                  'daqueles', 'diferenças', 'pasta', 'milhares', 'srªs',
                  'emenda', 'àqueles', 'tamanha', 'mês', 'capaz', 'km',
                  'modelo', 'tarefas', 'colegas', 'programa', 'voz',
                  'meios de comunicação', 'pronunciamento', 'casa', 'sessão',
                  'deliberativa', 'solene', 'ordinária', 'extraordinária',
                  'encaminhado', 'orador', 'tv', 'divulgar', 'deputado',
                  'parlamento', 'parlamentar', 'projeto',
                  'proposta', 'requerimento', 'destaque', 'veto', 'federal',
                  'câmara', 'senado', 'congresso', 'nacional', 'país',
                  'estado', 'brasil', 'lei', 'política', 'povo', 'voto',
                  'partido', 'liderança', 'bancada', 'bloco', 'líder',
                  'lider', 'frente', 'governo', 'oposição', 'presença',
                  'presente', 'passado', 'ausência', 'ausencia', 'ausente',
                  'obstrução', 'registrar', 'aprovar', 'rejeitar', 'rejeição',
                  'sabe', 'matéria', 'materia', 'questão', 'ordem', 'emenda',
                  'sistema', 'processo', 'legislativo', 'plenário', 'pedir',
                  'peço', 'comissão', 'especial', 'permanente', 'apresentar',
                  'encaminhar', 'encaminho', 'orientar', 'liberar', 'apoiar',
                  'situação', 'fato', 'revisão', 'tempo', 'pauta', 'discutir',
                  'discussão', 'debater', 'retirar', 'atender', 'colegas',
                  'autor', 'texto', 'medida', 'união', 'república',
                  'audiência', 'audiencia', 'público', 'publico', 'reunião',
                  'agradecer', 'solicitar', 'assistir', 'contrário',
                  'favorável', 'pessoa', 'comemorar', 'ato', 'momento',
                  'diretora', 'possível', 'atenção', 'agradeço', 'naquele',
                  'necessárias', 'presidenta', 'compromisso']

    valid_tags = ['adj', 'n']
    for (word, tag) in twords:
        tag = simplify_tag(tag)
        words = word.casefold().split('_')
        if tag not in valid_tags:
            stopwords += words

    return list(set(stopwords))
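
A small usage sketch for default_stopwords above. The simplify_tag helper it relies on is not shown in this snippet, so the usual Floresta simplification (keep only the part of the tag after '+', as in the simplificar function of Code Example #8) is assumed here, and the sample tokens are made up.

import nltk
from nltk.corpus import floresta

# Assumed helper (not shown above): the usual Floresta tag simplification.
def simplify_tag(t):
    return t[t.index('+') + 1:] if '+' in t else t

nltk.download('floresta')    # corpus read by default_stopwords()
nltk.download('stopwords')   # Portuguese stopword list

stop = set(default_stopwords())
tokens = ['o', 'deputado', 'apresentou', 'uma', 'proposta', 'sobre', 'energia']
# words tagged as anything other than noun/adjective in Floresta are filtered out
print([t for t in tokens if t not in stop])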
Code Example #6
import nltk
import time
import xmltodict
from LemPyPort.LemFunctions import *
from LemPyPort.dictionary import *
from TokPyPort.Tokenizer import *
from TagPyPort.Tagger import *
from CRF.CRF_Teste import *
import sys
import os


global_porperties_file = "config/global.properties"

lexical_conversions="PRP:PREP;PRON:PRO;IN:INTERJ;ART:DET;"
floresta.tagged_words(tagset = "pt-bosque")
TokPort_config_file = ""
TagPort_config_file = ""
LemPort_config_file = ""

def load_config(config_file="config/global.properties"):
	global TokPort_config_file
	global TagPort_config_file
	global LemPort_config_file
	with open (config_file,'r') as f:
		for line in f:
			if(line[0]!="#"):
				if(line.split("=")[0]=="TokPort_config_file"):
					TokPort_config_file = line.split("=")[1].strip("\n")
				elif(line.split("=")[0]=="TagPort_config_file"):
					TagPort_config_file = line.split("=")[1].strip("\n")
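
A quick sketch of how load_config above could be exercised. The key names follow the parsing logic in the function, but the concrete paths are hypothetical and may not match the actual NLPyPort configuration files.

# Hypothetical properties file, inferred from the parsing in load_config():
# lines starting with '#' are skipped and each entry is split on '='.
sample = (
    "# NLPyPort global properties (hypothetical paths)\n"
    "TokPort_config_file=config/TokPort.properties\n"
    "TagPort_config_file=config/TagPort.properties\n"
)
with open("global.properties", "w") as f:
    f.write(sample)

load_config("global.properties")
print(TokPort_config_file)  # -> config/TokPort.properties
print(TagPort_config_file)  # -> config/TagPort.properties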
Code Example #7
File: 25.py  Project: Anastasia1302/nltk
# ◑ Obtain some tagged data for another language, and train and evaluate a
# variety of taggers on it. If the language is morphologically complex, or if
# there are any orthographic clues (e.g. capitalization) to word classes,
# consider developing a regular expression tagger for it (ordered after the
# unigram tagger, and before the default tagger). How does the accuracy of
# your tagger(s) compare with the same taggers run on English data? Discuss
# any issues you encounter in applying these methods to the language.

import nltk
from nltk.corpus import floresta

text = floresta.words()
floresta_tagged_sents = floresta.tagged_sents()
floresta_tagged_words = floresta.tagged_words()
fd = nltk.FreqDist(text)
cfd = nltk.ConditionalFreqDist(floresta_tagged_words)
most_freq_words = fd.most_common(100)

# lookup tagger for likely tags
likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)


# trained unigram tagger
size = int(len(floresta_tagged_sents) * 0.9)
training_data = floresta_tagged_sents[:size]
test_data = floresta_tagged_sents[size:]

uni_tagger = nltk.UnigramTagger(training_data)
uni_tagger.evaluate(test_data)
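
The exercise comment above also asks for a regular-expression tagger ordered after the unigram tagger and before the default tagger, which the snippet stops short of. A minimal sketch of that backoff chain, reusing the split prepared above; the regular expressions and the tag labels ('adv', 'n', 'v-inf', 'num') are illustrative guesses and would need to be aligned with the unsimplified Floresta tagset for a fair evaluation.

# Fallback: tag everything with the most frequent tag in the corpus.
tags = [tag for (_word, tag) in floresta_tagged_words]
default_tagger = nltk.DefaultTagger(nltk.FreqDist(tags).max())

# Orthographic clues for Portuguese, ordered before the default tagger via backoff.
regexp_tagger = nltk.RegexpTagger(
    [(r'.*mente$', 'adv'),                  # adverbs in -mente
     (r'.*ção$', 'n'),                      # nouns in -ção
     (r'.*(ar|er|ir)$', 'v-inf'),           # infinitives
     (r'^[0-9]+([.,][0-9]+)?$', 'num')],    # numerals
    backoff=default_tagger)

# Unigram tagger trained on the same split, backing off to the regexp tagger.
chained_tagger = nltk.UnigramTagger(training_data, backoff=regexp_tagger)
print(chained_tagger.evaluate(test_data))

Comparing this score with uni_tagger.evaluate(test_data) indicates how much the extra fallback layers recover on words unseen in training.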
Code Example #8
File: app.py  Project: danielconte/appClasses
            total = total + 1
            if palavra[1] !='0':
                gramatica = gramatica + 1
            if palavra[2] !='':
                uml = uml + 1
    print("|Total:" + str(total))
    print("|Gramática:" + str(gramatica) + " (" + str(gramatica * 100 / total) + "%)")
    print("|UML:" + str(uml) + " (" + str(uml * 100 / total) + "%)")   

def simplificar(t):
    if "+" in t:
        return t[t.index("+")+1:]
    else:
        return t
        
palavrasM = floresta.tagged_words()
palavrasM = [(palavra.lower(),simplificar(classificacao)) for (palavra,classificacao) in palavrasM]

def classificarPalavra(palavra):
    tipos = []
    for par in palavrasM:
        if par[0] == palavra:  # assumes a Python 3 str; the original decoded UTF-8 bytes here
            tipos.append(par[1])
    if len(tipos) > 0:
        d = defaultdict(int)
        for i in tipos:
            d[i] += 1
        classe = max(d.items(), key=lambda x: x[1])
        return(classe[0])
    else:
        return("0")
Code Example #9
# preprocess the tagged sentences
# (simplify_tag is assumed to keep only the part of a Floresta tag after '+', as in Code Example #8)
tagged_sents = floresta.tagged_sents()
tagged_sents = [[(w.lower(), simplify_tag(t))
                 for (w, t) in sent] for sent in tagged_sents if sent]

# tagger training set
train = tagged_sents[500:]

# tagger test set
test = tagged_sents[:500]

print('Conjunto de teste: %s' % len(test))
print('Conjunto de treinamento: %s' % len(train))
# find the most common tag in the corpus and use it as the default tag
tags = [simplify_tag(pos_tag) for (word, pos_tag)
        in floresta.tagged_words()]
most_freq_tag = nltk.FreqDist(tags).max()
tagger0 = nltk.DefaultTagger(most_freq_tag)

print("Tag mais frequente: %s" % most_freq_tag)
print("Acuracia do tagger0 para a tag mais frequente: %s" % tagger0.evaluate(test))


# more elaborate tagger, built on top of the default tagger
tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
print("Acuracia do tagger1: %s" % tagger1.evaluate(test))

# an even more elaborate tagger, built on top of the previous one
tagger2 = nltk.BigramTagger(train, backoff=tagger1)
print("Acuracia do tagger2: %s" % tagger2.evaluate(test))