def __init__(self):
    """Train a POS-tagger backoff chain (default 'N' -> unigram -> bigram) on mac_morpho."""
    lowered = [
        [(word.lower(), tag) for (word, tag) in sentence]
        for sentence in mac_morpho.tagged_sents()
        if sentence
    ]
    # First 100 sentences are held out of training.
    training = lowered[100:]
    default = nltk.DefaultTagger('N')
    unigram = nltk.UnigramTagger(training, backoff=default)
    self.tagger = nltk.BigramTagger(training, backoff=unigram)
def tag_unigrams_by_topic(self, dict_of_sentences_by_topic):
    """Tag every topic's sentences with a UnigramTagger trained on mac_morpho.

    :param dict_of_sentences_by_topic: mapping topic -> list of token lists
    :return: mapping topic -> list of tagged sentences (lists of (word, tag))
    """
    # 5000 sentences keeps training fast; trade-off against accuracy.
    train_sents = mac_morpho.tagged_sents()[:5000]
    tagger = UnigramTagger(train_sents)
    # FIX: TaggerI.batch_tag() was removed in NLTK 3; tag_sents() is the
    # drop-in replacement.
    return {
        topic: tagger.tag_sents(sentences)
        for topic, sentences in dict_of_sentences_by_topic.items()
    }
def __preparar_tagger(self):
    """Load the POS tagger from the pickle cache, training and caching it on a miss."""
    cache_path = './cache/postagger.pickle'
    if os.path.exists(cache_path):
        logging.debug("Carregando o Pos-Tagger já treinado de " + cache_path)
        with open(cache_path, 'rb') as handle:
            self.tagger = pickle.load(handle)
        return
    logging.debug("Treinando o Pos-Tagger.")
    # Lowercase tokens and simplify composite tags before training.
    sents = [
        [(word.lower(), self.__simplify_tag(tag)) for (word, tag) in sent]
        for sent in mac_morpho.tagged_sents()
        if sent
    ]
    # Backoff chain: bigram -> unigram -> default 'n' (noun).
    default = nltk.DefaultTagger('n')
    unigram = nltk.UnigramTagger(sents, backoff=default)
    self.tagger = nltk.BigramTagger(sents, backoff=unigram)
    logging.debug("Gravando o Pos-Tagger treinado em " + cache_path)
    with open(cache_path, 'wb') as handle:
        pickle.dump(self.tagger, handle)
def accuracy_measure():
    """Train and evaluate backoff / no-backoff taggers on Floresta and Mac-Morpho."""
    basicConfig(format='%(levelname)s %(message)s', level=INFO)
    info('reading tagged sentences')
    info('simplifying tags')
    flo_tsents = simplified_sents_floresta(floresta.tagged_sents())
    mac_tsents = mac_morpho.tagged_sents()
    # 90/10 train/test split for each corpus.
    flo_cut = int(len(flo_tsents) * 0.9)
    flo_train, flo_test = flo_tsents[:flo_cut], flo_tsents[flo_cut:]
    mac_cut = int(len(mac_tsents) * 0.9)
    mac_train, mac_test = mac_tsents[:mac_cut], mac_tsents[mac_cut:]
    no_backoff_taggers(flo_test, flo_train)
    no_backoff_taggers(mac_test, mac_train, corpus='macmorpho')
    # Persist the trained taggers only when none are cached yet.
    save = not pt.check_for_taggers()
    backoff_taggers(flo_test, flo_train, save)
    backoff_taggers(mac_test, mac_train, save, corpus='macmorpho')
def tag_unigrams_by_topic(self, dict_of_sentences_by_topic):
    """Tag each topic's sentence lists using a mac_morpho-trained UnigramTagger.

    :param dict_of_sentences_by_topic: mapping topic -> list of token lists
    :return: mapping topic -> list of tagged sentences
    """
    # Train on a 5000-sentence slice for speed.
    train_sents = mac_morpho.tagged_sents()[:5000]
    tagger = UnigramTagger(train_sents)
    tagged_unigrams_by_topic = {}
    for topic, sentences in dict_of_sentences_by_topic.items():
        # FIX: batch_tag() no longer exists in NLTK 3; use tag_sents().
        tagged_unigrams_by_topic[topic] = tagger.tag_sents(sentences)
    return tagged_unigrams_by_topic
def run(self, corpus=Corpus.FLORESTA, force=False):
    """Train the tagger for the chosen corpus; `force` requests retraining."""
    self.should_force = force
    if corpus == Corpus.FLORESTA:
        print("\n##### Floresta Corpus #####")
        self.train("floresta", floresta.tagged_sents())
    elif corpus == Corpus.MAC_MORPHO:
        print("\n###### Mac Morpho Corpus #####")
        self.train("mac_morpho", mac_morpho.tagged_sents())
def get_unigram_tagger(): #using mac morpho corpus to train tagger #Use unigram tagging (maybe other ones in the future) p_train = 0.9 print "Training unigram tagger using mac mopho corpus... %.2f train" % ( p_train) tagged_sents = mac_morpho.tagged_sents() size = int(len(tagged_sents) * 0.9) train_sents = tagged_sents[:size] test_sents = tagged_sents[size:] uni_tagger = nltk.UnigramTagger(train_sents) print "Test accuracy =", uni_tagger.evaluate(test_sents) return uni_tagger
def create_a_dict_model_for_test_accuracy(self, tagged_unigrams_by_topic):
    """Collapse each topic's tagged unigrams into one lookup dict and score it.

    Returns (dict_model_by_topic, tagger_accuracy_by_topic), where the second
    mapping holds each topic model's accuracy on a mac_morpho slice.
    """
    dict_model_by_topic = {}
    for topic, tagged_sents in tagged_unigrams_by_topic.items():
        merged = {}
        for pairs in tagged_sents:
            merged.update(dict(pairs))
        dict_model_by_topic[topic] = merged
    test_sents = mac_morpho.tagged_sents()[:5000]
    tagger_accuracy_by_topic = {
        topic: UnigramTagger(model=model).evaluate(test_sents)
        for topic, model in dict_model_by_topic.items()
    }
    return dict_model_by_topic, tagger_accuracy_by_topic
def tagging(documents):
    """POS-tag each tokenized document with a mac_morpho-trained unigram tagger.

    The tagger is loaded from the file_utils cache; on any load failure it is
    retrained on mac_morpho and re-saved.
    """
    try:
        unigram_tagger = file_utils.load_object('tagger1', 'tagger', None)
    except Exception:
        # FIX: was a bare `except:`; keep the best-effort load but stop
        # swallowing SystemExit/KeyboardInterrupt.
        train_set = mac_morpho.tagged_sents()  # train on the mac_morpho dictionary
        unigram_tagger = nltk.UnigramTagger(train_set)
        file_utils.save_object(unigram_tagger, 'tagger1', 'tagger', None)
    # Tag each document, preserving input order.
    return [unigram_tagger.tag(document) for document in documents]
def train_tagger():
    """Train a default->unigram->bigram backoff tagger on mac_morpho and pickle it.

    The simplified-tag corpus is used for both n-gram levels; the trained
    bigram tagger is saved to tagger.pkl and returned.
    """
    tagged_sents = mac_morpho.tagged_sents()
    tagged_sents = [[(w, simplify_tag(t)) for (w, t) in sent]
                    for sent in tagged_sents if sent]
    tagger0 = nltk.DefaultTagger("N")
    tagger1 = nltk.UnigramTagger(tagged_sents, backoff=tagger0)
    tagger2 = nltk.BigramTagger(tagged_sents, backoff=tagger1)
    # FIX: use a context manager so the file is closed even if pickling fails.
    with open("tagger.pkl", "wb") as output:
        pickle.dump(tagger2, output, -1)
    return tagger2
def train_tagger():
    """Train a 4-level backoff POS tagger on the local mac_morpho corpus and save it.

    Backoff order guarantees every term gets a tag:
    trigram -> bigram -> unigram -> default 'N' (noun).
    """
    corpus = mac_morpho.tagged_sents()
    chain = tag.DefaultTagger('N')
    # Build the chain bottom-up, each level backing off to the previous one.
    for build in (
        lambda backoff: tag.UnigramTagger(corpus, backoff=backoff),
        lambda backoff: tag.BigramTagger(corpus, backoff=backoff),
        lambda backoff: tag.NgramTagger(3, corpus, backoff=backoff),
    ):
        chain = build(chain)
    save_tagger(chain)
def create_a_dict_model_for_test_accuracy(self, tagged_unigrams_by_topic):
    """Merge each topic's tagged unigrams into a dict model and evaluate it.

    Returns (dict_model_by_topic, tagger_accuracy_by_topic).
    """
    dict_model_by_topic = {}
    for topic, sentence_pairs in tagged_unigrams_by_topic.items():
        model = {}
        # Later sentences overwrite earlier entries for repeated words.
        for as_dict in map(dict, sentence_pairs):
            model.update(as_dict)
        dict_model_by_topic[topic] = model
    test_sents = mac_morpho.tagged_sents()[:5000]
    tagger_accuracy_by_topic = {}
    for topic, model in dict_model_by_topic.items():
        tagger_accuracy_by_topic[topic] = UnigramTagger(
            model=model).evaluate(test_sents)
    return dict_model_by_topic, tagger_accuracy_by_topic
def tagging(documents): nDocs = len(documents) # print nDocs documentsProcessed = [] unigram_tagger = [] try: unigram_tagger = file_utils.load_object('tagger1', 'tagger', None) print unigram_tagger except: train_set = mac_morpho.tagged_sents() #test_set = mac_morpho.tagged_sents()[10001:10010] unigram_tagger = nltk.UnigramTagger(train_set) file_utils.save_object(unigram_tagger, 'tagger1', 'tagger', None) for iDoc in range(0, nDocs): #tokens = documents[iDoc] documentsProcessed.append(unigram_tagger.tag(documents[iDoc])) return documentsProcessed
def filtra_palavras(palavras):
    """Tag words and drop Portuguese stopwords and function-word classes.

    Removes tokens tagged KS, KC, ART or PREP (conjunctions, articles,
    prepositions) and Portuguese stopwords; returns the remaining words.
    """
    # Load the cached tagger if present; otherwise train on mac_morpho and cache it.
    if os.path.exists("etiquetador.bin"):
        # FIX: use context managers so the pickle file handles are closed.
        with open("etiquetador.bin", "rb") as arquivo:
            etiquetador = pickle.load(arquivo)
    else:
        palavras_treinar = mac_morpho.tagged_sents()
        etiquetador = nltk.tag.UnigramTagger(palavras_treinar)
        with open("etiquetador.bin", "wb") as arquivo:
            pickle.dump(etiquetador, arquivo)
    palavras_etiquetadas = etiquetador.tag(palavras)
    # FIX: fetch the stopword list once (it was re-read on every word) and use
    # sets for O(1) membership tests.
    stopwords_pt = set(stopwords.words('portuguese'))
    classes_excluidas = {'KS', 'KC', 'ART', 'PREP'}
    palavras_filtradas = []
    for palavra, etiqueta in palavras_etiquetadas:
        if palavra not in stopwords_pt and etiqueta not in classes_excluidas:
            palavras_filtradas.append(palavra)
    return palavras_filtradas
def tagging(documents):
    """POS-tag every document with a cached mac_morpho-trained unigram tagger."""
    try:
        unigram_tagger = file_utils.load_object('tagger1', 'tagger', None)
    except Exception:
        # FIX: narrowed from a bare `except:` so process-control exceptions
        # (SystemExit, KeyboardInterrupt) propagate; any other load failure
        # still falls back to retraining.
        # Train the tagger on the mac_morpho dictionary.
        train_set = mac_morpho.tagged_sents()
        unigram_tagger = nltk.UnigramTagger(train_set)
        # Cache the trained tagger for subsequent calls.
        file_utils.save_object(unigram_tagger, 'tagger1', 'tagger', None)
    # Tag the documents in their original order.
    return [unigram_tagger.tag(document) for document in documents]
def get_pos_tagger(model_path, lang='pt'):
    """Return a POS tagger, loading from `model_path` or training one for 'pt'.

    Any other language falls back to the default EnglishPOSTagger. A freshly
    trained Portuguese tagger is pickled to `model_path` for reuse.
    """
    if os.path.isfile(model_path):
        logger.info("Loading POS tagger at %s" % model_path)
        with open(model_path, 'rb') as f:
            return pickle.load(f)
    if lang != 'pt':
        logger.warning("Using default english POS tagger for '%s'" % lang)
        return EnglishPOSTagger()
    logger.info("Training and saving portuguese POS tagger to %s" % model_path)
    train = [[(w, t) for (w, t) in s]
             for s in mac_morpho.tagged_sents() if s]
    # Backoff chain: bigram -> unigram -> default 'N'.
    tagger_default = nltk.DefaultTagger('N')
    tagger_unigram = nltk.UnigramTagger(train, backoff=tagger_default)
    pos_tagger = nltk.BigramTagger(train, backoff=tagger_unigram)
    with open(model_path, "wb") as f:
        pickle.dump(pos_tagger, f)
    return pos_tagger
def tagging(self, documents, savePath, language):
    """POS-tag each document with a cached, lowercase-trained unigram tagger.

    On a cache miss, trains on mac_morpho for 'pt' or the Brown corpus
    (universal tagset) for 'en', lowercasing every training token, then caches
    the tagger under 'tagger_<language>'.
    """
    from data_core.file_utils import FileUtils
    file_utils = FileUtils(savePath)
    try:
        unigram_tagger = file_utils.load_object('tagger_' + language, 'tagger')
    except Exception:
        # FIX: narrowed from a bare `except:`; a load failure still triggers
        # retraining, but SystemExit/KeyboardInterrupt now propagate.
        if language == "pt":
            train_set = mac_morpho.tagged_sents()
        elif language == "en":
            train_set = brown.tagged_sents(tagset='universal')
        # NOTE(review): an unsupported language leaves train_set unbound and
        # raises NameError below, exactly as in the original — confirm intended.
        # Lowercase each training token (tags untouched) so tagging matches
        # lowercased document text.
        train_set_lower = []
        for sentence in train_set:
            train_set_lower.append([
                (self.text_lower_one([word])[0], tag)
                for (word, tag) in sentence
            ])
        unigram_tagger = nltk.UnigramTagger(train_set_lower)
        file_utils.save_object(unigram_tagger, 'tagger_' + language, 'tagger')
    return [unigram_tagger.tag(document) for document in documents]
def train_tagger():
    """Train a probabilistic trigram POS tagger for Portuguese.

    A POS tagger assigns a syntactic class to each word, e.g.
    "Isso é um teste" -> Isso-PROSUB é-V um-ART teste-N
    (pronoun, verb, article, noun).

    Trains on manually annotated mac_morpho sentences, keeping only the first
    component of composite tags (split on '|' or '-'). Backoff chain:
    trigram -> bigram -> unigram -> default 'N' (noun).
    """
    data = [
        [(w, re.split('[|-]', tag)[0]) for w, tag in sent]
        for sent in mac_morpho.tagged_sents()
    ]
    default = DefaultTagger('N')
    print('train unigram')
    unigram = UnigramTagger(data, backoff=default)
    print('training bigram')
    bigram = BigramTagger(data, backoff=unigram)
    print('training trigram')
    return TrigramTagger(data, backoff=bigram)
), "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="simple"), "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"), "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(), "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"), "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="simple"), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="simple"), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="simple"), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="simple"), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="simple"), "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="simple"), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = "#FFF" # white # Colour of highlighted results _HIGHLIGHT_WORD_COLOUR = "#F00" # red _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG" _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG"
def TaggerOnline(tokens):
    """Tag `tokens` with a mac_morpho-trained unigram tagger (default backoff 'N')."""
    fallback = DefaultTagger('N')
    training_sentences = mac_morpho.tagged_sents()[::]
    unigram = UnigramTagger(training_sentences, backoff=fallback)
    return unigram.tag(tokens)
myPhrases.append(word) #Faz o processo ser repetido recursivamente para que o padrão(phrase) #seja procurado em cada um dos filhos da árvore e concatena o resultado #se este for favorável for child in myTree: if (type(child) is Tree): list_of_phrases = ExtractPhrases(child, phrase) if (len(list_of_phrases) > 0): myPhrases.extend(list_of_phrases) #Retorna a lista de padrões encontrados return myPhrases #Cria o etiquetador padrão para que palavras não conhecidas sejam tratadas com substantivo(N) etiqPadrao = DefaultTagger('N') #Pega o trainning set a partir das tagged_sents() do mac_morpho sentencas_treinadoras = mac_morpho.tagged_sents()[0:15000] #Cria o UnigramTagger com base no etiquetador padrão e treina-o com as sentenças etiquetadas do mac_morpho etiq = UnigramTagger(sentencas_treinadoras, backoff=etiqPadrao) coment = str(input("Entre com o texto: ")) if coment == "default": coment = open("default.txt", "r").read().replace("\n", " ") #O texto é convertido em tokens tokens=nltk.word_tokenize(coment.lower()) #É etiquetada cada token do texto tags = etiq.tag(tokens) #É criado o analisador de expresões regulares contendo os padrões procurados analiseGramatical = RegexpParser(r""" PADRAO7: {<N><ADJ>} PADRAO1: {<ADJ><N>(<PREP>?<N>)*}
'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='universal'), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='universal'), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='universal'), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(), 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='universal'), 'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(tagset='universal'), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = '#FFF' #white #Colour of highlighted results _HIGHLIGHT_WORD_COLOUR = '#F00' #red _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG' _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0' # dark grey
ilusao_re = re.compile( r"(^|[ (\[{“'])(d+-?e+-?s+-?)?i+-?l+-?u+-?s+-?[aeiouà-ü]*-?o+-?([- .!?,;\)\]\}”'_:]|$)", re.MULTILINE | re.IGNORECASE) chateacao_re = re.compile( r"(^|[ (\[{“'])c+-?h+-?a+-?t+-?e+-?a+-?[cç]+-?[aeiouà-ü]*-?o+-?([- .!?,;\)\]\}”'_:]|$)", re.MULTILINE | re.IGNORECASE) pizza_re = re.compile( r"(^|[ (\[{“'])p+-?i+-?(z-?)+a+-?([- .!?,;\)\]\}”'_:]|$)", re.MULTILINE | re.IGNORECASE) chope_re = re.compile( r"(^|[ (\[{“'])c+-?h+-?o+-?p+-?e*([- .!?,;\)\]\}”'_:]|$)", re.MULTILINE | re.IGNORECASE) garcom_re = re.compile(r"(^|[ (\[{“'])gar[sç]+o[nm]([- .!?,;\)\]\}”'_:]|$)", re.MULTILINE | re.IGNORECASE) tsents = mac_morpho.tagged_sents() tagger0 = nltk.DefaultTagger('N') tagger1 = nltk.UnigramTagger(tsents, backoff=tagger0) tagger2 = nltk.BigramTagger(tsents, backoff=tagger1) o = "" with open('c:\\dev\\estagio\\analise.txt', encoding='UTF-8') as f: o = f.readlines() dados = [] for line in o: while (link_re.search(line)): line = re.sub(link_re, '. ', line) while (data_re.search(line)): line = re.sub(data_re, ' (data) ', line)
'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(simplify_tags=True), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(simplify_tags=True), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(simplify_tags=True), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(), 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(simplify_tags=True), 'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(simplify_tags=True), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR='#FFF' #white #Colour of highlighted results _HIGHLIGHT_WORD_COLOUR='#F00' #red _HIGHLIGHT_WORD_TAG='HL_WRD_TAG' _HIGHLIGHT_LABEL_COLOUR='#C0C0C0' # dark grey _HIGHLIGHT_LABEL_TAG='HL_LBL_TAG'
'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(simplify_tags=True), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(simplify_tags=True), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(simplify_tags=True), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(), 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(simplify_tags=True), 'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(simplify_tags=True), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = '#FFF' #white #Colour of highlighted results _HIGHLIGHT_WORD_COLOUR = '#F00' #red _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG' _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0' # dark grey
textoBrownTagSentNew.append(sentNew) #print('segundo',textoBrownTagSentNew) fdTag2 = nltk.FreqDist(tag for m in textoBrownTagSentNew for (word, tag) in m) print('tags2 TAG', fdTag2.most_common()) fdTag3 = nltk.FreqDist(word for m in textoBrownTagSentNew for (word, tag) in m) #print('tags2 WORD',fdTag3.most_common()) tagTexto(textoBrownTagSentNew) ''' 1.Estender o exemplo dos etiquetadores para TrigramTagger e analisar a precisao do modelo ''' treino = mac_morpho.tagged_sents()[1000:] teste = mac_morpho.tagged_sents()[:1000] etiq0 = nltk.DefaultTagger('N') etiq1 = nltk.UnigramTagger(treino, backoff=etiq0) print('UnigramTagger', etiq1.evaluate(teste)) etiq2 = nltk.BigramTagger(treino, backoff=etiq1) print('BigramTagger', etiq2.evaluate(teste)) etiq3 = nltk.TrigramTagger(treino, backoff=etiq2) print('TrigramTagger', etiq3.evaluate(teste)) doc = open('textoPT.txt', encoding='utf8') raw = doc.read() #texto = nltk.word_tokenize('O mundo atual possui diversos idiomas.') texto = nltk.word_tokenize(raw) #print('etiq2', etiq2.tag(texto))
def tagging(self, documents, savePath, language): nDocs = len(documents) # documentsProcessed = [] unigram_tagger = [] train_set_lower = [] # def simplify_tag(t): # if "+" in t: # return t[t.index("+")+1:] # else: # return t # tsents = floresta.tagged_sents() # tsents = [[(w.lower(),simplify_tag(t)) for (w,t) in sent] for sent in tsents if sent] # train = tsents[100:] # from data_core.file_utils import FileUtils # file_utils = FileUtils(savePath) # print documents try: unigram_tagger = file_utils.load_object('tagger_' + language, 'tagger') except: if language == "pt": train_set = mac_morpho.tagged_sents() # train_set = train; # elif language == "en": # train_set = brown.tagged_sents(tagset='universal') # #print(train_set[0:1]) # nSents = len(train_set) # train_set_lower = [] # for iSent in range(0,nSents): # nWords = len(train_set[iSent]) # words = [] # for iWord in range(0,nWords): # words.append((self.text_lower_one([train_set[iSent][iWord][0]])[0],train_set[iSent][iWord][1])) # train_set_lower.append(words) # # print(train_set_lower[0:1]) # test_set = mac_morpho.tagged_sents()[10001:10010] # print 'antes do mac morpho' # tagger = nltk.UnigramTagger(train_set) tagger0 = nltk.DefaultTagger('n') tagger1 = nltk.UnigramTagger(train_set, backoff=tagger0) tagger2 = nltk.BigramTagger(train_set, backoff=tagger1) # tagger = nltk.BigramTagger(train_set) # print 'depois do mac morpho' # string = unigram_tagger, 'tagger_' + language,'tagger' # print string # # # file_utils.save_object(unigram_tagger, 'tagger_' + language,'tagger') # print unigram_tagger.tag(documents[0]) for iDoc in range(0, nDocs): print tagger2.tag(documents[iDoc]) # print tagger.tag(documents[0]) return ''
def initialize_dataset(self):
    """Split mac_morpho: first 100 sentences held out for test, the rest for training."""
    sentences = mac_morpho.tagged_sents()
    self.test = sentences[:100]
    self.train = sentences[100:]
"Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents( tagset="universal" ), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents( tagset="universal" ), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents( files="hindi.pos", tagset="universal" ), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents( tagset="universal" ), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents( tagset="universal" ), "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents( tagset="universal" ), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = "#FFF" # white # Colour of highlighted results _HIGHLIGHT_WORD_COLOUR = "#F00" # red _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"
def Tagger():
    """Build a unigram tagger over all of mac_morpho, defaulting unknown words to 'N'."""
    fallback = DefaultTagger('N')
    training_sentences = mac_morpho.tagged_sents()[::]
    return UnigramTagger(training_sentences, backoff=fallback)
'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='simple'), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='simple'), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='simple'), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(), 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='simple'), 'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(tagset='simple'), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = '#FFF' #white #Colour of highlighted results _HIGHLIGHT_WORD_COLOUR = '#F00' #red _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG' _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0' # dark grey
def pos_concordance(word):
    # Scan every tagged mac_morpho sentence for tokens whose word form
    # contains `word`.
    # NOTE(review): the snippet appears truncated here — `sent` is built but
    # never used in the visible code; confirm the remainder of the body.
    for sent_pos in mac_morpho.tagged_sents():
        sent = [s for s in sent_pos if word in s[0]]
#print("TESTE",pos_simple) candidates_simple = set(itertools.product(*pos_simple)) candidates_med = set(itertools.product(*pos_med)) candidates_full = set(itertools.product(*pos_full)) else: candidates_simple = candidates_simple.intersection(set(itertools.product(*pos_simple))) candidates_med = candidates_med.intersection(set(itertools.product(*pos_med))) candidates_full = candidates_full.intersection(set(itertools.product(*pos_full))) #print("ITERTOOLS") #print(candidates_simple) return candidates_simple, candidates_med, candidates_full sentences = [s[1] for s in inputs] log.info("Loading Mac-Morpho Tagged Sents...") tsents = list(mac_morpho.tagged_sents()) def simplify_tag(t): if "+" in t: t = t[t.index("+")+1:] if t == "ART": return "DET" return t log.info("Simplifyng POS Tags...") tsents = [[(w.lower(),simplify_tag(t)) for (w,t) in sent] for sent in tsents if sent] train = tsents
}, 'handlers': { 'mail_admins': { 'level': 'ERROR', 'filters': ['require_debug_false'], 'class': 'django.utils.log.AdminEmailHandler' } }, 'loggers': { 'django.request': { 'handlers': ['mail_admins'], 'level': 'ERROR', 'propagate': True, }, } } import nltk NLTK_DATAPATH = os.path.join(os.path.dirname(nltk.__file__), "data") nltk.data.path = [NLTK_DATAPATH] ETIQUETADOR = None try: from nltk.tag import UnigramTagger from nltk.corpus import mac_morpho sentencas_treinadoras = mac_morpho.tagged_sents()[0:100] ETIQUETADOR = UnigramTagger(sentencas_treinadoras) except LookupError: pass
'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='simple'), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='simple'), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='simple'), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(), 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='simple'), 'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(tagset='simple'), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR='#FFF' #white #Colour of highlighted results _HIGHLIGHT_WORD_COLOUR='#F00' #red _HIGHLIGHT_WORD_TAG='HL_WRD_TAG' _HIGHLIGHT_LABEL_COLOUR='#C0C0C0' # dark grey _HIGHLIGHT_LABEL_TAG='HL_LBL_TAG'
def __init__(self):
    """Build a bigram POS tagger (backoff: unigram -> default 'N') from lowercased mac_morpho."""
    sentences = [
        [(token.lower(), tag) for (token, tag) in sent]
        for sent in mac_morpho.tagged_sents()
        if sent
    ]
    # Hold the first 100 sentences out of training.
    training = sentences[100:]
    base = nltk.DefaultTagger('N')
    uni = nltk.UnigramTagger(training, backoff=base)
    self.tagger = nltk.BigramTagger(training, backoff=uni)
'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='universal'), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='universal'), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='universal'), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(), 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='universal'), 'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(tagset='universal'), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR='#FFF' #white #Colour of highlighted results _HIGHLIGHT_WORD_COLOUR='#F00' #red _HIGHLIGHT_WORD_TAG='HL_WRD_TAG' _HIGHLIGHT_LABEL_COLOUR='#C0C0C0' # dark grey _HIGHLIGHT_LABEL_TAG='HL_LBL_TAG'
# pylint: disable=C0111
# pylint: disable=C0103
from nltk.corpus import mac_morpho
import nltk

# Collect every POS tag across the mac_morpho corpus and report the five
# most frequent ones.
tags = [tag for sentence in mac_morpho.tagged_sents() for (_word, tag) in sentence]
frequencia = nltk.FreqDist(tags)
print(frequencia.most_common(5))
"Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="universal"), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="universal"), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="universal"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="universal"), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="universal"), "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="universal"), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = "#FFF" # white # Colour of highlighted results _HIGHLIGHT_WORD_COLOUR = "#F00" # red _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG" _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey
else: candidates_simple = candidates_simple.intersection( set(itertools.product(*pos_simple))) candidates_med = candidates_med.intersection( set(itertools.product(*pos_med))) candidates_full = candidates_full.intersection( set(itertools.product(*pos_full))) #print("ITERTOOLS") #print(candidates_simple) return candidates_simple, candidates_med, candidates_full sentences = [s[1] for s in inputs] log.info("Loading Mac-Morpho Tagged Sents...") tsents = list(mac_morpho.tagged_sents()) def simplify_tag(t): if "+" in t: t = t[t.index("+") + 1:] if t == "ART": return "DET" return t log.info("Simplifyng POS Tags...") tsents = [[(w.lower(), simplify_tag(t)) for (w, t) in sent] for sent in tsents if sent]