def test_singleton(self):
    # Calling the constructor directly must fail: instances are only
    # obtained through Cogroo.Instance().
    excpt_thrown = False
    try:
        Cogroo()
    except TypeError as e:
        excpt_thrown = True
        self.assertEqual(Cogroo.MSG_CALL_INSTANCE, '%s' % e)
    self.assertTrue(excpt_thrown)
def test_lemmatize():
    cogroo = Cogroo.Instance()
    phrase_to_lemmatize = ('o entendimento das metas propostas oferece uma interessante '
                           'oportunidade para verificação do impacto na agilidade decisória ')
    expected_result = ('o entender de o meta propor oferecer um interessante oportunidade '
                       'para verificação de o impacto em o agilidade decisório')
    assert expected_result == cogroo.lemmatize(phrase_to_lemmatize)
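# The feature-extraction snippets below all walk the object returned by
# cogroo.analyze(); this is a minimal sketch of that traversal, assuming the
# import used elsewhere on this page (the sample sentence is illustrative):
from cogroo_interface import Cogroo

cogroo = Cogroo.Instance()
doc = cogroo.analyze('O menino colou na prova.')
for token in doc.sentences[0].tokens:
    # lexeme, lemma, pos, chunk and synchunk are the token attributes
    # the scripts below rely on
    print(token.lexeme, token.lemma, token.pos, token.chunk, token.synchunk)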
class CogrooSemanticizer:
    # Shared CoGrOO singleton; n_tags, v_tags, ec (the entity module) and
    # Agglutinator are expected to be imported at module level.
    cogroo = Cogroo.Instance()

    def __init__(self, text):
        self.input_text = text
        # Instance attribute: a class-level list would be shared between instances
        self.entities_list = []
        self.pos_tagged_text = self.cogroo.analyze(text).sentences[0]

    def get_entities(self):
        self.entities_list = self.search_chunks()
        agglutinator = Agglutinator.Agglutinator(self.input_text, self.entities_list)
        self.entities_list = agglutinator.agglutinate()
        self.clean_entities()
        print("\n", "-" * 20, "> CogrooSemanticizer")
        print(self.pos_tagged_text.chunks)
        for entity in self.entities_list:
            print(entity)
        return self.entities_list

    def search_chunks(self):
        for chunk in self.pos_tagged_text.chunks:
            if chunk.tag in n_tags or chunk.tag in v_tags:
                self.entities_list = self.filter_chunk(chunk)
        return self.entities_list

    def filter_chunk(self, chunk):
        """Extract entity expressions from a chunk of the sentence.

        :param chunk: a chunk from the CoGrOO analysis
        :return: the accumulated entities_list
        """
        for token in chunk.tokens:
            if token.pos in n_tags:
                entity = ec.Entity(text=token.lexeme, start=token.start,
                                   end=token.end, tag='NP', pos=token.pos)
                self.entities_list.append(entity)
            if token.pos in v_tags and token.lexeme == 'vem':
                entity = ec.Entity(text=token.lexeme, start=token.start,
                                   end=token.end, tag='VP', pos=token.pos)
                self.entities_list.append(entity)
        return self.entities_list

    def clean_entities(self):
        # Multiword entities come joined with '_'; restore the spaces.
        for entity in self.entities_list:
            entity.text = entity.text.replace('_', ' ')
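# A minimal usage sketch for CogrooSemanticizer, assuming n_tags, v_tags,
# Agglutinator and ec are already importable in the module; the sample
# sentence is illustrative only:
semanticizer = CogrooSemanticizer('O menino vem de a escola.')
for entity in semanticizer.get_entities():
    print(entity.text, entity.tag, entity.pos)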
import gzip
import pickle

from cogroo_interface import Cogroo

# gap() and consecutive() are feature helpers defined elsewhere in this module.


def _pos_at(tokens, i):
    """POS tag of tokens[i], or '' when i is past the end.

    Mirrors the bare try/except guards of the original code (negative
    indices still wrap around, exactly as they did originally).
    """
    try:
        return tokens[i].pos
    except IndexError:
        return ""


def main(name):
    cogroo = Cogroo.Instance()
    sentences = []
    with open(name, mode='r') as infile:
        for line in infile:
            aux = line.strip().split("\t")
            frase = aux[0]
            # Escape characters that confuse the parser; '%' stands in for '.'
            sentence = (aux[0].replace("'", '"').replace(":", "#")
                        .replace(";", "$").replace(".", "%"))
            en1 = aux[2].replace("=", "_").replace(".", "%")
            en2 = aux[5].replace("=", "_").replace(".", "%")
            # 'small' is the entity that occurs first in the sentence
            if int(aux[1].split(",")[0]) > int(aux[4].split(",")[0]):
                small, big = en2, en1
            else:
                small, big = en1, en2
            sentence = sentence.replace(en1, "en1").replace(en2, "en2")

            aux_features = []
            pos = ""
            check_en = 0
            aux_lexeme = []
            analyzer = cogroo.analyze(sentence).sentences[0].tokens
            words = []
            for idx, t in enumerate(analyzer):
                # Only the tokens between the two entity placeholders are kept
                if check_en == 0 and t.lexeme in ("en1", "en2"):
                    check_en = 1
                    aux_lexeme.append(t.lexeme)
                    t.lemma = small.replace("%", ".")
                    t.lexeme = small.replace("%", ".")
                    continue
                elif check_en == 1 and t.lexeme in ("en1", "en2"):
                    if t.lexeme != aux_lexeme[0]:
                        t.lemma = big.replace("%", ".")
                        t.lexeme = big.replace("%", ".")
                    break
                elif check_en == 0:
                    continue

                aux = t.synchunk if len(t.synchunk) <= 2 else t.synchunk[2:]
                pos += " " + t.pos
                vector = [t.lemma, t.pos, t.chunk[2:], aux]
                vector.append("1" if t.pos[0] == "v" else "0")
                vector.append("1" if t.pos == "adv" else "0")

                # POS, lemma and syntactic tags (-2, -1, 1, 2)
                gap(analyzer, vector, idx, -2)
                gap(analyzer, vector, idx, -1)
                gap(analyzer, vector, idx, 1)
                gap(analyzer, vector, idx, 2)
                consecutive(analyzer, vector, idx, -2, -1)
                consecutive(analyzer, vector, idx, -1, 0)
                consecutive(analyzer, vector, idx, 0, 1)
                consecutive(analyzer, vector, idx, 1, 2)

                # Pattern-based features
                p_m2 = _pos_at(analyzer, idx - 2)
                p_m1 = _pos_at(analyzer, idx - 1)
                p_p1 = _pos_at(analyzer, idx + 1)
                p_p2 = _pos_at(analyzer, idx + 2)
                vector.append("1" if p_m2.startswith("v") else "0")
                vector.append("1" if p_m1.startswith("v") else "0")
                vector.append("1" if p_p1.startswith("v") else "0")
                vector.append("1" if p_p2.startswith("v") else "0")
                vector.append("1" if p_p1 == "prp" and t.pos[0] == "v" else "0")
                vector.append("1" if p_p1 == "art" and t.pos[0] == "v" else "0")
                vector.append("1" if p_p2 == "art" and p_p1 == "prp"
                              and t.pos[0] == "v" else "0")
                vector.append("1" if p_p1 == "prp" and t.pos == "n" else "0")
                vector.append("1" if p_p1 == "prp" and t.pos == "adv" else "0")
                vector.append("1" if p_p2 == "art" and p_p1 == "prp"
                              and t.pos == "adv" else "0")

                # Syntactic features
                # Phrase head (núcleo do sintagma)
                if (t.lexeme not in ("o", "a", "os", "as")
                        and t.pos in ("n", "prop", "pron-det",
                                      "pron-pers", "pron-indp")
                        and "NP" in t.chunk):
                    vector.append("1")
                else:
                    vector.append("0")
                # Direct object (objeto direto)
                vector.append("1" if aux == "ACC" else "0")

                vector.insert(0, t.lexeme)

                aux = line.strip().split("\t")
                if aux[3] == "PER" or aux[6] == "PER":
                    vector.append("O-PER")
                elif aux[3] == "PLC" or aux[6] == "PLC":
                    vector.append("O-PLC")
                else:
                    vector.append("O-O")
                aux_features.append(vector)

            # Append the sentence-level POS sequence and its length to every vector
            for x in aux_features:
                x.append(pos[1:])
                x.append(str(len(pos[1:].split(" "))))
                words.append(x)
            en1 = en1.replace("%", ".")
            en2 = en2.replace("%", ".")
            sentences.append([words, frase, en1, en2])

    # Save the feature vectors
    print(sentences)
    with gzip.open('4_RelP/features_teste.txt.gz', 'wb') as f:
        pickle.dump(sentences, f)
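# To sanity-check the dump, the feature vectors can be read back with the
# same gzip + pickle pair (path exactly as written above):
import gzip
import pickle

with gzip.open('4_RelP/features_teste.txt.gz', 'rb') as f:
    sentences = pickle.load(f)
for words, frase, en1, en2 in sentences:
    print(frase, en1, en2, len(words))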
import csv

from cogroo_interface import Cogroo

cogroo = Cogroo.Instance()

# Lemmas dropped before classification: stopwords plus corpus-specific terms
STOPLEMMAS = {'o', 'de', ',', 'que', 'qual', 'a', 'um', '.', 'o(', '?', '',
              'ser', 'quem', 'em', 'por', 'algoritmo', 'ir', 'se',
              'random_forest', 'random', 'forest', 'para'}

classe = []
resposta = []
lema_list = []
analise = []
classes_pergs = []

with open('corpus.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    next(readCSV)  # skip the header row
    for row in readCSV:
        classe.append(row[3])
        resposta.append(row[2])
        # Lemmatize the question and lowercase every word
        pergunta_lema = str(cogroo.lemmatize(row[1])).lower()
        # Store the lemmatized question as a word list, dropping stop lemmas
        # (replaces the original index-juggling while loop with pop())
        lema_list.append([w for w in pergunta_lema.split(' ')
                          if w not in STOPLEMMAS])
        # analise_perg = cogroo.analyze(pergunta_lema)
        # analise.append(analise_perg.sentences[0].tokens)
        # (commented out: morphological analysis of each question)
def setUp(self):
    self.cogroo = Cogroo.Instance()
def __init__(self, plan):
    self.cogroo = Cogroo.Instance()
    self.plan = plan
import gzip
import pickle

from cogroo_interface import Cogroo

# gap() and consecutive() are the same feature helpers used by the test-set
# script above; _pos_at() is repeated here so the snippet stands alone.


def _pos_at(tokens, i):
    # POS tag of tokens[i], or '' when i is past the end (mirrors the
    # original bare try/except guards; negative indices still wrap around).
    try:
        return tokens[i].pos
    except IndexError:
        return ""


def main(name):
    cogroo = Cogroo.Instance()
    sentences = []
    with open(name, "r") as infile:
        for line in infile:
            aux = line.strip().split("\t")
            # Debug: print("Frase original -", aux[0]), ("EN1 -", aux[2]), ("EN2 -", aux[7])

            # Preprocessing: escape characters that confuse the parser;
            # '%' stands in for '.'
            frase = aux[0]
            sentence = (aux[0].replace("'", '"').replace(":", "#")
                        .replace(";", "$").replace(".", "%"))
            en1 = aux[2].replace("=", "_").replace(".", "%")
            rel_num = aux[4].split(",")
            rel = aux[5].split(" ")
            en2 = aux[7].replace("=", "_").replace(".", "%")
            rel_start = 0 if rel_num[0] == 'None' else int(rel_num[0])
            # 'small' is the entity that occurs first in the sentence
            if int(aux[1].split(",")[0]) > int(aux[6].split(",")[0]):
                small, big = en2, en1
            else:
                small, big = en1, en2
            sentence = sentence.replace(en1, "en1").replace(en2, "en2")
            # Debug: print the modified sentence and the relation
            # ("Sem relação" when rel == [''])

            aux_rel = []
            aux_features = []
            aux_lexeme = []
            pos = ""
            check_en = 0
            count_rel = 0
            analyzer = cogroo.analyze(sentence).sentences[0].tokens
            words = []
            for idx, t in enumerate(analyzer):
                # Only the tokens between the two entity placeholders are kept
                if check_en == 0 and t.lexeme in ("en1", "en2"):
                    check_en = 1
                    aux_lexeme.append(t.lexeme)
                    t.lemma = small.replace("%", ".")
                    t.lexeme = small.replace("%", ".")
                    continue
                elif check_en == 1 and t.lexeme in ("en1", "en2"):
                    if t.lexeme != aux_lexeme[0]:
                        t.lemma = big.replace("%", ".")
                        t.lexeme = big.replace("%", ".")
                    break
                elif check_en == 0:
                    continue

                # Class label: 1 while the token matches the relation descriptor
                if count_rel == len(rel):
                    classification = 0
                elif t.lexeme == rel[count_rel]:
                    classification = 1
                    count_rel += 1
                else:
                    classification = 0

                # Features
                aux = t.synchunk if len(t.synchunk) <= 2 else t.synchunk[2:]
                pos += " " + t.pos
                vector = [t.lemma, t.pos, t.chunk[2:], aux]
                vector.append("1" if t.pos[0] == "v" else "0")
                vector.append("1" if t.pos == "adv" else "0")

                # POS, lemma and syntactic tags (-2, -1, 1, 2)
                gap(analyzer, vector, idx, -2)
                gap(analyzer, vector, idx, -1)
                gap(analyzer, vector, idx, 1)
                gap(analyzer, vector, idx, 2)
                consecutive(analyzer, vector, idx, -2, -1)
                consecutive(analyzer, vector, idx, -1, 0)
                consecutive(analyzer, vector, idx, 0, 1)
                consecutive(analyzer, vector, idx, 1, 2)

                # Pattern-based features (same set as the test-set script)
                p_m2 = _pos_at(analyzer, idx - 2)
                p_m1 = _pos_at(analyzer, idx - 1)
                p_p1 = _pos_at(analyzer, idx + 1)
                p_p2 = _pos_at(analyzer, idx + 2)
                vector.append("1" if p_m2.startswith("v") else "0")
                vector.append("1" if p_m1.startswith("v") else "0")
                vector.append("1" if p_p1.startswith("v") else "0")
                vector.append("1" if p_p2.startswith("v") else "0")
                vector.append("1" if p_p1 == "prp" and t.pos[0] == "v" else "0")
                vector.append("1" if p_p1 == "art" and t.pos[0] == "v" else "0")
                vector.append("1" if p_p2 == "art" and p_p1 == "prp"
                              and t.pos[0] == "v" else "0")
                vector.append("1" if p_p1 == "prp" and t.pos == "n" else "0")
                vector.append("1" if p_p1 == "prp" and t.pos == "adv" else "0")
                vector.append("1" if p_p2 == "art" and p_p1 == "prp"
                              and t.pos == "adv" else "0")

                # Syntactic features
                # Phrase head (núcleo do sintagma)
                if (t.lexeme not in ("o", "a", "os", "as")
                        and t.pos in ("n", "prop", "pron-det",
                                      "pron-pers", "pron-indp")
                        and "NP" in t.chunk):
                    vector.append("1")
                else:
                    vector.append("0")
                # Direct object (objeto direto)
                vector.append("1" if aux == "ACC" else "0")

                vector.insert(0, t.lexeme)
                vector.insert(0, str(classification))

                aux = line.strip().split("\t")
                if aux[3] == "PER" or aux[8] == "PER":
                    vector.append("O-PER")
                elif aux[3] == "PLC" or aux[8] == "PLC":
                    vector.append("O-PLC")
                else:
                    vector.append("O-O")
                aux_features.append(vector)
                # Debug: print(classification, t.lexeme)

            # Append the sentence-level POS sequence and its length to every vector
            for x in aux_features:
                x.append(pos[1:])
                x.append(str(len(pos[1:].split(" "))))
                words.append(x)
            en1 = en1.replace("%", ".")
            en2 = en2.replace("%", ".")
            sentences.append([words, frase, en1, en2])

    # Save the feature vectors
    with gzip.open('features_treino.txt.gz', 'wb') as f:
        pickle.dump(sentences, f)
def __init__(self):
    self.cogroo = Cogroo.Instance()