def LSA_Kmeans(clusters, textoTreinamento, nomeUsuarios, textoComparacao=None):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    ##########################################################################################
    # PRE-PROCESSING OF THE TEXT USED TO BUILD THE DICTIONARY OF SEMANTIC RELATIONS          #
    ##########################################################################################
    # Uses the removeA and removePontuacao helpers to clean textoTreinamento
    textoTrein = [removeA(removePontuacao(i)) for i in textoTreinamento]
    # print textoTrein
    textoComp = [removeA(removePontuacao(i)) for i in textoComparacao]

    # Loads the NLTK stopword list
    stop = stopwords.words('portuguese')
    # Strips the accents from the stopword list
    stoplist = [removeA(s) for s in stop]
    # print stoplist

    # Removes stopwords and words shorter than 4 characters
    textoTrein = [[word for word in document.lower().split()
                   if word not in stoplist and len(word) > 3]
                  for document in textoTrein]
    # print textoTrein
    textoComp = [[word for word in document.lower().split()
                  if word not in stoplist and len(word) > 3]
                 for document in textoComp]
    # print textoComp

    ##############################################################################################
    # START OF THE LSA STEP - BUILDING THE TERM/FREQUENCY DICTIONARY                             #
    ##############################################################################################
    # frequencia is a dictionary of integers
    frequencia = defaultdict(int)
    # Counts how many times each term appears across the whole collection
    for t in textoTrein:
        for token in t:
            frequencia[token] += 1
    # pprint(frequencia)

    # Words with frequency 1 are not relevant: they carry no co-occurrence relations.
    # Remove every word that appeared only once in the counting above
    textoTrein = [[token for token in palavra if frequencia[token] > 1]
                  for palavra in textoTrein]
    # pprint(textoTrein)

    ##########################################################################################
    # Dictionary encapsulates the mapping between normalized words and their integer ids.   #
    # The main function is `doc2bow`, which converts a collection of words to its           #
    # bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.            #
    ##########################################################################################
    dicionario = corpora.Dictionary(textoTrein)
    # print dicionario

    # Stores the ids of the words that appear in only one document
    once_ids = [tokenId for tokenId, docfreq in dicionario.dfs.iteritems() if docfreq == 1]
    # print once_ids
    # Removes every word with document frequency 1
    dicionario.filter_tokens(once_ids)
    # Compacts the dictionary, reassigning the ids left by the removed entries
    dicionario.compactify()
    # print dicionario.token2id   # token -> tokenId
    # print dicionario.dfs        # document frequencies: tokenId -> in how many documents this token appeared

    # Converts textoTrein to the "bag-of-words" format in corpus_textoTrein
    corpus_textoTrein = [dicionario.doc2bow(texto) for texto in textoTrein]
    # pprint(corpus_textoTrein)
    corpus_textoComp = [dicionario.doc2bow(textoC) for textoC in textoComp]
    # pprint(corpus_textoComp)

    ##########################################################################################
    # TRANSFORMATION MODEL - BAG-OF-WORDS TO TF-IDF                                          #
    ##########################################################################################
    # Turns corpus_textoTrein (bag-of-words) into tfidf_TextoTrein
    # (term frequency x inverse document frequency)
    tfidf_TextoTrein = models.TfidfModel(corpus=corpus_textoTrein)
    # print tfidf_TextoTrein
    # Applies the model to the comparison corpus to build the comparison matrix
    corpus_tfidf_TextoTrein = tfidf_TextoTrein[corpus_textoComp]
    # print list(corpus_tfidf_TextoTrein)

    # Transforms the TF-IDF matrix into an LSA (LSI) space
    modelo_lsa = models.LsiModel(corpus_tfidf_TextoTrein, id2word=dicionario, num_topics=len(dicionario))

    query = []
    for q in textoComparacao:
        vec_bow = dicionario.doc2bow(q.lower().split())
        vec_lsi = modelo_lsa[vec_bow]  # converts the comparison query into the LSI space
        query.append(vec_lsi)
    # print "query"
    # pprint(query)

    # Transforms corpus_textoComp into the LSA space and indexes it
    indexComp = similarities.MatrixSimilarity(modelo_lsa[corpus_textoComp])
    # print "indexComp"
    # pprint(list(indexComp))

    # To obtain the similarities of our query documents against the indexed documents,
    # perform a similarity query against the corpus
    sims = indexComp[query]
    # pprint(sims)

    ##########################################################################################
    # COMBINATION WITH K-MEANS TO BUILD THE GROUPS                                           #
    ##########################################################################################
    ## Best value found after experiments = 100000
    km_model = KMeans(n_clusters=clusters, n_init=100000)
    km_model.fit_transform(sims)

    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)

    ### console printouts for inspection
    # print "clustering _LSA_KMEANS"
    # pprint(clustering)
    # print len(clustering)
    # for i in range(len(clustering)):
    #     for j in clustering[i]:
    #         print "grupo", i
    #         print j, nomeUsuarios[j]
    #         print textoComparacao[j]

    return clustering
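############################################################################################
# Illustrative usage sketch (not part of the original pipeline). It shows the shape of the
# value returned by LSA_Kmeans: a defaultdict mapping each cluster label to the indices of
# the documents in textoComparacao. The sentences and names below are made-up placeholders,
# and the sketch assumes the module-level imports and helpers (gensim, scikit-learn, NLTK,
# removeA, removePontuacao) are available. Note that n_init=100000 makes even this toy run
# slow.
############################################################################################
def _exemplo_LSA_Kmeans():
    textos = ["o aluno concorda com a tese e apresenta argumentos claros",
              "o aluno concorda com a tese mas apresenta poucos argumentos",
              "o aluno discorda da tese e questiona os argumentos apresentados"]
    nomes = ["ALUNO A", "ALUNO B", "ALUNO C"]
    grupos = LSA_Kmeans(clusters=2, textoTreinamento=textos,
                        nomeUsuarios=nomes, textoComparacao=textos)
    for rotulo, indices in grupos.items():
        print "grupo", rotulo, [nomes[i] for i in indices]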
def similaridade_lsa(textoTreinamento, nomeUsuarios, textoComparacao=None):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    ##########################################################################################
    # PRE-PROCESSING OF THE TEXT USED TO BUILD THE DICTIONARY OF SEMANTIC RELATIONS          #
    ##########################################################################################
    # Uses the removeA and removePontuacao helpers to clean textoTreinamento
    textoTrein = [removeA(removePontuacao(i)) for i in textoTreinamento]
    # print textoTrein
    textoComp = [removeA(removePontuacao(i)) for i in textoComparacao]

    # Loads the NLTK stopword list
    stop = stopwords.words('portuguese')
    # Strips the accents from the stopword list
    stoplist = [removeA(s) for s in stop]
    # print stoplist

    # Removes stopwords and words shorter than 4 characters
    textoTrein = [[word for word in document.lower().split()
                   if word not in stoplist and len(word) > 3]
                  for document in textoTrein]
    textoComp = [[word for word in document.lower().split()
                  if word not in stoplist and len(word) > 3]
                 for document in textoComp]
    # print textoComp

    ##############################################################################################
    # START OF THE LSA STEP - BUILDING THE TERM/FREQUENCY DICTIONARY                             #
    ##############################################################################################
    # frequencia is a dictionary of integers
    frequencia = defaultdict(int)
    # Counts how many times each term appears across the whole collection
    for t in textoTrein:
        for token in t:
            frequencia[token] += 1
    # pprint(frequencia)

    # Words with frequency 1 are not relevant: they carry no co-occurrence relations.
    # Remove every word that appeared only once in the counting above
    textoTrein = [[token for token in palavra if frequencia[token] > 1]
                  for palavra in textoTrein]
    # pprint(textoTrein)

    ##########################################################################################
    # Dictionary encapsulates the mapping between normalized words and their integer ids.   #
    # The main function is `doc2bow`, which converts a collection of words to its           #
    # bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.            #
    ##########################################################################################
    dicionario = corpora.Dictionary(textoTrein)
    # print dicionario

    # Stores the ids of the words that appear in only one document
    once_ids = [tokenId for tokenId, docfreq in dicionario.dfs.iteritems() if docfreq == 1]
    # print once_ids
    # Removes every word with document frequency 1
    dicionario.filter_tokens(once_ids)
    # Compacts the dictionary, reassigning the ids left by the removed entries
    dicionario.compactify()
    # print dicionario.token2id
    # print dicionario.dfs

    # Converts textoTrein to the "bag-of-words" format in corpus_textoTrein
    corpus_textoTrein = [dicionario.doc2bow(texto) for texto in textoTrein]
    # pprint(corpus_textoTrein)
    corpus_textoComp = [dicionario.doc2bow(textoC) for textoC in textoComp]
    # pprint(corpus_textoComp)

    ##########################################################################################
    # TRANSFORMATION MODEL - BAG-OF-WORDS TO TF-IDF                                          #
    ##########################################################################################
    # Turns corpus_textoTrein (bag-of-words) into tfidf_TextoTrein
    # (term frequency x inverse document frequency)
    tfidf_TextoTrein = models.TfidfModel(corpus=corpus_textoTrein)
    # print tfidf_TextoTrein
    # Applies the model to the comparison corpus to build the comparison matrix
    corpus_tfidf_TextoTrein = tfidf_TextoTrein[corpus_textoComp]
    # print list(corpus_tfidf_TextoTrein)

    # Transforms the TF-IDF matrix into an LSA (LSI) space
    modelo_lsa = models.LsiModel(corpus_tfidf_TextoTrein, id2word=dicionario, num_topics=len(dicionario))

    # Transforms the training data into the LSA space
    # corpus_lsi = modelo_lsa[corpus_tfidf_TextoTrein]
    # for doc in corpus_lsi:
    #     pprint(doc)

    query = []
    for q in textoComparacao:
        vec_bow = dicionario.doc2bow(q.lower().split())
        vec_lsi = modelo_lsa[vec_bow]  # converts the comparison query into the LSI space
        query.append(vec_lsi)
    # print "query"
    # pprint(query)

    # Transforms corpus_textoComp into the LSA space and indexes it
    indexComp = similarities.MatrixSimilarity(modelo_lsa[corpus_textoComp])
    # print "indexComp"
    # pprint(list(indexComp))

    # To obtain the similarities of our query documents against the indexed documents,
    # perform a similarity query against the corpus
    sims = indexComp[query]
    # pprint(list(enumerate(sims)))

    now = datetime.now()
    resultado = open(os.path.join(os.path.dirname(__file__),
                                  "../arquivos/resultado" + now.__str__() + ".txt"), "w")

    resultados = []
    for i in range(0, len(sims)):
        aux = sims[i]
        # print "sorted", sorted(sims[i], reverse=True)
        # print i, aux
        for y in range(i + 1, len(aux)):
            str_aux = [nomeUsuarios[i] + " " + aux[y].__str__() + "% similar " + nomeUsuarios[y]]
            # print str_aux
            # resultados.append(str_aux)
            resultados.append([aux[y], nomeUsuarios[i], nomeUsuarios[y]])
            resultado.write(nomeUsuarios[i] + " " + aux[y].__str__() + "% similar " + nomeUsuarios[y] + "\n")

    resultado.close()
    # print "resultados"
    # pprint(resultados)
    return resultados
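############################################################################################
# Illustrative usage sketch (not part of the original pipeline). similaridade_lsa returns
# one entry per pair of compared documents, in the form [similarity, userA, userB], and it
# also writes a "resultado<timestamp>.txt" file under ../arquivos/ (that directory must
# exist). The texts and names below are made-up placeholders.
############################################################################################
def _exemplo_similaridade_lsa():
    textos = ["o aluno concorda com a tese e apresenta argumentos claros",
              "o aluno concorda com a tese mas apresenta poucos argumentos",
              "o aluno discorda da tese e questiona os argumentos apresentados"]
    nomes = ["ALUNO A", "ALUNO B", "ALUNO C"]
    resultados = similaridade_lsa(textoTreinamento=textos, nomeUsuarios=nomes, textoComparacao=textos)
    # the most similar pair comes first once the list is sorted by the similarity value
    for sim, usuario1, usuario2 in sorted(resultados, reverse=True):
        print usuario1, usuario2, sim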
def clusterArgInicial(idtese):
    # Variables and helpers for the connection to the Debate de Teses database
    cursor = connection.cursor()
    cursor2 = connection.cursor()

    cursor.execute(
        "select distinct `usr`.`primeironome` as `name`, `arg`.`argumento` AS `posicionamentoinicial` from ((((`argumento` `arg` join `revisao` `rev`) join `replica` `rep`) join `posicionamento` `pos`) join `argumentador` `urg`)join `usuario` `usr` where ((`arg`.`tese_idtese` = " + idtese + " ) and (`rev`.`argumento_idargumento` = `arg`.`idargumento`) and (`rep`.`revisao_idrevisao` = `rev`.`idrevisao`) and (`arg`.`argumentador_idargumentador` = `pos`.`argumentador_idargumentador`) and (`arg`.`tese_idtese` = `pos`.`tese_idtese`) and (`arg`.`posicionamentoinicial` is not null) and (`arg`.`argumentador_idargumentador` = `urg`.`idargumentador`) and(`urg`.`usuario_idusuario` = `usr`.`idusuario`) and (`pos`.`posicionamentofinal` is not null))"
    )
    cursor2.execute("select tese from tese where idtese=" + idtese)

    # Helper to handle HTML tags and ISO-encoded accents
    h = HTMLParser.HTMLParser()

    # Data returned by the database queries
    dadosSql = cursor.fetchall()
    textotese = cursor2.fetchall()

    # Lists used to handle the raw data
    usu = []
    posInicial = []
    dados = []
    tese = []

    # Lists with POS-tagged data
    tag_posInicial = []
    tag_comAce_posInicial = []

    # Lists with data after stopword removal
    sw_tese = []
    sw_posInicial = []
    aux_usu = []
    sw_tagPosInicial = []         # tagged text without stopwords
    sw_tagcomAce_posInicial = []  # tagged text WITH ACCENTS and without stopwords

    # Lists with data after stemming
    st_posInicial = []
    st_tese = []
    st_tagPosInicial = []         # tagged text, without stopwords, stemmed
    st_tagcomAce_posInicial = []  # tagged text WITH ACCENTS, without stopwords, stemmed

    #############################################################################################################
    # LIST WITH THE INITIAL POSITIONS AFTER NORMALIZATION
    posInicial_Normalizado = []
    normalizacao = []
    #############################################################################################################

    # Case folding
    for d in dadosSql:
        dados.append([
            re.sub('<[^>]*>', '', h.unescape(d[0])).lower(),
            re.sub('<[^>]*>', '', h.unescape(d[1])).lower()
        ])
    for t in textotese:
        tese.append(re.sub('<[^>]*>', '', h.unescape(t[0])).lower())

    # Moving the initial-position texts into a separate list
    for i in dados:
        x = 0
        usu.append(i[x].upper())
        posInicial.append(i[x + 1].lower())  # list with the initial positions, all lower-case

    #############################################################################################################
    ### Tagging each word with its part of speech
    ### Uses the NLPNET POS tagger
    ### http://nilc.icmc.usp.br/nlpnet/index.html#
    tagger = nlpnet.POSTagger()

    semAce_posInicial = []  # posInicial without accents, punctuation, web addresses and numbers
    comAce_posInicial = []  # posInicial WITH accents, without punctuation, web addresses and numbers

    for i in posInicial:
        semAce_posInicial.append(removePontuacao(removeA(removeNum(removeSE(removeEndWeb(i))))))
    for i in semAce_posInicial:
        tag_posInicial.append(tagger.tag(i))

    for i in posInicial:
        comAce_posInicial.append(removePontuacao(removeNum(removeSE(removeEndWeb(i)))))
    for i in comAce_posInicial:
        tag_comAce_posInicial.append(tagger.tag(i))

    #############################################################################################################
    # ONLY FOR TESTS AND FIGURES IN THE DISSERTATION
    # pprint(semAce_posInicial)
    # pprint(comAce_posInicial)
    # exit()
    # tagg_posInicial = []
    # for texto in posInicial:
    #     tagg_posInicial.append(tagger.tag(texto))
    #
    # print "posInicial"
    # pprint(posInicial)
    #
    # print "tagg_posInicial"
    # pprint(tagg_posInicial)
    #############################################################################################################

    #############################################################################################################
    ### STOPWORD REMOVAL
    ### Removes the terms listed by NLTK
    ### Removes terms tagged as articles, verbs, adverbs, etc.
    for i in usu:
        aux_usu.append(removeStopWords(i))
    for i in tese:
        sw_tese.append(removeStopWords(i))
    for i in posInicial:
        sw_posInicial.append(removeStopWords(i))
    for i in tag_posInicial:
        sw_tagPosInicial.append(limpaCorpus(i))
    for i in tag_comAce_posInicial:
        sw_tagcomAce_posInicial.append(limpaCorpus(i))

    ####################################################################################################################################
    # RSLP stemmer: strips the affixes of Portuguese words
    # Stemming the posInicial and tese texts
    stemmer = RSLPStemmer()

    for i in range(len(sw_posInicial)):
        st_aux = sw_posInicial[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)
        st_posInicial.append(string_aux)

    for i in range(len(sw_tese)):
        st_aux = sw_tese[i]
        string_aux = ""
        for sufixo in st_aux.split():
            string_aux = string_aux + " " + stemmer.stem(sufixo)
        st_tese.append(string_aux)

    for i in range(len(sw_tagPosInicial)):
        termosST = ""
        auxST = []
        for j in range(len(sw_tagPosInicial[i])):
            aux = stemmer.stem(sw_tagPosInicial[i][j][0])
            etiqueta = sw_tagPosInicial[i][j][1]
            termosST = (aux, etiqueta)
            auxST.append(termosST)
        st_tagPosInicial.append(auxST)

    for i in range(len(sw_tagcomAce_posInicial)):
        termosST = ""
        auxST = []
        for j in range(len(sw_tagcomAce_posInicial[i])):
            aux = stemmer.stem(sw_tagcomAce_posInicial[i][j][0])
            etiqueta = sw_tagcomAce_posInicial[i][j][1]
            termosST = (aux, etiqueta)
            auxST.append(termosST)
        st_tagcomAce_posInicial.append(auxST)

    ####################################################################################################################################
    ### TERM NORMALIZATION IS THE TECHNIQUE OF REPLACING SYNONYMOUS WORDS, I.E. WORDS WITH SIMILAR MEANING,          ##
    ### BY A SINGLE REPRESENTATIVE TERM IN THE CORPUS UNDER ANALYSIS. THIS INCREASES THE DEGREE OF SIMILARITY        ##
    ### BETWEEN THE ANALYSED TEXTS WHEN STATISTICAL MEASURES SUCH AS COSINE SIMILARITY OR EUCLIDEAN DISTANCE         ##
    ### ARE USED.                                                                                                    ##
    ####################################################################################################################################
    ### THE NORMALIZATION IS BASED ON THE DATA MADE AVAILABLE BY THE TEP 2.0 PROJECT (NILC/USP)                      ##
    ### http://143.107.183.175:21480/tep2/index.htm                                                                  ##
    ###                                                                                                              ##
    ### FILE FORMAT                                                                                                  ##
    ### NUM1. [Type] {synonymous terms} <NUM2>                                                                       ##
    ### 263. [Verbo] {consentir, deixar, permitir} <973>                                                             ##
    ### NUM1 = LINE NUMBER OF THIS SYNONYM ENTRY                                                                     ##
    ### NUM2 = LINE NUMBER OF THE ANTONYM ENTRY (OPPOSITE MEANING)                                                   ##
    ####################################################################################################################################
    # Opens the file with the synonymy (wordNet lines) and antonymy (opposite terms) relations;
    # the file only contains terms classified as nouns, adjectives and verbs
    base_tep = codecs.open(
        os.path.join(os.path.dirname(__file__), '../base_tep2/base_tep.txt'), 'r', 'UTF8')
    # dicionario = open('/home/panceri/git/alpes_v1/base_tep2/dicionarioSinonimos.txt', 'w')

    # Variable with the whole file content in memory
    # do not print this variable, it is HUGE!!!
    wordNet = base_tep.readlines()
    # close the file
    base_tep.close()

    ####################################################################################################################################
    ## THE NORMALIZATION IS BASED ON THE FORMATION STEMS OF THE WORDS                                                ##
    ## RSLP IS APPLIED FIRST, AND ONLY THEN ARE SIMILAR TERMS LOOKED UP IN THE BASE                                  ##
    ## INSIDE BASE_TEP THE TERMS ARE ALSO REDUCED TO THEIR FORMATION STEMS                                           ##
    ## THE DICTIONARY KEEPS THE REFERENCE TO THE LINE WHERE THE SYNONYMOUS TERMS ARE                                 ##
    ## TERMS ARE ANALYSED WITH THEIR ACCENTS, SO THAT RSLP IS APPLIED CORRECTLY                                      ##
    ####################################################################################################################################
    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()

    st_WordNetV = []  # stores number, type and stems of the synonyms - VERBS ONLY
    st_WordNetN = []  # stores number, type and stems of the synonyms - NOUNS ONLY
    st_WordNetA = []  # stores number, type and stems of the synonyms - ADJECTIVES ONLY
    st_WordNetO = []  # stores number, type and stems of the synonyms - EVERYTHING ELSE

    for linhaWordnet in wordNet:
        listaAux = []
        termos = re.findall(r"\{(.*)\}", linhaWordnet)
        num = re.findall(r"([0-9]+)\.", linhaWordnet)
        tipo = re.findall(r"\[(.*)\]", linhaWordnet)

        if tipo[0] == "Substantivo":
            listaAux.append(num)
            listaAux.append(tipo)
            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetN.append(listaAux)
        elif tipo[0] == "Verbo":
            listaAux.append(num)
            listaAux.append(tipo)
            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetV.append(listaAux)
        elif tipo[0] == "Adjetivo":
            listaAux.append(num)
            listaAux.append(tipo)
            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetA.append(listaAux)
        else:
            listaAux.append(num)
            listaAux.append(tipo)
            for T in termos:
                aux = T.split()
                auxL = []
                for i in aux:
                    aux1 = i.replace(",", "")
                    dadosStem = stemmer.stem(aux1)
                    auxL.append(dadosStem)
                listaAux.append(auxL)
            st_WordNetO.append(listaAux)

    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('stemmWordNet.out', type='callgrind')

    ####################################################################################################################################
    ### THE ANALYSIS IS DONE ON THE TEXT WITHOUT REMOVING THE ACCENTS,                                               ##
    ### BECAUSE REMOVING THEM HURTS THE REDUCTION TO THE FORMATION STEM (RSLP).                                      ##
    ### OUR TESTS SHOWED THIS IS THE BETTER APPROACH, SINCE OUR TEXTS ARE SHORT                                      ##
    ### AND WE NEED TO GET AS CLOSE AS POSSIBLE WITHOUT CONSIDERING MEANING AND/OR CONTEXT                           ##
    ####################################################################################################################################
    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()

    normalizacao = normalizacaoWordnet(st_WordNetA, st_WordNetN, st_WordNetV, st_WordNetO, st_tagcomAce_posInicial)

    ###############################################################
    # Flattening the normalized texts into a one-dimensional list
    ###############################################################
    stringNorm = ""
    auxNorm = []
    for i in range(len(normalizacao)):
        auxNorm = normalizacao[i]
        for x in range(len(auxNorm)):
            stringNorm = stringNorm + " " + auxNorm[x]
        posInicial_Normalizado.append(stringNorm)
        stringNorm = ""

    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('normalizacaoWordnet.out', type='callgrind')

    ####################################################################################################################################
    # print "posInicial"
    # pprint(posInicial)
    #
    # print "comAce_posInicial"
    # pprint(comAce_posInicial)
    #
    # print "tag_comAce_posInicial"
    # pprint(tag_comAce_posInicial)
    #
    # print "sw_tagcomAce_posInicial"
    # pprint(sw_tagcomAce_posInicial)
    #
    # print "st_tagcomAce_posInicial"
    # pprint(st_tagcomAce_posInicial)
    #
    # print "posInicial_Normalizado"
    # print len(posInicial_Normalizado)
    # pprint(posInicial_Normalizado)
    # exit()
    ####################################################################################################################################
    # Return value - used in views.py to feed the debate.html template;
    # these are the parameters that must be shown in the debate.html template
    return [st_tese, posInicial, sw_tese, aux_usu, st_posInicial, tese, posInicial_Normalizado]
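############################################################################################
# Illustrative sketch (not part of the original pipeline) of how one line of base_tep.txt
# in the format "NUM1. [Type] {synonymous terms} <NUM2>" is taken apart by the same regular
# expressions and RSLP stemming used above. The sample line is the one quoted in the
# comments of clusterArgInicial.
############################################################################################
def _exemplo_parse_linha_tep():
    stemmer = RSLPStemmer()
    linha = u"263. [Verbo] {consentir, deixar, permitir} <973>"
    num = re.findall(r"([0-9]+)\.", linha)     # ['263'] -> number of the synonym entry
    tipo = re.findall(r"\[(.*)\]", linha)      # ['Verbo']
    termos = re.findall(r"\{(.*)\}", linha)    # ['consentir, deixar, permitir']
    radicais = [stemmer.stem(t.replace(",", "")) for t in termos[0].split()]
    print num[0], tipo[0], radicais            # entry number, grammatical type, stems of the synonyms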
def normalizacao1(dicSin, termo, radical, etiqueta):
    # Fills dicSin in place (nothing is returned).
    # inicio = datetime.now()
    # print inicio, "normalizacaoWordnet"

    # local variables
    SA_wordnet = []   # stores the wordnet without accents
    listaDicion = []  # line numbers of the synonym entries plus every synonymous term found

    # Opens the file with the synonymy (wordNet lines) and antonymy (opposite terms) relations
    base_tep = codecs.open(
        os.path.join(os.path.dirname(__file__), '../../base_tep2/base_tep.txt'), 'r', 'UTF8')
    # dicionario = open('/home/panceri/git/alpes_v1/base_tep2/dicionarioSinonimos.txt', 'w')

    # Variable with the whole file content in memory
    # do not print this variable, it is HUGE!!!
    wordNet = base_tep.readlines()
    # close the file
    base_tep.close()

    # Strip the accents from the base
    for i in wordNet:
        SA_wordnet.append(removeA(i))

    # Lookup is done by stem
    stemmer = RSLPStemmer()
    # print termo, radical, etiqueta

    # yappi.set_clock_type('cpu')
    # yappi.start(builtins=True)
    # start = time.time()

    # Looks the term up in the file,
    # stores the term as a dictionary key,
    # and stores the matching wordNet lines as a list
    if etiqueta == "N":
        for linhaWordNet in wordNet:
            if linhaWordNet.find("[Substantivo]") >= 0:
                termosSinonimos = re.findall(r'\{(.*\w)\}', linhaWordNet)
                for listaSinonimos in termosSinonimos:
                    sa_listaSinonimos = removePontuacao(listaSinonimos)  # synonym list without the commas
                    for palavraSinonima in sa_listaSinonimos.split():
                        st_palavraSinonima = stemmer.stem(palavraSinonima)
                        if radical == st_palavraSinonima:
                            numETerm = re.findall(r"([0-9]+). \[\w+\] \{(.*)\}", linhaWordNet)
                            listaDicion.append(numETerm)
                            dicSin[termo] = listaDicion
        # pprint(dicSin)
    elif etiqueta == "ADJ":
        for linhaWordNet in wordNet:
            if linhaWordNet.find("[Adjetivo]") >= 0:
                termosSinonimos = re.findall(r'\{(.*)\}', linhaWordNet)
                for listaSinonimos in termosSinonimos:
                    sa_listaSinonimos = removePontuacao(listaSinonimos)  # synonym list without the commas
                    for palavraSinonima in sa_listaSinonimos.split():
                        st_palavraSinonima = stemmer.stem(palavraSinonima)
                        # auxTermos = sa_listaSinonimos.split()
                        if radical == st_palavraSinonima:
                            numETerm = re.findall(r"([0-9]+). \[\w+\] \{(.*)\}", linhaWordNet)
                            listaDicion.append(numETerm)
                            dicSin[termo] = listaDicion
        # pprint(dicSin)
    elif etiqueta == "V" or etiqueta == "VAUX":
        for linhaWordNet in wordNet:
            if linhaWordNet.find("[Verbo]") >= 0:
                termosSinonimos = re.findall(r'\{(.*)\}', linhaWordNet)
                for listaSinonimos in termosSinonimos:
                    sa_listaSinonimos = removePontuacao(listaSinonimos)  # synonym list without the commas
                    for palavraSinonima in sa_listaSinonimos.split():
                        st_palavraSinonima = stemmer.stem(palavraSinonima)
                        # auxTermos = sa_listaSinonimos.split()
                        if radical == st_palavraSinonima:
                            numETerm = re.findall(r"([0-9]+). \[\w+\] \{(.*)\}", linhaWordNet)
                            listaDicion.append(numETerm)
                            dicSin[termo] = listaDicion
        # pprint(dicSin)
    else:
        # Handles the adverbs
        for linhaWordNet in wordNet:
            termosSinonimos = re.findall(r'\{(.*)\}', linhaWordNet)
            for listaSinonimos in termosSinonimos:
                sa_listaSinonimos = removePontuacao(listaSinonimos)  # synonym list without the commas
                for palavraSinonima in sa_listaSinonimos.split():
                    st_palavraSinonima = stemmer.stem(palavraSinonima)
                    # auxTermos = sa_listaSinonimos.split()
                    if radical == st_palavraSinonima:
                        numETerm = re.findall(r"([0-9]+). \[\w+\] \{(.*)\}", linhaWordNet)
                        listaDicion.append(numETerm)
                        dicSin[termo] = listaDicion
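############################################################################################
# Illustrative call (sketch, not part of the original pipeline). normalizacao1 fills dicSin
# in place: the original term becomes a key and the value is the list of matching TEP
# entries, each one holding the (line number, synonym list) pairs extracted by the regular
# expression above. It assumes base_tep.txt is available at the relative path used by the
# function; the term below is a made-up example.
############################################################################################
def _exemplo_normalizacao1():
    stemmer = RSLPStemmer()
    dicSin = {}
    normalizacao1(dicSin, termo=u"permitir", radical=stemmer.stem(u"permitir"), etiqueta="V")
    # e.g. dicSin[u"permitir"] -> [[(u'263', u'consentir, deixar, permitir')], ...]
    print dicSin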
def gruposArgumentacao(auxResult, qtdeGrupos=3, LSA=None, Normalizacao=True, TAGs=True):
    inicio = datetime.now()
    print inicio, "gruposArgumentacao"

    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()

    grupos = []
    tese = auxResult[5]
    posInicial_Normalizado = auxResult[6]

    ## dicSin = dictionary with the synonymous terms already linked (it relates the words typed by the
    ## students to the wordnet file, highlights the synonymy relations and gives the stem of each term
    ## together with the numbers of the lines where its synonyms are in the wordnet)
    st_tese = auxResult[0]        # tese text with stemming applied
    posIni = auxResult[1]         # original text of the argumentation
    sw_tese = auxResult[2]
    aux_usu = auxResult[3]
    st_posInicial = auxResult[4]

    base_treinamento = codecs.open(
        os.path.join(os.path.dirname(__file__), '../arquivos/baseTreinamento.txt'), 'r', 'UTF8')
    treinamento = [removeA(removePontuacao(i)) for i in base_treinamento]
    # CHANGE THIS TO TAKE THE DATA FROM THE INTERFACE (TEXT BOX),
    # OR ADD AN OPTION TO UPLOAD A .TXT FILE AND USE IT AS THE TRAINING BASE
    base_treinamento.close()

    ##########################################################################################
    ### APPROACH (1): USE THE ARGUMENTS THEMSELVES AS THE BASE FOR THE LSA DICTIONARIES    ###
    ##########################################################################################
    # TRAINING BASE MADE OF THE STUDENTS' ARGUMENTATIONS
    if LSA == True and Normalizacao == False:
        print "if LSA == True and Normalizacao == False:"
        if qtdeGrupos == 3:
            grupos = LSA_Kmeans(clusters=3, textoTreinamento=posIni, nomeUsuarios=aux_usu, textoComparacao=posIni)
        elif qtdeGrupos == 4:
            grupos = LSA_Kmeans(clusters=4, textoTreinamento=posIni, nomeUsuarios=aux_usu, textoComparacao=posIni)
        elif qtdeGrupos == 5:
            grupos = LSA_Kmeans(clusters=5, textoTreinamento=posIni, nomeUsuarios=aux_usu, textoComparacao=posIni)
        elif qtdeGrupos == 6:
            grupos = LSA_Kmeans(clusters=6, textoTreinamento=posIni, nomeUsuarios=aux_usu, textoComparacao=posIni)
        else:
            print "ERRO"

    ###########################################################################################
    ### APPROACH (2): USE OTHER TEXTS AS THE BASE FOR THE LSA DICTIONARIES                  ###
    ###########################################################################################
    # TRAINING BASE MADE OF TEACHING MATERIAL SUGGESTED BY THE TEACHER
    elif LSA == False and Normalizacao == False:
        print "elif LSA == False and Normalizacao == False:"
        if qtdeGrupos == 3:
            grupos = LSA_Kmeans(clusters=3, textoTreinamento=treinamento, nomeUsuarios=aux_usu, textoComparacao=posIni)
        elif qtdeGrupos == 4:
            grupos = LSA_Kmeans(clusters=4, textoTreinamento=treinamento, nomeUsuarios=aux_usu, textoComparacao=posIni)
        elif qtdeGrupos == 5:
            grupos = LSA_Kmeans(clusters=5, textoTreinamento=treinamento, nomeUsuarios=aux_usu, textoComparacao=posIni)
        elif qtdeGrupos == 6:
            grupos = LSA_Kmeans(clusters=6, textoTreinamento=treinamento, nomeUsuarios=aux_usu, textoComparacao=posIni)
        else:
            print "ERRO"
            exit()

    ##########################################################################################
    ### APPROACH (3): USE THE NORMALIZED ARGUMENTS AS THE BASE FOR THE LSA DICTIONARIES    ###
    ##########################################################################################
    # TRAINING BASE MADE OF THE NORMALIZED STUDENT ARGUMENTATIONS
    elif LSA == True and Normalizacao == True:
        print "elif LSA == True and Normalizacao == True:"
        if qtdeGrupos == 3:
            grupos = LSA_Kmeans(clusters=3, textoTreinamento=posInicial_Normalizado, nomeUsuarios=aux_usu, textoComparacao=posInicial_Normalizado)
        elif qtdeGrupos == 4:
            grupos = LSA_Kmeans(clusters=4, textoTreinamento=posInicial_Normalizado, nomeUsuarios=aux_usu, textoComparacao=posInicial_Normalizado)
        elif qtdeGrupos == 5:
            grupos = LSA_Kmeans(clusters=5, textoTreinamento=posInicial_Normalizado, nomeUsuarios=aux_usu, textoComparacao=posInicial_Normalizado)
        elif qtdeGrupos == 6:
            grupos = LSA_Kmeans(clusters=6, textoTreinamento=posInicial_Normalizado, nomeUsuarios=aux_usu, textoComparacao=posInicial_Normalizado)
        else:
            print "ERRO"
            exit()

    ##########################################################################################
    ### APPROACH (4): PLAIN K-MEANS OVER TF-IDF                                            ###
    ##########################################################################################
    elif LSA == None and Normalizacao == False:
        print "elif LSA == None and Normalizacao == False:"

        test_set = st_posInicial
        train_set = st_tese

        ### TF-IDF computed from the tese and the posInicial
        ### Functions based on scikit-learn
        vectorizer = CountVectorizer()
        vectorizer.fit_transform(test_set)

        count_vectorizer = CountVectorizer()
        count_vectorizer.fit_transform(train_set)
        count_vectorizer.vocabulary_

        freq_term_matrix = count_vectorizer.transform(test_set)
        tfidf = TfidfTransformer(norm="l2")
        tfidf.fit(freq_term_matrix)
        tf_idf_matrix = tfidf.transform(freq_term_matrix)

        if qtdeGrupos == 3:
            grupos = tfIdf_Kmeans(st_posInicial, 3)
        elif qtdeGrupos == 4:
            grupos = tfIdf_Kmeans(st_posInicial, 4)
        elif qtdeGrupos == 5:
            grupos = tfIdf_Kmeans(st_posInicial, 5)
        elif qtdeGrupos == 6:
            grupos = tfIdf_Kmeans(st_posInicial, 6)
        else:
            print "ERRO"
            exit()

    ##########################################################################################
    ### APPROACH (5): PLAIN K-MEANS OVER TF-IDF                                            ###
    ### ON THE NORMALIZED DATA                                                             ###
    ##########################################################################################
    ### Computation based on the normalized texts!!!
    elif LSA == None and Normalizacao == True:
        print "elif LSA == None and Normalizacao == True:"

        test_set = posInicial_Normalizado
        train_set = st_tese

        ### TF-IDF computed from the tese and the posInicial
        ### Functions based on scikit-learn
        vectorizer = CountVectorizer()
        vectorizer.fit_transform(test_set)

        count_vectorizer = CountVectorizer()
        count_vectorizer.fit_transform(train_set)
        count_vectorizer.vocabulary_

        freq_term_matrix = count_vectorizer.transform(test_set)
        tfidf = TfidfTransformer(norm="l2")
        tfidf.fit(freq_term_matrix)
        tf_idf_matrix = tfidf.transform(freq_term_matrix)

        if qtdeGrupos == 3:
            grupos = tfIdf_Kmeans(posInicial_Normalizado, 3)
        elif qtdeGrupos == 4:
            grupos = tfIdf_Kmeans(posInicial_Normalizado, 4)
        elif qtdeGrupos == 5:
            grupos = tfIdf_Kmeans(posInicial_Normalizado, 5)
        elif qtdeGrupos == 6:
            grupos = tfIdf_Kmeans(posInicial_Normalizado, 6)
        else:
            print "ERRO"
            exit()

    ##########################################################################################
    ### RESULTS - INDEPENDENT OF THE APPROACH                                              ###
    ##########################################################################################
    grupo1 = []
    grupo2 = []
    grupo3 = []
    grupo4 = []
    grupo5 = []
    grupo6 = []
    indices = []
    ind_aux = 0
    ind_aux2 = 0
    ind_aux3 = 0
    ind_aux4 = 0
    ind_aux5 = 0
    ind_aux6 = 0

    for i in range(len(grupos)):
        for j in range(len(grupos[i])):
            if i == 0:
                aux = grupos[i][j]
                if TAGs:
                    texto = "Aluno: <span>" + aux_usu[aux] + "</span> <br/> Posicionamento Inicial: " + posIni[aux]
                else:
                    texto = aux_usu[aux] + "#$#" + posIni[aux]
                grupo1.append(texto)
                indices.append(grupos[i][j])
            elif i == 1:
                aux = grupos[i][j]
                if TAGs:
                    texto = "Aluno: <span>" + aux_usu[aux] + "</span> <br/> Posicionamento Inicial: " + posIni[aux]
                else:
                    texto = aux_usu[aux] + "#$#" + posIni[aux]
                grupo2.append(texto)
                indices.append(grupos[i][j])
            elif i == 2:
                aux = grupos[i][j]
                if TAGs:
                    texto = "Aluno: <span>" + aux_usu[aux] + "</span> <br/> Posicionamento Inicial: " + posIni[aux]
                else:
                    texto = aux_usu[aux] + "#$#" + posIni[aux]
                grupo3.append(texto)
                indices.append(grupos[i][j])
            # for n_clusters = 4
            elif i == 3:
                aux = grupos[i][j]
                if TAGs:
                    texto = "Aluno: <span>" + aux_usu[aux] + "</span> <br/> Posicionamento Inicial: " + posIni[aux]
                else:
                    texto = aux_usu[aux] + "#$#" + posIni[aux]
                grupo4.append(texto)
                indices.append(grupos[i][j])
            # for n_clusters = 5
            elif i == 4:
                aux = grupos[i][j]
                if TAGs:
                    texto = "Aluno: <span>" + aux_usu[aux] + "</span> <br/> Posicionamento Inicial: " + posIni[aux]
                else:
                    texto = aux_usu[aux] + "#$#" + posIni[aux]
                grupo5.append(texto)
                indices.append(grupos[i][j])
            # for n_clusters = 6
            elif i == 5:
                aux = grupos[i][j]
                if TAGs:
                    texto = "Aluno: <span>" + aux_usu[aux] + "</span> <br/> Posicionamento Inicial: " + posIni[aux]
                else:
                    texto = aux_usu[aux] + "#$#" + posIni[aux]
                grupo6.append(texto)
                indices.append(grupos[i][j])

    if qtdeGrupos == 3:
        ind_aux = indices[:len(grupo1)]
        ind_aux2 = indices[len(ind_aux):len(ind_aux) + len(grupo2)]
        ind_aux3 = indices[len(ind_aux) + len(grupo2):]
    elif qtdeGrupos == 4:
        ind_aux = indices[:len(grupo1)]
        ind_aux2 = indices[len(grupo1):len(grupo1) + len(grupo2)]
        ind_aux3 = indices[len(grupo1) + len(grupo2):(len(grupo1) + len(grupo2)) + len(grupo3)]
        ind_aux4 = indices[(len(grupo1) + len(grupo2)) + len(grupo3):]
        print "GRUPOS", grupos
        print "INDICES", indices
    elif qtdeGrupos == 5:
        ind_aux = indices[:len(grupo1)]
        print "ind_aux", ind_aux
        print "len_g1", len(grupo1)
        ind_aux2 = indices[len(grupo1):len(grupo1) + len(grupo2)]
        print "ind_aux", ind_aux2
        print "len_g2", len(grupo2)
        ind_aux3 = indices[len(grupo1) + len(grupo2):(len(grupo1) + len(grupo2)) + len(grupo3)]
        print "ind_aux", ind_aux3
        print "len_g3", len(grupo3)
        ind_aux4 = indices[(len(grupo1) + len(grupo2) + len(grupo3)):(len(grupo1) + len(grupo2) + len(grupo3)) + len(grupo4)]
        print "ind_aux", ind_aux4
        print "len_g4", len(grupo4)
        ind_aux5 = indices[(len(grupo1) + len(grupo2) + len(grupo3)) + len(grupo4):]
        print "ind_aux", ind_aux5
        print "len_g5", len(grupo5)
    elif qtdeGrupos == 6:
        ind_aux = indices[:len(grupo1)]
        print "ind_aux", ind_aux
        print "len_g1", len(grupo1)
        ind_aux2 = indices[len(grupo1):len(grupo1) + len(grupo2)]
        print "ind_aux", ind_aux2
        print "len_g2", len(grupo2)
        ind_aux3 = indices[len(grupo1) + len(grupo2):(len(grupo1) + len(grupo2)) + len(grupo3)]
        print "ind_aux", ind_aux3
        print "len_g3", len(grupo3)
        ind_aux4 = indices[(len(grupo1) + len(grupo2) + len(grupo3)):(len(grupo1) + len(grupo2) + len(grupo3)) + len(grupo4)]
        print "ind_aux", ind_aux4
        print "len_g4", len(grupo4)
        ind_aux5 = indices[(len(grupo1) + len(grupo2) + len(grupo3)) + len(grupo4):(len(grupo1) + len(grupo2) + len(grupo3) + len(grupo4)) + len(grupo5)]
        print "ind_aux", ind_aux5
        print "len_g5", len(grupo5)
        ind_aux6 = indices[(len(grupo1) + len(grupo2) + len(grupo3) + len(grupo4)) + len(grupo5):]
        print "ind_aux", ind_aux6
        print "len_g6", len(grupo6)
    else:
        print "ERRO"
        exit()

    # ##########################################################################################
    # ### PRINTING THE GROUPS TO THE CONSOLE - FOR CHECKING (COMMENT OUT LATER)              ###
    # ##########################################################################################
    #
    # ##########################################################################################
    # ## USED TO VALIDATE THE COMPUTED RESULT AND PRINT IT                                    ##
    # ##########################################################################################
    # test_set = st_posInicial
    # train_set = st_tese
    # vectorizer = CountVectorizer()
    # vectorizer.fit_transform(train_set)
    # count_vectorizer = CountVectorizer()
    # count_vectorizer.fit_transform(train_set)
    # count_vectorizer.vocabulary_
    # freq_term_matrix = count_vectorizer.transform(test_set)
    # tfidf = TfidfTransformer(norm="l2")
    # tfidf.fit(freq_term_matrix)
    # tf_idf_matrix = tfidf.transform(freq_term_matrix)
    # ##########################################################################################
    #
    # print "grupo 1", len(grupo1)
    # cos = []
    # lsaPosIni = []
    # lsaUsu = []
    #
    # for y in range(len(ind_aux)):
    #     print "posIni[y]", aux_usu[ind_aux[y]], posIni[ind_aux[y]]
    #     lsaPosIni.append(posIni[ind_aux[y]])
    #     lsaUsu.append(aux_usu[ind_aux[y]])
    #     for x in range(y+1, len(ind_aux)):
    #         num1 = ind_aux[y]
    #         num2 = ind_aux[x]
    #         cos.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
    #         euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2], squared=True)
    #         print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
    #         print "euc", euc
    #
    # simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
    # print "simLSA"
    # pprint(sorted(simLSA, reverse=True))
    #
    # simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
    # print "simLSA1"
    # pprint(sorted(simLSA1, reverse=True))
    # print "cos", cos
    # print "len_cos", len(cos)
    # sum_cos = 0
    #
    # if len(cos) != 0:
    #     for i in cos:
    #         sum_cos = i + sum_cos
    #
    #     print "media = ", sum_cos / len(cos)
    # else:
    #     print "sem média"
    #
    # ##########################################################################################
    # print "grupo 2", len(grupo2)
    # cos2 = []
    # lsaPosIni = []
    # lsaUsu = []
    # print lsaPosIni
    # print lsaUsu
    #
    # for y in range(len(ind_aux2)):
    #     lsaPosIni.append(posIni[ind_aux2[y]])
    #     lsaUsu.append(aux_usu[ind_aux2[y]])
    #     for x in range(y+1, len(ind_aux2)):
    #         num1 = ind_aux2[y]
    #         num2 = ind_aux2[x]
    #         cos2.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
    #         euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2], squared=True)
    #         print aux_usu[num1], aux_usu[num2]
    #         print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
    #         print "euc", euc
    # print "cos", cos2
    # print "len_cos", len(cos2)
    # simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
    # print "simLSA"
    # pprint(sorted(simLSA, reverse=True))
    #
    # simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
    # print "simLSA1"
    # pprint(sorted(simLSA1, reverse=True))
    #
    # sum_cos = 0
    # if len(cos2) != 0:
    #     for i in cos2:
    #         sum_cos = i + sum_cos
    #     print "media = ", sum_cos / len(cos2)
    # else:
    #     print "sem média"
    #
    # ##########################################################################################
    # print "grupo 3", len(grupo3)
    # cos3 = []
    # lsaPosIni = []
    # lsaUsu = []
    # print lsaPosIni
    # print lsaUsu
    #
    # for y in range(len(ind_aux3)):
    #     lsaPosIni.append(posIni[ind_aux3[y]])
    #     lsaUsu.append(aux_usu[ind_aux3[y]])
    #     for x in range(y+1, len(ind_aux3)):
    #         num1 = ind_aux3[y]
    #         num2 = ind_aux3[x]
    #         cos3.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
    #         euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2], squared=True)
    #         print aux_usu[num1], aux_usu[num2]
    #         print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
    #         print "euc", euc
    #
    # print "cos", cos3
    # print "len_cos", len(cos3)
    #
    # simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
    # print "simLSA"
    # pprint(sorted(simLSA, reverse=True))
    #
    # simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
    # print "simLSA1"
    # pprint(sorted(simLSA1, reverse=True))
    #
    # sum_cos = 0
    # if len(cos3) != 0:
    #     for i in cos3:
    #         sum_cos = i + sum_cos
    #     print "media = ", sum_cos / len(cos3)
    # else:
    #     print "sem média"
    #
    # ##########################################################################################
    # print "grupo 4", len(grupo4)
    # cos4 = []
    # lsaPosIni = []
    # lsaUsu = []
    # print lsaPosIni
    # print lsaUsu
    # for y in range(len(ind_aux4)):
    #     lsaPosIni.append(posIni[ind_aux4[y]])
    #     lsaUsu.append(aux_usu[ind_aux4[y]])
    #     for x in range(y+1, len(ind_aux4)):
    #         num1 = ind_aux4[y]
    #         num2 = ind_aux4[x]
    #         cos4.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
    #         euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2], squared=True)
    #         print aux_usu[num1], aux_usu[num2]
    #         print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
    #         print "euc", euc
    #
    # print "cos", cos4
    # print "len_cos", len(cos4)
    # simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
    # print "simLSA"
    # pprint(sorted(simLSA, reverse=True))
    #
    # simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
    # print "simLSA1"
    # pprint(sorted(simLSA1, reverse=True))
    #
    # sum_cos = 0
    # if len(cos4) != 0:
    #     for i in cos4:
    #         sum_cos = i + sum_cos
    #     print "media = ", sum_cos / len(cos4)
    # else:
    #     print "sem média"
    #
    # ##########################################################################################
    # print "grupo 5", len(grupo5)
    # cos5 = []
    # lsaPosIni = []
    # lsaUsu = []
    # print lsaPosIni
    # print lsaUsu
    #
    # for y in range(len(ind_aux5)):
    #     lsaPosIni.append(posIni[ind_aux5[y]])
    #     lsaUsu.append(aux_usu[ind_aux5[y]])
    #     for x in range(y+1, len(ind_aux5)):
    #         num1 = ind_aux5[y]
    #         num2 = ind_aux5[x]
    #         cos5.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
    #         euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2], squared=True)
    #         print aux_usu[num1], aux_usu[num2]
    #         print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
    #         print "euc", euc
    #
    # print "cos", cos5
    # print "len_cos", len(cos5)
    # simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
    # print "simLSA"
    # pprint(sorted(simLSA, reverse=True))
    #
    # simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
    # print "simLSA1"
    # pprint(sorted(simLSA1, reverse=True))
    #
    # sum_cos = 0
    # if len(cos5) != 0:
    #     for i in cos5:
    #         sum_cos = i + sum_cos
    #     print "media = ", sum_cos / len(cos5)
    # else:
    #     print "sem média"
    #
    # ##########################################################################################
    # print "grupo 6", len(grupo6)
    # cos6 = []
    # lsaPosIni = []
    # lsaUsu = []
    #
    # for y in range(len(ind_aux6)):
    #     lsaPosIni.append(posIni[ind_aux6[y]])
    #     lsaUsu.append(aux_usu[ind_aux6[y]])
    #     for x in range(y+1, len(ind_aux6)):
    #         num1 = ind_aux6[y]
    #         num2 = ind_aux6[x]
    #         cos6.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
    #         euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2], squared=True)
    #         print aux_usu[num1], aux_usu[num2]
    #         print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
    #         print "euc", euc
    #
    # print "cos", cos6
    # print "len_cos", len(cos6)
    # simLSA = similaridade_lsa(treinamento, lsaUsu, lsaPosIni)
    # print "simLSA"
    # pprint(sorted(simLSA, reverse=True))
    #
    # simLSA1 = similaridade_lsa(posIni, lsaUsu, lsaPosIni)
    # print "simLSA1"
    # pprint(sorted(simLSA1, reverse=True))
    #
    # sum_cos = 0
    # if len(cos6) != 0:
    #     for i in cos6:
    #         sum_cos = i + sum_cos
    #     print "media = ", sum_cos / len(cos6)
    # else:
    #     print "sem média"
    ##########################################################################################

    fim = datetime.now()
    print fim, "gruposArgumentacao"

    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('gruposArgumentacao.out', type='callgrind')

    return grupo1, grupo2, grupo3, grupo4, grupo5, grupo6, tese
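############################################################################################
# Illustrative usage sketch (not part of the original pipeline). gruposArgumentacao expects
# auxResult in the order produced by clusterArgInicial/clusterFinal:
# [st_tese, posInicial, sw_tese, aux_usu, st_posInicial, tese, posInicial_Normalizado].
# The fake data below only exercises approach (1) (LSA=True, Normalizacao=False); the
# function still reads ../arquivos/baseTreinamento.txt, so that file must exist. All names
# and sentences are made-up placeholders.
############################################################################################
def _exemplo_gruposArgumentacao():
    posIni = ["o aluno concorda com a tese e apresenta argumentos claros",
              "o aluno concorda com a tese mas apresenta poucos argumentos",
              "o aluno discorda da tese e questiona os argumentos apresentados",
              "o aluno discorda da tese e apresenta novos argumentos"]
    nomes = ["ALUNO A", "ALUNO B", "ALUNO C", "ALUNO D"]
    tese = ["texto da tese em debate"]
    auxResult = [tese, posIni, tese, nomes, posIni, tese, posIni]
    grupo1, grupo2, grupo3, grupo4, grupo5, grupo6, tese_ret = gruposArgumentacao(
        auxResult, qtdeGrupos=3, LSA=True, Normalizacao=False)
    print len(grupo1), len(grupo2), len(grupo3)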
def clusterFinal(idtese): #Variaveis e funçoes para conexação com o banco de dados do Debate de Teses cursor = connection.cursor() cursor2 = connection.cursor() cursor.execute("select distinct `usr`.`primeironome` as `name`, `pos`.`posicionamentofinal` AS `posicionamentofinal` from ((((`argumento` `arg` join `revisao` `rev`) join `replica` `rep`) join `posicionamento` `pos`) join `argumentador` `urg`)join `usuario` `usr` where ((`arg`.`tese_idtese` = " + idtese + " ) and (`rev`.`argumento_idargumento` = `arg`.`idargumento`) and (`rep`.`revisao_idrevisao` = `rev`.`idrevisao`) and (`arg`.`argumentador_idargumentador` = `pos`.`argumentador_idargumentador`) and (`arg`.`tese_idtese` = `pos`.`tese_idtese`) and (`arg`.`posicionamentoinicial` is not null) and (`arg`.`argumentador_idargumentador` = `urg`.`idargumentador`) and(`urg`.`usuario_idusuario` = `usr`.`idusuario`) and (`pos`.`posicionamentofinal` is not null))") cursor2.execute("select tese from tese where grupo_idgrupo = 1064 ") #Variavel e função para tratar tags html e acentos com codificação ISO h = HTMLParser.HTMLParser() #dados retirados da consulta ao banco dadosSql = cursor.fetchall() textotese = cursor2.fetchall() #listas para tratar os dados iniciais usu = [] posInicial = [] dados = [] tese = [] #lista com dados pos tagger tag_posInicial = [] tag_comAce_posInicial = [] #lista com dados após a remoção das stopwords sw_tese = [] sw_posInicial = [] aux_usu = [] sw_tagPosInicial = [] #texto marcado e sem stopwords sw_tagcomAce_posInicial = [] #texto COM ACENTOS marcado e sem stopwords #lista com dados após a aplicação de Stemming st_posInicial = [] st_tese = [] st_tagPosInicial = [] #texto marcado, sem stopwords e com stemmer aplicado st_tagcomAce_posInicial = [] #texto COM ACENTOS marcado, sem stopwords e com stemmer aplicado ############################################################################################################# #LISTA COM OS POSICIONAMENTOS INICIAIS APÓS APLICAÇÃO DA NORMALIZAÇAÕ posInicial_Normalizado = [] normalizacao = [] ############################################################################################################# #Aplicacao de Case Folding for d in dadosSql: dados.append([re.sub('<[^>]*>', '', h.unescape(d[0])).lower(), re.sub('<[^>]*>', '', h.unescape(d[1])).lower()]) for t in textotese: tese.append(re.sub('<[^>]*>', '', h.unescape(t[0])).lower()) #Colocando os textos de posicionamento inicial em numa lista separada for i in dados: x = 0 usu.append(i[x].upper()) posInicial.append(i[x+1].lower()) #lista com o posicionamento Inicial com todas as letras em minusculo ############################################################################################################# ### Classificacao das palavras de acordo com sua classe gramatical ### Utilizacao do postagger NLPNET ### http://nilc.icmc.usp.br/nlpnet/index.html# tagger = nlpnet.POSTagger() semAce_posInicial = [] #armazena o posInicial apenas sem acentos, sem pontuações, sem endereço web e sem numeros comAce_posInicial = [] #armazena o posInicial apenas COM acentos, sem pontuações, sem endereço web e sem numeros for i in posInicial: semAce_posInicial.append(removePontuacao(removeA(removeNum(removeSE(removeEndWeb((i))))))) for i in semAce_posInicial: tag_posInicial.append(tagger.tag(i)) for i in posInicial: comAce_posInicial.append(removePontuacao(removeNum(removeSE(removeEndWeb((i)))))) for i in comAce_posInicial: tag_comAce_posInicial.append(tagger.tag(i)) 
    #############################################################################
    # Debug output kept for the dissertation experiments (disabled):
    # pprint(semAce_posInicial)
    # pprint(comAce_posInicial)
    # pprint(posInicial)
    #############################################################################

    #############################################################################
    # STOPWORD REMOVAL
    # Removes the NLTK stopword list and the terms tagged as articles,
    # verbs, adverbs, etc.
    #############################################################################
    for i in usu:
        aux_usu.append(removeStopWords(i))
    for i in tese:
        sw_tese.append(removeStopWords(i))
    for i in posInicial:
        sw_posInicial.append(removeStopWords(i))
    for i in tag_posInicial:
        sw_tagPosInicial.append(limpaCorpus(i))
    for i in tag_comAce_posInicial:
        sw_tagcomAce_posInicial.append(limpaCorpus(i))

    ####################################################################################
    # RSLP stemmer: strips the Portuguese affixes from the posInicial and tese texts
    ####################################################################################
    stemmer = RSLPStemmer()

    for texto in sw_posInicial:
        string_aux = ""
        for palavra in texto.split():
            string_aux = string_aux + " " + stemmer.stem(palavra)
        st_posInicial.append(string_aux)

    for texto in sw_tese:
        string_aux = ""
        for palavra in texto.split():
            string_aux = string_aux + " " + stemmer.stem(palavra)
        st_tese.append(string_aux)

    for documento in sw_tagPosInicial:
        auxST = []
        for item in documento:
            auxST.append((stemmer.stem(item[0]), item[1]))
        st_tagPosInicial.append(auxST)

    for documento in sw_tagcomAce_posInicial:
        auxST = []
        for item in documento:
            auxST.append((stemmer.stem(item[0]), item[1]))
        st_tagcomAce_posInicial.append(auxST)

    ####################################################################################
    # TERM NORMALIZATION: synonymous words (words with similar meaning) are replaced
    # by a single representative term in the corpus under analysis. This raises the
    # similarity between the analysed texts when statistical measures such as cosine
    # similarity or Euclidean distance are applied.
    ####################################################################################
    # The normalization was built on the data released by the TEP 2.0 project
    # (NILC/USP): http://143.107.183.175:21480/tep2/index.htm
    #
    # File format:
    #   NUM1. [Type] {synonymous terms} <NUM2>
    #   263. [Verbo] {consentir, deixar, permitir} <973>
    # NUM1 = line number used as reference for the synonym set
    # NUM2 = line number of the antonym set (opposite meaning)
    ####################################################################################

    # Opens the file with the synonymy (wordNet lines) and antonymy relations;
    # the file only contains terms classified as nouns, adjectives and verbs.
    base_tep = codecs.open(os.path.join(os.path.dirname(__file__), '../base_tep2/base_tep.txt'), 'r', 'UTF8')
    # dicionario = open('/home/panceri/git/alpes_v1/base_tep2/dicionarioSinonimos.txt', 'w')

    # Whole file kept in memory -- do not print this variable, it is VERY large!
    wordNet = base_tep.readlines()
    base_tep.close()

    ####################################################################################
    # The normalization works on the formation radicals of the words: RSLP is applied
    # first and then the similar terms are looked up in the base. Inside base_tep the
    # terms were also reduced to their radicals, and the dictionary keeps a reference
    # to the line where the synonyms live. Terms are analysed with their accents so
    # that RSLP is applied correctly.
    ####################################################################################
    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()

    st_WordNetV = []  # number, type and stemmed synonyms - VERBS only
    st_WordNetN = []  # number, type and stemmed synonyms - NOUNS only
    st_WordNetA = []  # number, type and stemmed synonyms - ADJECTIVES only
    st_WordNetO = []  # number, type and stemmed synonyms - everything else

    for linhaWordnet in wordNet:
        termos = re.findall(r"\{(.*)\}", linhaWordnet)
        num = re.findall(r"([0-9]+)\.", linhaWordnet)
        tipo = re.findall(r"\[(.*)\]", linhaWordnet)

        listaAux = [num, tipo]
        for T in termos:
            auxL = []
            for palavra in T.split():
                auxL.append(stemmer.stem(palavra.replace(",", "")))
            listaAux.append(auxL)

        if tipo[0] == "Substantivo":
            st_WordNetN.append(listaAux)
        elif tipo[0] == "Verbo":
            st_WordNetV.append(listaAux)
        elif tipo[0] == "Adjetivo":
            st_WordNetA.append(listaAux)
        else:
            st_WordNetO.append(listaAux)

    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('stemmWordNet.out', type='callgrind')

    ####################################################################################
    # The analysis is done on the text WITHOUT removing the accents: stripping them
    # first would hurt the reduction to the formation radical (RSLP). The tests showed
    # this is the better approach, since the texts are short and we need to get as
    # close as possible without considering sense or context.
    ####################################################################################
    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()

    normalizacao = normalizacaoWordnet(st_WordNetA, st_WordNetN, st_WordNetV, st_WordNetO, st_tagcomAce_posInicial)

    ###############################################################
    # Flattens the normalized texts into a 1-dimensional list
    ###############################################################
    for documento in normalizacao:
        stringNorm = ""
        for termo in documento:
            stringNorm = stringNorm + " " + termo
        posInicial_Normalizado.append(stringNorm)

    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('normalizacaoWordnet.out', type='callgrind')

    ####################################################################################
    # Debug prints kept for reference (disabled):
    # pprint(posInicial)
    # pprint(comAce_posInicial)
    # pprint(tag_comAce_posInicial)
    # pprint(sw_tagcomAce_posInicial)
    # pprint(st_tagcomAce_posInicial)
    # pprint(posInicial_Normalizado)
    ####################################################################################
    return [st_tese, posInicial, sw_tese, aux_usu, st_posInicial, tese, posInicial_Normalizado]
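# Sketch of how the preprocessing pipeline above is typically consumed; the thesis id
# is hypothetical and the returned list feeds gruposArgumentacao() further below:
#
#   resultado = clusterFinal("42")              # "42" is an illustrative tese id
#   st_tese, posInicial = resultado[0], resultado[1]
#   posInicial_Normalizado = resultado[6]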
def normalizacao(dicSin, termo, radical, etiqueta):
    # Local variables
    SA_wordnet = []      # wordnet lines with the accents removed
    listaTodosSin = []   # every synonymous term found
    listaNumRef = []     # line numbers referencing the synonym sets

    # Opens the file with the synonymy (wordNet lines) and antonymy relations
    base_tep = codecs.open(os.path.join(os.path.dirname(__file__), '../../base_tep2/base_tep.txt'), 'r', 'UTF8')
    dicionario = open(os.path.join(os.path.dirname(__file__), '../../base_tep2/dicionarioSinonimos.txt'), 'w')

    # Whole file kept in memory -- do not print this variable, it is VERY large!
    wordNet = base_tep.readlines()
    base_tep.close()

    # Removes the accents from the base
    for i in wordNet:
        SA_wordnet.append(removeA(i))

    # Lookup is done through the stemmed radical
    stemmer = RSLPStemmer()

    # Searches the term inside the file, stores the term as a dictionary key and the
    # wordNet line references / synonym lists as its value.
    if etiqueta == "N":
        for linhaWordNet in SA_wordnet:
            if linhaWordNet.find("[Substantivo]") >= 0 and linhaWordNet.find(termo) >= 0:
                listaSinonimos = re.findall('{[^}]*}', linhaWordNet)
                for palavraSinonima in listaSinonimos:
                    numRefSin = re.findall('^[0-9]*.', linhaWordNet)        # reference number of the synonym line
                    sa_palavraSinonima = removePontuacao(palavraSinonima)   # synonyms without the braces
                    for termSinWordNet in sa_palavraSinonima.split():
                        if radical == stemmer.stem(termSinWordNet):
                            listaNumRef.append(numRefSin)
                            listaTodosSin.append(termSinWordNet)
                            dicSin[termo] = listaNumRef, listaTodosSin
    elif etiqueta == "ADJ":
        for linhaWordNet in wordNet:
            if linhaWordNet.find("[Adjetivo]") >= 0 and linhaWordNet.find(termo) >= 0:
                listaSinonimos = re.findall('{[^}]*}', linhaWordNet)
                for palavraSinonima in listaSinonimos:
                    numRefSin = re.findall('^[0-9]*.', linhaWordNet)
                    sa_palavraSinonima = removePontuacao(palavraSinonima)
                    for termSinWordNet in sa_palavraSinonima.split():
                        if radical == stemmer.stem(termSinWordNet):
                            listaNumRef.append(numRefSin)
                            listaTodosSin.append(sa_palavraSinonima)
                            dicSin[termo] = listaNumRef, listaTodosSin
    elif etiqueta == "V" or etiqueta == "VAUX":
        for linhaWordNet in wordNet:
            if linhaWordNet.find("[Verbo]") >= 0 and linhaWordNet.find(termo) >= 0:
                listaSinonimos = re.findall('{[^}]*}', linhaWordNet)
                for palavraSinonima in listaSinonimos:
                    numRefSin = re.findall('^[0-9]*.', linhaWordNet)
                    sa_palavraSinonima = removePontuacao(palavraSinonima)
                    for termSinWordNet in sa_palavraSinonima.split():
                        if radical == stemmer.stem(termSinWordNet):
                            listaNumRef.append(numRefSin)
                            listaTodosSin.append(sa_palavraSinonima)
                            dicSin[termo] = listaNumRef
    else:
        # Handles adverbs and any remaining tags
        for linhaWordNet in wordNet:
            if linhaWordNet.find(termo) >= 0:
                listaSinonimos = re.findall('{[^}]*}', linhaWordNet)
                for palavraSinonima in listaSinonimos:
                    numRefSin = re.findall('^[0-9]*.', linhaWordNet)
                    sa_palavraSinonima = removePontuacao(palavraSinonima)
                    for termSinWordNet in sa_palavraSinonima.split():
                        if radical == stemmer.stem(termSinWordNet):
                            listaNumRef.append(numRefSin)
                            listaTodosSin.append(sa_palavraSinonima)
                            dicSin[termo] = listaNumRef

    # Writes the synonym dictionary to a text file, one entry per line
    listaux = []
    for termo, listaNumRef in dicSin.items():
        temp = '{}: {}'.format(termo, listaNumRef)
        listaux.append(temp)
        dicionario.write(temp + '\n')
    dicionario.close()
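# Sketch of a single lookup with the function above, assuming the base_tep2 files are
# present at the relative paths used in the code; the term, radical and tag values are
# illustrative only:
#
#   dicSin = {}
#   normalizacao(dicSin, "permitir", RSLPStemmer().stem("permitir"), "V")
#   # dicSin now maps "permitir" to the base_tep line references of its synonym sets,
#   # and dicionarioSinonimos.txt receives one "termo: referencias" entry per key.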
def gruposArgumentacao(auxResult, qtdeGrupos=3, LSA=None, Normalizacao=True, TAGs=True):
    inicio = datetime.now()
    print inicio, "gruposArgumentacao"

    yappi.set_clock_type('cpu')
    yappi.start(builtins=True)
    start = time.time()

    grupos = []
    tese = auxResult[5]
    posInicial_Normalizado = auxResult[6]

    ## dicSin holds the dictionary with the already related synonymous terms: it links the words
    ## typed by the students to the wordnet file, highlights the synonymy relations and keeps the
    ## stemmed radical of each term bound to the line numbers of its similars in the wordnet.
    st_tese = auxResult[0]        # thesis text after stemming
    posIni = auxResult[1]         # original text of the argumentation
    sw_tese = auxResult[2]
    aux_usu = auxResult[3]
    st_posInicial = auxResult[4]

    base_treinamento = codecs.open(
        os.path.join(os.path.dirname(__file__), '../arquivos/baseTreinamento.txt'), 'r', 'UTF8')
    treinamento = [removeA(removePontuacao(i)) for i in base_treinamento]
    # TODO: take this data from the interface (text box), or accept a .txt upload
    # and use it as the training base.
    base_treinamento.close()

    ##########################################################################################
    ### APPROACH (1): USE THE ARGUMENTS THEMSELVES TO BUILD THE LSA DICTIONARIES           ###
    ##########################################################################################
    # Training base made of the students' argumentations
    if LSA == True and Normalizacao == False:
        print "if LSA == True and Normalizacao == False:"
        if qtdeGrupos in (3, 4, 5, 6):
            grupos = LSA_Kmeans(clusters=qtdeGrupos, textoTreinamento=posIni,
                                nomeUsuarios=aux_usu, textoComparacao=posIni)
        else:
            print "ERRO"

    ###########################################################################################
    ### APPROACH (2): USE OTHER TEXTS TO BUILD THE LSA DICTIONARIES                         ###
    ###########################################################################################
    # Training base made of didactic material suggested by the teacher
    elif LSA == False and Normalizacao == False:
        print "elif LSA == False and Normalizacao == False:"
        if qtdeGrupos in (3, 4, 5, 6):
            grupos = LSA_Kmeans(clusters=qtdeGrupos, textoTreinamento=treinamento,
                                nomeUsuarios=aux_usu, textoComparacao=posIni)
        else:
            print "ERRO"
            exit()

    #######################################################################################################
    ### APPROACH (3): USE THE NORMALIZED ARGUMENTS TO BUILD THE LSA DICTIONARIES                        ###
    #######################################################################################################
    # Training base made of the students' normalized argumentations
    elif LSA == True and Normalizacao == True:
        print "elif LSA == True and Normalizacao == True:"
        if qtdeGrupos in (3, 4, 5, 6):
            grupos = LSA_Kmeans(clusters=qtdeGrupos, textoTreinamento=posInicial_Normalizado,
                                nomeUsuarios=aux_usu, textoComparacao=posInicial_Normalizado)
        else:
            print "ERRO"
            exit()

    ##########################################################################################
    ### APPROACH (4): PLAIN K-MEANS OVER TF-IDF                                            ###
    ##########################################################################################
    elif LSA == None and Normalizacao == False:
        print "elif LSA == None and Normalizacao == False:"
        test_set = st_posInicial
        train_set = st_tese

        ### TF-IDF computed with the thesis and posInicial, using scikit-learn
        vectorizer = CountVectorizer()
        vectorizer.fit_transform(test_set)
        count_vectorizer = CountVectorizer()
        count_vectorizer.fit_transform(train_set)
        freq_term_matrix = count_vectorizer.transform(test_set)
        tfidf = TfidfTransformer(norm="l2")
        tfidf.fit(freq_term_matrix)
        tf_idf_matrix = tfidf.transform(freq_term_matrix)

        if qtdeGrupos in (3, 4, 5, 6):
            grupos = tfIdf_Kmeans(st_posInicial, qtdeGrupos)
        else:
            print "ERRO"
            exit()

    ##########################################################################################
    ### APPROACH (5): PLAIN K-MEANS OVER TF-IDF, WITH NORMALIZED DATA                      ###
    ##########################################################################################
    ### Computed on the normalized texts!
    elif LSA == None and Normalizacao == True:
        print "elif LSA == None and Normalizacao == True:"
        test_set = posInicial_Normalizado
        train_set = st_tese

        ### TF-IDF computed with the thesis and the normalized posInicial, using scikit-learn
        vectorizer = CountVectorizer()
        vectorizer.fit_transform(test_set)
        count_vectorizer = CountVectorizer()
        count_vectorizer.fit_transform(train_set)
        freq_term_matrix = count_vectorizer.transform(test_set)
        tfidf = TfidfTransformer(norm="l2")
        tfidf.fit(freq_term_matrix)
        tf_idf_matrix = tfidf.transform(freq_term_matrix)

        if qtdeGrupos in (3, 4, 5, 6):
            grupos = tfIdf_Kmeans(posInicial_Normalizado, qtdeGrupos)
        else:
            print "ERRO"
            exit()

    ##########################################################################################
    ### RESULTS - INDEPENDENT OF THE CHOSEN APPROACH                                       ###
    ##########################################################################################
    grupo1 = []
    grupo2 = []
    grupo3 = []
    grupo4 = []
    grupo5 = []
    grupo6 = []
    indices = []
    ind_aux = 0
    ind_aux2 = 0
    ind_aux3 = 0
    ind_aux4 = 0
    ind_aux5 = 0
    ind_aux6 = 0

    # Distributes each clustered argument into grupo1..grupo6 (one list per cluster label)
    # and records the original indices in the same order.
    listas_grupos = [grupo1, grupo2, grupo3, grupo4, grupo5, grupo6]
    for i in range(len(grupos)):
        for j in range(len(grupos[i])):
            aux = grupos[i][j]
            if TAGs:
                texto = "Aluno: <span>" + aux_usu[aux] + "</span> <br/> Posicionamento Inicial: " + posIni[aux]
            else:
                texto = aux_usu[aux] + "#$#" + posIni[aux]
            listas_grupos[i].append(texto)
            indices.append(grupos[i][j])

    # Splits the flat `indices` list back into one index list per group.
    if qtdeGrupos == 3:
        ind_aux = indices[:len(grupo1)]
        ind_aux2 = indices[len(ind_aux):len(ind_aux) + len(grupo2)]
        ind_aux3 = indices[len(ind_aux) + len(grupo2):]
    elif qtdeGrupos == 4:
        ind_aux = indices[:len(grupo1)]
        ind_aux2 = indices[len(grupo1):len(grupo1) + len(grupo2)]
        ind_aux3 = indices[len(grupo1) + len(grupo2):(len(grupo1) + len(grupo2)) + len(grupo3)]
        ind_aux4 = indices[(len(grupo1) + len(grupo2)) + len(grupo3):]
        print "GRUPOS", grupos
        print "INDICES", indices
    elif qtdeGrupos == 5:
        ind_aux = indices[:len(grupo1)]
        print "ind_aux", ind_aux
        print "len_g1", len(grupo1)
        ind_aux2 = indices[len(grupo1):len(grupo1) + len(grupo2)]
        print "ind_aux", ind_aux2
        print "len_g2", len(grupo2)
        ind_aux3 = indices[len(grupo1) + len(grupo2):(len(grupo1) + len(grupo2)) + len(grupo3)]
        print "ind_aux", ind_aux3
        print "len_g3", len(grupo3)
        ind_aux4 = indices[(len(grupo1) + len(grupo2) + len(grupo3)):(len(grupo1) + len(grupo2) + len(grupo3)) + len(grupo4)]
        print "ind_aux", ind_aux4
        print "len_g4", len(grupo4)
        ind_aux5 = indices[(len(grupo1) + len(grupo2) + len(grupo3)) + len(grupo4):]
        print "ind_aux", ind_aux5
        print "len_g5", len(grupo5)
    elif qtdeGrupos == 6:
        ind_aux = indices[:len(grupo1)]
        print "ind_aux", ind_aux
        print "len_g1", len(grupo1)
        ind_aux2 = indices[len(grupo1):len(grupo1) + len(grupo2)]
        print "ind_aux", ind_aux2
        print "len_g2", len(grupo2)
        ind_aux3 = indices[len(grupo1) + len(grupo2):(len(grupo1) + len(grupo2)) + len(grupo3)]
        print "ind_aux", ind_aux3
        print "len_g3", len(grupo3)
        ind_aux4 = indices[(len(grupo1) + len(grupo2) + len(grupo3)):(len(grupo1) + len(grupo2) + len(grupo3)) + len(grupo4)]
        print "ind_aux", ind_aux4
        print "len_g4", len(grupo4)
        ind_aux5 = indices[(len(grupo1) + len(grupo2) + len(grupo3)) + len(grupo4):(len(grupo1) + len(grupo2) + len(grupo3) + len(grupo4)) + len(grupo5)]
        print "ind_aux", ind_aux5
        print "len_g5", len(grupo5)
        ind_aux6 = indices[(len(grupo1) + len(grupo2) + len(grupo3) + len(grupo4)) + len(grupo5):]
        print "ind_aux", ind_aux6
        print "len_g6", len(grupo6)
    else:
        print "ERRO"
        exit()

    ##########################################################################################
    ### GROUP PRINTOUTS ON THE CONSOLE - FOR CHECKING ONLY (kept commented out).           ###
    ### Recomputes TF-IDF over st_tese/st_posInicial and, for every pair inside each       ###
    ### group, prints the cosine similarity, the Euclidean distance and the LSA            ###
    ### similarities, plus the average cosine of the group.                                ###
    ##########################################################################################
    # test_set = st_posInicial
    # train_set = st_tese
    # count_vectorizer = CountVectorizer()
    # count_vectorizer.fit_transform(train_set)
    # freq_term_matrix = count_vectorizer.transform(test_set)
    # tfidf = TfidfTransformer(norm="l2")
    # tfidf.fit(freq_term_matrix)
    # tf_idf_matrix = tfidf.transform(freq_term_matrix)
    #
    # for numero, lista_ind in enumerate([ind_aux, ind_aux2, ind_aux3, ind_aux4, ind_aux5, ind_aux6][:qtdeGrupos]):
    #     print "grupo", numero + 1, len(lista_ind)
    #     cos = []
    #     lsaPosIni = []
    #     lsaUsu = []
    #     for y in range(len(lista_ind)):
    #         lsaPosIni.append(posIni[lista_ind[y]])
    #         lsaUsu.append(aux_usu[lista_ind[y]])
    #         for x in range(y + 1, len(lista_ind)):
    #             num1 = lista_ind[y]
    #             num2 = lista_ind[x]
    #             cos.append(cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2]))
    #             euc = euclidean_distances(tf_idf_matrix[num1], tf_idf_matrix[num2], squared=True)
    #             print aux_usu[num1], aux_usu[num2]
    #             print "cosine", cosine_similarity(tf_idf_matrix[num1], tf_idf_matrix[num2])
    #             print "euc", euc
    #     print "simLSA"
    #     pprint(sorted(similaridade_lsa(treinamento, lsaUsu, lsaPosIni), reverse=True))
    #     print "simLSA1"
    #     pprint(sorted(similaridade_lsa(posIni, lsaUsu, lsaPosIni), reverse=True))
    #     print "cos", cos
    #     print "len_cos", len(cos)
    #     sum_cos = 0
    #     if len(cos) != 0:
    #         for c in cos:
    #             sum_cos = c + sum_cos
    #         print "media = ", sum_cos / len(cos)
    #     else:
    #         print "sem média"
    ##########################################################################################

    fim = datetime.now()
    print fim, "gruposArgumentacao"
    duration = time.time() - start
    stats = yappi.get_func_stats()
    stats.save('gruposArgumentacao.out', type='callgrind')

    return grupo1, grupo2, grupo3, grupo4, grupo5, grupo6, tese
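# Sketch of running the whole pipeline for one thesis; the thesis id and argument names
# are illustrative, and the HTML strings in each group are meant for the web layer:
#
#   auxResult = clusterFinal("42")
#   g1, g2, g3, g4, g5, g6, tese = gruposArgumentacao(auxResult, qtdeGrupos=3, LSA=True,
#                                                     Normalizacao=True, TAGs=False)
#   for grupo in (g1, g2, g3):
#       for linha in grupo:
#           nome, argumento = linha.split("#$#")   # format used when TAGs=False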