def api():
    """Build the chatbot's JSON reply from the current request's form data.

    Two optional form fields are read from ``request.form``:

    * ``api_response`` -- a key; its value in ``app_dict`` is echoed back.
    * ``input`` -- the user's message. It is sent to the remote UDPipe
      service, parsed into an ``estrutura_ud.Corpus`` and answered by the
      first source that yields a non-empty reply:

      1. a canned reply from ``responses`` when every query of one of the
         pattern's query groups in ``linguistic_patterns`` has matches;
      2. a two-sentence Wikipedia summary for the noun/proper-noun lemma
         with the lowest value in ``frequency_of_important_words``;
      3. a random quote scraped from pensador.com for the verb/copula
         lemma with the lowest value in ``frequency_of_important_words``;
      4. a fixed apology message.

    Failed Wikipedia or pensador lookups are treated as "no answer from
    this source" so the next fallback still runs.

    Returns:
        The ``jsonify`` response holding ``api_response`` and
        ``bot_response``, with ``Access-Control-Allow-Origin: *`` added.
    """
    # Function-scope import so the block does not depend on a module-level one.
    from urllib.error import URLError

    api_response = ""
    bot_response = ""

    # PARSE THE USER INPUT AND LOOK FOR PATTERNS
    requested_key = request.form.get('api_response')
    if requested_key:
        api_response = app_dict[requested_key]

    user_input = request.form.get('input')
    if user_input:
        udpipe_url = "http://lindat.mff.cuni.cz/services/udpipe/api/process?tokenizer&tagger&parser"
        udpipe_data = urllib.parse.urlencode({
            'data': user_input,
            'model': "portuguese-bosque-ud-2.6-200830",
        }).encode('ascii')
        with urllib.request.urlopen(udpipe_url, udpipe_data) as f:
            udpipe_output = json.loads(
                remove_accents(f.read().decode('utf-8')))['result']
        text = estrutura_ud.Corpus(recursivo=True)
        text.build(udpipe_output)
        print("input: {}".format(text.to_str()))

        # try to find linguistic pattern
        # NOTE(review): `break` only leaves the inner loop, so the remaining
        # patterns are still evaluated and the LAST matching pattern wins.
        # Kept as is -- confirm whether first-match was intended.
        for pattern in linguistic_patterns:
            for query in linguistic_patterns[pattern]:
                if all(
                        interrogar_UD.main(text, 5, x, fastSearch=True)['casos']
                        for x in query):
                    bot_response = responses[pattern]
                    break

        # try to answer from wikipedia
        if not bot_response:
            # get most awkward word in input
            names = []
            verbs = []
            for sentence in text.sentences.values():
                for token in sentence.tokens:
                    clean_token = token.lemma.lower()
                    if clean_token not in frequency_of_important_words:
                        continue
                    if token.upos in ["NOUN", "PROPN"] and clean_token not in names:
                        names.append(clean_token)
                    if (token.upos in ["VERB"] or token.deprel in ["cop"]) \
                            and clean_token not in verbs:
                        verbs.append(clean_token)
            most_awkward_name = sorted(
                names, key=lambda x: frequency_of_important_words[x])
            most_awkward_verb = sorted(
                verbs, key=lambda x: frequency_of_important_words[x])

            if most_awkward_name:
                try:
                    bot_response = wikipedia.summary(most_awkward_name[0],
                                                     sentences=2)
                except wikipedia.DisambiguationError as e:
                    # Ambiguous title: retry once with one of the suggestions.
                    if e.options:
                        try:
                            bot_response = wikipedia.summary(
                                random.choice(e.options), sentences=2)
                        except (wikipedia.DisambiguationError,
                                wikipedia.PageError):
                            # The suggestion did not resolve either; let the
                            # next source try instead of failing the request.
                            bot_response = ""
                except wikipedia.PageError:
                    # No article for this lemma; let the next source try.
                    bot_response = ""

            # try to answer from "pensador"
            if not bot_response and most_awkward_verb:
                # Quote the lemma so it is always a valid URL path segment.
                pensador_url = "https://www.pensador.com/{}/".format(
                    urllib.parse.quote(most_awkward_verb[0]))
                try:
                    with urllib.request.urlopen(pensador_url) as f:
                        soup = BeautifulSoup(f, "html.parser")
                        parse = soup.find_all("p", class_="frase")
                        bot_response = random.choice(
                            parse).get_text() if parse else ""
                except URLError:
                    # Covers HTTPError too (e.g. no page for this verb).
                    bot_response = ""

        # no answer found
        # NOTE(review): this fallback sits inside the `input` branch, so a
        # request without `input` gets an empty bot_response -- confirm that
        # this is the intended placement.
        if not bot_response:
            bot_response = "Desculpe, ainda não sei como responder..."

    response = jsonify({
        "api_response": api_response,
        "bot_response": bot_response
    })
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def validate(conllu, sent_id=None, errorList="validar_UD.txt", noMissingToken=False):
    """Collect validation errors for a corpus.

    Three built-in checks run first (sentence text must end with the last
    token's word; exactly one ``root`` per sentence; no token lines detached
    from a sentence in the serialized corpus), followed by the query rules
    read from the ``errorList`` file.

    Rules file format, as parsed here: empty lines and lines starting with
    ``#`` are ignored; a line containing ``"erro: "`` sets the current error
    description (the text after it) and the current column (the text before
    the first ``|``, when a ``|`` precedes ``"erro: "``); every other line is
    a query run through ``interrogar_UD.main`` with criterion 5, and each hit
    is filed under the current description.

    Args:
        conllu: A string handed to ``estrutura_ud.Corpus.load``, or an
            already-built corpus object, which is used as is.
        sent_id: Forwarded to the ``Corpus`` constructor (when loading) and
            to ``interrogar_UD.main``.
        errorList: Path of the rules file.
        noMissingToken: When true, skip the detached-token scan.

    Returns:
        dict mapping an error description to a list of occurrence dicts.
    """
    errorDictionary = {}

    if isinstance(conllu, str):
        corpus = estrutura_ud.Corpus(recursivo=True, sent_id=sent_id)
        corpus.load(conllu)
    else:
        corpus = conllu

    for sentence in corpus.sentences.values():
        if not sentence.text.endswith(sentence.tokens[-1].word):
            errorDictionary.setdefault(
                '1 - Sentença não termina com o último token', []).append({
                    "sentence": "",
                    "sent_id": sentence.sent_id,
                    "t": sentence.map_token_id["1"],
                    "attribute": "id",
                })

        rootCount = sum(1 for token in sentence.tokens
                        if token.deprel == "root")
        if rootCount > 1:
            errorDictionary.setdefault('1 - Tem 2 root', []).append({
                "sentence": sentence,
                "sent_id": sentence.sent_id,
                "t": sentence.map_token_id["1"],
                "attribute": "id",
            })
        if rootCount == 0:
            errorDictionary.setdefault('1 - Não tem root', []).append({
                "sentence": sentence,
                "sent_id": sentence.sent_id,
                "t": sentence.map_token_id["1"],
                "attribute": "id",
            })

    if not noMissingToken:
        # A blank line followed by something that is neither a comment line
        # nor the end of the text means a line sits outside any sentence.
        missingToken = re.findall(r"\n\n(?!#|$).*", corpus.to_str())
        if missingToken:
            errorDictionary.setdefault(
                '1 - Há tokens faltando no corpus', []).extend(
                    {
                        "sentence": "",
                        "sent_id": "<pre>" + missing + "</pre>",
                    } for missing in missingToken)

    with open(errorList) as f:
        rules = f.read().splitlines()

    # No header seen yet. A query that appears before any "erro: " line used
    # to raise UnboundLocalError on its first hit; it is now filed under the
    # query text itself, with an empty column.
    comment = None
    coluna = ""
    for error in rules:
        if not error or error[0] == "#":
            continue
        if "erro: " in error:
            header = error.split("erro: ")
            comment = header[1].strip()
            coluna = error.split("|", 1)[0] if "|" in header[0] else ""
            continue

        parameters = error.strip()
        label = parameters if comment is None else comment
        hits = interrogar_UD.main(corpus, 5, parameters, 0, sent_id,
                                  separate=True)['output']
        for sentString in hits:
            hit = estrutura_ud.Sentence(recursivo=True)
            hit.build(fromInterrogarToHtml(sentString['resultado']))
            # Index of the first token carrying the <b> highlight, 0 if none.
            tokenT = next((t for t, token in enumerate(hit.tokens)
                           if "<b>" in token.to_str()), 0)
            errorDictionary.setdefault(label, []).append({
                "t": tokenT,
                "sentence": hit,
                "attribute": coluna,
            })

    return errorDictionary
def _encodeHighlights(text):
    """Swap the highlight tags in *text* (<b>, colored <font>) for plain-text markers."""
    text = text.replace('<b>', '@BOLD').replace('</b>', '/BOLD')
    for color, marker in (('yellow', '@YELLOW/'), ('red', '@RED/'),
                          ('cyan', '@CYAN/'), ('blue', '@BLUE/'),
                          ('purple', '@PURPLE/')):
        text = text.replace('<font color=' + tabela[color] + '>', marker)
    return text.replace('</font>', '/FONT')


def _stripHighlights(text):
    """Remove every highlight marker produced by _encodeHighlights from *text*."""
    for marker in ('@BOLD', '/BOLD', '@YELLOW/', '@RED/', '@CYAN/', '@BLUE/',
                   '@PURPLE/', '/FONT'):
        text = text.replace(marker, '')
    return text


def _decodeHighlights(text):
    """Turn the plain-text highlight markers in *text* back into HTML tags."""
    text = text.replace('/BOLD', '</b>').replace('@BOLD', '<b>')
    for marker, color in (('@YELLOW/', 'yellow'), ('@PURPLE/', 'purple'),
                          ('@BLUE/', 'blue'), ('@RED/', 'red'),
                          ('@CYAN/', 'cyan')):
        text = text.replace(marker, '<font color=' + tabela[color] + '>')
    return text.replace('/FONT', '</font>')


def renderSentences(script=""):
    """Render one page of search results as HTML and print it as JSON.

    Everything is read from the module-level ``form``:

    * ``conllu`` -- corpus file name (looked up under ./interrogar-ud/conllu/).
    * ``html`` -- path of the saved-search page; its base name keys
      ./cgi-bin/filtros.json.
    * ``parametros`` -- ``"<criterio> <query>"``; split on the first space.
    * ``script`` (optional) -- overrides the *script* argument.
    * ``indexSentences`` -- index of the first result to consider.
    * ``filtrado`` (optional) -- how many results were already skipped as
      filtered on previous pages; 0 when absent.
    * ``nomePesquisa`` -- search name; names listed in ``fastSearch`` get no
      filter widgets.
    * ``sent_id_list`` (optional) -- when non-empty, the sent_ids of all
      unfiltered results are returned as well.

    Results come from the cached JSON under ./cgi-bin/json/ when it exists,
    otherwise from ``interrogar_UD.main`` or from the given query script.

    Args:
        script: Name of a file in ./cgi-bin/scripts/ to run instead of a
            plain query; empty for a plain query.

    Prints a JSON object with the keys ``success``, ``html``, ``noMore``,
    ``indexSentences``, ``filtrado``, ``filtrar_filtros``,
    ``pagina_filtros``, ``filtros``, ``sent_id_list`` and ``sent_id_count``.
    Returns nothing.
    """
    conllu = form['conllu'].value
    corpus = conllu
    caminhoCompletoConllu = './interrogar-ud/conllu/' + conllu
    caminhoCompletoHtml = form['html'].value
    criterioBruto, parametros = form['parametros'].value.split(" ", 1)
    criterio = int(criterioBruto)
    if "script" in form:
        script = form['script'].value
    startPoint = int(form['indexSentences'].value)
    filtradoPrevious = int(form['filtrado'].value) if 'filtrado' in form else 0
    nomePesquisa = form['nomePesquisa'].value
    buscaSalva = nomePesquisa not in fastSearch

    # Sentences already moved into filters for this saved search, plus the
    # two HTML lists describing those filters.
    filtros = []
    filtrar_filtros = ""
    pagina_filtros = ""
    if buscaSalva:
        pagina_html = caminhoCompletoHtml.rsplit("/", 1)[1].rsplit(".", 1)[0]
        if os.path.isfile("./cgi-bin/filtros.json"):
            with open("./cgi-bin/filtros.json") as f:
                filtros_json = json.load(f)
            if pagina_html in filtros_json:
                filtrosDaPagina = filtros_json[pagina_html]['filtros']
                if filtrosDaPagina:
                    filtrar_filtros = "<h4 class='translateHtml'>Filtros já aplicados:</h4>"
                filtros = [
                    x for nome in filtrosDaPagina
                    for x in filtrosDaPagina[nome]['sentences']
                ]
                for nome in filtrosDaPagina:
                    quantidade = len(filtrosDaPagina[nome]['sentences'])
                    filtrar_filtros += f'<li><a style="cursor:pointer;" class="translateTitle" title="Clique para adicionar ao mesmo filtro" onclick="$(\'#nome_pesquisa,#nome_pesquisa_sel\').val($(this).children(nome).text());"><span id="nome">{nome}</a> ({quantidade})</li>'
                    pagina_filtros += f'<li><a style="cursor:pointer;" target="_blank" href=\'../../cgi-bin/filtrar.py?action=view&html={pagina_html}&filtro=' + encodeUrl(
                        nome) + f'\'>{nome} ({quantidade})</li>'

    # Load the search results: cached JSON first, then a live query.
    caminhoCache = './cgi-bin/json/' + slugify(conllu + "_" + parametros + ".json")
    if os.path.isfile(caminhoCache):
        with open(caminhoCache, "r") as f:
            resultadosBusca = json.load(f)
    elif not script:
        resultadosBusca = interrogar_UD.main(caminhoCompletoConllu, criterio,
                                             parametros)
    else:
        # NOTE(review): `script` comes straight from the request, is joined
        # into a path without validation, and the copied file is then
        # imported (i.e. executed). Left unchanged here -- restrict it to
        # known script names if the form can be reached by untrusted users.
        shutil.copy("./cgi-bin/scripts/" + script, './cgi-bin/queryScript.py')
        with open("./cgi-bin/queryScript.py", 'r') as f:
            scriptFile = f.read().replace("<!--corpus-->",
                                          caminhoCompletoConllu)
        with open("./cgi-bin/queryScript.py", "w") as f:
            f.write(scriptFile)
        import queryScript
        resultadosBusca = queryScript.getResultadosBusca()

    ocorrencias = resultadosBusca['output']

    sent_id_list = []
    if 'sent_id_list' in form and form['sent_id_list'].value:
        sent_id_list = [
            re.sub(r'<.*?>', '',
                   re.findall(r'# sent_id = (.*?)\n', x['resultado'])[0])
            for x in ocorrencias
        ]
        sent_id_list = [x for x in sent_id_list if x not in filtros]

    # Page window: 20 results per page while more than one further page's
    # worth remains; otherwise everything up to the end.
    numeroOcorrencias = len(ocorrencias) - len(filtros)
    if numeroOcorrencias > startPoint + 21 and numeroOcorrencias >= 1:
        finalPoint = startPoint + 20
        noMore = False
    else:
        finalPoint = len(ocorrencias)
        noMore = True

    resultados = []
    quantos = 0
    # Same optional form field as filtradoPrevious; reading it unguarded
    # raised KeyError when the field was absent.
    filtrado = filtradoPrevious
    for ocorrencia in ocorrencias[startPoint:]:
        # `anotado` keeps the highlights as escaped-safe markers;
        # `estruturado` is the same sentence with all highlights removed.
        anotado = estrutura_ud.Sentence(recursivo=False)
        estruturado = estrutura_ud.Sentence(recursivo=False)
        anotado.build(web.escape(_encodeHighlights(ocorrencia['resultado'])))
        estruturado.build(
            _stripHighlights(
                _encodeHighlights(web.unescape(ocorrencia['resultado']))))

        quantos += 1
        if estruturado.sent_id not in filtros and estruturado.text not in filtros:
            resultados.append({'anotado': anotado, 'estruturado': estruturado})
        else:
            # A filtered sentence takes no slot on the page: widen the
            # window by one while there are results left.
            filtrado += 1
            if finalPoint + 1 < len(ocorrencias):
                finalPoint += 1
        if quantos == finalPoint - startPoint:
            break

    html = []
    for i, ocorrencia in enumerate(resultados):
        anotado = ocorrencia['anotado']
        estruturado = ocorrencia['estruturado']
        n = str(startPoint + i + 1)

        html.append('<div class="container sentence-container">\n')
        html.append(
            f'<p>{str(startPoint+i+1-filtradoPrevious)}/{numeroOcorrencias}</p>'
            + '\n')

        if estruturado.sent_id:
            html.append('<p {} class="metadados_sentence">'.format(
                'onmouseover="$(this).css(\'text-decoration\', \'underline\');" onmouseleave="$(this).css(\'text-decoration\', \'none\');"'
                if buscaSalva else ""))
            if buscaSalva:
                html.append(
                    f'''<input class="cb translateTitle" id=checkbox_{n} style="margin-left:0px;" title="Selecionar sentença para filtragem" sent_id="{estruturado.sent_id}" type=checkbox>'''
                )
            html.append(f'''{estruturado.sent_id}</p>''' + '\n')

        texto = anotado.metadados[
            'clean_text'] if 'clean_text' in anotado.metadados else anotado.text
        html.append(
            f"<p><span id=text_{n}>{_decodeHighlights(texto)}</span></p>" +
            '\n')

        # The "show context" button is only offered when the sentence has a
        # sent_id containing '-' or made only of digits (or has an id), and
        # has text.
        if ((estruturado.sent_id and
             ('-' in estruturado.sent_id
              or re.search(r'^\d+$', estruturado.sent_id)))
                or estruturado.id) and estruturado.text:
            html.append(
                f"<p class='toolbar' style='display:none;'>"
                f"<button class='translateHtml sentence-control' id=contexto_{n} onclick=\"contexto('{estruturado.sent_id}', '{estruturado.id}', '{corpus}')\" style=\"margin-left:0px\">Mostrar contexto</button> "
                f"<button class='translateHtml anotacao sentence-control' id=mostrar_{n} onclick=\"mostrar('div_{n}', 'mostrar_{n}')\" style=\"margin-left:0px\">Mostrar anotação</button> "
                f"<button class='translateHtml opt sentence-control' id=opt_{n} onclick=\"mostraropt('optdiv_{n}', 'opt_{n}')\" style=\"margin-left:0px\">Mostrar opções</button> "
                f"<button class=\"abrirInquerito translateHtml sentence-control\" onclick='inquerito(\"form_{n}\");'>Abrir inquérito</button></p>"
                + "\n")
        else:
            html.append(
                f"<p class='toolbar' style='display:none;'>"
                f"<button id=mostrar_{n} class=\"translateHtml anotacao sentence-control\" onclick=\"mostrar('div_{n}', 'mostrar_{n}')\" style=\"margin-left:0px\">Mostrar anotação</button> "
                f"<button id=opt_{n} class=\"translateHtml sentence-control opt\" onclick=\"mostraropt('optdiv_{n}', 'opt_{n}')\" style=\"margin-left:0px\">Mostrar opções</button> "
                f"<button class='translateHtml abrirInquerito sentence-control' onclick='inquerito(\"form_{n}\")'>Abrir inquérito</button></p>"
                + '\n')

        html.append(
            f"<span style=\"display:none; padding-left:20px;\" id=\"optdiv_{n}\">"
        )
        html.append(
            f"<form action=\"../../cgi-bin/inquerito.py?conllu={conllu}\" target=\"_blank\" method=POST id=form_{n}>"
            f"<input type=hidden name=sentid value=\"{estruturado.sent_id}\">"
            f"<input type=hidden name=occ value=\"{numeroOcorrencias}\">"
            f"<input type=hidden name=textheader value=\"{estruturado.sent_id}\">"
            f"<input type=hidden name=nome_interrogatorio value=\"{web.escape(nomePesquisa)}\">"
            f"<input type=hidden name=link_interrogatorio value=\"{caminhoCompletoHtml}\">"
            f"<input type=hidden name=text value=\"{estruturado.text}\">")
        if "@BOLD" in anotado.to_str():
            # Ids of the highlighted tokens, comma-separated.
            html.append("<input type=hidden name=tokenId value=\"" + ",".join([
                functions.cleanEstruturaUD(x.id) for x in anotado.tokens
                if '@BOLD' in x.to_str()
            ]) + "\">")
        html.append("</form><br>")

        if buscaSalva:
            html.append(
                f"<a style=\"cursor:pointer\" onclick='selectAbove({n})' class='translateHtml'>Selecionar todas as frases acima</a><br>"
            )
            html.append(
                f"<!--a style=\"cursor:pointer\" onclick='filtraragora(\"{n}\")'>Separar sentença</a-->"
            )
        else:
            html.append(
                "<span class='translateHtml'>Salve a busca para liberar mais opções</span>"
            )

        html.append(
            f"<form action=\"../../cgi-bin/udpipe.py?conllu={conllu}\" target=\"_blank\" method=POST id=udpipe_{n}>"
            f"<input type=hidden name=textheader value=\"{estruturado.text}\"></form>"
            f"<!--a style=\"cursor:pointer\" onclick='anotarudpipe(\"udpipe_{n}\")' class='translateHtml'>Anotar frase com o UDPipe</a!-->"
        )
        html.append(
            f"<form action=\"../../cgi-bin/draw_tree.py?conllu={conllu}\" target=\"_blank\" method=POST id=tree_{n}>"
            f"<input type=hidden name=sent_id value=\"{estruturado.sent_id}\">"
            f"<input type=hidden name=text value=\"{estruturado.text}\"></form>"
            f"<!--a style=\"cursor:pointer\" onclick='drawtree(\"tree_{n}\")' class='translateHtml'>Visualizar árvore de dependências</a-->"
        )
        html.append('</p></span>\n')
        html.append(
            f"<pre id=div_{n} style=\"display:none\">{_decodeHighlights(anotado.to_str())}</pre>"
            + '\n')
        html.append('</div>\n')

    arquivoHtml = "".join(html)

    print(json.JSONEncoder().encode({
        'success': True,
        'html': arquivoHtml,
        'noMore': noMore,
        'indexSentences': finalPoint,
        'filtrado': filtrado,
        'filtrar_filtros': filtrar_filtros,
        'pagina_filtros': pagina_filtros,
        'filtros': len(filtros),
        'sent_id_list': "|".join(sent_id_list),
        'sent_id_count': str(len(sent_id_list))
    }))