def main(ud1, ud2, output, coluna=4):
    """Compare column `coluna` of two CoNLL-U files and emit a confusion matrix.

    `ud1` is treated as the golden file, `ud2` as the predicted one. `output`
    may carry an encoding suffix after ':' (e.g. "out.txt:latin1"); the
    default encoding is utf8. Writes an HTML report (via gerar_HTML) and a
    plain-text report containing the matrix.
    """
    conllu1 = LerUD(ud1)
    conllu2 = LerUD(ud2)
    conllu1Estruturado, conllu2Estruturado = estrutura_ud.Corpus(), estrutura_ud.Corpus()
    conllu1Estruturado.load(ud1)
    conllu2Estruturado.load(ud2)
    lista_conllu = get_list(conllu1Estruturado, conllu2Estruturado, coluna)
    lista_conllu1 = lista_conllu['matriz_1']
    lista_conllu2 = lista_conllu['matriz_2']
    # Disable pandas truncation so the full crosstab is rendered
    pd.options.display.max_rows = None
    pd.options.display.max_columns = None
    pd.set_option('display.expand_frame_repr', False)
    saída = list()
    saída.append('Col ' + str(coluna) + ': ' + feats[coluna])
    saída.append('GOLDEN: ' + ud1)
    saída.append('PREVISTO: ' + ud2 + '\n')
    saída.append(str(pd.crosstab(pd.Series(lista_conllu1), pd.Series(lista_conllu2), rownames=['UD[1]'], colnames=['UD[2]'], margins=True)))
    saída.append('\n')
    saída.append('#!$$ Sentenças de GOLDEN que não foram encontradas em PREVISTO:\n')
    for item in lista_conllu['solitários_1']:
        saída.append(item)
    # Output path may embed an encoding after ':'
    if ':' in output:
        codificação_saída = output.split(':')[1]
    else:
        codificação_saída = 'utf8'
    output = output.split(':')[0]
    get_percentages(ud1, ud2, output, coluna)
    # Generate the HTML files
    gerar_HTML("\n".join(saída), conllu1, conllu2, coluna, output, codificação_saída)
    # Generate the "txt" file (matrix only).
    # Bug fix: the original used open(...).write(...) and never closed the
    # handle; a context manager guarantees the file is flushed and closed.
    with open(output, 'w', encoding=codificação_saída) as f:
        f.write("\n".join(saída))
def getAnnotation():
    """Flask endpoint: render one sentence (golden or system view) as an
    editable HTML table and return it as JSON.

    Request values used: 'ud' ("ud1" golden / "ud2" system), 'c' (corpus
    name), 'sent_id', and optional 'bold' (row index to highlight).
    """
    # Enforce Google OAuth when login is enabled
    if not google.authorized and GOOGLE_LOGIN:
        return redirect(url_for('google.login'))
    html1, html2 = "", ""
    if request.values.get('ud') == 'ud1':
        # Golden annotation: load only the requested sentence
        ud1 = estrutura_ud.Corpus(recursivo=False, sent_id=request.values.get('sent_id'))
        ud1.load(conllu(request.values.get('c')).findGolden())
        bold = request.values.get('bold') or ""
        annotationUd1 = escape(
            ud1.sentences.get(request.values.get('sent_id')).tokens_to_str())
        html1 = "<table id='t01' style='margin:auto; cursor:pointer; margin-bottom:30px'>"
        for t, linha in enumerate(annotationUd1.splitlines()):
            # Highlight the row the caller asked for via 'bold'
            html1 += "<tr class='bold'>" if bold and t == int(bold) else "<tr>"
            for col, coluna in enumerate(linha.split("\t")):
                # CSS hooks: col 0 = token id, col 6 = head (draggable);
                # cols 3/7 and "_" cells are tagged notPipe
                if col == 0:
                    drag = 'id notPipe '
                elif col == 6:
                    drag = 'drag notPipe '
                elif col in [3, 7] or coluna == "_":
                    drag = "notPipe "
                else:
                    drag = ""
                html1 += '<td contenteditable=true class="{drag}valor"><input type=hidden name="{col}<coluna>{t}">{coluna}</td>'.format(
                    col=col, t=t, coluna=coluna, drag=drag)
            html1 += "</tr>"
        html1 += "</table>"
    elif request.values.get('ud') == 'ud2':
        # System (predicted) annotation: same rendering, simpler CSS classes
        ud2 = estrutura_ud.Corpus(recursivo=False, sent_id=request.values.get('sent_id'))
        ud2.load(conllu(request.values.get('c')).findSystem())
        bold = request.values.get('bold') or ""
        annotationUd2 = escape(
            ud2.sentences.get(request.values.get('sent_id')).tokens_to_str())
        html2 = "<table id='t01' style='margin:auto; cursor:pointer; margin-bottom:30px;'>"
        for t, linha in enumerate(annotationUd2.splitlines()):
            html2 += "<tr class='bold'>" if bold and t == int(bold) else "<tr>"
            for col, coluna in enumerate(linha.split("\t")):
                if col == 0:
                    drag = 'id'
                elif col == 6:
                    drag = 'drag'
                else:
                    drag = ""
                html2 += '<td contenteditable=true class="{drag} valor"><input type=hidden name="{col}<coluna>{t}">{coluna}</td>'.format(
                    col=col, t=t, coluna=coluna, drag=drag)
            html2 += "</tr>"
        html2 += "</table>"
    # NOTE(review): ud1/ud2 are locals here, so these globals() membership
    # tests are presumably always False and the dels never run — confirm
    # whether this cleanup was ever effective
    if 'ud1' in globals():
        del ud1
    if 'ud2' in globals():
        del ud2
    return jsonify({
        'annotationUd1': html1,
        'annotationUd2': html2,
        'success': True,
    })
def renderErrors(c, texto="", exc=[], fromZero=False):
    """Run the UD validator over corpus `c` and render its findings as HTML.

    The HTML is cached in "<errors>_html" next to the validator output;
    `fromZero=True` forces a re-run. `texto` may carry pre-captured validator
    output; `exc` extends the list of error substrings that are surfaced.
    Returns the HTML string.
    """
    if not os.path.isfile(conllu(c).findErrors() + "_html") or fromZero:
        if fromZero or not texto:
            #if not os.path.isfile(conllu(c).findErrors()):
            # Run validate.py inside the project's venv; tee output into the
            # errors file (os.system return value is deliberately ignored)
            if os.system(JULGAMENTO_FOLDER + f'/.julgamento/bin/python3 {os.path.abspath(os.path.dirname(__file__))}/tools/validate.py {conllu(c).findGolden()} --max-err=0 --lang={VALIDATE_LANG} 2>&1 | tee {conllu(c).findErrors()}'):
                pass
            with open(conllu(c).findErrors()) as f:
                texto = f.read()
        # Prefer the in-memory corpus when already loaded; else read from disk
        if conllu(c).golden() in allCorpora.corpora and allCorpora.corpora.get(conllu(c).golden()):
            corpus = allCorpora.corpora.get(conllu(c).golden())
        else:
            corpus = estrutura_ud.Corpus(recursivo=True)
            corpus.load(conllu(c).findGolden())
        with open(conllu(c).findGolden(), 'r') as f:
            arquivo = f.read()
        arquivoSplit = arquivo.splitlines()
        sent_ids = {}
        # Only validator lines containing one of these markers are kept
        exceptions = [
            'Exception caught', 'for 9', 'Non-tree', 'HEAD == ID', 'cycle',
            'Skipping'
        ]
        exceptions += exc
        for linha in texto.splitlines():
            if linha and any(x.lower().strip() in linha.lower() for x in exceptions) and ' Node ' in linha and 'Sent ' in linha and ("Line " in linha or ' line ' in linha):
                # 1-based conllu file line from "Line N" or "... line N"
                t = int(linha.split("Line ", 1)[1].split(" ")[0]) if "Line " in linha else int(linha.split(" line ", 1)[1].split(" ")[0])
                if "\t" in arquivoSplit[t-1]:
                    # Group findings by the message text after the first ':'
                    if not linha.split(":", 1)[1] in sent_ids:
                        sent_ids[linha.split(":", 1)[1]] = []
                    bold = {'word': arquivoSplit[t-1].split("\t")[1], 'color': 'black', 'id': arquivo.splitlines()[t-1].split("\t")[0]}# if '\t' in arquivo.splitlines()[t-1] else ""
                    # Map the file line to the token index inside its sentence
                    t = allCorpora.corpora[conllu(c).golden()].sentences[linha.split(" Node ")[0].split("Sent ", 1)[1]].map_token_id[arquivo.splitlines()[t-1].split("\t")[0]]
                    sent_ids[linha.split(":", 1)[1]].append({'id': linha.split(" Node ")[0].split("Sent ", 1)[1], 't': t, 'bold': bold})
        html = ""
        for k, problem in enumerate(sorted(sent_ids)):
            html += f"<div class='alert alert-warning' role='alert'>{k+1} / {len(sent_ids)} - {problem}</div>"
            for i, sent_id in enumerate(sent_ids[problem]):
                if sent_id['id'] in corpus.sentences:
                    if sent_id['bold']['word'] and sent_id['bold']['color'] and sent_id['t']:
                        html += f'<div class="panel panel-default"><div class="panel-body">{ i+1 } / { len(sent_ids[problem]) }</div>' + \
                            render_template(
                                'sentence.html',
                                golden=corpus.sentences[sent_id['id']],
                                c=c,
                                t=sent_id['t'],
                                bold=sent_id['bold'],
                                goldenAndSystem=True if conllu(c).system() in allCorpora.corpora else False,
                            ) + "</div></div>"
                    else:
                        html += f'<div class="panel panel-default"><div class="panel-body">{ i+1 } / { len(sent_ids[problem]) }: {sent_id["id"]}</div>'
        with open(conllu(c).findErrors() + "_html", "w") as f:
            f.write(html)
    else:
        # Cached HTML is fresh enough: serve it
        with open(conllu(c).findErrors() + "_html") as f:
            html = f.read()
    return html
def modify_sentid(filename, sent_id, new_sentid):
    """Rename a sentence id inside a conllu file and persist the change."""
    path = "./interrogar-ud/conllu/" + filename
    ud = estrutura_ud.Corpus(recursivo=False, sent_id=sent_id)
    ud.load(path)
    # Re-key the sentence under its new id, updating both attribute and metadata
    sentence = ud.sentences.pop(sent_id)
    sentence.sent_id = new_sentid
    sentence.metadados['sent_id'] = new_sentid
    ud.sentences[new_sentid] = sentence
    ud.save(path)
    return True
def loadCorpus(x):
    """Lazily load the golden/system/original flavours of corpus `x` into the
    global allCorpora cache.

    Side effects: on first use, snapshots the golden file as the pristine
    "original"; registers empty Corpus slots for unseen flavours; fills any
    still-empty slot by parsing the corresponding file from disk.
    """
    # First access: copy golden to "original" so later diffs have a baseline
    if os.path.isfile(conllu(x).findGolden()) and not os.path.isfile(conllu(x).findOriginal()):
        shutil.copyfile(conllu(x).findGolden(), conllu(x).findOriginal())
    # Register empty cache slots for any flavour not seen before
    if os.path.isfile(conllu(x).findSystem()) and not conllu(x).system() in allCorpora.corpora:
        allCorpora.corpora[conllu(x).system()] = estrutura_ud.Corpus(recursivo=True)
    if not conllu(x).golden() in allCorpora.corpora:
        allCorpora.corpora[conllu(x).golden()] = estrutura_ud.Corpus(recursivo=True)
    if not conllu(x).original() in allCorpora.corpora:
        allCorpora.corpora[conllu(x).original()] = estrutura_ud.Corpus(recursivo=True)
    # Fill the system slot if it is still empty
    if conllu(x).system() in allCorpora.corpora and not allCorpora.corpora[conllu(x).system()].sentences:
        sys.stderr.write("\n>>>>>>>>>>>>>> loading system {}...".format(x))
        corpus = estrutura_ud.Corpus(recursivo=True)
        corpus.load(conllu(x).findSystem())
        allCorpora.corpora[conllu(x).system()].sentences = dict(corpus.sentences.items())
        sys.stderr.write(" system ok <<<<<<<<")
    # Fill the original slot if it is still empty
    if conllu(x).original() in allCorpora.corpora and not allCorpora.corpora[conllu(x).original()].sentences:
        corpus = estrutura_ud.Corpus(recursivo=True)
        corpus.load(conllu(x).findOriginal())
        allCorpora.corpora[conllu(x).original()].sentences = dict(corpus.sentences.items())
    # Fill the golden slot if it is still empty
    if conllu(x).golden() in allCorpora.corpora and not allCorpora.corpora[conllu(x).golden()].sentences:
        sys.stderr.write("\n>>>>>>>>>>>>>> loading {}...".format(x))
        corpus = estrutura_ud.Corpus(recursivo=True)
        corpus.load(conllu(x).findGolden())
        allCorpora.corpora[conllu(x).golden()].sentences = dict(corpus.sentences.items())
        sys.stderr.write(" ok <<<<<<<<")
    # Drop the local reference so the temporary Corpus can be collected
    corpus = ""
def juntarPartitions(crossvalidation, listaPartitions, listaDeIdsEmOrdem):
    """Concatenate the annotated partition files back into a single corpus.

    Numbered partitions are read in ascending order, then the "sobra"
    (leftover) partition; sentences are re-ordered to match
    `listaDeIdsEmOrdem`, the result is saved under MC_<argv1>/ and shell
    commands move it into its final location.
    """
    arquivoConlluCompletoAnotado = []
    # Numbered partitions first, ascending
    for partition in sorted([int(x) for x in listaPartitions if x != "sobra"]):
        with open(
                f"partition_{partition}/MC_partition_{partition}/partition_{partition}_sistema.conllu",
                "r") as f:
            arquivoConlluCompletoAnotado.append(
                [x for x in f.read().splitlines()])
        adicionarAoLog(f"partição {partition} acrescida ao corpus anotado")
    # The leftover ("sobra") partition goes last
    with open(
            f"partition_sobra/MC_partition_sobra/partition_sobra_sistema.conllu",
            "r") as f:
        arquivoConlluCompletoAnotado.append([x for x in f.read().splitlines()])
    adicionarAoLog(f"partição sobra acrescida ao corpus anotado")
    # Glue the partition chunks into one conllu text (blank line between chunks)
    for a in range(len(arquivoConlluCompletoAnotado)):
        arquivoConlluCompletoAnotado[a] = "\n".join(
            arquivoConlluCompletoAnotado[a])
    arquivoConlluCompletoAnotado = "\n\n".join(arquivoConlluCompletoAnotado)
    corpusSemOrdem = estrutura_ud.Corpus(recursivo=False)
    corpusSemOrdem.build(arquivoConlluCompletoAnotado)
    # Restore the original sentence order
    corpusOrdem = []
    for sentOrdem in listaDeIdsEmOrdem:
        corpusOrdem.append(corpusSemOrdem.sentences[sentOrdem].to_str())
    corpus = estrutura_ud.Corpus(recursivo=False)
    corpus.build("\n\n".join(corpusOrdem))
    if not os.path.isdir(f"MC_{sys.argv[1]}"):
        os.mkdir(f"MC_{sys.argv[1]}")
    adicionarAoLog(
        f"salvando corpus anotado em MC_{sys.argv[1]}/{sys.argv[1]}_sistema.conllu"
    )
    corpus.save(f"MC_{sys.argv[1]}/{sys.argv[1]}_sistema.conllu")
    os.system(
        f"cd MC_{sys.argv[1]}; mv {sys.argv[1]}_sistema.conllu ../../; cd ../../; mv {sys.argv[1]}_inProgress {sys.argv[1]}_success 2>&1 | tee -a ../log.txt"
    )
    # NOTE(review): _exit(1) terminates the process here with a NON-zero
    # status even on this success path, and makes the final log call below
    # unreachable — confirm both effects are intended
    os._exit(1)
    adicionarAoLog(
        f"finalizado. Resultado em MC_{sys.argv[1]}/{sys.argv[1]}.html")
def main():
    """Load the virgin corpus, build the cross-validation partitions, then
    verify them."""
    adicionarAoLog(f"carregando {sys.argv[1]}.conllu")
    corpus_virgem = estrutura_ud.Corpus(recursivo=False)
    corpus_virgem.load(f"{sys.argv[1]}.conllu")
    # Remember the original sentence order so it can be restored later
    ids_em_ordem = []
    for sentenca in corpus_virgem.sentences.values():
        ids_em_ordem.append(sentenca.sent_id)
    cv = Crossvalidation(ids_em_ordem)
    cv.montarParticoes()
    # Collect the suffixes of every partition_* directory in the cwd
    particoes = []
    for entry in os.listdir("."):
        if os.path.isdir(entry) and "partition_" in entry:
            particoes.append(entry.rsplit("_", 1)[1])
    checarPartitions(cv, particoes, ids_em_ordem)
def appos_e_ccomp_parataxis(
        corpus,
        corpus23_path='/home/elvis/Dropbox/tronco/comcorhd.tronco.me/UD_Portuguese-Bosque/www/interrogar-ud/conllu/bosqueUD_2.3.conllu'):
    """Rewrite ccomp:parataxis and appos:parataxis deprels in `corpus`.

    ccomp:parataxis becomes plain parataxis. appos:parataxis becomes nmod
    when the token at the same position in the Bosque 2.3 reference corpus
    is nmod, otherwise parataxis.

    Args:
        corpus: estrutura_ud.Corpus modified in place (also returned).
        corpus23_path: path to the Bosque 2.3 reference conllu. Generalized
            from the previously hard-coded absolute path; the default keeps
            existing callers working.
    """
    corpus23 = estrutura_ud.Corpus(recursivo=False)
    corpus23.load(corpus23_path)
    for sentid, sentence in corpus.sentences.items():
        if 'ccomp:parataxis' in sentence.to_str():
            for token in sentence.tokens:
                if 'ccomp:parataxis' == token.deprel:
                    token.deprel = 'parataxis'
        if 'appos:parataxis' in sentence.to_str():
            for n, token in enumerate(sentence.tokens):
                if 'appos:parataxis' == token.deprel:
                    # Mirror the 2.3 analysis: nmod there -> nmod here
                    token.deprel = 'nmod' if corpus23.sentences[sentid].tokens[
                        n].deprel == 'nmod' else 'parataxis'
    return corpus
def addToken(conllu, sent_id, option, token_id, conllu_completo="", new_tokens=None, mergeSentencesId="", form=False):
    """Add, merge or remove tokens of sentence `sent_id` in a conllu corpus.

    Args:
        conllu: path to a conllu file, or an estrutura_ud.Corpus instance.
        sent_id: id of the sentence to edit.
        option: "add", "addContraction" or "rm".
        token_id: target token id, or "left"/"right" for the sentence edges.
        conllu_completo: alternative file to load/save instead of `conllu`.
        new_tokens: tokens to insert (conllu line for the first one), or None
            for a blank placeholder token.
        mergeSentencesId: when set, the tokens of that sentence are appended
            into `sent_id` and the source sentence is removed.
        form: when True, the edit is journaled in ../cgi-bin/tokenization.json
            and the corpus file is rewritten atomically via a temp file.

    Returns:
        The edited Corpus when form is falsy; otherwise None (file side
        effects only).
    """
    # Bug fix: the original signature used a mutable default (new_tokens=[])
    # that this function appends to, leaking inserted tokens across calls.
    if new_tokens is None:
        new_tokens = []
    if form:
        # Ensure the journal file exists, then load it
        if not os.path.isfile("../cgi-bin/tokenization.json"):
            tokenization = {}
            with open("../cgi-bin/tokenization.json", "w") as f:
                json.dump(tokenization, f)
        with open("../cgi-bin/tokenization.json") as f:
            tokenization = json.load(f)
    if not isinstance(conllu, estrutura_ud.Corpus):
        # Load only the sentences we need (target and merge source)
        corpus = estrutura_ud.Corpus(recursivo=False, any_of_keywords=[re.escape("# sent_id = " + sent_id + "\n"), re.escape("# sent_id = " + mergeSentencesId + "\n")])
        corpus.load(conllu if not conllu_completo else conllu_completo)
    else:
        corpus = conllu
    # Resolve the symbolic positions to concrete token ids
    if token_id == "left":
        token_id = corpus.sentences[sent_id].tokens[0].id
    elif token_id == "right":
        token_id = str(int(corpus.sentences[sent_id].tokens[-1].id) + 1)
    if option in ["add", "addContraction"]:
        if not new_tokens:
            if not mergeSentencesId:
                # Blank placeholder token
                novo_token = estrutura_ud.Token()
                novo_token.build("_\t_\t_\t_\t_\t_\t0\t_\t_\t_")
                new_tokens.append(novo_token)
            else:
                new_tokens = corpus.sentences[mergeSentencesId].tokens
        else:
            # Caller passed a conllu line: build a Token from it
            novo_token = estrutura_ud.Token()
            novo_token.build(new_tokens[0])
            new_tokens = [novo_token]
        last_id = ""
        # Insert in reverse so each token lands at the same target position
        for novo_token in reversed(new_tokens):
            if option == "add":
                novo_token.id = token_id if not '-' in novo_token.id else str(int(token_id)) + "-" + str(int(token_id) + int(novo_token.id.split("-")[1]) - int(novo_token.id.split("-")[0]))
            elif option == "addContraction":
                novo_token.id = token_id + "-" + token_id
            if mergeSentencesId:
                if not last_id:
                    last_id = corpus.sentences[sent_id].tokens[-1].id
                # Appending at the end: shift the merged heads past last_id
                if token_id == str(int(last_id) + 1) and not '-' in novo_token.id:
                    novo_token.dephead = str(int(novo_token.dephead) + int(last_id))
            if not token_id in corpus.sentences[sent_id].map_token_id:
                corpus.sentences[sent_id].tokens.append(novo_token)
                corpus.sentences[sent_id].map_token_id[token_id] = len(corpus.sentences[sent_id].tokens) - 1
            else:
                corpus.sentences[sent_id].tokens.insert(corpus.sentences[sent_id].map_token_id[token_id], novo_token)
            if option == "add":
                # Renumber ids after the insertion point and rebuild the map
                for t, token in enumerate(corpus.sentences[sent_id].tokens):
                    if not '-' in novo_token.id:
                        if t > corpus.sentences[sent_id].map_token_id[token_id]:
                            token.id = str(int(token.id) + 1) if not '-' in token.id else str(int(token.id.split("-")[0]) + 1) + "-" + str(int(token.id.split("-")[1]) + 1)
                    corpus.sentences[sent_id].map_token_id[token.id] = t
                # Shift depheads that point at or beyond the insertion point
                for t, token in enumerate(corpus.sentences[sent_id].tokens):
                    if not mergeSentencesId and token.dephead not in ["0", "_"] and token.dephead in corpus.sentences[sent_id].map_token_id and token_id in corpus.sentences[sent_id].map_token_id and corpus.sentences[sent_id].map_token_id[token.dephead] >= corpus.sentences[sent_id].map_token_id[token_id]:
                        token.dephead = str(int(token.dephead) + 1)
            if form:
                # Journal the insertion
                if not conllu in tokenization:
                    tokenization[conllu] = {}
                if not sent_id in tokenization[conllu]:
                    tokenization[conllu][sent_id] = []
                tokenization[conllu][sent_id].append({'option': option, 'token_id': token_id, 'new_token': [novo_token.to_str()]})
        if mergeSentencesId and token_id != str(int(last_id) + 1):
            # Merge at the start: shift heads of the original tokens instead
            for t, token in enumerate(corpus.sentences[sent_id].tokens):
                if token.dephead not in ["0", "_"] and t > int(corpus.sentences[sent_id].map_token_id[new_tokens[-1].id]):
                    token.dephead = str(int(token.dephead) + int(new_tokens[-1].id))
        if mergeSentencesId:
            # Stitch the "# text" metadata together and drop the merged sentence
            if token_id == corpus.sentences[sent_id].tokens[0].id:
                corpus.sentences[sent_id].metadados['text'] = corpus.sentences[mergeSentencesId].text + ' ' + corpus.sentences[sent_id].text
            else:
                corpus.sentences[sent_id].metadados['text'] += ' ' + corpus.sentences[mergeSentencesId].text
            corpus.sentences.pop(mergeSentencesId)
    elif option in ["rm"]:
        if not '-' in token_id:
            # Renumber ids and depheads that follow the removed token
            for t, token in enumerate(corpus.sentences[sent_id].tokens):
                if token_id in corpus.sentences[sent_id].map_token_id and t > corpus.sentences[sent_id].map_token_id[token_id]:
                    token.id = str(int(token.id) - 1) if not '-' in token.id else str(int(token.id.split("-")[0]) - 1) + "-" + str(int(token.id.split("-")[1]) - 1)
                if token.dephead not in ["_", "0"]:
                    if token.dephead in corpus.sentences[sent_id].map_token_id and token_id in corpus.sentences[sent_id].map_token_id and corpus.sentences[sent_id].map_token_id[token.dephead] > corpus.sentences[sent_id].map_token_id[token_id]:
                        token.dephead = str(int(token.dephead) - 1)
        corpus.sentences[sent_id].tokens = [x for t, x in enumerate(corpus.sentences[sent_id].tokens) if t != corpus.sentences[sent_id].map_token_id[token_id]]
        if form:
            # Journal the removal
            if not conllu in tokenization:
                tokenization[conllu] = {}
            if not sent_id in tokenization[conllu]:
                tokenization[conllu][sent_id] = []
            tokenization[conllu][sent_id].append({'option': option, 'token_id': token_id})
    if form:
        with open("../cgi-bin/tokenization.json", "w") as f:
            json.dump(tokenization, f)
        # Write to a temp file, then swap it into place
        corpus.save(conllu + "_tokenization" if not conllu_completo else conllu_completo + "_tokenization")
        os.remove(conllu if not conllu_completo else conllu_completo)
        os.rename(conllu + "_tokenization" if not conllu_completo else conllu_completo + "_tokenization", conllu if not conllu_completo else conllu_completo)
    else:
        return corpus
def main(arquivoUD, criterio, parametros, limit=0, sent_id="", fastSearch=False):
    """Search a UD corpus with one of five query strategies.

    Args:
        arquivoUD: path to a conllu file, or an already-built Corpus (criterio 5).
        criterio: 1 regex over the whole sentence; 2 column/value with child
            filtering; 3 multiple independent regexes ("::"-separated, "!"
            negates); 4 child::parent regex pair; 5 a Python-like expression
            compiled into code and exec'd per token.
        parametros: the query, whose syntax depends on `criterio`.
        limit: stop after this many matching sentences (0 = no limit).
        sent_id: restrict corpus loading to this sentence id (criterio 5).
        fastSearch: skip building the annotated/structured Sentence objects.

    Returns:
        {'output': [...], 'casos': int} — matches (annotated with @COLOR/
        and <b> markers) and the total number of hits.
    """
    parametros = parametros.strip()
    # Read the UD file
    if criterio in [1, 2, 3, 4]:
        import estrutura_dados
        import estrutura_ud
        qualquercoisa = estrutura_dados.LerUD(arquivoUD)
    if criterio in [5]:
        import estrutura_ud
        if isinstance(arquivoUD, str):
            with open(arquivoUD, "r") as f:
                # Recursive token links are only needed when the query walks
                # head/next/previous tokens
                if "head_token" in parametros or "next_token" in parametros or "previous_token" in parametros:
                    #qtd = len(parametros.split("head_token")) -1 + len(parametros.split("previous_token")) -1 + len(parametros.split("next_token")) -1
                    corpus = estrutura_ud.Corpus(recursivo=True, sent_id=sent_id)
                else:
                    corpus = estrutura_ud.Corpus(recursivo=False, sent_id=sent_id)
                start = time.time()
                corpus.build(f.read())
                sys.stderr.write("\ncorpus.build: " + str(time.time() - start))
        else:
            corpus = arquivoUD
    # List that will be sent either to the terminal or to the HTML
    output = list()
    casos = 0
    # Color markers, indexed by regex group number
    tabela = ['@YELLOW/', '@PURPLE/', '@BLUE/', '@RED/', '@CYAN/']
    # Criterio 1: single regex against the flattened sentence
    if criterio == 1:
        for a, sentence in enumerate(qualquercoisa):
            if limit and len(output) == limit:
                break
            sentence2 = sentence
            for b, linha in enumerate(sentence):
                linha2 = linha
                if isinstance(linha2, list):
                    sentence2[b] = "\t".join(sentence2[b])
            sentence2 = "\n".join(sentence2)
            regex = re.search(parametros, sentence2, flags=re.IGNORECASE | re.MULTILINE)
            if regex:
                casos += len(
                    re.findall(parametros, sentence2, flags=re.IGNORECASE | re.MULTILINE))
                cores = len(regex.groups())
                # Wrap every match in <b> tags
                new_sentence = re.sub('(' + parametros + ')', r'<b>\1</b>', sentence2, flags=re.IGNORECASE | re.MULTILINE)
                tokens = list()
                header = '!@#'
                for linha in new_sentence.splitlines():
                    if '# text = ' in linha:
                        header = linha
                    if 'b>' in linha and '\t' in linha:
                        tokens.append(
                            linha.split('\t')[1].replace('<b>', '').replace(
                                '</b>', ''))
                # Mirror the highlights into the "# text" header
                header2 = header
                for token in tokens:
                    header2 = re.sub(r'\b' + re.escape(token) + r'\b', '<b>' + token + '</b>', header2)
                # Color each capture group with its own marker
                for i in range(cores):
                    if regex[i + 1] != None and i < len(tabela):
                        token = regex[i + 1]
                        if '\t' in regex[i + 1]:
                            token = regex[i + 1].split('\t')[1]
                        header2 = re.sub(r'\b' + re.escape(token) + r'\b', tabela[i] + token + '/FONT', header2)
                new_sentence = new_sentence.replace(header, header2)
                output.append(new_sentence.splitlines())
    # Criterio 2: column z equals y, discarding children whose column z is in k
    if criterio == 2:
        # Query parts
        y = parametros.split('#')[0].strip()
        z = int(parametros.split('#')[1])
        k = [x.strip() for x in parametros.split('#')[2].split('|')]
        w = int(parametros.split('#')[3])
        for sentence in qualquercoisa:
            achei = 'nãoachei'
            descarta = False
            for i, linha in enumerate(sentence):
                if isinstance(linha, list):
                    #print(linha)
                    if y == linha[z - 1]:
                        achei = linha[0]
                        token = linha[1]
                        sentence[i] = '<b>' + '\t'.join(sentence[i]) + '</b>'
                        sentence[i] = sentence[i].split('\t')
                        #break
            if achei != 'nãoachei':
                # Highlight the matched word in the "# text" header
                for i, linha in enumerate(sentence):
                    if '# text' in linha:
                        sentence[i] = re.sub(r'\b' + re.escape(token) + r'\b', '<b>' + token + '</b>', sentence[i])
            if achei != 'nãoachei':
                # Discard if any dependent of the match has column z in k
                for linha in sentence:
                    if isinstance(linha, list):
                        for k_subitem in k:
                            if achei == linha[6] and k_subitem == linha[z - 1]:
                                descarta = True
                if descarta == False:
                    output.append(sentence)
                    casos += 1
    # Criterio 3: independent regexes, all must hold ("!" prefix negates)
    if criterio == 3:
        regras = [x.strip() for x in parametros.split('::')]
        for a, sentence in enumerate(qualquercoisa):
            sentence2 = sentence
            for b, linha in enumerate(sentence):
                linha2 = linha
                if isinstance(linha2, list):
                    sentence2[b] = "\t".join(sentence2[b])
            sentence2 = "\n".join(sentence2)
            descarta = False
            for regranum, regra in enumerate(regras):
                if regra[0] == '!':
                    regex = re.search(regra[1:], sentence2, flags=re.IGNORECASE | re.MULTILINE)
                    casos += len(
                        re.findall(regra[1:], sentence2, flags=re.I | re.M))
                else:
                    regex = re.search(regra, sentence2, flags=re.IGNORECASE | re.MULTILINE)
                    casos += len(
                        re.findall(regra, sentence2, flags=re.I | re.M))
                if (regra[0] == '!' and regex) or (regra[0] != '!'
                                                  and not regex):
                    descarta = True
                    break
                # Color each rule's matches with the rule's own marker
                sentence2 = re.sub('(' + regra + ')', tabela[regranum] + r'<b>\1</b>/FONT', sentence2, flags=re.IGNORECASE | re.MULTILINE)
            if not descarta:
                tokens = list()
                header = '!@#'
                for linha in sentence2.splitlines():
                    if '# text = ' in linha:
                        header = linha
                    if 'b>' in linha and '\t' in linha:
                        # (word, color-marker) pairs; reuse the last color when
                        # the line itself carries none
                        if '@' in linha:
                            tokens.append((linha.split('\t')[1].replace(
                                '<b>', '').replace('</b>', '').replace(
                                    '@' + linha.split('@')[1].split('/')[0] + '/', ''),
                                '@' + linha.split('@')[1].split('/')[0] + '/'))
                            lastcolor = '@' + linha.split('@')[1].split(
                                '/')[0] + '/'
                        else:
                            tokens.append((linha.split('\t')[1].replace(
                                '<b>', '').replace('</b>', ''), lastcolor))
                header2 = header
                for token in tokens:
                    header2 = re.sub(r'\b' + re.escape(token[0]) + r'\b', token[1] + '<b>' + token[0] + '</b>/FONT', header2)
                sentence2 = sentence2.replace(header, header2)
                output.append(sentence2.splitlines())
    # Criterio 4: child regex :: parent regex, matched when head ids link up
    if criterio == 4:
        filho = parametros.split('::')[0].strip()
        pai = parametros.split('::')[1].strip()
        negativo_filho = False
        negativo_pai = False
        if filho[0] == '!':
            negativo_filho = True
            filho = ''.join(filho[1:])
        if pai[0] == '!':
            negativo_pai = True
            pai = ''.join(pai[1:])
        for a, sentenca in enumerate(qualquercoisa):
            acheifilho = 'não'
            acheipai = 'não'
            descarta = False
            for b, linha in enumerate(sentenca):
                if isinstance(linha, list):
                    if re.search(filho, '\t'.join(linha), flags=re.IGNORECASE | re.MULTILINE):
                        acheifilho = (linha, b)
                if isinstance(linha, list):
                    if re.search(pai, '\t'.join(linha), flags=re.IGNORECASE | re.MULTILINE):
                        acheipai = (linha, b)
                # Positive match: child's head (col 7) is the parent's id (col 1)
                if not negativo_filho and not negativo_pai and acheipai != 'não' and acheifilho != 'não' and acheipai[
                        0][0] == acheifilho[0][6]:
                    for c, linha in enumerate(sentenca):
                        if '# text' in linha:
                            qualquercoisa[a][c] = re.sub(
                                r'\b' + re.escape(acheipai[0][1]) + r'\b',
                                '<b>@BLUE/' + acheipai[0][1] + '/FONT</b>',
                                qualquercoisa[a][c],
                                flags=re.IGNORECASE | re.MULTILINE)
                            qualquercoisa[a][c] = re.sub(
                                r'\b' + re.escape(acheifilho[0][1]) + r'\b',
                                '<b>@RED/' + acheifilho[0][1] + '/FONT</b>',
                                qualquercoisa[a][c],
                                flags=re.IGNORECASE | re.MULTILINE)
                            break
                    qualquercoisa[a][acheipai[1]] = (
                        '<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheipai[1]]) +
                        '/FONT</b>').split('\t')
                    qualquercoisa[a][acheifilho[1]] = (
                        '<b>@RED/' + '\t'.join(qualquercoisa[a][acheifilho[1]]) +
                        '/FONT</b>').split('\t')
                    output.append(qualquercoisa[a])
                    break
                elif negativo_filho and acheipai != 'não' and acheifilho != 'não' and acheipai[
                        0][0] == acheifilho[0][6]:
                    descarta = True
                    break
                elif negativo_pai and acheifilho != 'não' and acheipai != 'não' and acheipai[
                        0][0] == acheifilho[0][6]:
                    descarta = True
                    break
            # Negated variants: report the surviving side of the pair
            if negativo_filho and acheipai != 'não' and acheifilho != 'não' and not descarta:
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(
                            r'\b' + re.escape(acheipai[0][1]) + r'\b',
                            '<b>@BLUE/' + acheipai[0][1] + '/FONT</b>',
                            qualquercoisa[a][c],
                            flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheipai[1]] = (
                    '<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheipai[1]]) +
                    '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
            elif negativo_pai and acheipai != 'não' and acheifilho != 'não' and not descarta:
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(
                            r'\b' + re.escape(acheifilho[0][1]) + r'\b',
                            '<b>@BLUE/' + acheifilho[0][1] + '/FONT</b>',
                            qualquercoisa[a][c],
                            flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheifilho[1]] = (
                    '<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheifilho[1]]) +
                    '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
            elif negativo_filho and acheipai != 'não' and acheifilho == 'não':
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(
                            r'\b' + re.escape(acheipai[0][1]) + r'\b',
                            '<b>@BLUE/' + acheipai[0][1] + '/FONT</b>',
                            qualquercoisa[a][c],
                            flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheipai[1]] = (
                    '<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheipai[1]]) +
                    '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
            elif negativo_pai and acheifilho != 'não' and acheipai == 'não':
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(
                            r'\b' + re.escape(acheifilho[0][1]) + r'\b',
                            '<b>@RED/' + acheifilho[0][1] + '/FONT</b>',
                            qualquercoisa[a][c],
                            flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheifilho[1]] = (
                    '<b>@RED/' + '\t'.join(qualquercoisa[a][acheifilho[1]]) +
                    '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
    # Criterio 5: compile the query into Python and exec it per token
    start = time.time()
    if criterio == 5:
        pesquisa = parametros
        casos = 0
        # Rewrite the user query into a token.<attr> boolean expression
        pesquisa = pesquisa.replace(" = ", " == ")
        pesquisa = pesquisa.replace(" @", " ")
        if pesquisa[0] == "@":
            pesquisa = pesquisa[1:]
        # NOTE(review): this replace is a no-op as written; it presumably was
        # a double-space -> single-space squeeze collapsed by formatting loss
        pesquisa = pesquisa.replace(" ", " ").strip()
        pesquisa = pesquisa.replace(" == ", " == token.")
        pesquisa = pesquisa.replace(" === ", " === token.")
        pesquisa = pesquisa.replace(" != ", " != token.")
        pesquisa = pesquisa.replace(" !== ", " !== token.")
        pesquisa = pesquisa.replace(" > ", " > token.")
        pesquisa = pesquisa.replace(" < ", " < token.")
        pesquisa = pesquisa.replace(" >= ", " >= token.")
        pesquisa = pesquisa.replace(" <= ", " <= token.")
        pesquisa = "token." + pesquisa
        pesquisa = pesquisa.replace(" and ", " and token.")
        pesquisa = pesquisa.replace(" or ", " or token.")
        pesquisa = pesquisa.replace(" in ", " in token.")
        # Undo over-eager "token." prefixes on literals and punctuation
        pesquisa = pesquisa.replace('token."', '"')
        pesquisa = pesquisa.replace('token.[', '[')
        pesquisa = pesquisa.replace('token.(', '(')
        pesquisa = pesquisa.replace('token.not', 'not')
        pesquisa = pesquisa.replace('token.token.', 'token.')
        pesquisa = re.sub(r'token\.([1234567890])', r'\1', pesquisa)
        # ==/!= become anchored regex tests over the |-separated attr values;
        # ===/!== require ALL values to match instead of any
        pesquisa = re.sub(
            r'(\S+)\s==\s(\".*?\")',
            r'any( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("|") )',
            pesquisa)
        pesquisa = re.sub(
            r'(\S+)\s===\s(\".*?\")',
            r'all( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("|") )',
            pesquisa)
        pesquisa = re.sub(
            r'(\S+)\s!=\s(\".*?\")',
            r'not any( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("|") )',
            pesquisa)
        pesquisa = re.sub(
            r'(\S+)\s!==\s(\".*?\")',
            r'not all( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("|") )',
            pesquisa)
        pesquisa = pesquisa.strip()
        # Numeric comparisons on id/dephead need int() coercion
        if (".id" in pesquisa or ".dephead" in pesquisa) and (
                not "int(" in pesquisa) and ("<" in pesquisa or ">" in pesquisa):
            pesquisa = re.sub(r"(\b\S+\.(id|dephead)\b)", r"int(\1)", pesquisa)
        identificador = "token"
        if parametros[0] == "@":
            parametros = parametros[1:]
        # "@attr" marks which token attribute gets the <b> focus highlight
        arroba = parametros.split(
            " ")[0] if not ' @' in parametros else parametros.rsplit(
                " @", 1)[1].replace(
                    "int(", "").split(")")[0].split(" ")[0].replace("(", "")
        arroba = "token." + arroba
        arroba = arroba.replace("token.token", "token")
        arroba = arroba.rsplit(".", 1)[0]
        #if " in " in arroba: arroba = arroba.split(" in ")[1]
        #with open("expressao_busca.txt", "w") as f:
        #f.write(f"parametro: {parametros}\npesquisa: {pesquisa}\narroba: {arroba}")
        # Pre-filter: only sentences containing every quoted literal
        agilizar = re.findall(r'"([^"]*)"', parametros)
        #print(agilizar)
        #agilizado = [x for x in corpus.sentences.values() if all(re.search(y, x.to_str()) for y in agilizar)]
        #agilizado = corpus.sentences.values()
        agilizado = filter(
            lambda x: all(re.search(y, x.to_str()) for y in agilizar),
            corpus.sentences.values())
        #print(agilizado)
        for sentence in agilizado:
            if limit and limit == len(output):
                break
            # Build the per-sentence program to exec: iterate tokens, test the
            # compiled query, and mark hits in text and conllu lines
            condition = "global sim; global sentence2; sim = 0; sentence2 = copy.copy(sentence); sentence2.print = sentence2.tokens_to_str()"
            condition += '''
for ''' + identificador + ''' in sentence.tokens:
    try:
        if not "-" in '''+identificador+'''.id and (''' + pesquisa + ''') :
            sentence2.metadados['text'] = re.sub(r'\\b(' + re.escape('''+ identificador +'''.word) + r')\\b', r"@RED/\\1/FONT", sentence2.metadados['text'], flags=re.IGNORECASE|re.MULTILINE)
            sentence2.print = sentence2.print.replace('''+ identificador +'''.to_str(), "@RED/" + '''+ identificador +'''.to_str() + "/FONT")
'''# try/except because e.g. next_token may not exist at end of sentence
            if identificador + ".head_token" in pesquisa:
                # Also mark the matched token's head in blue
                condition += '''
            sentence2.metadados['text'] = re.sub(r'\\b(' + re.escape(''' + identificador + '''.head_token.word) + r')\\b', r"@BLUE/\\1/FONT", sentence2.metadados['text'], flags=re.IGNORECASE|re.MULTILINE)
            sentence2.print = sentence2.print.replace(''' + identificador + '''.head_token.to_str(), "@BLUE/" + ''' + identificador + '''.head_token.to_str() + "/FONT")'''
            condition += '''
            sentence2.metadados['text'] = re.sub(r'\\b(' + re.escape(''' + arroba + '''.word) + r')\\b', r"<b>\\1</b>", sentence2.metadados['text'], flags=re.IGNORECASE|re.MULTILINE)
            final = sentence2.metadados_to_str() + "\\n" + sentence2.print
            final = final.splitlines()
            arroba = ''' + arroba + '''.id
            for l, linha in enumerate(final):
                if linha.split("\\t")[0] == arroba or ("/" in linha.split("\\t")[0] and linha.split("\\t")[0].split("/")[1] == arroba):
                    final[l] = "<b>" + final[l] + "</b>"
            final = "\\n".join(final)'''
            exec(condition + '''
            output.append(final)
    except Exception as e:
        print(e)
        pass''')
        sys.stderr.write("\ncritério 5: " + str(time.time() - start))
    # Turn output into a list of sentence strings (no splitlines, no \t split)
    if criterio not in [5]:
        for a, sentence in enumerate(output):
            for b, linha in enumerate(sentence):
                if isinstance(linha, list):
                    sentence[b] = "\t".join(sentence[b])
            output[a] = "\n".join(sentence)
    start = time.time()
    for i, final in enumerate(output):
        if not fastSearch:
            # Build escaped (anotado) and marker-free (estruturado) Sentences.
            # NOTE(review): cgi.escape was removed in Python 3.8 — confirm the
            # runtime version; html.escape is the modern equivalent
            anotado = estrutura_ud.Sentence(recursivo=False)
            estruturado = estrutura_ud.Sentence(recursivo=False)
            anotado.build(
                cgi.escape(
                    final.replace('<b>', '@BOLD').replace(
                        '</b>', '/BOLD').replace(
                            '<font color=' + tabelaf['yellow'] + '>',
                            '@YELLOW/').replace(
                                '<font color=' + tabelaf['red'] + '>',
                                '@RED/').replace(
                                    '<font color=' + tabelaf['cyan'] + '>',
                                    '@CYAN/').replace(
                                        '<font color=' + tabelaf['blue'] + '>',
                                        '@BLUE/').replace(
                                            '<font color=' + tabelaf['purple'] + '>',
                                            '@PURPLE/').replace(
                                                '</font>', '/FONT')))
            # NOTE(review): the '">' after tabelaf['cyan'] below (vs '>'
            # everywhere else) looks like a typo — confirm
            estruturado.build(
                web.unescape(final).replace('<b>', '@BOLD').replace(
                    '</b>', '/BOLD').replace(
                        '<font color=' + tabelaf['yellow'] + '>',
                        '@YELLOW/').replace(
                            '<font color=' + tabelaf['red'] + '>',
                            '@RED/').replace(
                                '<font color=' + tabelaf['cyan'] + '">',
                                '@CYAN/').replace(
                                    '<font color=' + tabelaf['blue'] + '>',
                                    '@BLUE/').
                replace('<font color=' + tabelaf['purple'] + '>',
                        '@PURPLE/').replace('</font>', '/FONT').replace(
                            '@BOLD', '').replace('/BOLD', '').replace(
                                '@YELLOW/', '').replace('@RED/', '').replace(
                                    '@CYAN/', '').replace('@BLUE/', '').replace('@PURPLE/', '').replace(
                                        '/FONT', ''))
        else:
            anotado = ""
            estruturado = ""
        output[i] = {
            'resultado': final,
            'resultadoAnotado': anotado,
            'resultadoEstruturado': estruturado,
        }
    #sys.stderr.write("\nbuscaDicionarios: " + str(time.time() - start))
    return {'output': output, 'casos': casos}
antigoRootAUX = token.head_token.dephead token.head_token.dephead = antigoRootVERB token.dephead = antigoRootAUX token.upos = "SCONJ" token.deprel = "mark" for sentence in corpus.sentences.values(): for token in sentence.tokens: if token.upos == "AUX" and token.head_token.upos == "VERB" and token.lemma not in "ser|estar|ir|ter|haver".split( "|"): token.deprel = token.head_token.deprel token.upos = "VERB" token.head_token.deprel = "xcomp" token.dephead = token.head_token.dephead token.head_token.dephead = token.id return corpus if not os.system("sh scripts/1_criar_branch.sh " + sys.argv[1]): corpus = estrutura_ud.Corpus(recursivo=True) corpus.load( '/home/elvis/Dropbox/tronco/comcorhd.tronco.me/UD_Portuguese-Bosque/www/interrogar-ud/conllu/' + sys.argv[1] + ".conllu") corpus = appos_e_ccomp_parataxis(corpus) corpus = loc_verbal_aspectual(corpus) corpus.save( '/home/elvis/Dropbox/tronco/comcorhd.tronco.me/UD_Portuguese-Bosque/www/interrogar-ud/conllu/' + sys.argv[1] + ".conllu") os.system("sh scripts/2_1-commit.sh " + sys.argv[1] + " release_changes")
# Top-level CGI script state: resolve the sentence requested via the form and
# collect the left context (same id prefix, smaller sentence number).
from functions import prettyDate
from datetime import datetime
form = cgi.FieldStorage()
contextoEsquerda = ["", ""]
contextoDireita = ["", ""]
sent_id = form['sent_id'].value if 'sent_id' in form else ""
# NOTE(review): 'id' shadows the builtin, and 'conllu' shadows a same-named
# helper used elsewhere in the project — confirm nothing below needs either
id = form['id'].value if 'id' in form else ""
conllu = form['corpus'].value
# Split ids shaped like "prefix-12" into number ("12") and prefix ("prefix-")
numero = re.search(r'^\d+$', sent_id.rsplit('-', 1)[1])[0] if '-' in sent_id else sent_id
identificador = sent_id.rsplit("-", 1)[0] + "-" if '-' in sent_id else ""
# Load only sentences sharing the prefix
corpus = estrutura_ud.Corpus(recursivo=False, keywords=[re.escape(identificador)])
corpus.load('./interrogar-ud/conllu/' + form['corpus'].value)
contextoEsquerda = []
contextoDireita = []
# Sentences 1 .. numero-1 with the same prefix form the left context
for i in range(int(numero) - 1):
    if identificador + str(i + 1) in corpus.sentences:
        contextoEsquerda.append([
            identificador + str(i + 1),
            corpus.sentences[identificador + str(i + 1)].text
        ])
all_sentences = [
    x for x in corpus.sentences if x.rsplit("-", 1)[0] + "-" == identificador
]
def delete_sentence(filename, sent_id):
    """Delete sentence `sent_id` from the CoNLL-U file `filename`.

    The file is looked up under ./interrogar-ud/conllu/, loaded with the
    sent_id filter, modified in memory, and written back in place.

    Returns:
        True on completion (raises KeyError if the sentence is absent).
    """
    path = "./interrogar-ud/conllu/" + filename
    loaded = estrutura_ud.Corpus(recursivo=False, sent_id=sent_id)
    loaded.load(path)
    del loaded.sentences[sent_id]
    loaded.save(path)
    return True
def api():
    """Chatbot endpoint: parse the user input with UDPipe, try known
    linguistic patterns, then fall back to Wikipedia and pensador.com.

    Returns a JSON response with `api_response` and `bot_response`,
    with CORS enabled for any origin.
    """
    api_response = ""
    bot_response = ""
    # PARSE THE USER INPUT AND LOOK FOR PATTERNS
    if request.form.get('api_response'):
        api_response = app_dict[request.form.get('api_response')]
    if request.form.get('input'):
        # Parse the raw input with the public UDPipe REST service.
        udpipe_url = "http://lindat.mff.cuni.cz/services/udpipe/api/process?tokenizer&tagger&parser"
        udpipe_data = urllib.parse.urlencode({
            'data': request.form.get('input'),
            'model': "portuguese-bosque-ud-2.6-200830",
        }).encode('ascii')
        with urllib.request.urlopen(udpipe_url, udpipe_data) as f:
            udpipe_output = json.loads(remove_accents(
                f.read().decode('utf-8')))['result']
        text = estrutura_ud.Corpus(recursivo=True)
        text.build(udpipe_output)
        print("input: {}".format(text.to_str()))
        # try to find linguistic pattern: a pattern matches when ALL of its
        # queries find at least one hit in the parsed input.
        for pattern in linguistic_patterns:
            for query in linguistic_patterns[pattern]:
                #print(query)
                if all(
                        interrogar_UD.main(text, 5, x, fastSearch=True)
                        ['casos'] for x in query):
                    bot_response = responses[pattern]
                    break
        # try to answer from wikipedia
        if not bot_response:
            # get most awkward word in input: rarest NOUN/PROPN lemma (names)
            # and rarest VERB/copula lemma (verbs), ranked by corpus frequency.
            names = []
            verbs = []
            for sentence in text.sentences.values():
                for token in sentence.tokens:
                    clean_token = token.lemma.lower()
                    if token.upos in [
                            "NOUN", "PROPN"
                    ] and not clean_token in names and clean_token in frequency_of_important_words:
                        names.append(clean_token)
                    if (
                            token.upos in ["VERB"] or token.deprel in ["cop"]
                    ) and not clean_token in verbs and clean_token in frequency_of_important_words:
                        verbs.append(clean_token)
            # Lowest frequency first, so [0] is the rarest ("most awkward").
            most_awkward_name = sorted(
                names, key=lambda x: frequency_of_important_words[x])
            most_awkward_verb = sorted(
                verbs, key=lambda x: frequency_of_important_words[x])
            if most_awkward_name:
                try:
                    bot_response = wikipedia.summary(most_awkward_name[0],
                                                     sentences=2)
                except wikipedia.DisambiguationError as e:
                    # Ambiguous title: pick one of the suggestions at random.
                    s = random.choice(e.options)
                    bot_response = wikipedia.summary(s, sentences=2)
            # try to answer from "pensador" (quote site) using the rarest verb
            if not bot_response and most_awkward_verb:
                with urllib.request.urlopen("https://www.pensador.com/{}/".format(
                        most_awkward_verb[0])) as f:
                    soup = BeautifulSoup(f, "html.parser")
                    parse = soup.find_all("p", class_="frase")
                    bot_response = random.choice(parse).get_text() if parse else ""
    # no answer found
    if not bot_response:
        bot_response = "Desculpe, ainda não sei como responder..."
    response = jsonify({
        "api_response": api_response,
        "bot_response": bot_response
    })
    # Allow cross-origin use of this endpoint.
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def validate(conllu, sent_id=None, errorList="validar_UD.txt", noMissingToken=False):
    """Validate a CoNLL-U corpus against built-in checks plus the rules in
    `errorList`.

    Args:
        conllu: path to a .conllu file, or an already-loaded corpus object.
        sent_id: optional sentence filter passed through to loading/search.
        errorList: path to a rules file; lines with "erro: " define the error
            message (and optionally a "col|..." attribute) for the query
            lines that follow them.
        noMissingToken: skip the missing-token check when True.

    Returns:
        dict mapping error description -> list of occurrence dicts
        (sentence, sent_id, token index `t`, `attribute`).
    """
    errorDictionary = {}
    # Accept either a file path or a corpus object.
    if isinstance(conllu, str):
        corpus = estrutura_ud.Corpus(recursivo=True, sent_id=sent_id)
        corpus.load(conllu)
    else:
        corpus = conllu
    for sentence in corpus.sentences.values():
        # Check 1: the sentence text must end with the last token's form.
        if not sentence.text.endswith(sentence.tokens[-1].word):
            if not '1 - Sentença não termina com o último token' in errorDictionary:
                errorDictionary[
                    '1 - Sentença não termina com o último token'] = []
            errorDictionary[
                '1 - Sentença não termina com o último token'].append({
                    "sentence": "",
                    "sent_id": sentence.sent_id,
                    "t": sentence.map_token_id["1"],
                    "attribute": "id",
                })
        # Check: exactly one token with deprel "root" per sentence.
        temRoot = False
        tem2Root = False
        for token in sentence.tokens:
            if token.deprel == "root" and not temRoot:
                temRoot = True
            elif token.deprel == "root" and temRoot:
                tem2Root = True
        if tem2Root:
            if not '1 - Tem 2 root' in errorDictionary:
                errorDictionary['1 - Tem 2 root'] = []
            errorDictionary['1 - Tem 2 root'].append({
                "sentence": sentence,
                "sent_id": sentence.sent_id,
                "t": sentence.map_token_id["1"],
                "attribute": "id",
            })
        if not temRoot:
            if not '1 - Não tem root' in errorDictionary:
                errorDictionary['1 - Não tem root'] = []
            errorDictionary['1 - Não tem root'].append({
                "sentence": sentence,
                "sent_id": sentence.sent_id,
                "t": sentence.map_token_id["1"],
                "attribute": "id",
            })
    if not noMissingToken:
        # Any non-comment, non-empty line right after a blank line indicates
        # a token outside a sentence block.
        missingToken = re.findall(r"\n\n(?!#|$).*", corpus.to_str())
        if missingToken:
            if not '1 - Há tokens faltando no corpus' in errorDictionary:
                errorDictionary['1 - Há tokens faltando no corpus'] = []
            for missing in missingToken:
                errorDictionary['1 - Há tokens faltando no corpus'].append({
                    "sentence": "",
                    "sent_id": "<pre>" + missing + "</pre>",
                })
    # Load the rule file; the parameter name is reused for the line list.
    with open(errorList) as f:
        errorListFile = f.read().splitlines()
    errorList = []
    # NOTE(review): comprehension used for side effects — equivalent to
    # errorList = list(errorListFile); kept byte-identical.
    [errorList.append(x) for x in errorListFile]
    for error in errorList:
        if error and error[0] != "#":
            # "erro: " lines set the message (comment) and optional column for
            # subsequent query lines, then skip to the next line.
            if "erro: " in error:
                comment = error.split("erro: ")[1]
                comment = comment.strip()
                coluna = error.split(
                    "|", 1)[0] if "|" in error.split("erro: ")[0] else ""
                continue
            # Any other non-comment line is an interrogar_UD query; each hit is
            # recorded under the most recent "erro: " message.
            parameters = error.strip()
            for sentString in interrogar_UD.main(corpus, 5, parameters, 0,
                                                 sent_id,
                                                 separate=True)['output']:
                if not comment in errorDictionary:
                    errorDictionary[comment] = []
                sentence = estrutura_ud.Sentence(recursivo=True)
                sentence.build(fromInterrogarToHtml(sentString['resultado']))
                # First highlighted (<b>) token marks where the error is.
                tokenT = 0
                for t, token in enumerate(sentence.tokens):
                    if "<b>" in token.to_str():
                        tokenT = t
                        break
                errorDictionary[comment].append({
                    "t": tokenT,
                    "sentence": sentence,
                    "attribute": coluna,
                })
    return errorDictionary
def splitSentence(conllu, sent_id, sameSentenceId, newSentenceId, sameText, newText, token_id, conllu_completo="", form=False):
    """Split the sentence `sent_id` in two after token `token_id`.

    The tokens up to and including `token_id` stay in the original sentence
    (renamed to `sameSentenceId`, text set to `sameText`); the remaining
    tokens become a new sentence `newSentenceId` with text `newText`, with
    token ids and depheads shifted down by the number of removed tokens.

    Args:
        conllu: path to a .conllu file, or an estrutura_ud.Corpus object.
        conllu_completo: optional alternative path to load/save instead of
            `conllu`.
        form: when True, persist the result back to disk (atomically via a
            "_tokenization" temp file) and return the new sent_id; when
            False, return the modified corpus object.
    """
    if form:
        # Ensure tokenization.json exists, then load it (it is written back
        # unchanged below; kept for compatibility with the calling CGI).
        if not os.path.isfile("../cgi-bin/tokenization.json"):
            tokenization = {}
            with open("../cgi-bin/tokenization.json", "w") as f:
                json.dump(tokenization, f)
        with open("../cgi-bin/tokenization.json") as f:
            tokenization = json.load(f)
    if not isinstance(conllu, estrutura_ud.Corpus):
        # Bug fix: this branch previously also referenced an undefined name
        # `mergeSentencesId` (copy-paste from a merge routine), raising
        # NameError whenever a file path was passed; only the sentence being
        # split needs to be loaded recursively.
        corpus = estrutura_ud.Corpus(
            recursivo=False,
            any_of_keywords=[re.escape("# sent_id = " + sent_id + "\n")])
        corpus.load(conllu if not conllu_completo else conllu_completo)
    else:
        corpus = conllu
    # Build the new sentence as a copy of the original, renamed.
    new_sentence = estrutura_ud.Sentence(recursivo=True)
    new_sentence.build(corpus.sentences[sent_id].to_str())
    new_sentence.sent_id = newSentenceId
    new_sentence.metadados['sent_id'] = newSentenceId
    # Partition the tokens at token_id; count the real (non-range) tokens that
    # stay behind so the second half can be renumbered.
    new_token = False
    new_sentence_tokens = []
    old_sentence_tokens = []
    removed_tokens = 0
    for token in corpus.sentences[sent_id].tokens:
        if new_token:
            new_sentence_tokens.append(token)
        else:
            old_sentence_tokens.append(token)
        # Range tokens ("n-m") do not shift the numbering.
        if not '-' in token.id and not new_token:
            removed_tokens += 1
        if token.id == token_id:
            new_token = True
    new_sentence.tokens = new_sentence_tokens
    corpus.sentences[sent_id].tokens = old_sentence_tokens
    corpus.sentences[sent_id].metadados['text'] = sameText
    corpus.sentences[sent_id].text = sameText
    corpus.sentences[new_sentence.sent_id] = new_sentence
    corpus.sentences[new_sentence.sent_id].refresh_map_token_id()
    corpus.sentences[new_sentence.sent_id].metadados['text'] = newText
    corpus.sentences[new_sentence.sent_id].text = newText
    corpus.sent_id = sameSentenceId
    # Rename the first half under its new sent_id.
    corpus.sentences[sameSentenceId] = corpus.sentences.pop(sent_id)
    corpus.sentences[sameSentenceId].metadados['sent_id'] = sameSentenceId
    corpus.sentences[sameSentenceId].sent_id = sameSentenceId
    # Renumber the second half: shift ids (and both ends of range ids) and
    # depheads down; heads that fall below zero become root ("0").
    for token in corpus.sentences[new_sentence.sent_id].tokens:
        token.id = str(int(token.id) - removed_tokens) if not '-' in token.id else str(int(token.id.split("-")[0]) - removed_tokens) + "-" + str(int(token.id.split("-")[1]) - removed_tokens)
        if token.dephead not in ["_", "0"]:
            token.dephead = str(int(token.dephead) - removed_tokens)
            if int(token.dephead) < 0:
                token.dephead = "0"
    if form:
        with open("../cgi-bin/tokenization.json", "w") as f:
            json.dump(tokenization, f)
        # Save to a temp file, then swap it in place of the original.
        corpus.save(conllu + "_tokenization" if not conllu_completo else conllu_completo + "_tokenization")
        os.remove(conllu if not conllu_completo else conllu_completo)
        os.rename(conllu + "_tokenization" if not conllu_completo else conllu_completo + "_tokenization",
                  conllu if not conllu_completo else conllu_completo)
        return new_sentence.sent_id
    else:
        return corpus
"""Blank out XPOS on every token (and DEPS on multiword-range tokens) of the
CoNLL-U file given as the first CLI argument, printing the result to stdout."""
import estrutura_ud
import sys

loaded = estrutura_ud.Corpus()
loaded.load(sys.argv[1])
for sent in loaded.sentences.values():
    for tok in sent.tokens:
        # XPOS is always cleared; DEPS only for range tokens ("n-m").
        tok.xpos = "_"
        if '-' in tok.id:
            tok.deps = "_"
print(loaded.to_str())
metadados = {} if 'obra id=' in corpus: corpus_key = "obra" lista_tags = [] sentences = [] tokens = [] lista_faltantes = [] dep_lugar_errado = [] lista_contracoes = [] sent_id = 1 primeira_plus = False ja_primeira_plus = False mwe = False if os.path.isfile("corpus.conllu"): corpus = estrutura_ud.Corpus(recursivo=False) corpus.load("corpus.conllu") else: for l, linha in enumerate(corpus_splitlines): if l % 1000 == 0: sys.stderr.write("\nLinha processada: {}/{}".format( l, corpus_splitlines_len)) try: if linha.strip().startswith("<") and ' id="' in linha: metadados[linha.strip().split("<")[1].split(' id="') [0]] = re.search('<.*? id="([^"]+)"', linha)[1] if linha.strip().startswith("<") and not linha.strip(
def upload(alert="", success=""):
    """Upload page handler: serves the form (GET) and processes four POST
    variants — golden file upload, system file upload, model training
    request, and git-repository corpus creation.

    Renders upload.html with `alert` / `success` feedback messages.
    """
    if not google.authorized and GOOGLE_LOGIN:
        return redirect(url_for('google.login'))
    if request.method == "GET":
        return render_template('upload.html',
                               user=google.get('/oauth2/v2/userinfo').json(),
                               formDB=formDB())
    elif request.method == "POST" and 'goldenFile' in request.files:
        # --- golden corpus upload ---
        goldenFile = request.files.get('goldenFile')
        if goldenFile.filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS:
            goldenFileName = removerAcento(
                conllu(request.values.get('goldenName')).golden())
            # Destination folder depends on the INTERROGATORIO flag; reject if
            # a golden file with that name already exists there.
            if (INTERROGATORIO and not os.path.isfile(COMCORHD_FOLDER + '/' + goldenFileName)
                ) or (
                    not INTERROGATORIO and
                    not os.path.isfile(UPLOAD_FOLDER + '/' + goldenFileName)):
                goldenFile.save(
                    COMCORHD_FOLDER + '/' +
                    goldenFileName) if INTERROGATORIO else goldenFile.save(
                        UPLOAD_FOLDER + '/' + goldenFileName)
                # Keep a pristine copy as the "original" file.
                shutil.copyfile(
                    conllu(goldenFileName).findGolden(),
                    conllu(goldenFileName).findOriginal())
                textInterrogatorio = "(1) Realize buscas e edições no corpus pelo <a href='http://github.com/alvelvis/Interrogat-rio'>Interrogatório</a>, ou, (2) "
                success = f'"{goldenFileName}" enviado com sucesso! {textInterrogatorio if INTERROGATORIO else ""}Julgue-o na <a href="/corpus">página inicial</a>.'
            else:
                alert = "Arquivo golden já existe na pasta."
        else:
            alert = 'Extensão deve estar entre "' + ",".join(
                ALLOWED_EXTENSIONS) + '"'
    elif request.method == "POST" and 'systemFile' in request.files:
        # --- system (predicted) corpus upload for an existing golden ---
        goldenFile = request.values.get('sysGoldenFile')
        systemFile = request.files.get('systemFile')
        if systemFile.filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS:
            systemFileName = conllu(goldenFile).system()
            systemFile.save(UPLOAD_FOLDER + '/' + systemFileName)
            if not os.path.isfile(conllu(systemFileName).findOriginal()):
                shutil.copyfile(
                    conllu(systemFileName).findGolden(),
                    conllu(systemFileName).findOriginal())
            # Golden and system must have the same number of sentences to be
            # comparable; otherwise the system file is removed again.
            corpusGolden = estrutura_ud.Corpus(recursivo=False)
            corpusSystem = estrutura_ud.Corpus(recursivo=False)
            corpusGolden.load(conllu(goldenFile).findGolden())
            corpusSystem.load(conllu(goldenFile).findSystem())
            if len(corpusGolden.sentences) != len(corpusSystem.sentences):
                alert = "Arquivo sistema não tem o mesmo número de sentenças do arquivo golden."
                os.remove(conllu(goldenFile).findSystem())
            else:
                success = f'"{systemFileName}" enviado com sucesso! Julgue o corpus na <a href="/corpus">página inicial</a>.'
                addDatabase(goldenFile)
                #loadCorpus.submit(goldenFile)
            # Release the loaded corpora before rendering.
            del corpusGolden
            del corpusSystem
        else:
            alert = 'Extensão deve estar entre "' + ",".join(
                ALLOWED_EXTENSIONS) + '"'
    elif request.method == 'POST' and 'trainFile' in request.values:
        # --- launch a UDPipe training run (optionally cross-validated) ---
        # NOTE(review): request values are interpolated straight into shell
        # commands (os.system / Popen(shell=True)) — shell-injection risk if
        # this endpoint is reachable by untrusted users; confirm access model.
        corpusTemporario = False
        if os.path.isfile(COMCORHD_FOLDER + "/" +
                          conllu(request.values.get('trainFile')).golden()):
            # Copy the corpus into the working folder and schedule its removal.
            os.system(
                f'cp {COMCORHD_FOLDER + "/" + conllu(request.values.get("trainFile")).golden()} {UPLOAD_FOLDER}'
            )
            corpusTemporario = f"; rm {UPLOAD_FOLDER}/{conllu(request.values.get('trainFile')).golden()} &"
        if not request.values.get('crossvalidation'):
            Popen(
                f"cd {UPLOAD_FOLDER}; cp {conllu(request.values.get('trainFile')).golden()} {conllu(request.values.get('trainFile')).naked + '_test'}.conllu; sh udpipe.sh {conllu(request.values.get('trainFile')).naked + '_test'} {request.values.get('partitions')} 2>&1 | tee -a {conllu(request.values.get('trainFile')).naked + '_test'}_inProgress {corpusTemporario if corpusTemporario else '&'}",
                shell=True)
            nomeConllu = conllu(
                request.values.get('trainFile')).naked + "_test"
        else:
            Popen(
                f"cd {UPLOAD_FOLDER}; sh crossvalidation.sh {request.values.get('trainFile')} {request.values.get('partitions')} 2>&1 | tee -a {request.values.get('trainFile')}_inProgress {corpusTemporario if corpusTemporario else '&'}",
                shell=True)
            nomeConllu = conllu(request.values.get('trainFile')).naked
        # Register the training run in the database.
        novoCorpus = models.Corpus(
            name=nomeConllu,
            date=str(datetime.datetime.now()),
            sentences=0,
            about=request.values.get('about')
            if request.values.get('about') else ">",
            partitions=request.values.get('partitions'),
            author=google.get('/oauth2/v2/userinfo').json()['email']
            if GOOGLE_LOGIN else "",
            goldenAlias='Golden',
            systemAlias='Sistema')
        db.session.add(novoCorpus)
        db.session.commit()
        success = "Um modelo está sendo treinado a partir do corpus \"" + nomeConllu + "\". Acompanhe o status do treinamento na <a href='/'>página inicial do Julgamento.</a>"
    elif request.method == 'POST' and 'repoName' in request.values:
        # --- create a corpus (and optionally a git branch) from a repository ---
        sh = f"cd {UPLOAD_FOLDER}/repositories/{request.values.get('repoName')}; \
git pull; \
git checkout {request.values.get('repoCommit').split(' | commit ')[1]}; \
cat documents/*.conllu > {conllu(removerAcento(request.values.get('repoCorpusName'))).findGolden()}; \
cat documents/*.conllu > {conllu(removerAcento(request.values.get('repoCorpusName'))).findOriginal()}"
        if request.values.get('criarRamo'):
            sh += f"; git checkout -b {removerAcento(request.values.get('repoCorpusName'))}; \
git push --set-upstream origin {removerAcento(request.values.get('repoCorpusName'))}"
        if not os.path.isfile(
                f"{conllu(removerAcento(request.values.get('repoCorpusName'))).findGolden()}"
        ):
            os.system(sh)
            textInterrogatorio = "(1) Realize buscas e edições no corpus pelo <a href='http://github.com/alvelvis/interrogat-rio'>Interrogatório</a>, ou, (2) "
            success = f"Corpus {'e ramo ' if request.values.get('criarRamo') else ''}\"{removerAcento(request.values.get('repoCorpusName'))}\" criado{'s' if request.values.get('criarRamo') else ''} com sucesso! {textInterrogatorio if INTERROGATORIO else ''}Para prosseguir com o julgamento, treine um modelo a partir desse corpus clicando no menu lateral \"Treinar um modelo\" ou envie um arquivo sistema equivalente ao corpus."
        else:
            alert = f"Corpus com o nome '{removerAcento(request.values.get('repoCorpusName'))}' já existe."
    return render_template('upload.html',
                           alert=alert,
                           success=success,
                           user=google.get('/oauth2/v2/userinfo').json(),
                           formDB=formDB())
def get_percentages(ud1, ud2, output, coluna):
    """Compare a golden corpus (`ud1`) with a system corpus (`ud2`) on the
    CoNLL-U column `coluna` and write accuracy reports.

    Writes:
        <output>_sentence.txt — whole-sentence accuracy table.
        <output>_results.txt — per-class accuracy table.
        UAS/<deprel>.html and UAS/<deprel>_<pattern>.html — only when
        coluna == 8 (DEPREL), detailing dephead error distributions.

    Assumes both corpora have the same sentence ids and per-sentence token
    counts for the token-level loop — TODO confirm with callers.
    """
    if not os.path.isdir("UAS"):
        os.mkdir("UAS")
    UAS = dict()
    with open(ud1, "r") as f:
        golden = estrutura_ud.Corpus()
        golden.build(f.read())
    with open(ud2, "r") as f:
        system = estrutura_ud.Corpus()
        system.build(f.read())
    # dicionario: class value -> [total, col-matches] plus, for coluna 8,
    # [dephead-matches, col %, col+dephead %].
    dicionario = {}
    for sentid, sentence in golden.sentences.items():
        for t, token in enumerate(sentence.tokens):
            if not token.__dict__[feats[coluna].lower()] in dicionario:
                if coluna == 8:
                    dicionario[token.__dict__[feats[coluna].lower()]] = [0, 0, 0, 0, 0]
                    UAS[token.deprel] = dict()
                else:
                    dicionario[token.__dict__[feats[coluna].lower()]] = [0, 0, 0]
            dicionario[token.__dict__[feats[coluna].lower()]][0] += 1
            if system.sentences[sentid].tokens[t].__dict__[feats[coluna].lower()] == token.__dict__[feats[coluna].lower()]:
                dicionario[token.__dict__[feats[coluna].lower()]][1] += 1
            if coluna == 8:
                if system.sentences[sentid].tokens[t].dephead == token.dephead:
                    dicionario[token.deprel][2] += 1
                else:
                    # Record the head-UPOS mismatch pattern, tagged with the
                    # head's direction relative to the token (_L / _R).
                    tok_golden = token.head_token.upos
                    tok_system = system.sentences[sentid].tokens[t].head_token.upos
                    tok_golden += "_L" if int(token.head_token.id) < int(token.id) else "_R"
                    tok_system += "_L" if int(system.sentences[sentid].tokens[t].head_token.id) < int(system.sentences[sentid].tokens[t].id) else "_R"
                    if tok_golden + "/" + tok_system in UAS[token.deprel]:
                        UAS[token.deprel][tok_golden + "/" + tok_system]["qtd"] += 1
                    else:
                        UAS[token.deprel][tok_golden + "/" + tok_system] = {"qtd": 1, "sentences": []}
                    UAS[token.deprel][tok_golden + "/" + tok_system]["sentences"].append([sentence, system.sentences[sentid], token, token.head_token, system.sentences[sentid].tokens[t].head_token, system.sentences[sentid].tokens[t]])
    # Whole-sentence accuracy: a sentence counts as correct only when every
    # token matches on upos, dephead and deprel.
    sent_accuracy = [0, 0]
    for sentid, sentence in golden.sentences.items():
        if sentid in system.sentences and len(sentence.tokens) == len(system.sentences[sentid].tokens):
            sent_accuracy[0] += 1
            acertos = 0
            for t, token in enumerate(sentence.tokens):
                if system.sentences[sentid].tokens[t].upos == token.upos and system.sentences[sentid].tokens[t].dephead == token.dephead and system.sentences[sentid].tokens[t].deprel == token.deprel:
                    acertos += 1
            if acertos == len(sentence.tokens):
                sent_accuracy[1] += 1
                #print(sentid)
    sentence_accuracy = "<table><tr><th>Acurácia por sentença</th></tr><tr><th>Sentenças comparáveis</th><th>Sentenças corretas</th><th>Número relativo</th></tr><tr><td>{0}</td><td>{1}</td><td>{2}</td></tr></table>".format(sent_accuracy[0], sent_accuracy[1], str((sent_accuracy[1]/sent_accuracy[0])*100) + "%")
    with open(output + "_sentence.txt", "w") as f:
        f.write(sentence_accuracy)
    # Per-class accuracy table (extra dephead columns when coluna == 8).
    if coluna == 8:
        csv = ["{0:20} {1:10} {2:10} {3:10} {4:10} {5:10}".format("DEPREL", "GOLDEN", "ACERTOS_DEPREL", "ACERTOS_DEPREL_DEPHEAD", "PORCENTAGEM_DEPREL", "PORCENTAGEM_DEPREL_DEPHEAD")]
        for classe in sorted(dicionario):
            dicionario[classe][3] = (dicionario[classe][1] / dicionario[classe][0]) * 100
            dicionario[classe][4] = (dicionario[classe][2] / dicionario[classe][0]) * 100
            csv.append("{0:20} {1:10} {2:10} {3:10} {4:10} {5:10}".format(classe, str(dicionario[classe][0]), str(dicionario[classe][1]), str(dicionario[classe][2]), str(dicionario[classe][3]) + "%", str(dicionario[classe][4]) + "%"))
    else:
        csv = ["{0:20} {1:10} {2:10} {3:10}".format(feats[coluna], "GOLDEN", "ACERTOS", "PORCENTAGEM")]
        for classe in sorted(dicionario):
            dicionario[classe][2] = (dicionario[classe][1] / dicionario[classe][0]) * 100
            csv.append("{0:20} {1:10} {2:10} {3:10}".format(classe, str(dicionario[classe][0]), str(dicionario[classe][1]), str(dicionario[classe][2]) + "%"))
    with open(output + "_results.txt", "w") as f:
        f.write("\n".join(csv))
    # One HTML page per deprel summarising the error distribution, plus one
    # page per head-pattern listing the affected sentences.
    for deprel in UAS:
        total = 0
        for x in UAS[deprel].values():
            total += x["qtd"]
        escrever = ["<tr><td>{0}</td><td>{1}</td><td>{2}</td><td><a href='./{4}_{0}_{1}.html'>{3}%</a></td></tr>".format(padrao.split("/")[0], padrao.split("/")[1], quantidade["qtd"], (quantidade["qtd"]/total)*100, deprel) for padrao, quantidade in sorted(UAS[deprel].items(), key=lambda x: x[1]["qtd"], reverse=True)]
        with open("UAS/" + deprel + ".html", "w") as f:
            f.write("<body style='margin:20px'>" + str(dicionario[deprel][3] - dicionario[deprel][4]) + '% "' + deprel + '" com dephead divergentes<br><br><head><link href="../style.css" rel="stylesheet" type="text/css"></head>' + "<table><tr><td colspan='4'>Distribuição dos erros</td></tr><tr><th>GOLDEN</th><th>PREVISTO</th><th>N</th><th>%</th></tr>" + "\n".join(escrever) + "<tr><td colspan='2'>Total</td><td>" + str(total) + "</td></tr></table>")
        for padrao in UAS[deprel]:
            escrever = "<body style='margin:20px;'>DEPREL: " + deprel + "\n<br>GOLDEN HEAD: " + padrao.split("/")[0] + "\n<br>PREVISTO HEAD: " + padrao.split("/")[1] + '''\n<br><input type=button value='Copiar sent_id das frases' onclick='copiar_frases()'> <input id='input' style='display:none'><br><br>'''
            for n, sentence in enumerate(UAS[deprel][padrao]["sentences"]):
                # sentence = [golden sent, system sent, golden token,
                #             golden head, system head, system token].
                escrever += str(n+1) + " / " + str(len(UAS[deprel][padrao]["sentences"]))
                escrever += "\n<br><input type=hidden name=copiar_id value='"+sentence[0].sent_id.replace("'", "\\'")+"'># sent_id = " + sentence[0].sent_id
                text = sentence[0].text
                text = re.sub(r"\b" + re.escape(sentence[2].word) + r"\b", "<b>" + sentence[2].word + "</b>", text)
                escrever += "\n<input type=hidden name=negrito value='"+sentence[2].word.replace("'", "\\'")+"'>"
                text = re.sub(r"\b" + re.escape(sentence[3].word) + r"\b", "<font color=green>" + sentence[3].word + "</font>", text)
                text = re.sub(r"\b" + re.escape(sentence[4].word) + r"\b", "<font color=red>" + sentence[4].word + "</font>", text)
                escrever += "\n<br># text = " + text
                # Toggle buttons for showing the golden / predicted trees.
                escrever += '''\n<br><input type='button' id="but_'''+str(n)+'''" value='Mostrar GOLDEN' onclick='if(document.getElementById("pre_'''+str(n)+'''").style.display == "none") { document.getElementById("pre_''' + str(n) + '''").style.display = "block"; document.getElementById("but_'''+str(n)+'''").value = "Esconder GOLDEN"; } else { document.getElementById("pre_''' + str(n) + '''").style.display = "none"; document.getElementById("but_'''+str(n)+'''").value = "Mostrar GOLDEN"; }\'>'''
                escrever += '''\n<input type='button' id="but2_'''+str(n)+'''" value='Mostrar PREVISTO' onclick='if(document.getElementById("pre2_'''+str(n)+'''").style.display == "none") { document.getElementById("pre2_''' + str(n) + '''").style.display = "block"; document.getElementById("but2_'''+str(n)+'''").value = "Esconder PREVISTO"; } else { document.getElementById("pre2_''' + str(n) + '''").style.display = "none"; document.getElementById("but2_'''+str(n)+'''").value = "Mostrar PREVISTO"; }\'>'''
                escrever += '\n<pre id=pre_' + str(n) + ' style="display:none">GOLDEN<br>' + sentence[0].to_str().replace(sentence[2].to_str(), "<b>" + sentence[2].to_str() + "</b>").replace(sentence[3].to_str(), "<font color=green>" + sentence[3].to_str() + "</font>") + '</pre>'
                escrever += '\n<pre id=pre2_' + str(n) + ' style="display:none">PREVISTO<br>' + sentence[1].to_str().replace(sentence[5].to_str(), "<b>" + sentence[5].to_str() + "</b>").replace(sentence[4].to_str(), "<font color=red>" + sentence[4].to_str() + "</font>") + '</pre>'
                escrever += "\n<hr>"
            # Client-side helper to copy a regex of all listed sent_ids.
            escrever += ''' <script> String.prototype.rsplit = function(sep, maxsplit) { var split = this.split(sep); return maxsplit ? [ split.slice(0, -maxsplit).join(sep) ].concat(split.slice(-maxsplit)) : split; } function copiar_frases(){ document.getElementById("input").value = ""; document.getElementById("input").style.display = "inline"; var sentids, i, negritos; sentids = document.getElementsByName("copiar_id"); negritos = document.getElementsByName("negrito"); for (i = 0; i < sentids.length; i++) { document.getElementById("input").value = document.getElementById("input").value + "^# sent_id = " + sentids[i].value + "$(.*\\\\n)*.*" + negritos[i].value + "|"; } document.getElementById("input").value = document.getElementById("input").value.rsplit('|',1)[0]; } </script>'''
            with open("UAS/" + deprel + "_" + padrao.replace("/", "_") + ".html", "w") as f:
                f.write(escrever)
def sendAnnotation():
    """Persist token edits posted by the annotation UI ("<col><coluna><row>"
    form fields) into the golden and/or system CoNLL-U files, run the
    validator, and return a JSON summary.
    """
    if not google.authorized and GOOGLE_LOGIN:
        return redirect(url_for('google.login'))
    # When truthy, edits are mirrored into the system corpus too.
    goldenAndSystem = int(request.values.get('goldenAndSystem'))
    change = False
    attention = ""
    if any('<coluna>' in data and request.values.get(data)
           for data in request.values):
        sent_id = request.values.get('sent_id')
        # Target file: golden or system, depending on which pane was edited.
        arquivo = conllu(request.values.get('c')).findGolden(
        ) if request.values.get('ud') == 'ud1' else conllu(
            request.values.get('c')).findSystem()
        if goldenAndSystem:
            arquivoSystem = conllu(request.values.get('c')).findSystem()
        corpus = estrutura_ud.Corpus(recursivo=False,
                                     sent_id=request.values.get('sent_id'))
        corpus.load(arquivo)
        if goldenAndSystem:
            corpusSystem = estrutura_ud.Corpus(
                recursivo=False, sent_id=request.values.get('sent_id'))
            corpusSystem.load(arquivoSystem)
        for data in request.values:
            if '<coluna>' in data and request.values.get(data):
                # Field name is "<column><coluna><token index>"; numeric column
                # names are translated via dicionarioColunas.
                token = int(data.split('<coluna>')[1])
                coluna = data.split('<coluna>')[0] if not re.search(
                    r'^\d+$', data.split('<coluna>')[0], flags=re.MULTILINE
                ) else dicionarioColunas[data.split('<coluna>')[0]]
                valor = html.unescape(
                    request.values.get(data).replace("<br>", "").strip()).replace(
                        "<br>", "").strip()
                if request.values.get("headToken"):
                    # "_" head means root ("0").
                    headTokenNum = request.values.get(
                        "headToken"
                    ) if request.values.get("headToken") != "_" else "0"
                if corpus.sentences[sent_id].tokens[token].__dict__[
                        coluna] != valor:
                    corpus.sentences[sent_id].tokens[token].__dict__[
                        coluna] = valor
                    if request.values.get('headToken'):
                        corpus.sentences[sent_id].tokens[
                            token].dephead = headTokenNum
                    change = True
                    # Keep the in-memory shared corpus cache in sync.
                    allCorpora.corpora[conllu(
                        request.values.get("c")).golden()].sentences[
                            sent_id].tokens[token].__dict__[coluna] = valor
                if goldenAndSystem:
                    if corpusSystem.sentences[sent_id].tokens[token].__dict__[
                            coluna] != valor:
                        corpusSystem.sentences[sent_id].tokens[token].__dict__[
                            coluna] = valor
                        if request.values.get('headToken'):
                            # NOTE(review): this writes `corpus` (golden), not
                            # `corpusSystem` — looks like a copy-paste bug;
                            # confirm the system dephead should be updated here.
                            corpus.sentences[sent_id].tokens[
                                token].dephead = headTokenNum
                        change = True
                        allCorpora.corpora[conllu(
                            request.values.get("c")).system()].sentences[
                                sent_id].tokens[token].__dict__[coluna] = valor
        attention = []
        if change:
            corpus.save(arquivo)
            if goldenAndSystem:
                corpusSystem.save(arquivoSystem)
            # Re-validate the edited sentence and build warning HTML snippets.
            errors = validar_UD.validate(corpus,
                                         errorList=VALIDAR_UD,
                                         noMissingToken=True,
                                         sent_id=request.values.get('sent_id'))
            if errors:
                for error in errors:
                    if error.strip():
                        attention += [
                            f'<div class="alert alert-warning translateHtml" role="alert">Atenção: {error}</div><ul>'
                        ]
                        for value in errors[error]:
                            if value['sentence']:
                                attention += [
                                    "<li>" + functions.cleanEstruturaUD(
                                        value['sentence'].tokens[
                                            value['t']].id) + " / " +
                                    functions.cleanEstruturaUD(
                                        value['sentence'].tokens[
                                            value['t']].word) + " / " +
                                    functions.cleanEstruturaUD(
                                        value['sentence'].tokens[value['t']].
                                        __dict__[value['attribute']]) + "</li>"
                                ]
                        attention += ["</ul>"]
        del corpus
        # NOTE(review): local `corpusSystem` never appears in globals(), so
        # this del is effectively dead code — kept as-is.
        if "corpusSystem" in globals():
            del corpusSystem
        attention = "\n".join(attention)
    return jsonify({
        'change': change,
        'data': prettyDate(datetime.datetime.now()).prettyDateDMAH(),
        'attention': attention,
        'success': True,
    })
def main(arquivoUD, criterio, parametros, limit=0, sent_id="", fastSearch=False, separate=False):
    """Search a UD (CoNLL-U) corpus and return highlighted matches.

    arquivoUD: a .conllu file path, the raw file contents, or an already
        loaded corpus object — which is accepted depends on `criterio`.
    criterio: search strategy:
        1 - raw regex over each sentence block;
        2 - "y#z#k#w" query: token whose column z matches y and that has no
            dependent whose column w matches k;
        3 - independent regexes joined by '::' (leading '!' negates a rule);
        4 - parent/child ("filho::pai") dependency query ('!' negates);
        5 - Python-like boolean expression over token attributes, compiled
            into source text and run via exec().
    limit: stop after this many matching sentences (0 = no limit).
    sent_id: restrict corpus loading to a single sentence id.
    fastSearch: if truthy, skip building per-match Sentence objects.
    separate: criterion 5 only — one output entry per matching token
        instead of per sentence.

    Returns a dict with 'output', 'casos' (match count), 'sentences'
    (sent_id -> output index; empty when fastSearch) and 'parameters'.
    """
    parametros = parametros.strip()
    pesquisa = ""
    if criterio in [1]:
        import estrutura_ud
        # Criterion 1 works on the raw corpus text.
        if isinstance(arquivoUD, str):
            with open(arquivoUD) as f:
                arquivoUD = f.read()
        else:
            arquivoUD = arquivoUD.to_str()
    # Read the UD file
    if criterio in [3, 4]:
        import estrutura_dados
        import estrutura_ud
        qualquercoisa = estrutura_dados.LerUD(arquivoUD)
    if criterio in [2]:
        import estrutura_ud
        if isinstance(arquivoUD, str):
            # Recursive loading is only needed when the query follows
            # head/next/previous token links.
            if "head_token" in parametros or "next_token" in parametros or "previous_token" in parametros:
                corpus = estrutura_ud.Corpus(recursivo=True, sent_id=sent_id)
            else:
                corpus = estrutura_ud.Corpus(recursivo=False, sent_id=sent_id)
            start = time.time()
            corpus.load(arquivoUD)
            sys.stderr.write("\ncorpus.build: " + str(time.time() - start))
        else:
            corpus = arquivoUD
    # List that is sent either to the terminal or to the HTML page
    output = list()
    casos = 0
    # Regex: highlight color markers, indexed by capture-group number
    tabela = ['@YELLOW/', '@PURPLE/', '@BLUE/', '@RED/', '@CYAN/']
    if criterio == 1:
        start = time.time()
        sentence = ""
        f = arquivoUD.splitlines(keepends=True)
        for line in f:
            if line.strip():
                # Accumulate lines until a blank line ends the sentence block.
                sentence += line
            else:
                if limit and len(output) == limit:
                    break
                regex = re.findall('(' + parametros + ')', sentence, flags=re.I)
                if regex:
                    casos += len(regex)
                    new_sentence = re.sub('(' + parametros + ')', r'<b>\1</b>', sentence, flags=re.I)
                    tokens = list()
                    # '!@#' is a placeholder when the sentence has no "# text =" line.
                    header = '!@#' if not '# text = ' in new_sentence else '# text = ' + new_sentence.split("# text = ")[1].split("\n")[0]
                    for linha in new_sentence.splitlines():
                        if 'b>' in linha and '\t' in linha:
                            if '\\' in linha:
                                # Escape backreference-like sequences so re.sub below is safe.
                                linha = re.sub(r"\\(\d+)", r"\\\\\1", linha, flags=re.I)
                            tokens.append(linha.split('\t')[1].replace('<b>', '').replace('</b>', ''))
                    # Mirror the token highlights into the "# text =" header.
                    header2 = header
                    for token in tokens:
                        header2 = re.sub(r'\b' + re.escape(token) + r'\b', '<b>' + token + '</b>', header2, flags=re.I)
                    for reg in regex:
                        if not isinstance(reg, str):
                            # Tuple result => the pattern had groups; color each group.
                            for i, grupo in enumerate(reg):
                                if i != 0:
                                    if grupo and i - 1 < len(tabela):
                                        token = ""
                                        if '\t' in grupo:
                                            token = grupo.split('\t')[1]
                                        if token:
                                            header2 = re.sub(r'\b' + re.escape(token) + r'\b', tabela[i - 1] + token + '/FONT', header2, flags=re.I)
                    new_sentence = new_sentence.replace(header, header2)
                    output.append(new_sentence)
                sentence = ""
        sys.stderr.write(f"\ncriterio 1: {time.time() - start}")
    # Criterion 2: token with a given column value and no matching dependent
    if criterio == 2:
        # Query variables, format "y#z#k#w"
        y = parametros.split('#')[0].strip()
        z = int(parametros.split('#')[1].strip())
        k = parametros.split('#')[2].strip()
        w = int(parametros.split('#')[3].strip())
        for sentence in corpus.sentences.values():
            for token in sentence.tokens:
                colunas = token.to_str().split("\t")
                if any(colunas[z - 1] == x for x in y.split("|")):
                    descarta = False
                    for _token in sentence.tokens:
                        _colunas = _token.to_str().split("\t")
                        if any(_colunas[w - 1] == x for x in k.split("|")) and _token.dephead == token.id:
                            descarta = True
                    if not descarta:
                        output.append(re.sub(r"\b" + re.escape(token.word) + r"\b", "<b>" + re.escape(token.word) + "</b>", sentence.to_str()))
                        casos += 1
    # Criterion 3: independent regexes
    if criterio == 3:
        regras = [x.strip() for x in parametros.split('::')]
        for a, sentence in enumerate(qualquercoisa):
            sentence2 = sentence
            # Flatten the structured sentence (lists of columns) to text.
            for b, linha in enumerate(sentence):
                linha2 = linha
                if isinstance(linha2, list):
                    sentence2[b] = "\t".join(sentence2[b])
            sentence2 = "\n".join(sentence2)
            descarta = False
            for regranum, regra in enumerate(regras):
                if regra[0] == '!':
                    # Negated rule: sentence must NOT match regra[1:].
                    regex = re.search(regra[1:], sentence2, flags=re.IGNORECASE | re.MULTILINE)
                    casos += len(re.findall(regra[1:], sentence2, flags=re.I | re.M))
                else:
                    regex = re.search(regra, sentence2, flags=re.IGNORECASE | re.MULTILINE)
                    casos += len(re.findall(regra, sentence2, flags=re.I | re.M))
                if (regra[0] == '!' and regex) or (regra[0] != '!' and not regex):
                    descarta = True
                    break
                # Color each rule's matches with its own marker.
                sentence2 = re.sub('(' + regra + ')', tabela[regranum] + r'<b>\1</b>/FONT', sentence2, flags=re.IGNORECASE | re.MULTILINE)
            if not descarta:
                tokens = list()
                header = '!@#'
                for linha in sentence2.splitlines():
                    if '# text = ' in linha:
                        header = linha
                    if 'b>' in linha and '\t' in linha:
                        if '@' in linha:
                            # Token carries a color marker: remember (word, color).
                            tokens.append((linha.split('\t')[1].replace('<b>', '').replace('</b>', '').replace('@' + linha.split('@')[1].split('/')[0] + '/', ''), '@' + linha.split('@')[1].split('/')[0] + '/'))
                            lastcolor = '@' + linha.split('@')[1].split('/')[0] + '/'
                        else:
                            # NOTE(review): relies on `lastcolor` set by a previous
                            # iteration — a first colorless match would NameError.
                            tokens.append((linha.split('\t')[1].replace('<b>', '').replace('</b>', ''), lastcolor))
                header2 = header
                for token in tokens:
                    header2 = re.sub(r'\b' + re.escape(token[0]) + r'\b', token[1] + '<b>' + token[0] + '</b>/FONT', header2)
                sentence2 = sentence2.replace(header, header2)
                output.append(sentence2.splitlines())
    # Criterion 4: parents and children ("filho::pai")
    if criterio == 4:
        filho = parametros.split('::')[0].strip()
        pai = parametros.split('::')[1].strip()
        negativo_filho = False
        negativo_pai = False
        if filho[0] == '!':
            negativo_filho = True
            filho = ''.join(filho[1:])
        if pai[0] == '!':
            negativo_pai = True
            pai = ''.join(pai[1:])
        for a, sentenca in enumerate(qualquercoisa):
            # 'não' ("no") doubles as the not-found sentinel.
            acheifilho = 'não'
            acheipai = 'não'
            descarta = False
            for b, linha in enumerate(sentenca):
                if isinstance(linha, list):
                    if re.search(filho, '\t'.join(linha), flags=re.IGNORECASE | re.MULTILINE):
                        acheifilho = (linha, b)
                if isinstance(linha, list):
                    if re.search(pai, '\t'.join(linha), flags=re.IGNORECASE | re.MULTILINE):
                        acheipai = (linha, b)
                # Positive query: child's head column (7) equals parent's id (1).
                if not negativo_filho and not negativo_pai and acheipai != 'não' and acheifilho != 'não' and acheipai[0][0] == acheifilho[0][6]:
                    for c, linha in enumerate(sentenca):
                        if '# text' in linha:
                            qualquercoisa[a][c] = re.sub(r'\b' + re.escape(acheipai[0][1]) + r'\b', '<b>@BLUE/' + acheipai[0][1] + '/FONT</b>', qualquercoisa[a][c], flags=re.IGNORECASE | re.MULTILINE)
                            qualquercoisa[a][c] = re.sub(r'\b' + re.escape(acheifilho[0][1]) + r'\b', '<b>@RED/' + acheifilho[0][1] + '/FONT</b>', qualquercoisa[a][c], flags=re.IGNORECASE | re.MULTILINE)
                            break
                    qualquercoisa[a][acheipai[1]] = ('<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheipai[1]]) + '/FONT</b>').split('\t')
                    qualquercoisa[a][acheifilho[1]] = ('<b>@RED/' + '\t'.join(qualquercoisa[a][acheifilho[1]]) + '/FONT</b>').split('\t')
                    output.append(qualquercoisa[a])
                    break
                elif negativo_filho and acheipai != 'não' and acheifilho != 'não' and acheipai[0][0] == acheifilho[0][6]:
                    # Negated child actually attached to parent -> discard sentence.
                    descarta = True
                    break
                elif negativo_pai and acheifilho != 'não' and acheipai != 'não' and acheipai[0][0] == acheifilho[0][6]:
                    descarta = True
                    break
            # Post-scan reporting for the negated variants.
            if negativo_filho and acheipai != 'não' and acheifilho != 'não' and not descarta:
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(r'\b' + re.escape(acheipai[0][1]) + r'\b', '<b>@BLUE/' + acheipai[0][1] + '/FONT</b>', qualquercoisa[a][c], flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheipai[1]] = ('<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheipai[1]]) + '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
            elif negativo_pai and acheipai != 'não' and acheifilho != 'não' and not descarta:
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(r'\b' + re.escape(acheifilho[0][1]) + r'\b', '<b>@BLUE/' + acheifilho[0][1] + '/FONT</b>', qualquercoisa[a][c], flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheifilho[1]] = ('<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheifilho[1]]) + '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
            elif negativo_filho and acheipai != 'não' and acheifilho == 'não':
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(r'\b' + re.escape(acheipai[0][1]) + r'\b', '<b>@BLUE/' + acheipai[0][1] + '/FONT</b>', qualquercoisa[a][c], flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheipai[1]] = ('<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheipai[1]]) + '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
            elif negativo_pai and acheifilho != 'não' and acheipai == 'não':
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(r'\b' + re.escape(acheifilho[0][1]) + r'\b', '<b>@RED/' + acheifilho[0][1] + '/FONT</b>', qualquercoisa[a][c], flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheifilho[1]] = ('<b>@RED/' + '\t'.join(qualquercoisa[a][acheifilho[1]]) + '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
    # Criterion 5: Python-like expression, compiled to source and exec()'d
    if criterio == 5:
        parametros = parametros.split(" and ")
        for t, parametro in enumerate(parametros):
            # Bare quoted words ("a" "b") become word-equality conditions on
            # consecutive tokens (word, next_token.word, ...).
            if not any(x in parametros[t] for x in [' = ', '==', '!=', ' < ', ' > ']):
                parametros[t] = re.findall(r'@?"[^"]+?"', parametros[t].replace(" ", ""))
                parametros[t] = [("@" if "@" in x else "") + ("next_token." * i) + "word = " + x.replace("@", "") for i, x in enumerate(parametros[t]) if x]
                parametros[t] = " and ".join(parametros[t])
        parametros = " and ".join(parametros)
        # The following chain of replaces rewrites the user query into a
        # Python expression over `token` / `sentence` attributes.
        pesquisa = parametros
        pesquisa = pesquisa.replace(" = ", " == ")
        pesquisa = pesquisa.replace(" @", " ")
        if pesquisa[0] == "@":
            pesquisa = pesquisa[1:]
        # NOTE(review): first argument looks like it was meant to be a double
        # space (whitespace collapsing) — as written this replace is a no-op.
        pesquisa = pesquisa.replace(" ", " ").strip()
        pesquisa = pesquisa.replace(" == ", " == token.")
        pesquisa = pesquisa.replace(" === ", " === token.")
        pesquisa = pesquisa.replace(" != ", " != token.")
        pesquisa = pesquisa.replace(" !== ", " !== token.")
        pesquisa = pesquisa.replace(" > ", " > token.")
        pesquisa = pesquisa.replace(" < ", " < token.")
        pesquisa = pesquisa.replace(" >= ", " >= token.")
        pesquisa = pesquisa.replace(" <= ", " <= token.")
        pesquisa = "token." + pesquisa
        pesquisa = pesquisa.replace(" and ", " and token.")
        pesquisa = pesquisa.replace(" or ", " or token.")
        pesquisa = pesquisa.replace(" in ", " in token.")
        pesquisa = pesquisa.replace(" text ", " sentence.text ")
        pesquisa = pesquisa.replace(" sent_id ", " sentence.sent_id ")
        # Undo over-eager "token." prefixes on literals, brackets, keywords…
        pesquisa = pesquisa.replace('token."', '"')
        pesquisa = pesquisa.replace('token.[', '[')
        pesquisa = pesquisa.replace('token.(', '(')
        pesquisa = pesquisa.replace('token.not', 'not')
        pesquisa = pesquisa.replace('token.token.', 'token.')
        pesquisa = pesquisa.replace('token.sentence.', 'sentence.')
        pesquisa = pesquisa.replace("token.text", "sentence.text")
        pesquisa = pesquisa.replace("token.sent_id", "sentence.sent_id")
        pesquisa = pesquisa.replace('token.int(', 'int(')
        #pesquisa = pesquisa.replace("token.and", "and")
        # pesquisa = pesquisa.replace("== int(", "==int(")
        pesquisa = re.sub(r'token\.([1234567890])', r'\1', pesquisa)
        # Plain "token.col == value" conjuncts can be answered from the
        # per-column index instead of scanning every token.
        indexed_conditions = {
            x.split(" == ")[0].strip().split("token.", 1)[1]:
            x.split(" == ")[1].strip().replace('"', '')
            for x in pesquisa.split(" and ")
            if ' == ' in x and 'token.' in x and not any(
                y in x for y in ["head_token", "previous_token", "next_token"])
        }  #["head_token.head", "head_token.next", "head_token.previous", "next_token.head", "next_token.next", "next_token.previous", "previous_token.head", "previous_token.next", "previous_token.previous"])}
        # Attribute access goes through __dict__ so column names with
        # arbitrary characters still work inside the exec'd code.
        pesquisa = re.sub(r"token\.([^. ]+?)(\s|$)", r"token.__dict__['\1']\2", pesquisa)
        pesquisa = re.sub(
            r'(\S+)\s==\s(\".*?\")',
            r'any( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("ddd") )',
            pesquisa)  #ddd is provisional while splitting on the pipe bar does not work
        pesquisa = re.sub(
            r'(\S+)\s===\s(\".*?\")',
            r'all( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("|") )',
            pesquisa)
        pesquisa = re.sub(
            r'(\S+)\s!=\s(\".*?\")',
            r'not any( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("ddd") )',
            pesquisa)
        pesquisa = re.sub(
            r'(\S+)\s!==\s(\".*?\")',
            r'not all( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("|") )',
            pesquisa)
        pesquisa = pesquisa.strip()
        # id/dephead are stored as strings; wrap in int() for < / > compares.
        if (".__dict__['id']" in pesquisa or ".__dict__['dephead']" in pesquisa
            ) and (not "int(" in pesquisa) and (" < " in pesquisa or " > " in pesquisa):
            pesquisa = re.sub(r"(\S+\.__dict__\['(id|dephead)'\])", r"int(\1)", pesquisa)
        identificador = "token"  # NOTE(review): appears unused below
        if parametros[0] == "@":
            parametros = parametros[1:]
        # "@expr" marks which token of the match should be bolded (arroba).
        arroba = parametros.split(
            " ")[0] if not ' @' in parametros else parametros.rsplit(
                " @", 1)[1].replace(
                    "int(", "").split(")")[0].split(" ")[0].replace("(", "")
        arroba = "token." + arroba
        arroba = arroba.replace("token.token", "token")
        arroba = arroba.rsplit(".", 1)[0]
        # Quoted literals are passed as keywords so the loader can pre-filter.
        agilizar = re.findall(r'"([^"]*)"', parametros)
        import estrutura_ud
        if isinstance(arquivoUD, str):
            if "head_token" in parametros or "next_token" in parametros or "previous_token" in parametros:
                corpus = estrutura_ud.Corpus(recursivo=True, sent_id=sent_id, keywords=agilizar)
            else:
                corpus = estrutura_ud.Corpus(recursivo=False, sent_id=sent_id, keywords=agilizar)
            start = time.time()
            corpus.load(arquivoUD)
            sys.stderr.write("\ncorpus.build: " + str(time.time() - start))
        else:
            corpus = arquivoUD
        start = time.time()
        casos = []
        t1 = time.time()
        if indexed_conditions:
            # Use the corpus' per-column index ("processed") to collect only
            # the candidate tokens, keyed "sent_id<tok>position".
            sentences = defaultdict(list)
            tokens = defaultdict(list)
            values = {}
            for sent_id in corpus.sentences:
                for col in indexed_conditions:
                    if col in corpus.sentences[sent_id].processed:
                        values = re.findall(
                            r"\n(" + indexed_conditions[col] + r")\n",
                            "\n" + "\n\n".join(
                                list(corpus.sentences[sent_id].processed[col])) + "\n")
                        for value in values:
                            if value:
                                if not isinstance(value, str):
                                    value = value[0]
                                tokens[col].extend(
                                    corpus.sentences[sent_id].processed[col][value])
            for col in tokens:
                tokens[col] = set(tokens[col])
            tokens_filtered = []
            if tokens.values():
                # A token must satisfy every indexed condition.
                tokens_filtered = set.intersection(*list(tokens.values()))
            '''
            priority = ['lemma', 'word', 'deprel', 'upos']
            priority_possible = []
            for col in indexed_conditions:
                if col in priority:
                    priority_possible.append(priority.index(col))
            if priority_possible:
                col = priority[min(priority_possible)]
            else:
                col = list(indexed_conditions)[0]
            cols = list(indexed_conditions)
            cols.remove(col)
            '''
            for token in tokens_filtered:
                sent_id = token.split("<tok>")[0]
                t = int(token.split("<tok>")[1])
                sentences[sent_id].append(t)
        else:
            sentences = corpus.sentences
        sys.stderr.write(f"\nindexing: {time.time() - t1}")
        t1 = time.time()
        for sent_id in sentences:
            sentence = corpus.sentences[sent_id]
            sentence2 = sentence
            # Words/ids of real tokens only (skip ranges "1-2" and empty "1.1").
            clean_text = [
                x.word for x in sentence2.tokens
                if not '-' in x.id and not '.' in x.id
            ]
            clean_id = [
                x.id for x in sentence2.tokens
                if not '-' in x.id and not '.' in x.id
            ]
            corresponde = 0
            tokens = sentence2.tokens_to_str()
            map_id = {x: t for t, x in enumerate(clean_id)}
            if limit and limit == len(output):
                break
            # Build the per-sentence evaluation program and exec() it; the
            # compiled `pesquisa` expression is spliced into the source.
            condition = "global sim; sim = 0"
            condition += '''
if not indexed_conditions:
    available_tokens = list(range(len(sentence.tokens)))
else:
    available_tokens = sentences[sent_id]
for token_t in available_tokens:
    token = sentence.tokens[token_t]
    try:
        if (not "-" in token.id and not '.' in token.id and (''' + pesquisa + ''')) :
            corresponde = 1
            clean_text[map_id[token.id]] = "@BLUE/" + clean_text[map_id[token.id]] + "/FONT"
            tokens = tokens.replace(token.string, "@BLUE/" + token.string + "/FONT")
'''  # try because e.g. there is no next_token at the end of a sentence
            if "token.head_token" in pesquisa:
                condition += '''
            clean_text[map_id[token.head_token.id]] = "@RED/" + clean_text[map_id[token.head_token.id]] + "/FONT"
            tokens = tokens.replace(token.head_token.string, "@RED/" + token.head_token.string + "/FONT")'''
            if "token.next_token" in pesquisa:
                condition += '''
            clean_text[map_id[token.next_token.id]] = "@BLUE/" + clean_text[map_id[token.next_token.id]] + "/FONT"
            tokens = tokens.replace(token.next_token.string, "@BLUE/" + token.next_token.string + "/FONT")'''
            if "token.previous_token" in pesquisa:
                condition += '''
            clean_text[map_id[token.previous_token.id]] = "@BLUE/" + clean_text[map_id[token.previous_token.id]] + "/FONT"
            tokens = tokens.replace(token.previous_token.string, "@BLUE/" + token.previous_token.string + "/FONT")'''
            condition += '''
            clean_text[map_id[''' + arroba + '''.id]] = "<b>" + clean_text[map_id[''' + arroba + '''.id]] + "</b>"'''
            # SECURITY NOTE: exec() runs text derived from the user query —
            # only expose this to trusted users.
            exec(condition + '''
            casos.append(1)
            arroba_id = ''' + arroba + '''.id
            tokens = tokens.splitlines()
            for l, linha in enumerate(tokens):
                if linha.split("\\t")[0] == arroba_id or ("/" in linha.split("\\t")[0] and linha.split("\\t")[0].split("/")[1] == arroba_id):
                    tokens[l] = "<b>" + tokens[l] + "</b>"
            tokens = "\\n".join(tokens)
            if separate:
                corresponde = 0
                final = "# clean_text = " + " ".join(clean_text) + "\\n" + sentence2.metadados_to_str() + "\\n" + tokens
                output.append(final)
    except Exception as e:
        print(str(e))
        print(token.to_str())
        pass
if corresponde and not separate:
    corresponde = 0
    final = "# clean_text = " + " ".join(clean_text) + "\\n" + sentence2.metadados_to_str() + "\\n" + tokens
    output.append(final)''')
        sys.stderr.write("\ncritério 5: " + str(time.time() - start))
        casos = len(casos)
        sys.stderr.write(f"\nfor each sentence: {time.time() - t1}")
    # Turn the output into a list of sentence strings (no splitlines, no \t split)
    if criterio not in [5, 2, 1]:
        for a, sentence in enumerate(output):
            for b, linha in enumerate(sentence):
                if isinstance(linha, list):
                    sentence[b] = "\t".join(sentence[b])
            output[a] = "\n".join(sentence)
    start = time.time()
    for i, final in enumerate(output):
        if not fastSearch:
            # Build Sentence objects both with (anotado) and without
            # (estruturado) the highlight markers.
            anotado = estrutura_ud.Sentence(recursivo=False)
            estruturado = estrutura_ud.Sentence(recursivo=False)
            anotado.build(
                web.escape(
                    final.replace('<b>', '@BOLD').replace(
                        '</b>', '/BOLD').replace(
                            '<font color=' + tabelaf['yellow'] + '>',
                            '@YELLOW/').replace(
                                '<font color=' + tabelaf['red'] + '>',
                                '@RED/').replace(
                                    '<font color=' + tabelaf['cyan'] + '>',
                                    '@CYAN/').replace(
                                        '<font color=' + tabelaf['blue'] + '>',
                                        '@BLUE/').replace(
                                            '<font color=' + tabelaf['purple'] + '>',
                                            '@PURPLE/').replace(
                                                '</font>', '/FONT')))
            # NOTE(review): the '">' after tabelaf['cyan'] below looks like a
            # stray quote compared with the other colors — verify.
            estruturado.build(
                web.unescape(final).replace('<b>', '@BOLD').replace(
                    '</b>', '/BOLD').replace(
                        '<font color=' + tabelaf['yellow'] + '>',
                        '@YELLOW/').replace(
                            '<font color=' + tabelaf['red'] + '>',
                            '@RED/').replace(
                                '<font color=' + tabelaf['cyan'] + '">',
                                '@CYAN/').replace(
                                    '<font color=' + tabelaf['blue'] + '>',
                                    '@BLUE/').replace(
                                        '<font color=' + tabelaf['purple'] + '>',
                                        '@PURPLE/').replace('</font>', '/FONT').replace(
                                            '@BOLD', '').replace('/BOLD', '').replace(
                                                '@YELLOW/', '').replace('@RED/', '').replace(
                                                    '@CYAN/', '').replace('@BLUE/', '').replace(
                                                        '@PURPLE/', '').replace('/FONT', ''))
        else:
            anotado = ""
            estruturado = ""
        output[i] = {
            'resultado': final,
            'resultadoAnotado': anotado,
            'resultadoEstruturado': estruturado,
        }
    #sys.stderr.write("\nbuscaDicionarios: " + str(time.time() - start))
    sentences = {}
    if not fastSearch:
        sentences = {
            x['resultadoEstruturado'].sent_id: i
            for i, x in enumerate(output)
        }
    return {
        'output': output,
        'casos': casos,
        'sentences': sentences,
        'parameters': pesquisa if pesquisa else parametros
    }
# Propagate parataxis-related re-annotations from a workbench conllu file
# into a release conllu file. Sentences are matched by sent_id, tokens by
# position, so both files must contain the same tokenization.
import sys

sys.path.append("/home/elvis/ACDC-UD")
import estrutura_ud

# BUGFIX: the original guard was `if len(sys.argv < 3):`, which raises
# TypeError (list < int) before len() is even called.
if len(sys.argv) < 3:
    sys.stderr.write("usage: release.conllu workbench_anyversion.conllu")
    sys.exit(1)

release = estrutura_ud.Corpus(recursivo=True)
release.load(sys.argv[1])
workbench = estrutura_ud.Corpus(recursivo=True)
workbench.load(sys.argv[2])

for sentid, sentence in workbench.sentences.items():
    for t, token in enumerate(sentence.tokens):
        if token.deprel == "appos:parataxis":
            release.sentences[sentid].tokens[t].deprel = "appos:parataxis"
        if token.deprel == "ccomp:parataxis":
            # NOTE(review): assigning "appos:parataxis" for a ccomp:parataxis
            # token looks like a copy-paste slip ("ccomp:parataxis" expected),
            # but it may be a deliberate mapping — confirm before changing.
            release.sentences[sentid].tokens[t].deprel = "appos:parataxis"
        if token.deprel == "xcomp" and token.upos == "VERB" and token.head_token.upos == "VERB":
            # Re-root VERB+VERB xcomp chains: the dependent becomes the main
            # verb, the former head becomes an auxiliary.
            # NOTE(review): this branch mutates the *workbench* tokens, not
            # `release`, and no save() is visible here — verify downstream.
            token.deprel = token.head_token.deprel
            token.dephead = token.head_token.dephead
            token.head_token.upos = "AUX"
            token.head_token.deprel = "aux"
            token.head_token.dephead = token.id
            for _t, _token in enumerate(sentence.tokens):
                if _token.upos == "SCONJ" and _token.deprel == "mark" and _token.dephead == token.id:
                    _token.upos = "ADP"
                    _token.deprel = "compound"
                    _token.dephead = token.head_token.id