def main(ud1, ud2, output, coluna=4):
    """Compare column `coluna` of two CoNLL-U files and emit a confusion matrix.

    `ud1` is treated as the golden file, `ud2` as the predicted one. `output`
    may carry an encoding suffix after ':' (e.g. "out.txt:latin1"); the
    default encoding is utf8. Writes an HTML report (via gerar_HTML) and a
    plain-text report containing the matrix.
    """
    conllu1 = LerUD(ud1)
    conllu2 = LerUD(ud2)
    conllu1Estruturado, conllu2Estruturado = estrutura_ud.Corpus(), estrutura_ud.Corpus()
    conllu1Estruturado.load(ud1)
    conllu2Estruturado.load(ud2)
    lista_conllu = get_list(conllu1Estruturado, conllu2Estruturado, coluna)
    lista_conllu1 = lista_conllu['matriz_1']
    lista_conllu2 = lista_conllu['matriz_2']
    # Disable pandas truncation so the full crosstab is rendered
    pd.options.display.max_rows = None
    pd.options.display.max_columns = None
    pd.set_option('display.expand_frame_repr', False)
    saída = list()
    saída.append('Col ' + str(coluna) + ': ' + feats[coluna])
    saída.append('GOLDEN: ' + ud1)
    saída.append('PREVISTO: ' + ud2 + '\n')
    saída.append(str(pd.crosstab(pd.Series(lista_conllu1), pd.Series(lista_conllu2), rownames=['UD[1]'], colnames=['UD[2]'], margins=True)))
    saída.append('\n')
    saída.append('#!$$ Sentenças de GOLDEN que não foram encontradas em PREVISTO:\n')
    for item in lista_conllu['solitários_1']:
        saída.append(item)
    # Output path may embed an encoding after ':'
    if ':' in output:
        codificação_saída = output.split(':')[1]
    else:
        codificação_saída = 'utf8'
    output = output.split(':')[0]
    get_percentages(ud1, ud2, output, coluna)
    # Generate the HTML files
    gerar_HTML("\n".join(saída), conllu1, conllu2, coluna, output, codificação_saída)
    # Generate the "txt" file (matrix only).
    # Bug fix: the original used open(...).write(...) and never closed the
    # handle; a context manager guarantees the file is flushed and closed.
    with open(output, 'w', encoding=codificação_saída) as f:
        f.write("\n".join(saída))
def getAnnotation():
    """Flask endpoint: render one sentence (golden or system view) as an
    editable HTML table and return it as JSON.

    Request values used: 'ud' ("ud1" golden / "ud2" system), 'c' (corpus
    name), 'sent_id', and optional 'bold' (row index to highlight).
    """
    # Enforce Google OAuth when login is enabled
    if not google.authorized and GOOGLE_LOGIN:
        return redirect(url_for('google.login'))
    html1, html2 = "", ""
    if request.values.get('ud') == 'ud1':
        # Golden annotation: load only the requested sentence
        ud1 = estrutura_ud.Corpus(recursivo=False, sent_id=request.values.get('sent_id'))
        ud1.load(conllu(request.values.get('c')).findGolden())
        bold = request.values.get('bold') or ""
        annotationUd1 = escape(
            ud1.sentences.get(request.values.get('sent_id')).tokens_to_str())
        html1 = "<table id='t01' style='margin:auto; cursor:pointer; margin-bottom:30px'>"
        for t, linha in enumerate(annotationUd1.splitlines()):
            # Highlight the row the caller asked for via 'bold'
            html1 += "<tr class='bold'>" if bold and t == int(bold) else "<tr>"
            for col, coluna in enumerate(linha.split("\t")):
                # CSS hooks: col 0 = token id, col 6 = head (draggable);
                # cols 3/7 and "_" cells are tagged notPipe
                if col == 0:
                    drag = 'id notPipe '
                elif col == 6:
                    drag = 'drag notPipe '
                elif col in [3, 7] or coluna == "_":
                    drag = "notPipe "
                else:
                    drag = ""
                html1 += '<td contenteditable=true class="{drag}valor"><input type=hidden name="{col}<coluna>{t}">{coluna}</td>'.format(
                    col=col, t=t, coluna=coluna, drag=drag)
            html1 += "</tr>"
        html1 += "</table>"
    elif request.values.get('ud') == 'ud2':
        # System (predicted) annotation: same rendering, simpler CSS classes
        ud2 = estrutura_ud.Corpus(recursivo=False, sent_id=request.values.get('sent_id'))
        ud2.load(conllu(request.values.get('c')).findSystem())
        bold = request.values.get('bold') or ""
        annotationUd2 = escape(
            ud2.sentences.get(request.values.get('sent_id')).tokens_to_str())
        html2 = "<table id='t01' style='margin:auto; cursor:pointer; margin-bottom:30px;'>"
        for t, linha in enumerate(annotationUd2.splitlines()):
            html2 += "<tr class='bold'>" if bold and t == int(bold) else "<tr>"
            for col, coluna in enumerate(linha.split("\t")):
                if col == 0:
                    drag = 'id'
                elif col == 6:
                    drag = 'drag'
                else:
                    drag = ""
                html2 += '<td contenteditable=true class="{drag} valor"><input type=hidden name="{col}<coluna>{t}">{coluna}</td>'.format(
                    col=col, t=t, coluna=coluna, drag=drag)
            html2 += "</tr>"
        html2 += "</table>"
    # NOTE(review): ud1/ud2 are locals here, so these globals() membership
    # tests are presumably always False and the dels never run — confirm
    # whether this cleanup was ever effective
    if 'ud1' in globals():
        del ud1
    if 'ud2' in globals():
        del ud2
    return jsonify({
        'annotationUd1': html1,
        'annotationUd2': html2,
        'success': True,
    })
def renderErrors(c, texto="", exc=[], fromZero=False):
    """Run the UD validator over corpus `c` and render its findings as HTML.

    The HTML is cached in "<errors>_html" next to the validator output;
    `fromZero=True` forces a re-run. `texto` may carry pre-captured validator
    output; `exc` extends the list of error substrings that are surfaced.
    Returns the HTML string.
    """
    if not os.path.isfile(conllu(c).findErrors() + "_html") or fromZero:
        if fromZero or not texto:
            #if not os.path.isfile(conllu(c).findErrors()):
            # Run validate.py inside the project's venv; tee output into the
            # errors file (os.system return value is deliberately ignored)
            if os.system(JULGAMENTO_FOLDER + f'/.julgamento/bin/python3 {os.path.abspath(os.path.dirname(__file__))}/tools/validate.py {conllu(c).findGolden()} --max-err=0 --lang={VALIDATE_LANG} 2>&1 | tee {conllu(c).findErrors()}'):
                pass
            with open(conllu(c).findErrors()) as f:
                texto = f.read()
        # Prefer the in-memory corpus when already loaded; else read from disk
        if conllu(c).golden() in allCorpora.corpora and allCorpora.corpora.get(conllu(c).golden()):
            corpus = allCorpora.corpora.get(conllu(c).golden())
        else:
            corpus = estrutura_ud.Corpus(recursivo=True)
            corpus.load(conllu(c).findGolden())
        with open(conllu(c).findGolden(), 'r') as f:
            arquivo = f.read()
        arquivoSplit = arquivo.splitlines()
        sent_ids = {}
        # Only validator lines containing one of these markers are kept
        exceptions = [
            'Exception caught', 'for 9', 'Non-tree', 'HEAD == ID', 'cycle',
            'Skipping'
        ]
        exceptions += exc
        for linha in texto.splitlines():
            if linha and any(x.lower().strip() in linha.lower() for x in exceptions) and ' Node ' in linha and 'Sent ' in linha and ("Line " in linha or ' line ' in linha):
                # 1-based conllu file line from "Line N" or "... line N"
                t = int(linha.split("Line ", 1)[1].split(" ")[0]) if "Line " in linha else int(linha.split(" line ", 1)[1].split(" ")[0])
                if "\t" in arquivoSplit[t-1]:
                    # Group findings by the message text after the first ':'
                    if not linha.split(":", 1)[1] in sent_ids:
                        sent_ids[linha.split(":", 1)[1]] = []
                    bold = {'word': arquivoSplit[t-1].split("\t")[1], 'color': 'black', 'id': arquivo.splitlines()[t-1].split("\t")[0]}# if '\t' in arquivo.splitlines()[t-1] else ""
                    # Map the file line to the token index inside its sentence
                    t = allCorpora.corpora[conllu(c).golden()].sentences[linha.split(" Node ")[0].split("Sent ", 1)[1]].map_token_id[arquivo.splitlines()[t-1].split("\t")[0]]
                    sent_ids[linha.split(":", 1)[1]].append({'id': linha.split(" Node ")[0].split("Sent ", 1)[1], 't': t, 'bold': bold})
        html = ""
        for k, problem in enumerate(sorted(sent_ids)):
            html += f"<div class='alert alert-warning' role='alert'>{k+1} / {len(sent_ids)} - {problem}</div>"
            for i, sent_id in enumerate(sent_ids[problem]):
                if sent_id['id'] in corpus.sentences:
                    if sent_id['bold']['word'] and sent_id['bold']['color'] and sent_id['t']:
                        html += f'<div class="panel panel-default"><div class="panel-body">{ i+1 } / { len(sent_ids[problem]) }</div>' + \
                            render_template(
                                'sentence.html',
                                golden=corpus.sentences[sent_id['id']],
                                c=c,
                                t=sent_id['t'],
                                bold=sent_id['bold'],
                                goldenAndSystem=True if conllu(c).system() in allCorpora.corpora else False,
                            ) + "</div></div>"
                    else:
                        html += f'<div class="panel panel-default"><div class="panel-body">{ i+1 } / { len(sent_ids[problem]) }: {sent_id["id"]}</div>'
        with open(conllu(c).findErrors() + "_html", "w") as f:
            f.write(html)
    else:
        # Cached HTML is fresh enough: serve it
        with open(conllu(c).findErrors() + "_html") as f:
            html = f.read()
    return html
def modify_sentid(filename, sent_id, new_sentid):
    """Rename a sentence id inside a conllu file and persist the change."""
    path = "./interrogar-ud/conllu/" + filename
    ud = estrutura_ud.Corpus(recursivo=False, sent_id=sent_id)
    ud.load(path)
    # Re-key the sentence under its new id, updating both attribute and metadata
    sentence = ud.sentences.pop(sent_id)
    sentence.sent_id = new_sentid
    sentence.metadados['sent_id'] = new_sentid
    ud.sentences[new_sentid] = sentence
    ud.save(path)
    return True
def loadCorpus(x):
    """Lazily load the golden/system/original flavours of corpus `x` into the
    global allCorpora cache.

    Side effects: on first use, snapshots the golden file as the pristine
    "original"; registers empty Corpus slots for unseen flavours; fills any
    still-empty slot by parsing the corresponding file from disk.
    """
    # First access: copy golden to "original" so later diffs have a baseline
    if os.path.isfile(conllu(x).findGolden()) and not os.path.isfile(conllu(x).findOriginal()):
        shutil.copyfile(conllu(x).findGolden(), conllu(x).findOriginal())
    # Register empty cache slots for any flavour not seen before
    if os.path.isfile(conllu(x).findSystem()) and not conllu(x).system() in allCorpora.corpora:
        allCorpora.corpora[conllu(x).system()] = estrutura_ud.Corpus(recursivo=True)
    if not conllu(x).golden() in allCorpora.corpora:
        allCorpora.corpora[conllu(x).golden()] = estrutura_ud.Corpus(recursivo=True)
    if not conllu(x).original() in allCorpora.corpora:
        allCorpora.corpora[conllu(x).original()] = estrutura_ud.Corpus(recursivo=True)
    # Fill the system slot if it is still empty
    if conllu(x).system() in allCorpora.corpora and not allCorpora.corpora[conllu(x).system()].sentences:
        sys.stderr.write("\n>>>>>>>>>>>>>> loading system {}...".format(x))
        corpus = estrutura_ud.Corpus(recursivo=True)
        corpus.load(conllu(x).findSystem())
        allCorpora.corpora[conllu(x).system()].sentences = dict(corpus.sentences.items())
        sys.stderr.write(" system ok <<<<<<<<")
    # Fill the original slot if it is still empty
    if conllu(x).original() in allCorpora.corpora and not allCorpora.corpora[conllu(x).original()].sentences:
        corpus = estrutura_ud.Corpus(recursivo=True)
        corpus.load(conllu(x).findOriginal())
        allCorpora.corpora[conllu(x).original()].sentences = dict(corpus.sentences.items())
    # Fill the golden slot if it is still empty
    if conllu(x).golden() in allCorpora.corpora and not allCorpora.corpora[conllu(x).golden()].sentences:
        sys.stderr.write("\n>>>>>>>>>>>>>> loading {}...".format(x))
        corpus = estrutura_ud.Corpus(recursivo=True)
        corpus.load(conllu(x).findGolden())
        allCorpora.corpora[conllu(x).golden()].sentences = dict(corpus.sentences.items())
        sys.stderr.write(" ok <<<<<<<<")
    # Drop the local reference so the temporary Corpus can be collected
    corpus = ""
def juntarPartitions(crossvalidation, listaPartitions, listaDeIdsEmOrdem):
    """Concatenate the annotated partition files back into a single corpus.

    Numbered partitions are read in ascending order, then the "sobra"
    (leftover) partition; sentences are re-ordered to match
    `listaDeIdsEmOrdem`, the result is saved under MC_<argv1>/ and shell
    commands move it into its final location.
    """
    arquivoConlluCompletoAnotado = []
    # Numbered partitions first, ascending
    for partition in sorted([int(x) for x in listaPartitions if x != "sobra"]):
        with open(
                f"partition_{partition}/MC_partition_{partition}/partition_{partition}_sistema.conllu",
                "r") as f:
            arquivoConlluCompletoAnotado.append(
                [x for x in f.read().splitlines()])
        adicionarAoLog(f"partição {partition} acrescida ao corpus anotado")
    # The leftover ("sobra") partition goes last
    with open(
            f"partition_sobra/MC_partition_sobra/partition_sobra_sistema.conllu",
            "r") as f:
        arquivoConlluCompletoAnotado.append([x for x in f.read().splitlines()])
    adicionarAoLog(f"partição sobra acrescida ao corpus anotado")
    # Glue the partition chunks into one conllu text (blank line between chunks)
    for a in range(len(arquivoConlluCompletoAnotado)):
        arquivoConlluCompletoAnotado[a] = "\n".join(
            arquivoConlluCompletoAnotado[a])
    arquivoConlluCompletoAnotado = "\n\n".join(arquivoConlluCompletoAnotado)
    corpusSemOrdem = estrutura_ud.Corpus(recursivo=False)
    corpusSemOrdem.build(arquivoConlluCompletoAnotado)
    # Restore the original sentence order
    corpusOrdem = []
    for sentOrdem in listaDeIdsEmOrdem:
        corpusOrdem.append(corpusSemOrdem.sentences[sentOrdem].to_str())
    corpus = estrutura_ud.Corpus(recursivo=False)
    corpus.build("\n\n".join(corpusOrdem))
    if not os.path.isdir(f"MC_{sys.argv[1]}"):
        os.mkdir(f"MC_{sys.argv[1]}")
    adicionarAoLog(
        f"salvando corpus anotado em MC_{sys.argv[1]}/{sys.argv[1]}_sistema.conllu"
    )
    corpus.save(f"MC_{sys.argv[1]}/{sys.argv[1]}_sistema.conllu")
    os.system(
        f"cd MC_{sys.argv[1]}; mv {sys.argv[1]}_sistema.conllu ../../; cd ../../; mv {sys.argv[1]}_inProgress {sys.argv[1]}_success 2>&1 | tee -a ../log.txt"
    )
    # NOTE(review): _exit(1) terminates the process here with a NON-zero
    # status even on this success path, and makes the final log call below
    # unreachable — confirm both effects are intended
    os._exit(1)
    adicionarAoLog(
        f"finalizado. Resultado em MC_{sys.argv[1]}/{sys.argv[1]}.html")
def main():
    """Load the virgin corpus, build the cross-validation partitions, then
    verify them."""
    adicionarAoLog(f"carregando {sys.argv[1]}.conllu")
    corpus_virgem = estrutura_ud.Corpus(recursivo=False)
    corpus_virgem.load(f"{sys.argv[1]}.conllu")
    # Remember the original sentence order so it can be restored later
    ids_em_ordem = []
    for sentenca in corpus_virgem.sentences.values():
        ids_em_ordem.append(sentenca.sent_id)
    cv = Crossvalidation(ids_em_ordem)
    cv.montarParticoes()
    # Collect the suffixes of every partition_* directory in the cwd
    particoes = []
    for entry in os.listdir("."):
        if os.path.isdir(entry) and "partition_" in entry:
            particoes.append(entry.rsplit("_", 1)[1])
    checarPartitions(cv, particoes, ids_em_ordem)
def appos_e_ccomp_parataxis(
        corpus,
        corpus23_path='/home/elvis/Dropbox/tronco/comcorhd.tronco.me/UD_Portuguese-Bosque/www/interrogar-ud/conllu/bosqueUD_2.3.conllu'):
    """Rewrite ccomp:parataxis and appos:parataxis deprels in `corpus`.

    ccomp:parataxis becomes plain parataxis. appos:parataxis becomes nmod
    when the token at the same position in the Bosque 2.3 reference corpus
    is nmod, otherwise parataxis.

    Args:
        corpus: estrutura_ud.Corpus modified in place (also returned).
        corpus23_path: path to the Bosque 2.3 reference conllu. Generalized
            from the previously hard-coded absolute path; the default keeps
            existing callers working.
    """
    corpus23 = estrutura_ud.Corpus(recursivo=False)
    corpus23.load(corpus23_path)
    for sentid, sentence in corpus.sentences.items():
        if 'ccomp:parataxis' in sentence.to_str():
            for token in sentence.tokens:
                if 'ccomp:parataxis' == token.deprel:
                    token.deprel = 'parataxis'
        if 'appos:parataxis' in sentence.to_str():
            for n, token in enumerate(sentence.tokens):
                if 'appos:parataxis' == token.deprel:
                    # Mirror the 2.3 analysis: nmod there -> nmod here
                    token.deprel = 'nmod' if corpus23.sentences[sentid].tokens[
                        n].deprel == 'nmod' else 'parataxis'
    return corpus
def addToken(conllu, sent_id, option, token_id, conllu_completo="", new_tokens=None, mergeSentencesId="", form=False):
    """Add, merge or remove tokens of sentence `sent_id` in a conllu corpus.

    Args:
        conllu: path to a conllu file, or an estrutura_ud.Corpus instance.
        sent_id: id of the sentence to edit.
        option: "add", "addContraction" or "rm".
        token_id: target token id, or "left"/"right" for the sentence edges.
        conllu_completo: alternative file to load/save instead of `conllu`.
        new_tokens: tokens to insert (conllu line for the first one), or None
            for a blank placeholder token.
        mergeSentencesId: when set, the tokens of that sentence are appended
            into `sent_id` and the source sentence is removed.
        form: when True, the edit is journaled in ../cgi-bin/tokenization.json
            and the corpus file is rewritten atomically via a temp file.

    Returns:
        The edited Corpus when form is falsy; otherwise None (file side
        effects only).
    """
    # Bug fix: the original signature used a mutable default (new_tokens=[])
    # that this function appends to, leaking inserted tokens across calls.
    if new_tokens is None:
        new_tokens = []
    if form:
        # Ensure the journal file exists, then load it
        if not os.path.isfile("../cgi-bin/tokenization.json"):
            tokenization = {}
            with open("../cgi-bin/tokenization.json", "w") as f:
                json.dump(tokenization, f)
        with open("../cgi-bin/tokenization.json") as f:
            tokenization = json.load(f)
    if not isinstance(conllu, estrutura_ud.Corpus):
        # Load only the sentences we need (target and merge source)
        corpus = estrutura_ud.Corpus(recursivo=False, any_of_keywords=[re.escape("# sent_id = " + sent_id + "\n"), re.escape("# sent_id = " + mergeSentencesId + "\n")])
        corpus.load(conllu if not conllu_completo else conllu_completo)
    else:
        corpus = conllu
    # Resolve the symbolic positions to concrete token ids
    if token_id == "left":
        token_id = corpus.sentences[sent_id].tokens[0].id
    elif token_id == "right":
        token_id = str(int(corpus.sentences[sent_id].tokens[-1].id) + 1)
    if option in ["add", "addContraction"]:
        if not new_tokens:
            if not mergeSentencesId:
                # Blank placeholder token
                novo_token = estrutura_ud.Token()
                novo_token.build("_\t_\t_\t_\t_\t_\t0\t_\t_\t_")
                new_tokens.append(novo_token)
            else:
                new_tokens = corpus.sentences[mergeSentencesId].tokens
        else:
            # Caller passed a conllu line: build a Token from it
            novo_token = estrutura_ud.Token()
            novo_token.build(new_tokens[0])
            new_tokens = [novo_token]
        last_id = ""
        # Insert in reverse so each token lands at the same target position
        for novo_token in reversed(new_tokens):
            if option == "add":
                novo_token.id = token_id if not '-' in novo_token.id else str(int(token_id)) + "-" + str(int(token_id) + int(novo_token.id.split("-")[1]) - int(novo_token.id.split("-")[0]))
            elif option == "addContraction":
                novo_token.id = token_id + "-" + token_id
            if mergeSentencesId:
                if not last_id:
                    last_id = corpus.sentences[sent_id].tokens[-1].id
                # Appending at the end: shift the merged heads past last_id
                if token_id == str(int(last_id) + 1) and not '-' in novo_token.id:
                    novo_token.dephead = str(int(novo_token.dephead) + int(last_id))
            if not token_id in corpus.sentences[sent_id].map_token_id:
                corpus.sentences[sent_id].tokens.append(novo_token)
                corpus.sentences[sent_id].map_token_id[token_id] = len(corpus.sentences[sent_id].tokens) - 1
            else:
                corpus.sentences[sent_id].tokens.insert(corpus.sentences[sent_id].map_token_id[token_id], novo_token)
            if option == "add":
                # Renumber ids after the insertion point and rebuild the map
                for t, token in enumerate(corpus.sentences[sent_id].tokens):
                    if not '-' in novo_token.id:
                        if t > corpus.sentences[sent_id].map_token_id[token_id]:
                            token.id = str(int(token.id) + 1) if not '-' in token.id else str(int(token.id.split("-")[0]) + 1) + "-" + str(int(token.id.split("-")[1]) + 1)
                    corpus.sentences[sent_id].map_token_id[token.id] = t
                # Shift depheads that point at or beyond the insertion point
                for t, token in enumerate(corpus.sentences[sent_id].tokens):
                    if not mergeSentencesId and token.dephead not in ["0", "_"] and token.dephead in corpus.sentences[sent_id].map_token_id and token_id in corpus.sentences[sent_id].map_token_id and corpus.sentences[sent_id].map_token_id[token.dephead] >= corpus.sentences[sent_id].map_token_id[token_id]:
                        token.dephead = str(int(token.dephead) + 1)
            if form:
                # Journal the insertion
                if not conllu in tokenization:
                    tokenization[conllu] = {}
                if not sent_id in tokenization[conllu]:
                    tokenization[conllu][sent_id] = []
                tokenization[conllu][sent_id].append({'option': option, 'token_id': token_id, 'new_token': [novo_token.to_str()]})
        if mergeSentencesId and token_id != str(int(last_id) + 1):
            # Merge at the start: shift heads of the original tokens instead
            for t, token in enumerate(corpus.sentences[sent_id].tokens):
                if token.dephead not in ["0", "_"] and t > int(corpus.sentences[sent_id].map_token_id[new_tokens[-1].id]):
                    token.dephead = str(int(token.dephead) + int(new_tokens[-1].id))
        if mergeSentencesId:
            # Stitch the "# text" metadata together and drop the merged sentence
            if token_id == corpus.sentences[sent_id].tokens[0].id:
                corpus.sentences[sent_id].metadados['text'] = corpus.sentences[mergeSentencesId].text + ' ' + corpus.sentences[sent_id].text
            else:
                corpus.sentences[sent_id].metadados['text'] += ' ' + corpus.sentences[mergeSentencesId].text
            corpus.sentences.pop(mergeSentencesId)
    elif option in ["rm"]:
        if not '-' in token_id:
            # Renumber ids and depheads that follow the removed token
            for t, token in enumerate(corpus.sentences[sent_id].tokens):
                if token_id in corpus.sentences[sent_id].map_token_id and t > corpus.sentences[sent_id].map_token_id[token_id]:
                    token.id = str(int(token.id) - 1) if not '-' in token.id else str(int(token.id.split("-")[0]) - 1) + "-" + str(int(token.id.split("-")[1]) - 1)
                if token.dephead not in ["_", "0"]:
                    if token.dephead in corpus.sentences[sent_id].map_token_id and token_id in corpus.sentences[sent_id].map_token_id and corpus.sentences[sent_id].map_token_id[token.dephead] > corpus.sentences[sent_id].map_token_id[token_id]:
                        token.dephead = str(int(token.dephead) - 1)
        corpus.sentences[sent_id].tokens = [x for t, x in enumerate(corpus.sentences[sent_id].tokens) if t != corpus.sentences[sent_id].map_token_id[token_id]]
        if form:
            # Journal the removal
            if not conllu in tokenization:
                tokenization[conllu] = {}
            if not sent_id in tokenization[conllu]:
                tokenization[conllu][sent_id] = []
            tokenization[conllu][sent_id].append({'option': option, 'token_id': token_id})
    if form:
        with open("../cgi-bin/tokenization.json", "w") as f:
            json.dump(tokenization, f)
        # Write to a temp file, then swap it into place
        corpus.save(conllu + "_tokenization" if not conllu_completo else conllu_completo + "_tokenization")
        os.remove(conllu if not conllu_completo else conllu_completo)
        os.rename(conllu + "_tokenization" if not conllu_completo else conllu_completo + "_tokenization", conllu if not conllu_completo else conllu_completo)
    else:
        return corpus
def main(arquivoUD, criterio, parametros, limit=0, sent_id="", fastSearch=False):
    """Search a UD corpus with one of five query strategies.

    Args:
        arquivoUD: path to a conllu file, or an already-built Corpus (criterio 5).
        criterio: 1 regex over the whole sentence; 2 column/value with child
            filtering; 3 multiple independent regexes ("::"-separated, "!"
            negates); 4 child::parent regex pair; 5 a Python-like expression
            compiled into code and exec'd per token.
        parametros: the query, whose syntax depends on `criterio`.
        limit: stop after this many matching sentences (0 = no limit).
        sent_id: restrict corpus loading to this sentence id (criterio 5).
        fastSearch: skip building the annotated/structured Sentence objects.

    Returns:
        {'output': [...], 'casos': int} — matches (annotated with @COLOR/
        and <b> markers) and the total number of hits.
    """
    parametros = parametros.strip()
    # Read the UD file
    if criterio in [1, 2, 3, 4]:
        import estrutura_dados
        import estrutura_ud
        qualquercoisa = estrutura_dados.LerUD(arquivoUD)
    if criterio in [5]:
        import estrutura_ud
        if isinstance(arquivoUD, str):
            with open(arquivoUD, "r") as f:
                # Recursive token links are only needed when the query walks
                # head/next/previous tokens
                if "head_token" in parametros or "next_token" in parametros or "previous_token" in parametros:
                    #qtd = len(parametros.split("head_token")) -1 + len(parametros.split("previous_token")) -1 + len(parametros.split("next_token")) -1
                    corpus = estrutura_ud.Corpus(recursivo=True, sent_id=sent_id)
                else:
                    corpus = estrutura_ud.Corpus(recursivo=False, sent_id=sent_id)
                start = time.time()
                corpus.build(f.read())
                sys.stderr.write("\ncorpus.build: " + str(time.time() - start))
        else:
            corpus = arquivoUD
    # List that will be sent either to the terminal or to the HTML
    output = list()
    casos = 0
    # Color markers, indexed by regex group number
    tabela = ['@YELLOW/', '@PURPLE/', '@BLUE/', '@RED/', '@CYAN/']
    # Criterio 1: single regex against the flattened sentence
    if criterio == 1:
        for a, sentence in enumerate(qualquercoisa):
            if limit and len(output) == limit:
                break
            sentence2 = sentence
            for b, linha in enumerate(sentence):
                linha2 = linha
                if isinstance(linha2, list):
                    sentence2[b] = "\t".join(sentence2[b])
            sentence2 = "\n".join(sentence2)
            regex = re.search(parametros, sentence2, flags=re.IGNORECASE | re.MULTILINE)
            if regex:
                casos += len(
                    re.findall(parametros, sentence2, flags=re.IGNORECASE | re.MULTILINE))
                cores = len(regex.groups())
                # Wrap every match in <b> tags
                new_sentence = re.sub('(' + parametros + ')', r'<b>\1</b>', sentence2, flags=re.IGNORECASE | re.MULTILINE)
                tokens = list()
                header = '!@#'
                for linha in new_sentence.splitlines():
                    if '# text = ' in linha:
                        header = linha
                    if 'b>' in linha and '\t' in linha:
                        tokens.append(
                            linha.split('\t')[1].replace('<b>', '').replace(
                                '</b>', ''))
                # Mirror the highlights into the "# text" header
                header2 = header
                for token in tokens:
                    header2 = re.sub(r'\b' + re.escape(token) + r'\b', '<b>' + token + '</b>', header2)
                # Color each capture group with its own marker
                for i in range(cores):
                    if regex[i + 1] != None and i < len(tabela):
                        token = regex[i + 1]
                        if '\t' in regex[i + 1]:
                            token = regex[i + 1].split('\t')[1]
                        header2 = re.sub(r'\b' + re.escape(token) + r'\b', tabela[i] + token + '/FONT', header2)
                new_sentence = new_sentence.replace(header, header2)
                output.append(new_sentence.splitlines())
    # Criterio 2: column z equals y, discarding children whose column z is in k
    if criterio == 2:
        # Query parts
        y = parametros.split('#')[0].strip()
        z = int(parametros.split('#')[1])
        k = [x.strip() for x in parametros.split('#')[2].split('|')]
        w = int(parametros.split('#')[3])
        for sentence in qualquercoisa:
            achei = 'nãoachei'
            descarta = False
            for i, linha in enumerate(sentence):
                if isinstance(linha, list):
                    #print(linha)
                    if y == linha[z - 1]:
                        achei = linha[0]
                        token = linha[1]
                        sentence[i] = '<b>' + '\t'.join(sentence[i]) + '</b>'
                        sentence[i] = sentence[i].split('\t')
                        #break
            if achei != 'nãoachei':
                # Highlight the matched word in the "# text" header
                for i, linha in enumerate(sentence):
                    if '# text' in linha:
                        sentence[i] = re.sub(r'\b' + re.escape(token) + r'\b', '<b>' + token + '</b>', sentence[i])
            if achei != 'nãoachei':
                # Discard if any dependent of the match has column z in k
                for linha in sentence:
                    if isinstance(linha, list):
                        for k_subitem in k:
                            if achei == linha[6] and k_subitem == linha[z - 1]:
                                descarta = True
                if descarta == False:
                    output.append(sentence)
                    casos += 1
    # Criterio 3: independent regexes, all must hold ("!" prefix negates)
    if criterio == 3:
        regras = [x.strip() for x in parametros.split('::')]
        for a, sentence in enumerate(qualquercoisa):
            sentence2 = sentence
            for b, linha in enumerate(sentence):
                linha2 = linha
                if isinstance(linha2, list):
                    sentence2[b] = "\t".join(sentence2[b])
            sentence2 = "\n".join(sentence2)
            descarta = False
            for regranum, regra in enumerate(regras):
                if regra[0] == '!':
                    regex = re.search(regra[1:], sentence2, flags=re.IGNORECASE | re.MULTILINE)
                    casos += len(
                        re.findall(regra[1:], sentence2, flags=re.I | re.M))
                else:
                    regex = re.search(regra, sentence2, flags=re.IGNORECASE | re.MULTILINE)
                    casos += len(
                        re.findall(regra, sentence2, flags=re.I | re.M))
                if (regra[0] == '!' and regex) or (regra[0] != '!'
                                                  and not regex):
                    descarta = True
                    break
                # Color each rule's matches with the rule's own marker
                sentence2 = re.sub('(' + regra + ')', tabela[regranum] + r'<b>\1</b>/FONT', sentence2, flags=re.IGNORECASE | re.MULTILINE)
            if not descarta:
                tokens = list()
                header = '!@#'
                for linha in sentence2.splitlines():
                    if '# text = ' in linha:
                        header = linha
                    if 'b>' in linha and '\t' in linha:
                        # (word, color-marker) pairs; reuse the last color when
                        # the line itself carries none
                        if '@' in linha:
                            tokens.append((linha.split('\t')[1].replace(
                                '<b>', '').replace('</b>', '').replace(
                                    '@' + linha.split('@')[1].split('/')[0] + '/', ''),
                                '@' + linha.split('@')[1].split('/')[0] + '/'))
                            lastcolor = '@' + linha.split('@')[1].split(
                                '/')[0] + '/'
                        else:
                            tokens.append((linha.split('\t')[1].replace(
                                '<b>', '').replace('</b>', ''), lastcolor))
                header2 = header
                for token in tokens:
                    header2 = re.sub(r'\b' + re.escape(token[0]) + r'\b', token[1] + '<b>' + token[0] + '</b>/FONT', header2)
                sentence2 = sentence2.replace(header, header2)
                output.append(sentence2.splitlines())
    # Criterio 4: child regex :: parent regex, matched when head ids link up
    if criterio == 4:
        filho = parametros.split('::')[0].strip()
        pai = parametros.split('::')[1].strip()
        negativo_filho = False
        negativo_pai = False
        if filho[0] == '!':
            negativo_filho = True
            filho = ''.join(filho[1:])
        if pai[0] == '!':
            negativo_pai = True
            pai = ''.join(pai[1:])
        for a, sentenca in enumerate(qualquercoisa):
            acheifilho = 'não'
            acheipai = 'não'
            descarta = False
            for b, linha in enumerate(sentenca):
                if isinstance(linha, list):
                    if re.search(filho, '\t'.join(linha), flags=re.IGNORECASE | re.MULTILINE):
                        acheifilho = (linha, b)
                if isinstance(linha, list):
                    if re.search(pai, '\t'.join(linha), flags=re.IGNORECASE | re.MULTILINE):
                        acheipai = (linha, b)
                # Positive match: child's head (col 7) is the parent's id (col 1)
                if not negativo_filho and not negativo_pai and acheipai != 'não' and acheifilho != 'não' and acheipai[
                        0][0] == acheifilho[0][6]:
                    for c, linha in enumerate(sentenca):
                        if '# text' in linha:
                            qualquercoisa[a][c] = re.sub(
                                r'\b' + re.escape(acheipai[0][1]) + r'\b',
                                '<b>@BLUE/' + acheipai[0][1] + '/FONT</b>',
                                qualquercoisa[a][c],
                                flags=re.IGNORECASE | re.MULTILINE)
                            qualquercoisa[a][c] = re.sub(
                                r'\b' + re.escape(acheifilho[0][1]) + r'\b',
                                '<b>@RED/' + acheifilho[0][1] + '/FONT</b>',
                                qualquercoisa[a][c],
                                flags=re.IGNORECASE | re.MULTILINE)
                            break
                    qualquercoisa[a][acheipai[1]] = (
                        '<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheipai[1]]) +
                        '/FONT</b>').split('\t')
                    qualquercoisa[a][acheifilho[1]] = (
                        '<b>@RED/' + '\t'.join(qualquercoisa[a][acheifilho[1]]) +
                        '/FONT</b>').split('\t')
                    output.append(qualquercoisa[a])
                    break
                elif negativo_filho and acheipai != 'não' and acheifilho != 'não' and acheipai[
                        0][0] == acheifilho[0][6]:
                    descarta = True
                    break
                elif negativo_pai and acheifilho != 'não' and acheipai != 'não' and acheipai[
                        0][0] == acheifilho[0][6]:
                    descarta = True
                    break
            # Negated variants: report the surviving side of the pair
            if negativo_filho and acheipai != 'não' and acheifilho != 'não' and not descarta:
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(
                            r'\b' + re.escape(acheipai[0][1]) + r'\b',
                            '<b>@BLUE/' + acheipai[0][1] + '/FONT</b>',
                            qualquercoisa[a][c],
                            flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheipai[1]] = (
                    '<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheipai[1]]) +
                    '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
            elif negativo_pai and acheipai != 'não' and acheifilho != 'não' and not descarta:
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(
                            r'\b' + re.escape(acheifilho[0][1]) + r'\b',
                            '<b>@BLUE/' + acheifilho[0][1] + '/FONT</b>',
                            qualquercoisa[a][c],
                            flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheifilho[1]] = (
                    '<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheifilho[1]]) +
                    '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
            elif negativo_filho and acheipai != 'não' and acheifilho == 'não':
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(
                            r'\b' + re.escape(acheipai[0][1]) + r'\b',
                            '<b>@BLUE/' + acheipai[0][1] + '/FONT</b>',
                            qualquercoisa[a][c],
                            flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheipai[1]] = (
                    '<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheipai[1]]) +
                    '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
            elif negativo_pai and acheifilho != 'não' and acheipai == 'não':
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(
                            r'\b' + re.escape(acheifilho[0][1]) + r'\b',
                            '<b>@RED/' + acheifilho[0][1] + '/FONT</b>',
                            qualquercoisa[a][c],
                            flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheifilho[1]] = (
                    '<b>@RED/' + '\t'.join(qualquercoisa[a][acheifilho[1]]) +
                    '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
    # Criterio 5: compile the query into Python and exec it per token
    start = time.time()
    if criterio == 5:
        pesquisa = parametros
        casos = 0
        # Rewrite the user query into a token.<attr> boolean expression
        pesquisa = pesquisa.replace(" = ", " == ")
        pesquisa = pesquisa.replace(" @", " ")
        if pesquisa[0] == "@":
            pesquisa = pesquisa[1:]
        # NOTE(review): this replace is a no-op as written; it presumably was
        # a double-space -> single-space squeeze collapsed by formatting loss
        pesquisa = pesquisa.replace(" ", " ").strip()
        pesquisa = pesquisa.replace(" == ", " == token.")
        pesquisa = pesquisa.replace(" === ", " === token.")
        pesquisa = pesquisa.replace(" != ", " != token.")
        pesquisa = pesquisa.replace(" !== ", " !== token.")
        pesquisa = pesquisa.replace(" > ", " > token.")
        pesquisa = pesquisa.replace(" < ", " < token.")
        pesquisa = pesquisa.replace(" >= ", " >= token.")
        pesquisa = pesquisa.replace(" <= ", " <= token.")
        pesquisa = "token." + pesquisa
        pesquisa = pesquisa.replace(" and ", " and token.")
        pesquisa = pesquisa.replace(" or ", " or token.")
        pesquisa = pesquisa.replace(" in ", " in token.")
        # Undo over-eager "token." prefixes on literals and punctuation
        pesquisa = pesquisa.replace('token."', '"')
        pesquisa = pesquisa.replace('token.[', '[')
        pesquisa = pesquisa.replace('token.(', '(')
        pesquisa = pesquisa.replace('token.not', 'not')
        pesquisa = pesquisa.replace('token.token.', 'token.')
        pesquisa = re.sub(r'token\.([1234567890])', r'\1', pesquisa)
        # ==/!= become anchored regex tests over the |-separated attr values;
        # ===/!== require ALL values to match instead of any
        pesquisa = re.sub(
            r'(\S+)\s==\s(\".*?\")',
            r'any( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("|") )',
            pesquisa)
        pesquisa = re.sub(
            r'(\S+)\s===\s(\".*?\")',
            r'all( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("|") )',
            pesquisa)
        pesquisa = re.sub(
            r'(\S+)\s!=\s(\".*?\")',
            r'not any( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("|") )',
            pesquisa)
        pesquisa = re.sub(
            r'(\S+)\s!==\s(\".*?\")',
            r'not all( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("|") )',
            pesquisa)
        pesquisa = pesquisa.strip()
        # Numeric comparisons on id/dephead need int() coercion
        if (".id" in pesquisa or ".dephead" in pesquisa) and (
                not "int(" in pesquisa) and ("<" in pesquisa or ">" in pesquisa):
            pesquisa = re.sub(r"(\b\S+\.(id|dephead)\b)", r"int(\1)", pesquisa)
        identificador = "token"
        if parametros[0] == "@":
            parametros = parametros[1:]
        # "@attr" marks which token attribute gets the <b> focus highlight
        arroba = parametros.split(
            " ")[0] if not ' @' in parametros else parametros.rsplit(
                " @", 1)[1].replace(
                    "int(", "").split(")")[0].split(" ")[0].replace("(", "")
        arroba = "token." + arroba
        arroba = arroba.replace("token.token", "token")
        arroba = arroba.rsplit(".", 1)[0]
        #if " in " in arroba: arroba = arroba.split(" in ")[1]
        #with open("expressao_busca.txt", "w") as f:
        #f.write(f"parametro: {parametros}\npesquisa: {pesquisa}\narroba: {arroba}")
        # Pre-filter: only sentences containing every quoted literal
        agilizar = re.findall(r'"([^"]*)"', parametros)
        #print(agilizar)
        #agilizado = [x for x in corpus.sentences.values() if all(re.search(y, x.to_str()) for y in agilizar)]
        #agilizado = corpus.sentences.values()
        agilizado = filter(
            lambda x: all(re.search(y, x.to_str()) for y in agilizar),
            corpus.sentences.values())
        #print(agilizado)
        for sentence in agilizado:
            if limit and limit == len(output):
                break
            # Build the per-sentence program to exec: iterate tokens, test the
            # compiled query, and mark hits in text and conllu lines
            condition = "global sim; global sentence2; sim = 0; sentence2 = copy.copy(sentence); sentence2.print = sentence2.tokens_to_str()"
            condition += '''
for ''' + identificador + ''' in sentence.tokens:
    try:
        if not "-" in '''+identificador+'''.id and (''' + pesquisa + ''') :
            sentence2.metadados['text'] = re.sub(r'\\b(' + re.escape('''+ identificador +'''.word) + r')\\b', r"@RED/\\1/FONT", sentence2.metadados['text'], flags=re.IGNORECASE|re.MULTILINE)
            sentence2.print = sentence2.print.replace('''+ identificador +'''.to_str(), "@RED/" + '''+ identificador +'''.to_str() + "/FONT")
'''# try/except because e.g. next_token may not exist at end of sentence
            if identificador + ".head_token" in pesquisa:
                # Also mark the matched token's head in blue
                condition += '''
            sentence2.metadados['text'] = re.sub(r'\\b(' + re.escape(''' + identificador + '''.head_token.word) + r')\\b', r"@BLUE/\\1/FONT", sentence2.metadados['text'], flags=re.IGNORECASE|re.MULTILINE)
            sentence2.print = sentence2.print.replace(''' + identificador + '''.head_token.to_str(), "@BLUE/" + ''' + identificador + '''.head_token.to_str() + "/FONT")'''
            condition += '''
            sentence2.metadados['text'] = re.sub(r'\\b(' + re.escape(''' + arroba + '''.word) + r')\\b', r"<b>\\1</b>", sentence2.metadados['text'], flags=re.IGNORECASE|re.MULTILINE)
            final = sentence2.metadados_to_str() + "\\n" + sentence2.print
            final = final.splitlines()
            arroba = ''' + arroba + '''.id
            for l, linha in enumerate(final):
                if linha.split("\\t")[0] == arroba or ("/" in linha.split("\\t")[0] and linha.split("\\t")[0].split("/")[1] == arroba):
                    final[l] = "<b>" + final[l] + "</b>"
            final = "\\n".join(final)'''
            exec(condition + '''
            output.append(final)
    except Exception as e:
        print(e)
        pass''')
        sys.stderr.write("\ncritério 5: " + str(time.time() - start))
    # Turn output into a list of sentence strings (no splitlines, no \t split)
    if criterio not in [5]:
        for a, sentence in enumerate(output):
            for b, linha in enumerate(sentence):
                if isinstance(linha, list):
                    sentence[b] = "\t".join(sentence[b])
            output[a] = "\n".join(sentence)
    start = time.time()
    for i, final in enumerate(output):
        if not fastSearch:
            # Build escaped (anotado) and marker-free (estruturado) Sentences.
            # NOTE(review): cgi.escape was removed in Python 3.8 — confirm the
            # runtime version; html.escape is the modern equivalent
            anotado = estrutura_ud.Sentence(recursivo=False)
            estruturado = estrutura_ud.Sentence(recursivo=False)
            anotado.build(
                cgi.escape(
                    final.replace('<b>', '@BOLD').replace(
                        '</b>', '/BOLD').replace(
                            '<font color=' + tabelaf['yellow'] + '>',
                            '@YELLOW/').replace(
                                '<font color=' + tabelaf['red'] + '>',
                                '@RED/').replace(
                                    '<font color=' + tabelaf['cyan'] + '>',
                                    '@CYAN/').replace(
                                        '<font color=' + tabelaf['blue'] + '>',
                                        '@BLUE/').replace(
                                            '<font color=' + tabelaf['purple'] + '>',
                                            '@PURPLE/').replace(
                                                '</font>', '/FONT')))
            # NOTE(review): the '">' after tabelaf['cyan'] below (vs '>'
            # everywhere else) looks like a typo — confirm
            estruturado.build(
                web.unescape(final).replace('<b>', '@BOLD').replace(
                    '</b>', '/BOLD').replace(
                        '<font color=' + tabelaf['yellow'] + '>',
                        '@YELLOW/').replace(
                            '<font color=' + tabelaf['red'] + '>',
                            '@RED/').replace(
                                '<font color=' + tabelaf['cyan'] + '">',
                                '@CYAN/').replace(
                                    '<font color=' + tabelaf['blue'] + '>',
                                    '@BLUE/').
                replace('<font color=' + tabelaf['purple'] + '>',
                        '@PURPLE/').replace('</font>', '/FONT').replace(
                            '@BOLD', '').replace('/BOLD', '').replace(
                                '@YELLOW/', '').replace('@RED/', '').replace(
                                    '@CYAN/', '').replace('@BLUE/', '').replace('@PURPLE/', '').replace(
                                        '/FONT', ''))
        else:
            anotado = ""
            estruturado = ""
        output[i] = {
            'resultado': final,
            'resultadoAnotado': anotado,
            'resultadoEstruturado': estruturado,
        }
    #sys.stderr.write("\nbuscaDicionarios: " + str(time.time() - start))
    return {'output': output, 'casos': casos}
antigoRootAUX = token.head_token.dephead token.head_token.dephead = antigoRootVERB token.dephead = antigoRootAUX token.upos = "SCONJ" token.deprel = "mark" for sentence in corpus.sentences.values(): for token in sentence.tokens: if token.upos == "AUX" and token.head_token.upos == "VERB" and token.lemma not in "ser|estar|ir|ter|haver".split( "|"): token.deprel = token.head_token.deprel token.upos = "VERB" token.head_token.deprel = "xcomp" token.dephead = token.head_token.dephead token.head_token.dephead = token.id return corpus if not os.system("sh scripts/1_criar_branch.sh " + sys.argv[1]): corpus = estrutura_ud.Corpus(recursivo=True) corpus.load( '/home/elvis/Dropbox/tronco/comcorhd.tronco.me/UD_Portuguese-Bosque/www/interrogar-ud/conllu/' + sys.argv[1] + ".conllu") corpus = appos_e_ccomp_parataxis(corpus) corpus = loc_verbal_aspectual(corpus) corpus.save( '/home/elvis/Dropbox/tronco/comcorhd.tronco.me/UD_Portuguese-Bosque/www/interrogar-ud/conllu/' + sys.argv[1] + ".conllu") os.system("sh scripts/2_1-commit.sh " + sys.argv[1] + " release_changes")
# Top-level CGI script state: resolve the sentence requested via the form and
# collect the left context (same id prefix, smaller sentence number).
from functions import prettyDate
from datetime import datetime
form = cgi.FieldStorage()
contextoEsquerda = ["", ""]
contextoDireita = ["", ""]
sent_id = form['sent_id'].value if 'sent_id' in form else ""
# NOTE(review): 'id' shadows the builtin, and 'conllu' shadows a same-named
# helper used elsewhere in the project — confirm nothing below needs either
id = form['id'].value if 'id' in form else ""
conllu = form['corpus'].value
# Split ids shaped like "prefix-12" into number ("12") and prefix ("prefix-")
numero = re.search(r'^\d+$', sent_id.rsplit('-', 1)[1])[0] if '-' in sent_id else sent_id
identificador = sent_id.rsplit("-", 1)[0] + "-" if '-' in sent_id else ""
# Load only sentences sharing the prefix
corpus = estrutura_ud.Corpus(recursivo=False, keywords=[re.escape(identificador)])
corpus.load('./interrogar-ud/conllu/' + form['corpus'].value)
contextoEsquerda = []
contextoDireita = []
# Sentences 1 .. numero-1 with the same prefix form the left context
for i in range(int(numero) - 1):
    if identificador + str(i + 1) in corpus.sentences:
        contextoEsquerda.append([
            identificador + str(i + 1),
            corpus.sentences[identificador + str(i + 1)].text
        ])
all_sentences = [
    x for x in corpus.sentences if x.rsplit("-", 1)[0] + "-" == identificador
]
def delete_sentence(filename, sent_id):
    """Delete sentence `sent_id` from the CoNLL-U file `filename`.

    The file is looked up under ./interrogar-ud/conllu/, loaded with the
    sent_id filter, modified in memory, and written back in place.

    Returns:
        True on completion (raises KeyError if the sentence is absent).
    """
    path = "./interrogar-ud/conllu/" + filename
    loaded = estrutura_ud.Corpus(recursivo=False, sent_id=sent_id)
    loaded.load(path)
    del loaded.sentences[sent_id]
    loaded.save(path)
    return True
def api():
    """Chatbot endpoint: parse the user input with UDPipe, try known
    linguistic patterns, then fall back to Wikipedia and pensador.com.

    Returns a JSON response with `api_response` and `bot_response`,
    with CORS enabled for any origin.
    """
    api_response = ""
    bot_response = ""
    # PARSE THE USER INPUT AND LOOK FOR PATTERNS
    if request.form.get('api_response'):
        api_response = app_dict[request.form.get('api_response')]
    if request.form.get('input'):
        # Parse the raw input with the public UDPipe REST service.
        udpipe_url = "http://lindat.mff.cuni.cz/services/udpipe/api/process?tokenizer&tagger&parser"
        udpipe_data = urllib.parse.urlencode({
            'data': request.form.get('input'),
            'model': "portuguese-bosque-ud-2.6-200830",
        }).encode('ascii')
        with urllib.request.urlopen(udpipe_url, udpipe_data) as f:
            udpipe_output = json.loads(remove_accents(
                f.read().decode('utf-8')))['result']
        text = estrutura_ud.Corpus(recursivo=True)
        text.build(udpipe_output)
        print("input: {}".format(text.to_str()))
        # try to find linguistic pattern: a pattern matches when ALL of its
        # queries find at least one hit in the parsed input.
        for pattern in linguistic_patterns:
            for query in linguistic_patterns[pattern]:
                #print(query)
                if all(
                        interrogar_UD.main(text, 5, x, fastSearch=True)
                        ['casos'] for x in query):
                    bot_response = responses[pattern]
                    break
        # try to answer from wikipedia
        if not bot_response:
            # get most awkward word in input: rarest NOUN/PROPN lemma (names)
            # and rarest VERB/copula lemma (verbs), ranked by corpus frequency.
            names = []
            verbs = []
            for sentence in text.sentences.values():
                for token in sentence.tokens:
                    clean_token = token.lemma.lower()
                    if token.upos in [
                            "NOUN", "PROPN"
                    ] and not clean_token in names and clean_token in frequency_of_important_words:
                        names.append(clean_token)
                    if (
                            token.upos in ["VERB"] or token.deprel in ["cop"]
                    ) and not clean_token in verbs and clean_token in frequency_of_important_words:
                        verbs.append(clean_token)
            # Lowest frequency first, so [0] is the rarest ("most awkward").
            most_awkward_name = sorted(
                names, key=lambda x: frequency_of_important_words[x])
            most_awkward_verb = sorted(
                verbs, key=lambda x: frequency_of_important_words[x])
            if most_awkward_name:
                try:
                    bot_response = wikipedia.summary(most_awkward_name[0],
                                                     sentences=2)
                except wikipedia.DisambiguationError as e:
                    # Ambiguous title: pick one of the suggestions at random.
                    s = random.choice(e.options)
                    bot_response = wikipedia.summary(s, sentences=2)
            # try to answer from "pensador" (quote site) using the rarest verb
            if not bot_response and most_awkward_verb:
                with urllib.request.urlopen("https://www.pensador.com/{}/".format(
                        most_awkward_verb[0])) as f:
                    soup = BeautifulSoup(f, "html.parser")
                    parse = soup.find_all("p", class_="frase")
                    bot_response = random.choice(parse).get_text() if parse else ""
    # no answer found
    if not bot_response:
        bot_response = "Desculpe, ainda não sei como responder..."
    response = jsonify({
        "api_response": api_response,
        "bot_response": bot_response
    })
    # Allow cross-origin use of this endpoint.
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def validate(conllu, sent_id=None, errorList="validar_UD.txt", noMissingToken=False):
    """Validate a CoNLL-U corpus against built-in checks plus the rules in
    `errorList`.

    Args:
        conllu: path to a .conllu file, or an already-loaded corpus object.
        sent_id: optional sentence filter passed through to loading/search.
        errorList: path to a rules file; lines with "erro: " define the error
            message (and optionally a "col|..." attribute) for the query
            lines that follow them.
        noMissingToken: skip the missing-token check when True.

    Returns:
        dict mapping error description -> list of occurrence dicts
        (sentence, sent_id, token index `t`, `attribute`).
    """
    errorDictionary = {}
    # Accept either a file path or a corpus object.
    if isinstance(conllu, str):
        corpus = estrutura_ud.Corpus(recursivo=True, sent_id=sent_id)
        corpus.load(conllu)
    else:
        corpus = conllu
    for sentence in corpus.sentences.values():
        # Check 1: the sentence text must end with the last token's form.
        if not sentence.text.endswith(sentence.tokens[-1].word):
            if not '1 - Sentença não termina com o último token' in errorDictionary:
                errorDictionary[
                    '1 - Sentença não termina com o último token'] = []
            errorDictionary[
                '1 - Sentença não termina com o último token'].append({
                    "sentence": "",
                    "sent_id": sentence.sent_id,
                    "t": sentence.map_token_id["1"],
                    "attribute": "id",
                })
        # Check: exactly one token with deprel "root" per sentence.
        temRoot = False
        tem2Root = False
        for token in sentence.tokens:
            if token.deprel == "root" and not temRoot:
                temRoot = True
            elif token.deprel == "root" and temRoot:
                tem2Root = True
        if tem2Root:
            if not '1 - Tem 2 root' in errorDictionary:
                errorDictionary['1 - Tem 2 root'] = []
            errorDictionary['1 - Tem 2 root'].append({
                "sentence": sentence,
                "sent_id": sentence.sent_id,
                "t": sentence.map_token_id["1"],
                "attribute": "id",
            })
        if not temRoot:
            if not '1 - Não tem root' in errorDictionary:
                errorDictionary['1 - Não tem root'] = []
            errorDictionary['1 - Não tem root'].append({
                "sentence": sentence,
                "sent_id": sentence.sent_id,
                "t": sentence.map_token_id["1"],
                "attribute": "id",
            })
    if not noMissingToken:
        # Any non-comment, non-empty line right after a blank line indicates
        # a token outside a sentence block.
        missingToken = re.findall(r"\n\n(?!#|$).*", corpus.to_str())
        if missingToken:
            if not '1 - Há tokens faltando no corpus' in errorDictionary:
                errorDictionary['1 - Há tokens faltando no corpus'] = []
            for missing in missingToken:
                errorDictionary['1 - Há tokens faltando no corpus'].append({
                    "sentence": "",
                    "sent_id": "<pre>" + missing + "</pre>",
                })
    # Load the rule file; the parameter name is reused for the line list.
    with open(errorList) as f:
        errorListFile = f.read().splitlines()
    errorList = []
    # NOTE(review): comprehension used for side effects — equivalent to
    # errorList = list(errorListFile); kept byte-identical.
    [errorList.append(x) for x in errorListFile]
    for error in errorList:
        if error and error[0] != "#":
            # "erro: " lines set the message (comment) and optional column for
            # subsequent query lines, then skip to the next line.
            if "erro: " in error:
                comment = error.split("erro: ")[1]
                comment = comment.strip()
                coluna = error.split(
                    "|", 1)[0] if "|" in error.split("erro: ")[0] else ""
                continue
            # Any other non-comment line is an interrogar_UD query; each hit is
            # recorded under the most recent "erro: " message.
            parameters = error.strip()
            for sentString in interrogar_UD.main(corpus, 5, parameters, 0,
                                                 sent_id,
                                                 separate=True)['output']:
                if not comment in errorDictionary:
                    errorDictionary[comment] = []
                sentence = estrutura_ud.Sentence(recursivo=True)
                sentence.build(fromInterrogarToHtml(sentString['resultado']))
                # First highlighted (<b>) token marks where the error is.
                tokenT = 0
                for t, token in enumerate(sentence.tokens):
                    if "<b>" in token.to_str():
                        tokenT = t
                        break
                errorDictionary[comment].append({
                    "t": tokenT,
                    "sentence": sentence,
                    "attribute": coluna,
                })
    return errorDictionary
def splitSentence(conllu, sent_id, sameSentenceId, newSentenceId, sameText, newText, token_id, conllu_completo="", form=False):
    """Split the sentence `sent_id` in two after token `token_id`.

    The tokens up to and including `token_id` stay in the original sentence
    (renamed to `sameSentenceId`, text set to `sameText`); the remaining
    tokens become a new sentence `newSentenceId` with text `newText`, with
    token ids and depheads shifted down by the number of removed tokens.

    Args:
        conllu: path to a .conllu file, or an estrutura_ud.Corpus object.
        conllu_completo: optional alternative path to load/save instead of
            `conllu`.
        form: when True, persist the result back to disk (atomically via a
            "_tokenization" temp file) and return the new sent_id; when
            False, return the modified corpus object.
    """
    if form:
        # Ensure tokenization.json exists, then load it (it is written back
        # unchanged below; kept for compatibility with the calling CGI).
        if not os.path.isfile("../cgi-bin/tokenization.json"):
            tokenization = {}
            with open("../cgi-bin/tokenization.json", "w") as f:
                json.dump(tokenization, f)
        with open("../cgi-bin/tokenization.json") as f:
            tokenization = json.load(f)
    if not isinstance(conllu, estrutura_ud.Corpus):
        # Bug fix: this branch previously also referenced an undefined name
        # `mergeSentencesId` (copy-paste from a merge routine), raising
        # NameError whenever a file path was passed; only the sentence being
        # split needs to be loaded recursively.
        corpus = estrutura_ud.Corpus(
            recursivo=False,
            any_of_keywords=[re.escape("# sent_id = " + sent_id + "\n")])
        corpus.load(conllu if not conllu_completo else conllu_completo)
    else:
        corpus = conllu
    # Build the new sentence as a copy of the original, renamed.
    new_sentence = estrutura_ud.Sentence(recursivo=True)
    new_sentence.build(corpus.sentences[sent_id].to_str())
    new_sentence.sent_id = newSentenceId
    new_sentence.metadados['sent_id'] = newSentenceId
    # Partition the tokens at token_id; count the real (non-range) tokens that
    # stay behind so the second half can be renumbered.
    new_token = False
    new_sentence_tokens = []
    old_sentence_tokens = []
    removed_tokens = 0
    for token in corpus.sentences[sent_id].tokens:
        if new_token:
            new_sentence_tokens.append(token)
        else:
            old_sentence_tokens.append(token)
        # Range tokens ("n-m") do not shift the numbering.
        if not '-' in token.id and not new_token:
            removed_tokens += 1
        if token.id == token_id:
            new_token = True
    new_sentence.tokens = new_sentence_tokens
    corpus.sentences[sent_id].tokens = old_sentence_tokens
    corpus.sentences[sent_id].metadados['text'] = sameText
    corpus.sentences[sent_id].text = sameText
    corpus.sentences[new_sentence.sent_id] = new_sentence
    corpus.sentences[new_sentence.sent_id].refresh_map_token_id()
    corpus.sentences[new_sentence.sent_id].metadados['text'] = newText
    corpus.sentences[new_sentence.sent_id].text = newText
    corpus.sent_id = sameSentenceId
    # Rename the first half under its new sent_id.
    corpus.sentences[sameSentenceId] = corpus.sentences.pop(sent_id)
    corpus.sentences[sameSentenceId].metadados['sent_id'] = sameSentenceId
    corpus.sentences[sameSentenceId].sent_id = sameSentenceId
    # Renumber the second half: shift ids (and both ends of range ids) and
    # depheads down; heads that fall below zero become root ("0").
    for token in corpus.sentences[new_sentence.sent_id].tokens:
        token.id = str(int(token.id) - removed_tokens) if not '-' in token.id else str(int(token.id.split("-")[0]) - removed_tokens) + "-" + str(int(token.id.split("-")[1]) - removed_tokens)
        if token.dephead not in ["_", "0"]:
            token.dephead = str(int(token.dephead) - removed_tokens)
            if int(token.dephead) < 0:
                token.dephead = "0"
    if form:
        with open("../cgi-bin/tokenization.json", "w") as f:
            json.dump(tokenization, f)
        # Save to a temp file, then swap it in place of the original.
        corpus.save(conllu + "_tokenization" if not conllu_completo else conllu_completo + "_tokenization")
        os.remove(conllu if not conllu_completo else conllu_completo)
        os.rename(conllu + "_tokenization" if not conllu_completo else conllu_completo + "_tokenization",
                  conllu if not conllu_completo else conllu_completo)
        return new_sentence.sent_id
    else:
        return corpus
"""Blank out XPOS on every token (and DEPS on multiword-range tokens) of the
CoNLL-U file given as the first CLI argument, printing the result to stdout."""
import estrutura_ud
import sys

loaded = estrutura_ud.Corpus()
loaded.load(sys.argv[1])
for sent in loaded.sentences.values():
    for tok in sent.tokens:
        # XPOS is always cleared; DEPS only for range tokens ("n-m").
        tok.xpos = "_"
        if '-' in tok.id:
            tok.deps = "_"
print(loaded.to_str())
metadados = {} if 'obra id=' in corpus: corpus_key = "obra" lista_tags = [] sentences = [] tokens = [] lista_faltantes = [] dep_lugar_errado = [] lista_contracoes = [] sent_id = 1 primeira_plus = False ja_primeira_plus = False mwe = False if os.path.isfile("corpus.conllu"): corpus = estrutura_ud.Corpus(recursivo=False) corpus.load("corpus.conllu") else: for l, linha in enumerate(corpus_splitlines): if l % 1000 == 0: sys.stderr.write("\nLinha processada: {}/{}".format( l, corpus_splitlines_len)) try: if linha.strip().startswith("<") and ' id="' in linha: metadados[linha.strip().split("<")[1].split(' id="') [0]] = re.search('<.*? id="([^"]+)"', linha)[1] if linha.strip().startswith("<") and not linha.strip(
def upload(alert="", success=""):
    """Upload page handler: serves the form (GET) and processes four POST
    variants — golden file upload, system file upload, model training
    request, and git-repository corpus creation.

    Renders upload.html with `alert` / `success` feedback messages.
    """
    if not google.authorized and GOOGLE_LOGIN:
        return redirect(url_for('google.login'))
    if request.method == "GET":
        return render_template('upload.html',
                               user=google.get('/oauth2/v2/userinfo').json(),
                               formDB=formDB())
    elif request.method == "POST" and 'goldenFile' in request.files:
        # --- golden corpus upload ---
        goldenFile = request.files.get('goldenFile')
        if goldenFile.filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS:
            goldenFileName = removerAcento(
                conllu(request.values.get('goldenName')).golden())
            # Destination folder depends on the INTERROGATORIO flag; reject if
            # a golden file with that name already exists there.
            if (INTERROGATORIO and not os.path.isfile(COMCORHD_FOLDER + '/' + goldenFileName)
                ) or (
                    not INTERROGATORIO and
                    not os.path.isfile(UPLOAD_FOLDER + '/' + goldenFileName)):
                goldenFile.save(
                    COMCORHD_FOLDER + '/' +
                    goldenFileName) if INTERROGATORIO else goldenFile.save(
                        UPLOAD_FOLDER + '/' + goldenFileName)
                # Keep a pristine copy as the "original" file.
                shutil.copyfile(
                    conllu(goldenFileName).findGolden(),
                    conllu(goldenFileName).findOriginal())
                textInterrogatorio = "(1) Realize buscas e edições no corpus pelo <a href='http://github.com/alvelvis/Interrogat-rio'>Interrogatório</a>, ou, (2) "
                success = f'"{goldenFileName}" enviado com sucesso! {textInterrogatorio if INTERROGATORIO else ""}Julgue-o na <a href="/corpus">página inicial</a>.'
            else:
                alert = "Arquivo golden já existe na pasta."
        else:
            alert = 'Extensão deve estar entre "' + ",".join(
                ALLOWED_EXTENSIONS) + '"'
    elif request.method == "POST" and 'systemFile' in request.files:
        # --- system (predicted) corpus upload for an existing golden ---
        goldenFile = request.values.get('sysGoldenFile')
        systemFile = request.files.get('systemFile')
        if systemFile.filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS:
            systemFileName = conllu(goldenFile).system()
            systemFile.save(UPLOAD_FOLDER + '/' + systemFileName)
            if not os.path.isfile(conllu(systemFileName).findOriginal()):
                shutil.copyfile(
                    conllu(systemFileName).findGolden(),
                    conllu(systemFileName).findOriginal())
            # Golden and system must have the same number of sentences to be
            # comparable; otherwise the system file is removed again.
            corpusGolden = estrutura_ud.Corpus(recursivo=False)
            corpusSystem = estrutura_ud.Corpus(recursivo=False)
            corpusGolden.load(conllu(goldenFile).findGolden())
            corpusSystem.load(conllu(goldenFile).findSystem())
            if len(corpusGolden.sentences) != len(corpusSystem.sentences):
                alert = "Arquivo sistema não tem o mesmo número de sentenças do arquivo golden."
                os.remove(conllu(goldenFile).findSystem())
            else:
                success = f'"{systemFileName}" enviado com sucesso! Julgue o corpus na <a href="/corpus">página inicial</a>.'
                addDatabase(goldenFile)
                #loadCorpus.submit(goldenFile)
            # Release the loaded corpora before rendering.
            del corpusGolden
            del corpusSystem
        else:
            alert = 'Extensão deve estar entre "' + ",".join(
                ALLOWED_EXTENSIONS) + '"'
    elif request.method == 'POST' and 'trainFile' in request.values:
        # --- launch a UDPipe training run (optionally cross-validated) ---
        # NOTE(review): request values are interpolated straight into shell
        # commands (os.system / Popen(shell=True)) — shell-injection risk if
        # this endpoint is reachable by untrusted users; confirm access model.
        corpusTemporario = False
        if os.path.isfile(COMCORHD_FOLDER + "/" +
                          conllu(request.values.get('trainFile')).golden()):
            # Copy the corpus into the working folder and schedule its removal.
            os.system(
                f'cp {COMCORHD_FOLDER + "/" + conllu(request.values.get("trainFile")).golden()} {UPLOAD_FOLDER}'
            )
            corpusTemporario = f"; rm {UPLOAD_FOLDER}/{conllu(request.values.get('trainFile')).golden()} &"
        if not request.values.get('crossvalidation'):
            Popen(
                f"cd {UPLOAD_FOLDER}; cp {conllu(request.values.get('trainFile')).golden()} {conllu(request.values.get('trainFile')).naked + '_test'}.conllu; sh udpipe.sh {conllu(request.values.get('trainFile')).naked + '_test'} {request.values.get('partitions')} 2>&1 | tee -a {conllu(request.values.get('trainFile')).naked + '_test'}_inProgress {corpusTemporario if corpusTemporario else '&'}",
                shell=True)
            nomeConllu = conllu(
                request.values.get('trainFile')).naked + "_test"
        else:
            Popen(
                f"cd {UPLOAD_FOLDER}; sh crossvalidation.sh {request.values.get('trainFile')} {request.values.get('partitions')} 2>&1 | tee -a {request.values.get('trainFile')}_inProgress {corpusTemporario if corpusTemporario else '&'}",
                shell=True)
            nomeConllu = conllu(request.values.get('trainFile')).naked
        # Register the training run in the database.
        novoCorpus = models.Corpus(
            name=nomeConllu,
            date=str(datetime.datetime.now()),
            sentences=0,
            about=request.values.get('about')
            if request.values.get('about') else ">",
            partitions=request.values.get('partitions'),
            author=google.get('/oauth2/v2/userinfo').json()['email']
            if GOOGLE_LOGIN else "",
            goldenAlias='Golden',
            systemAlias='Sistema')
        db.session.add(novoCorpus)
        db.session.commit()
        success = "Um modelo está sendo treinado a partir do corpus \"" + nomeConllu + "\". Acompanhe o status do treinamento na <a href='/'>página inicial do Julgamento.</a>"
    elif request.method == 'POST' and 'repoName' in request.values:
        # --- create a corpus (and optionally a git branch) from a repository ---
        sh = f"cd {UPLOAD_FOLDER}/repositories/{request.values.get('repoName')}; \
git pull; \
git checkout {request.values.get('repoCommit').split(' | commit ')[1]}; \
cat documents/*.conllu > {conllu(removerAcento(request.values.get('repoCorpusName'))).findGolden()}; \
cat documents/*.conllu > {conllu(removerAcento(request.values.get('repoCorpusName'))).findOriginal()}"
        if request.values.get('criarRamo'):
            sh += f"; git checkout -b {removerAcento(request.values.get('repoCorpusName'))}; \
git push --set-upstream origin {removerAcento(request.values.get('repoCorpusName'))}"
        if not os.path.isfile(
                f"{conllu(removerAcento(request.values.get('repoCorpusName'))).findGolden()}"
        ):
            os.system(sh)
            textInterrogatorio = "(1) Realize buscas e edições no corpus pelo <a href='http://github.com/alvelvis/interrogat-rio'>Interrogatório</a>, ou, (2) "
            success = f"Corpus {'e ramo ' if request.values.get('criarRamo') else ''}\"{removerAcento(request.values.get('repoCorpusName'))}\" criado{'s' if request.values.get('criarRamo') else ''} com sucesso! {textInterrogatorio if INTERROGATORIO else ''}Para prosseguir com o julgamento, treine um modelo a partir desse corpus clicando no menu lateral \"Treinar um modelo\" ou envie um arquivo sistema equivalente ao corpus."
        else:
            alert = f"Corpus com o nome '{removerAcento(request.values.get('repoCorpusName'))}' já existe."
    return render_template('upload.html',
                           alert=alert,
                           success=success,
                           user=google.get('/oauth2/v2/userinfo').json(),
                           formDB=formDB())
def get_percentages(ud1, ud2, output, coluna):
    """Compare a golden corpus (`ud1`) with a system corpus (`ud2`) on the
    CoNLL-U column `coluna` and write accuracy reports.

    Writes:
        <output>_sentence.txt — whole-sentence accuracy table.
        <output>_results.txt — per-class accuracy table.
        UAS/<deprel>.html and UAS/<deprel>_<pattern>.html — only when
        coluna == 8 (DEPREL), detailing dephead error distributions.

    Assumes both corpora have the same sentence ids and per-sentence token
    counts for the token-level loop — TODO confirm with callers.
    """
    if not os.path.isdir("UAS"):
        os.mkdir("UAS")
    UAS = dict()
    with open(ud1, "r") as f:
        golden = estrutura_ud.Corpus()
        golden.build(f.read())
    with open(ud2, "r") as f:
        system = estrutura_ud.Corpus()
        system.build(f.read())
    # dicionario: class value -> [total, col-matches] plus, for coluna 8,
    # [dephead-matches, col %, col+dephead %].
    dicionario = {}
    for sentid, sentence in golden.sentences.items():
        for t, token in enumerate(sentence.tokens):
            if not token.__dict__[feats[coluna].lower()] in dicionario:
                if coluna == 8:
                    dicionario[token.__dict__[feats[coluna].lower()]] = [0, 0, 0, 0, 0]
                    UAS[token.deprel] = dict()
                else:
                    dicionario[token.__dict__[feats[coluna].lower()]] = [0, 0, 0]
            dicionario[token.__dict__[feats[coluna].lower()]][0] += 1
            if system.sentences[sentid].tokens[t].__dict__[feats[coluna].lower()] == token.__dict__[feats[coluna].lower()]:
                dicionario[token.__dict__[feats[coluna].lower()]][1] += 1
            if coluna == 8:
                if system.sentences[sentid].tokens[t].dephead == token.dephead:
                    dicionario[token.deprel][2] += 1
                else:
                    # Record the head-UPOS mismatch pattern, tagged with the
                    # head's direction relative to the token (_L / _R).
                    tok_golden = token.head_token.upos
                    tok_system = system.sentences[sentid].tokens[t].head_token.upos
                    tok_golden += "_L" if int(token.head_token.id) < int(token.id) else "_R"
                    tok_system += "_L" if int(system.sentences[sentid].tokens[t].head_token.id) < int(system.sentences[sentid].tokens[t].id) else "_R"
                    if tok_golden + "/" + tok_system in UAS[token.deprel]:
                        UAS[token.deprel][tok_golden + "/" + tok_system]["qtd"] += 1
                    else:
                        UAS[token.deprel][tok_golden + "/" + tok_system] = {"qtd": 1, "sentences": []}
                    UAS[token.deprel][tok_golden + "/" + tok_system]["sentences"].append([sentence, system.sentences[sentid], token, token.head_token, system.sentences[sentid].tokens[t].head_token, system.sentences[sentid].tokens[t]])
    # Whole-sentence accuracy: a sentence counts as correct only when every
    # token matches on upos, dephead and deprel.
    sent_accuracy = [0, 0]
    for sentid, sentence in golden.sentences.items():
        if sentid in system.sentences and len(sentence.tokens) == len(system.sentences[sentid].tokens):
            sent_accuracy[0] += 1
            acertos = 0
            for t, token in enumerate(sentence.tokens):
                if system.sentences[sentid].tokens[t].upos == token.upos and system.sentences[sentid].tokens[t].dephead == token.dephead and system.sentences[sentid].tokens[t].deprel == token.deprel:
                    acertos += 1
            if acertos == len(sentence.tokens):
                sent_accuracy[1] += 1
                #print(sentid)
    sentence_accuracy = "<table><tr><th>Acurácia por sentença</th></tr><tr><th>Sentenças comparáveis</th><th>Sentenças corretas</th><th>Número relativo</th></tr><tr><td>{0}</td><td>{1}</td><td>{2}</td></tr></table>".format(sent_accuracy[0], sent_accuracy[1], str((sent_accuracy[1]/sent_accuracy[0])*100) + "%")
    with open(output + "_sentence.txt", "w") as f:
        f.write(sentence_accuracy)
    # Per-class accuracy table (extra dephead columns when coluna == 8).
    if coluna == 8:
        csv = ["{0:20} {1:10} {2:10} {3:10} {4:10} {5:10}".format("DEPREL", "GOLDEN", "ACERTOS_DEPREL", "ACERTOS_DEPREL_DEPHEAD", "PORCENTAGEM_DEPREL", "PORCENTAGEM_DEPREL_DEPHEAD")]
        for classe in sorted(dicionario):
            dicionario[classe][3] = (dicionario[classe][1] / dicionario[classe][0]) * 100
            dicionario[classe][4] = (dicionario[classe][2] / dicionario[classe][0]) * 100
            csv.append("{0:20} {1:10} {2:10} {3:10} {4:10} {5:10}".format(classe, str(dicionario[classe][0]), str(dicionario[classe][1]), str(dicionario[classe][2]), str(dicionario[classe][3]) + "%", str(dicionario[classe][4]) + "%"))
    else:
        csv = ["{0:20} {1:10} {2:10} {3:10}".format(feats[coluna], "GOLDEN", "ACERTOS", "PORCENTAGEM")]
        for classe in sorted(dicionario):
            dicionario[classe][2] = (dicionario[classe][1] / dicionario[classe][0]) * 100
            csv.append("{0:20} {1:10} {2:10} {3:10}".format(classe, str(dicionario[classe][0]), str(dicionario[classe][1]), str(dicionario[classe][2]) + "%"))
    with open(output + "_results.txt", "w") as f:
        f.write("\n".join(csv))
    # One HTML page per deprel summarising the error distribution, plus one
    # page per head-pattern listing the affected sentences.
    for deprel in UAS:
        total = 0
        for x in UAS[deprel].values():
            total += x["qtd"]
        escrever = ["<tr><td>{0}</td><td>{1}</td><td>{2}</td><td><a href='./{4}_{0}_{1}.html'>{3}%</a></td></tr>".format(padrao.split("/")[0], padrao.split("/")[1], quantidade["qtd"], (quantidade["qtd"]/total)*100, deprel) for padrao, quantidade in sorted(UAS[deprel].items(), key=lambda x: x[1]["qtd"], reverse=True)]
        with open("UAS/" + deprel + ".html", "w") as f:
            f.write("<body style='margin:20px'>" + str(dicionario[deprel][3] - dicionario[deprel][4]) + '% "' + deprel + '" com dephead divergentes<br><br><head><link href="../style.css" rel="stylesheet" type="text/css"></head>' + "<table><tr><td colspan='4'>Distribuição dos erros</td></tr><tr><th>GOLDEN</th><th>PREVISTO</th><th>N</th><th>%</th></tr>" + "\n".join(escrever) + "<tr><td colspan='2'>Total</td><td>" + str(total) + "</td></tr></table>")
        for padrao in UAS[deprel]:
            escrever = "<body style='margin:20px;'>DEPREL: " + deprel + "\n<br>GOLDEN HEAD: " + padrao.split("/")[0] + "\n<br>PREVISTO HEAD: " + padrao.split("/")[1] + '''\n<br><input type=button value='Copiar sent_id das frases' onclick='copiar_frases()'> <input id='input' style='display:none'><br><br>'''
            for n, sentence in enumerate(UAS[deprel][padrao]["sentences"]):
                # sentence = [golden sent, system sent, golden token,
                #             golden head, system head, system token].
                escrever += str(n+1) + " / " + str(len(UAS[deprel][padrao]["sentences"]))
                escrever += "\n<br><input type=hidden name=copiar_id value='"+sentence[0].sent_id.replace("'", "\\'")+"'># sent_id = " + sentence[0].sent_id
                text = sentence[0].text
                text = re.sub(r"\b" + re.escape(sentence[2].word) + r"\b", "<b>" + sentence[2].word + "</b>", text)
                escrever += "\n<input type=hidden name=negrito value='"+sentence[2].word.replace("'", "\\'")+"'>"
                text = re.sub(r"\b" + re.escape(sentence[3].word) + r"\b", "<font color=green>" + sentence[3].word + "</font>", text)
                text = re.sub(r"\b" + re.escape(sentence[4].word) + r"\b", "<font color=red>" + sentence[4].word + "</font>", text)
                escrever += "\n<br># text = " + text
                # Toggle buttons for showing the golden / predicted trees.
                escrever += '''\n<br><input type='button' id="but_'''+str(n)+'''" value='Mostrar GOLDEN' onclick='if(document.getElementById("pre_'''+str(n)+'''").style.display == "none") { document.getElementById("pre_''' + str(n) + '''").style.display = "block"; document.getElementById("but_'''+str(n)+'''").value = "Esconder GOLDEN"; } else { document.getElementById("pre_''' + str(n) + '''").style.display = "none"; document.getElementById("but_'''+str(n)+'''").value = "Mostrar GOLDEN"; }\'>'''
                escrever += '''\n<input type='button' id="but2_'''+str(n)+'''" value='Mostrar PREVISTO' onclick='if(document.getElementById("pre2_'''+str(n)+'''").style.display == "none") { document.getElementById("pre2_''' + str(n) + '''").style.display = "block"; document.getElementById("but2_'''+str(n)+'''").value = "Esconder PREVISTO"; } else { document.getElementById("pre2_''' + str(n) + '''").style.display = "none"; document.getElementById("but2_'''+str(n)+'''").value = "Mostrar PREVISTO"; }\'>'''
                escrever += '\n<pre id=pre_' + str(n) + ' style="display:none">GOLDEN<br>' + sentence[0].to_str().replace(sentence[2].to_str(), "<b>" + sentence[2].to_str() + "</b>").replace(sentence[3].to_str(), "<font color=green>" + sentence[3].to_str() + "</font>") + '</pre>'
                escrever += '\n<pre id=pre2_' + str(n) + ' style="display:none">PREVISTO<br>' + sentence[1].to_str().replace(sentence[5].to_str(), "<b>" + sentence[5].to_str() + "</b>").replace(sentence[4].to_str(), "<font color=red>" + sentence[4].to_str() + "</font>") + '</pre>'
                escrever += "\n<hr>"
            # Client-side helper to copy a regex of all listed sent_ids.
            escrever += ''' <script> String.prototype.rsplit = function(sep, maxsplit) { var split = this.split(sep); return maxsplit ? [ split.slice(0, -maxsplit).join(sep) ].concat(split.slice(-maxsplit)) : split; } function copiar_frases(){ document.getElementById("input").value = ""; document.getElementById("input").style.display = "inline"; var sentids, i, negritos; sentids = document.getElementsByName("copiar_id"); negritos = document.getElementsByName("negrito"); for (i = 0; i < sentids.length; i++) { document.getElementById("input").value = document.getElementById("input").value + "^# sent_id = " + sentids[i].value + "$(.*\\\\n)*.*" + negritos[i].value + "|"; } document.getElementById("input").value = document.getElementById("input").value.rsplit('|',1)[0]; } </script>'''
            with open("UAS/" + deprel + "_" + padrao.replace("/", "_") + ".html", "w") as f:
                f.write(escrever)
def sendAnnotation():
    """Persist token edits posted by the annotation UI ("<col><coluna><row>"
    form fields) into the golden and/or system CoNLL-U files, run the
    validator, and return a JSON summary.
    """
    if not google.authorized and GOOGLE_LOGIN:
        return redirect(url_for('google.login'))
    # When truthy, edits are mirrored into the system corpus too.
    goldenAndSystem = int(request.values.get('goldenAndSystem'))
    change = False
    attention = ""
    if any('<coluna>' in data and request.values.get(data)
           for data in request.values):
        sent_id = request.values.get('sent_id')
        # Target file: golden or system, depending on which pane was edited.
        arquivo = conllu(request.values.get('c')).findGolden(
        ) if request.values.get('ud') == 'ud1' else conllu(
            request.values.get('c')).findSystem()
        if goldenAndSystem:
            arquivoSystem = conllu(request.values.get('c')).findSystem()
        corpus = estrutura_ud.Corpus(recursivo=False,
                                     sent_id=request.values.get('sent_id'))
        corpus.load(arquivo)
        if goldenAndSystem:
            corpusSystem = estrutura_ud.Corpus(
                recursivo=False, sent_id=request.values.get('sent_id'))
            corpusSystem.load(arquivoSystem)
        for data in request.values:
            if '<coluna>' in data and request.values.get(data):
                # Field name is "<column><coluna><token index>"; numeric column
                # names are translated via dicionarioColunas.
                token = int(data.split('<coluna>')[1])
                coluna = data.split('<coluna>')[0] if not re.search(
                    r'^\d+$', data.split('<coluna>')[0], flags=re.MULTILINE
                ) else dicionarioColunas[data.split('<coluna>')[0]]
                valor = html.unescape(
                    request.values.get(data).replace("<br>", "").strip()).replace(
                        "<br>", "").strip()
                if request.values.get("headToken"):
                    # "_" head means root ("0").
                    headTokenNum = request.values.get(
                        "headToken"
                    ) if request.values.get("headToken") != "_" else "0"
                if corpus.sentences[sent_id].tokens[token].__dict__[
                        coluna] != valor:
                    corpus.sentences[sent_id].tokens[token].__dict__[
                        coluna] = valor
                    if request.values.get('headToken'):
                        corpus.sentences[sent_id].tokens[
                            token].dephead = headTokenNum
                    change = True
                    # Keep the in-memory shared corpus cache in sync.
                    allCorpora.corpora[conllu(
                        request.values.get("c")).golden()].sentences[
                            sent_id].tokens[token].__dict__[coluna] = valor
                if goldenAndSystem:
                    if corpusSystem.sentences[sent_id].tokens[token].__dict__[
                            coluna] != valor:
                        corpusSystem.sentences[sent_id].tokens[token].__dict__[
                            coluna] = valor
                        if request.values.get('headToken'):
                            # NOTE(review): this writes `corpus` (golden), not
                            # `corpusSystem` — looks like a copy-paste bug;
                            # confirm the system dephead should be updated here.
                            corpus.sentences[sent_id].tokens[
                                token].dephead = headTokenNum
                        change = True
                        allCorpora.corpora[conllu(
                            request.values.get("c")).system()].sentences[
                                sent_id].tokens[token].__dict__[coluna] = valor
        attention = []
        if change:
            corpus.save(arquivo)
            if goldenAndSystem:
                corpusSystem.save(arquivoSystem)
            # Re-validate the edited sentence and build warning HTML snippets.
            errors = validar_UD.validate(corpus,
                                         errorList=VALIDAR_UD,
                                         noMissingToken=True,
                                         sent_id=request.values.get('sent_id'))
            if errors:
                for error in errors:
                    if error.strip():
                        attention += [
                            f'<div class="alert alert-warning translateHtml" role="alert">Atenção: {error}</div><ul>'
                        ]
                        for value in errors[error]:
                            if value['sentence']:
                                attention += [
                                    "<li>" + functions.cleanEstruturaUD(
                                        value['sentence'].tokens[
                                            value['t']].id) + " / " +
                                    functions.cleanEstruturaUD(
                                        value['sentence'].tokens[
                                            value['t']].word) + " / " +
                                    functions.cleanEstruturaUD(
                                        value['sentence'].tokens[value['t']].
                                        __dict__[value['attribute']]) + "</li>"
                                ]
                        attention += ["</ul>"]
        del corpus
        # NOTE(review): local `corpusSystem` never appears in globals(), so
        # this del is effectively dead code — kept as-is.
        if "corpusSystem" in globals():
            del corpusSystem
        attention = "\n".join(attention)
    return jsonify({
        'change': change,
        'data': prettyDate(datetime.datetime.now()).prettyDateDMAH(),
        'attention': attention,
        'success': True,
    })
def main(arquivoUD, criterio, parametros, limit=0, sent_id="", fastSearch=False, separate=False):
    """Search a UD (CoNLL-U) corpus and return highlighted matches.

    arquivoUD: a .conllu file path, the raw file contents, or an already
        loaded corpus object — which is accepted depends on `criterio`.
    criterio: search strategy:
        1 - raw regex over each sentence block;
        2 - "y#z#k#w" query: token whose column z matches y and that has no
            dependent whose column w matches k;
        3 - independent regexes joined by '::' (leading '!' negates a rule);
        4 - parent/child ("filho::pai") dependency query ('!' negates);
        5 - Python-like boolean expression over token attributes, compiled
            into source text and run via exec().
    limit: stop after this many matching sentences (0 = no limit).
    sent_id: restrict corpus loading to a single sentence id.
    fastSearch: if truthy, skip building per-match Sentence objects.
    separate: criterion 5 only — one output entry per matching token
        instead of per sentence.

    Returns a dict with 'output', 'casos' (match count), 'sentences'
    (sent_id -> output index; empty when fastSearch) and 'parameters'.
    """
    parametros = parametros.strip()
    pesquisa = ""
    if criterio in [1]:
        import estrutura_ud
        # Criterion 1 works on the raw corpus text.
        if isinstance(arquivoUD, str):
            with open(arquivoUD) as f:
                arquivoUD = f.read()
        else:
            arquivoUD = arquivoUD.to_str()
    # Read the UD file
    if criterio in [3, 4]:
        import estrutura_dados
        import estrutura_ud
        qualquercoisa = estrutura_dados.LerUD(arquivoUD)
    if criterio in [2]:
        import estrutura_ud
        if isinstance(arquivoUD, str):
            # Recursive loading is only needed when the query follows
            # head/next/previous token links.
            if "head_token" in parametros or "next_token" in parametros or "previous_token" in parametros:
                corpus = estrutura_ud.Corpus(recursivo=True, sent_id=sent_id)
            else:
                corpus = estrutura_ud.Corpus(recursivo=False, sent_id=sent_id)
            start = time.time()
            corpus.load(arquivoUD)
            sys.stderr.write("\ncorpus.build: " + str(time.time() - start))
        else:
            corpus = arquivoUD
    # List that is sent either to the terminal or to the HTML page
    output = list()
    casos = 0
    # Regex: highlight color markers, indexed by capture-group number
    tabela = ['@YELLOW/', '@PURPLE/', '@BLUE/', '@RED/', '@CYAN/']
    if criterio == 1:
        start = time.time()
        sentence = ""
        f = arquivoUD.splitlines(keepends=True)
        for line in f:
            if line.strip():
                # Accumulate lines until a blank line ends the sentence block.
                sentence += line
            else:
                if limit and len(output) == limit:
                    break
                regex = re.findall('(' + parametros + ')', sentence, flags=re.I)
                if regex:
                    casos += len(regex)
                    new_sentence = re.sub('(' + parametros + ')', r'<b>\1</b>', sentence, flags=re.I)
                    tokens = list()
                    # '!@#' is a placeholder when the sentence has no "# text =" line.
                    header = '!@#' if not '# text = ' in new_sentence else '# text = ' + new_sentence.split("# text = ")[1].split("\n")[0]
                    for linha in new_sentence.splitlines():
                        if 'b>' in linha and '\t' in linha:
                            if '\\' in linha:
                                # Escape backreference-like sequences so re.sub below is safe.
                                linha = re.sub(r"\\(\d+)", r"\\\\\1", linha, flags=re.I)
                            tokens.append(linha.split('\t')[1].replace('<b>', '').replace('</b>', ''))
                    # Mirror the token highlights into the "# text =" header.
                    header2 = header
                    for token in tokens:
                        header2 = re.sub(r'\b' + re.escape(token) + r'\b', '<b>' + token + '</b>', header2, flags=re.I)
                    for reg in regex:
                        if not isinstance(reg, str):
                            # Tuple result => the pattern had groups; color each group.
                            for i, grupo in enumerate(reg):
                                if i != 0:
                                    if grupo and i - 1 < len(tabela):
                                        token = ""
                                        if '\t' in grupo:
                                            token = grupo.split('\t')[1]
                                        if token:
                                            header2 = re.sub(r'\b' + re.escape(token) + r'\b', tabela[i - 1] + token + '/FONT', header2, flags=re.I)
                    new_sentence = new_sentence.replace(header, header2)
                    output.append(new_sentence)
                sentence = ""
        sys.stderr.write(f"\ncriterio 1: {time.time() - start}")
    # Criterion 2: token with a given column value and no matching dependent
    if criterio == 2:
        # Query variables, format "y#z#k#w"
        y = parametros.split('#')[0].strip()
        z = int(parametros.split('#')[1].strip())
        k = parametros.split('#')[2].strip()
        w = int(parametros.split('#')[3].strip())
        for sentence in corpus.sentences.values():
            for token in sentence.tokens:
                colunas = token.to_str().split("\t")
                if any(colunas[z - 1] == x for x in y.split("|")):
                    descarta = False
                    for _token in sentence.tokens:
                        _colunas = _token.to_str().split("\t")
                        if any(_colunas[w - 1] == x for x in k.split("|")) and _token.dephead == token.id:
                            descarta = True
                    if not descarta:
                        output.append(re.sub(r"\b" + re.escape(token.word) + r"\b", "<b>" + re.escape(token.word) + "</b>", sentence.to_str()))
                        casos += 1
    # Criterion 3: independent regexes
    if criterio == 3:
        regras = [x.strip() for x in parametros.split('::')]
        for a, sentence in enumerate(qualquercoisa):
            sentence2 = sentence
            # Flatten the structured sentence (lists of columns) to text.
            for b, linha in enumerate(sentence):
                linha2 = linha
                if isinstance(linha2, list):
                    sentence2[b] = "\t".join(sentence2[b])
            sentence2 = "\n".join(sentence2)
            descarta = False
            for regranum, regra in enumerate(regras):
                if regra[0] == '!':
                    # Negated rule: sentence must NOT match regra[1:].
                    regex = re.search(regra[1:], sentence2, flags=re.IGNORECASE | re.MULTILINE)
                    casos += len(re.findall(regra[1:], sentence2, flags=re.I | re.M))
                else:
                    regex = re.search(regra, sentence2, flags=re.IGNORECASE | re.MULTILINE)
                    casos += len(re.findall(regra, sentence2, flags=re.I | re.M))
                if (regra[0] == '!' and regex) or (regra[0] != '!' and not regex):
                    descarta = True
                    break
                # Color each rule's matches with its own marker.
                sentence2 = re.sub('(' + regra + ')', tabela[regranum] + r'<b>\1</b>/FONT', sentence2, flags=re.IGNORECASE | re.MULTILINE)
            if not descarta:
                tokens = list()
                header = '!@#'
                for linha in sentence2.splitlines():
                    if '# text = ' in linha:
                        header = linha
                    if 'b>' in linha and '\t' in linha:
                        if '@' in linha:
                            # Token carries a color marker: remember (word, color).
                            tokens.append((linha.split('\t')[1].replace('<b>', '').replace('</b>', '').replace('@' + linha.split('@')[1].split('/')[0] + '/', ''), '@' + linha.split('@')[1].split('/')[0] + '/'))
                            lastcolor = '@' + linha.split('@')[1].split('/')[0] + '/'
                        else:
                            # NOTE(review): relies on `lastcolor` set by a previous
                            # iteration — a first colorless match would NameError.
                            tokens.append((linha.split('\t')[1].replace('<b>', '').replace('</b>', ''), lastcolor))
                header2 = header
                for token in tokens:
                    header2 = re.sub(r'\b' + re.escape(token[0]) + r'\b', token[1] + '<b>' + token[0] + '</b>/FONT', header2)
                sentence2 = sentence2.replace(header, header2)
                output.append(sentence2.splitlines())
    # Criterion 4: parents and children ("filho::pai")
    if criterio == 4:
        filho = parametros.split('::')[0].strip()
        pai = parametros.split('::')[1].strip()
        negativo_filho = False
        negativo_pai = False
        if filho[0] == '!':
            negativo_filho = True
            filho = ''.join(filho[1:])
        if pai[0] == '!':
            negativo_pai = True
            pai = ''.join(pai[1:])
        for a, sentenca in enumerate(qualquercoisa):
            # 'não' ("no") doubles as the not-found sentinel.
            acheifilho = 'não'
            acheipai = 'não'
            descarta = False
            for b, linha in enumerate(sentenca):
                if isinstance(linha, list):
                    if re.search(filho, '\t'.join(linha), flags=re.IGNORECASE | re.MULTILINE):
                        acheifilho = (linha, b)
                if isinstance(linha, list):
                    if re.search(pai, '\t'.join(linha), flags=re.IGNORECASE | re.MULTILINE):
                        acheipai = (linha, b)
                # Positive query: child's head column (7) equals parent's id (1).
                if not negativo_filho and not negativo_pai and acheipai != 'não' and acheifilho != 'não' and acheipai[0][0] == acheifilho[0][6]:
                    for c, linha in enumerate(sentenca):
                        if '# text' in linha:
                            qualquercoisa[a][c] = re.sub(r'\b' + re.escape(acheipai[0][1]) + r'\b', '<b>@BLUE/' + acheipai[0][1] + '/FONT</b>', qualquercoisa[a][c], flags=re.IGNORECASE | re.MULTILINE)
                            qualquercoisa[a][c] = re.sub(r'\b' + re.escape(acheifilho[0][1]) + r'\b', '<b>@RED/' + acheifilho[0][1] + '/FONT</b>', qualquercoisa[a][c], flags=re.IGNORECASE | re.MULTILINE)
                            break
                    qualquercoisa[a][acheipai[1]] = ('<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheipai[1]]) + '/FONT</b>').split('\t')
                    qualquercoisa[a][acheifilho[1]] = ('<b>@RED/' + '\t'.join(qualquercoisa[a][acheifilho[1]]) + '/FONT</b>').split('\t')
                    output.append(qualquercoisa[a])
                    break
                elif negativo_filho and acheipai != 'não' and acheifilho != 'não' and acheipai[0][0] == acheifilho[0][6]:
                    # Negated child actually attached to parent -> discard sentence.
                    descarta = True
                    break
                elif negativo_pai and acheifilho != 'não' and acheipai != 'não' and acheipai[0][0] == acheifilho[0][6]:
                    descarta = True
                    break
            # Post-scan reporting for the negated variants.
            if negativo_filho and acheipai != 'não' and acheifilho != 'não' and not descarta:
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(r'\b' + re.escape(acheipai[0][1]) + r'\b', '<b>@BLUE/' + acheipai[0][1] + '/FONT</b>', qualquercoisa[a][c], flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheipai[1]] = ('<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheipai[1]]) + '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
            elif negativo_pai and acheipai != 'não' and acheifilho != 'não' and not descarta:
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(r'\b' + re.escape(acheifilho[0][1]) + r'\b', '<b>@BLUE/' + acheifilho[0][1] + '/FONT</b>', qualquercoisa[a][c], flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheifilho[1]] = ('<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheifilho[1]]) + '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
            elif negativo_filho and acheipai != 'não' and acheifilho == 'não':
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(r'\b' + re.escape(acheipai[0][1]) + r'\b', '<b>@BLUE/' + acheipai[0][1] + '/FONT</b>', qualquercoisa[a][c], flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheipai[1]] = ('<b>@BLUE/' + '\t'.join(qualquercoisa[a][acheipai[1]]) + '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
            elif negativo_pai and acheifilho != 'não' and acheipai == 'não':
                for c, linha in enumerate(sentenca):
                    if '# text' in linha:
                        qualquercoisa[a][c] = re.sub(r'\b' + re.escape(acheifilho[0][1]) + r'\b', '<b>@RED/' + acheifilho[0][1] + '/FONT</b>', qualquercoisa[a][c], flags=re.IGNORECASE | re.MULTILINE)
                        break
                qualquercoisa[a][acheifilho[1]] = ('<b>@RED/' + '\t'.join(qualquercoisa[a][acheifilho[1]]) + '/FONT</b>').split('\t')
                output.append(qualquercoisa[a])
    # Criterion 5: Python-like expression, compiled to source and exec()'d
    if criterio == 5:
        parametros = parametros.split(" and ")
        for t, parametro in enumerate(parametros):
            # Bare quoted words ("a" "b") become word-equality conditions on
            # consecutive tokens (word, next_token.word, ...).
            if not any(x in parametros[t] for x in [' = ', '==', '!=', ' < ', ' > ']):
                parametros[t] = re.findall(r'@?"[^"]+?"', parametros[t].replace(" ", ""))
                parametros[t] = [("@" if "@" in x else "") + ("next_token." * i) + "word = " + x.replace("@", "") for i, x in enumerate(parametros[t]) if x]
                parametros[t] = " and ".join(parametros[t])
        parametros = " and ".join(parametros)
        # The following chain of replaces rewrites the user query into a
        # Python expression over `token` / `sentence` attributes.
        pesquisa = parametros
        pesquisa = pesquisa.replace(" = ", " == ")
        pesquisa = pesquisa.replace(" @", " ")
        if pesquisa[0] == "@":
            pesquisa = pesquisa[1:]
        # NOTE(review): first argument looks like it was meant to be a double
        # space (whitespace collapsing) — as written this replace is a no-op.
        pesquisa = pesquisa.replace(" ", " ").strip()
        pesquisa = pesquisa.replace(" == ", " == token.")
        pesquisa = pesquisa.replace(" === ", " === token.")
        pesquisa = pesquisa.replace(" != ", " != token.")
        pesquisa = pesquisa.replace(" !== ", " !== token.")
        pesquisa = pesquisa.replace(" > ", " > token.")
        pesquisa = pesquisa.replace(" < ", " < token.")
        pesquisa = pesquisa.replace(" >= ", " >= token.")
        pesquisa = pesquisa.replace(" <= ", " <= token.")
        pesquisa = "token." + pesquisa
        pesquisa = pesquisa.replace(" and ", " and token.")
        pesquisa = pesquisa.replace(" or ", " or token.")
        pesquisa = pesquisa.replace(" in ", " in token.")
        pesquisa = pesquisa.replace(" text ", " sentence.text ")
        pesquisa = pesquisa.replace(" sent_id ", " sentence.sent_id ")
        # Undo over-eager "token." prefixes on literals, brackets, keywords…
        pesquisa = pesquisa.replace('token."', '"')
        pesquisa = pesquisa.replace('token.[', '[')
        pesquisa = pesquisa.replace('token.(', '(')
        pesquisa = pesquisa.replace('token.not', 'not')
        pesquisa = pesquisa.replace('token.token.', 'token.')
        pesquisa = pesquisa.replace('token.sentence.', 'sentence.')
        pesquisa = pesquisa.replace("token.text", "sentence.text")
        pesquisa = pesquisa.replace("token.sent_id", "sentence.sent_id")
        pesquisa = pesquisa.replace('token.int(', 'int(')
        #pesquisa = pesquisa.replace("token.and", "and")
        # pesquisa = pesquisa.replace("== int(", "==int(")
        pesquisa = re.sub(r'token\.([1234567890])', r'\1', pesquisa)
        # Plain "token.col == value" conjuncts can be answered from the
        # per-column index instead of scanning every token.
        indexed_conditions = {
            x.split(" == ")[0].strip().split("token.", 1)[1]:
            x.split(" == ")[1].strip().replace('"', '')
            for x in pesquisa.split(" and ")
            if ' == ' in x and 'token.' in x and not any(
                y in x for y in ["head_token", "previous_token", "next_token"])
        }  #["head_token.head", "head_token.next", "head_token.previous", "next_token.head", "next_token.next", "next_token.previous", "previous_token.head", "previous_token.next", "previous_token.previous"])}
        # Attribute access goes through __dict__ so column names with
        # arbitrary characters still work inside the exec'd code.
        pesquisa = re.sub(r"token\.([^. ]+?)(\s|$)", r"token.__dict__['\1']\2", pesquisa)
        pesquisa = re.sub(
            r'(\S+)\s==\s(\".*?\")',
            r'any( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("ddd") )',
            pesquisa)  #ddd is provisional while splitting on the pipe bar does not work
        pesquisa = re.sub(
            r'(\S+)\s===\s(\".*?\")',
            r'all( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("|") )',
            pesquisa)
        pesquisa = re.sub(
            r'(\S+)\s!=\s(\".*?\")',
            r'not any( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("ddd") )',
            pesquisa)
        pesquisa = re.sub(
            r'(\S+)\s!==\s(\".*?\")',
            r'not all( re.search( r"^" + r\2 + r"$", x ) for x in \1.split("|") )',
            pesquisa)
        pesquisa = pesquisa.strip()
        # id/dephead are stored as strings; wrap in int() for < / > compares.
        if (".__dict__['id']" in pesquisa or ".__dict__['dephead']" in pesquisa
            ) and (not "int(" in pesquisa) and (" < " in pesquisa or " > " in pesquisa):
            pesquisa = re.sub(r"(\S+\.__dict__\['(id|dephead)'\])", r"int(\1)", pesquisa)
        identificador = "token"  # NOTE(review): appears unused below
        if parametros[0] == "@":
            parametros = parametros[1:]
        # "@expr" marks which token of the match should be bolded (arroba).
        arroba = parametros.split(
            " ")[0] if not ' @' in parametros else parametros.rsplit(
                " @", 1)[1].replace(
                    "int(", "").split(")")[0].split(" ")[0].replace("(", "")
        arroba = "token." + arroba
        arroba = arroba.replace("token.token", "token")
        arroba = arroba.rsplit(".", 1)[0]
        # Quoted literals are passed as keywords so the loader can pre-filter.
        agilizar = re.findall(r'"([^"]*)"', parametros)
        import estrutura_ud
        if isinstance(arquivoUD, str):
            if "head_token" in parametros or "next_token" in parametros or "previous_token" in parametros:
                corpus = estrutura_ud.Corpus(recursivo=True, sent_id=sent_id, keywords=agilizar)
            else:
                corpus = estrutura_ud.Corpus(recursivo=False, sent_id=sent_id, keywords=agilizar)
            start = time.time()
            corpus.load(arquivoUD)
            sys.stderr.write("\ncorpus.build: " + str(time.time() - start))
        else:
            corpus = arquivoUD
        start = time.time()
        casos = []
        t1 = time.time()
        if indexed_conditions:
            # Use the corpus' per-column index ("processed") to collect only
            # the candidate tokens, keyed "sent_id<tok>position".
            sentences = defaultdict(list)
            tokens = defaultdict(list)
            values = {}
            for sent_id in corpus.sentences:
                for col in indexed_conditions:
                    if col in corpus.sentences[sent_id].processed:
                        values = re.findall(
                            r"\n(" + indexed_conditions[col] + r")\n",
                            "\n" + "\n\n".join(
                                list(corpus.sentences[sent_id].processed[col])) + "\n")
                        for value in values:
                            if value:
                                if not isinstance(value, str):
                                    value = value[0]
                                tokens[col].extend(
                                    corpus.sentences[sent_id].processed[col][value])
            for col in tokens:
                tokens[col] = set(tokens[col])
            tokens_filtered = []
            if tokens.values():
                # A token must satisfy every indexed condition.
                tokens_filtered = set.intersection(*list(tokens.values()))
            '''
            priority = ['lemma', 'word', 'deprel', 'upos']
            priority_possible = []
            for col in indexed_conditions:
                if col in priority:
                    priority_possible.append(priority.index(col))
            if priority_possible:
                col = priority[min(priority_possible)]
            else:
                col = list(indexed_conditions)[0]
            cols = list(indexed_conditions)
            cols.remove(col)
            '''
            for token in tokens_filtered:
                sent_id = token.split("<tok>")[0]
                t = int(token.split("<tok>")[1])
                sentences[sent_id].append(t)
        else:
            sentences = corpus.sentences
        sys.stderr.write(f"\nindexing: {time.time() - t1}")
        t1 = time.time()
        for sent_id in sentences:
            sentence = corpus.sentences[sent_id]
            sentence2 = sentence
            # Words/ids of real tokens only (skip ranges "1-2" and empty "1.1").
            clean_text = [
                x.word for x in sentence2.tokens
                if not '-' in x.id and not '.' in x.id
            ]
            clean_id = [
                x.id for x in sentence2.tokens
                if not '-' in x.id and not '.' in x.id
            ]
            corresponde = 0
            tokens = sentence2.tokens_to_str()
            map_id = {x: t for t, x in enumerate(clean_id)}
            if limit and limit == len(output):
                break
            # Build the per-sentence evaluation program and exec() it; the
            # compiled `pesquisa` expression is spliced into the source.
            condition = "global sim; sim = 0"
            condition += '''
if not indexed_conditions:
    available_tokens = list(range(len(sentence.tokens)))
else:
    available_tokens = sentences[sent_id]
for token_t in available_tokens:
    token = sentence.tokens[token_t]
    try:
        if (not "-" in token.id and not '.' in token.id and (''' + pesquisa + ''')) :
            corresponde = 1
            clean_text[map_id[token.id]] = "@BLUE/" + clean_text[map_id[token.id]] + "/FONT"
            tokens = tokens.replace(token.string, "@BLUE/" + token.string + "/FONT")
'''  # try because e.g. there is no next_token at the end of a sentence
            if "token.head_token" in pesquisa:
                condition += '''
            clean_text[map_id[token.head_token.id]] = "@RED/" + clean_text[map_id[token.head_token.id]] + "/FONT"
            tokens = tokens.replace(token.head_token.string, "@RED/" + token.head_token.string + "/FONT")'''
            if "token.next_token" in pesquisa:
                condition += '''
            clean_text[map_id[token.next_token.id]] = "@BLUE/" + clean_text[map_id[token.next_token.id]] + "/FONT"
            tokens = tokens.replace(token.next_token.string, "@BLUE/" + token.next_token.string + "/FONT")'''
            if "token.previous_token" in pesquisa:
                condition += '''
            clean_text[map_id[token.previous_token.id]] = "@BLUE/" + clean_text[map_id[token.previous_token.id]] + "/FONT"
            tokens = tokens.replace(token.previous_token.string, "@BLUE/" + token.previous_token.string + "/FONT")'''
            condition += '''
            clean_text[map_id[''' + arroba + '''.id]] = "<b>" + clean_text[map_id[''' + arroba + '''.id]] + "</b>"'''
            # SECURITY NOTE: exec() runs text derived from the user query —
            # only expose this to trusted users.
            exec(condition + '''
            casos.append(1)
            arroba_id = ''' + arroba + '''.id
            tokens = tokens.splitlines()
            for l, linha in enumerate(tokens):
                if linha.split("\\t")[0] == arroba_id or ("/" in linha.split("\\t")[0] and linha.split("\\t")[0].split("/")[1] == arroba_id):
                    tokens[l] = "<b>" + tokens[l] + "</b>"
            tokens = "\\n".join(tokens)
            if separate:
                corresponde = 0
                final = "# clean_text = " + " ".join(clean_text) + "\\n" + sentence2.metadados_to_str() + "\\n" + tokens
                output.append(final)
    except Exception as e:
        print(str(e))
        print(token.to_str())
        pass
if corresponde and not separate:
    corresponde = 0
    final = "# clean_text = " + " ".join(clean_text) + "\\n" + sentence2.metadados_to_str() + "\\n" + tokens
    output.append(final)''')
        sys.stderr.write("\ncritério 5: " + str(time.time() - start))
        casos = len(casos)
        sys.stderr.write(f"\nfor each sentence: {time.time() - t1}")
    # Turn the output into a list of sentence strings (no splitlines, no \t split)
    if criterio not in [5, 2, 1]:
        for a, sentence in enumerate(output):
            for b, linha in enumerate(sentence):
                if isinstance(linha, list):
                    sentence[b] = "\t".join(sentence[b])
            output[a] = "\n".join(sentence)
    start = time.time()
    for i, final in enumerate(output):
        if not fastSearch:
            # Build Sentence objects both with (anotado) and without
            # (estruturado) the highlight markers.
            anotado = estrutura_ud.Sentence(recursivo=False)
            estruturado = estrutura_ud.Sentence(recursivo=False)
            anotado.build(
                web.escape(
                    final.replace('<b>', '@BOLD').replace(
                        '</b>', '/BOLD').replace(
                            '<font color=' + tabelaf['yellow'] + '>',
                            '@YELLOW/').replace(
                                '<font color=' + tabelaf['red'] + '>',
                                '@RED/').replace(
                                    '<font color=' + tabelaf['cyan'] + '>',
                                    '@CYAN/').replace(
                                        '<font color=' + tabelaf['blue'] + '>',
                                        '@BLUE/').replace(
                                            '<font color=' + tabelaf['purple'] + '>',
                                            '@PURPLE/').replace(
                                                '</font>', '/FONT')))
            # NOTE(review): the '">' after tabelaf['cyan'] below looks like a
            # stray quote compared with the other colors — verify.
            estruturado.build(
                web.unescape(final).replace('<b>', '@BOLD').replace(
                    '</b>', '/BOLD').replace(
                        '<font color=' + tabelaf['yellow'] + '>',
                        '@YELLOW/').replace(
                            '<font color=' + tabelaf['red'] + '>',
                            '@RED/').replace(
                                '<font color=' + tabelaf['cyan'] + '">',
                                '@CYAN/').replace(
                                    '<font color=' + tabelaf['blue'] + '>',
                                    '@BLUE/').replace(
                                        '<font color=' + tabelaf['purple'] + '>',
                                        '@PURPLE/').replace('</font>', '/FONT').replace(
                                            '@BOLD', '').replace('/BOLD', '').replace(
                                                '@YELLOW/', '').replace('@RED/', '').replace(
                                                    '@CYAN/', '').replace('@BLUE/', '').replace(
                                                        '@PURPLE/', '').replace('/FONT', ''))
        else:
            anotado = ""
            estruturado = ""
        output[i] = {
            'resultado': final,
            'resultadoAnotado': anotado,
            'resultadoEstruturado': estruturado,
        }
    #sys.stderr.write("\nbuscaDicionarios: " + str(time.time() - start))
    sentences = {}
    if not fastSearch:
        sentences = {
            x['resultadoEstruturado'].sent_id: i
            for i, x in enumerate(output)
        }
    return {
        'output': output,
        'casos': casos,
        'sentences': sentences,
        'parameters': pesquisa if pesquisa else parametros
    }
# Propagate parataxis-related re-annotations from a workbench conllu file
# into a release conllu file. Sentences are matched by sent_id, tokens by
# position, so both files must contain the same tokenization.
import sys

sys.path.append("/home/elvis/ACDC-UD")
import estrutura_ud

# BUGFIX: the original guard was `if len(sys.argv < 3):`, which raises
# TypeError (list < int) before len() is even called.
if len(sys.argv) < 3:
    sys.stderr.write("usage: release.conllu workbench_anyversion.conllu")
    sys.exit(1)

release = estrutura_ud.Corpus(recursivo=True)
release.load(sys.argv[1])
workbench = estrutura_ud.Corpus(recursivo=True)
workbench.load(sys.argv[2])

for sentid, sentence in workbench.sentences.items():
    for t, token in enumerate(sentence.tokens):
        if token.deprel == "appos:parataxis":
            release.sentences[sentid].tokens[t].deprel = "appos:parataxis"
        if token.deprel == "ccomp:parataxis":
            # NOTE(review): assigning "appos:parataxis" for a ccomp:parataxis
            # token looks like a copy-paste slip ("ccomp:parataxis" expected),
            # but it may be a deliberate mapping — confirm before changing.
            release.sentences[sentid].tokens[t].deprel = "appos:parataxis"
        if token.deprel == "xcomp" and token.upos == "VERB" and token.head_token.upos == "VERB":
            # Re-root VERB+VERB xcomp chains: the dependent becomes the main
            # verb, the former head becomes an auxiliary.
            # NOTE(review): this branch mutates the *workbench* tokens, not
            # `release`, and no save() is visible here — verify downstream.
            token.deprel = token.head_token.deprel
            token.dephead = token.head_token.dephead
            token.head_token.upos = "AUX"
            token.head_token.deprel = "aux"
            token.head_token.dephead = token.id
            for _t, _token in enumerate(sentence.tokens):
                if _token.upos == "SCONJ" and _token.deprel == "mark" and _token.dephead == token.id:
                    _token.upos = "ADP"
                    _token.deprel = "compound"
                    _token.dephead = token.head_token.id