def main(csv_data, laudo_type):
    """Generate an XML document of laudos (medical reports) split into sentences.

    Assumes the csv data has two columns: the first with the report index
    and the second with the report text itself.

    Arguments:
    csv_data -- an iterable of data in csv format.
    laudo_type -- a LaudoEnum indicating the csv's report type.
        NOTE(review): unused in this version of the function.

    Returns the pretty-printed XML as a string.

    NOTE(review): this definition is shadowed by a later ``def main`` in
    this file; only the last definition is visible to importers.
    """
    # Normalize HTML character entities and mis-encoded bytes before parsing.
    charent_corrected_str = encoding_handlers.substitute_char_entities(csv_data)
    wrongenc_corrected_str = encoding_handlers.substitute_wrong_encoding(charent_corrected_str)
    data = wrongenc_corrected_str.split('\n')
    csv_reader = csv.reader(data, delimiter='|', quoting=csv.QUOTE_NONE,
                            escapechar='\\')
    next(csv_reader)  # skip the csv header row
    # Keep only the report-text column; rows without it are ignored
    # (equivalent to the previous try/except IndexError: pass).
    laudos = [row[1] for row in csv_reader if len(row) > 1]
    formatted_data = bytes('\n'.join(laudos), 'utf8')

    # Run the python2-only sentence chunker as a subprocess, feeding the
    # reports on stdin and reading chunked sentences from stdout.
    # NOTE(review): the child env carries only PYTHONPATH (no PATH), so the
    # 'python2' executable is looked up via os.defpath -- confirm intended.
    sub_env = {'PYTHONPATH': os.getcwd()}
    p = subprocess.Popen(['python2', 'tcc/preprocess/sentence_chunker.py'],
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         env=sub_env)
    sent_chunker_data = p.communicate(formatted_data)[0]
    sent_chunker_sents = str(sent_chunker_data, 'utf8').split('\n')
    sent_chunker_sents = (s for s in sent_chunker_sents if s)  # drop empty lines
    csv_reader = csv.reader(sent_chunker_sents, delimiter='|',
                            quoting=csv.QUOTE_NONE, escapechar='\\')
    csv_reader = list(csv_reader)

    # Regroup chunked sentences into their reports by the index column.
    # groupby only merges *consecutive* rows, which matches the chunker's
    # ordered output.
    laudo_groups = []
    for k, g in itertools.groupby(csv_reader, key=lambda row: row[0]):
        laudo_groups.append([sent[1] for sent in g])

    tok = tokenizer.SentenceTokenizer()
    tokenized_laudo_groups = []
    for laudo_group in laudo_groups:
        # chain.from_iterable flattens in O(n); the previous
        # functools.reduce(op.concat, ...) rebuilt the list each step (O(n^2)).
        tokenized = itertools.chain.from_iterable(
            tok.tokenize(sent) for sent in laudo_group)
        tokenized_laudo_groups.append(list(tokenized))

    # Build the <laudos><laudo><sentença>...</sentença></laudo></laudos> tree.
    root = ET.Element('laudos')
    for sents in tokenized_laudo_groups:
        laudo_elem = ET.Element('laudo')
        for sent in sents:
            sent_elem = ET.Element('sentença')
            sent_elem.text = sent
            laudo_elem.append(sent_elem)
        root.append(laudo_elem)

    xml_string = ET.tostring(root)
    dom3 = parseString(xml_string)
    return dom3.toprettyxml(indent=' ')
def main(csv_data, laudo_type):
    """Split laudos (medical reports) into sentences and load them into the DB.

    Assumes the csv data has two columns: the first with the report index
    and the second with the report text itself. After chunking and
    tokenizing, each finding index is negation/hypothesis-annotated and the
    reports, sentences, findings and annotated expressions are inserted
    into PostgreSQL.

    Arguments:
    csv_data -- an iterable of data in csv format.
    laudo_type -- report type selector; compared against the string 'ecg'
        below (docstring previously said LaudoEnum -- TODO confirm which).
    """
    # Normalize HTML character entities and mis-encoded bytes before parsing.
    charent_corrected_str = encoding_handlers.substitute_char_entities(csv_data)
    wrongenc_corrected_str = encoding_handlers.substitute_wrong_encoding(charent_corrected_str)
    data = wrongenc_corrected_str.split('\n')
    csv_reader = csv.reader(data, delimiter='|', quoting=csv.QUOTE_NONE,
                            escapechar='\\')
    next(csv_reader)  # skip the csv header row
    # Keep only the report-text column; rows without it are ignored
    # (equivalent to the previous try/except IndexError: pass).
    laudos = [row[1] for row in csv_reader if len(row) > 1]
    # Eliminate repeated reports. groupby without sorting only collapses
    # *consecutive* duplicates -- non-adjacent repeats survive.
    laudos = [k for k, g in itertools.groupby(laudos)]
    formatted_data = bytes('\n'.join(laudos), 'utf8')

    # Run the python2-only sentence chunker as a subprocess.
    # NOTE(review): the child env carries only PYTHONPATH (no PATH), so the
    # 'python2.6' executable is looked up via os.defpath -- confirm intended.
    sub_env = {'PYTHONPATH': os.getcwd()}
    p = subprocess.Popen(['python2.6', 'tcc/preprocess/sentence_chunker.py'],
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         env=sub_env)
    sent_chunker_data = p.communicate(formatted_data)[0]
    sent_chunker_sents = str(sent_chunker_data, 'utf8').split('\n')
    sent_chunker_sents = (s for s in sent_chunker_sents if s)  # drop empty lines
    csv_reader = csv.reader(sent_chunker_sents, delimiter='|',
                            quoting=csv.QUOTE_NONE, escapechar='\\')
    csv_reader = list(csv_reader)

    # Regroup chunked sentences into their reports by the index column.
    laudo_groups = []
    for k, g in itertools.groupby(csv_reader, key=lambda row: row[0]):
        laudo_groups.append([sent[1] for sent in g])

    tok = tokenizer.SentenceTokenizer()
    tokenized_laudo_groups = []
    for laudo_group in laudo_groups:
        # chain.from_iterable flattens in O(n); the previous
        # functools.reduce(op.concat, ...) rebuilt the list each step (O(n^2)).
        tokenized = itertools.chain.from_iterable(
            tok.tokenize(sent) for sent in laudo_group)
        tokenized_laudo_groups.append(list(tokenized))

    # SECURITY/NOTE(review): hard-coded database credentials and host --
    # move to configuration/environment instead of source code.
    conn = psycopg2.connect(database='buscas', user='******',
                            host='150.162.67.6')
    if laudo_type == 'ecg':
        indexed_laudos = index_ecg(tokenized_laudo_groups)
    else:
        indexed_laudos = index_decs(tokenized_laudo_groups, conn)

    # Annotate every indexed finding with its negex classification.
    neg_tagger = negex.Tagger()
    for idx_laudo in indexed_laudos:
        for idx_sentence in idx_laudo:
            # Surround each index with <achado> tags so negex can see them.
            original_sent = idx_sentence.sentence()
            tagged_sent = insert_tags(original_sent, idx_sentence.indexes())
            assert len(re.findall(r'<achado>', tagged_sent)) == len(idx_sentence.indexes()), \
                str(idx_sentence) + '\n' + tagged_sent
            # Map (start, end) spans of the tagged text back to the
            # original indexes, relying on matching iteration order.
            tagged_sent_indexes = []
            for m in re.finditer(r'<achado>[^>]*</achado>', tagged_sent):
                tagged_sent_indexes.append((m.start(), m.end()))
            index_dict = dict(zip(tagged_sent_indexes, idx_sentence.indexes()))
            annotated_sent = neg_tagger.annotate(tagged_sent)
            findings = annotated_sent.findings()
            # BUGFIX: the message previously concatenated str + int, which
            # raised TypeError instead of showing the assertion message.
            assert len(idx_sentence.indexes()) == len(findings), \
                ('len(idx_sentence.indexes()) == '
                 + str(len(idx_sentence.indexes())) + '\n'
                 + 'len(findings) == ' + str(len(findings)))
            for finding in findings:
                tagged_sent_idx = (finding.start(), finding.end())
                index = index_dict[tagged_sent_idx]
                idx_sentence.insertClassification(
                    index, CLASSIF_STR_DICT[finding.classification()])
            # Sanity check: every index received a classification.
            classif_dict = idx_sentence.idx_classif_dict()
            for idx in idx_sentence.indexes():
                assert idx in classif_dict.keys(), \
                    (str(idx_sentence) + '\n' + tagged_sent + '\n'
                     + '\n\n'.join(original_sent[slice(*i)]
                                   for i in idx_sentence.indexes()))

    # Persist reports, sentences, findings and annotated expressions.
    cur = conn.cursor()
    values = dict()
    for idx_laudo in indexed_laudos:
        text_column = ''.join(str(idx_sent) for idx_sent in idx_laudo)
        cur.execute('SELECT nextval(%s);', ('laudo_id_seq',))
        laudo_id = cur.fetchone()[0]
        cur.execute('INSERT INTO laudo (id, texto) VALUES (%s, %s);',
                    (laudo_id, text_column))
        for idx_sent in idx_laudo:
            cur.execute('SELECT nextval(%s);', ('sentenca_id_seq',))
            sentenca_id = cur.fetchone()[0]
            values['id_sentenca'] = sentenca_id
            sent_str = str(idx_sent)
            cur.execute('INSERT INTO sentenca (id, id_laudo, texto) VALUES (%s, %s, %s);',
                        (sentenca_id, laudo_id, sent_str))
            idx_id_dict = idx_sent.idx_id_dict()
            idx_classif_dict = idx_sent.idx_classif_dict()
            for index in idx_sent.indexes():
                # Both keys are set from the same id; only one of them is
                # used per branch of the laudo_type test below.
                values['id_termo_cotidiano'] = idx_id_dict[index]
                values['id_decs'] = idx_id_dict[index]
                values['inicio'] = index[0]
                values['fim'] = index[1]
                if laudo_type == 'ecg':
                    cur.execute('INSERT INTO sentenca_termo_cotidiano (id_sentenca, id_termo_cotidiano, inicio, fim) VALUES (%(id_sentenca)s, %(id_termo_cotidiano)s, %(inicio)s, %(fim)s);', values)
                else:
                    cur.execute('INSERT INTO sentenca_decs (id_sentenca, id_decs, inicio, fim) VALUES (%(id_sentenca)s, %(id_decs)s, %(inicio)s, %(fim)s);', values)
                classif = idx_classif_dict[index]
                idx_str = sent_str[slice(*index)]
                if classif == 'negativo':
                    # Upsert the negated expression, then link it to the
                    # sentence span.
                    cur.execute('SELECT id FROM expressao_negada WHERE texto = %s',
                                (idx_str,))
                    en_row = cur.fetchone()
                    # fetchone() returns None if there are no rows
                    if not en_row:
                        cur.execute("SELECT nextval(%s);",
                                    ('expressao_negada_id_seq',))
                        expr_negada_id = cur.fetchone()[0]
                        cur.execute('INSERT INTO expressao_negada (id, texto) VALUES (%s, %s);',
                                    (expr_negada_id, idx_str))
                    else:
                        expr_negada_id = en_row[0]
                    values['id_expressao_negada'] = expr_negada_id
                    cur.execute('INSERT INTO sentenca_expressao_negada (id_sentenca, id_expressao_negada, inicio, fim) VALUES (%(id_sentenca)s, %(id_expressao_negada)s, %(inicio)s, %(fim)s);', values)
                elif classif == 'hipotético':
                    # Upsert the hypothetical expression, then link it to
                    # the sentence span.
                    cur.execute('SELECT id FROM expressao_hipotetica WHERE texto = %s',
                                (idx_str,))
                    eh_row = cur.fetchone()
                    # fetchone() returns None if there are no rows
                    if not eh_row:
                        cur.execute("SELECT nextval(%s);",
                                    ('expressao_hipotetica_id_seq',))
                        expr_hipotetica_id = cur.fetchone()[0]
                        cur.execute('INSERT INTO expressao_hipotetica (id, texto) VALUES (%s, %s);',
                                    (expr_hipotetica_id, idx_str))
                    else:
                        expr_hipotetica_id = eh_row[0]
                    values['id_expressao_hipotetica'] = expr_hipotetica_id
                    cur.execute('INSERT INTO sentenca_expressao_hipotetica (id_sentenca, id_expressao_hipotetica, inicio, fim) VALUES (%(id_sentenca)s, %(id_expressao_hipotetica)s, %(inicio)s, %(fim)s);', values)
    conn.commit()
    # Release the connection; previously it was never closed (leak).
    cur.close()
    conn.close()