def __init__(self, path_text, path_tags):
    """Set up output paths, loggers and the entity reader for one text.

    Args:
        path_text: path of the source text; all other paths derive from it.
        path_tags: path of the tag table passed on to ``TextEntities``.

    The process is terminated through ``sys.exit`` when ``TextEntities``
    cannot be created.
    """
    # testo.txt => testo_txt,txt
    self.path_out = set_path_teim_out(path_text)

    # informational log: testo.txt => testo_txt.log
    info_path = set_path_teim_log(path_text)
    self.log_info = Log("w").open(info_path, 0).log

    # xml log: testo.txt => testo_teim_.xml
    self.path_xml = set_path_teim_xml_log(path_text)
    self.log_teim_xml = Log("w").open(self.path_xml, 0).log

    # error log: testo.txt => testo_txt.ERR.log
    err_path = set_path_teim_err(path_text)
    self.log_err = Log("w").open(err_path, 1).log

    try:
        self.text_entities = TextEntities(path_text, path_tags, self.log_err)
    except Exception as exc:
        sys.exit(exc)

    self.row_num = -1

    # kind of line numbering
    self.LB = 'lb'
    self.LG_L = 'lg_l'
    self.type_line = self.LB

    self.trace = False
    self.input_err_active = False
def __init__(self, path_text, path_csv):
    """Derive the working paths, open the logs and load the overflow tags.

    Args:
        path_text: path of the source text; input, output and log paths
            are derived from it.
        path_csv: path handed to ``add_xml_span_overflow_list``.
    """
    # input: test.txt => testo_id.xml
    self.path_in = set_path_over_in(path_text)
    # output: test.txt => testo_id_over.xml
    self.path_out = set_path_over_out(path_text)

    # span log: test.txt => testo_id_over.log
    span_log_path = set_path_over_log(path_text)
    self.logspan = Log("w").open(span_log_path, 0).log

    # error log: test.txt => testo_id_over.ERR.log
    err_log_path = set_path_over_err(path_text)
    self.logerr = Log("w").open(err_log_path, 1).log

    self.root_xml = etree
    self.span_data = {}
    self.key_span_data = ''
    self.row_tag_over_js = {}

    # Each attribute receives its own, independent empty list.
    for attr_name in ("tag_op_alter", "tag_cl_alter",
                      "tag_op_lst", "tag_cl_lst",
                      "rows_tag_over"):
        setattr(self, attr_name, [])

    self.add_xml_span_overflow_list(path_csv)
def __init__(self, path_xml='', path_txt='', write_append='w'):
    """Store the conversion parameters and open the error log.

    Args:
        path_xml: path of the XML file to read.
        path_txt: path of the text file to write; the error log path is
            obtained by replacing ".txt" with ".ERR.log".
        write_append: mode string later used to open ``path_txt``.
    """
    self.path_xml, self.path_txt = path_xml, path_txt
    self.write_append = write_append

    err_log_path = path_txt.replace(".txt", ".ERR.log")
    self.logerr = Log("w").open(err_log_path, 1).log

    self.txt_builder = None
    self.trace = False
def __init__(self, path_text):
    """Remember the text path and open the info and error logs.

    Args:
        path_text: path of the text to check; both log paths derive from it.
    """
    self.path_text = path_text

    info_path = set_path_checktxt_out(path_text)
    self.log_info = Log("w").open(info_path, 1).log

    err_path = set_path_checktxt_err(path_text)
    self.log_err = Log("w").open(err_path, 1).log

    self.trace = False
    self.rows, self.rnd_rows = [], []
def __init__(self):
    """Open the two builder logs and reset all working lists."""
    info_log = Log("w")
    info_log.open("log/txtbuilder.log", 0)
    self.log = info_log

    err_log = Log("a")
    err_log.open("log/txtbuilder.ERR.log", 1)
    self.logerr = err_log

    # Working containers, filled later by add()/elab().
    self.data_lst = []
    self.data_txt_lst = []
    self.data_span_lst = []
    self.from_to_lst = []
    self.txt_rows = []

    self.up = True
    self.w_liv = 100
    self.trace = False

    # Lookup table built from the module-level RAMIS definitions.
    self.ramis = self.set_ramis_dict()
def __init__(self, path_text, log_err=None):
    """Store the text path and make sure an error logger is available.

    Args:
        path_text: path of the text file to process.
        log_err: callable used to report errors. When ``None`` a new error
            log is opened at the path given by ``set_path_teim_err``.
    """
    self.path_text = path_text
    if log_err is None:
        path_err = set_path_teim_err(path_text)
        # Fixed: the original assigned to the misspelled ``self.log_errr``,
        # leaving ``self.log_err`` undefined on this branch.
        self.log_err = Log('w').open(path_err, 1).log
    else:
        self.log_err = log_err
def do_main(path_src, path_tags):
    """Extract the row entities of a text and write them to a words log.

    Args:
        path_src: path of the source ".txt" file; the log paths are obtained
            by replacing ".txt" with "_words.ERR.log" / "_words.log".
        path_tags: path of the tag table passed to ``TextEntities``.
    """
    err_logger = Log("w").open(
        path_src.replace(".txt", "_words.ERR.log"), 1).log
    entities = TextEntities(path_src, path_tags, err_logger)
    rows = entities.get_rows_entities()
    word_entities_log(rows, path_src.replace(".txt", "_words.log"))
def log_tgid_js(self):
    """Dump ``self.tgid_js`` as indented JSON next to the output file.

    The target path is ``self.path_out`` with ".xml" replaced by ".json".
    On any failure the error is reported through ``self.log_err`` and the
    process exits with the same message.
    """
    try:
        json_path = self.path_out.replace(".xml", ".json")
        write_json = Log("w").open(json_path, 0).log
        write_json(json.dumps(self.tgid_js, indent=2))
    except Exception as exc:
        msg = f'ERROR log_tgid_js() \n{exc}'
        self.log_err(msg)
        sys.exit(msg)
def __init__(self, path_text, path_csv):
    """Store the input paths, open the logs and reset the checker state.

    Args:
        path_text: path of the text to check; log paths derive from it.
        path_csv: path of the CSV file, stored as ``self.path_csv``.
    """
    self.path_text, self.path_csv = path_text, path_csv

    # info log: testo.txt => testo_checkover.log
    info_path = set_path_checkover_out(path_text)
    self.log_info = Log("w").open(info_path, 1).log

    # error log: testo.txt => testo_checkover.ERR.log
    err_path = set_path_checkover_err(path_text)
    self.log_err = Log("w").open(err_path, 1).log

    # Text being analysed and its bookkeeping.
    self.text = ''
    self.spc = ''
    self.len_text = 0
    self.brk_lst = []

    # Current tag description (attribute names kept exactly as they are).
    self.tag_trype = ""
    self.tag_open = ""
    self.tag_close = ""

    # Error counters.
    self.err1 = self.err2 = 0
def __init__(self, path_text, path_csv):
    """Derive the working paths, open the logs and reset the id tables.

    Args:
        path_text: path of the source text; input, output and log paths
            are derived from it.
        path_csv: path of the CSV file, stored as ``self.path_csv``.
    """
    self.path_text = path_text
    self.path_csv = path_csv

    # input: testo.txt => testo_txt.txt
    self.path_in = set_path_id_in(path_text)
    # output: testo.txt => testo_id.xml
    self.path_out = set_path_id_out(path_text)

    # NOTE(review): the original comment announces "testo_id.ERR.log", yet
    # the error log path comes from set_path_id_log(), the same helper used
    # for the info log below - confirm whether a dedicated *_err helper was
    # intended.
    err_path = set_path_id_log(path_text)
    self.log_err = Log("w").open(err_path, 1).log

    # info log: testo.txt => testo_id.log
    info_path = set_path_id_log(path_text)
    self.log_info = Log("w").open(info_path, 0).log

    self.xml_root = etree
    self.tgid_js = {}
    self.id_cfg = {}
    self.input_err_active = False
def __init__(self, path_text='', path_note=''):
    """Derive the note-processing paths and open the error log.

    Args:
        path_text: path of the source text; input, output and error-log
            paths are derived from it.
        path_note: path of the notes file (teimnote.csv).
    """
    # input: testo.txt => testo_id_over.xml
    self.path_in = tpth.set_path_note_in(path_text)
    # output: testo.txt => testo_id_over_note.xml
    self.path_out = tpth.set_path_note_out(path_text)

    self.note_path = path_note

    # error log: testo.txt => testo_id_over_note.ERR.log
    err_path = tpth.set_path_note_err(path_text)
    self.log_err = Log("w").open(err_path, 1).log

    self.delimiter = '|'
import sys import re from teimedlib.ualog import Log #import ulalib.pathutils as ptu __date__ = "11-03-2022" __version__ = "0.1.0" __author__ = "Marta Materni" # ENCODING = 'ISO-8859-1' ENCODING = 'utf-8' APO = "’" PUNTS = ',.;::?!^~()[]{}<>=+-*#@£&%/\\«»“‘’`"\'' UNDER = "_" log_info = Log("w") log_err = Log("w") class TextCleaner(object): """ Sistema puteggiature rimuove spazi bianche maggiori di 1 elimina spazi ad inizio e fine riga il parametro l (opzionale): -1) lascia la separazione originale 0 ) separa per paragrafi >0) separa per lunghezza riga """
class TxtBuilder:
    """Build plain-text rows from a flat list of node dictionaries.

    Nodes are collected with ``add()``; ``elab()`` separates the ``span``
    nodes from the others, decorates the text nodes and fills
    ``self.txt_rows``; the ``txt`` property joins the rows.
    """

    def __init__(self):
        """Open the builder logs and reset every working list."""
        self.log = Log("w")
        self.log.open("log/txtbuilder.log", 0)
        self.logerr = Log("a")
        self.logerr.open("log/txtbuilder.ERR.log", 1)
        self.data_lst = []
        self.data_txt_lst = []
        self.data_span_lst = []
        self.from_to_lst = []
        self.txt_rows = []
        self.up = True
        self.w_liv = 100
        self.trace = False
        self.ramis = self.set_ramis_dict()

    def set_ramis_dict(self):
        """Parse the RAMIS entries ("key|x:y,x:y,...") into nested dicts."""
        js = {}
        for entry in RAMIS:
            key, pairs = entry.split('|')
            js[key] = {}
            for pair in pairs.split(','):
                src, dst = pair.split(':')
                js[key][src] = dst
        return js

    def get_ramis(self, key, ch):
        """Return the replacement of ``ch`` in table ``key``.

        An "ERR<key>" / "ERR<ch>" marker is returned when the table or the
        character is unknown.
        """
        table = self.ramis.get(key, None)
        if table is None:
            return f"ERR{key}"
        repl = table.get(ch, None)
        if repl is None:
            return f"ERR{ch}"
        return repl

    def fill_from_to_list(self):
        """Collect from/to/type of every span node into ``from_to_lst``.

        Logs the offending node and exits when one of the three attributes
        is missing.
        """
        for data_span in self.data_span_lst:
            x_items = data_span.get('items', {})
            x_from = x_items.get('from', None)
            x_to = x_items.get('to', None)
            x_type = x_items.get('type', None)
            if x_from is None or x_to is None or x_type is None:
                self.logerr.log("fill_from_to_list ERROR.").prn()
                self.logerr.log(pp(data_span)).prn()
                sys.exit(1)
            item = {
                "id0": x_from,
                "id1": x_to,
                "type": x_type
            }
            self.from_to_lst.append(item)

    def from_to_set_data_txt(self):
        """Write the start/end markers of every span into the text nodes.

        MONOLOG spans use '[' ... ']', DIRECT spans use '{' ... '}'; when
        one end of the span is empty the marker carries an "ERR" label.
        """
        for from_to in self.from_to_lst:
            id_from = from_to['id0'].strip()
            id_to = from_to['id1'].strip()
            span_type = from_to['type'].strip()
            err = 0
            if id_from == '':
                err = 1
            if id_to == '':
                err = 2
            if err == 1:
                self.logerr.log(f"ERROR from is null. to:{id_to}.")
            elif err == 2:
                self.logerr.log(f"ERROR from={id_from} to is null.")
            for data_txt in self.data_txt_lst:
                node_id = data_txt['id']
                if node_id == '':
                    continue
                if id_from == node_id:
                    if span_type == MONOLOG:
                        data_txt[START] = '[' if err == 0 else '[ERR '
                    elif span_type == DIRECT:
                        data_txt[START] = '{' if err == 0 else '{ERR '
                # Independent test (was ``elif``): a span that starts and
                # ends on the same node must also receive its end marker.
                if id_to == node_id:
                    if span_type == MONOLOG:
                        data_txt[END] = ']' if err == 0 else ' ERR]'
                    elif span_type == DIRECT:
                        data_txt[END] = '}' if err == 0 else ' ERR}'

    def set_data_txt_list(self):
        """Fill the t_* fields (index, separator, upper-case, line break)."""
        t_up = False
        sic = False
        for i, d in enumerate(self.data_txt_lst):
            liv = d["liv"]
            tag = d['tag'].lower().strip()
            d['tag'] = tag
            text = d['text'].strip()
            d['t_i'] = i
            sp = ''
            ln = False
            if text != '':
                # Pending flags are consumed by the first node with text.
                if t_up:
                    self.data_txt_lst[i]['t_up'] = True
                    t_up = False
                if sic:
                    self.data_txt_lst[i]['text'] = ''
                    sic = False
            if tag == 'w':
                sp = ' '
                self.w_liv = liv
            elif tag == 'pc':
                if text in ['.', '!', '?']:
                    t_up = True
            elif tag in NAMES_UP:
                t_up = True
            elif tag in ['lg']:
                t_up = True
            elif tag == 'del':
                self.data_txt_lst[i]['text'] = ''
                self.data_txt_lst[i]['tail'] = ''
            elif tag == 'sic':
                sic = True
            elif tag == 'l':
                ln = True
            d['t_sp'] = sp
            d['t_ln'] = ln

    def is_in_xml_items(self, items, key, val):
        """Return True when ``items[key]``, without '#' and blanks, is ``val``."""
        v = items.get(key, '')
        v = v.replace('#', '').strip()
        return v == val

    def adjust_tail_inversion(self):
        """Move the text of a nested node in front of its parent's tail.

        Example:
            <expan corresp="#ab-tild-q">q<ex>ue</ex></expan>re
        "re" is the tail of <expan>, "ue" the text of <ex>. Because <ex>
        follows <expan> in the node list, the output would be "q re ue"
        instead of "q ue re". The text and tail of the deeper following
        node are therefore prepended to the tail of the current node.
        The first and the last node are never modified.
        """
        last = len(self.data_txt_lst) - 1
        for i in range(1, last):
            t_curr = self.data_txt_lst[i]
            t_succ = self.data_txt_lst[i + 1]
            if t_curr['tail'] != '' and t_succ['liv'] > t_curr['liv']:
                text_succ = t_succ['text']
                tail_succ = t_succ['tail']
                t_succ['text'] = ''
                t_succ['tail'] = ''
                t_curr['tail'] = f"{text_succ}{tail_succ}{t_curr['tail']}"

    def build_txt_rows(self):
        """Create ``self.txt_rows`` from the decorated text nodes.

        A row is closed on every node flagged ``t_ln``; the remaining words
        form a final (possibly empty) row.
        """
        self.adjust_tail_inversion()
        self.txt_rows = []
        words = []
        n_nodes = len(self.data_txt_lst)
        for i, d in enumerate(self.data_txt_lst):
            tag = d['tag'].strip()
            text = d['text'].strip()
            tail = d['tail'].strip()
            items = d['items']
            t_start = d['t_start']
            t_sp = d['t_sp']
            t_up = d['t_up']
            t_end = d['t_end']
            t_ln = d['t_ln']
            if tag == 'c':
                if len(text) == 1:
                    k = items.get('ana', None)
                    if k is not None:
                        text = self.get_ramis(k, text)
            elif tag == 'w':
                # elision: no separator before the following node
                if self.is_in_xml_items(items, 'ana', 'elis'):
                    text = f'{text}{ELIS}'
                    # Guarded: an elided word may be the last node.
                    if i + 1 < n_nodes:
                        self.data_txt_lst[i + 1]['t_sp'] = ''
                # enclitic: attached to the preceding word
                if self.is_in_xml_items(items, 'ana', 'encl'):
                    text = f'{ENCL}{text} '
                    t_sp = ''
            if t_sp != '':
                words.append(t_sp)
            if t_start != '':
                words.append(t_start)
            if t_up:
                text = text.capitalize()
            else:
                text = text.lower()
            tail = tail.lower()
            w = f"{text}{tail}"
            if w != '':
                words.append(w)
            if t_end != '':
                words.append(t_end)
            if t_ln:
                self.txt_rows.append(''.join(words))
                words = []
        self.txt_rows.append(''.join(words).strip())

    def text_adjust(self):
        """Normalize punctuation spacing and span markers in every row."""
        for i, rw in enumerate(self.txt_rows):
            rw = re.sub(r" ,", ", ", rw)
            rw = re.sub(r" ;", "; ", rw)
            rw = re.sub(r" \.", ". ", rw)
            rw = re.sub(r'\[\s*', ' "', rw)
            rw = re.sub(r'\]', '" ', rw)
            rw = re.sub(r'{\s*', ' "', rw)
            rw = re.sub(r'}', '" ', rw)
            rw = rw.replace(f"{ELIS} ", ELIS)
            rw = re.sub(r"\s{2,}", " ", rw)
            self.txt_rows[i] = rw.strip()

    def elab(self):
        """Run the whole pipeline on the nodes collected with ``add()``."""
        for data in self.data_lst:
            if data['tag'] == 'span':
                self.data_span_lst.append(data)
            else:
                self.data_txt_lst.append(data)
        # fill the list with the from/to ids
        self.fill_from_to_list()
        # complete the elements of data_txt_lst
        self.set_data_txt_list()
        # set start and end in data_txt
        self.from_to_set_data_txt()
        # create the text rows
        self.build_txt_rows()
        # tidy up the text rows
        self.text_adjust()

    def add(self, data):
        """Append one node dictionary to the input list."""
        self.data_lst.append(data)

    @property
    def txt(self):
        """The text rows joined with ``os.linesep``."""
        return os.linesep.join(self.txt_rows)
elab_rows get_rows_entities elab_row elab_word_entities set_args is_tag_to_add_w subst_add_w supplied_add_w set_word_attr remove_word_underscore check_xml check_xml """ LOG = Log("w").open("XXX.log", 1).log """ CNT = "$" # carattere temp per note def preserve_note(self, line): p0 = line.find("<note") p1 = line.find("</note>") s0 = line[0:p0] s1 = line[p0:p1] s2 = line[p1:] s1 = s1.replace(" ", CNT, -1) t = s0 + s1 + s2 t = t.replace("<note", " <note", -1).replace("</note>", "</note> ", -1) return t if line.find("<note") > -1:
def __init__(self):
    """Create the error and info loggers (append mode) and open them."""
    err_log, info_log = Log("a"), Log("a")
    self.logerr = err_log
    self.log = info_log
    err_log.open("log/prjmgr.ERR.log", 1)
    info_log.open("log/prjmgr.log", 0)
class PrjMgr(object):
    """
    Manage projects described in JSON.
    Running the script without arguments shows all the options.
    e.g.
    prjmgr.py prol_txt.json

    prol_txt.json:
    {
    "log": "0",
    "exe": [
        [
        "teimxml.py ",
        "-i tou1/prol.txt",
        "-t teimcfg/teimtags.csv",
        "-o tou1/log/prol_teim.txt"
        ]
    ]
    }
    """

    def __init__(self):
        """Create the error and info loggers (append mode) and open them."""
        self.logerr = Log("a")
        self.log = Log("a")
        self.logerr.open("log/prjmgr.ERR.log", 1)
        self.log.open("log/prjmgr.log", 0)

    def kv_split(self, s, sep):
        """Split ``s`` on ``sep`` and return the first two stripped fields.

        The second field is '' when ``sep`` does not occur; fields after
        the second one are ignored.
        """
        sp = s.split(sep)
        s0 = sp[0].strip()
        s1 = ''
        if len(sp) > 1:
            s1 = sp[1].strip()
        return s0, s1

    def list2str(self, data):
        """Return ``data`` stripped, joining it with blanks if not a str."""
        if isinstance(data, str):
            return data.strip()
        return " ".join(data).strip()

    def get(self, js, k):
        """Return ``js[k]``; raise ``Exception`` when missing or None."""
        s = js.get(k, None)
        if s is None:
            raise Exception(f"{k} not found.{os.linesep}")
        return s

    def files_of_dir(self, d, e):
        """Return the sorted paths of directory ``d`` matching glob ``e``.

        Raises:
            Exception: when ``d`` does not exist.
        """
        p = pl.Path(d)
        if p.exists() is False:
            raise Exception(f'{d} not found.')
        return sorted(p.glob(e))

    def chmod(self, path):
        """Grant read/write/execute on ``path`` to user, group and others."""
        os.chmod(path, stat.S_IRWXG | stat.S_IRWXU | stat.S_IRWXO)

    def include_files(self, include):
        """Replace, in the host file, each parameter with its linked content.

        Args:
            include (dict): the "include" branch of the project. Reads the
                keys "host", "dest", "files" ("param|path" entries whose
                file content replaces ``param``) and "params"
                ("key|value" entries). The result is written to "dest".

        Logs the error and exits on any failure.
        """
        self.log.log(os.linesep, ">> include")
        try:
            file_host = include.get("host", None)
            file_dest = include.get("dest", None)
            file_lst = include.get("files", [])
            param_lst = include.get("params", [])
            with open(file_host, "rt") as f:
                host = f.read()
            for param_path in file_lst:
                param, path = self.kv_split(param_path, '|')
                self.log.log(f"{param}: {path}")
                with open(path, "rt") as f:
                    txt = f.read()
                host = host.replace(param, txt)
            for key_val in param_lst:
                key, val = self.kv_split(key_val, '|')
                self.log.log(f"{key}: {val}")
                host = host.replace(key, val)
            with open(file_dest, "w+") as f:
                f.write(host)
            self.chmod(file_dest)
        except Exception as e:
            self.logerr.log("include")
            self.logerr.log(e)
            sys.exit(1)

    def execute_files_of_dir(self, exe_dir):
        """Run every command of "exe_file" once per file of a directory.

        In each command ``par_name`` is replaced by the file name after
        applying the "old|new" substitution given in ``par_subst``.
        Errors are logged; processing stops at the first failing command.
        """
        self.log.log(">> exe_dir").prn()
        try:
            dr = self.get(exe_dir, 'dir')
            ptrn = self.get(exe_dir, 'pattern')
            exe_lst = self.get(exe_dir, 'exe_file')
            par_name = self.get(exe_dir, 'par_name')
            par_subst = self.get(exe_dir, 'par_subst')
            # replace par in par_name
            k, v = self.kv_split(par_subst, '|')
            files = self.files_of_dir(dr, ptrn)
            for f in files:
                file_name = os.path.basename(f)
                file_par = file_name.replace(k, v)
                for exe in exe_lst:
                    exe = self.list2str(exe)
                    x = exe.replace(par_name, file_par)
                    self.log.log(x)
                    # NOTE(review): the command line comes from the project
                    # file and is executed through the shell.
                    r = os.system(x)
                    if r != 0:
                        raise Exception(f"execute:{x}")
        except Exception as e:
            self.logerr.log("ERROR", "exe_dir")
            self.logerr.log(e)

    def remove_files_of_dir(self, remove_dir):
        """Delete the files matching each {"dir", "pattern"} entry."""
        self.log.log(">> remove_dir").prn()
        try:
            for de in remove_dir:
                self.log.log(de)
                dr = de.get('dir')
                ptrn = de.get('pattern')
                files = self.files_of_dir(dr, ptrn)
                for f in files:
                    self.log.log(f)
                    os.remove(f)
        except Exception as e:
            self.logerr.log("remove_dir")
            self.logerr.log(e)
            self.logerr.log(pp(remove_dir))

    def merge_files_of_list(self, merge_files):
        """Concatenate the listed files into "out_path".

        Each file is followed by a line separator. Errors propagate to the
        caller; the output file is closed in every case.
        """
        self.log.log(">> merge_files").prn()
        out = self.get(merge_files, "out_path")
        files = self.get(merge_files, "files")
        # ``with`` closes the output even when a source file is unreadable.
        with open(out, "w+") as fout:
            for src_path in files:
                self.log.log(src_path)
                with open(src_path, "rt") as src:
                    txt = src.read()
                fout.write(txt)
                fout.write(os.linesep)
        self.log.log(out)
        self.chmod(out)

    def merge_files_of_dir(self, merge_dir):
        """Concatenate the files of "dir" matching "pattern" into "out_path".

        Each file is followed by a line separator. Errors are logged and
        the output file is closed in every case.
        """
        self.log.log(">> merge_dir").prn()
        try:
            dr = self.get(merge_dir, 'dir')
            ptrn = self.get(merge_dir, 'pattern')
            out_path = self.get(merge_dir, 'out_path')
            # The file list is taken before the output file is created.
            files = self.files_of_dir(dr, ptrn)
            with open(out_path, "w") as file_out:
                for fpath in files:
                    self.log.log(fpath)
                    with open(fpath, "rt") as f:
                        txt = f.read()
                    file_out.write(txt)
                    file_out.write(os.linesep)
            self.chmod(out_path)
            self.log.log(out_path)
        except Exception as e:
            self.logerr.log("merge_dir")
            self.logerr.log(e)
            self.logerr.log(pp(merge_dir))

    def execute_list_progs(self, exe):
        """Run each command of ``exe``; stop and log at the first failure."""
        self.log.log(">> exe").prn()
        try:
            for x in exe:
                x = self.list2str(x)
                self.log.log(x)
                # NOTE(review): executed through the shell, see above.
                r = os.system(x)
                if r != 0:
                    raise Exception(str(r))
        except Exception as e:
            self.logerr.log("exe")
            self.logerr.log(e)
            self.logerr.log(pp(exe))

    def copy_file(self, copy_file):
        """Copy "in_path" to "out_path" for each entry, using mode "aw".

        In append mode ('a') a line separator is added after the text.
        Logs the error and exits on any failure.
        """
        self.log.log(">> copy_file").prn()
        try:
            for x in copy_file:
                in_path = self.get(x, 'in_path')
                out_path = self.get(x, 'out_path')
                aw = self.get(x, "aw")
                self.log.log(in_path)
                with open(in_path, "rt") as f:
                    text = f.read()
                with open(out_path, aw) as f:
                    f.write(text)
                    if aw == 'a':
                        f.write(os.linesep)
                self.chmod(out_path)
                self.log.log(out_path)
        except Exception as e:
            self.logerr.log("copy_file")
            self.logerr.log(e)
            self.logerr.log(pp(copy_file))
            sys.exit(1)

    def write_text(self, write_text):
        """Write "text" to "out_path" using mode "aw".

        In append mode ('a') a line separator is added after the text.
        Logs the error and exits on any failure.
        """
        self.log.log(">> write_text").prn()
        try:
            text = self.get(write_text, 'text')
            out_path = self.get(write_text, 'out_path')
            aw = self.get(write_text, "aw")
            with open(out_path, aw) as f:
                f.write(text)
                if aw == 'a':
                    f.write(os.linesep)
            self.chmod(out_path)
            self.log.log(out_path)
        except Exception as e:
            self.logerr.log("write_text")
            self.logerr.log(e)
            self.logerr.log(pp(write_text))
            sys.exit(1)

    def parse_json(self, js):
        """Dispatch every key of the project dict to its handler.

        Keys may carry a suffix after a dot (exe.1, exe.2, ...): only the
        part before the first dot selects the handler.
        """
        for k, v in js.items():
            # accepts tags such as exe.1 exe.2 ..
            k = k.split('.')[0]
            if k == "exe":
                self.execute_list_progs(v)
            elif k == "merge_files":
                self.merge_files_of_list(v)
            elif k == "merge_dir":
                self.merge_files_of_dir(v)
            elif k == "include":
                self.include_files(v)
            elif k == "exe_dir":
                self.execute_files_of_dir(v)
            elif k == "remove_dir":
                self.remove_files_of_dir(v)
            elif k == "write_text":
                self.write_text(v)
            elif k == "copy_file":
                self.copy_file(v)
            elif k == "log":
                self.log.set_liv(int(v))
            else:
                self.logerr.log(f"ERROR option:{k} not implemented")

    def parse_file(self, in_path):
        """Load the JSON project at ``in_path`` and execute it.

        Logs the error and exits when the file cannot be read or parsed.
        """
        try:
            with open(in_path, "r") as f:
                txt = f.read()
            js = json.loads(txt)
        except Exception as e:
            self.logerr.log("prjmgr.py json ERROR")
            self.logerr.log(e)
            sys.exit(1)
        self.parse_json(js)

    def parse_jsons(self, *js):
        """Execute several already-parsed project dicts in order."""
        for j in js:
            self.parse_json(j)
def do_main(path_in, path_out, add_div):
    """Pretty-print an XML file.

    Args:
        path_in: path of the XML file to read.
        path_out: path of the formatted file to write; the error log path
            is obtained by replacing '.xml' with '.xml.ERR.log'.
        add_div: when true the source is wrapped in <div> ... </div>
            before parsing.

    A read error is logged and the function returns. On a parse error the
    numbered source (with the offending line marked) and the message are
    logged, and any existing output file is removed.
    """
    path_err = path_out.replace('.xml', ".xml.ERR.log")
    log_err = Log("w").open(path_err, 1).log

    def make_xml_err(xml, err):
        """Number the source rows and mark the row named in ``err``."""
        m = re.search(r'(line )([0-9]+)(,)', err)
        n = int(m.group(2)) if m is not None else -1
        rows = xml.split(os.linesep)
        for i, row in enumerate(rows):
            rows[i] = f'{i+1}){row}'
            if i + 1 == n:
                rows[i] = f'\nERRROR\n{rows[i]}\n{err}\n'
        return os.linesep.join(rows)

    try:
        with open(path_in, "r") as f:
            src = f.read()
    except Exception as e:
        msg_err = f"ERROR format_xml()\n read xml\n{e}"
        log_err(msg_err)
        return

    try:
        if add_div:
            xml_src = f'<div>{src}</div>'
        else:
            xml_src = src
        parser = etree.XMLParser(remove_blank_text=True)
        root = etree.XML(xml_src, parser)
        xml = etree.tostring(root,
                             method='xml',
                             xml_declaration=None,
                             encoding='unicode',
                             with_tail=True,
                             pretty_print=True,
                             standalone=None,
                             doctype=None,
                             exclusive=False,
                             inclusive_ns_prefixes=None,
                             strip_text=False)
    except etree.ParseError as e:
        msg_err = f"ERROR format_xml()\n{e}"
        xml_err = make_xml_err(src, str(e))
        log_err(xml_err, os.linesep)
        log_err(msg_err)
        # Best effort: drop a stale output file. Only file-system errors
        # (e.g. the file does not exist) are ignored, instead of the
        # original bare ``except`` that also hid unrelated failures.
        try:
            os.remove(path_out)
        except OSError:
            pass
    else:
        with open(path_out, "w") as f:
            f.write(xml)
def do_main(path_xml):
    """Check a TEI XML file, reporting problems to a dedicated error log.

    Args:
        path_xml: path of the XML file; the log path is obtained by
            replacing ".xml" with ".tei_xml_ERR.log".
    """
    err_logger = Log("w").open(
        path_xml.replace(".xml", ".tei_xml_ERR.log"), 1).log
    CheckTeimXml().check_tei_xml(path_xml, err_logger)
import stat from teimedlib.ualog import Log from teimedlib.readovertags import read_tags_over_sorted from teimedlib.teim_paths import * import teimxmlformat as xmf __date__ = "30-05-2022" __version__ = "0.2.4" __author__ = "Marta Materni" def pp(data, w=40): return pprint.pformat(data, indent=2, width=w) LGDB = Log("w").open("debug.log", 1).log class TeimOverFlow(object): DATA_TYPE = "tp" DATA_FROM = "from" DATA_TO = "to" OP = 'op' CL = 'cl' LOP = 'lop' LCL = 'lcl' TP = 'tp' OPEN_CLOSE = 'ctrl' """ teimover.py -i text.txt -c teimcfg/teimoverflow.csv
class Xml2Txt:
    """
    Extract a text file from a TEI XML file.
    """

    def __init__(self, path_xml='', path_txt='', write_append='w'):
        """Store the conversion parameters and open the error log.

        Args:
            path_xml: path of the XML file to read.
            path_txt: path of the text file to write; the error log path
                is obtained by replacing ".txt" with ".ERR.log".
            write_append: mode string used to open ``path_txt``.
        """
        self.path_xml = path_xml
        self.path_txt = path_txt
        self.write_append = write_append
        path_err = path_txt.replace(".txt", ".ERR.log")
        self.logerr = Log("w").open(path_err, 1).log
        self.txt_builder = None
        self.trace = False

    def node_liv(self, node):
        """Return the depth of ``node`` (0 for a node without parent)."""
        d = 0
        while node is not None:
            d += 1
            node = node.getparent()
        return d - 1

    def clean_key(self, k):
        """Strip a leading "{http...}" namespace from an attribute name."""
        s = k
        if k.find("{http") > -1:
            p1 = k.rfind('}')
            if p1 > -1:
                s = k[p1 + 1:]
        return s

    def node_items(self, nd):
        """Return the attributes of ``nd`` as a dict with cleaned keys."""
        return {self.clean_key(k): v for k, v in nd.items()}

    def node_tag(self, nd):
        """Return the tag name without namespace, "XXX" if not a string."""
        try:
            tag = nd.tag if isinstance(nd.tag, str) else "XXX"
            pid = tag.find('}')
            if pid > 0:
                tag = tag[pid + 1:]
            return tag.strip()
        except Exception as e:
            self.logerr.log("ERROR in xml")
            self.logerr.log(str(e))
            return "XXX"

    def node_id(self, nd):
        """Return the value of the first attribute whose name contains 'id'."""
        for key, val in nd.items():
            if key.rfind('id') > -1:
                return val
        return ''

    def node_id_num(self, id):
        """Return the part of ``id`` starting at its first digit.

        Returns '' for an empty id and -1 when the id has no digit.
        """
        if id == '':
            return ''
        m = re.search(r'\d', id)
        if m is None:
            return -1
        return id[m.start():]

    def node_text(self, nd):
        """Return the stripped node text; inner line separators become ',,'."""
        text = '' if nd.text is None else nd.text
        return text.strip().replace(os.linesep, ',,')

    def node_tail(self, nd):
        """Return the stripped node tail with line separators removed."""
        tail = '' if nd.tail is None else nd.tail
        return tail.strip().replace(os.linesep, '')

    def node_val(self, nd):
        """Return all the text under ``nd`` joined by single blanks."""
        ls = [x.strip().replace(os.linesep, '') for x in nd.itertext()]
        texts = ' '.join(ls)
        return re.sub(r"\s{2,}", ' ', texts)

    def node_is_parent(self, nd):
        """Return True when ``nd`` has at least one child."""
        return len(nd.getchildren()) > 0

    def get_node_data(self, nd):
        """Collect id, depth, tag, text, tail and attributes of ``nd``.

        When the node has an id, its numeric part is added to the
        attributes under the key 'id_num'. 'val' is always empty.
        """
        items = self.node_items(nd)
        id = self.node_id(nd)
        if id != '':
            items['id_num'] = self.node_id_num(id)
        js = {
            'id': id,
            'liv': self.node_liv(nd),
            'tag': self.node_tag(nd),
            'text': self.node_text(nd),
            'tail': self.node_tail(nd),
            'items': items,
            'val': "",
            'is_parent': self.node_is_parent(nd)
        }
        return js

    def build_txt_data(self, nd):
        """Create the dict consumed by the text builder for one node.

        It holds the data extracted from the XML node plus the t_* fields,
        initially empty, reserved for the later elaboration.

        Args:
            nd: XML node.

        Returns:
            dict: node data + empty t_* fields.
        """
        x_data = self.get_node_data(nd)
        txt_data = {
            'id': x_data.get('id', 0),
            'is_parent': x_data.get('is_parent', False),
            'items': x_data.get('items', {}),
            'liv': x_data.get('liv', 0),
            'tag': x_data.get('tag', ''),
            'text': x_data.get('text', ''),
            'tail': x_data.get('tail', ''),
            'val': x_data.get('val', ''),
            't_i': 0,
            't_type': '',
            't_up': False,
            't_start': '',
            't_end': '',
            't_sp': '',
            't_ln': False,
            't_flag': False
        }
        return txt_data

    def write_txt(self):
        """Parse the XML file, build the text and write it to ``path_txt``.

        Returns:
            the path of the written text file.

        Exits the process when the XML cannot be parsed or when building
        or writing the text fails; the error (with traceback in the second
        case) is written to the error log first.
        """
        try:
            parser = etree.XMLParser(ns_clean=True)
            xml_root = etree.parse(self.path_xml, parser)
        except Exception as e:
            self.logerr.log("ERROR teixml2txt.py write_txt() parse_xml")
            self.logerr.log(e)
            sys.exit(str(e))
        try:
            self.txt_builder = TxtBuilder()
            for nd in xml_root.iter():
                txt_data = self.build_txt_data(nd)
                self.txt_builder.add(txt_data)
            self.txt_builder.elab()
            txt = self.txt_builder.txt
            make_dir_of_file(self.path_txt)
            with open(self.path_txt, self.write_append) as f:
                f.write(txt)
            chmod(self.path_txt)
        except Exception as e:
            # Fixed: the message used to name a non-existent write_html().
            self.logerr.log("ERROR teixml2txt.py write_txt()")
            self.logerr.log(e)
            self.logerr.log(traceback.format_exc())
            sys.exit(1)
        return self.path_txt
root_w = 1000 root_h = 600 root_x = 100 root_y = 100 log_w = 1000 log_h = 600 log_x = 200 log_y = 200 def pp(data, w=60): return pprint.pformat(data, width=w) tfhlogerr = Log("w") W_IDS = [ 'wmerge', 'wcheckover', 'wchecktxt', 'wtxt', 'wsetid', 'wover', 'wnote', 'wformat', 'wall', 'wremovelog' ] T_IDS = [ 'tcheckover', 'tchecktxt', 'ttxt', 'tsetid', 'tover', 'tnote', 'tformat', 'tall', 'tremovelog' ] TEIMFH_LOG_ERR = "log/teimxmlh.ERR.log" class TeimXmlFh(object):
class TxtBuilder:
    """Build plain-text rows from a flat list of node dictionaries.

    Nodes are collected with ``add()``; ``elab()`` separates the ``span``
    nodes from the others, decorates the text nodes and fills
    ``self.txt_rows``; the ``txt`` property joins the rows.
    """

    def __init__(self):
        """Open the builder logs and reset every working list."""
        self.log = Log("w")
        self.log.open("log/txtbuilder.log", 0)
        self.logerr = Log("a")
        self.logerr.open("log/txtbuilder.ERR.log", 1)
        self.data_lst = []
        self.data_txt_lst = []
        self.data_span_lst = []
        self.from_to_lst = []
        self.txt_rows = []
        self.up = True
        self.w_liv = 100
        self.trace = False
        self.ramis = self.set_ramis_dict()

    def set_ramis_dict(self):
        """Parse the RAMIS entries ("key|x:y,x:y,...") into nested dicts."""
        js = {}
        for entry in RAMIS:
            key, pairs = entry.split('|')
            js[key] = {}
            for pair in pairs.split(','):
                src, dst = pair.split(':')
                js[key][src] = dst
        return js

    def get_ramis(self, key, ch):
        """Return the replacement of ``ch`` in table ``key``.

        An "ERR<key>" / "ERR<ch>" marker is returned when the table or the
        character is unknown.
        """
        table = self.ramis.get(key, None)
        if table is None:
            return f"ERR{key}"
        repl = table.get(ch, None)
        if repl is None:
            return f"ERR{ch}"
        return repl

    def fill_from_to_list(self):
        """Collect from/to/type of every span node into ``from_to_lst``.

        Logs the offending node and exits when one of the three attributes
        is missing.
        """
        for data_span in self.data_span_lst:
            x_items = data_span.get('items', {})
            x_from = x_items.get('from', None)
            x_to = x_items.get('to', None)
            x_type = x_items.get('type', None)
            if x_from is None or x_to is None or x_type is None:
                self.logerr.log("fill_from_to_list ERROR.").prn()
                self.logerr.log(pp(data_span)).prn()
                sys.exit(1)
            item = {
                "id0": x_from,
                "id1": x_to,
                "type": x_type
            }
            self.from_to_lst.append(item)

    def from_to_set_data_txt(self):
        """Write the start/end markers of every span into the text nodes.

        MONOLOG spans use '[' ... ']', DIRECT spans use '{' ... '}'; when
        one end of the span is empty the marker carries an "ERR" label.
        """
        for from_to in self.from_to_lst:
            id_from = from_to['id0'].strip()
            id_to = from_to['id1'].strip()
            span_type = from_to['type'].strip()
            err = 0
            if id_from == '':
                err = 1
            if id_to == '':
                err = 2
            if err == 1:
                self.logerr.log(f"ERROR from is null. to:{id_to}.")
            elif err == 2:
                self.logerr.log(f"ERROR from={id_from} to is null.")
            for data_txt in self.data_txt_lst:
                node_id = data_txt['id']
                if node_id == '':
                    continue
                if id_from == node_id:
                    if span_type == MONOLOG:
                        data_txt[START] = '[' if err == 0 else '[ERR '
                    elif span_type == DIRECT:
                        data_txt[START] = '{' if err == 0 else '{ERR '
                # Independent test (was ``elif``): a span that starts and
                # ends on the same node must also receive its end marker.
                if id_to == node_id:
                    if span_type == MONOLOG:
                        data_txt[END] = ']' if err == 0 else ' ERR]'
                    elif span_type == DIRECT:
                        data_txt[END] = '}' if err == 0 else ' ERR}'

    def set_data_txt_list(self):
        """Fill the t_* fields (index, separator, upper-case, line break)."""
        t_up = False
        sic = False
        for i, d in enumerate(self.data_txt_lst):
            liv = d["liv"]
            tag = d['tag'].lower().strip()
            d['tag'] = tag
            text = d['text'].strip()
            d['t_i'] = i
            sp = ''
            ln = False
            if text != '':
                # Pending flags are consumed by the first node with text.
                if t_up:
                    self.data_txt_lst[i]['t_up'] = True
                    t_up = False
                if sic:
                    self.data_txt_lst[i]['text'] = ''
                    sic = False
            if tag == 'w':
                sp = ' '
                self.w_liv = liv
            elif tag == 'pc':
                if text in ['.', '!', '?']:
                    t_up = True
            elif tag in NAMES_UP:
                t_up = True
            elif tag in ['lg']:
                t_up = True
            elif tag == 'del':
                self.data_txt_lst[i]['text'] = ''
                self.data_txt_lst[i]['tail'] = ''
            elif tag == 'sic':
                sic = True
            elif tag == 'l':
                ln = True
            d['t_sp'] = sp
            d['t_ln'] = ln

    def is_in_xml_items(self, items, key, val):
        """Return True when ``items[key]``, without '#' and blanks, is ``val``."""
        v = items.get(key, '')
        v = v.replace('#', '').strip()
        return v == val

    def build_txt_rows(self):
        """Create ``self.txt_rows`` from the decorated text nodes.

        A row is closed on every node flagged ``t_ln``; the remaining words
        form a final (possibly empty) row.
        """
        self.txt_rows = []
        words = []
        n_nodes = len(self.data_txt_lst)
        for i, d in enumerate(self.data_txt_lst):
            tag = d['tag'].strip()
            text = d['text'].strip()
            tail = d['tail'].strip()
            items = d['items']
            t_start = d['t_start']
            t_sp = d['t_sp']
            t_up = d['t_up']
            t_end = d['t_end']
            t_ln = d['t_ln']
            if tag == 'c':
                if len(text) == 1:
                    k = items.get('ana', None)
                    if k is not None:
                        text = self.get_ramis(k, text)
            elif tag == 'w':
                # elision: no separator before the following node
                if self.is_in_xml_items(items, 'ana', 'elis'):
                    text = f'{text}{ELIS}'
                    # Guarded: an elided word may be the last node.
                    if i + 1 < n_nodes:
                        self.data_txt_lst[i + 1]['t_sp'] = ''
                # enclitic: attached to the preceding word
                if self.is_in_xml_items(items, 'ana', 'encl'):
                    text = f'{ENCL}{text} '
                    t_sp = ''
            if t_sp != '':
                words.append(t_sp)
            if t_start != '':
                words.append(t_start)
            if t_up:
                text = text.capitalize()
            else:
                text = text.lower()
            tail = tail.lower()
            w = f"{text}{tail}"
            if w != '':
                words.append(w)
            if t_end != '':
                words.append(t_end)
            if t_ln:
                self.txt_rows.append(''.join(words))
                words = []
        self.txt_rows.append(''.join(words).strip())

    def text_adjust(self):
        """Normalize punctuation spacing and span markers in every row."""
        for i, rw in enumerate(self.txt_rows):
            rw = re.sub(r" ,", ", ", rw)
            rw = re.sub(r" ;", "; ", rw)
            rw = re.sub(r" \.", ". ", rw)
            rw = re.sub(r'\[\s*', ' "', rw)
            rw = re.sub(r'\]', '" ', rw)
            rw = re.sub(r'{\s*', ' "', rw)
            rw = re.sub(r'}', '" ', rw)
            rw = rw.replace(f"{ELIS} ", ELIS)
            rw = re.sub(r"\s{2,}", " ", rw)
            self.txt_rows[i] = rw.strip()

    def elab(self):
        """Run the whole pipeline on the nodes collected with ``add()``."""
        for data in self.data_lst:
            if data['tag'] == 'span':
                self.data_span_lst.append(data)
            else:
                self.data_txt_lst.append(data)
        # fill the list with the from/to ids
        self.fill_from_to_list()
        # complete the elements of data_txt_lst
        self.set_data_txt_list()
        # set start and end in data_txt
        self.from_to_set_data_txt()
        # create the text rows
        self.build_txt_rows()
        # tidy up the text rows
        self.text_adjust()

    def add(self, data):
        """Append one node dictionary to the input list."""
        self.data_lst.append(data)

    @property
    def txt(self):
        """The text rows joined with ``os.linesep``."""
        return os.linesep.join(self.txt_rows)