Exemple #1
0
    def __init__(self, path_text, path_tags):
        """Prepare output paths, log functions and the text-entity parser.

        Args:
            path_text: path of the source text; all other paths are derived
                from it through the ``set_path_teim_*`` helpers.
            path_tags: tag file path, forwarded to ``TextEntities``.

        The process is terminated with ``sys.exit`` when ``TextEntities``
        cannot be constructed.
        """
        # Original author's mapping notes for "testo.txt":
        #   output    => testo_txt.txt
        #   info log  => testo_txt.log
        #   xml log   => testo_teim_.xml
        #   error log => testo_txt.ERR.log
        self.path_out = set_path_teim_out(path_text)
        self.log_info = Log("w").open(set_path_teim_log(path_text), 0).log

        self.path_xml = set_path_teim_xml_log(path_text)
        self.log_teim_xml = Log("w").open(self.path_xml, 0).log

        self.log_err = Log("w").open(set_path_teim_err(path_text), 1).log

        try:
            entities = TextEntities(path_text, path_tags, self.log_err)
        except Exception as exc:
            sys.exit(exc)
        self.text_entities = entities
        self.row_num = -1

        # Line-numbering modes; 'lb' is the one selected initially.
        self.LB, self.LG_L = 'lb', 'lg_l'
        self.type_line = self.LB

        self.trace = False
        self.input_err_active = False
Exemple #2
0
    def __init__(self, path_text, path_csv):
        """Derive the working paths, open the logs and load the overflow tags.

        Args:
            path_text: path of the source text; input, output and log paths
                are derived from it by the ``set_path_over_*`` helpers.
            path_csv: path passed to ``add_xml_span_overflow_list``.
        """
        # Original author's mapping notes for "test.txt":
        #   input     => testo_id.xml
        #   output    => testo_id_over.xml
        #   span log  => testo_id_over.log
        #   error log => testo_id_over.ERR.log
        self.path_in = set_path_over_in(path_text)
        self.path_out = set_path_over_out(path_text)
        self.logspan = Log("w").open(set_path_over_log(path_text), 0).log
        self.logerr = Log("w").open(set_path_over_err(path_text), 1).log

        # Working state, all empty until processing starts.
        self.root_xml = etree
        self.key_span_data = ''
        self.span_data, self.row_tag_over_js = {}, {}
        self.tag_op_alter, self.tag_cl_alter = [], []
        self.tag_op_lst, self.tag_cl_lst = [], []
        self.rows_tag_over = []

        self.add_xml_span_overflow_list(path_csv)
Exemple #3
0
    def __init__(self, path_xml='', path_txt='', write_append='w'):
        """Store the conversion settings and open the error log.

        Args:
            path_xml: path of the XML input.
            path_txt: path of the text output; the error log path is this
                value with ".txt" replaced by ".ERR.log".
            write_append: mode flag stored as ``self.write_append``.
        """
        self.path_xml, self.path_txt = path_xml, path_txt
        self.write_append = write_append

        err_log = Log("w").open(path_txt.replace(".txt", ".ERR.log"), 1)
        self.logerr = err_log.log

        self.txt_builder = None
        self.trace = False
Exemple #4
0
    def __init__(self, path_text):
        """Remember the text path and open the info and error logs.

        Args:
            path_text: path of the text to check; both log paths are derived
                from it by the ``set_path_checktxt_*`` helpers.
        """
        self.path_text = path_text

        self.log_info = Log("w").open(set_path_checktxt_out(path_text), 1).log
        self.log_err = Log("w").open(set_path_checktxt_err(path_text), 1).log

        self.trace = False
        self.rows, self.rnd_rows = [], []
Exemple #5
0
 def __init__(self):
     """Open the builder logs and reset all working lists."""
     self.log = Log("w")
     self.log.open("log/txtbuilder.log", 0)
     self.logerr = Log("a")
     self.logerr.open("log/txtbuilder.ERR.log", 1)

     # Record lists filled while processing.
     self.data_lst, self.data_txt_lst, self.data_span_lst = [], [], []
     self.from_to_lst, self.txt_rows = [], []

     self.up = True
     self.w_liv = 100
     self.trace = False
     self.ramis = self.set_ramis_dict()
Exemple #6
0
 def __init__(self, path_text, log_err=None):
     """Store the text path and make sure an error-log function is set.

     Args:
         path_text: path of the source text.
         log_err: callable used to report errors. When omitted, a new log
             is opened on the path returned by ``set_path_teim_err``.
     """
     self.path_text = path_text
     if log_err is None:
         # The default logger must land on ``self.log_err`` (not a
         # misspelled attribute), otherwise later error reporting fails
         # with AttributeError.
         path_err = set_path_teim_err(path_text)
         log_err = Log('w').open(path_err, 1).log
     self.log_err = log_err
Exemple #7
0
def do_main(path_src, path_tags):
    """Extract the row entities of a text and log them word by word.

    Args:
        path_src: path of the source text; the log paths are obtained by
            replacing ".txt" with "_words.ERR.log" and "_words.log".
        path_tags: tag file path, forwarded to ``TextEntities``.
    """
    err_log = Log("w").open(path_src.replace(".txt", "_words.ERR.log"), 1)
    entities = TextEntities(path_src, path_tags, err_log.log)
    word_entities_log(entities.get_rows_entities(),
                      path_src.replace(".txt", "_words.log"))
Exemple #8
0
 def log_tgid_js(self):
     """Dump ``self.tgid_js`` as indented JSON next to the XML output.

     The JSON path is ``self.path_out`` with ".xml" replaced by ".json".
     Any failure is reported through ``self.log_err`` and terminates the
     process via ``sys.exit``.
     """
     try:
         json_log = Log("w").open(self.path_out.replace(".xml", ".json"), 0)
         json_log.log(json.dumps(self.tgid_js, indent=2))
     except Exception as exc:
         failure = f'ERROR log_tgid_js() \n{exc}'
         self.log_err(failure)
         sys.exit(failure)
Exemple #9
0
    def __init__(self, path_text, path_csv):
        """Store the input paths, open the logs and reset the check state.

        Args:
            path_text: path of the text to check; log paths are derived from
                it by the ``set_path_checkover_*`` helpers.
            path_csv: path of the csv file, stored as ``self.path_csv``.
        """
        self.path_text, self.path_csv = path_text, path_csv

        # Original author's mapping notes for "testo.txt":
        #   info log  => testo_checkover.log
        #   error log => testo_checkover.ERR.log
        self.log_info = Log("w").open(set_path_checkover_out(path_text), 1).log
        self.log_err = Log("w").open(set_path_checkover_err(path_text), 1).log

        # Text being scanned.
        self.text = ''
        self.spc = ''
        self.len_text = 0
        self.brk_lst = []

        # Tag currently being checked.
        self.tag_trype = ""
        self.tag_open = ""
        self.tag_close = ""

        # Error counters.
        self.err1 = self.err2 = 0
Exemple #10
0
    def __init__(self, path_text, path_csv):
        """Store the paths, open the logs and reset the id configuration.

        Args:
            path_text: path of the source text; input, output and log paths
                are derived from it by the ``set_path_id_*`` helpers.
            path_csv: path of the csv file, stored as ``self.path_csv``.
        """
        self.path_text, self.path_csv = path_text, path_csv

        # Original author's mapping notes for "testo.txt":
        #   input => testo_txt.txt, output => testo_id.xml
        self.path_in = set_path_id_in(path_text)
        self.path_out = set_path_id_out(path_text)

        # NOTE(review): the author's note says the error log should be
        # "testo_id.ERR.log", yet both logs below are opened on the path
        # returned by set_path_id_log, so they presumably share one file.
        # Confirm whether a dedicated error-path helper was intended.
        self.log_err = Log("w").open(set_path_id_log(path_text), 1).log
        self.log_info = Log("w").open(set_path_id_log(path_text), 0).log

        self.xml_root = etree
        self.tgid_js, self.id_cfg = {}, {}
        self.input_err_active = False
Exemple #11
0
    def __init__(self, path_text='', path_note=''):
        """Derive the note-processing paths and open the error log.

        Args:
            path_text: path of the source text; input, output and error-log
                paths are derived from it by the ``tpth.set_path_note_*``
                helpers.
            path_note: path of the notes file (the original note mentions
                "teimnote.csv"), stored as ``self.note_path``.
        """
        # Original author's mapping notes for "testo.txt":
        #   input     => testo_id_over.xml
        #   output    => testo_id_over_note.xml
        #   error log => testo_id_over_note.ERR.log
        self.path_in = tpth.set_path_note_in(path_text)
        self.path_out = tpth.set_path_note_out(path_text)
        self.note_path = path_note

        err_log = Log("w").open(tpth.set_path_note_err(path_text), 1)
        self.log_err = err_log.log

        self.delimiter = '|'
Exemple #12
0
import sys
import re
from teimedlib.ualog import Log
#import ulalib.pathutils as ptu

# Module metadata.
__date__ = "11-03-2022"
__version__ = "0.1.0"
__author__ = "Marta Materni"

# Text encoding constant (the ISO-8859-1 alternative is kept for reference).
# ENCODING = 'ISO-8859-1'
ENCODING = 'utf-8'
# Typographic apostrophe.
APO = "’"
# Characters treated as punctuation.
PUNTS = ',.;::?!^~()[]{}<>=+-*#@£&%/\\«»“‘’`"\''
UNDER = "_"

# Module-level Log objects; they are created here but not opened on a file.
log_info = Log("w")
log_err = Log("w")


class TextCleaner(object):
    """
    Fixes punctuation,
    collapses runs of more than one whitespace character and
    strips the spaces at the beginning and end of each line.
    The (optional) parameter l selects the line separation:
    -1) keep the original separation
    0 ) split by paragraphs
    >0) split by line length

    """
Exemple #13
0
class TxtBuilder:
    """Assemble plain-text rows from a flat list of XML node records.

    Typical use: feed records with :meth:`add`, call :meth:`elab` once, then
    read the result from :attr:`txt`.

    Every record is a dict. The methods below read the keys ``tag``, ``id``,
    ``liv`` (level, compared to detect a nested child), ``text``, ``tail``
    and ``items`` (the node attributes), plus the rendering hints
    ``t_start``, ``t_end``, ``t_sp``, ``t_up`` and ``t_ln``.
    """

    def __init__(self):
        """Open the builder logs and reset all working lists."""
        self.log = Log("w")
        self.log.open("log/txtbuilder.log", 0)
        self.logerr = Log("a")
        self.logerr.open("log/txtbuilder.ERR.log", 1)
        self.data_lst = []
        self.data_txt_lst = []
        self.data_span_lst = []
        self.from_to_lst = []
        self.txt_rows = []
        self.up = True
        self.w_liv = 100
        self.trace = False
        self.ramis = self.set_ramis_dict()

    def set_ramis_dict(self):
        """Parse the module-level ``RAMIS`` rows into a two-level dict.

        Each row has the form ``key|x:y,x:y,...``.

        Returns:
            dict mapping ``key`` to a ``{x: y}`` dict.

        Raises:
            ValueError: if a row or a pair is malformed.
        """
        table = {}
        for entry in RAMIS:
            key, pairs = entry.split('|')
            table[key] = dict(pair.split(':') for pair in pairs.split(','))
        return table

    def get_ramis(self, key, ch):
        """Return the replacement of ``ch`` in the ``key`` table.

        Unknown keys or characters are reported in-band as ``"ERR<key>"`` or
        ``"ERR<ch>"`` so the problem shows up in the generated text.
        """
        by_char = self.ramis.get(key)
        if by_char is None:
            return f"ERR{key}"
        replacement = by_char.get(ch)
        if replacement is None:
            return f"ERR{ch}"
        return replacement

    def fill_from_to_list(self):
        """Fill ``from_to_lst`` with the from/to ids of every span record.

        A span lacking ``from``, ``to`` or ``type`` is logged and the process
        is terminated with exit status 1.
        """
        for data_span in self.data_span_lst:
            x_items = data_span.get('items', {})
            x_from = x_items.get('from')
            x_to = x_items.get('to')
            x_type = x_items.get('type')
            if x_from is None or x_to is None or x_type is None:
                self.logerr.log("fill_from_to_list ERROR.").prn()
                self.logerr.log(pp(data_span)).prn()
                sys.exit(1)
            self.from_to_lst.append({
                "id0": x_from,
                "id1": x_to,
                "type": x_type,
            })

    def from_to_set_data_txt(self):
        """Mark the text records that open or close a span.

        For every from/to pair, the record whose id equals the ``from`` id
        receives an opening mark under the ``START`` key and the record whose
        id equals the ``to`` id a closing mark under ``END``: square brackets
        for ``MONOLOG`` spans, braces for ``DIRECT`` spans. When one of the
        two ids is empty the mark carries an ``ERR`` flag and the problem is
        logged.
        """
        for from_to in self.from_to_lst:
            id_from = from_to['id0'].strip()
            id_to = from_to['id1'].strip()
            span_type = from_to['type'].strip()

            # A missing "to" takes precedence when both ids are empty.
            err = 0
            if id_from == '':
                err = 1
            if id_to == '':
                err = 2
            if err == 1:
                self.logerr.log(f"ERROR from is null. to:{id_to}.")
            elif err == 2:
                self.logerr.log(f"ERROR from={id_from}  to is null.")

            marks = None
            if span_type == MONOLOG:
                marks = ('[', ']')
            elif span_type == DIRECT:
                marks = ('{', '}')
            if marks is not None and err != 0:
                marks = (f'{marks[0]}ERR ', f' ERR{marks[1]}')

            for data_txt in self.data_txt_lst:
                node_id = data_txt['id']
                if node_id == '' or marks is None:
                    continue
                if node_id == id_from:
                    data_txt[START] = marks[0]
                elif node_id == id_to:
                    data_txt[END] = marks[1]

    def set_data_txt_list(self):
        """Compute the per-record rendering hints of ``data_txt_lst``.

        Normalizes ``tag`` and sets ``t_i`` (position), ``t_sp`` (separator
        emitted before the record) and ``t_ln`` (record ends a row). It also
        sets ``t_up`` on the first non-empty text that follows a sentence
        end, a name tag or an ``lg`` tag, and blanks the text that follows a
        ``sic`` tag as well as text and tail of ``del`` records.
        """
        t_up = False
        sic = False
        for i, d in enumerate(self.data_txt_lst):
            liv = d["liv"]
            tag = d['tag'].lower().strip()
            d['tag'] = tag
            text = d['text'].strip()
            d['t_i'] = i
            sp = ''
            ln = False

            # Pending flags are consumed by the first record carrying text.
            if text != '':
                if t_up:
                    d['t_up'] = True
                    t_up = False
                if sic:
                    d['text'] = ''
                    sic = False

            if tag == 'w':
                sp = ' '
                self.w_liv = liv
            elif tag == 'pc':
                if text in ('.', '!', '?'):
                    t_up = True
            elif tag in NAMES_UP:
                t_up = True
            elif tag == 'lg':
                t_up = True
            elif tag == 'del':
                d['text'] = ''
                d['tail'] = ''
            elif tag == 'sic':
                sic = True
            elif tag == 'l':
                ln = True
            d['t_sp'] = sp
            d['t_ln'] = ln

    def is_in_xml_items(self, items, key, val):
        """Tell whether attribute ``key`` equals ``val``, ignoring '#'."""
        v = items.get(key, '')
        v = v.replace('#', '').strip()
        return v == val

    def adjust_tail_inversion(self):
        """Move a nested child's text in front of its parent's tail.

        Example::

            <w xml:id="Kch1p1w104">
              <expan corresp="#ab-tir-9">
                <ex>con</ex>
              </expan>
              <expan corresp="#ab-tild-q">q<ex>ue</ex>
              </expan>re
            </w>

            wrong:   con q re ue
            correct: con q ue re

        "re" is the tail of <expan> and "ue" the text of <ex>; because the
        <ex> record follows the <expan> record, "ue" would be emitted after
        "re". The fix prepends the text and tail of the following, deeper
        record to the tail of the current one and empties them on the
        following record. The first and last records are never the current
        record.
        """
        last = len(self.data_txt_lst) - 1
        for i in range(1, last):
            t_curr = self.data_txt_lst[i]
            t_succ = self.data_txt_lst[i + 1]
            if t_curr['tail'] == '' or t_succ['liv'] <= t_curr['liv']:
                continue
            moved = f"{t_succ['text']}{t_succ['tail']}"
            t_succ['text'] = ''
            t_succ['tail'] = ''
            t_curr['tail'] = f"{moved}{t_curr['tail']}"

    def build_txt_rows(self):
        """Build ``self.txt_rows`` from the prepared text records.

        For each record the separator, the opening mark, the text plus tail
        and the closing mark are appended in this order; a record with
        ``t_ln`` set closes the current row. The trailing words always form
        a final (possibly empty) row.
        """
        self.adjust_tail_inversion()

        self.txt_rows = []
        words = []
        count = len(self.data_txt_lst)
        for i, d in enumerate(self.data_txt_lst):
            tag = d['tag'].strip()
            text = d['text'].strip()
            tail = d['tail'].strip()
            items = d['items']

            t_start = d['t_start']
            t_sp = d['t_sp']
            t_up = d['t_up']
            t_end = d['t_end']
            t_ln = d['t_ln']

            if tag == 'c':
                if len(text) == 1:
                    k = items.get('ana', None)
                    if k is not None:
                        text = self.get_ramis(k, text)
            elif tag == 'w':
                # Elision: glue the next record to this word. The bound
                # check avoids an IndexError when the elided word is the
                # last record.
                if self.is_in_xml_items(items, 'ana', 'elis'):
                    text = f'{text}{ELIS}'
                    if i + 1 < count:
                        self.data_txt_lst[i + 1]['t_sp'] = ''
                # Enclitic: glue this word to the previous one.
                if self.is_in_xml_items(items, 'ana', 'encl'):
                    text = f'{ENCL}{text} '
                    t_sp = ''

            if t_sp != '':
                words.append(t_sp)

            if t_start != '':
                words.append(t_start)

            if t_up:
                text = text.capitalize()
            else:
                text = text.lower()
            tail = tail.lower()

            w = f"{text}{tail}"
            if w != '':
                words.append(w)

            if t_end != '':
                words.append(t_end)

            if t_ln:
                self.txt_rows.append(''.join(words))
                words = []

        self.txt_rows.append(''.join(words).strip())

    def text_adjust(self):
        """Normalize punctuation spacing and span marks in every row.

        Moves the space from before to after ',', ';' and '.', turns the
        bracket and brace span marks into double quotes, removes the space
        after the elision mark and collapses repeated whitespace.
        """
        for i, rw in enumerate(self.txt_rows):
            rw = re.sub(r" ,", ", ", rw)
            rw = re.sub(r" ;", "; ", rw)
            rw = re.sub(r" \.", ". ", rw)

            rw = re.sub(r'\[\s*', ' "', rw)
            rw = re.sub(r'\]', '" ', rw)
            rw = re.sub(r'{\s*', ' "', rw)
            rw = re.sub(r'}', '" ', rw)
            rw = rw.replace(f"{ELIS} ", ELIS)
            rw = re.sub(r"\s{2,}", " ", rw)
            self.txt_rows[i] = rw.strip()

    def elab(self):
        """Run the whole pipeline on the records collected with ``add``."""
        for data in self.data_lst:
            if data['tag'] == 'span':
                self.data_span_lst.append(data)
            else:
                self.data_txt_lst.append(data)
        # Collect the from/to ids of the spans.
        self.fill_from_to_list()
        # Complete the records of data_txt_lst.
        self.set_data_txt_list()
        # Set the start/end marks on the text records.
        self.from_to_set_data_txt()
        # Build the text rows.
        self.build_txt_rows()
        # Tidy the text rows.
        self.text_adjust()

    def add(self, data):
        """Queue one node record for processing."""
        self.data_lst.append(data)

    @property
    def txt(self):
        """The text rows joined with the platform line separator."""
        return os.linesep.join(self.txt_rows)
Exemple #14
0
elab_rows
    get_rows_entities 
    elab_row
        elab_word_entities
            set_args
            is_tag_to_add_w
            subst_add_w
            supplied_add_w
            set_word_attr
                remove_word_underscore
            check_xml
    check_xml
"""

# Module-level log function bound to the file "XXX.log".
LOG = Log("w").open("XXX.log", 1).log
"""
   CNT = "$"    # carattere temp per note

    def preserve_note(self, line):
        p0 = line.find("<note")
        p1 = line.find("</note>")
        s0 = line[0:p0]
        s1 = line[p0:p1]
        s2 = line[p1:]
        s1 = s1.replace(" ", CNT, -1)
        t = s0 + s1 + s2
        t = t.replace("<note", " <note", -1).replace("</note>", "</note> ", -1)
        return t
 
        if line.find("<note") > -1:
Exemple #15
0
 def __init__(self):
     """Create the two append-mode logs and open them on their files."""
     err_logger, run_logger = Log("a"), Log("a")
     err_logger.open("log/prjmgr.ERR.log", 1)
     run_logger.open("log/prjmgr.log", 0)
     self.logerr, self.log = err_logger, run_logger
Exemple #16
0
class PrjMgr(object):
    """
    Runs projects described in JSON.
    Invoking the script without arguments lists all the options.
    e.g.
    prjmgr.py prol_txt.json:

    prol_txt.json:
    {
    "log": "0",
    "exe": [
        [
        "teimxml.py ",
        "-i tou1/prol.txt",
        "-t teimcfg/teimtags.csv",
        "-o tou1/log/prol_teim.txt"
        ]
    ]
    }

    NOTE: the "exe" and "exe_dir" entries are executed through the shell
    with ``os.system``; project files must come from a trusted source.
    """

    def __init__(self):
        """Create the two append-mode logs and open them on their files."""
        self.logerr = Log("a")
        self.log = Log("a")
        self.logerr.open("log/prjmgr.ERR.log", 1)
        self.log.open("log/prjmgr.log", 0)

    def kv_split(self, s, sep):
        """Split ``s`` on ``sep`` and return the first two stripped fields.

        The second field is '' when ``sep`` does not occur; fields after the
        second one are ignored.
        """
        parts = s.split(sep)
        key = parts[0].strip()
        val = parts[1].strip() if len(parts) > 1 else ''
        return key, val

    def list2str(self, data):
        """Return ``data`` stripped, joining its items with a space first
        when it is not already a string."""
        if isinstance(data, str):
            return data.strip()
        return " ".join(data).strip()

    def get(self, js, k):
        """Return ``js[k]``.

        Raises:
            Exception: if the key is missing or its value is None.
        """
        s = js.get(k, None)
        if s is None:
            raise Exception(f"{k} not found.{os.linesep}")
        return s

    def files_of_dir(self, d, e):
        """Return the sorted paths in directory ``d`` matching pattern ``e``.

        Raises:
            Exception: if ``d`` does not exist.
        """
        p = pl.Path(d)
        if not p.exists():
            raise Exception(f'{d} not found.')
        return sorted(p.glob(e))

    def chmod(self, path):
        """Grant read, write and execute permission on ``path`` to everyone."""
        os.chmod(path, stat.S_IRWXG | stat.S_IRWXU | stat.S_IRWXO)

    def include_files(self, include):
        """Expand the placeholders of a host file and write the result.

        Every ``param|path`` entry of "files" replaces ``param`` with the
        content of ``path``; every ``key|val`` entry of "params" replaces
        ``key`` with ``val``. Any failure is logged and terminates the
        process with exit status 1.

        Args:
            include (dict): the "include" branch of the project, with the
                keys "host", "dest", "files" and "params".
        """
        self.log.log(os.linesep, ">> include")
        try:
            file_host = include.get("host", None)
            file_dest = include.get("dest", None)
            file_lst = include.get("files", [])
            param_lst = include.get("params", [])

            with open(file_host, "rt") as f:
                host = f.read()

            for param_path in file_lst:
                param, path = self.kv_split(param_path, '|')
                self.log.log(f"{param}: {path}")
                with open(path, "rt") as f:
                    txt = f.read()
                host = host.replace(param, txt)

            for key_val in param_lst:
                key, val = self.kv_split(key_val, '|')
                self.log.log(f"{key}: {val}")
                host = host.replace(key, val)

            with open(file_dest, "w+") as f:
                f.write(host)
            self.chmod(file_dest)
        except Exception as e:
            self.logerr.log("include")
            self.logerr.log(e)
            sys.exit(1)

    def execute_files_of_dir(self, exe_dir):
        """Run the "exe_file" commands once for every file of a directory.

        In each command ``par_name`` is replaced by the file name after the
        ``k|v`` substitution given in "par_subst". Failures are logged and
        stop the remaining commands, but do not terminate the process.
        """
        self.log.log(">> exe_dir").prn()
        try:
            dr = self.get(exe_dir, 'dir')
            ptrn = self.get(exe_dir, 'pattern')
            exe_lst = self.get(exe_dir, 'exe_file')
            par_name = self.get(exe_dir, 'par_name')
            par_subst = self.get(exe_dir, 'par_subst')
            k, v = self.kv_split(par_subst, '|')
            for f in self.files_of_dir(dr, ptrn):
                file_par = os.path.basename(f).replace(k, v)
                for exe in exe_lst:
                    cmd = self.list2str(exe).replace(par_name, file_par)
                    self.log.log(cmd)
                    if os.system(cmd) != 0:
                        raise Exception(f"execute:{cmd}")
        except Exception as e:
            self.logerr.log("ERROR", "exe_dir")
            self.logerr.log(e)

    def remove_files_of_dir(self, remove_dir):
        """Delete the files matching each ``{"dir", "pattern"}`` entry.

        Failures are logged; the process is not terminated.
        """
        self.log.log(">> remove_dir").prn()
        try:
            for de in remove_dir:
                self.log.log(de)
                dr = de.get('dir')
                ptrn = de.get('pattern')
                for f in self.files_of_dir(dr, ptrn):
                    self.log.log(f)
                    os.remove(f)
        except Exception as e:
            self.logerr.log("remove_dir")
            self.logerr.log(e)
            self.logerr.log(pp(remove_dir))

    def merge_files_of_list(self, merge_files):
        """Concatenate the listed "files" into "out_path".

        A line separator is written after each file. Errors propagate to the
        caller; the output file is closed in every case.
        """
        self.log.log(">> merge_files").prn()
        out = self.get(merge_files, "out_path")
        files = self.get(merge_files, "files")
        with open(out, "w+") as fout:
            for path in files:
                self.log.log(path)
                with open(path, "rt") as fin:
                    fout.write(fin.read())
                fout.write(os.linesep)
        self.log.log(out)
        self.chmod(out)

    def merge_files_of_dir(self, merge_dir):
        """Concatenate the files of "dir" matching "pattern" into "out_path".

        A line separator is written after each file. Failures are logged;
        the process is not terminated and the output file is closed in every
        case.
        """
        self.log.log(">> merge_dir").prn()
        try:
            dr = self.get(merge_dir, 'dir')
            ptrn = self.get(merge_dir, 'pattern')
            out_path = self.get(merge_dir, 'out_path')
            files = self.files_of_dir(dr, ptrn)
            with open(out_path, "w") as file_out:
                for fpath in files:
                    self.log.log(fpath)
                    with open(fpath, "rt") as f:
                        file_out.write(f.read())
                    file_out.write(os.linesep)
            self.chmod(out_path)
            self.log.log(out_path)
        except Exception as e:
            self.logerr.log("merge_dir")
            self.logerr.log(e)
            self.logerr.log(pp(merge_dir))

    def execute_list_progs(self, exe):
        """Run each command of ``exe`` in order, stopping at the first one
        that returns a non-zero status. Failures are logged only."""
        self.log.log(">> exe").prn()
        try:
            for x in exe:
                cmd = self.list2str(x)
                self.log.log(cmd)
                r = os.system(cmd)
                if r != 0:
                    raise Exception(str(r))
        except Exception as e:
            self.logerr.log("exe")
            self.logerr.log(e)
            self.logerr.log(pp(exe))

    def copy_file(self, copy_file):
        """Copy text files as described by ``{"in_path", "out_path", "aw"}``
        entries.

        "aw" is the open mode of the destination; in append mode ('a') a
        line separator is added after the copied text. Any failure is logged
        and terminates the process with exit status 1.
        """
        self.log.log(">> copy_file").prn()
        try:
            for x in copy_file:
                in_path = self.get(x, 'in_path')
                out_path = self.get(x, 'out_path')
                aw = self.get(x, "aw")
                self.log.log(in_path)
                with open(in_path, "rt") as f:
                    text = f.read()
                with open(out_path, aw) as f:
                    f.write(text)
                    if aw == 'a':
                        f.write(os.linesep)
                self.chmod(out_path)
                self.log.log(out_path)
        except Exception as e:
            self.logerr.log("copy_file")
            self.logerr.log(e)
            self.logerr.log(pp(copy_file))
            sys.exit(1)

    def write_text(self, write_text):
        """Write "text" to "out_path" using the open mode "aw".

        In append mode ('a') a line separator is added after the text. Any
        failure is logged and terminates the process with exit status 1.
        """
        self.log.log(">> write_text").prn()
        try:
            text = self.get(write_text, 'text')
            out_path = self.get(write_text, 'out_path')
            aw = self.get(write_text, "aw")
            with open(out_path, aw) as f:
                f.write(text)
                if aw == 'a':
                    f.write(os.linesep)
            self.chmod(out_path)
            self.log.log(out_path)
        except Exception as e:
            self.logerr.log("write_text")
            self.logerr.log(e)
            self.logerr.log(pp(write_text))
            sys.exit(1)

    def parse_json(self, js):
        """Dispatch every entry of a project dict to its handler.

        Keys may carry a numeric suffix (``exe.1``, ``exe.2``, ...); only the
        part before the first dot selects the handler. Unknown keys are
        logged as errors.
        """
        for k, v in js.items():
            k = k.split('.')[0]
            if k == "exe":
                self.execute_list_progs(v)
            elif k == "merge_files":
                self.merge_files_of_list(v)
            elif k == "merge_dir":
                self.merge_files_of_dir(v)
            elif k == "include":
                self.include_files(v)
            elif k == "exe_dir":
                self.execute_files_of_dir(v)
            elif k == "remove_dir":
                self.remove_files_of_dir(v)
            elif k == "write_text":
                self.write_text(v)
            elif k == "copy_file":
                self.copy_file(v)
            elif k == "log":
                level = int(v)
                self.log.set_liv(level)
            else:
                self.logerr.log(f"ERROR option:{k} not implemented")

    def parse_file(self, in_path):
        """Load the JSON project at ``in_path`` and run it.

        A read or JSON error is logged and terminates the process with exit
        status 1.
        """
        try:
            with open(in_path, "r") as f:
                txt = f.read()
            js = json.loads(txt)
        except Exception as e:
            self.logerr.log("prjmgr.py json ERROR")
            self.logerr.log(e)
            sys.exit(1)
        self.parse_json(js)

    def parse_jsons(self, *js):
        """Run several project dicts one after the other."""
        for j in js:
            self.parse_json(j)
Exemple #17
0
def do_main(path_in, path_out, add_div):
    """Pretty-print an XML file.

    Args:
        path_in: path of the XML source.
        path_out: path of the formatted output; the error log is written to
            this path with '.xml' replaced by '.xml.ERR.log'.
        add_div: when true the source is wrapped in ``<div> </div>`` before
            parsing.

    A read error is logged and the function returns without writing. On a
    parse error the numbered source with the failing line marked is logged
    and a stale output file, if any, is removed.
    """
    path_err = path_out.replace('.xml', ".xml.ERR.log")
    log_err = Log("w").open(path_err, 1).log

    def make_xml_err(xml, err):
        """Return ``xml`` with numbered rows and the error row highlighted.

        The row number is taken from the "line N," part of the parser
        message; when it is absent no row is highlighted.
        """
        m = re.search(r'(line )([0-9]+)(,)', err)
        n = int(m.group(2)) if m is not None else -1
        rows = xml.split(os.linesep)
        for i, row in enumerate(rows):
            rows[i] = f'{i+1}){row}'
            if i + 1 == n:
                rows[i] = f'\nERRROR\n{rows[i]}\n{err}\n'
        return os.linesep.join(rows)

    try:
        with open(path_in, "r") as f:
            src = f.read()
    except Exception as e:
        msg_err = f"ERROR format_xml()\n read xml\n{e}"
        log_err(msg_err)
        return
    try:
        xml_src = f'<div>{src}</div>' if add_div else src
        parser = etree.XMLParser(remove_blank_text=True)
        root = etree.XML(xml_src, parser)
        xml = etree.tostring(root,
                             method='xml',
                             xml_declaration=None,
                             encoding='unicode',
                             with_tail=True,
                             pretty_print=True,
                             standalone=None,
                             doctype=None,
                             exclusive=False,
                             inclusive_ns_prefixes=None,
                             strip_text=False)
    except etree.ParseError as e:
        msg_err = f"ERROR format_xml()\n{e}"
        xml_err = make_xml_err(src, str(e))
        log_err(xml_err, os.linesep)
        log_err(msg_err)
        # Best-effort removal of a stale output: only filesystem errors
        # (typically a missing file) are ignored, so interrupts and
        # programming errors are no longer swallowed.
        try:
            os.remove(path_out)
        except OSError:
            pass
    else:
        with open(path_out, "w") as f:
            f.write(xml)
Exemple #18
0
def do_main(path_xml):
    """Check a TEI XML file, reporting to a log next to it.

    The error log path is ``path_xml`` with ".xml" replaced by
    ".tei_xml_ERR.log".
    """
    err_log = Log("w").open(path_xml.replace(".xml", ".tei_xml_ERR.log"), 1)
    CheckTeimXml().check_tei_xml(path_xml, err_log.log)
Exemple #19
0
import stat
from teimedlib.ualog import Log
from teimedlib.readovertags import read_tags_over_sorted
from teimedlib.teim_paths import *
import teimxmlformat as xmf

# Module metadata.
__date__ = "30-05-2022"
__version__ = "0.2.4"
__author__ = "Marta Materni"


def pp(data, w=40):
    """Return ``data`` pretty-printed with indent 2 and line width ``w``."""
    printer = pprint.PrettyPrinter(indent=2, width=w)
    return printer.pformat(data)


# Module-level debug log function bound to the file "debug.log".
LGDB = Log("w").open("debug.log", 1).log


class TeimOverFlow(object):
    DATA_TYPE = "tp"
    DATA_FROM = "from"
    DATA_TO = "to"

    OP = 'op'
    CL = 'cl'
    LOP = 'lop'
    LCL = 'lcl'
    TP = 'tp'
    OPEN_CLOSE = 'ctrl'
    """
    teimover.py -i text.txt -c teimcfg/teimoverflow.csv
Exemple #20
0
class Xml2Txt:
    """Extract a plain-text file from a TEI XML file.

    Every node of the parsed tree is flattened into a dict (see
    build_txt_data) and handed to a TxtBuilder; write_txt() saves the text
    the builder produces to path_txt.
    """

    def __init__(self, path_xml='', path_txt='', write_append='w'):
        """Store the paths and open the error log.

        Args:
            path_xml: path of the XML file to read.
            path_txt: path of the text file to write.
            write_append: mode passed to open() when writing path_txt.
        """
        self.path_xml = path_xml
        self.path_txt = path_txt
        self.write_append = write_append

        # text.txt => text.ERR.log
        path_err = path_txt.replace(".txt", ".ERR.log")
        self.logerr = Log("w").open(path_err, 1).log

        self.txt_builder = None
        self.trace = False

    def _log_err(self, msg):
        """Write msg to the error log.

        self.logerr is stored as the ``log`` attribute of the opened Log, so
        it is called directly; an object that exposes ``log`` is accepted
        too. Calling ``self.logerr.log(...)`` unconditionally would fail
        when self.logerr is itself the logging callable.
        """
        log = getattr(self.logerr, 'log', self.logerr)
        log(msg)

    def node_liv(self, node):
        """Return the depth of node in the tree (0 for the root).

        The depth is counted by walking getparent() up to None; a None node
        yields -1.
        """
        depth = 0
        while node is not None:
            depth += 1
            node = node.getparent()
        return depth - 1

    def clean_key(self, k):
        """Strip a leading "{http...}" namespace from an attribute name."""
        if k.find("{http") > -1:
            end = k.rfind('}')
            if end > -1:
                return k[end + 1:]
        return k

    def node_items(self, nd):
        """Return the attributes of nd as a dict with namespace-free keys."""
        return {self.clean_key(name): value for name, value in nd.items()}

    def node_tag(self, nd):
        """Return the tag of nd without namespace, or "XXX" if unusable.

        "XXX" is returned when the tag is not a string and when reading it
        raises; in the latter case the error is also logged.
        """
        try:
            tag = nd.tag
            if not isinstance(tag, str):
                # NOTE(review): presumably comment / processing-instruction
                # nodes, whose tag is not a string in lxml -- confirm.
                return "XXX"
            pid = tag.find('}')
            if pid > 0:
                tag = tag[pid + 1:]
            return tag.strip()
        except Exception as e:
            self._log_err("ERROR in xml")
            self._log_err(str(e))
            return "XXX"

    def node_id(self, nd):
        """Return the value of the first attribute whose name contains "id".

        Returns '' when no such attribute exists.
        """
        # NOTE(review): the test is a substring match, so any attribute
        # name containing "id" matches, not only (xml:)id.
        for name, value in nd.items():
            if name.rfind('id') > -1:
                return value
        return ''

    def node_id_num(self, id):
        """Return the part of id starting at its first digit.

        Returns '' for an empty id and -1 when id contains no digit.
        """
        if id == '':
            return ''
        m = re.search(r'\d', id)
        if m is None:
            return -1
        return id[m.start():]

    def node_text(self, nd):
        """Return nd.text stripped, with line separators turned into ',,'."""
        text = '' if nd.text is None else nd.text.strip()
        return text.replace(os.linesep, ',,')

    def node_tail(self, nd):
        """Return nd.tail stripped, with line separators removed."""
        tail = '' if nd.tail is None else nd.tail
        return tail.strip().replace(os.linesep, '')

    def node_val(self, nd):
        """Return all text below nd joined by single spaces."""
        parts = [x.strip().replace(os.linesep, '') for x in nd.itertext()]
        return re.sub(r"\s{2,}", ' ', ' '.join(parts))

    def node_is_parent(self, nd):
        """Return True when nd has at least one child node."""
        return len(nd) > 0

    def get_node_data(self, nd):
        """Collect id, depth, tag, text, tail and attributes of nd in a dict.

        When the node has an id, its numeric suffix is added to the
        attribute dict under 'id_num'. 'val' is always the empty string.
        """
        items = self.node_items(nd)
        node_id = self.node_id(nd)
        if node_id != '':
            items['id_num'] = self.node_id_num(node_id)
        return {
            'id': node_id,
            'liv': self.node_liv(nd),
            'tag': self.node_tag(nd),
            'text': self.node_text(nd),
            'tail': self.node_tail(nd),
            'items': items,
            'val': "",
            'is_parent': self.node_is_parent(nd)
        }

    def build_txt_data(self, nd):
        """Build the dict handed to TxtBuilder for one XML node.

        The dict holds the data extracted from the node (x_data) plus the
        t_* fields, initialised empty for later processing.

        Args:
            nd: XML node.
        Returns:
            dict: node data plus empty t_* fields.
        """
        x_data = self.get_node_data(nd)
        txt_data = {
            'id': x_data.get('id', 0),
            'is_parent': x_data.get('is_parent', False),
            'items': x_data.get('items', {}),
            'liv': x_data.get('liv', 0),
            'tag': x_data.get('tag', ''),
            'text': x_data.get('text', ''),
            'tail': x_data.get('tail', ''),
            'val': x_data.get('val', ''),
            't_i': 0,
            't_type': '',
            't_up': False,
            't_start': '',
            't_end': '',
            't_sp': '',
            't_ln': False,
            't_flag': False
        }
        return txt_data

    def write_txt(self):
        """Parse path_xml, build the text and write it to path_txt.

        Returns:
            The path of the written text file.
        Raises:
            SystemExit: when the XML cannot be parsed or the text cannot be
                built or written; the error is logged first.
        """
        try:
            parser = etree.XMLParser(ns_clean=True)
            xml_root = etree.parse(self.path_xml, parser)
        except Exception as e:
            self._log_err("ERROR teixml2txt.py write_txt() parse_xml")
            self._log_err(str(e))
            sys.exit(str(e))
        try:
            self.txt_builder = TxtBuilder()
            for nd in xml_root.iter():
                self.txt_builder.add(self.build_txt_data(nd))
            self.txt_builder.elab()
            txt = self.txt_builder.txt
            make_dir_of_file(self.path_txt)
            with open(self.path_txt, self.write_append) as f:
                f.write(txt)
            chmod(self.path_txt)
        except Exception as e:
            self._log_err("ERROR teixml2txt.py write_txt()")
            self._log_err(str(e))
            self._log_err(traceback.format_exc())
            sys.exit(1)
        return self.path_txt
Exemple #21
0
# Geometry constants (width, height, x, y).
# NOTE(review): presumably size and position of the main ("root") window
# -- confirm against the GUI code that uses them.
root_w = 1000
root_h = 600
root_x = 100
root_y = 100

# NOTE(review): presumably size and position of the log window -- confirm.
log_w = 1000
log_h = 600
log_x = 200
log_y = 200


def pp(data, w=60):
    """Return data pretty-printed as a string.

    Args:
        data: any object pprint can format.
        w: maximum line width (default 60).
    """
    printer = pprint.PrettyPrinter(width=w)
    return printer.pformat(data)


# Module-level Log instance created with mode "w"; it is not opened here.
tfhlogerr = Log("w")

# Identifier lists with a "w" prefix.
# NOTE(review): presumably GUI widget ids, one per action -- confirm.
W_IDS = [
    'wmerge', 'wcheckover', 'wchecktxt', 'wtxt', 'wsetid', 'wover', 'wnote',
    'wformat', 'wall', 'wremovelog'
]

# Same action names with a "t" prefix; there is no 'tmerge' counterpart of
# 'wmerge'.
# NOTE(review): meaning of the "t" prefix is not visible here -- confirm.
T_IDS = [
    'tcheckover', 'tchecktxt', 'ttxt', 'tsetid', 'tover', 'tnote', 'tformat',
    'tall', 'tremovelog'
]

# Relative path of the error log file.
TEIMFH_LOG_ERR = "log/teimxmlh.ERR.log"


class TeimXmlFh(object):
Exemple #22
0
class TxtBuilder:

    def __init__(self):
        """Open the two builder logs and reset the working state."""
        # Run log, created with mode "w".
        self.log = Log("w")
        self.log.open("log/txtbuilder.log", 0)
        # Error log, created with mode "a".
        self.logerr = Log("a")
        self.logerr.open("log/txtbuilder.ERR.log", 1)

        # Scalar state.
        self.trace = False
        self.up = True
        self.w_liv = 100

        # Node dicts as received by add(), then split by elab().
        self.data_lst = []
        self.data_span_lst = []
        self.data_txt_lst = []

        # Span boundaries and the resulting text rows.
        self.from_to_lst = []
        self.txt_rows = []

        # Lookup table built from RAMIS.
        self.ramis = self.set_ramis_dict()

    def set_ramis_dict(self):
        """Build the two-level lookup table described by RAMIS.

        Each RAMIS entry is split as "key|x:y,x:y,..." and becomes
        table[key][x] = y.

        Returns:
            dict: mapping key -> {x: y}.
        """
        table = {}
        for entry in RAMIS:
            key, pairs = entry.split('|')
            table[key] = {
                src: dst
                for src, dst in (pair.split(':') for pair in pairs.split(','))
            }
        return table

    def get_ramis(self,key,ch):
        js=self.ramis.get(key,None)
        if js is None:
            return f"ERR{key}"
        r=js.get(ch,None)
        if r is None:
            return f"ERR{ch}"
        return r

    def fill_from_to_list(self):
        for data_span in self.data_span_lst:
            x_items = data_span.get('items', {})
            x_from = x_items.get('from', None)
            x_to = x_items.get('to', None)
            x_type = x_items.get('type', None)
            if x_from is None or x_to is None or x_type is None:
                self.logerr.log("fill_from_to_list ERROR.").prn()
                self.logerr.log(pp(data_span)).prn()
                sys.exit(1)
            item = {
                "id0": x_from,
                "id1": x_to,
                "type": x_type
            }
            self.from_to_lst.append(item)

    def from_to_set_data_txt(self):
        """Mark the first and last text row of every span.

        For each entry of self.from_to_lst, the row of self.data_txt_lst
        whose id equals the span start gets the opening delimiter under
        START and the row whose id equals the span end gets the closing
        delimiter under END: '[' / ']' for MONOLOG spans, '{' / '}' for
        DIRECT spans. Spans of any other type are ignored.

        When the start or end id is empty the problem is logged and the
        delimiter written on the remaining boundary carries an "ERR" marker.
        A span that starts and ends on the same row receives both
        delimiters.
        """
        for from_to in self.from_to_lst:
            id_from = from_to['id0'].strip()
            id_to = from_to['id1'].strip()
            span_type = from_to['type'].strip()

            # err: 1 = empty start id, 2 = empty end id (2 wins if both).
            err = 0
            if id_from == '':
                err = 1
            if id_to == '':
                err = 2
            if err == 1:
                self.logerr.log(f"ERROR from is null. to:{id_to}.")
            elif err == 2:
                self.logerr.log(f"ERROR from={id_from}  to is null.")

            if span_type == MONOLOG:
                open_mark, close_mark = '[', ']'
            elif span_type == DIRECT:
                open_mark, close_mark = '{', '}'
            else:
                continue
            if err != 0:
                open_mark = f"{open_mark}ERR "
                close_mark = f" ERR{close_mark}"

            for data_txt in self.data_txt_lst:
                row_id = data_txt['id']
                if row_id == '':
                    continue
                if row_id == id_from:
                    data_txt[START] = open_mark
                # Independent test (not elif): a one-row span must be
                # closed on the same row it is opened on.
                if row_id == id_to:
                    data_txt[END] = close_mark

    # def w_num(self, id):
    #     p = id.find('w')
    #     if p < 0:
    #         return -1
    #     return int(id[p+1:])

    def set_data_txt_list(self):
        """setta t_data utilizzano xml_data e csv_data
        """
        t_up = False
        sic = False
        w_num = 0
        for i, d in enumerate(self.data_txt_lst):
            #id = d["id"]
            liv = d["liv"]
            tag = d['tag'].lower().strip()
            d['tag'] = tag
            text = d['text'].strip()
            d['t_i'] = i
            sp = ''
            ln = False

            if text != '':
                if t_up:
                    self.data_txt_lst[i]['t_up'] = True
                    t_up = False
                if sic:
                    self.data_txt_lst[i]['text'] = ''
                    sic = False

            if tag == 'w':
                sp = ' '
                self.w_liv = liv
            elif tag == 'pc':
                if text in ['.', '!', '?']:
                    t_up = True
            elif tag in NAMES_UP:
                t_up = True
            elif tag in ['lg']:
                t_up = True
            elif tag == 'del':
                self.data_txt_lst[i]['text'] = ''
                self.data_txt_lst[i]['tail'] = ''
            elif tag == 'sic':
                sic = True
            elif tag == 'l':
                ln = True
            d['t_sp'] = sp
            d['t_ln'] = ln

    def is_in_xml_items(self, items, key, val):
        """Tell whether attribute key of items equals val.

        The attribute value is compared after removing every '#' and
        stripping surrounding whitespace; a missing key counts as ''.
        """
        return items.get(key, '').replace('#', '').strip() == val

    def build_txt_rows(self):
        """crea le righe di testo self._txt_rows
        utilizzando data_text=xml_data + csv_data + t_data
        """
        self.txt_rows = []
        words = []
        for i, d in enumerate(self.data_txt_lst):
            id = d['id']
            tag = d['tag'].strip()
            text = d['text'].strip()
            tail = d['tail'].strip()
            items = d['items']
            t_start = d['t_start']
            t_sp = d['t_sp']
            t_up = d['t_up']
            t_end = d['t_end']
            t_ln = d['t_ln']

            if tag == 'c':
                if len(text)==1:
                    k=items.get('ana',None)
                    if k is not None:
                        r=self.get_ramis(k,text)
                        text=r

            elif tag == 'w':
                # els
                if self.is_in_xml_items(items, 'ana', 'elis'):
                    text = f'{text}{ELIS}'
                    self.data_txt_lst[i+1]['t_sp'] = ''
                # encl
                if self.is_in_xml_items(items, 'ana', 'encl'):
                    text = f'{ENCL}{text} '
                    t_sp = ''

            if t_sp != '':
                words.append(t_sp)

            if t_start != '':
                words.append(t_start)

            if t_up:
                text = text.capitalize()
            else:
                text = text.lower()
            tail = tail.lower()

            w = f"{text}{tail}"
            if w != '':
                words.append(w)

            if t_end != '':
                words.append(t_end)

            if t_ln:
                row = ''.join(words)
                self.txt_rows.append(row)
                words = []
        row = ''.join(words).strip()
        self.txt_rows.append(row)

    def text_adjust(self):
        VIRG = '"'
        for i, rw in enumerate(self.txt_rows):
            rw = re.sub(r" ,", ", ", rw)
            rw = re.sub(r" ;", "; ", rw)
            rw = re.sub(r" \.", ". ", rw)

            rw = re.sub(r'\[\s*', ' "', rw)
            rw = re.sub(r'\]', '" ', rw)
            rw = re.sub(r'{\s*', ' "', rw)
            rw = re.sub(r'}', '" ', rw)
            rw = rw.replace(f"{ELIS} ", ELIS)
            rw = re.sub(r"\s{2,}", " ", rw)
            self.txt_rows[i] = rw.strip()

    def elab(self):
        """Turn the node dicts collected by add() into text rows.

        The nodes are first split into span rows (tag == 'span') and text
        rows, then the processing steps run in a fixed order.
        """
        for node in self.data_lst:
            is_span = node['tag'] == 'span'
            bucket = self.data_span_lst if is_span else self.data_txt_lst
            bucket.append(node)

        pipeline = (
            self.fill_from_to_list,     # collect the from/to ids of spans
            self.set_data_txt_list,     # complete the text rows
            self.from_to_set_data_txt,  # set start/end marks on text rows
            self.build_txt_rows,        # build the text lines
            self.text_adjust,           # tidy the text lines
        )
        for step in pipeline:
            step()

    def add(self, data):
        """Append one node dict to self.data_lst.

        Args:
            data: node dict; elab() reads its 'tag' key to classify it.
        """
        self.data_lst.append(data)

    @property
    def txt(self):
        s = os.linesep.join(self.txt_rows)
        return s