def notify_node(warning_txt, node):
    print(warning_txt, src_fname, parse_vip.gen_html_text(node))
def main():
    dct_prefix = "/home/ilya/.stardict/dic/esperanto"
    dst_fname = o_p.join(dct_prefix, "REVO_Eksplika/Eksplika-REVO.txt")

    # :REFACTOR:
    dirname = os.path.dirname
    # :REFACTOR: realpath() - apply this to all uses
    prj_fdir = dirname(os.path.realpath(__file__))

    import shutil

    def copy_prj_fname(prj_fname, dst_fpath):
        o_p.force_makedirs(os.path.dirname(dst_fpath))
        shutil.copy(o_p.join(prj_fdir, prj_fname), dst_fpath)

    prefix_eoru = dirname(dirname(prj_fdir))
    unpacked_revo = o_p.join(prefix_eoru, "stuff/revo/revo")

    dictionaries = {}
    with make_gen_accumulator() as add_gen:
        def create_dictionary(dst_fname, css_link=None):
            remove_srcfile = True  # False #
            on_article = add_gen(make_kondratjev.dictionary_generator(
                dst_fname, css_text=None, is_html=True,
                remove_srcfile=remove_srcfile))

            if css_link:
                orig_on_article = on_article

                def on_article(key_names, txt):
                    css_link
                    txt = """<link href="%(css_link)s" rel="stylesheet" type="text/css" />%(txt)s""" % locals()
                    return orig_on_article(key_names, txt)

            return on_article

        on_explika_article = create_dictionary(dst_fname, "revo.css")
        res_fdir = o_p.join(dirname(dst_fname), "res")
        copy_prj_fname("sample/revo/revo.css", o_p.join(res_fdir, "revo.css"))

        # copy the images so the articles look nicer
        dst_smb = o_p.join(res_fdir, "smb")
        if not o_p.exists(dst_smb):
            shutil.copytree(o_p.join(unpacked_revo, "smb"), dst_smb)

        xml_fpath = o_p.join(unpacked_revo, "xml")

        def open_xml_article(xml_fname):
            xml_fname = o_p.join(xml_fpath, xml_fname)
            tree = open_xml_tree(xml_fname)
            return tree

        def fname2prefix(src_fname):
            return o_p.without_ext(src_fname)

        prefix_dct = {}

        def get_words(prefix):
            words = prefix_dct.get(prefix)
            if words is None:
                words = prefix_dct[prefix] = []

                tree = open_xml_article(prefix + ".xml")
                for drv, headwords in for_drv_words_headwords(tree):
                    words.extend(calc_words(headwords))
                    #print(words)
                    #print(rvut_definitions.get_translations(drv).get("de"))
                    #print()
            return words

        fname_lst = os.listdir(xml_fpath)
        if False:  # True: #
            fname_lst = [
                "ten.xml",
                "distin.xml",
                "apenau.xml",   # <trd> in <subdrv>
                "pri.xml",      # article without <drv>
                "sur.xml",      # <ekz> not in <(sub)snc> but in <subart>
                "al.xml",       # <trdgrp> directly in <art>
                "stift.xml",    # and <ekz> also in <art>
                "lima.xml",     # the translation got attached to <kap> although it sat inside the text (the 'la' nuisance) - there are many such articles
                "kverk.xml",    # assorted human errors
                "jxak1.xml",
                "anim.xml",     # <ekz> without <ind>
                "blank.xml",    #
                "milv.xml",     # <bld> instead of <ekz>
                "hel.xml",      # trdgrp in <dif>
                "hazard.xml",   # empty trd tag
                "iks.xml",      # words containing the '|' character
            ]

        for src_fname in fname_lst:
            prefix = fname2prefix(src_fname)
            all_names = get_words(prefix)

            html_fname = o_p.join(unpacked_revo, "art", prefix + ".html")
            body = make_wells.get_html_body(html_fname, False)

            h1 = body.find("h1")
            hr = body.find("hr")

            div = etree.Element("div")
            el = h1.getnext()
            while el != hr:
                div.append(el)
                el = h1.getnext()

            def append_sub(name):
                sub_el = body.find("div[@class='%(name)s']" % locals())
                if not (sub_el is None):
                    div.append(etree.Element("hr"))
                    div.append(sub_el)
            append_sub("fontoj")
            append_sub("notoj")

            # rewrite references of the form
            # kapt.html#kapt.0i => bword://kapti#kapt.0i
            for lnk in parse_vip.iter_tags(div, "a"):
                href = lnk.get("href")
                if href:
                    m = re.match(r"(?P<lnk_fname>[^/]+\.html)#(?P<anchor>.+)$", href)
                    if m:
                        lnk_fname, anchor = m.group("lnk_fname"), m.group("anchor")
                        lnk_word = get_words(fname2prefix(lnk_fname))[0]
                        # GD ignores #anchor, and ColorDict even refuses to follow the link
                        #lnk.set("href", "bword://%(lnk_word)s#%(anchor)s" % locals())
lnk.set("href", "bword://%(lnk_word)s#%(anchor)s" % locals()) # :REFACTOR: for img in parse_vip.iter_tags(div, "img"): src = img.get("src") if src: # egala funkciado por Goldendict (GD) k ColorDict (CD) m = re.match(r"^\.\./", src) if m: img.set("src", src[3:]) txt = parse_vip.gen_html_text(div) #print(txt) on_explika_article(all_names, txt) # eo-nacia vortaro national_dct = {} tree = open_xml_article(src_fname) def append_translations(translations, src_trs): for lang, lst in src_trs.items(): translations[lang] = lst + translations.setdefault( lang, []) used_tr_nodes = {} national_headwords = {} def get_count_translations(node): res = rvut_definitions.get_translations(node) # hazard.xml havas malplena tradukojn clean_res = {} for lang, lst in res.items(): lst = list(filter(bool, lst)) if lst: clean_res[lang] = lst res = clean_res append_translations(national_headwords, res) # :REFACTOR: for trd in node.findall('trd'): used_tr_nodes[trd] = True for trdp in node.findall('trdgrp'): used_tr_nodes[trdp] = True for trd in trdp.findall('trd'): used_tr_nodes[trd] = True return res def iterate_translations(translations, sub_node_dct, numerator_func, final_sep): for lang in sub_node_dct.keys() | translations.keys(): yield lang, gen_trans_text(lang, sub_node_dct, numerator_func, translations, final_sep) def notify_node(warning_txt, node): print(warning_txt, src_fname, parse_vip.gen_html_text(node)) # :TRICKY: plej simpla maniero por kalkuki jam traktitajn nodojn ekz_node_set = set() def find_ekz_translations(ekz_dct, node, flat_translations): #for trd in parse_vip.iter_tags(node, "ekz/trd|trdgrp"): def trd_iter(ekz_name, name): return parse_vip.iter_tags( node, "%(ekz_name)s/%(name)s" % locals()) def trd_iters(ekz_name): return trd_iter(ekz_name, "trd"), trd_iter(ekz_name, "trdgrp") for trd in itertools.chain(*(trd_iters("ekz") + trd_iters("bld"))): ekz = trd.getparent() if ekz in ekz_node_set: continue else: ekz_node_set.add(ekz) def make_orig_txt(ind_node): return ', '.join( rvut_words.get_words_from_kap(ind_node)) ind_node = ekz.find('ind') if ind_node is None: # kalkulas orig_txt mem, kolektante ĉiujn etikedojn ĝis apero de trd aŭ trdgrp # anim.xml: # <ekz> # <tld/>ita parolado<fnt>K</fnt>, # <trd lng="hu">lelkes beszéd</trd> # </ekz> ind_node = etree.Element("ind") ind_node.text = ekz.text for child in ekz.getchildren(): if child.tag in ["trd", "trdgrp"]: break else: child = copy.deepcopy(child) ind_node.append(child) tree.append(ind_node) orig_txt = make_orig_txt(ind_node) ind_node.getparent().remove(ind_node) else: orig_txt = make_orig_txt(ind_node) for lang, tr_lst in get_count_translations(ekz).items(): # :REFACTOR: lst = ekz_dct.setdefault(lang, []) tr_lst = ", ".join(tr_lst) ekz_txt = "<i><b>%(orig_txt)s</b>: %(tr_lst)s</i>" % locals( ) lst.append(ekz_txt) #return # :TRICKY: iuj <trd> kumulas tradukon mem k indikon de originala nomo (Latina prezipe) => # nur <trd> povas esti tia, ne <trdgrp>, ĉar tio estas perokula etikedo (angla - tag) # malĝuste - hel.xml! 
                rest_translations = {}
                for trd in parse_vip.iter_tags(node, "trd"):
                    if trd not in used_tr_nodes:
                        par_node = trd.getparent()
                        if par_node.tag == "trdgrp":
                            lang = par_node.get("lng")
                            used_tr_nodes[par_node] = True
                        else:
                            lang = trd.get("lng")

                        foreign_word = rvut_flatten.flatten_node(trd)
                        if foreign_word:
                            # :REFACTOR:
                            rest_translations.setdefault(lang, []).append(foreign_word)

                        # :REFACTOR:
                        used_tr_nodes[trd] = True

                append_translations(flat_translations, rest_translations)
                append_translations(national_headwords, rest_translations)

            def append_ekz_translations(dct, ekz_dct):
                # :TRICKY: to keep subsnc_dct simple and avoid changing iterate_translations,
                # we simply append the examples to the last item of subsnc_dct
                for lang, ekz_lst in ekz_dct.items():
                    ekz_txt = "; ".join(ekz_lst)

                    lst = dct.setdefault(lang, [])
                    if lst:
                        lst[-1] += "; " + ekz_txt
                    else:
                        lst.append(ekz_txt)

            def append_national_article(lang, names, txt):
                o_p_article, dst_fname = dictionaries.get(lang, (None, None))
                if o_p_article is None:
                    dict_fpath = o_p.join(dct_prefix, "REVO_%(lang)s" % locals())
                    # :REFACTOR:
                    dst_fname = o_p.join(dict_fpath, "REVO-%(lang)s.txt" % locals())
                    o_p_article = create_dictionary(dst_fname, "revo-traduko.css")
                    dictionaries[lang] = o_p_article, dst_fname

                    copy_prj_fname("sample/revo/eo-nacia/revo-traduko.css",
                                   o_p.join(dict_fpath, "res/revo-traduko.css"))

                o_p_article(names, txt)

            def append_row(translations, snc_dct, headwords, drv):
                # sur.xml: an <ekz> can appear outside of any <snc>
                ekz_dct = {}
                find_ekz_translations(ekz_dct, drv, translations)
                append_ekz_translations(translations, ekz_dct)

                assert headwords
                hw_txt = "<b>%s</b>" % "</b>, <b>".join(headwords)

                typ = None
                vspec = drv.find("gra/vspec")
                if vspec is not None:
                    typ = vspec.text
                if typ:
                    hw_txt = "%(hw_txt)s <i>%(typ)s</i>" % locals()

                for lang, tr_txt in iterate_translations(translations, snc_dct, arab_num,
                                                         " <b>|</b> "):
                    opa_args = national_dct.setdefault(lang, ([], []))
                    names, txt = opa_args

                    names.extend(calc_words(headwords))
                    row_txt = """<div class="paragrafo">%(hw_txt)s %(tr_txt)s</div>""" % locals()
                    txt.append(row_txt)

                    # national-eo article
                    n_keywords = national_headwords.get(lang)
                    assert n_keywords
                    # we must later clear the original n_keywords, not a modified copy
                    #n_keywords = [word.replace("|", "/") for word in n_keywords]
                    clean_keywords = [word.replace("|", "/") for word in n_keywords]
                    append_national_article(lang, clean_keywords, row_txt)
                    n_keywords.clear()

            for drv, headwords in for_drv_words_headwords(tree):
                #print(src_fname)
                #print(translations)
                #print()

                def latin_num(i):
                    return "%(chr(ord('a') + i))s)" % s_.EvalFormat()

                snc_dct = {}
                ekz_snc_dct = {}
                for snc in parse_vip.iter_tags(drv, "snc"):
                    subsnc_dct = {}
                    ekz_subsnc_dct = {}
                    for subsnc in parse_vip.iter_tags(snc, "subsnc"):
                        subsnc_translations = get_count_translations(subsnc)
                        for lang, tr_lst in subsnc_translations.items():
                            lst = subsnc_dct.setdefault(lang, [])
                            lst.append(", ".join(tr_lst))

                        find_ekz_translations(ekz_subsnc_dct, subsnc, subsnc_dct)

                    append_ekz_translations(subsnc_dct, ekz_subsnc_dct)

                    for lang, tr_txt in iterate_translations(get_count_translations(snc),
                                                             subsnc_dct, latin_num, "; "):
                        lst = snc_dct.setdefault(lang, [])
                        lst.append(tr_txt)

                    find_ekz_translations(ekz_snc_dct, snc, snc_dct)

                def arab_num(i):
                    return "<b>%(i+1)s.</b>" % s_.EvalFormat()

                append_ekz_translations(snc_dct, ekz_snc_dct)

                def merge_trs(translations, drv):
                    src_trs = get_count_translations(drv)
                    append_translations(translations, src_trs)

                # there are still some <subdrv> inside <drv> => add them
                translations = {}
                for subdrv in parse_vip.iter_tags(drv, "subdrv"):
                    merge_trs(translations, subdrv)
                # a <trd> can also sit directly in subart - sur.xml: hu => rá-
                merge_trs(translations, drv)

                append_row(translations, snc_dct, headwords, drv)

            # :TRICKY: al.xml has translations outside of subart/drv
            art_node = tree.find("art")
            append_row(get_count_translations(art_node), {}, find_kap_words(art_node), art_node)

            for lang, opa_args in national_dct.items():
                names, txt = opa_args
                append_national_article(lang, names, "".join(txt))

            strict_check = False  # True #

            def alarm_not_processed(trd):
                is_ok = trd in used_tr_nodes
                if not is_ok:
                    if strict_check:
                        assert is_ok
                    else:
                        notify_node("Not processed trd:", trd.getparent())

            # check that all nodes have been processed
            for trd in parse_vip.iter_tags(tree, "trd"):
                alarm_not_processed(trd)
            for trd in parse_vip.iter_tags(tree, "trdgrp"):
                alarm_not_processed(trd)

    # zip up the dictionaries
    revo_dicts_fpath = o_p.join(dirname(unpacked_revo), "revo-dicts")
    o_p.force_makedirs(revo_dicts_fpath)

    # shutil can do zipfile!
    #import zipfile
    print("\nAtingeblaj REVO vortaroj:")

    def zip_dict(dst_fname):
        dir_fpath, basename = os.path.split(dst_fname)
        root_dir, dir_fname = os.path.split(dir_fpath)

        # using zip does not work - it breaks ColorDict, which starts blinking endlessly
        #fmt = "zip"
        fmt = "gztar"

        # if the dictionary data sits directly in the archive instead of inside a folder,
        # the CSS/images are not found
        save_without_folder = False  # True #
        if save_without_folder:
            fname = shutil.make_archive(o_p.join(revo_dicts_fpath, dir_fname), fmt, dir_fpath)
        else:
            fname = shutil.make_archive(o_p.join(revo_dicts_fpath, dir_fname), fmt, root_dir,
                                        base_dir=dir_fname)

        ifo_fname = os.path.splitext(dst_fname)[0] + ".ifo"
        with open(ifo_fname) as ifo_f:
            properties = {}
            for line in ifo_f:
                lst = line.split("=")
                if len(lst) >= 2:
                    key, value = lst[0].strip(), lst[1].strip()
                    if key and value:
                        properties[key] = value

        words_cnt = int(properties.get("wordcount"))
        synwordcount = properties.get("synwordcount")
        if synwordcount:
            words_cnt += int(synwordcount)

        fname = os.path.basename(fname)
        # the two trailing spaces are for a Markdown line break
        print("http://new.bombono.org/download/revo/%(fname)s\t%(words_cnt)s  " % locals())

    zip_dict(dst_fname)
    for lang, (func, dst_fname) in dictionaries.items():
        zip_dict(dst_fname)
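# A minimal entry-point sketch (assumption: the original project may invoke main()
# elsewhere; this guard is illustrative only).
if __name__ == "__main__":
    main()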
def parse_dictionary(on_parsed_article):
    dirname = os.path.dirname
    prefix_eoru = dirname(dirname(dirname(__file__)))
    unpacked_epub = o_p.join(prefix_eoru, "stuff/Wells/decrypted/depacked")

    for num in range(first_num, last_num + 1):
        #num = 29 # K
        src_fname = o_p.join(unpacked_epub, "OEBPS/%03d.html" % num)
        body = get_html_body(src_fname, True)

        found_empty_p = False
        # alternative approach: .getchildren() + checking .tag
        for p in body.iterfind(xhtml_tag("p")):
            txt = p.text_content().strip()
            if found_empty_p and txt:
                # next article
                #print(txt)

                radix = None
                lst = []

                def on_word(word):
                    # <b>Kaboverd/o</b><b> </b>Cape Verde
                    if word:
                        lst.append(word)

                key_elements = list(parse_vip.iter_tags(p, xhtml_tag("b")))
                assert key_elements
                for idx, el in enumerate(key_elements):
                    bold_txt = el.text_content().strip()

                    exceptions = [
                        "li diris, ~ ŝi atendas",  # ke
                        "~e, ke",                  # kondiĉo
                    ]

                    # "2" - kluso
                    def is_number(txt):
                        res = True
                        try:
                            int(txt)
                        except:
                            res = False
                        return res

                    if bold_txt in exceptions or is_number(bold_txt):
                        w_lst = []  # [bold_txt]
                    else:
                        w_lst = [w.strip() for w in bold_txt.split(",")]

                    def remove_bad_suffix(w):
                        for suffix in [
                            ":",   # boarding:
                            " 1",  # can 1
                        ]:
                            if w.endswith(suffix):
                                w = w[:-len(suffix)]
                        return w

                    # only the first word is the root
                    # kost/i, ~o cost; multe~a expensive
                    if radix is None:
                        radix = w_lst[0]
                        slash = radix.find("/")
                        if slash >= 0:
                            radix = radix[:slash]
                        radix = remove_bad_suffix(radix)

                    for w in w_lst:
                        for no_tilda_pattern in [
                            "(aerarmea) generalo",    # air
                            "koncerne (with accus)",  # as
                            "~ on daŭri",             # run
                        ]:
                            if idx != 0 and w.find("~") == -1 and txt.find(no_tilda_pattern) != -1:
                                w = "~ " + w

                        # :TRICKY: some terms contain " ~ ", but without manual analysis
                        # the correct value cannot be substituted:
                        # - lav/i wash tr; ~ sin get washed, wash (oneself)
                        # - est/i be; ~as (there) is/are; kio ~ al vi? what’s the matter? [skip]
                        w = w.replace("/", "").replace("~", radix)

                        # Kaliforni/o California; ≈o californium
                        change_case = w.find("≈") >= 0
                        if change_case:
                            w = w.replace("≈", radix)
                            # :REFACTOR:
                            w = w[0].swapcase() + w[1:]

                        # digital/o 1 digitalis, foxglove; 2 ~a img2.png digital [= cifereca]
                        if w.startswith("2 "):
                            w = w[2:]

                        w = remove_bad_suffix(w)

                        # Prote/o Proteus; ≈a protean; ≈o 1 protea (flower); 2 olm (amphibian)
                        # errors need to be fixed upstream
                        if w in ['a', 'o']:
                            continue
                        if w == 'la' and txt.find("da is not used before la, other") != -1:
                            continue

                        make_kondratjev.with_x_hdw(w, on_word)

                is_first = False

                assert lst
                #print(lst)
                on_parsed_article(lst, parse_vip.gen_html_text(p))  # txt)

            if not txt:
                found_empty_p = True
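# A minimal usage sketch for parse_dictionary() (assumptions: the real caller lives
# elsewhere in the project; demo_parse, collected and collect_article are hypothetical names).
def demo_parse():
    collected = []

    def collect_article(key_names, html_txt):
        # key_names is the headword list and html_txt the article HTML,
        # exactly as parse_dictionary() passes them to its callback
        collected.append((key_names, html_txt))

    parse_dictionary(collect_article)
    print(len(collected), "articles parsed")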