Ejemplo n.º 1
0
def create_vip_vortaro(dst_fname, remove_srcfile=True):
    
    use_external_css = False
    css_txt = read_all_file(o_p.join(project_dir_fname, "sample/vip/vip.css"))
    
    def parse_dictionary(on_parsed_article):
        src_fdir = make_dpath("articles")
        
        # :REFACTOR:
        for fname in os.listdir(src_fdir):
            txt = read_all_file(os.path.join(src_fdir, fname))
            
            all_names = []
            vip_href_prefix = "?w="
            fixed_text = []
            php_len = len(vip_href_prefix)
            def on_article(article):
                all_names.extend(get_search_words(article))
                
                for a in iter_tags(article, "a"):
                    href = a.get("href")
                    assert href.startswith(vip_href_prefix)
                    # подсказка по bword:
                    # http://siripong-english.blogspot.ru/2011/04/custom-dictionary-w-stardict-in-babylon.html
                    a.set("href", "bword://" + href[php_len:])
                    
                fixed_text.append(gen_html_text(article))
            process_vip_text(txt, on_article)
            
            txt = fixed_text[0]
            css_txt
            article_txt = txt # "<style>%(css_txt)s</style>%(txt)s" % locals()
            on_parsed_article(all_names, article_txt)
    
    make_kondratjev.make_dictionary_ex(parse_dictionary, dst_fname, css_text=None if use_external_css else css_txt,
                                    remove_srcfile=remove_srcfile, is_html=True)
Ejemplo n.º 2
0
def make_wells(first_num, last_num, dst_fname):
    def parse_dictionary(on_parsed_article):
        dirname = os.path.dirname
        prefix_eoru = dirname(dirname(dirname(__file__)))
        unpacked_epub = o_p.join(prefix_eoru, "stuff/Wells/decrypted/depacked")
             
        for num in range(first_num, last_num+1):
            #num = 29 # K
            src_fname = o_p.join(unpacked_epub, "OEBPS/%03d.html" % num)
            body = get_html_body(src_fname, True)
    
            found_empty_p = False
            # альтернативный способ - .getchildren() + проверка .tag
            for p in body.iterfind(xhtml_tag("p")):
                txt = p.text_content().strip()
        
                if found_empty_p and txt:
                    # очередная статья
                    #print(txt)
                    
                    radix = None
                    lst = []
                    def on_word(word):
                        # <b>Kaboverd/o</b><b> </b>Cape Verde
                        if word:
                            lst.append(word)
                     
                    key_elements = list(parse_vip.iter_tags(p, xhtml_tag("b")))
                    assert key_elements
                    
                    for idx, el in enumerate(key_elements):
                        bold_txt = el.text_content().strip()
                        exceptions = [
                            "li diris, ~ ŝi atendas", # ke
                            "~e, ke", # kondiĉo 
                        ]

                        # "2" - kluso
                        def is_number(txt):
                            res = True
                            try:
                                int(txt)
                            except:
                                res = False
                            return res
                        
                        if bold_txt in exceptions or is_number(bold_txt):
                            w_lst = [] # [bold_txt]
                        else:
                            w_lst = [w.strip() for w in bold_txt.split(",")]
                            
                        def remove_bad_suffix(w):
                            for suffix in [
                                ":",  # boarding:
                                " 1", # can 1
                            ]:
                                if w.endswith(suffix):
                                    w = w[:-len(suffix)]
                            return w
        
                        # только первое слово - корень
                        # kost/i, ~o cost; multe~a expensive
                        if radix is None:
                            radix = w_lst[0]
                            slash = radix.find("/")
                            if slash >= 0:
                                radix = radix[:slash]
                                
                            radix = remove_bad_suffix(radix)
        
                        for w in w_lst:
                            for no_tilda_pattern in [
                                "(aerarmea) generalo", # air
                                "koncerne (with accus)", # as
                                "~ on daŭri", # run
                            ]:
                                if idx != 0 and w.find("~") == -1 and txt.find(no_tilda_pattern) != -1:
                                    w = "~ " + w
                            
                            # :TRICKY: некоторые термины содержат " ~ ", но без
                            # ручного анализа правильное значение не подставишь:
                            # - lav/i wash tr; ~ sin get washed, wash (oneself)
                            # - est/i be; ~as (there) is/are; kio ~ al vi? what’s the matter? [skip]
                            w = w.replace("/", "").replace("~", radix)
                            
                            # Kaliforni/o California; ≈o californium
                            change_case = w.find("≈") >= 0
                            if change_case:
                                w = w.replace("≈", radix)
                                # :REFACTOR:
                                w = w[0].swapcase() + w[1:]
                                
                            # digital/o 1 digitalis, foxglove; 2 ~a img2.png digital [= cifereca]
                            if w.startswith("2 "):
                                w = w[2:]
                            w = remove_bad_suffix(w)
                                
                            # Prote/o Proteus; ≈a protean; ≈o 1 protea (flower); 2 olm (amphibian)
                            # errors needs to be fixed by upstream
                            if w in ['a', 'o']:
                                continue
                            
                            if w == 'la' and txt.find("da is not used before la, other") != -1:
                                continue
                            
                            make_kondratjev.with_x_hdw(w, on_word)
                        
                        is_first = False
                    
                    assert lst
                    #print(lst)
                    on_parsed_article(lst, parse_vip.gen_html_text(p)) # txt)
                
                if not txt:
                    found_empty_p = True
    make_kondratjev.make_dictionary_ex(parse_dictionary, dst_fname, css_text=None, is_html=True)
Ejemplo n.º 3
0
def make_wells(first_num, last_num, dst_fname):
    def parse_dictionary(on_parsed_article):
        dirname = os.path.dirname
        prefix_eoru = dirname(dirname(dirname(__file__)))
        unpacked_epub = o_p.join(prefix_eoru, "stuff/Wells/decrypted/depacked")

        for num in range(first_num, last_num + 1):
            #num = 29 # K
            src_fname = o_p.join(unpacked_epub, "OEBPS/%03d.html" % num)
            body = get_html_body(src_fname, True)

            found_empty_p = False
            # альтернативный способ - .getchildren() + проверка .tag
            for p in body.iterfind(xhtml_tag("p")):
                txt = p.text_content().strip()

                if found_empty_p and txt:
                    # очередная статья
                    #print(txt)

                    radix = None
                    lst = []

                    def on_word(word):
                        # <b>Kaboverd/o</b><b> </b>Cape Verde
                        if word:
                            lst.append(word)

                    key_elements = list(parse_vip.iter_tags(p, xhtml_tag("b")))
                    assert key_elements

                    for idx, el in enumerate(key_elements):
                        bold_txt = el.text_content().strip()
                        exceptions = [
                            "li diris, ~ ŝi atendas",  # ke
                            "~e, ke",  # kondiĉo 
                        ]

                        # "2" - kluso
                        def is_number(txt):
                            res = True
                            try:
                                int(txt)
                            except:
                                res = False
                            return res

                        if bold_txt in exceptions or is_number(bold_txt):
                            w_lst = []  # [bold_txt]
                        else:
                            w_lst = [w.strip() for w in bold_txt.split(",")]

                        def remove_bad_suffix(w):
                            for suffix in [
                                    ":",  # boarding:
                                    " 1",  # can 1
                            ]:
                                if w.endswith(suffix):
                                    w = w[:-len(suffix)]
                            return w

                        # только первое слово - корень
                        # kost/i, ~o cost; multe~a expensive
                        if radix is None:
                            radix = w_lst[0]
                            slash = radix.find("/")
                            if slash >= 0:
                                radix = radix[:slash]

                            radix = remove_bad_suffix(radix)

                        for w in w_lst:
                            for no_tilda_pattern in [
                                    "(aerarmea) generalo",  # air
                                    "koncerne (with accus)",  # as
                                    "~ on daŭri",  # run
                            ]:
                                if idx != 0 and w.find("~") == -1 and txt.find(
                                        no_tilda_pattern) != -1:
                                    w = "~ " + w

                            # :TRICKY: некоторые термины содержат " ~ ", но без
                            # ручного анализа правильное значение не подставишь:
                            # - lav/i wash tr; ~ sin get washed, wash (oneself)
                            # - est/i be; ~as (there) is/are; kio ~ al vi? what’s the matter? [skip]
                            w = w.replace("/", "").replace("~", radix)

                            # Kaliforni/o California; ≈o californium
                            change_case = w.find("≈") >= 0
                            if change_case:
                                w = w.replace("≈", radix)
                                # :REFACTOR:
                                w = w[0].swapcase() + w[1:]

                            # digital/o 1 digitalis, foxglove; 2 ~a img2.png digital [= cifereca]
                            if w.startswith("2 "):
                                w = w[2:]
                            w = remove_bad_suffix(w)

                            # Prote/o Proteus; ≈a protean; ≈o 1 protea (flower); 2 olm (amphibian)
                            # errors needs to be fixed by upstream
                            if w in ['a', 'o']:
                                continue

                            if w == 'la' and txt.find(
                                    "da is not used before la, other") != -1:
                                continue

                            make_kondratjev.with_x_hdw(w, on_word)

                        is_first = False

                    assert lst
                    #print(lst)
                    on_parsed_article(lst, parse_vip.gen_html_text(p))  # txt)

                if not txt:
                    found_empty_p = True

    make_kondratjev.make_dictionary_ex(parse_dictionary,
                                       dst_fname,
                                       css_text=None,
                                       is_html=True)
Ejemplo n.º 4
0
def main():
    dst_fpath = "/home/ilya/.stardict/dic/esperanto/muravjov/muravjov"
    import make_kondratjev
    make_kondratjev.make_dictionary_ex(parse_dictionary, dst_fpath)