コード例 #1
0
ファイル: expand.py プロジェクト: msklvsk/dict_uk
def modify(lines, modifiers):
    """Filter expanded lines and apply part-of-speech / gender overrides.

    Args:
        lines: list of expanded word-form lines (strings).
        modifiers: dict of modifier values; the keys read here are ``"pos"``
            and ``"force_gen"`` (the whole dict is also handed to
            ``filter_word``).

    Returns:
        ``lines`` itself when ``modifiers`` is empty; otherwise a new list
        with the lines accepted by ``filter_word``, after the overrides.

    Raises:
        Exception: if ``modifiers`` is non-empty and no line survives.
    """
    if len(modifiers) == 0:
        return lines

    out = []
    for line in lines:
        if not filter_word(line, modifiers):
            logger.debug("skip %s %s", line, modifiers)
            continue

        if "pos" in modifiers:
            # Swap the space-preceded tag token ending in ":" for the
            # requested part of speech.
            line = re_sub(" [^ :]+:", " " + modifiers["pos"] + ":", line)

        # Lines containing ":patr" keep their own gender tag.
        if "force_gen" in modifiers and ":patr" not in line:
            force_gen = modifiers["force_gen"]
            line = re_sub(":[mfn](:|$)", ":" + force_gen + "\\1", line)
            logger.debug("gen repl: %s in %s", force_gen, line)

        out.append(line)

    if not out:
        raise Exception("empty output for " + str(lines) + " and " + str(modifiers))

    return out
コード例 #2
0
ファイル: expand.py プロジェクト: msklvsk/dict_uk
def adjustCommonFlag(affixFlag2):
    """Drop the class number from flags that share a common rule set.

    ``v1``..``v4`` (optionally ``vr``) followed by ``.cf``, ``v1``..``v9``
    (optionally ``vr``) followed by ``.impers`` and ``n<digits>.patr`` are
    rewritten to ``v.cf`` / ``vr.cf``, ``v.impers`` / ``vr.impers`` and
    ``n.patr`` respectively.

    Args:
        affixFlag2: the affix flag string to normalize.

    Returns:
        The normalized flag string (unchanged when none of the suffixes occur).
    """
    # Raw strings: "\." in a plain literal is an invalid escape sequence.
    if ".cf" in affixFlag2:
        affixFlag2 = re_sub(r"(vr?)[1-4]\.cf", r"\1.cf", affixFlag2)  # v5.cf is special
    if ".impers" in affixFlag2:
        affixFlag2 = re_sub(r"(vr?)[1-9]\.impers", r"\1.impers", affixFlag2)
    if ".patr" in affixFlag2:
        affixFlag2 = re_sub(r"n[0-9]+\.patr", "n.patr", affixFlag2)
    return affixFlag2
コード例 #3
0
ファイル: expand.py プロジェクト: msklvsk/dict_uk
def expand_subposition_adv(last_adv, line, extra_tags):
    """Build the comparative and superlative adverb lines.

    The adverb stem is the ``+cs=`` value from ``line`` when present,
    otherwise ``main_word``; in both cases the last two characters are
    replaced with "е".

    Args:
        last_adv: passed through to ``compose_compar`` as its second argument.
        line: the sub-position line, optionally carrying `` +cs=<word>``.
        extra_tags: tag suffix appended to the generated tags; any ``adjp``
            tag group is stripped from it first.

    Returns:
        A list of four lines: the comparative, then the superlatives prefixed
        with "най", "щонай" and "якнай".
    """
    if " +cs=" in line:
        stem = re_match(r" \+cs=([^ ]+)", line).group(1)[:-2]
    else:
        # NOTE(review): main_word is not a parameter of this function, so it
        # must resolve from module scope -- confirm that this is intended.
        stem = main_word[:-2]
    word = stem + "е"

    if "adjp" in extra_tags:
        extra_tags = re_sub(r":&?adjp(:pasv|:actv|:pres|:past|:perf|:imperf)+", "", extra_tags)

    compr_tags = "adv:compr" + extra_tags
    super_tags = "adv:super" + extra_tags

    result = [compose_compar(word, last_adv, compr_tags)]
    for prefix in ("най", "щонай", "якнай"):
        result.append(compose_compar(prefix + word, last_adv, super_tags))

    return result
コード例 #4
0
ファイル: expand.py プロジェクト: msklvsk/dict_uk
def expand_subposition(main_word, line, extra_tags, idx):
    """Expand a `` +cs`` sub-position into comparative/superlative adjectives.

    Args:
        main_word: the base adjective; also used to derive the comparative
            (last two characters replaced by "іший") when ``line`` carries no
            explicit ``+cs=<word>`` value.
        line: the sub-position line; must start with `` +cs``.
        extra_tags: tag suffix appended to the generated flags; an ``&adjp``
            tag group is stripped from it first.
        idx: sub-position index, embedded in the flags as ``:xx<idx>``.

    Returns:
        The expanded forms of the comparative followed by those of the
        "най-", "щонай-" and "якнай-" superlatives.  Unless ``-corp`` is in
        ``sys.argv`` every form is passed through ``replace_base``.

    Raises:
        Exception: if ``line`` does not start with `` +cs``.
    """
    idx = ":xx" + str(idx)
    logger.debug("expanding sub %s %s: %s", idx, main_word, line)

    if not line.startswith(" +cs"):
        # Raising a bare string is itself a TypeError in Python 3.
        raise Exception("Unknown subposition for " + line + "(" + main_word + ")")

    if " +cs=" in line:
        word = re_match(r" \+cs=([^ ]+)", line).group(1)
    else:
        word = main_word[:-2] + "іший"

    if "&adjp" in extra_tags:
        extra_tags = re_sub(r":&adjp(:pasv|:actv|:pres|:past|:perf|:imperf)+", "", extra_tags)

    word_forms = expand(word, "/adj :compr" + idx + extra_tags, flush_stdout)

    super_flags = "/adj :super" + idx + extra_tags
    super_word = "най" + word
    for prefix in ("", "що", "як"):
        word_forms.extend(expand(prefix + super_word, super_flags, flush_stdout))

    if "-corp" not in sys.argv:
        word_forms = [replace_base(form, main_word) for form in word_forms]

    return word_forms
コード例 #5
0
ファイル: expand.py プロジェクト: msklvsk/dict_uk
def get_modifiers(mod_flags, flags, word):
    """Translate modifier flags into a dict of overrides.

    Args:
        mod_flags: whitespace-separated modifier tokens (``^pos[:g]``,
            ``g=...``, ``p=...``, ``tag=...``).
        flags: the affix flags of the entry.
        word: the entry word; only its ending is inspected.

    Returns:
        A dict that may contain the keys ``"pos"``, ``"gen"``, ``"pers"``,
        ``"tag"`` and ``"force_gen"``.

    Raises:
        Exception: if a ``^noun:<g>`` override has a gender part that is not
            exactly one character long.
    """
    mods = {}

    if "/adj" in flags and "<" in flags:
        mods["pos"] = "noun"

        # Without explicit "x=" modifiers the gender set is derived from the
        # flags and the word ending, and nothing else is parsed.
        if "=" not in mod_flags:
            ends_with_a = word.endswith("а")
            if "<+" in flags:
                mods["gen"] = "f" if ends_with_a else "mfp"
            else:
                mods["gen"] = "fp" if ends_with_a else "mp"
            return mods

    for mod in mod_flags.split():
        if mod[0] == "^":
            if mod.startswith("^adjp"):
                # adjp keeps its full tag chain as the part of speech.
                mods["pos"] = mod[1:]
            else:
                mod_tags = mod[1:].split(":")
                mods["pos"] = mod_tags[0]
                if len(mod_tags) > 1 and mod_tags[0] == "noun":
                    if len(mod_tags[1]) != 1:
                        raise Exception("Bad gender override: " + str(mod) + " -- " + str(mod_tags))
                    mods["force_gen"] = mod_tags[1]
        elif mod[:2] == "g=":
            mods["gen"] = re_sub("g=([^ ])", "\\1", mod)
        elif mod[:2] == "p=":
            mods["pers"] = mod[2:3]
        elif mod.startswith("tag="):
            mods["tag"] = mod[4:]

    if "<+m" in flags or "<m" in flags:
        mods["force_gen"] = "m"
        if "n2adj" in flags:
            mods["gen"] = "m"

    return mods
コード例 #6
0
ファイル: expand.py プロジェクト: msklvsk/dict_uk
def cleanup(line):
    """Return ``line`` with ``:xx`` markers removed.

    The pattern matches ``:xx`` plus exactly one following character.
    """
    marker_pattern = ":xx."
    cleaned = re_sub(marker_pattern, "", line)
    return cleaned
コード例 #7
0
ファイル: expand.py プロジェクト: msklvsk/dict_uk
def post_process(lines):
    """Apply the final, command-line dependent tag adjustments to lines.

    Behaviour depends on ``sys.argv``:

    * without ``--uncontr`` lines containing "uncontr" are dropped;
    * without ``-corp`` the ``:ranim`` / ``:rinanim`` tags are removed;
    * with ``-corp`` tags are reordered per part of speech (see inline
      comments) and an extra ``adjp`` line is emitted for ``:&adjp`` adjectives.

    Every resulting line is finally passed through ``util.tail_tag``.

    Args:
        lines: iterable of expanded lines (strings).

    Returns:
        A new list of processed lines.
    """
    out_lines = []
        
    for line in lines:

        # Plain adverbs (not advp, not comparative/superlative) get promoted.
        if " adv" in line and not "advp" in line and not ":compr" in line and not ":super" in line:
            line = promote(line)

        if not "--uncontr" in sys.argv:
            if "uncontr" in line:    # we don't need uncontr for rules LT
                continue
        if not "-corp" in sys.argv:
            if "ranim" in line:    # we can't handle it yet in LT
                line = line.replace(":ranim", "")
            elif "rinanim" in line:    # we can't handle it yet in LT
                line = line.replace(":rinanim", "")

        if "-corp" in sys.argv:
            if "advp" in line:
                line = promote(line)
            elif "noun" in line:
                # Move the animacy tag right after "noun:"; nouns without one
                # default to inanim unless they carry &pron.
                if ":anim" in line:
                    line = line.replace(":anim", "").replace("noun:", "noun:anim:")
                elif ":inanim" in line:
                    line = line.replace(":inanim", "").replace("noun:", "noun:inanim:")
                elif not "&pron" in line:
                    line = line.replace("noun:", "noun:inanim:")
            elif "verb" in line:
                # Move the aspect tag right after "verb" / "verb:rev".
                line = re_sub("(verb(?::rev)?)(.*)(:(im)?perf)", "\\1\\3\\2", line)
            elif "adj" in line:
                if ":comp" in line or ":super" in line:
                    # Move the comparison tag right after "adj:".
                    line = re_sub(" (adj:)(.*):(comp[br]|super)(.*)", " \\1\\3:\\2\\4", line)

                if ":&adjp" in line:
                    # Emit an extra line tagged as adjp, then strip the &adjp
                    # group from the adjective line itself.
                    adjp_line = re.sub(" (adj(?::compb|:compr|:super)?)(.*):&(adjp(?::pasv|:actv|:past|:pres|:perf|:imperf)+)(.*)", " \\3\\2\\4", line)
                    out_lines.append(adjp_line)

                    line = re.sub(":&adjp(:pasv|:actv|:past|:pres|:perf|:imperf)+", "", line)
#                    util.dbg("-1-", line)

#            if "advp" in line:
#                line = re_sub("(.*) .* (advp.*)", "\\1 \\1 \\2", line)
        else:
            if ":&adjp" in line and ":comp" in line:
                #  if ":comp" in line or ":super" in line:
                line = re_sub(" (adj:.:v_...:)(.*):(comp[br]|super)(.*)", " \\1\\3:\\2\\4", line)

                #   out_lines.append(line)

# TODO: extra :coll
#            if "сь advp" in line:
#                other_line = re_sub("(.*)сь (.*сь) (advp.*)", "\\1ся \\2 \\3:coll", line)
#                out_lines.append(other_line)
#
#            if "verb:" in line and ":inf" in line and ("ти " in line): # or "тися " in line):
#                other_line = re_sub("^(.*)ти((?:ся)? [^ ]+) (verb:.*)", "\\1ть\\2 \\3:coll", line)
#                out_lines.append(other_line)
        #else:
        out_lines.append(line)

    # presumably tail_tag moves the listed tags to the end of each line -- TODO confirm in util
    out_lines = [ util.tail_tag(line, (":v-u", ":bad", ":slang", ":rare", ":coll", ":abbr")) for line in out_lines ]    # TODO: add ":alt"

    return out_lines
コード例 #8
0
ファイル: expand.py プロジェクト: msklvsk/dict_uk
def preprocess2(line):
    """Split or normalize one dictionary source line before expansion.

    Only the first matching case applies:

    * ``/<`` -- indeclinable names: one feminine and/or one masculine line;
    * ``/n2`` with ``<+`` -- may add a feminine indeclinable last-name line;
    * ``/n1`` with ``<+`` -- may add a ``<+m`` variant;
    * ``/np`` -- ``:ns`` is appended;
    * ``:imperf:perf`` -- split into an imperfective and a perfective line;
    * ``:&adj`` not preceded by a space -- a space is inserted before it.

    Args:
        line: a single source line.

    Returns:
        A list of lines to expand (possibly empty in the ``/<`` case).
    """
    if "/<" in line:
        name_tag = ":anim:lname" if "<+" in line else ":anim:fname"
        variants = []
        if "<m" not in line and "<+m" not in line:
            variants.append(re_sub("/<\\+?f?", "noun:f:nv:np" + name_tag, line))
        if "<f" not in line and "<+f" not in line:
            variants.append(re_sub("/<\\+?m?", "noun:m:nv:np" + name_tag, line))
        return variants

    has_plus_marker = "<+" in line

    if "/n2" in line and has_plus_marker:
        if "<+m" not in line and util.dual_last_name_ending(line):
            feminine = line.split()[0] + " noun:f:nv:np:anim:lname"
            return [line, feminine]
        return [line]

    if "/n1" in line and has_plus_marker:
        if "<+f" in line or "<+m" in line:
            return [line]
        return [line, line.replace("<+", "<+m")]

    if "/np" in line:
        # No separator when tags are already attached or there is no flag part.
        separator = "" if (" :" in line or " /" not in line) else " "
        return [line + separator + ":ns"]

    if ":imperf:perf" in line:
        imperfective = line.replace(":perf", "")
        # .cf is dropped from the perfective variant
        perfective = line.replace(":imperf", "").replace(".cf", "")
        return [imperfective, perfective]

    if ":&adj" in line and " :&adj" not in line:
        return [line.replace(":&adj", " :&adj")]

    return [line]
コード例 #9
0
ファイル: expand.py プロジェクト: msklvsk/dict_uk
def adjust_affix_tags(lines, main_flag, flags, modifiers):
    """Adjust grammatical-case tags of expanded lines according to the flags.

    Three mutually exclusive cases are handled per line: noun flags (second
    character of ``main_flag`` is "n"), perfective verbs (``:pres`` becomes
    ``:futr``) and ``/adj`` flags used for nouns/names.

    Args:
        lines: expanded lines (strings).
        main_flag: the main affix flag, e.g. starting with ``/n2`` or ``/adj``.
        flags: the full flag string of the entry.
        modifiers: not referenced in this function.

    Returns:
        A new list with the adjusted lines; ``:uncontr`` adjective lines are
        omitted when ``flags`` contains "<" or "^noun".
    """
    lines2 = []
  
    for line in lines:
        # DL-
        if main_flag[1] == "n":
                
            if main_flag.startswith("/n2") and re_search("^/n2[01234]", main_flag):
#                base_word = lines[0].split()[0]
                # second whitespace-separated field of the line
                base_word = line.split()[1]
                
                if util.istota(flags):
                    if "m:v_rod" in line and not "/v_zna" in line:
                        line = line.replace("m:v_rod", "m:v_rod/v_zna")
        
                if not base_word[-1:] in "аеєиіїоюя" and not ".a" in flags:
#                    util.dbg("```", main_flag, line)
                    word = line.split()[0]
                    if word[-1:] in "ую":
                        logger.debug("u/rod %s - %s", line, base_word)
                        line = line.replace("v_dav", "v_rod/v_dav")
                        
            if main_flag.startswith("/n2") and "@" in flags:
                word = line.split(" ", 1)[0]
                if word[-1:] in "ая" and "m:v_rod" in line:
                    line = line.replace("m:v_rod", "m:v_rod/v_zna")
        
            if not "np" in main_flag and not ".p" in main_flag and not "n2adj" in flags:
                if ":p:" in line:
                    # NOTE(review): this only logs; the line is still appended
                    # below -- confirm whether a `continue` was intended.
                    logger.debug("skipping line with p: " + line)
                elif "//p:" in line:
                    line = re_sub("//p:.*", "", line)
                    logger.debug("removing //p from: " + line)
        
            if "/v_kly" in line:
                if main_flag.startswith("/n1"): # Єремія /n10.ko.patr.<
                    base_word = line.split()[1]

                # base_word is only evaluated in the last clause, i.e. when
                # main_flag starts with "/n20"; presumably it was bound by the
                # "/n2[0-4]" branch above in that case -- TODO confirm.
                if ("<+" in flags and not ":p:" in line) or not util.person(flags) \
                        or (not ":patr" in line and re_search("\\.k[eo]", flags)) \
                        or (":m:" in line and ("<+" in flags)) \
                        or (main_flag.startswith("/n20") and base_word.endswith("ло") and "v_dav" in line):
                    logger.debug("removing v_kly from: %s, %s", line, flags)
                    line = line.replace("/v_kly", "")

            if ".p" in main_flag or "np" in main_flag:
                if util.person(flags):
                    line = line.replace("p:v_naz", "p:v_naz/v_kly")
    
                if util.istota(flags):
                    line = line.replace("p:v_rod", "p:v_rod/v_zna")
                    if ">" in flags: # animal
                        line = line.replace("p:v_naz", "p:v_naz/v_zna")
                else:
                    line = line.replace("p:v_naz", "p:v_naz/v_zna")

            
        elif ":perf" in flags and ":pres" in line:
            line = line.replace(":pres", ":futr")
            
        elif main_flag.startswith("/adj"):
            if "<" in flags or "^noun" in flags:
                if ":uncontr" in line:
                    continue
                
            if "<" in flags:
                if not ">" in flags and ":p:v_naz/v_zna" in line:
                    line = line.replace("v_naz/v_zna", "v_naz/v_kly")
                if ":m:v_naz" in line and not "<+" in flags:
                    line = line.replace("v_naz", "v_naz/v_kly")
            elif "^noun" in flags:
                if ":m:v_rod/v_zna" in line:
                    line = line.replace("v_rod/v_zna", "v_rod")
                elif ":p:v_rod/v_zna" in line:
                    line = line.replace("v_rod/v_zna", "v_rod")

    
#            if "<" in flags:
#                if util.person(flags):
#                    line = line.replace("p:v_naz", "p:v_naz/v_kly")
#    
#                if util.istota(flags):
#                    line = line.replace("p:v_rod", "p:v_rod/v_zna")
#                    if ">" in flags: # animal
#                        line = line.replace("p:v_naz", "p:v_naz/v_zna")
#                else:
#                    line = line.replace("p:v_naz", "p:v_naz/v_zna")

        lines2.append(line)

    return lines2
コード例 #10
0
ファイル: expand.py プロジェクト: msklvsk/dict_uk
def post_expand(lines, flags):
    """Append the entry's extra flags to every expanded line.

    The extra flags come from ``get_extra_flags(flags)``; when that is falsy
    ``lines`` is returned unchanged.  Depending on the line, the suffix is
    adjusted (aspect tags for ``advp``, ``adjp`` tags for adverbs,
    ``:anim`` placement for ``:patr``) and ``:+m`` / ``:+f`` cause additional
    lines of the other gender to be generated.

    Args:
        lines: non-empty list of expanded lines (strings).
        flags: the flag string of the entry.

    Returns:
        ``lines`` itself, or a new list with the suffixed lines followed by
        any additionally generated gender lines.

    Raises:
        Exception: if ``lines`` is empty.
    """
    if len(lines) == 0:
        # NOTE(review): the message below is misspelled ("emtpy").
        raise Exception("emtpy lines")

    extra_flags = get_extra_flags(flags)
    
    
    if extra_flags:
        first_name_base = util.firstname(lines[0], flags)
        
        out_lines = []
        # generated other-gender lines; appended after all regular lines
        extra_out_lines = []
        
        for line in lines:
            # per-line copy of the suffix; branches below may trim it
            extra_flags2 = extra_flags
    
            if first_name_base and not ":patr" in line:
                extra_flags2 += ":fname"
    
            if " advp" in line:
                # Avoid a duplicate aspect tag: drop it from the suffix when
                # the line has :imperf, otherwise drop :perf from the line.
                if ":imperf" in line:
                    extra_flags2 = re_sub(":(im)?perf", "", extra_flags2)
                else:
                    line = line.replace(":perf", "")
            elif "adj.adv" in flags and " adv" in line:
                extra_flags2 = re_sub(r":&?adjp(:pasv|:actv|:pres|:past|:perf|:imperf)+", "", extra_flags2)
            elif ":+m" in extra_flags:
                extra_flags2 = extra_flags2.replace(":+m", "")
                
                # Derive a masculine line from a feminine or neuter one.
                if ":f:" in line:
                    masc_line = line.replace(":f:", ":m:") + extra_flags2
                    extra_out_lines.append(masc_line)
                elif ":n:" in line:
                    masc_line = line.replace(":n:", ":m:") + extra_flags2
                    
                    if util.istota(flags):
                        if "m:v_rod" in masc_line:
                            # the v_rod form is also emitted as v_zna
                            masc_line2 = masc_line.replace("m:v_rod", "m:v_zna")
                            extra_out_lines.append(masc_line2)
                        elif "m:v_zna" in masc_line:
                            # the derived v_zna form is discarded
                            masc_line = ""
                        if "m:v_kly" in masc_line:
                            # replace the last letter of the word form with "е"
                            word, lemma, tags = masc_line.split()
                            masc_line = word[:-1]+"е " + lemma + " " + tags
                    
                    if masc_line:
                        extra_out_lines.append(masc_line)
            elif ":+f" in extra_flags:
                extra_flags2 = extra_flags2.replace(":+f", "")
                
                # Derive a feminine line from a masculine or neuter one
                # (the variable keeps the name masc_line).
                if ":m:" in line:
                    masc_line = line.replace(":m:", ":f:") + extra_flags2
                    extra_out_lines.append(masc_line)
                elif ":n:" in line:
                    masc_line = line.replace(":n:", ":f:") + extra_flags2
                    
#                     if util.istota(flags):
#                         if "m:v_rod" in masc_line:
#                             masc_line2 = masc_line.replace("m:v_rod", "m:v_zna")
#                             extra_out_lines.append(masc_line2)
#                         elif "m:v_zna" in masc_line:
#                             masc_line = ""
                    
                    if masc_line:
                        extra_out_lines.append(masc_line)
            elif ":patr" in line and ":anim" in extra_flags2:
                # place :anim before :patr instead of at the end of the line
                line = line.replace(":patr", ":anim:patr")
                extra_flags2 = extra_flags2.replace(":anim", "")
    
            out_lines.append(line + extra_flags2)
    
        out_lines.extend(extra_out_lines)
        
        return out_lines
    
    return lines