def filter_word(w, modifiers):
    """Tell whether the expanded form ``w`` passes every filter in ``modifiers``.

    Keys of ``modifiers`` that are looked at:

    * ``"gen"``  -- characters put inside a regex character class; the form
      must contain ``:<one of them>:``.
    * ``"pers"`` -- characters put inside a regex character class; the form
      must contain ``:<one of them>``.  Forms matching ``:(inf|past)`` are
      exempt from this check.
    * ``"tag"``  -- a regex that must be found somewhere in the form.

    Returns ``False`` as soon as one filter rejects the form, else ``True``.
    """
    if "gen" in modifiers:
        gender_pattern = ":[" + modifiers["gen"] + "]:"
        if not re_search(gender_pattern, w):
            return False

    # The person filter does not apply to infinitive / past forms.
    person_applies = "pers" in modifiers and not re.search(":(inf|past)", w)
    if person_applies:
        person_pattern = ":[" + modifiers["pers"] + "]"
        if not re_search(person_pattern, w):
            return False

    if "tag" in modifiers and not re.search(modifiers["tag"], w):
        return False

    return True
def post_process_sorted(lines):
    """Re-base patronymic / feminine last-name lines and drop repeated advp lines.

    For every line containing ``patr``, and for every line containing
    ``lname`` and ``:f:`` but not ``:nv``, the line is passed through
    ``replace_base`` together with the first token of the most recent
    "lemma" line of that group (a line matching ``:[mf]:v_naz:.*patr`` for
    patronymics, or containing ``:f:v_naz`` for last names).

    A line identical to the previously kept line is skipped when it contains
    ``advp:perf`` or ``advp:rev:perf``.

    Returns a new list; ``lines`` itself is not modified.
    """
    result = []
    previous = ""

    for entry in lines:
        promote_msg = None
        needs_rebase = True

        if "patr" in entry:
            if re_search(":[mf]:v_naz:.*patr", entry):
                promote_msg = "promoting patr lemma %s"
        elif "lname" in entry and ":f:" in entry and ":nv" not in entry:
            if ":f:v_naz" in entry:
                promote_msg = "promoting f lname lemma %s"
        else:
            needs_rebase = False

        if promote_msg is not None:
            # This line starts a new lemma; remember its word form.
            logger.debug(promote_msg, entry)
            current_lemma = entry.split()[0]

        if needs_rebase:
            # The lemma is carried over from an earlier iteration when this
            # line is not itself a nominative form.
            entry = replace_base(entry, current_lemma)

        is_perf_advp = "advp:perf" in entry or "advp:rev:perf" in entry
        if is_perf_advp and entry == previous:
            continue

        previous = entry
        result.append(entry)

    return result
def get_extra_flags(flags):
    """Build the extra tag suffix implied by a flags string.

    The result is the concatenation, in this order, of:

    * the first ``:tag`` token that follows a space in ``flags`` (only when
      ``flags`` contains ``" :"``);
    * ``:anim`` when ``flags`` contains ``<`` or ``patr``;
    * ``:lname`` when ``flags`` contains ``<+``.

    Returns an empty string when none of the above applies.
    """
    parts = []

    if " :" in flags:
        parts.append(re_search(" (:[^ ]+)", flags).group(1))

    if "<" in flags or "patr" in flags:
        parts.append(":anim")

    # "<+" also contains "<", so ":lname" always comes after ":anim".
    if "<+" in flags:
        parts.append(":lname")

    return "".join(parts)
def expand_line(line, flush_stdout):
    """Expand one source line into its list of output lines.

    The line is first passed to ``preprocess()``; every line it yields is
    then handled as follows:

    * If it contains a backslash followed by `` +``, it is split on
      backslashes into a main entry and sub-lines, ``:compb`` is appended to
      the main entry, and processing continues with the regular
      "word flags" expansion below.
    * Otherwise, if it matches ``word_lemma_re``, its ``//`` and ``/``
      alternatives are expanded with ``affix.expand_alts`` (and
      ``util.expand_nv`` for ``:nv`` lines without ``v_``); the result is
      emitted as-is and the regular expansion is skipped.
    * Regular expansion: the line is split at the first space into word and
      flags and handed to ``expand()``.  Lines produced for the sub-lines
      are emitted before the inflected lines of the main word.

    Args:
        line: the source line to expand.
        flush_stdout: forwarded unchanged to ``expand()``.

    Returns:
        Whatever ``post_process()`` returns for the collected lines.

    Raises:
        Exception: if ``expand()`` produced a blank line.
        Errors from ``affix.expand_alts`` and from splitting the line are
        printed to stderr and re-raised.

    Side effects:
        Assigns the module globals ``main_word`` and, when an adverb form is
        found for an adjective entry with sub-lines, ``last_adv``.
    """
    global main_word
    # NOTE(review): main_flag is declared global but never assigned in this function.
    global main_flag
    global last_adv

    lines = preprocess(line)

    out_lines = []
    # NOTE: the loop variable deliberately reuses (shadows) the `line` parameter.
    for line in lines:
        sub_lines = []

        # +cs
        if "\\ +" in line:
            # First chunk is the main entry, the rest are its sub-lines.
            line, *sub_lines = line.split("\\")
            line = line.rstrip()

            # Attach ":compb" to an existing tag group, or start a new one
            # (space-separated) when the line has flags but no tags yet.
            if " :" in line or not " /" in line:
                line += ":compb"
            else:
                line += " :compb"

#            print(" \\+", line, file=sys.stderr)

#            main_word = line
#            sublines = expand_subposition(main_word, line)
#            out_lines.extend( sublines )

        # word lemma tags
        elif word_lemma_re.search(line):
            if "/" in line:
                exp_lines = affix.expand_alts([line], "//")  # TODO: change this to some single-char splitter?
                try:
                    exp_lines = affix.expand_alts(exp_lines, "/")
                # NOTE(review): bare except; acceptable only because it re-raises.
                except:
                    print("Failed to expand", exp_lines, file=sys.stderr)
                    raise
            else:
                exp_lines = [ line ]

            if ":nv" in line and not "v_" in line:
                exp_lines = util.expand_nv(exp_lines)

            out_lines.extend( exp_lines )

            # Already fully expanded: skip the "word flags" handling below.
            continue

        # word tags
        # word /flags [mods] [tags]
        try:
            word, flags = line.split(" ", 1)
        # NOTE(review): bare except; acceptable only because it re-raises.
        except:
            print("Failed to find flags in", line, file=sys.stderr)
            raise

        main_word = word

        inflected_lines = expand(word, flags, flush_stdout)

        if sub_lines:
            # idx is the position of the sub-line, passed to expand_subposition().
            idx = 0
            for sub_line in sub_lines:
                # Work out the tags to pass along with each sub-line,
                # always without the ":compb" marker added above.
                if flags.startswith("adv:"):
                    # Everything after the leading "adv", i.e. from the colon on.
                    extra_flags = flags[3:].replace(":compb", "")
#                    util.dbg("sub_lines: %s, %s", flags, extra_flags)
                elif " :" in flags or flags.startswith(":"):
                    # First ":tag" token at the start of flags or after a space.
                    extra_flags = re_search("(^| )(:[^ ]+)", flags).group(2).replace(":compb", "")
#                    util.dbg("===", extra_flags)
                else:
                    extra_flags = ""

                if " adv" in line:
                    sublines = expand_subposition_adv_main(main_word, sub_line, extra_flags)
                else:
                    sublines = expand_subposition(main_word, sub_line, extra_flags, idx)

                out_lines.extend( sublines )

                if ".adv" in line and "/adj" in line:
                    # Use the first inflected line containing " adv" as the
                    # base for the adverb sub-forms; only that one is used.
                    for inflected_line in inflected_lines:
                        if " adv" in inflected_line:
                            last_adv = inflected_line.split()[0]
                            cs_lines = expand_subposition_adv(last_adv, sub_line, extra_flags)
                            out_lines.extend(cs_lines)
                            break
#                    print(".adv", last_adv, file=sys.stderr)

                idx += 1

        out_lines.extend( inflected_lines )

        # Sanity check: expand() must never produce a blank line.
        for l in inflected_lines:
            if not l.strip():
                raise Exception("empty liner", inflected_lines)

    return post_process(out_lines)
def adjust_affix_tags(lines, main_flag, flags, modifiers):
    """Rewrite case tags on expanded lines according to the entry's flags.

    Each line is adjusted by plain string replacement of its case tags
    (``v_naz``, ``v_rod``, ``v_dav``, ``v_zna``, ``v_kly`` -- presumably the
    nominative, genitive, dative, accusative and vocative markers; confirm
    against the tag set).  Which replacements apply depends on ``main_flag``
    (``/n...`` noun flags, ``/adj...`` adjective flags), on markers inside
    ``flags`` (``<``, ``<+``, ``>``, ``@``, ``^noun``, ``.a``, ``:perf`` ...)
    and on the helpers ``util.istota()`` / ``util.person()``.

    Args:
        lines: expanded lines; the code reads the first whitespace-separated
            token as the word form and the second as the base word.
        main_flag: the main affix flag; its second character selects the
            noun branch, a ``/adj`` prefix selects the adjective branch.
        flags: the full flags string of the entry.
        modifiers: not used by this function.

    Returns:
        A new list with the adjusted lines.  In the adjective branch, lines
        containing ``:uncontr`` are dropped when ``flags`` contains ``<`` or
        ``^noun``.
    """
    lines2 = []

    for line in lines:
        # DL-
        if main_flag[1] == "n":

            if main_flag.startswith("/n2") and re_search("^/n2[01234]", main_flag):
#                base_word = lines[0].split()[0]
                base_word = line.split()[1]

                if util.istota(flags):
                    # Add v_zna next to masculine v_rod unless it is already there.
                    if "m:v_rod" in line and not "/v_zna" in line:
                        line = line.replace("m:v_rod", "m:v_rod/v_zna")

                # Base word does not end in one of the listed vowels and the
                # entry has no ".a" flag: a form ending in "у"/"ю" gets
                # v_rod in addition to v_dav.
                if not base_word[-1:] in "аеєиіїоюя" and not ".a" in flags:
#                    util.dbg("```", main_flag, line)
                    word = line.split()[0]
                    if word[-1:] in "ую":
                        logger.debug("u/rod %s - %s", line, base_word)
                        line = line.replace("v_dav", "v_rod/v_dav")

            if main_flag.startswith("/n2") and "@" in flags:
                word = line.split(" ", 1)[0]
                if word[-1:] in "ая" and "m:v_rod" in line:
                    line = line.replace("m:v_rod", "m:v_rod/v_zna")

            if not "np" in main_flag and not ".p" in main_flag and not "n2adj" in flags:
                if ":p:" in line:
                    # Only logged; the line itself is kept unchanged.
                    logger.debug("skipping line with p: " + line)
                elif "//p:" in line:
                    line = re_sub("//p:.*", "", line)
                    logger.debug("removing //p from: " + line)

            if "/v_kly" in line:
                if main_flag.startswith("/n1"): # e.g. Єремія /n10.ko.patr.<
                    base_word = line.split()[1]

                # base_word is read only when main_flag starts with "/n20",
                # in which case it was assigned in the "/n2[01234]" block above.
                if ("<+" in flags and not ":p:" in line) or not util.person(flags) \
                        or (not ":patr" in line and re_search("\\.k[eo]", flags)) \
                        or (":m:" in line and ("<+" in flags)) \
                        or (main_flag.startswith("/n20") and base_word.endswith("ло") and "v_dav" in line):
                    logger.debug("removing v_kly from: %s, %s", line, flags)
                    line = line.replace("/v_kly", "")

            if ".p" in main_flag or "np" in main_flag:
                if util.person(flags):
                    line = line.replace("p:v_naz", "p:v_naz/v_kly")

                if util.istota(flags):
                    line = line.replace("p:v_rod", "p:v_rod/v_zna")
                    if ">" in flags: # animal
                        line = line.replace("p:v_naz", "p:v_naz/v_zna")
                else:
                    line = line.replace("p:v_naz", "p:v_naz/v_zna")

        # Reached only when the noun branch above did not apply.
        elif ":perf" in flags and ":pres" in line:
            line = line.replace(":pres", ":futr")

        elif main_flag.startswith("/adj"):
            if "<" in flags or "^noun" in flags:
                if ":uncontr" in line:
                    # Dropped: the line is never appended to lines2.
                    continue

            if "<" in flags:
                if not ">" in flags and ":p:v_naz/v_zna" in line:
                    line = line.replace("v_naz/v_zna", "v_naz/v_kly")
                if ":m:v_naz" in line and not "<+" in flags:
                    line = line.replace("v_naz", "v_naz/v_kly")
            elif "^noun" in flags:
                # Strip v_zna from the combined v_rod/v_zna tag
                # (both branches apply the same replacement).
                if ":m:v_rod/v_zna" in line:
                    line = line.replace("v_rod/v_zna", "v_rod")
                elif ":p:v_rod/v_zna" in line:
                    line = line.replace("v_rod/v_zna", "v_rod")

#            if "<" in flags:
#                if util.person(flags):
#                    line = line.replace("p:v_naz", "p:v_naz/v_kly")
#
#                if util.istota(flags):
#                    line = line.replace("p:v_rod", "p:v_rod/v_zna")
#                    if ">" in flags: # animal
#                        line = line.replace("p:v_naz", "p:v_naz/v_zna")
#                else:
#                    line = line.replace("p:v_naz", "p:v_naz/v_zna")

        lines2.append(line)

    return lines2