def fixup_one_link(m):
  lemma, infl = m.groups()
  # Make sure to remove accents, cf. [[десе́ртный|десе́ртное]]
  lemma = rulib.remove_accents(re.sub("#Russian$", "", lemma))
  if rulib.remove_accents(infl) == lemma:
    return "[[%s]]" % infl
  return "[[%s|%s]]" % (lemma, infl)
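# Illustrative sketch (not from the original scripts): rulib.remove_accents
# strips the accent marks used in dictionary forms of Russian words. A minimal
# stand-in, assuming only combining acute/grave accents need removal, might be:

import re
import unicodedata

def remove_accents_sketch(text):
  # Decompose so accented vowels become base letter + combining mark,
  # drop U+0301 (acute) and U+0300 (grave), then recompose (ё survives,
  # since its U+0308 diaeresis is left alone).
  decomposed = unicodedata.normalize("NFD", text)
  stripped = re.sub(u"[\u0300\u0301]", u"", decomposed)
  return unicodedata.normalize("NFC", stripped)

# fixup_one_link is written as a re.sub callback over two-part links; the
# pattern below is an assumption for illustration:
#   re.sub(r"\[\[([^\[\]|]+)\|([^\[\]|]+)\]\]", fixup_one_link, text)
# turns "[[десе́ртный|десе́ртное]]" into "[[десертный|десе́ртное]]" and
# collapses "[[сто́л|сто́л]]" to "[[сто́л]]", since both sides de-accent to
# the same lemma.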
def process_template(pagetitle, index, template, ruparam, trparam, output_line,
    find_accents, verbose):
  origt = unicode(template)
  saveparam = ruparam
  def pagemsg(text):
    msg("Page %s %s: %s" % (index, pagetitle, text))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, semi_verbose)
  if semi_verbose:
    pagemsg("Processing template: %s" % unicode(template))
  if unicode(template.name) == "head":
    # Skip {{head}}. We don't want to mess with headwords.
    return False
  if isinstance(ruparam, list):
    ruparam, saveparam = ruparam
  if ruparam == "page title":
    val = pagetitle
  else:
    val = getparam(template, ruparam)
  valtr = getparam(template, trparam) if trparam else ""
  changed = False
  if find_accents:
    newval, newtr = find_accented(val, valtr, verbose, pagemsg, expand_text, origt)
    if newval != val or newtr != valtr:
      if ru.remove_accents(newval) != ru.remove_accents(val):
        pagemsg("WARNING: Accented page %s changed from %s in more than just accents, not changing" % (newval, val))
      else:
        changed = True
        addparam(template, saveparam, newval)
        if newtr:
          if not trparam:
            pagemsg("WARNING: Unable to change translit to %s because no translit param available (Cyrillic param %s): %s" % (newtr, saveparam, origt))
          elif unicode(template.name) in ["ru-ux"]:
            pagemsg("WARNING: Not changing or adding translit param %s=%s to ru-ux: origt=%s" % (trparam, newtr, origt))
          else:
            if valtr and valtr != newtr:
              pagemsg("WARNING: Changed translit param %s from %s to %s: origt=%s" % (trparam, valtr, newtr, origt))
            if not valtr:
              pagemsg("NOTE: Added translit param %s=%s to template: origt=%s" % (trparam, newtr, origt))
            addparam(template, trparam, newtr)
        elif valtr:
          pagemsg("WARNING: Template has translit %s but lookup result has none, leaving translit alone: origt=%s" % (valtr, origt))
        if check_need_accent(newval):
          output_line("Need accents (changed)")
        else:
          output_line("Found accents")
  if not changed and check_need_accent(val):
    output_line("Need accents")
  if changed:
    pagemsg("Replaced %s with %s" % (origt, unicode(template)))
  return ["auto-accent %s%s" % (newval, "//%s" % newtr if newtr else "")] if changed else False
def sort_aspect_pair(x, y):
  xpf, ximpf = x
  ypf, yimpf = y
  # First compare ignoring accents, so that влить goes before вли́ться,
  # then compare with accents so e.g. рассы́пать and рассыпа́ть are ordered
  # consistently.
  retval = compare_aspect_pair(rulib.remove_accents(xpf), rulib.remove_accents(ximpf),
    rulib.remove_accents(ypf), rulib.remove_accents(yimpf))
  if retval == 0:
    return compare_aspect_pair(xpf, ximpf, ypf, yimpf)
  else:
    return retval
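# Usage note (assumption, not shown in the excerpt): sort_aspect_pair is a
# Python 2 cmp-style comparator over (perfective, imperfective) pairs, so it
# would be applied as
#   pairs.sort(cmp=sort_aspect_pair)
# or, forward-portably,
#   from functools import cmp_to_key
#   pairs.sort(key=cmp_to_key(sort_aspect_pair))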
def process_arg_set(arg_set):
  if not arg_set:
    return
  offset = 0
  if re.search(r"^[a-f]'*(,[a-f]'*)*$", arg_set[offset]):
    offset = 1
  if len(arg_set) <= offset:
    return
  # Remove * meaning non-stressed
  lemma = re.sub(r"^\*", "", arg_set[offset])
  # Remove translit
  lemma = re.sub("//.*$", "", lemma)
  if not lemma:
    return
  headwords_separators = re.split(r"(\[\[.*?\]\]|[^ \-]+)", lemma)
  if headwords_separators[0] != "" or headwords_separators[-1] != "":
    pagemsg("WARNING: Found junk at beginning or end of headword, skipping: %s" % lemma)
    return
  wordind = 0
  for i in xrange(1, len(headwords_separators), 2):
    hword = headwords_separators[i]
    separator = headwords_separators[i+1]
    if i < len(headwords_separators) - 2 and separator != " " and separator != "-":
      pagemsg("WARNING: Separator after word #%s isn't a space or hyphen, can't handle: word=<%s>, separator=<%s>" % (wordind + 1, hword, separator))
      continue
    hword = hword.replace("#Russian", "")
    hword = rulib.remove_accents(blib.remove_right_side_links(hword))
    check_lemma(hword)
    wordind += 1
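# How the alternating split above works: because the re.split() pattern is
# capturing, the result alternates separator, word, separator, ..., with the
# words at the odd indices. For example:
#   re.split(r"(\[\[.*?\]\]|[^ \-]+)", u"вое́нно-морско́й флот")
#   -> [u"", u"вое́нно", u"-", u"морско́й", u" ", u"флот", u""]
# which is why the code checks that the first and last elements are empty and
# then steps through the odd indices.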
def add_links(m):
  prefix = m.group(1)
  if re.search(u"[гкх]о$", prefix):
    first = prefix[:-1] + u"ий"
  else:
    first = prefix[:-1] + u"ый"
  return u"[[%s|%s]]-[[%s]]" % (rulib.remove_accents(first), prefix, m.group(2))
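# Illustrative call (the pattern here is an assumption; the caller defines its
# own): add_links is a re.sub callback for hyphenated compounds whose first
# element ends in -о, e.g.
#   re.sub(u"^([^ -]+о)-([^ -]+)$", add_links, u"ру́сско-англи́йский")
#   # -> u"[[русский|ру́сско]]-[[англи́йский]]"
# The [гкх]о test picks the right adjective ending for the link target
# (-ий after velars, otherwise -ый).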
def process_decl(index, pagetitle, decl, forms, save, verbose):
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  if decl.startswith("{{ru-conj|"):
    tempcall = re.sub(r"^\{\{ru-conj", "{{ru-generate-verb-forms", decl)
  elif decl.startswith("{{ru-noun-table"):
    tempcall = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args", decl)
  else:
    pagemsg("WARNING: Unrecognized decl template, skipping: %s" % decl)
    return
  result = expand_text(tempcall)
  if not result:
    pagemsg("WARNING: Error generating forms, skipping")
    return
  args = blib.split_generate_args(result)
  for form in forms:
    if form in args:
      for formpagename in re.split(",", args[form]):
        formpagename = re.sub("//.*$", "", formpagename)
        formpagename = rulib.remove_accents(formpagename)
        formpage = pywikibot.Page(site, formpagename)
        if not formpage.exists():
          pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename)
        elif formpagename == pagetitle:
          pagemsg("WARNING: Attempt to delete dictionary form, skipping")
        else:
          text = unicode(formpage.text)
          if "Etymology 1" in text:
            pagemsg("WARNING: Found 'Etymology 1', skipping form %s" % formpagename)
          else:
            skip_form = False
            for m in re.finditer(r"^==([^=]*?)==$", text, re.M):
              if m.group(1) != "Russian":
                pagemsg("WARNING: Found entry for non-Russian language %s, skipping form %s" % (m.group(1), formpagename))
                skip_form = True
            if not skip_form:
              comment = "Delete erroneously created form of %s" % pagetitle
              if save:
                formpage.delete(comment)
              else:
                pagemsg("Would delete page %s with comment=%s" % (formpagename, comment))
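# Sketch of the assumed interchange format: {{ru-generate-verb-forms}} and
# {{ru-generate-noun-args}} are taken to expand to "name=value|name=value|..."
# strings, which blib.split_generate_args turns into a dict. A minimal
# equivalent under that assumption:

def split_generate_args_sketch(result):
  args = {}
  for arg in result.split("|"):
    name, value = arg.split("=", 1)
    args[name] = value
  return args

# split_generate_args_sketch(u"infinitive=вли́ть|past_m=вли́л")
# -> {u"infinitive": u"вли́ть", u"past_m": u"вли́л"}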
def process_verb_headword(htemp):
  # Look for either space-delimited words or bracket-delimited sections.
  words = [x for num, x in enumerate(re.split(r"([^\s\[\]]+|\[\[.*?\]\])",
    getparam(htemp, "1"))) if num % 2 == 1]
  for word in words:
    word = word.replace("#Russian", "")
    word = rulib.remove_accents(blib.remove_right_side_links(word))
    if "[" in word or "]" in word:
      pagemsg("WARNING: Found stray bracket in word %s in %s" % (word, unicode(htemp)))
    else:
      check_lemma(word)
def process_page(index, page, save, verbose):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) == "wikipedia":
      val = getparam(t, "1")
      newval = ru.remove_accents(val)
      if val != newval:
        pagemsg("Removing accents from 1= in {{wikipedia|...}}")
        notes.append("remove accents from 1= in {{wikipedia|...}}")
        t.add("1", newval)
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))
  new_text = unicode(parsed)
  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
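# Example of the transformation above: {{wikipedia|Росси́йская Федера́ция}}
# becomes {{wikipedia|Российская Федерация}}, since Wikipedia article titles
# carry no accent marks.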
def process_page(index, page, save, verbose, nouns, adjectives):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  pagemsg("Processing")
  if re.search(u"с[яь]$", pagetitle):
    pagemsg("Skipping reflexive verb")
    return
  text = unicode(page.text)
  parsed = blib.parse(page)
  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname == "ru-conj":
      if [x for x in t.params if unicode(x.value) == "or"]:
        pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
        continue
      conjtype = getparam(t, "2")
      tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t))
      result = expand_text(tempcall)
      if not result:
        pagemsg("WARNING: Error generating forms, skipping")
        continue
      args = blib.split_generate_args(result)
      if "infinitive" not in args: # e.g. обнимать
        pagemsg("WARNING: No infinitive")
        continue
      infinitive = args["infinitive"]
      if "," in infinitive:
        pagemsg("WARNING: Infinitive has multiple forms: %s" % infinitive)
        continue
      if "//" in infinitive:
        pagemsg("WARNING: Infinitive has translit: %s" % infinitive)
        continue
      ppp = form_ppp(conjtype, pagetitle, args)
      if not ppp:
        continue
      if ppp.endswith(u"тый"):
        verbal_noun = re.sub(u"тый$", u"тие", ppp)
        verbal_noun_suffix = u"тие"
        verbal_adj = re.sub(u"тый$", u"тельный", ppp)
        verbal_adj_suffix = u"тельный"
      elif ppp.endswith(u"ённый"):
        verbal_noun = re.sub(u"ённый$", u"ение", ppp)
        verbal_noun_suffix = u"ение"
        verbal_adj = re.sub(u"ённый$", u"ительный", ppp)
        verbal_adj_suffix = u"ительный"
      elif ppp.endswith(u"енный"):
        verbal_noun = re.sub(u"енный$", u"ение", ppp)
        verbal_noun_suffix = u"ение"
        verbal_adj = re.sub(u"енный$", u"ительный", ppp)
        verbal_adj_suffix = u"ительный"
      else:
        assert ppp.endswith(u"анный") or ppp.endswith(u"янный")
        verbal_noun = re.sub(u"нный$", u"ние", ppp)
        verbal_adj = re.sub(u"нный$", u"тельный", ppp)
        m = re.search(u"(.)нный$", ppp)
        suffix_start = m.group(1)
        verbal_noun_suffix = suffix_start + u"ние"
        verbal_adj_suffix = suffix_start + u"тельный"
      agent_noun = re.sub(u"ный$", "", verbal_adj)
      agent_noun_suffix = re.sub(u"ный$", "", verbal_adj_suffix)
      stressed_verbal_noun_suffix = re.sub(u"^([аяеи])", ur"\1́", verbal_noun_suffix)
      stressed_verbal_adj_suffix = re.sub(u"^([аяеи])", ur"\1́", verbal_adj_suffix)
      stressed_agent_noun_suffix = re.sub(u"ный$", "", stressed_verbal_adj_suffix)
      if conjtype.startswith("7"):
        stem = getparam(t, "4")
        if infinitive.endswith(u"ть"):
          stem = stem.replace(u"ё", u"е́")
        else:
          stem = rulib.make_unstressed_ru(stem)
        stem = rulib.remove_accents(infinitive) + "+alt1=" + stem + "-"
      elif conjtype.startswith("8"):
        stem = rulib.remove_accents(infinitive) + "+alt1=" + getparam(t, "3").replace(u"ё", u"е́") + "-"
      else:
        stem = rulib.remove_monosyllabic_accents(infinitive)
      if verbal_noun in nouns:
        stressed_noun = find_noun(verbal_noun, pagemsg, errandpagemsg, expand_text)
        if not stressed_noun:
          msg("%s no-etym FIXME" % verbal_noun)
        elif stressed_noun == -1:
          pagemsg("Would add etym for %s but already has one" % verbal_noun)
        else:
          if stressed_noun.endswith(stressed_verbal_noun_suffix):
            suffix = stressed_verbal_noun_suffix
          else:
            suffix = verbal_noun_suffix
          msg("%s %s+-%s no-etym verbal-noun" % (verbal_noun, stem, suffix))
      if agent_noun in nouns:
        stressed_noun = find_noun(agent_noun, pagemsg, errandpagemsg, expand_text)
        if stressed_noun == -1:
          pagemsg("Would add etym for %s but already has one" % agent_noun)
        else:
          msg(u"%s %s+-тель no-etym agent-noun" % (agent_noun, stem))
      if verbal_adj in adjectives:
        stressed_adj = find_adj(verbal_adj, pagemsg, errandpagemsg, expand_text)
        if stressed_adj == -1:
          pagemsg("Would add etym for %s but already has one" % verbal_adj)
        else:
          msg(u"%s %s+-тельный no-etym verbal-adj" % (verbal_adj, stem))
def process_line(index, line, add_passive_of, override_etym, save, verbose):
  def error(text):
    errmsg("ERROR: Processing line: %s" % line)
    errmsg("ERROR: %s" % text)
    assert False

  def check_stress(word):
    word = re.sub(r"\|.*", "", word)
    if word.startswith("-") or word.endswith("-"):
      # Allow unstressed prefix (e.g. разо-) and unstressed suffix (e.g. -овать)
      return
    if rulib.needs_accents(word, split_dash=True):
      error("Word %s missing an accent" % word)

  # Skip lines consisting entirely of comments
  if line.startswith("#"):
    return
  if line.startswith("!"):
    override_etym = True
    line = line[1:]

  # If the second element (the etymology) begins with raw:, allow spaces in the
  # remainder to be included as part of the second element.
  els = do_split(r"\s+", line, 1)
  if len(els) != 2:
    error("Expected two fields, saw %s" % len(els))
  if not els[1].startswith("raw:"):
    els = do_split(r"\s+", line)
  # Replace _ with space and \u with _
  els = [el.replace("_", " ").replace(r"\u", "_") for el in els]
  if len(els) != 2:
    error("Expected two fields, saw %s" % len(els))
  accented_term = els[0]
  term = rulib.remove_accents(accented_term)
  etym = els[1]
  pagetitle = term

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  # Handle etymology
  adjformtext = ""
  if etym == "?":
    error("Etymology consists of bare question mark")
  elif etym == "-":
    etymtext = "===Etymology===\n{{rfe|lang=ru}}\n\n"
  elif etym == "--":
    etymtext = ""
  elif re.search(r"^(part|adj|partadj)([fnp]):", etym):
    m = re.search(r"^(part|adj|partadj)([fnp]):(.*)", etym)
    forms = {"f":["nom|f|s"], "n":["nom|n|s", "acc|n|s"], "p":["nom|p", "in|acc|p"]}
    infleclines = ["# {{inflection of|lang=ru|%s||%s}}" % (m.group(3), form)
      for form in forms[m.group(2)]]
    if m.group(1) in ["adj", "partadj"]:
      adjinfltext = """===Adjective===
{{head|ru|adjective form|head=%s%s}}

%s\n\n""" % (headterm, trtext, "\n".join(infleclines))
    else:
      adjinfltext = ""
    if m.group(1) in ["part", "partadj"]:
      partinfltext = """===Participle===
{{head|ru|participle form|head=%s%s}}

%s\n\n""" % (headterm, trtext, "\n".join(infleclines))
    else:
      partinfltext = ""
    adjformtext = partinfltext + adjinfltext
    etymtext = ""
  else:
    if etym.startswith("acr:"):
      _, fullexpr, meaning = do_split(":", etym)
      etymtext = "{{ru-etym acronym of|%s||%s}}." % (fullexpr, meaning)
    elif etym.startswith("deverb:"):
      _, sourceterm = do_split(":", etym)
      etymtext = "Deverbal from {{m|ru|%s}}." % sourceterm
    elif etym.startswith("back:"):
      _, sourceterm = do_split(":", etym)
      etymtext = "{{back-form|lang=ru|%s}}" % sourceterm
    elif etym.startswith("raw:"):
      etymtext = re.sub(", *", ", ", re.sub("^raw:", "", etym))
    elif ":" in etym and "+" not in etym:
      if etym.startswith("?"):
        prefix = "Perhaps borrowed from "
        etym = re.sub(r"^\?", "", etym)
      elif etym.startswith("<<"):
        prefix = "Ultimately borrowed from "
        etym = re.sub(r"^<<", "", etym)
      else:
        prefix = "Borrowed from "
      m = re.search(r"^([a-zA-Z.-]+):(.*)", etym)
      if not m:
        error("Bad etymology form: %s" % etym)
      etymtext = "%s{{bor|ru|%s|%s}}." % (prefix, m.group(1), m.group(2))
    else:
      prefix = ""
      suffix = ""
      if etym.startswith("?"):
        prefix = "Perhaps from "
        suffix = "."
        etym = re.sub(r"^\?", "", etym)
      elif etym.startswith("<<"):
        prefix = "Ultimately from "
        suffix = "."
        etym = re.sub(r"^<<", "", etym)
      m = re.search(r"^([a-zA-Z.-]+):(.*)", etym)
      if m:
        langtext = "|lang1=%s" % m.group(1)
        etym = m.group(2)
      else:
        langtext = ""
      etymtext = "%s{{affix|ru|%s%s}}%s" % (prefix,
        "|".join(do_split(r"\+", re.sub(", *", ", ", etym))), langtext, suffix)
    etymbody = etymtext + "\n\n"
    etymtext = "===Etymology===\n" + etymbody

  if not etymtext:
    pagemsg("No etymology text, skipping")

  # Load page
  page = pywikibot.Page(site, pagetitle)
  if not blib.try_repeatedly(lambda: page.exists(), pagemsg, "check page existence"):
    pagemsg("Page doesn't exist, can't add etymology")
    return

  pagemsg("Adding etymology")
  notes = []
  pagetext = unicode(page.text)

  # Split into sections
  splitsections = re.split("(^==[^=\n]+==\n)", pagetext, 0, re.M)
  # Extract off pagehead and recombine section headers with following text
  pagehead = splitsections[0]
  sections = []
  for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:
      sections.append("")
    sections[-1] += splitsections[i]

  # Go through each section in turn, looking for existing Russian section
  for i in xrange(len(sections)):
    m = re.match("^==([^=\n]+)==$", sections[i], re.M)
    if not m:
      pagemsg("Can't find language name in text: [[%s]]" % (sections[i]))
    elif m.group(1) == "Russian":
      if override_etym:
        subsections = re.split("(^===+[^=\n]+===+\n)", sections[i], 0, re.M)
        replaced_etym = False
        for j in xrange(2, len(subsections), 2):
          if "==Etymology==" in subsections[j - 1] or "==Etymology 1==" in subsections[j - 1]:
            subsections[j] = etymbody
            replaced_etym = True
            break
        if replaced_etym:
          sections[i] = "".join(subsections)
          newtext = "".join(sections)
          notes.append("replace Etymology section in Russian lemma with manually specified etymology")
          break
      if "==Etymology==" in sections[i] or "==Etymology 1==" in sections[i]:
        errandpagemsg("WARNING: Already found etymology, skipping")
        return
      subsections = re.split("(^===+[^=\n]+===+\n)", sections[i], 0, re.M)
      insert_before = 1
      if "===Alternative forms===" in subsections[insert_before]:
        insert_before += 2
      subsections[insert_before] = etymtext + subsections[insert_before]
      sections[i] = "".join(subsections)
      if add_passive_of:
        active_term = rulib.remove_monosyllabic_accents(
          re.sub(u"с[яь]$", "", accented_term))
        sections[i] = re.sub(r"(^(#.*\n)+)",
          r"\1# {{passive of|lang=ru|%s}}\n" % active_term, sections[i], 1, re.M)
      newtext = pagehead + "".join(sections)
      notes.append("add (manually specified) Etymology section to Russian lemma")
      break
  else:
    errandpagemsg("WARNING: Can't find Russian section, skipping")
    return

  if newtext != pagetext:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (pagetext, newtext))
    assert notes
    comment = "; ".join(group_notes(notes))
    if save:
      blib.safe_page_save(page, comment, errandpagemsg)
    else:
      pagemsg("Would save with comment = %s" % comment)
def process_page(page, index, parsed):
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")

  override_pos = pages_pos.get(pagetitle, None)
  if override_pos:
    del pages_pos[pagetitle]

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  titlewords = split_words(pagetitle, True)
  saw_e = False
  for word in titlewords:
    if word.endswith(u"е") and not rulib.is_monosyllabic(word):
      saw_e = True
      break
  if not saw_e:
    pagemsg(u"No possible final unstressed -е in page title, skipping")
    return
  #if (" " in pagetitle or "-" in pagetitle) and not override_pos:
  #  pagemsg(u"WARNING: Space or hyphen in page title and probable final unstressed -е, not sure how to handle yet")
  #  return

  text = unicode(page.text)
  notes = []

  foundrussian = False
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  for j in xrange(2, len(sections), 2):
    if sections[j-1] == "==Russian==\n":
      if foundrussian:
        pagemsg("WARNING: Found multiple Russian sections, skipping page")
        return
      foundrussian = True

      subsections = re.split("(^===(?:Etymology|Pronunciation) [0-9]+===\n)", sections[j], 0, re.M)
      # If no separate etymology sections, add extra stuff at the beginning
      # to fit the pattern
      if len(subsections) == 1:
        subsections = ["", ""] + subsections

      subsections_with_ru_ipa_to_fix = set()
      subsections_with_ru_ipa = set()
      for k in xrange(0, len(subsections), 2):
        for t in blib.parse_text(subsections[k]).filter_templates():
          if unicode(t.name) == "ru-IPA":
            subsections_with_ru_ipa.add(k)
            if getparam(t, "pos"):
              pagemsg("Already has pos=, skipping template in section %s: %s" % (k // 2, unicode(t)))
            else:
              phon = (getparam(t, "phon") or getparam(t, "1") or pagetitle).lower()
              phonwords = split_words(phon, True)
              if len(phonwords) != len(titlewords):
                pagemsg("WARNING: #Words (%s) in phon=%s not same as #words (%s) in title" %
                  ((len(phonwords) + 1) // 2, phon, (len(titlewords) + 1) // 2))
                for i in xrange(0, len(phonwords), 2):
                  phonword = phonwords[i]
                  wordno = i // 2 + 1
                  if rulib.is_monosyllabic(phonword):
                    pagemsg("Skipping monosyllabic pronun %s (#%s) in section %s: %s" %
                      (phonword, wordno, k // 2, unicode(t)))
                  elif not phonword.endswith(u"е"):
                    pagemsg(u"Skipping pronun word %s (#%s) in section %s because doesn't end in -е" %
                      (phonword, wordno, k // 2))
                  else:
                    pagemsg("Found template that will be modified due to phonword %s (#%s) in section %s: %s" %
                      (phonword, wordno, k // 2, unicode(t)))
                    subsections_with_ru_ipa_to_fix.add(k)
              else:
                for i in xrange(0, len(phonwords), 2):
                  titleword = titlewords[i]
                  phonword = phonwords[i]
                  wordno = i // 2 + 1
                  if rulib.is_monosyllabic(phonword):
                    pagemsg("Skipping monosyllabic pronun %s (#%s) in section %s: %s" %
                      (phonword, wordno, k // 2, unicode(t)))
                  elif not titleword.endswith(u"е"):
                    pagemsg(u"Skipping title word %s (#%s) in section %s because doesn't end in -е" %
                      (titleword, wordno, k // 2))
                  elif re.search(u"([еия]|цы|е̂|[кгхцшжщч]а)" + rulib.DOTABOVE + "?$", phonword):
                    pagemsg("Found template that will be modified due to phonword %s, titleword %s (#%s) in section %s: %s" %
                      (phonword, titleword, wordno, k // 2, unicode(t)))
                    subsections_with_ru_ipa_to_fix.add(k)
                  elif not re.search(u"[еэѐ][" + rulib.AC + rulib.GR + rulib.CFLEX + rulib.DUBGR + "]?$", phonword):
                    pagemsg(u"WARNING: ru-IPA pronunciation word %s (#%s) doesn't end in [еэия] or е̂ or hard sibilant + [ыа] when corresponding titleword %s ends in -е, something wrong in section %s: %s" %
                      (phonword, wordno, titleword, k // 2, unicode(t)))
                  else:
                    pagemsg(u"Pronun word %s (#%s) with final -э or stressed vowel, ignoring in section %s: %s" %
                      (phonword, wordno, k // 2, unicode(t)))

      if not subsections_with_ru_ipa:
        pagemsg("No ru-IPA on page, skipping page")
        return
      if not subsections_with_ru_ipa_to_fix:
        pagemsg("No fixable ru-IPA on page, skipping page")
        return

      # If saw ru-IPA covering multiple etym sections, make sure we don't
      # also have pronuns inside the etym sections, and then treat as one
      # single section for the purposes of finding POS's
      if 0 in subsections_with_ru_ipa:
        if len(subsections_with_ru_ipa) > 1:
          pagemsg("WARNING: Saw ru-IPA in section 0 (covering multiple etym or pronun sections) and also inside etym/pronun section(s) %s; skipping page" %
            (",".join(str(k // 2) for k in subsections_with_ru_ipa if k > 0)))
          return
        subsections = ["", "", "".join(subsections)]
        subsections_with_ru_ipa_to_fix = {2}

      for k in subsections_with_ru_ipa_to_fix:
        pagemsg("Fixing section %s" % (k // 2))
        parsed = blib.parse_text(subsections[k])

        if override_pos:
          pos = override_pos
        else:
          pos = set()
          is_lemma = set()
          lemma = set()
          saw_acc = False
          saw_noun_form = False
          for t in parsed.filter_templates():
            def getp(param):
              return getparam(t, param)
            tname = unicode(t.name)
            if tname in ["ru-noun", "ru-proper noun"]:
              if getparam(t, "2") == "-":
                pagemsg("Found invariable noun: %s" % unicode(t))
                pos.add("inv")
              else:
                pagemsg("Found declined noun: %s" % unicode(t))
                pos.add("n")
              is_lemma.add(True)
            elif tname in ["ru-noun+", "ru-proper noun+"]:
              for param in t.params:
                if re.search("^[0-9]+$", unicode(param.name)) and "+" in unicode(param.value):
                  pagemsg("Found declined adjectival noun, treating as adjective: %s" % unicode(t))
                  pos.add("a")
                  break
              else:
                pagemsg("Found declined noun: %s" % unicode(t))
                pos.add("n")
              is_lemma.add(True)
            elif tname == "comparative of" and getp("lang") == "ru":
              pagemsg("Found comparative: %s" % unicode(t))
              pos.add("com")
              is_lemma.add(False)
            elif tname == "ru-adv":
              pagemsg("Found adverb: %s" % unicode(t))
              pos.add("adv")
              is_lemma.add(True)
            elif tname == "ru-adj":
              pagemsg("Found adjective: %s" % unicode(t))
              pos.add("a")
              is_lemma.add(True)
            elif tname == "ru-noun form":
              pagemsg("Found noun form: %s" % unicode(t))
              saw_noun_form = True
              is_lemma.add(False)
            elif tname == "head" and getp("1") == "ru":
              if getp("2") == "verb form":
                pagemsg("Found verb form: %s" % unicode(t))
                pos.add("v")
                is_lemma.add(False)
              elif getp("2") in ["adjective form", "participle form"]:
                pagemsg("Found adjective form: %s" % unicode(t))
                pos.add("a")
                is_lemma.add(False)
              elif getp("2") == "noun form":
                pagemsg("Found noun form: %s" % unicode(t))
                saw_noun_form = True
                is_lemma.add(False)
              elif getp("2") == "pronoun form":
                pagemsg("Found pronoun form: %s" % unicode(t))
                pos.add("pro")
                is_lemma.add(False)
              elif getp("2") == "preposition":
                pagemsg("Found preposition: %s" % unicode(t))
                pos.add("p")
                is_lemma.add(True)
              elif getp("2") == "numeral":
                pagemsg("Found numeral: %s" % unicode(t))
                pos.add("num")
                is_lemma.add(True)
              elif getp("2") == "pronoun":
                pagemsg("Found pronoun: %s" % unicode(t))
                pos.add("pro")
                is_lemma.add(True)
            elif tname == "inflection of" and getp("lang") == "ru":
              is_lemma.add(False)
              lemma.add(rulib.remove_accents(getp("1")))
              if saw_noun_form:
                inflection_groups = []
                inflection_group = []
                for param in t.params:
                  if param.name in ["1", "2"]:
                    continue
                  val = unicode(param.value)
                  if val == ";":
                    if inflection_group:
                      inflection_groups.append(inflection_group)
                      inflection_group = []
                  else:
                    inflection_group.append(val)
                if inflection_group:
                  inflection_groups.append(inflection_group)
                for igroup in inflection_groups:
                  igroup = set(igroup)
                  is_plural = not not ({"p", "plural"} & igroup)
                  if is_plural and ({"nom", "nominative"} & igroup):
                    pagemsg("Found nominative plural case inflection: %s" % unicode(t))
                    pos.add("nnp")
                  elif {"acc", "accusative"} & igroup:
                    # We use "n" for misc cases, but skip accusative for now,
                    # adding "n" later if we haven't seen nnp to avoid problems
                    # below with the check for multiple pos's (nom pl and acc pl
                    # are frequently the same)
                    saw_acc = True
                  elif not is_plural and ({"pre", "prep", "prepositional"} & igroup):
                    pagemsg("Found prepositional singular case inflection: %s" % unicode(t))
                    pos.add("pre")
                  elif not is_plural and ({"dat", "dative"} & igroup):
                    pagemsg("Found dative singular case inflection: %s" % unicode(t))
                    pos.add("dat")
                  elif not is_plural and ({"loc", "locative"} & igroup):
                    pagemsg("Found locative singular case inflection: %s" % unicode(t))
                    pos.add("dat")
                  elif not is_plural and ({"voc", "vocative"} & igroup):
                    pagemsg("Found vocative case inflection: %s" % unicode(t))
                    pos.add("voc")
                  else:
                    pos.add("n")
            elif tname == "prepositional singular of" and getp("lang") == "ru":
              pagemsg("Found prepositional singular case inflection: %s" % unicode(t))
              pos.add("pre")
              is_lemma.add(False)
              lemma.add(getp("1"))
            elif tname == "dative singular of" and getp("lang") == "ru":
              pagemsg("Found dative singular case inflection: %s" % unicode(t))
              pos.add("dat")
              is_lemma.add(False)
              lemma.add(getp("1"))
            elif tname == "vocative singular of" and getp("lang") == "ru":
              pagemsg("Found vocative case inflection: %s" % unicode(t))
              pos.add("voc")
              is_lemma.add(False)
              lemma.add(getp("1"))
          if saw_acc and "nnp" not in pos:
            pos.add("n")
          if "dat" in pos and "pre" in pos:
            pagemsg("Removing pos=dat because pos=pre is found")
            pos.remove("dat")
          if "com" in pos:
            if "a" in pos:
              pagemsg("Removing pos=a because pos=com is found")
              pos.remove("a")
            if "adv" in pos:
              pagemsg("Removing pos=adv because pos=com is found")
              pos.remove("adv")
          if "a" in pos and "nnp" in pos:
            pagemsg("Removing pos=nnp because pos=a is found")
            pos.remove("nnp")
          if not pos:
            pagemsg("WARNING: Can't locate any parts of speech, skipping section")
            continue
          if len(pos) > 1:
            pagemsg("WARNING: Found multiple parts of speech, skipping section: %s" % ",".join(pos))
            continue
          pos = list(pos)[0]

          # If multiword term or potential adjectival term, can't trust
          # the part of speech coming from the above process
          if (" " in pagetitle or "-" in pagetitle or re.search(u"[ыиео]́?е$", pagetitle)):
            if not is_lemma:
              pagemsg("WARNING: Can't determine whether lemma or not, skipping section")
              continue
            if len(is_lemma) > 1:
              pagemsg("WARNING: Found both lemma and non-lemma parts of speech, skipping section")
              continue
            is_lemma = list(is_lemma)[0]
            if (" " in pagetitle or "-" in pagetitle) and is_lemma:
              pagemsg(u"WARNING: Space or hyphen in lemma page title and probable final unstressed -e, not sure how to handle yet, skipping section")
              continue
            # If is_lemma, we are a single-word adjective and will be handled
            # correctly by the above code
            if not is_lemma:
              if not lemma:
                pagemsg("WARNING: Non-lemma form and can't determine lemma, skipping section")
                continue
              if len(lemma) > 1:
                pagemsg("WARNING: Found inflections of multiple lemmas, skipping section: %s" % ",".join(lemma))
                continue
              lemma = list(lemma)[0]
              retval = find_noun_word_types(lemma, pagemsg)
              if not retval:
                continue
              word_types, seen_pos_specs = retval
              words = split_words(pagetitle, False)
              assert len(words) == len(word_types)
              modified_word_types = []
              need_to_continue = False
              # FIXME: Should we be using phonetic version of lemma?
              for wordno, (word, ty) in enumerate(zip(words, word_types)):
                if word.endswith(u"е") and not rulib.is_monosyllabic(word):
                  if ty == "inv":
                    if len(seen_pos_specs) > 1:
                      pagemsg(u"WARNING: In multiword term %s, found word %s ending in -е and marked as invariable and lemma has ambiguous pos= params (%s), not sure what to do, skipping section" %
                        (pagetitle, word, ",".join(seen_pos_specs)))
                      need_to_continue = True
                      break
                    elif not seen_pos_specs:
                      pagemsg(u"WARNING: In multiword term %s, found word %s ending in -е and marked as invariable and lemma has no pos= params, not sure what to do, skipping section" %
                        (pagetitle, word))
                      need_to_continue = True
                      break
                    else:
                      seen_pos_spec = list(seen_pos_specs)[0]
                      seen_poses = re.split("/", seen_pos_spec)
                      if len(seen_poses) == 1:
                        ty = seen_poses[0]
                      elif len(words) != len(seen_poses):
                        pagemsg(u"WARNING: In multiword term %s, found word %s ending in -е and marked as invariable and lemma param pos=%s has wrong number of parts of speech, not sure what to do, skipping section" %
                          (pagetitle, word, seen_pos_spec))
                        need_to_continue = True
                        break
                      else:
                        ty = seen_poses[wordno]
                  if not ty:
                    pagemsg("WARNING: Something wrong with retrieved pos= value from lemma, has blank value")
                    need_to_continue = True
                    break
                  if ty == "decln":
                    modified_word_types.append(pos)
                  else:
                    modified_word_types.append(ty)
                else:
                  modified_word_types.append("")
              if need_to_continue:
                continue
              non_blank_distinct_mwt = set(x for x in modified_word_types if x)
              if len(non_blank_distinct_mwt) == 0:
                pagemsg("WARNING: Something wrong, pos= would end up blank")
              elif len(non_blank_distinct_mwt) == 1:
                pos = list(non_blank_distinct_mwt)[0]
              else:
                pos = "/".join(modified_word_types)

        # Check whether there's a pronunciation with final -е for a given
        # word. There are some entries that have multiple pronunciations,
        # one with final -е and one with something else, e.g. final -и,
        # and we want to leave those alone with a warning.
        saw_final_e = {}
        for t in parsed.filter_templates():
          if unicode(t.name) == "ru-IPA":
            param = "phon"
            phon = getparam(t, param)
            if not phon:
              param = "1"
              phon = getparam(t, "1")
              if not phon:
                param = "pagetitle"
                phon = pagetitle
            if getparam(t, "pos"):
              pass # Already output msg
            else:
              phonwords = split_words(phon, True)
              for i in xrange(0, len(phonwords), 2):
                if re.search(u"е$", phonwords[i]):
                  saw_final_e[i] = True

        # Now modify the templates.
        for t in parsed.filter_templates():
          if unicode(t.name) == "ru-IPA":
            param = "phon"
            phon = getparam(t, param)
            if not phon:
              param = "1"
              phon = getparam(t, "1")
              if not phon:
                param = "pagetitle"
                phon = pagetitle
            origt = unicode(t)
            if getparam(t, "pos"):
              pass # Already output msg
            else:
              phonwords = split_words(phon, True)
              mismatched_phon_title = len(phonwords) != len(titlewords)
              for i in xrange(0, len(phonwords), 2):
                titleword = not mismatched_phon_title and titlewords[i]
                phonword = phonwords[i]
                lphonword = phonword.lower()
                wordno = i // 2 + 1
                if rulib.is_monosyllabic(phonword):
                  pass # Already output msg
                elif mismatched_phon_title:
                  pass # Can't canonicalize template
                elif not titleword.endswith(u"е"):
                  pass # Already output msg
                elif re.search(u"([еия]|цы|е̂|[кгхцшжщч]а)" + rulib.DOTABOVE + "?$", lphonword):
                  # Found a template to modify
                  if re.search(u"е" + rulib.DOTABOVE + "?$", lphonword):
                    pass # No need to canonicalize
                  else:
                    if saw_final_e.get(i, False):
                      pagemsg(u"WARNING: Found another pronunciation with final -е, skipping: phon=%s (word #%s)" %
                        (phonword, wordno))
                      continue
                    if re.search(u"и" + rulib.DOTABOVE + "?$", lphonword):
                      pagemsg(u"phon=%s (word #%s) ends in -и, will modify to -е in section %s: %s" %
                        (phonword, wordno, k // 2, unicode(t)))
                      notes.append(u"unstressed -и -> -е")
                    elif re.search(u"е̂$", lphonword):
                      # Make this a warning because we're not sure this is correct
                      pagemsg(u"WARNING: phon=%s (word #%s) ends in -е̂, will modify to -е in section %s: %s" %
                        (phonword, wordno, k // 2, unicode(t)))
                      notes.append(u"-е̂ -> -е")
                    elif re.search(u"я" + rulib.DOTABOVE + "?$", lphonword):
                      pagemsg(u"phon=%s (word #%s) ends in -я, will modify to -е in section %s: %s" %
                        (phonword, wordno, k // 2, unicode(t)))
                      notes.append(u"unstressed -я -> -е")
                    elif re.search(u"цы" + rulib.DOTABOVE + "?$", lphonword):
                      pagemsg(u"phon=%s (word #%s) ends in ц + -ы, will modify to -е in section %s: %s" %
                        (phonword, wordno, k // 2, unicode(t)))
                      notes.append(u"unstressed -ы after ц -> -е")
                    elif re.search(u"[кгхцшжщч]а" + rulib.DOTABOVE + "?$", lphonword):
                      pagemsg(u"phon=%s (word #%s) ends in unpaired cons + -а, will modify to -е in section %s: %s" %
                        (phonword, wordno, k // 2, unicode(t)))
                      notes.append(u"unstressed -а after unpaired cons -> -е")
                    else:
                      assert False, "Something wrong, strange ending, logic not correct: section %s, phon=%s (word #%s)" % (k // 2, phonword, wordno)
                    newphonword = re.sub(u"(?:[ияыа]|е̂)(" + rulib.DOTABOVE + "?)$", ur"е\1", phonword)
                    newphonword = re.sub(u"(?:[ИЯЫА]|Е̂)(" + rulib.DOTABOVE + "?)$", ur"Е\1", newphonword)
                    pagemsg("Modified phon=%s (word #%s) to %s in section %s: %s" %
                      (phonword, wordno, newphonword, k // 2, unicode(t)))
                    phonwords[i] = newphonword
              newphon = "".join(phonwords)
              if newphon != phon:
                assert param != "pagetitle", u"Something wrong, page title should not have -и or similar that needs modification: section %s, phon=%s, newphon=%s" % (k // 2, phon, newphon)
                if pos in ["voc", "inv", "pro"]:
                  pagemsg(u"WARNING: pos=%s may be unstable or inconsistent in handling final -е, please check change of phon=%s to %s in section %s: %s" %
                    (pos, phon, newphon, k // 2, unicode(t)))
                pagemsg("Modified phon=%s to %s in section %s: %s" %
                  (phon, newphon, k // 2, unicode(t)))
                if pos == "none":
                  pagemsg("WARNING: pos=none, should not occur, not modifying phon=%s to %s in section %s: %s" %
                    (phon, newphon, k // 2, unicode(t)))
                else:
                  t.add(param, newphon)
              if pos == "none":
                pagemsg("WARNING: pos=none, should not occur, not setting pos= in section %s: %s" %
                  (k // 2, unicode(t)))
              else:
                t.add("pos", pos)
                notes.append("added pos=%s%s" % (pos, override_pos and " (override)" or ""))
              pagemsg("Replaced %s with %s in section %s%s" %
                (origt, unicode(t), k // 2, override_pos and " (using override)" or ""))
        subsections[k] = unicode(parsed)
      sections[j] = "".join(subsections)

  new_text = "".join(sections)

  def fmt_key_val(key, val):
    if val == 1:
      return "%s" % key
    else:
      return "%s (%s)" % (key, val)

  if new_text != text:
    assert notes
    # Group identical notes together and append the number of such identical
    # notes if > 1, putting 'added pos=X' notes before others, so we get e.g.
    # "added pos=n (2); added pos=a; unstressed -и -> -е (2)" from five
    # original notes.
    # 1. Count items in notes[] and return a key-value list in descending order
    notescount = Counter(notes).most_common()
    # 2. Extract 'added pos=X' items; we put them first; note, descending order
    #    of # of times each note has been seen is maintained
    added_pos = [(x, y) for x, y in notescount if x.startswith("added pos=")]
    # 3. Extract other items
    not_added_pos = [(x, y) for x, y in notescount if not x.startswith("added pos=")]
    # 4. Recreate notes for 'added pos=X', then others
    notes = [fmt_key_val(x, y) for x, y in added_pos]
    notes.extend([fmt_key_val(x, y) for x, y in not_added_pos])

  return new_text, notes
els = do_split(r"\s+", line) if len(els) == 2 and els[1].startswith("altyo:"): altyoparts = do_split(":", els[1]) if len(altyoparts) != 3: error("Expected verb and aspect with altyo:") yoline = u"{{ru-verb-alt-ё|%s|%s}}" % (altyoparts[1], altyoparts[2]) msg("""%s ==Russian== ===Verb=== %s """ % (rulib.remove_accents(altyoparts[1]).replace(u"ё", u"е"), yoline)) continue # Replace _ with space, but not in the conjugation, where param names # may well have an underscore in them; but allow \s to stand for a space in # the conjugation, and \u to stand for an underscore elsewhere. els = [ el.replace(r"\s", " ") if i == 4 else el.replace("_", " ").replace( r"\u", "_") for i, el in enumerate(els) ] if len(els) < 5: error("Expected five fields, saw only %s" % len(els)) verb, etym, aspect, corverbs, conj = els[0], els[1], els[2], els[3], els[4] translit = None declverb = verb if "//" in verb:
def normalize_text(text):
  return rulib.remove_accents(blib.remove_links(text)).replace("'''", "")
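# Illustrative call, assuming blib.remove_links strips [[...]] markup and
# keeps the display text:
#   normalize_text(u"'''[[десе́ртный|десе́ртное]]'''")
#   # -> u"десертное" (links, bold markup and accents all removed)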
parser.add_argument("--nouns", action='store_true', help="Do derived nouns instead of adjectives") parser.add_argument("--adverbs", action='store_true', help="Do derived adverbs") parser.add_argument("--base-lemmafile", help="File containing base lemmas") parser.add_argument("--derived-lemmafile", help="File containing derived lemmas") args = parser.parse_args() start, end = blib.parse_start_end(args.start, args.end) derived_lemmas = [] if args.derived_lemmafile: derived_lemmas = [ rulib.remove_accents(x.strip()) for x in codecs.open(args.derived_lemmafile, "r", "utf-8") ] else: for i, page in blib.cat_articles( "Russian adverbs" if args.adverbs else "Russian nouns" if args. nouns else "Russian adjectives"): derived_lemmas.append(page.title()) if args.base_lemmafile: for i, pagename in blib.iter_items([ rulib.remove_accents(x.strip()) for x in codecs.open(args.base_lemmafile, "r", "utf-8") ]): page = pywikibot.Page(site, pagename) process_page(i, page, args.save, args.verbose, derived_lemmas)
def process_page(index, page, lemmas):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")

  pagetext = unicode(page.text)
  section = blib.find_lang_section_from_text(pagetext, "Russian", pagemsg)
  if not section:
    errandpagemsg("WARNING: Couldn't find Russian section")
    return
  if "==Etymology" in section:
    return
  if rulib.check_for_alt_yo_terms(section, pagemsg):
    return
  parsed = blib.parse_text(section)
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-participle of"]:
      pagemsg("Skipping participle")
      return
  saw_verb = False
  saw_passive = False
  saw_bad_passive = False
  for t in parsed.filter_templates():
    if unicode(t.name) in ["passive of", "passive form of"]:
      saw_passive = True
  if not saw_passive and ("passive of" in section or "passive form of" in section):
    saw_bad_passive = True
  splits = []
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-verb":
      saw_verb = True
      saw_paired_verb = False
      printed_msg = False
      heads = blib.fetch_param_chain(t, "1", "head") or [pagetitle]
      refl = heads[0].endswith(u"ся") or heads[0].endswith(u"сь")
      if refl:
        m = re.search(u"^(.*)(с[яь])$", heads[0])
        assert m
        transverb_no_passive = (False if (saw_passive or saw_bad_passive)
          else is_transitive_verb(rulib.remove_accents(m.group(1)), pagemsg, errandpagemsg))
        if (saw_passive or saw_bad_passive or transverb_no_passive):
          splits.append((heads, [m.group(1)], "%s+-%s" % (m.group(1), m.group(2)),
            "active-passive%s%s" % (saw_bad_passive and " (saw-bad-passive)" or "",
              transverb_no_passive and " (missing-passive-decl)" or "")))
          continue
      if getparam(t, "2").startswith("impf"):
        pfs = blib.fetch_param_chain(t, "pf", "pf")
        for otheraspect in pfs:
          if heads[0][0:2] == otheraspect[0:2]:
            saw_paired_verb = True
        if saw_paired_verb:
          splits.append((heads, pfs, ",".join(pfs), "paired-impf"))
          printed_msg = True
      if getparam(t, "2").startswith("pf"):
        prefixes = [u"взъ", u"вз", u"вс", u"возъ", u"воз", u"вос", u"вы́", u"въ",
          u"в", u"до", u"за", u"изъ", u"из", u"ис", u"на", u"объ", u"об", u"отъ",
          u"от", u"о", u"пере", u"подъ", u"под", u"по", u"предъ", u"пред", u"пре",
          u"при", u"про", u"разъ", u"раз", u"рас", u"съ", u"с", u"у"]
        for break_reflexives in [False, True]:
          head = heads[0]
          if break_reflexives:
            if not head.endswith(u"ся") and not head.endswith(u"сь"):
              break
            reflsuf = "+-" + head[-2:] # fetch reflexive suffix
            head = head[:-2] # drop reflexive suffix
          else:
            reflsuf = ""
          for prefix in prefixes:
            m = re.match("^(%s)(.*)$" % prefix, head)
            if m:
              base = rulib.remove_monosyllabic_accents(re.sub(u"^ы", u"и", m.group(2)))
              if rulib.remove_accents(base) in lemmas:
                base_to_do = base
              elif rulib.remove_accents("-" + base) in lemmas:
                base_to_do = "-" + base
              else:
                base_to_do = None
              if base_to_do:
                prefix = prefix.replace(u"ъ", "")
                if m.group(1) == u"вы́":
                  need_accent = "-NEED-ACCENT"
                else:
                  need_accent = ""
                splits.append((heads, [base_to_do],
                  "%s-+%s%s%s" % (prefix, base_to_do, reflsuf, need_accent),
                  "strip-prefix"))
                printed_msg = True
      if not printed_msg:
        msg("%s no-etym misc" % ",".join(heads))
  for derived_terms, base_terms, analysis, comment in splits:
    warnings = []
    base_terms_no_accent = []
    for term in base_terms:
      term = rulib.remove_accents(term)
      if term not in base_terms_no_accent:
        base_terms_no_accent.append(term)
    if len(base_terms_no_accent) > 1:
      errandpagemsg("WARNING: Multiple base pages %s for base lemmas %s" %
        (",".join(base_terms_no_accent), ",".join(base_terms)))
      continue
    if base_terms_no_accent[0] not in lemmas:
      continue
    derived_defns = rulib.find_defns(section)
    if not derived_defns:
      errandpagemsg("WARNING: Couldn't find definitions for derived term %s" %
        ",".join(derived_terms))
      continue
    base_section = blib.find_lang_section(base_terms_no_accent[0], "Russian",
      pagemsg, errandpagemsg)
    if not base_section:
      errandpagemsg("WARNING: Couldn't find Russian section for base term %s" %
        base_terms_no_accent[0])
      continue
    base_defns = rulib.find_defns(base_section)
    if not base_defns:
      errandpagemsg("WARNING: Couldn't find definitions for base term %s" %
        ",".join(base_terms))
      continue
    def concat_defns(defns):
      return ";".join(defns).replace("_", r"\u").replace(" ", "_")
    msg("%s %s%s no-etym %s %s //// %s" % (",".join(derived_terms), analysis,
      " WARNING:%s" % ",".join(warnings) if warnings else "", comment,
      concat_defns(base_defns), concat_defns(derived_defns)))
  if not saw_verb:
    msg("%s no-etym misc" % pagetitle)
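# Worked example of the prefix-stripping branch above: for the perfective
# head u"раскры́ть", the prefix u"рас" matches, leaving u"кры́ть", which
# remove_monosyllabic_accents reduces to u"крыть"; if u"крыть" is in lemmas,
# the split recorded is ("рас-+крыть", "strip-prefix"). A match on stressed
# u"вы́" would instead get "-NEED-ACCENT" appended, since the de-prefixed
# base has to be re-accented by hand.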
def process_page(page, index, parsed): global args pagetitle = unicode(page.title()) subpagetitle = re.sub("^.*:", "", pagetitle) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) pagemsg("Processing") if ":" in pagetitle: pagemsg("WARNING: Colon in page title, skipping") return def expand_text(tempcall): return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose) origtext = page.text parsed = blib.parse_text(origtext) # Find the declension arguments for LEMMA and inflected form INFL, # the WORDINDth word in the expression. Return value is a tuple of # four items: a list of (NAME, VALUE) tuples for the arguments, whether # the word is an adjective, the value of n= (if given), and the value # of a= (if given). def find_decl_args(lemma, infl, wordind): declpage = pywikibot.Page(site, lemma) if rulib.remove_accents(infl) == lemma: wordlink = "[[%s]]" % infl else: wordlink = "[[%s|%s]]" % (lemma, infl) if not declpage.exists(): if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma): pagemsg( "WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s" % (wordind, lemma, infl)) return [("1", wordlink), ("2", "+")], True, None, None else: pagemsg( "WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None parsed = blib.parse_text(declpage.text) decl_templates = [] headword_templates = [] decl_z_templates = [] for t in parsed.filter_templates(): tname = unicode(t.name) if tname in ["ru-noun-table", "ru-decl-adj"]: pagemsg("find_decl_args: Found decl template: %s" % unicode(t)) decl_templates.append(t) if tname in ["ru-noun", "ru-proper noun"]: pagemsg("find_decl_args: Found headword template: %s" % unicode(t)) headword_templates.append(t) if tname in ["ru-decl-noun-z"]: pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t)) decl_z_templates.append(t) if not decl_templates: if decl_z_templates: # {{ru-decl-noun-z|звезда́|f-in|d|ё}} # {{ru-decl-noun-z|ёж|m-inan|b}} if len(decl_z_templates) > 1: pagemsg( "WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None else: decl_z_template = decl_z_templates[0] headword_template = None pagemsg("find_decl_args: Using z-decl template: %s" % unicode(decl_z_template)) if len(headword_templates) == 0: pagemsg( "WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s" % (wordind, lemma, infl, unicode(decl_z_template))) elif len(headword_templates) > 1: pagemsg( "WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s" % (wordind, lemma, infl, unicode(decl_z_template))) else: headword_template = headword_templates[0] pagemsg( "find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s" % (wordind, lemma, infl, unicode(headword_template), unicode(decl_z_template))) decl_template = runounlib.convert_zdecl_to_ru_noun_table( decl_z_template, subpagetitle, pagemsg, headword_template=headword_template) decl_templates = [decl_template] elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [ x for x in headword_templates if getparam(x, "3") == "-" ]: return [("1", wordlink), ("2", "$")], False, None, None else: pagemsg( "WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s" % 
(wordind, lemma, infl)) return None if len(decl_templates) == 1: decl_template = decl_templates[0] else: # Multiple decl templates for t in decl_templates: if unicode(t.name) == "ru-decl-adj" and re.search( u"(ий|ый|ой)$", lemma): pagemsg( "WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s" % (wordind, lemma, infl)) decl_template = t break else: if lemma in use_given_decl: overriding_decl = use_given_decl[lemma] pagemsg( "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" % (wordind, overriding_decl, lemma, infl)) decl_template = blib.parse_text( overriding_decl).filter_templates()[0] elif pagetitle in use_given_page_decl: overriding_decl = use_given_page_decl[pagetitle].get( lemma, None) if not overriding_decl: pagemsg( "WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return else: pagemsg( "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" % (wordind, overriding_decl, lemma, infl)) decl_template = blib.parse_text( overriding_decl).filter_templates()[0] else: pagemsg( "WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None pagemsg("find_decl_args: Using decl template: %s" % unicode(decl_template)) if unicode(decl_template.name) == "ru-decl-adj": if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U): return [("1", wordlink), ("2", u"+ь")], True, None, None else: return [("1", wordlink), ("2", "+")], True, None, None # ru-noun-table assert unicode(decl_template.name) == "ru-noun-table" # Split out the arg sets in the declension and check the # lemma of each one, taking care to handle cases where there is no lemma # (it would default to the page name). highest_numbered_param = 0 for p in decl_template.params: pname = unicode(p.name) if re.search("^[0-9]+$", pname): highest_numbered_param = max(highest_numbered_param, int(pname)) # Now gather the numbered arguments into arg sets. Code taken from # ru-noun.lua. offset = 0 arg_sets = [] arg_set = [] for i in xrange(1, highest_numbered_param + 2): end_arg_set = False val = getparam(decl_template, str(i)) if i == highest_numbered_param + 1: end_arg_set = True elif val == "_" or val == "-" or re.search("^join:", val): pagemsg( "WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None elif val == "or": end_arg_set = True if end_arg_set: arg_sets.append(arg_set) arg_set = [] offset = i else: arg_set.append(val) canon_infl = rulib.remove_accents(infl).lower() canon_lemma = lemma.lower() ispl = False need_sc1 = False found_gender = None if canon_infl != canon_lemma: for sgend, plend, gender, is_sc1 in pl_data: if sgend: check_sgend = sgend else: check_sgend = consonant_re if re.search(check_sgend + "$", canon_lemma) and canon_infl == re.sub( sgend + "$", plend, canon_lemma): ispl = True found_gender = gender need_sc1 = is_sc1 break else: pagemsg( "WARNING: For word#%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None # Substitute the wordlink for any lemmas in the declension. # If plural, also add gender and verify special case (1) as necessary. 
# Concatenate all the numbered params, substituting the wordlink into # the lemma as necessary. numbered_params = [] for arg_set in arg_sets: lemma_arg = 0 if len(arg_set) > 0 and runounlib.arg1_is_stress(arg_set[0]): lemma_arg = 1 if len(arg_set) <= lemma_arg: arg_set.append("") arglemma = arg_set[lemma_arg] manualtr = "" if "//" in arglemma: arglemma, manualtr = re.search("^(.*?)(//.*?)$", arglemma).groups() if (not arglemma or arglemma.lower() == infl.lower() or rulib.is_monosyllabic(infl) and rulib.remove_accents(arglemma).lower() == rulib.remove_accents(infl).lower() or ispl and rulib.remove_accents(arglemma).lower() == lemma.lower()): arg_set[lemma_arg] = wordlink + manualtr else: pagemsg( "WARNING: Can't sub word link %s into decl lemma %s%s" % (wordlink, arg_set[lemma_arg], ispl and ", skipping" or "")) if ispl: return None if ispl: # Add the gender if len(arg_set) <= lemma_arg + 1: arg_set.append("") declarg = arg_set[lemma_arg + 1] # First, sub in gender m = re.search("(3f|[mfn])", declarg) if found_gender == "mf": if not m: pagemsg( u"WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s" % (wordinfl, lemma, infl)) return None decl_gender = m.group(1) if decl_gender == "n": pagemsg( u"WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s" % (wordinfl, lemma, infl)) return None elif decl_gender in ["m", "3f"]: pagemsg( u"Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" % (decl_gender, wordind, lemma, infl)) else: assert gender == "f" pagemsg( u"Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s" % (wordind, lemma, infl)) declarg = re.sub("f", "3f", declarg, 1) else: if m: decl_gender = m.group(1) if decl_gender == found_gender: pagemsg( "Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" % (found_gender, wordind, lemma, infl)) else: pagemsg( "WARNING: Found wrong gender %s in decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s" % (decl_gender, wordind, found_gender, lemma, infl)) declarg = re.sub("(3f|[mfn])", found_gender, declarg, 1) else: pagemsg( "No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s" % (wordind, found_gender, lemma, infl)) declarg = found_gender + declarg # Now check special case 1 if need_sc1 != ("(1)" in declarg): if need_sc1: pagemsg( "WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s" % (wordind, declarg, lemma, infl)) return None else: pagemsg( "WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s" % (wordind, declarg, lemma, infl)) return None arg_set[lemma_arg + 1] = declarg if numbered_params: numbered_params.append("or") numbered_params.extend(arg_set) # Now gather all params, including named ones. 
params = [] params.extend( (str(i + 1), val) for i, val in zip(xrange(len(numbered_params)), numbered_params)) num = None anim = None for p in decl_template.params: pname = unicode(p.name) val = unicode(p.value) if pname == "a": anim = val elif pname == "n": num = val elif pname == "notes": params.append((pname, val)) elif pname == "title": pagemsg( "WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s" % (wordind, lemma, infl, val)) elif re.search("^[0-9]+$", pname): pass else: keepparam = True if pname == "loc": if pagetitle in keep_locative: pagemsg( "Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s" % (wordind, val, lemma, infl)) else: pagemsg( "WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s" % (wordind, val, lemma, infl)) keepparam = False if pname == "par": pagemsg( "WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s" % (wordind, val, lemma, infl)) keepparam = False if pname == "voc": pagemsg( "WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s" % (wordind, val, lemma, infl)) keepparam = False if keepparam: if pname == "loc" and re.search(ur"^(на|в)\b", val, re.U): pagemsg( u"WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s" % (wordind, val, lemma, infl)) pname += str(wordind) params.append((pname, val))
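# The arg-set gathering in find_decl_args above mirrors the numbered-parameter
# handling in ru-noun.lua: values accumulate into the current set until an
# "or" separator (or the end of the numbered params) closes it, while "_",
# "-" and "join:" mark multiword declensions and abort the lookup. A minimal
# standalone sketch of the splitting step, assuming a plain list in place of
# the template object (data hypothetical):
def split_arg_sets(numbered_vals):
    # "or" closes the current arg set; the end of the list closes the last one.
    arg_sets, arg_set = [], []
    for val in numbered_vals:
        if val == "or":
            arg_sets.append(arg_set)
            arg_set = []
        else:
            arg_set.append(val)
    arg_sets.append(arg_set)
    return arg_sets

# E.g. the numbered params of a hypothetical {{ru-noun-table|b|звезда́|or|c|звезда́}}
# come out as [["b", u"звезда́"], ["c", u"звезда́"]].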
# has пистолет-пулемёт given as a single entry. We have a check below # to try to catch this case, because no inflected nouns will show up. for i in xrange(1, len(headwords_separators), 2): hword = headwords_separators[i] separator = headwords_separators[i + 1] if i < len(headwords_separators ) - 2 and separator != " " and separator != "-": pagemsg( "WARNING: Separator after word #%s isn't a space or hyphen, can't handle: word=<%s>, separator=<%s>" % (wordind + 1, hword, separator)) return # Canonicalize link in headword m = re.search(r"^\[\[([^\[\]|]+)\|([^\[\]|]+)\]\]$", hword) if m: lemma, infl = m.groups() lemma = rulib.remove_accents(re.sub("#Russian$", "", lemma)) if lemma == rulib.remove_accents(infl): hword = "[[%s]]" % infl else: hword = "[[%s|%s]]" % (lemma, infl) headwords.append(hword) separators.append(separator) wordind += 1 pagemsg("Found headwords: %s" % " @@ ".join(headwords)) # Get headword genders (includes animacy and number) genders = blib.fetch_param_chain(headword_template, "2", "g") genders_include_pl = len([x for x in genders if re.search(r"\bp\b", x)]) > 0
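# The genders_include_pl test above relies on gender codes being
# hyphen-separated (m-an-p, f-in, ...), so \b matches at the hyphens and a
# bare "p" (plural) component is found without substring false positives.
# In isolation (gender values hypothetical):
import re
genders = ["m-an-p", "f-in"]  # hypothetical g=/g2= values
assert any(re.search(r"\bp\b", g) for g in genders)  # m-an-p marks a plural headword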
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. import re, sys, codecs, argparse from blib import msg, errmsg import rulib parser = argparse.ArgumentParser( description="Find lemmas which would have forms saved.") parser.add_argument('--direcfile', help="File containing directives.") args = parser.parse_args() lemmas = set() for line in codecs.open(args.direcfile, "r", "utf-8"): line = line.strip() if "Would save with comment" in line: m = re.search( "Would save with comment.* (?:of|dictionary form) (.*?)(,| after| before| \(add| \(modify| \(update|$)", line) if not m: errmsg("WARNING: Unable to parse line: %s" % line) else: lemmas.add(rulib.remove_accents(m.group(1))) for lemma in sorted(lemmas): print lemma.encode('utf-8')
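# The regex above is keyed to the "Would save with comment" lines that the
# other scripts in this suite emit in dry-run mode; it captures the accented
# form between "of"/"dictionary form" and the trailing comma or parenthetical.
# A worked example on a hypothetical log line:
import re
line = u"Page 12 писа́ть: Would save with comment = auto-accent infinitive of писа́ть (add translit)"
m = re.search("Would save with comment.* (?:of|dictionary form) (.*?)(,| after| before| \(add| \(modify| \(update|$)", line)
assert m.group(1) == u"писа́ть"  # rulib.remove_accents() then yields писать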
#!/usr/bin/env python # -*- coding: utf-8 -*- import re, codecs, argparse from blib import msg import rulib parser = argparse.ArgumentParser(description="Make bare and list versions of 10,000-word frequency list from the Internet.") parser.add_argument('--file', help="File containing original list.") args = parser.parse_args() for line in codecs.open(args.file, "r", "utf-8"): line = line.strip() line = re.sub(" .*", "", line) line = rulib.remove_accents(line) if "/" in line: els = re.split("/", line) impf = els[0] msg(impf) for pf in els[1:]: if pf.endswith("-"): pf = re.sub("-$", impf, pf) msg(pf) else: msg(line)
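# Slash-separated entries give an imperfective followed by its perfectives,
# and a perfective written as a bare prefix plus hyphen is completed from the
# imperfective, as in the loop above. A worked example (entry hypothetical):
import re
els = re.split("/", u"писать/на-")  # count already stripped off
impf = els[0]
pfs = [re.sub("-$", impf, pf) if pf.endswith("-") else pf for pf in els[1:]]
assert pfs == [u"написать"]  # "на-" completed to the full perfective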
assert pos in pos_to_full_pos fullpos = pos_to_full_pos[pos] if len(altyoparts) == 2: yoline = u"{{ru-pos-alt-ё|%s|%s}}" % (altyoparts[1], fullpos.lower()) else: error("With misc. part of speech, gender/aspect not supported") msg("""%s ==Russian== ===%s=== %s """ % (rulib.remove_accents(altyoparts[1]).replace( u"ё", u"е"), pos_to_full_pos[pos], yoline)) continue # Replace _ with space, but not in the declension, where there may be # an underscore, e.g. a|short_m=-; but allow \s to stand for a space in # the declension, and \u for underscore elsewhere els = [ el.replace(r"\s", " ") if i == 2 and (pos in ["n", "pn", "adj"]) else el.replace("_", " ").replace(r"\u", "_") for i, el in enumerate(els) ] if pos not in ["n", "pn", "adj"]: term, etym, defns = els[0], els[1], els[2] remainder = els[3:] else: if len(els) < 4: error("Expected four fields, saw only %s" % len(els))
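# The escaping convention above lets directive fields carry literal spaces
# and underscores: \s stands for a space inside the declension field (where a
# real underscore such as short_m= must survive), while elsewhere _ becomes a
# space and \u becomes an underscore. In isolation (field values hypothetical,
# condition simplified to a noun-type directive where field 2 is the declension):
els = [u"те́рмин", u"-", u"a|short_m=-", u"my_defn"]
els = [el.replace(r"\s", " ") if i == 2 else el.replace("_", " ").replace(r"\u", "_")
       for i, el in enumerate(els)]
assert els[2] == u"a|short_m=-" and els[3] == u"my defn"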
def process_page(index, page, direc, delete_bad, fix_verbs, save, verbose): pagetitle = unicode(page.title()) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) pagemsg("WARNING: Script no longer applies and would need fixing up") return def expand_text(tempcall): return blib.expand_text(tempcall, pagetitle, pagemsg, verbose) pagemsg("Processing") text = unicode(page.text) parsed = blib.parse(page) notes = [] direc = direc.replace("3oa", u"3°a") for t in parsed.filter_templates(): origt = unicode(t) if unicode(t.name) in ["ru-conj"]: conjtype = getparam(t, "1") if not conjtype.startswith("3olda"): continue if conjtype.startswith("3olda") and conjtype != "3olda": pagemsg("WARNING: Found 3a-old with variant, can't process: %s" % unicode(t)) continue tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t)) result = expand_text(tempcall) if not result: pagemsg("WARNING: Error generating forms, skipping") continue oldargs = rulib.split_generate_args(result) rmparam(t, "6") rmparam(t, "5") rmparam(t, "4") t.add("1", direc) tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t)) result = expand_text(tempcall) if not result: pagemsg("WARNING: Error generating forms, skipping") continue if delete_bad: newargs = rulib.split_generate_args(result) for form in ["past_m", "past_f", "past_n", "past_pl", "past_m_short", "past_f_short", "past_n_short", "past_pl_short"]: oldforms = re.split(",", oldargs[form]) if form in oldargs else [] newforms = re.split(",", newargs[form]) if form in newargs else [] for oldform in oldforms: if oldform not in newforms: formpagename = rulib.remove_accents(oldform) formpage = pywikibot.Page(site, formpagename) if not formpage.exists(): pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename) elif formpagename == pagetitle: pagemsg("WARNING: Attempt to delete dictionary form, skipping") else: text = unicode(formpage.text) if "Etymology 1" in text: pagemsg("WARNING: Found 'Etymology 1', skipping form %s" % formpagename) elif "----" in text: pagemsg("WARNING: Multiple languages apparently in form, skipping form %s" % formpagename) else: numinfls = len(re.findall(r"\{\{inflection of\|", text)) if numinfls < 1: pagemsg("WARNING: Something wrong, no 'inflection of' templates on page for form %s" % formpagename) elif numinfls > 1: pagemsg("WARNING: Multiple 'inflection of' templates on page for form %s, skipping" % formpagename) else: comment = "Delete erroneously created long form of %s" % pagetitle pagemsg("Existing text for form %s: [[%s]]" % ( formpagename, text)) if save: formpage.delete(comment) else: pagemsg("Would delete page %s with comment=%s" % (formpagename, comment)) notes.append("fix 3olda -> %s" % direc) newt = unicode(t) if origt != newt: pagemsg("Replaced %s with %s" % (origt, newt)) new_text = unicode(parsed) if new_text != text and fix_verbs: if verbose: pagemsg("Replacing <%s> with <%s>" % (text, new_text)) assert notes comment = "; ".join(notes) if save: pagemsg("Saving with comment = %s" % comment) page.text = new_text page.save(comment=comment) else: pagemsg("Would save with comment = %s" % comment)
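# The deletion pass above reduces to a set difference between the
# comma-separated form lists generated before and after the conjugation-type
# change: anything in the old list but not the new one is an erroneously
# created form. In isolation (form data hypothetical):
import re
oldargs = {"past_m": u"со́хнул,сох"}  # hypothetical ru-generate-verb-forms output
newargs = {"past_m": u"сох"}
form = "past_m"
oldforms = re.split(",", oldargs[form]) if form in oldargs else []
newforms = re.split(",", newargs[form]) if form in newargs else []
stale = [f for f in oldforms if f not in newforms]
assert stale == [u"со́хнул"]  # long past form slated for deletion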
def process_page(index, page, save, verbose, adverbs, all_derived_lemmas): pagetitle = unicode(page.title()) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) def errandpagemsg(txt): errandmsg("Page %s %s: %s" % (index, pagetitle, txt)) def expand_text(tempcall): return blib.expand_text(tempcall, pagetitle, pagemsg, verbose) pagemsg("Processing") # ending and whether final consonant is palatal endings = [ (u"ывать", False), (u"ивать", False), (u"ать", False), (u"ять", True), (u"еть", True), (u"ить", True), (u"нуть", False), (u"ия", True), (u"ие", True), (u"я", True), (u"е", True), (u"ь", True), (u"и", True), (u"а", False), (u"о", False), (u"ы", False), (u"ый", False), (u"ий", True), (u"ой", False), ] stems = [] for ending, is_palatal in endings: if pagetitle.endswith(ending): stem = re.sub(ending + "$", "", pagetitle) stems.append((stem, is_palatal)) if not stems: stems.append((pagetitle, False)) possible = [] def append_possible(stem_to_try, suffix): possible.append((stem_to_try.lower() + suffix, suffix)) # Try -ный/-ной, -ка, -ко for stem, palatal in stems: stems_to_try = [] def frob(stem): stem = first_palatalization(stem) if stem.endswith(u"л"): stem += u"ь" if re.search("[" + rulib.vowel + "]$", stem): stem += u"й" return stem to_try_1 = frob(stem) to_try_2 = rulib.dereduce_stem(stem, False) if to_try_2: to_try_2 = frob(rulib.remove_accents(to_try_2)) to_try_3 = rulib.dereduce_stem(stem, True) if to_try_3: to_try_3 = frob(rulib.remove_accents(to_try_3)) stems_to_try.append(to_try_1) if to_try_2: stems_to_try.append(to_try_2) if to_try_3 and to_try_3 != to_try_2: stems_to_try.append(to_try_3) for stem_to_try in stems_to_try: append_possible(stem_to_try, u"ный") append_possible(stem_to_try, u"ной") append_possible(stem_to_try, u"ский") append_possible(stem_to_try, u"ской") append_possible(stem_to_try, u"ник") append_possible(stem_to_try, u"чик") append_possible(stem_to_try, u"щик") append_possible(stem_to_try, u"ка") append_possible(stem_to_try, u"ко") append_possible(stem_to_try, u"ство") # Try -овый/-евый/-ёвый/-овой/-евой, -ик, -ок/-ек/-ёк for stem, palatal in stems: stems_to_try = [] stems_to_try.append(stem) reduced = rulib.reduce_stem(stem) if reduced: stems_to_try.append(reduced) for stem_to_try in stems_to_try: if stem_to_try.endswith(u"й"): stem_to_try = stem_to_try[:-1] append_possible(stem_to_try, u"овый") append_possible(stem_to_try, u"евый") append_possible(stem_to_try, u"ёвый") append_possible(stem_to_try, u"овой") append_possible(stem_to_try, u"евой") stem_to_try = first_palatalization(stem_to_try) append_possible(stem_to_try, u"еский") append_possible(stem_to_try, u"ический") append_possible(stem_to_try, u"ество") append_possible(stem_to_try, u"ик") append_possible(stem_to_try, u"ок") append_possible(stem_to_try, u"ек") append_possible(stem_to_try, u"ёк") append_possible(stem_to_try, u"ец") # If derived adverbs, try -о, -е, -и if adverbs: for stem, palatal in stems: stems_to_try = [] stems_to_try.append(stem) for stem_to_try in stems_to_try: append_possible(stem_to_try, u"о") append_possible(stem_to_try, u"е") append_possible(stem_to_try, u"и") would_output = False for possible_derived, suffix in possible: if possible_derived in all_derived_lemmas: would_output = True if not would_output: return text = unicode(page.text) if rulib.check_for_alt_yo_terms(text, pagemsg): return base_lemmas = [] for possible_derived, suffix in possible: if possible_derived in all_derived_lemmas: derived_section = blib.find_lang_section(possible_derived, "Russian", pagemsg, 
errandpagemsg) if not derived_section: errandpagemsg( "WARNING: Couldn't find Russian section for derived term %s" % possible_derived) continue if "==Etymology" in derived_section: pagemsg( "Skipping derived term %s because it already has an etymology" % possible_derived) continue derived_defns = rulib.find_defns(derived_section) if not derived_defns: errandpagemsg( "WARNING: Couldn't find definitions for derived term %s" % possible_derived) continue derived_parsed = blib.parse_text(derived_section) derived_lemmas = find_noun_lemmas( derived_parsed, possible_derived, errandpagemsg, lambda tempcall: blib.expand_text( tempcall, possible_derived, pagemsg, verbose)) for t in derived_parsed.filter_templates(): if tname(t) in ["ru-adj", "ru-adv"]: lemmas = blib.fetch_param_chain(t, "1", "head", possible_derived) trs = blib.fetch_param_chain(t, "tr", "tr") if trs: lemmas = [ "%s//%s" % (lemma, tr) for lemma, tr in zip(lemmas, trs) ] for lemma in lemmas: add_if_not(derived_lemmas, lemma) if not derived_lemmas: errandpagemsg("WARNING: No derived term lemmas for %s" % possible_derived) return if not base_lemmas: base_parsed = blib.parse_text(text) base_lemmas = find_noun_lemmas(base_parsed, pagetitle, errandpagemsg, expand_text) for t in base_parsed.filter_templates(): if tname(t) in ["ru-verb", "ru-adj"]: lemmas = blib.fetch_param_chain( t, "1", "head", pagetitle) trs = blib.fetch_param_chain(t, "tr", "tr") if trs: lemmas = [ "%s//%s" % (lemma, tr) for lemma, tr in zip(lemmas, trs) ] for lemma in lemmas: add_if_not(base_lemmas, lemma) if not base_lemmas: errandpagemsg("WARNING: No base lemmas") return base_lemmas = [ rulib.remove_monosyllabic_accents(x) for x in base_lemmas ] warnings = [] if len(base_lemmas) > 1: warnings.append("multiple-lemmas") if any("//" in lemma for lemma in base_lemmas): warnings.append("translit-in-lemma") base_section = blib.find_lang_section_from_text( text, "Russian", pagemsg) if not base_section: errandpagemsg( "WARNING: Couldn't find Russian section for base") return base_defns = rulib.find_defns(base_section) if not base_defns: errandpagemsg( "WARNING: Couldn't find definitions for base") return def concat_defns(defns): return ";".join(defns).replace("_", r"\u").replace(" ", "_") suffixes_with_stress = [] for suf in [ suffix, rulib.make_beginning_stressed_ru(suffix), rulib.make_ending_stressed_ru(suffix) ]: for derived_lemma in derived_lemmas: if derived_lemma.endswith(suf): add_if_not(suffixes_with_stress, suf) msg("%s %s+-%s%s no-etym possible-suffixed %s //// %s" % (",".join(derived_lemmas), ",".join(base_lemmas), ",".join(suffixes_with_stress), " WARNING:%s" % ",".join(warnings) if warnings else "", concat_defns(base_defns), concat_defns(derived_defns)))
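# Stripped to its core, the candidate generation above is guess-and-check:
# remove a recognized ending from the page title to get stems, re-attach
# derivational suffixes, and keep only candidates attested among the derived
# lemmas (the full code also reduces/dereduces stems and applies first
# palatalization). A compressed sketch, with hypothetical data:
import re

def guess_derived(pagetitle, endings, suffixes, all_derived_lemmas):
    stems = [re.sub(ending + "$", "", pagetitle)
             for ending in endings if pagetitle.endswith(ending)] or [pagetitle]
    return [stem + suf for stem in stems for suf in suffixes
            if stem + suf in all_derived_lemmas]

assert guess_derived(u"вода", [u"а"], [u"ный", u"ка"],
                     {u"водный", u"водка"}) == [u"водный", u"водка"]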
def process_page(index, page, direc, delete_bad, save, verbose): pagetitle = unicode(page.title()) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) pagemsg("WARNING: Script no longer applies and would need fixing up") return def expand_text(tempcall): return blib.expand_text(tempcall, pagetitle, pagemsg, verbose) pagemsg("Processing") text = unicode(page.text) parsed = blib.parse(page) notes = [] direc = direc.replace("3oa", u"3°a") for t in parsed.filter_templates(): origt = unicode(t) if unicode(t.name) in ["ru-conj"]: conjtype = getparam(t, "1") if not conjtype.startswith("3olda"): continue if conjtype.startswith("3olda") and conjtype != "3olda": pagemsg("WARNING: Found 3a-old with variant, can't process: %s" % unicode(t)) continue tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t)) result = expand_text(tempcall) if not result: pagemsg("WARNING: Error generating forms, skipping") continue oldargs = blib.split_generate_args(result) rmparam(t, "6") rmparam(t, "5") rmparam(t, "4") t.add("1", direc) tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t)) result = expand_text(tempcall) if not result: pagemsg("WARNING: Error generating forms, skipping") continue if delete_bad: newargs = blib.split_generate_args(result) for form in ["past_m", "past_f", "past_n", "past_pl", "past_m_short", "past_f_short", "past_n_short", "past_pl_short"]: oldforms = re.split(",", oldargs[form]) if form in oldargs else [] newforms = re.split(",", newargs[form]) if form in newargs else [] for oldform in oldforms: if oldform not in newforms: formpagename = rulib.remove_accents(oldform) formpage = pywikibot.Page(site, formpagename) if not formpage.exists(): pagemsg("WARNING: Form page %s doesn't exist, skipping" % formpagename) elif formpagename == pagetitle: pagemsg("WARNING: Attempt to delete dictionary form, skipping") else: text = unicode(formpage.text) if "Etymology 1" in text: pagemsg("WARNING: Found 'Etymology 1', skipping form %s" % formpagename) elif "----" in text: pagemsg("WARNING: Multiple languages apparently in form, skipping form %s" % formpagename) else: numinfls = len(re.findall(r"\{\{inflection of\|", text)) if numinfls < 1: pagemsg("WARNING: Something wrong, no 'inflection of' templates on page for form %s" % formpagename) elif numinfls > 1: pagemsg("WARNING: Multiple 'inflection of' templates on page for form %s, skipping" % formpagename) else: comment = "Delete erroneously created long form of %s" % pagetitle pagemsg("Existing text for form %s: [[%s]]" % ( formpagename, text)) if save: formpage.delete(comment) else: pagemsg("Would delete page %s with comment=%s" % (formpagename, comment)) notes.append("fix 3olda -> %s" % direc) newt = unicode(t) if origt != newt: pagemsg("Replaced %s with %s" % (origt, newt)) return unicode(parsed), notes
def process_page(index, num, save, verbose, params): comment = None notes = [] lemma = ru_num(num) pagetitle = rulib.remove_accents(lemma) newtext = generate_page(num) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) def errandpagemsg(txt): errandmsg("Page %s %s: %s" % (index, pagetitle, txt)) # Prepare to create page pagemsg("Creating entry") page = pywikibot.Page(site, pagetitle) # If invalid title, don't do anything. existing_text = blib.safe_page_text(page, errandpagemsg, bad_value_ret=None) if existing_text is None: return if not blib.safe_page_exists(page, errandpagemsg): # Page doesn't exist. Create it. pagemsg("Creating page") comment = "Create page for Russian numeral %s (%s)" % (lemma, num) page.text = newtext if verbose: pagemsg("New text is [[%s]]" % page.text) else: # Page does exist pagetext = existing_text # Split into sections splitsections = re.split("(^==[^=\n]+==\n)", pagetext, 0, re.M) # Extract off pagehead and recombine section headers with following text pagehead = splitsections[0] sections = [] for i in xrange(1, len(splitsections)): if (i % 2) == 1: sections.append("") sections[-1] += splitsections[i] # Go through each section in turn, looking for existing Russian section for i in xrange(len(sections)): m = re.match("^==([^=\n]+)==$", sections[i], re.M) if not m: pagemsg("Can't find language name in text: [[%s]]" % (sections[i])) elif m.group(1) == "Russian": # Extract off trailing separator mm = re.match(r"^(.*?\n)(\n*--+\n*)$", sections[i], re.S) if mm: # Note that this changes the number of sections, which is seemingly # a problem because the for-loop above calculates the end point # at the beginning of the loop, but is not actually a problem # because we always break after processing the Russian section. sections[i:i + 1] = [mm.group(1), mm.group(2)] if params.overwrite_page: if "==Etymology 1==" in sections[ i] and not params.overwrite_etymologies: errandpagemsg( "WARNING: Found ==Etymology 1== in page text, not overwriting, skipping form" ) return else: pagemsg("WARNING: Overwriting entire Russian section") comment = "Create Russian section for numeral %s (%s)" % ( lemma, num) sections[i] = newtext notes.append("overwrite section") break else: errandpagemsg( "WARNING: Not overwriting existing Russian section") return elif m.group(1) > "Russian": pagemsg("Exists; inserting before %s section" % (m.group(1))) comment = "Create Russian section and entry for numeral %s (%s); insert before %s section" % ( lemma, num, m.group(1)) sections[i:i] = [newtext, "\n----\n\n"] break else: # else of for loop over sections, i.e. no break out of loop pagemsg("Exists; adding section to end") comment = "Create Russian section and entry for numeral %s (%s); append at end" % ( lemma, num) if sections: sections[-1] = ensure_two_trailing_nl(sections[-1]) sections += ["----\n\n", newtext] else: if not params.overwrite_page: notes.append("formerly empty") if pagehead.lower().startswith("#redirect"): pagemsg("WARNING: Page is redirect, overwriting") notes.append("overwrite redirect") pagehead = re.sub( r"#redirect *\[\[(.*?)\]\] *(<!--.*?--> *)*\n*", r"{{also|\1}}\n", pagehead, 0, re.I) elif not params.overwrite_page: pagemsg("WARNING: No language sections in current page") sections += [newtext] # End of loop over sections in existing page; rejoin sections newtext = pagehead + ''.join(sections) if page.text != newtext: assert comment or notes # Eliminate sequences of 3 or more newlines, which may come from # ensure_two_trailing_nl(). 
Add comment if none, in case of existing page # with extra newlines. newnewtext = re.sub(r"\n\n\n+", r"\n\n", newtext) if newnewtext != newtext and not comment and not notes: notes = ["eliminate sequences of 3 or more newlines"] newtext = newnewtext if page.text == newtext: pagemsg("No change in text") elif verbose: pagemsg("Replacing <%s> with <%s>" % (page.text, newtext)) else: pagemsg("Text has changed") page.text = newtext # Executed whether creating new page or modifying existing page. # Check for changed text and save if so. notestext = '; '.join(notes) if notestext: if comment: comment += " (%s)" % notestext else: comment = notestext if page.text != existing_text: if save: pagemsg("Saving with comment = %s" % comment) blib.safe_page_save(page, comment, errandpagemsg) else: pagemsg("Would save with comment = %s" % comment)
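# The page is carved into language sections with a capturing re.split(), so
# the level-2 headers survive in the result: even-indexed pieces are body
# text, odd-indexed pieces are headers, and the loop re-glues each header
# onto the text that follows it. A minimal sketch of that split-and-recombine
# step (page text hypothetical):
import re
pagetext = "{{also|foo}}\n\n==Portuguese==\nporttext\n\n==Russian==\nrutext\n"
splitsections = re.split("(^==[^=\n]+==\n)", pagetext, 0, re.M)
pagehead = splitsections[0]  # "{{also|foo}}\n\n"
sections = []
for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:  # a captured "==Language==" header starts a new section
        sections.append("")
    sections[-1] += splitsections[i]
assert sections[1] == "==Russian==\nrutext\n"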
# work). We'll also have problems with e.g. пистолет-пулемёт Томпсона, # because the words are linked individually but the ru-decl-noun-see # has пистолет-пулемёт given as a single entry. We have a check below # to try to catch this case, because no inflected nouns will show up. for i in xrange(1, len(headwords_separators), 2): hword = headwords_separators[i] separator = headwords_separators[i+1] if i < len(headwords_separators) - 2 and separator != " " and separator != "-": pagemsg("WARNING: Separator after word #%s isn't a space or hyphen, can't handle: word=<%s>, separator=<%s>" % (wordind + 1, hword, separator)) return # Canonicalize link in headword m = re.search(r"^\[\[([^\[\]|]+)\|([^\[\]|]+)\]\]$", hword) if m: lemma, infl = m.groups() lemma = ru.remove_accents(re.sub("#Russian$", "", lemma)) if lemma == ru.remove_accents(infl): hword = "[[%s]]" % infl else: hword = "[[%s|%s]]" % (lemma, infl) headwords.append(hword) separators.append(separator) wordind += 1 pagemsg("Found headwords: %s" % " @@ ".join(headwords)) # Get headword genders (includes animacy and number) genders = blib.fetch_param_chain(headword_template, "2", "g") genders_include_pl = len([x for x in genders if re.search(r"\bp\b", x)]) > 0 # Extract lemmas and inflections for each word in headword
def process_page(index, page, save, verbose): pagetitle = unicode(page.title()) subpagetitle = re.sub("^.*:", "", pagetitle) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) pagemsg("Processing") if ":" in pagetitle: pagemsg("WARNING: Colon in page title, skipping") return def expand_text(tempcall): return blib.expand_text(tempcall, pagetitle, pagemsg, verbose) origtext = page.text parsed = blib.parse_text(origtext) # Find the declension arguments for LEMMA and inflected form INFL, # the WORDINDth word in the expression. Return value is a tuple of # four items: a list of (NAME, VALUE) tuples for the arguments, whether # the word is an adjective, the value of n= (if given), and the value # of a= (if given). def find_decl_args(lemma, infl, wordind): declpage = pywikibot.Page(site, lemma) if ru.remove_accents(infl) == lemma: wordlink = "[[%s]]" % infl else: wordlink = "[[%s|%s]]" % (lemma, infl) if not declpage.exists(): if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma): pagemsg("WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s" % (wordind, lemma, infl)) return [("1", wordlink), ("2", "+")], True, None, None else: pagemsg("WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None parsed = blib.parse_text(declpage.text) decl_templates = [] headword_templates = [] decl_z_templates = [] for t in parsed.filter_templates(): tname = unicode(t.name) if tname in ["ru-noun-table", "ru-decl-adj"]: pagemsg("find_decl_args: Found decl template: %s" % unicode(t)) decl_templates.append(t) if tname in ["ru-noun", "ru-proper noun"]: pagemsg("find_decl_args: Found headword template: %s" % unicode(t)) headword_templates.append(t) if tname in ["ru-decl-noun-z"]: pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t)) decl_z_templates.append(t) if not decl_templates: if decl_z_templates: # {{ru-decl-noun-z|звезда́|f-in|d|ё}} # {{ru-decl-noun-z|ёж|m-inan|b}} if len(decl_z_templates) > 1: pagemsg("WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None else: decl_z_template = decl_z_templates[0] headword_template = None pagemsg("find_decl_args: Using z-decl template: %s" % unicode(decl_z_template)) if len(headword_templates) == 0: pagemsg("WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s" % (wordind, lemma, infl, unicode(decl_z_template))) elif len(headword_templates) > 1: pagemsg("WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s" % (wordind, lemma, infl, unicode(decl_z_template))) else: headword_template = headword_templates[0] pagemsg("find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s" % (wordind, lemma, infl, unicode(headword_template), unicode(decl_z_template))) decl_template = runoun.convert_zdecl_to_ru_noun_table(decl_z_template, subpagetitle, pagemsg, headword_template=headword_template) decl_templates = [decl_template] elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [ x for x in headword_templates if getparam(x, "3") == "-"]: return [("1", wordlink), ("2", "$")], False, None, None else: pagemsg("WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) 
return None if len(decl_templates) == 1: decl_template = decl_templates[0] else: # Multiple decl templates for t in decl_templates: if unicode(t.name) == "ru-decl-adj" and re.search(u"(ий|ый|ой)$", lemma): pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s" % (wordind, lemma, infl)) decl_template = t break else: if lemma in use_given_decl: overriding_decl = use_given_decl[lemma] pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" % (wordind, overriding_decl, lemma, infl)) decl_template = blib.parse_text(overriding_decl).filter_templates()[0] elif pagetitle in use_given_page_decl: overriding_decl = use_given_page_decl[pagetitle].get(lemma, None) if not overriding_decl: pagemsg("WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return else: pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" % (wordind, overriding_decl, lemma, infl)) decl_template = blib.parse_text(overriding_decl).filter_templates()[0] else: pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None pagemsg("find_decl_args: Using decl template: %s" % unicode(decl_template)) if unicode(decl_template.name) == "ru-decl-adj": if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U): return [("1", wordlink), ("2", u"+ь")], True, None, None else: return [("1", wordlink), ("2", "+")], True, None, None # ru-noun-table assert unicode(decl_template.name) == "ru-noun-table" # Split out the arg sets in the declension and check the # lemma of each one, taking care to handle cases where there is no lemma # (it would default to the page name). highest_numbered_param = 0 for p in decl_template.params: pname = unicode(p.name) if re.search("^[0-9]+$", pname): highest_numbered_param = max(highest_numbered_param, int(pname)) # Now gather the numbered arguments into arg sets. Code taken from # ru-noun.lua. offset = 0 arg_sets = [] arg_set = [] for i in xrange(1, highest_numbered_param + 2): end_arg_set = False val = getparam(decl_template, str(i)) if i == highest_numbered_param + 1: end_arg_set = True elif val == "_" or val == "-" or re.search("^join:", val): pagemsg("WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None elif val == "or": end_arg_set = True if end_arg_set: arg_sets.append(arg_set) arg_set = [] offset = i else: arg_set.append(val) canon_infl = ru.remove_accents(infl).lower() canon_lemma = lemma.lower() ispl = False need_sc1 = False found_gender = None if canon_infl != canon_lemma: for sgend, plend, gender, is_sc1 in pl_data: if sgend: check_sgend = sgend else: check_sgend = consonant_re if re.search(check_sgend + "$", canon_lemma) and canon_infl == re.sub(sgend + "$", plend, canon_lemma): ispl = True found_gender = gender need_sc1 = is_sc1 break else: pagemsg("WARNING: For word #%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None # Substitute the wordlink for any lemmas in the declension. # If plural, also add gender and verify special case (1) as necessary. # Concatenate all the numbered params, substituting the wordlink into # the lemma as necessary. 
numbered_params = [] for arg_set in arg_sets: lemma_arg = 0 if len(arg_set) > 0 and runoun.arg1_is_stress(arg_set[0]): lemma_arg = 1 if len(arg_set) <= lemma_arg: arg_set.append("") arglemma = arg_set[lemma_arg] manualtr = "" if "//" in arglemma: arglemma, manualtr = re.search("^(.*?)(//.*?)$", arglemma).groups() if (not arglemma or arglemma.lower() == infl.lower() or ru.is_monosyllabic(infl) and ru.remove_accents(arglemma).lower() == ru.remove_accents(infl).lower() or ispl and ru.remove_accents(arglemma).lower() == lemma.lower() ): arg_set[lemma_arg] = wordlink + manualtr else: pagemsg("WARNING: Can't sub word link %s into decl lemma %s%s" % ( wordlink, arg_set[lemma_arg], ispl and ", skipping" or "")) if ispl: return None if ispl: # Add the gender if len(arg_set) <= lemma_arg + 1: arg_set.append("") declarg = arg_set[lemma_arg + 1] # First, sub in gender m = re.search("(3f|[mfn])", declarg) if found_gender == "mf": if not m: pagemsg(u"WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None decl_gender = m.group(1) if decl_gender == "n": pagemsg(u"WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None elif decl_gender in ["m", "3f"]: pagemsg(u"Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" % (decl_gender, wordind, lemma, infl)) else: assert gender == "f" pagemsg(u"Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s" % (wordind, lemma, infl)) declarg = re.sub("f", "3f", declarg, 1) else: if m: decl_gender = m.group(1) if decl_gender == found_gender: pagemsg("Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" % (found_gender, wordind, lemma, infl)) else: pagemsg("WARNING: Found wrong gender %s in decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s" % (decl_gender, wordind, found_gender, lemma, infl)) declarg = re.sub("(3f|[mfn])", found_gender, declarg, 1) else: pagemsg("No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s" % (wordind, found_gender, lemma, infl)) declarg = found_gender + declarg # Now check special case 1 if need_sc1 != ("(1)" in declarg): if need_sc1: pagemsg("WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s" % ( wordind, declarg, lemma, infl)) return None else: pagemsg("WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s" % ( wordind, declarg, lemma, infl)) return None arg_set[lemma_arg + 1] = declarg if numbered_params: numbered_params.append("or") numbered_params.extend(arg_set) # Now gather all params, including named ones. 
params = [] params.extend((str(i+1), val) for i, val in zip(xrange(len(numbered_params)), numbered_params)) num = None anim = None for p in decl_template.params: pname = unicode(p.name) val = unicode(p.value) if pname == "a": anim = val elif pname == "n": num = val elif pname == "notes": params.append((pname, val)) elif pname == "title": pagemsg("WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s" % (wordind, lemma, infl, val)) elif re.search("^[0-9]+$", pname): pass else: keepparam = True if pname == "loc": if pagetitle in keep_locative: pagemsg("Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s" % ( wordind, val, lemma, infl)) else: pagemsg("WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s" % ( wordind, val, lemma, infl)) keepparam = False if pname == "par": pagemsg("WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s" % ( wordind, val, lemma, infl)) keepparam = False if pname == "voc": pagemsg("WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s" % ( wordind, val, lemma, infl)) keepparam = False if keepparam: if pname == "loc" and re.search(ur"^(на|в)\b", val, re.U): pagemsg(u"WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s" % (wordind, val, lemma, infl)) pname += str(wordind) params.append((pname, val))
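# Named parameters that survive the filtering above get the word index
# appended, so per-word overrides such as loc= stay distinguishable when
# several single-word declensions are merged into one multi-word
# {{ru-noun-table}} call. In isolation (values hypothetical):
wordind = 2
params = [(pname + str(wordind), val) for pname, val in [("loc", u"в лесу́")]]
assert params == [("loc2", u"в лесу́")]  # locative now tied to word #2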
def find_decl_args(lemma, infl, wordind): declpage = pywikibot.Page(site, lemma) if ru.remove_accents(infl) == lemma: wordlink = "[[%s]]" % infl else: wordlink = "[[%s|%s]]" % (lemma, infl) if not declpage.exists(): if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma): pagemsg("WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s" % (wordind, lemma, infl)) return [("1", wordlink), ("2", "+")], True, None, None else: pagemsg("WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None parsed = blib.parse_text(declpage.text) decl_templates = [] headword_templates = [] decl_z_templates = [] for t in parsed.filter_templates(): tname = unicode(t.name) if tname in ["ru-noun-table", "ru-decl-adj"]: pagemsg("find_decl_args: Found decl template: %s" % unicode(t)) decl_templates.append(t) if tname in ["ru-noun", "ru-proper noun"]: pagemsg("find_decl_args: Found headword template: %s" % unicode(t)) headword_templates.append(t) if tname in ["ru-decl-noun-z"]: pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t)) decl_z_templates.append(t) if not decl_templates: if decl_z_templates: # {{ru-decl-noun-z|звезда́|f-in|d|ё}} # {{ru-decl-noun-z|ёж|m-inan|b}} if len(decl_z_templates) > 1: pagemsg("WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None else: decl_z_template = decl_z_templates[0] headword_template = None pagemsg("find_decl_args: Using z-decl template: %s" % unicode(decl_z_template)) if len(headword_templates) == 0: pagemsg("WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s" % (wordind, lemma, infl, unicode(decl_z_template))) elif len(headword_templates) > 1: pagemsg("WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s" % (wordind, lemma, infl, unicode(decl_z_template))) else: headword_template = headword_templates[0] pagemsg("find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s" % (wordind, lemma, infl, unicode(headword_template), unicode(decl_z_template))) decl_template = runoun.convert_zdecl_to_ru_noun_table(decl_z_template, subpagetitle, pagemsg, headword_template=headword_template) decl_templates = [decl_template] elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [ x for x in headword_templates if getparam(x, "3") == "-"]: return [("1", wordlink), ("2", "$")], False, None, None else: pagemsg("WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None if len(decl_templates) == 1: decl_template = decl_templates[0] else: # Multiple decl templates for t in decl_templates: if unicode(t.name) == "ru-decl-adj" and re.search(u"(ий|ый|ой)$", lemma): pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s" % (wordind, lemma, infl)) decl_template = t break else: if lemma in use_given_decl: overriding_decl = use_given_decl[lemma] pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" % (wordind, overriding_decl, lemma, infl)) decl_template = blib.parse_text(overriding_decl).filter_templates()[0] elif pagetitle in 
use_given_page_decl: overriding_decl = use_given_page_decl[pagetitle].get(lemma, None) if not overriding_decl: pagemsg("WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return else: pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" % (wordind, overriding_decl, lemma, infl)) decl_template = blib.parse_text(overriding_decl).filter_templates()[0] else: pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None pagemsg("find_decl_args: Using decl template: %s" % unicode(decl_template)) if unicode(decl_template.name) == "ru-decl-adj": if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U): return [("1", wordlink), ("2", u"+ь")], True, None, None else: return [("1", wordlink), ("2", "+")], True, None, None
maintext = """{{ru-adv|%s%s}} %s """ % (term, trtext, defntext) else: full_pos = pos_to_full_pos[pos] maintext = """{{head|ru|%s|head=%s%s}} %s """ % (full_pos, full_pos.lower(), term, trtext, defntext) if defns == "--": maintext = "" # If both adjective and participle header, move related-terms text to level 3 if maintext and parttext and reltext: reltext = re.sub("^====Related terms====", "===Related terms===", reltext) msg("""%s %s==Russian== %s%s===Pronunciation=== %s %s===%s=== %s%s%s%s%s%s[[ru:%s]] """ % (rulib.remove_accents(term), alsotext, alttext, etymtext, prontext, parttext, pos_to_full_pos[pos], maintext, syntext, anttext, dertext, reltext, seetext, rulib.remove_accents(term)))
def find_accented_2(term, termtr, verbose, pagemsg): if term in accentless_multisyllable: pagemsg("Not accenting unaccented multisyllabic particle %s" % term) return term, termtr # This can happen if e.g. we're passed "[[FOO|BAR]] BAZ"; we will reject it, # but it will then be word-split and handled correctly ("[[FOO|BAR]]" is # special-cased in find_accented_1()). if "|" in term: #pagemsg("Can't handle links with vertical bars: %s" % term) return term, termtr # This can happen if e.g. we're passed "[[FOO]] [[BAR]]"; we will reject it, # but it will then be word-split and handled correctly ("[[FOO]]" is # special-cased in find_accented_1()). if "[" in term or "]" in term: #pagemsg("Can't handle stray bracket in %s" % term) return term, termtr if "<" in term or ">" in term: pagemsg("Can't handle stray < or >: %s" % term) return term, termtr if u"\u0301" in term or u"ё" in term: pagemsg(u"Term has accent or ё, not looking up accents: %s" % term) return term, termtr if ru.is_monosyllabic(term): pagemsg("Term is monosyllabic, not looking up accents: %s" % term) return term, termtr pagename = ru.remove_accents(term) # We can't use expand_text() from find_accented_1() because it has a # different value for PAGENAME, and the proper value is important in # expanding ru-noun+ and ru-proper noun+. def expand_text(tempcall): return blib.expand_text(tempcall, pagename, pagemsg, semi_verbose) # Look up the page if semi_verbose: pagemsg("find_accented: Finding heads on page %s" % pagename) cached_redirect = False global num_cache_lookups num_cache_lookups += 1 if pagename in accented_cache: global num_cache_hits num_cache_hits += 1 result = accented_cache[pagename] cached = True if result is None: if semi_verbose: pagemsg("find_accented: Page %s doesn't exist (cached)" % pagename) return term, termtr elif result == "redirect": cached_redirect = True heads = set() saw_head = False else: heads, saw_head = result else: cached = False page = pywikibot.Page(site, pagename) try: if not page.exists(): if semi_verbose: pagemsg("find_accented: Page %s doesn't exist" % pagename) if not global_disable_cache: accented_cache[pagename] = None return term, termtr except Exception as e: pagemsg("WARNING: Error checking page existence: %s" % unicode(e)) if not global_disable_cache: accented_cache[pagename] = None return term, termtr # Page exists, find the heads heads = set() def add(val, tr): val_to_add = blib.remove_links(val) if val_to_add: heads.add((val_to_add, tr)) saw_head = False for t in blib.parse(page).filter_templates(): tname = unicode(t.name) if tname in ru_head_templates: saw_head = True if getparam(t, "1"): add(getparam(t, "1"), getparam(t, "tr")) elif getparam(t, "head"): add(getparam(t, "head"), getparam(t, "tr")) elif tname == "head" and getparam(t, "1") == "ru": saw_head = True add(getparam(t, "head"), getparam(t, "tr")) elif tname in ["ru-noun+", "ru-proper noun+"]: saw_head = True lemma = ru.fetch_noun_lemma(t, expand_text) lemmas = re.split(",", lemma) lemmas = [split_ru_tr(lemma) for lemma in lemmas] # Group lemmas by Russian, to group multiple translits lemmas = ru.group_translits(lemmas, pagemsg, expand_text) for val, tr in lemmas: add(val, tr) if saw_head: for i in xrange(2, 10): headn = getparam(t, "head" + str(i)) if headn: add(headn, getparam(t, "tr" + str(i))) if not global_disable_cache: accented_cache[pagename] = (heads, saw_head) # We have the heads cached_msg = " (cached)" if cached else "" if len(heads) == 0: if not saw_head: if cached_redirect: pagemsg("Redirect without heads 
(cached)") elif not cached and re.match("#redirect", page.text, re.I): if not global_disable_cache: accented_cache[pagename] = "redirect" pagemsg("Redirect without heads") else: pagemsg("WARNING: Can't find any heads: %s%s" % (pagename, cached_msg)) return term, termtr if len(heads) > 1: pagemsg("WARNING: Found multiple heads for %s%s: %s" % (pagename, cached_msg, ",".join("%s%s" % (ru, "//%s" % tr if tr else "") for ru, tr in heads))) return term, termtr newterm, newtr = list(heads)[0] if semi_verbose: pagemsg("find_accented: Found head %s%s%s" % (newterm, "//%s" % newtr if newtr else "", cached_msg)) if re.search("[!?]$", newterm) and not re.search("[!?]$", term): newterm_wo_punc = re.sub("[!?]$", "", newterm) if ru.remove_accents(newterm_wo_punc) == ru.remove_accents(term): pagemsg("Removing punctuation from %s when matching against %s" % ( newterm, term)) newterm = newterm_wo_punc if ru.remove_accents(newterm) != ru.remove_accents(term): pagemsg("WARNING: Accented term %s differs from %s in more than just accents%s" % ( newterm, term, cached_msg)) return newterm, newtr