def process_page(page, index, parsed):
  """Add an explicit period after each {{place}} template that has no t=/t1=/t2=/t3=.

  Parameters:
    page: page object; only `page.title()` and `page.text` are read.
    index: page index, used only in log messages.
    parsed: parsed wikitext of the page; only `filter_templates()` is called.

  Returns:
    None for pages whose title starts with "Module:", otherwise a tuple
    `(text, notes)` with the possibly modified text and the list of change
    notes.

  The operation is not idempotent: running it again adds another period.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  text = unicode(page.text)
  if pagetitle.startswith("Module:"):
    return
  pagemsg("Processing")
  notes = []
  # WARNING: Not idempotent.
  to_add_period = []
  for t in parsed.filter_templates():
    if tname(t) == "place" and not any(t.has(param) for param in ("t", "t1", "t2", "t3")):
      to_add_period.append(unicode(t))
  for curr_template in to_add_period:
    newtext, did_replace = blib.replace_in_text(text, curr_template, curr_template + ".", pagemsg)
    if did_replace:
      # Undo the added period when the template was already followed by "." or ",".
      # A replacement *function* is used so that backslashes inside the template
      # text are inserted literally instead of being parsed as escapes/group refs.
      newtext = re.sub(re.escape(curr_template) + r"\.([.,])",
        lambda m: curr_template + m.group(1), newtext)
      # NOTE(review): nesting of this check under `if did_replace` is reconstructed
      # from collapsed source; confirm against the original layout.
      if newtext != text:
        notes.append("add period to {{place}} template (formerly automatically added)")
        text = newtext
  return text, notes
def add_adj_form_of(secbody, pos, comparative_superlative_t, ending):
  """Append an ===Adjective=== section with {{de-adj form of}} after a comparative/superlative template.

  Reads `pagetitle`, `pagemsg` and `notes` from the enclosing scope, which is
  not visible in this excerpt.

  Parameters:
    secbody: text of the section being edited.
    pos: "comparative" or "superlative"; selects the {{head|de|...}} part of
      speech and is used in the messages.
    comparative_superlative_t: the {{comparative of}}/{{superlative of}}
      template after which the new section is inserted; its param 2 is the lemma.
    ending: inflectional ending checked against the page title.

  Returns:
    `(secbody, ok)`. `ok` is False only when the text replacement failed. When
    lemma + ending does not match the page title, a warning is logged and
    `(secbody, True)` is returned with `secbody` unchanged.
  """
  lemma = getparam(comparative_superlative_t, "2")
  if check_if_lemma_and_ending_match_pagetitle(lemma, ending, pagetitle, allow_umlaut=False):
    form_pos = "superlative adjective form" if pos == "superlative" else "adjective form"
    # NOTE(review): the line breaks inside this wikitext literal are reconstructed
    # from whitespace-collapsed source; confirm the intended blank lines.
    newsec = """

===Adjective===
{{head|de|%s}}

# {{de-adj form of|%s}}""" % (form_pos, lemma)
    secbody, replaced = blib.replace_in_text(
      secbody, unicode(comparative_superlative_t),
      unicode(comparative_superlative_t) + newsec, pagemsg, abort_if_warning=True)
    if not replaced:
      # Report the template actually being processed (previously this printed an
      # unrelated outer variable, which is wrong for the superlative call).
      pagemsg("WARNING: Couldn't add -%s inflection, skipping: %s" %
        (ending, unicode(comparative_superlative_t)))
      return secbody, False
    notes.append("add {{de-adj form of}} for %s" % pos)
  else:
    pagemsg("WARNING: Lemma %s + %s ending %s doesn't match pagetitle" % (lemma, pos, ending))
  return secbody, True
def process_text_on_page(index, pagetitle, text):
  """Replace anchoring {{desc|ku|-}} lines with raw 'Kurdish:' and propagate bor=1.

  Walks the text line by line. A descendant line `{{desc|ku|-}}` (or desctree)
  is replaced by the literal "Kurdish:"; its bor= value is remembered, and
  while more deeply indented list lines follow, every other desc/desctree
  template on them gets bor=1 if that remembered value was non-empty.

  Returns `(newtext, notes)`.
  """
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")
  notes = []
  outlines = []
  kurdish_indent = None
  kurdish_borrowing = None

  for line in text.split("\n"):
    bullet = re.search("^([*]+:*)", line)
    if not bullet:
      # NOTE(review): pairing of this reset with the "not a list line" case is
      # reconstructed from collapsed source; confirm.
      kurdish_indent = None
      outlines.append(line)
      continue

    depth = len(bullet.group(1))
    if kurdish_indent and depth <= kurdish_indent:
      # Left the subtree below the Kurdish anchoring line.
      kurdish_indent = None

    if "{{desc|" in line or "{{desctree|" in line:
      parsed = blib.parse_text(line)
      for t in parsed.filter_templates():
        if tname(t) not in ["desc", "desctree"]:
          continue
        langcode = getparam(t, "1")
        if langcode == "ku":
          if getparam(t, "2") != "-":
            pagemsg(
              "WARNING: Saw real 'Kurdish' descendant rather than anchoring line: %s" % unicode(t))
            continue
          kurdish_indent = depth
          kurdish_borrowing = getparam(t, "bor")
          line, _ = blib.replace_in_text(line, unicode(t), "Kurdish:", pagemsg)
          notes.append("replace {{desc|ku}} with raw 'Kurdish:'")
        elif kurdish_indent and depth > kurdish_indent and kurdish_borrowing:
          t.add("bor", "1")
          line = unicode(parsed)
          notes.append("add bor=1 to Kurdish-language (%s) descendant" % langcode)

    outlines.append(line)

  return "\n".join(outlines), notes
def process_text_on_page(index, pagetitle, text):
  """Convert German adjective-form entries to {{de-adj form of}} and related templates.

  Only pages containing a {{head|de|adjective ... form}} or
  {{head|de|participle form}} headword are processed. Within the German
  section the function:
    * rewrites {{inflection of|de|...}} / {{infl of}} into {{de-adj form of}},
      deriving the ending from the inflection tags via `tags_to_ending`;
    * adjusts the part of speech in the {{head|de|...}} template;
    * adds a separate ===Adjective=== section with {{de-adj form of}} after a
      lone {{comparative of}}/{{superlative of}}, or a {{superlative of}}
      section before a superlative inflected form.

  Parameters:
    index: page index, used only in log messages.
    pagetitle: title of the page.
    text: full wikitext of the page.

  Returns:
    None when the page is skipped (a message is logged for every skip except the
    initial "no matching headword" check), otherwise `(text, notes)`.
  """
  global args

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  if not re.search(
      r"\{\{head\|de\|(adjective (|comparative |superlative )|participle )form", text):
    return
  pagemsg("Processing")
  notes = []
  retval = blib.find_modifiable_lang_section(text, "German", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find German section")
    return
  sections, j, secbody, sectail, has_non_lang = retval
  if re.search("== *Etymology 1 *==", secbody):
    pagemsg("WARNING: Multiple etymology sections, skipping")
    return
  parsed = blib.parse_text(secbody)
  headt = None
  comparative_of_t = None
  superlative_of_t = None
  inflection_of_t = None
  need_superlative_of_t_lemma = None
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)

    def do_comparative_superlative_of(pos, existing_t, should_end):
      # Validate a {{comparative of}}/{{superlative of}} template (the current
      # loop variable `t`) and fix up the head template's part of speech.
      # Returns `t` on success and False when the page must be skipped.
      if getparam(t, "1") != "de":
        pagemsg("WARNING: Saw wrong language in {{%s of}}, skipping: %s" % (pos, origt))
        return False
      if existing_t:
        pagemsg("WARNING: Saw two {{%s of}} templates, skipping: %s and %s" %
          (pos, unicode(existing_t), origt))
        return False
      if not headt:
        pagemsg("WARNING: Saw {{%s of}} without head template, skipping: %s" % (pos, origt))
        return False
      if not pagetitle.endswith(should_end):
        pagemsg("WARNING: Incorrect ending for %s, should be -%s, skipping" % (pos, should_end))
        return False
      param2 = getparam(headt, "2")
      if param2 != "%s adjective" % pos:
        headt.add("2", "%s adjective" % pos)
        notes.append("convert {{head|de|%s}} to {{head|de|%s adjective}}" % (param2, pos))
      return t

    if tn == "head" and getparam(t, "1") == "de" and getparam(t, "2") in [
        "adjective form", "adjective comparative form", "adjective superlative form",
        "participle form"]:
      if headt:
        pagemsg("WARNING: Saw two head templates, skipping: %s and %s" % (unicode(headt), origt))
        return
      headt = t
    elif tn == "head" and getparam(t, "1") == "de" and getparam(t, "2") == "verb form":
      pagemsg("Allowing and ignoring {{head|de|verb form}}: %s" % origt)
    elif tn == "head":
      pagemsg("WARNING: Saw unrecognized head template, skipping: %s" % origt)
      return
    elif tn == "comparative of":
      comparative_of_t = do_comparative_superlative_of("comparative", comparative_of_t, "er")
      if not comparative_of_t:
        return
    elif tn == "superlative of":
      superlative_of_t = do_comparative_superlative_of("superlative", superlative_of_t, "sten")
      if not superlative_of_t:
        return
    elif tn == "de-adj form of":
      pagemsg("Saw {{de-adj form of}}, assuming already converted: %s" % origt)
      return
    elif tn in ["inflection of", "infl of"]:
      if getparam(t, "1") != "de":
        pagemsg("WARNING: Saw wrong language in {{inflection of}}, skipping: %s" % origt)
        return
      if not headt:
        pagemsg("WARNING: Saw {{inflection of}} without head template, skipping: %s" % origt)
        return
      if inflection_of_t:
        pagemsg("WARNING: Saw {{inflection of}} twice, skipping: %s and %s" %
          (unicode(inflection_of_t), origt))
        return
      inflection_of_t = t
      lemma = getparam(t, "2")
      if getparam(t, "3"):
        pagemsg("WARNING: Saw alt form in {{inflection of}}, skipping: %s" % origt)
        return
      # Params 4 and up are the inflection tags; any named param is unsupported.
      infl_tags = []
      for param in t.params:
        pn = pname(param)
        pv = unicode(param.value)
        if not re.search("^[0-9]+$", pn):
          pagemsg("WARNING: Saw unrecognized param %s=%s in {{inflection of}}, skipping: %s" %
            (pn, pv, origt))
          return
        if int(pn) >= 4:
          infl_tags.append(pv)
      tags = "|".join(infl_tags)
      if tags not in tags_to_ending:
        pagemsg("WARNING: Saw unrecognized tags in {{inflection of}}, skipping: %s" % origt)
        return
      del t.params[:]
      ending = tags_to_ending[tags]
      if ending in ["sten", "esten"]:
        need_superlative_of_t_lemma = lemma
      blib.set_template_name(t, "de-adj form of")
      t.add("1", lemma)
      no_explicit = check_if_lemma_and_ending_match_pagetitle(
        lemma, ending, pagetitle, allow_umlaut=True)
      if not no_explicit:
        pagemsg("WARNING: Explicit ending %s required for lemma %s" % (ending, lemma))
        t.add("2", ending)
      notes.append("convert {{inflection of|de|...}} to {{de-adj form of}}")
      if "comd" in tags:
        param2 = getparam(headt, "2")
        if param2 != "comparative adjective form":
          headt.add("2", "comparative adjective form")
          notes.append(
            "convert {{head|de|%s}} to {{head|de|comparative adjective form}}" % param2)
      elif "supd" in tags:
        param2 = getparam(headt, "2")
        if param2 != "superlative adjective form":
          headt.add("2", "superlative adjective form")
          notes.append(
            "convert {{head|de|%s}} to {{head|de|superlative adjective form}}" % param2)

  secbody = unicode(parsed)

  def add_adj_form_of(secbody, pos, comparative_superlative_t, ending):
    # Insert a new ===Adjective=== section with {{de-adj form of}} right after
    # the given {{comparative of}}/{{superlative of}} template. Returns
    # (secbody, ok); ok is False only when the text replacement failed.
    lemma = getparam(comparative_superlative_t, "2")
    if check_if_lemma_and_ending_match_pagetitle(lemma, ending, pagetitle, allow_umlaut=False):
      form_pos = "superlative adjective form" if pos == "superlative" else "adjective form"
      # NOTE(review): the line breaks inside this wikitext literal are
      # reconstructed from whitespace-collapsed source; confirm the blank lines.
      newsec = """

===Adjective===
{{head|de|%s}}

# {{de-adj form of|%s}}""" % (form_pos, lemma)
      secbody, replaced = blib.replace_in_text(
        secbody, unicode(comparative_superlative_t),
        unicode(comparative_superlative_t) + newsec, pagemsg, abort_if_warning=True)
      if not replaced:
        # Log the template actually passed in; the old code always printed
        # comparative_of_t, which is None/wrong for the superlative call.
        pagemsg("WARNING: Couldn't add -%s inflection, skipping: %s" %
          (ending, unicode(comparative_superlative_t)))
        return secbody, False
      notes.append("add {{de-adj form of}} for %s" % pos)
    else:
      pagemsg("WARNING: Lemma %s + %s ending %s doesn't match pagetitle" % (lemma, pos, ending))
    return secbody, True

  if comparative_of_t and not inflection_of_t:
    secbody, ok = add_adj_form_of(secbody, "comparative", comparative_of_t, "er")
    if not ok:
      return
  if superlative_of_t and not inflection_of_t:
    secbody, ok = add_adj_form_of(secbody, "superlative", superlative_of_t, "sten")
    if not ok:
      return
  if inflection_of_t and not superlative_of_t and need_superlative_of_t_lemma:
    # NOTE(review): line breaks inside the two literals below are reconstructed
    # from whitespace-collapsed source; `cursec` must match the page text exactly.
    cursec = """===Adjective===
{{head|de|superlative adjective form}}

# %s""" % unicode(inflection_of_t)
    newsec = """===Adjective===
{{head|de|superlative adjective}}

# {{superlative of|de|%s}}

""" % need_superlative_of_t_lemma
    secbody, replaced = blib.replace_in_text(
      secbody, cursec, newsec + cursec, pagemsg, abort_if_warning=True)
    if not replaced:
      pagemsg("WARNING: Couldn't add {{superlative of}}, skipping: %s" % unicode(inflection_of_t))
      return
    notes.append("add {{superlative of|de|...}}")

  sections[j] = secbody + sectail
  text = "".join(sections)
  if not notes:
    pagemsg("WARNING: Couldn't convert page")
  return text, notes
def do_headword_template(headt, declts, pagetitle, subsections, subsection_with_head,
    subsection_with_declts, pagemsg):
  """Convert an old-style German noun headword (and its declension templates) to the new format.

  Builds a new {{de-noun|...}} / {{de-proper noun|...}} headword spec from the
  parameters of `headt`, and a {{de-ndecl|...}} spec from `declts`, checks that
  the two agree, and substitutes them into `subsections` in place.

  Parameters:
    headt: the headword template; treated as old-style if any of old=, 2=, 3=,
      4=, g1..g3, gen1..gen3, pl1..pl3 is set.
    declts: list of declension templates belonging to the headword (may be empty).
    pagetitle: page title (the lemma).
    subsections: list of subsection texts; modified in place.
    subsection_with_head: index into `subsections` of the text containing `headt`.
    subsection_with_declts: index into `subsections` of the text containing `declts`.
    pagemsg: logging callback taking one message string.

  Returns:
    The list of change notes on success (empty when a new-style headword is
    skipped), or None when a problem was found and logged.
  """
  notes = []

  def analyze_declts(declts, pagetitle, headword_gens, headword_pls):
    # Derive the {{de-ndecl}} spec from the declension templates. Returns
    # (declspec, genders, genitives, plurals), or None after logging a warning.
    # Reads `ss` from the enclosing function at call time.
    decl_genders_gens_and_pls = []
    prev_is_weak = None
    prev_is_sg = None
    for declt in declts:
      def getp(param):
        return getparam(declt, param)
      tn = tname(declt)
      # The gender is whatever follows the last hyphen of the template name.
      gender = re.sub(".*-", "", tn)
      if gender == "pl":
        gender = "p"
      decl_gens = []
      decl_pls = []
      if gender != "p":
        is_weak = False
        is_sg = False
        for param in ["head", "ns", "gs", "ds", "as", "bs", "vs", "np", "gp", "dp", "ap", "notes"]:
          if getp(param):
            pagemsg("WARNING: Saw %s=%s, can't handle yet: %s" %
              (param, getp(param), unicode(declt)))
            return None
        if gender in ["m", "n"]:
          arg1 = getp("1")
          if not arg1:
            gen = ""
          elif arg1 in ["n", "ns", "en", "ens"]:
            is_weak = True
            gen = arg1
          elif arg1 in ["s", "es", "ses", "(e)s", "(s)", "'"]:
            gen = arg1
          else:
            pagemsg("WARNING: Unrecognized arg1=%s: %s" % (arg1, unicode(declt)))
            return None
          decl_gens = convert_gens(pagetitle, [gen], from_decl=True)
        num = getp("n")
        if num == "sg":
          is_sg = True
        elif num not in ["full", ""]:
          pagemsg("WARNING: Unrecognized n=%s: %s" % (num, unicode(declt)))
          return None
        if not is_sg:
          if gender == "f":
            plsuffix = getp("1")
          else:
            plsuffix = getp("2")
          argpl = getp("pl")
          if argpl:
            pl = argpl
          else:
            pl = pagetitle + plsuffix
          if pl == "-":
            is_sg = True
          else:
            decl_pls = normalize_values([pl])
        # All declension templates must agree on weak-ness and singular-only-ness.
        if prev_is_weak is not None and prev_is_weak != is_weak:
          pagemsg("WARNING: Saw declension template with weak=%s different from previous weak=%s: %s" %
            (is_weak, prev_is_weak, declts_to_unicode(declts)))
          return None
        prev_is_weak = is_weak
        if prev_is_sg is not None and prev_is_sg != is_sg:
          pagemsg("WARNING: Saw declension template with sg=%s different from previous sg=%s: %s" %
            (is_sg, prev_is_sg, declts_to_unicode(declts)))
          return None
        prev_is_sg = is_sg
      decl_genders_gens_and_pls.append((gender, decl_gens, decl_pls))

    # Collect the distinct genders, genitives and plurals in order of appearance.
    all_decl_genders = []
    all_decl_gens = []
    all_decl_pls = []
    for decl_gender, decl_gens, decl_pls in decl_genders_gens_and_pls:
      if decl_gender not in all_decl_genders:
        all_decl_genders.append(decl_gender)
      for decl_gen in decl_gens:
        if decl_gen not in all_decl_gens:
          all_decl_gens.append(decl_gen)
      for decl_pl in decl_pls:
        if decl_pl not in all_decl_pls:
          all_decl_pls.append(decl_pl)
    first_gender, first_decl_gens, first_decl_pls = decl_genders_gens_and_pls[0]
    if len(all_decl_genders) > 1 and (
      len(all_decl_gens) != len(first_decl_gens) or len(all_decl_pls) != len(first_decl_pls)
    ):
      pagemsg("WARNING: Multiple declension templates with different genders as well as different either genitives or plurals: %s" %
        declts_to_unicode(declts))
      return None
    if len(all_decl_gens) != len(first_decl_gens) and len(all_decl_pls) != len(first_decl_pls):
      pagemsg("WARNING: Multiple declension templates with different both genitives and plurals: %s" %
        declts_to_unicode(declts))
      return None

    is_weak = prev_is_weak
    is_sg = prev_is_sg
    declspec = ":".join(all_decl_genders)

    def compute_part(declspec, headword_parts, all_decl_parts, get_default_part, desc):
      # Append ",<forms>" for the genitive or plural slot (or a bare "," when
      # the forms equal the per-gender defaults) and return the extended spec.
      defparts = []
      for gender in all_decl_genders:
        defpart = pagetitle + get_default_part(pagetitle, gender, is_weak)
        if defpart not in defparts:
          defparts.append(defpart)
      if all_decl_parts == defparts:
        declspec += ","
      else:
        all_decl_part_forms = analyze_forms(pagetitle, all_decl_parts, None)
        if set(headword_parts) == set(all_decl_parts):
          headword_part_forms = analyze_forms(pagetitle, headword_parts, None)
          if headword_part_forms != all_decl_part_forms:
            pagemsg("NOTE: Headword %s(s) %s same as all decl %s(s) %s but analyzed form(s) different (probably different ordering), preferring headword analyzed form(s) %s over decl analyzed form(s) %s: declts=%s" %
              (desc, ",".join(headword_parts), desc, ",".join(all_decl_parts),
               headword_part_forms, all_decl_part_forms, declts_to_unicode(declts)))
            all_decl_part_forms = headword_part_forms
        else:
          pagemsg("WARNING: Headword %s(s) %s not same as all decl %s(s) %s, continuing" %
            (desc, ",".join(headword_parts), desc, ",".join(all_decl_parts)))
        declspec += ",%s" % all_decl_part_forms
      return declspec

    if "m" in all_decl_genders or "n" in all_decl_genders:
      declspec = compute_part(declspec, headword_gens, all_decl_gens, get_default_gen, "genitive")
    if "p" not in all_decl_genders:
      declspec = compute_part(declspec, headword_pls, all_decl_pls, get_default_pl, "plural")
    # Drop trailing commas left by defaulted slots.
    declspec = re.sub(",*$", "", declspec)
    if is_weak:
      declspec += ".weak"
    if is_sg:
      declspec += ".sg"
    if ss:
      declspec += ".ss"
    return declspec, all_decl_genders, all_decl_gens, all_decl_pls

  old_style_headt = False
  for param in ["old", "2", "3", "4", "g1", "g2", "g3", "gen1", "gen2", "gen3",
      "pl1", "pl2", "pl3"]:
    if getparam(headt, param):
      old_style_headt = True
      break
  if not old_style_headt:
    pagemsg("NOTE: Skipping new-style headt=%s%s" %
      (unicode(headt), declts and ", declts=%s" % declts_to_unicode(declts) or ""))
    return notes

  is_proper = tname(headt) == "de-proper noun"
  ss = False
  if declts:
    sses = [not not getparam(declt, "ss") for declt in declts]
    if len(set(sses)) > 1:
      pagemsg("WARNING: Saw inconsistent values for ss= in decl templates: %s" %
        declts_to_unicode(declts))
      return
    ss = list(set(sses)) == [True]
    if ss:
      if not pagetitle.endswith(u"ß"):
        pagemsg(u"WARNING: Bad ss=1 setting for pagetitle not ending in -ß: %s" %
          declts_to_unicode(declts))
        return
      # If ss specified, pretend pagetitle ends in -ss, as it does in post-1996 spelling. Later on we add .ss to the
      # headword and declension specs.
      pagetitle = re.sub(u"ß$", "ss", pagetitle)

  adjectival = any(tname(t).startswith("de-decl-adj+noun") for t in declts)
  genders = blib.fetch_param_chain(headt, "1", "g")
  headword_genders = genders
  gens = normalize_values(blib.fetch_param_chain(headt, "2", "gen", True))
  pls = normalize_values(blib.fetch_param_chain(headt, "3", "pl"))
  dims = normalize_values(blib.fetch_param_chain(headt, "4", "dim"))
  fems = normalize_values(blib.fetch_param_chain(headt, "f"))
  mascs = normalize_values(blib.fetch_param_chain(headt, "m"))
  if gens == [True]:
    gens = []
  for param in headt.params:
    pn = pname(param)
    pv = unicode(param.value)
    # head= is tolerated only with an adjectival declension. This is an equality
    # test; the former `pn not in "head"` was a substring test that also let
    # through names such as "h" or "ea".
    if (pn not in ["1", "2", "3", "4", "m", "f", "old"] and
        not re.search("^(g|gen|pl|dim|m|f)[0-9]+$", pn) and
        (not adjectival or pn != "head")):
      pagemsg("WARNING: Unrecognized param %s=%s: %s" % (pn, pv, unicode(headt)))
      return
  if not genders:
    pagemsg("WARNING: No genders in head template: %s" % unicode(headt))
    return
  if "p" in genders and len(genders) > 1:
    pagemsg("WARNING: Saw gender 'p' and another gender: %s" % unicode(headt))
    return
  if "p" in genders and (gens or pls):
    pagemsg("WARNING: Saw genitive(s) or plural(s) with plural-only: %s" % unicode(headt))
    return
  saw_mn = "m" in genders or "n" in genders
  if not saw_mn and not adjectival:
    if gens and gens == [pagetitle]:
      gens = []
    if gens:
      pagemsg("WARNING: Saw genitive(s) with feminine-only gender: %s" % unicode(headt))
      return

  if adjectival:
    if len(declts) > 1:
      pagemsg("WARNING: Saw adjectival declension along with multiple declension templates, can't handle: %s" %
        declts_to_unicode(declts))
      return
    declt = declts[0]
    def getp(param):
      return getparam(declt, param)
    tn = tname(declt)
    m = re.search(r"^de-decl-adj\+noun(-sg)?-([mfn])$", tn)
    if m:
      default_equiv = None
      is_sg, gender = m.groups()
      adj = getp("1")
      noun = getp("2")
      if gender in ["m", "f"]:
        default_equiv = adj + ("e" if gender == "m" else "er")
        if noun:
          default_equiv += " " + construct_default_equiv(noun, gender)
      if gender in ["m", "n"]:
        noun_gen = getp("3")
        noun_pl = getp("4")
      else:
        noun_gen = "-"
        noun_pl = getp("3")
      noun_pl_full = getp("pl")
      adj_ending = "er" if gender == "m" else "e" if gender == "f" else "es"
      expected_lemma = adj + adj_ending
      if gender == "f":
        # Should be '-er' but we often see '-en' (weak form) instead
        expected_gens = [adj + "er", adj + "en"]
      else:
        expected_gens = [adj + "en"]
      if is_sg:
        expected_pls = []
      else:
        expected_pls = [adj + "e", adj + "en"]
      if not noun:
        if noun_gen != "-" or noun_pl_full or (noun_pl and noun_pl != "-"):
          pagemsg("WARNING: Bad parameters for adjectival noun: %s" % unicode(declt))
          return
        all_decl_genders = [gender]
      else:
        # Synthesize an ordinary noun declension template for the noun part so
        # that it can be run through analyze_declts().
        fake_declt = "{{de-decl-noun-%s%s|%s|pl=%s%s}}" % (
          gender, "" if gender == "f" else "|" + noun_gen, noun_pl, noun_pl_full,
          "|n=sg" if is_sg else "")
        fake_declt = list(blib.parse_text(fake_declt).filter_templates())[0]

        def analyze_headword_parts_for_noun(parts, desc):
          # Return the noun half of each "ADJ NOUN" headword form, or [] if any
          # form does not consist of exactly two space-separated words.
          noun_headword_parts = []
          for part in parts:
            m = re.search("^([^ ]+) ([^ ]+)$", part.strip())
            if not m:
              pagemsg("WARNING: Can't analyze headword %s '%s' into adjective and noun, continuing: head=%s, decl=%s" %
                (desc, part, unicode(headt), unicode(declt)))
              return []
            part_adj, part_noun = m.groups()
            noun_headword_parts.append(part_noun)
          return noun_headword_parts

        noun_headword_gens = analyze_headword_parts_for_noun(gens, "genitive")
        noun_headword_pls = analyze_headword_parts_for_noun(pls, "plural")
        retval = analyze_declts([fake_declt], noun, noun_headword_gens, noun_headword_pls)
        if retval is None:
          return
        declspec, all_decl_genders, all_decl_gens, all_decl_pls = retval
        expected_lemma = "%s %s" % (expected_lemma, noun)
        expected_gens = ["%s %s" % (expected_gen, gen)
          for expected_gen in expected_gens
          for gen in ([noun] if gender == "f" else all_decl_gens)]
        if is_sg:
          expected_pls = []
        else:
          expected_pls = ["%se %s" % (adj, pl) for pl in all_decl_pls]
      if pagetitle != expected_lemma:
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected lemma '%s' but saw '%s': head=%s, decl=%s" %
          (expected_lemma, pagetitle, unicode(headt), unicode(declt)))
        return
      if set(genders) != set(all_decl_genders):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected gender(s) '%s' but saw '%s': head=%s, decl=%s" %
          (",".join(all_decl_genders), ",".join(genders), unicode(headt), unicode(declt)))
        return
      if not (set(gens) <= set(expected_gens)):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected genitive(s) '%s' but saw '%s': head=%s, decl=%s" %
          (",".join(expected_gens), ",".join(gens), unicode(headt), unicode(declt)))
        return
      if pls == ["-"]:
        if expected_pls:
          pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected plural(s) '%s' but saw '%s': head=%s, decl=%s" %
            (",".join(expected_pls), ",".join(pls), unicode(headt), unicode(declt)))
          return
      elif not (set(pls) <= set(expected_pls)):
        pagemsg("WARNING: For adjectival noun or adjective-noun combination, expected plural(s) '%s' but saw '%s': head=%s, decl=%s" %
          (",".join(expected_pls), ",".join(pls), unicode(headt), unicode(declt)))
        return
      if not noun:
        declspec = "+"
        if is_sg:
          declspec += ".sg"
      else:
        if re.search("^" + CAP, adj):
          adj_lemma = adj.lower()
        else:
          adj_lemma = adj
        if adj_lemma in ["erst", "zweit", "dritt", "viert", u"fünft", "sechst", "siebent",
            "acht", "neunt", "zehnt"]:
          adj_lemma += "e"
        adj_form = adj + adj_ending
        if adj_form.startswith(adj_lemma):
          adj_link = "[[%s]]%s" % (adj_lemma, adj_form[len(adj_lemma):])
        else:
          adj_link = "[[%s|%s]]" % (adj_lemma, adj_form)
        noun_link = "[[%s]]" % noun
        # This is less accurate than the above. Often head= is wrong.
        # Try to update adjective and noun links from head= if given.
        #head = getparam(headt, "head")
        #if head:
        #  m = re.search("^([^ ]*) ([^ ]*)$", head)
        #  if not m:
        #    pagemsg("WARNING: Can't parse head=%s for adjective-noun combination, continuing: head=%s, decl=%s"
        #      % (head, unicode(headt), unicode(declt)))
        #  else:
        #    head_adj_link, head_noun_link = m.groups()
        #    m = re.search(r"\[\[([^][]*)\|([^][]*)\]\]$", head_adj_link)
        #    if m:
        #      adj_link_lemma, adj_link_form = m.groups()
        #      if adj_link_form.startswith(adj_link_lemma):
        #        head_adj_link = "[[%s]]%s" % (adj_link_lemma, adj_link_form[len(adj_link_lemma):])
        #    if head_adj_link != adj_link:
        #      pagemsg("NOTE: Head-derived adjective link %s not same as decl-template-derived adjective link %s, using the former: head=%s, decl=%s"
        #        % (head_adj_link, adj_link, unicode(headt), unicode(declt)))
        #      adj_link = head_adj_link
        #    if head_noun_link != noun_link:
        #      pagemsg("NOTE: Head-derived noun link %s not same as decl-template-derived noun link %s, using the former: head=%s, decl=%s"
        #        % (head_noun_link, noun_link, unicode(headt), unicode(declt)))
        #      noun_link = head_noun_link
        declspec = "%s<+> %s<%s>" % (adj_link, noun_link, declspec)
      headspec = declspec
      is_both = is_proper and not is_sg
    else:
      pagemsg("WARNING: Unrecognized decl template(s): %s" % declts_to_unicode(declts))
      return
  else: # not adjectival
    # Always bind default_equiv; previously it stayed unbound unless there was
    # exactly one gender that was "m" or "f", which made f=/m= handling below fail.
    default_equiv = None
    if len(genders) == 1 and genders[0] in ["m", "f"]:
      default_equiv = construct_default_equiv(pagetitle, genders[0])
    headspec = ":".join(genders)
    is_sg = False
    is_both = False
    is_weak = False
    headword_gens = []
    headword_pls = []
    if headspec != "p":
      pls = convert_pls(pagetitle, pls, is_proper=is_proper)
      headword_pls = pls
      if saw_mn:
        gens = convert_gens(pagetitle, gens)
        headword_gens = gens
        # Treat as weak when the single genitive is -(e)n(s) and the single
        # plural is "-" or -(e)n.
        if (len(gens) == 1 and any(gens[0] == pagetitle + ending for ending in ["n", "en", "ns", "ens"])
            and len(pls) == 1 and (pls[0] == "-" or
              any(pls[0] == pagetitle + ending for ending in ["n", "en"]))):
          is_weak = True
        def_gens = []
        for gender in genders:
          def_gen = pagetitle + get_default_gen(pagetitle, gender, is_weak)
          if def_gen not in def_gens:
            def_gens.append(def_gen)
        if set(def_gens) == set(gens):
          headspec += ","
        else:
          headspec += ",%s" % analyze_forms(pagetitle, gens, None)
      def_pls = []
      for gender in genders:
        def_pl = pagetitle + get_default_pl(pagetitle, gender, is_weak)
        if def_pl not in def_pls:
          def_pls.append(def_pl)
      if set(def_pls) == set(pls):
        headspec += ","
        if is_proper:
          is_both = True
      elif pls == ["-"]:
        is_sg = True
      else:
        headspec += ",%s" % analyze_forms(pagetitle, pls, None)
    headspec = re.sub(",*$", "", headspec)
    if is_weak:
      headspec += ".weak"
    if is_sg:
      headspec += ".sg"
    if ss:
      headspec += ".ss"

  extraspec = ""
  if dims:
    extraspec += "|dim=%s" % analyze_forms(pagetitle, dims, None, do_stem=True, joiner=",")
  if fems:
    extraspec += "|f=%s" % analyze_forms(pagetitle, fems, default_equiv, do_stem=True, joiner=",")
  if mascs:
    extraspec += "|m=%s" % analyze_forms(pagetitle, mascs, default_equiv, do_stem=True, joiner=",")

  if declts and not adjectival:
    retval = analyze_declts(declts, pagetitle, headword_gens, headword_pls)
    if retval is None:
      return
    declspec, all_decl_genders, all_decl_gens, all_decl_pls = retval
    if headspec != declspec:
      # The messages below show the last declension template. This used to read
      # `declt`, which was bound here only through the Python 2 list-comprehension
      # variable leak from the ss= check above; declts[-1] is the same object.
      last_declt = declts[-1]
      if set(all_decl_gens) <= set(headword_gens) and set(all_decl_pls) <= set(headword_pls):
        if set(all_decl_genders) == set(headword_genders):
          pagemsg("NOTE: Headword spec '%s' not same as declension spec '%s', but decl gens %s a subset of headword gens %s and decl pls %s a subset of headword pls %s and gender(s) %s agree: headt=%s, declt=%s" %
            (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens),
             ",".join(all_decl_pls), ",".join(headword_pls), ",".join(all_decl_genders),
             unicode(headt), unicode(last_declt)))
          declspec = headspec
        else:
          pagemsg("WARNING: Headword spec '%s' not same as declension spec '%s', decl gens %s a subset of headword gens %s and decl pls %s a subset of headword pls %s, but decl gender(s) %s don't agree with headword gender(s) %s: headt=%s, declt=%s" %
            (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens),
             ",".join(all_decl_pls), ",".join(headword_pls), ",".join(all_decl_genders),
             ",".join(headword_genders), unicode(headt), unicode(last_declt)))
          return
      else:
        pagemsg("WARNING: Headword spec '%s' not same as declension spec '%s' and either decl gens %s not a subset of headword gens %s or decl pls %s not a subset of headword pls %s, with decl gender(s) %s and headword gender(s) %s: headt=%s, declt=%s" %
          (headspec, declspec, ",".join(all_decl_gens), ",".join(headword_gens),
           ",".join(all_decl_pls), ",".join(headword_pls), ",".join(all_decl_genders),
           ",".join(headword_genders), unicode(headt), unicode(last_declt)))
        return

  if is_proper:
    headspec = headspec.replace(".sg", "")
  if is_both:
    # ".both" must precede ".ss" when both indicators are present.
    if ".ss" in headspec:
      headspec = headspec.replace(".ss", ".both.ss")
    else:
      headspec += ".both"
  newheadt = "{{de-%s|%s%s}}" % ("proper noun" if is_proper else "noun", headspec, extraspec)
  headt_outmsg = "convert %s to new-format %s" % (unicode(headt), newheadt)
  outmsg = "Would " + headt_outmsg
  if declts:
    newdeclt = "{{de-ndecl|%s}}" % declspec
    declt_outmsg = "convert %s to %s" % (declts_to_unicode(declts), newdeclt)
    outmsg += " and " + declt_outmsg
  pagemsg(outmsg)

  if unicode(headt) != newheadt:
    newsectext, replaced = blib.replace_in_text(subsections[subsection_with_head],
      unicode(headt), newheadt, pagemsg, abort_if_warning=True)
    if not replaced:
      return
    notes.append(headt_outmsg)
    subsections[subsection_with_head] = newsectext
  if declts:
    # All declension templates are expected to sit on consecutive lines.
    declts_existing = "\n".join(unicode(declt) for declt in declts)
    newsectext, replaced = blib.replace_in_text(subsections[subsection_with_declts],
      declts_existing, newdeclt, pagemsg, abort_if_warning=True)
    if not replaced:
      return
    notes.append(declt_outmsg)
    subsections[subsection_with_declts] = newsectext
  return notes
def process_page(page, index, line, respelling, orig_template, repl_template, args):
  """Replace a pronunciation template with {{fr-IPA}} and add pron= to {{fr-conj-auto}}.

  Parameters:
    page: page object; `page.title()` and `page.text` are read.
    index: page index, used only in log messages.
    line: the input line being processed, used only in log messages.
    respelling: "-" (skip), "" (skip with warning), "y" (no respelling
      needed) or a comma-separated list of respellings.
    orig_template: exact text of the template to replace.
    repl_template: proposed replacement; only a `|pos=...` argument is taken
      from it.
    args: not used in this function.

  Returns:
    None when the line is skipped or `orig_template` is not in the page,
    otherwise `(text, notes)`.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  if respelling == "-":
    pagemsg("Skipping line with respelling '-': %s" % line)
    return
  if respelling == "":
    pagemsg("WARNING: Skipping blank respelling: %s" % line)
    return

  notes = []
  text = unicode(page.text)
  if orig_template not in text:
    pagemsg("WARNING: Can't find original template %s in text" % orig_template)
    return

  # Find the whole line that contains the template, for the accent-spec check.
  line_match = re.search("^.*?%s.*$" % re.escape(orig_template), text, re.M)
  if line_match:
    textline = line_match.group(0)
  else:
    pagemsg("WARNING: Couldn't find template %s in page text" % orig_template)
    textline = "(unknown)"

  pos_match = re.search(r"(\|pos=[a-z]+)", repl_template)
  posarg = pos_match.group(1) if pos_match else ""
  no_respelling = respelling == "y"
  respellings = respelling.split(",")
  respellingarg = "" if no_respelling else "|" + "|".join(respellings)
  real_repl = "{{fr-IPA%s%s}}" % (respellingarg, posarg)

  if "{{a|" in textline:
    pagemsg("WARNING: Replacing %s with %s and saw accent spec on line: %s" %
      (orig_template, real_repl, textline))

  text, did_replace = blib.replace_in_text(text, orig_template, real_repl, pagemsg)
  if did_replace:
    notes.append("semi-manually replace %s with %s" % (orig_template, real_repl))

  # NOTE(review): whether the block below sits inside `if did_replace` is not
  # recoverable from the collapsed source; it is kept as a sibling here. Confirm.
  if not no_respelling:
    parsed = blib.parse_text(text)
    first_conj_auto = False
    for t in parsed.filter_templates():
      if tname(t) != "fr-conj-auto":
        continue
      if first_conj_auto:
        pagemsg("WARNING: Saw {{fr-conj-auto}} twice, first=%s, second=%s" %
          (first_conj_auto, unicode(t)))
      first_conj_auto = unicode(t)
      if getparam(t, "pron"):
        pagemsg("WARNING: Already saw pron= param: %s" % unicode(t))
        continue
      # An empty respelling stands for the page title itself.
      pronarg = ",".join(pron or pagetitle for pron in respellings)
      before = unicode(t)
      t.add("pron", pronarg)
      pagemsg("Replaced %s with %s" % (before, unicode(t)))
      notes.append("add pron=%s to {{fr-conj-auto}}" % pronarg)
    text = unicode(parsed)

  return text, notes
def process_page(page, index, parsed):
  """Move certain headword-template params into separate templates after/before the headword.

  For templates whose name is in the relevant module-level list and in
  `all_templates`:
    * Swedish verbs: `plural of=` becomes a preceding {{sv-obs verb pl}};
    * Swedish nouns: `obsoleted by=` becomes a preceding {{sv-obs noun form}};
    * Catalan: `val=`/`val2=` become a following {{ca-val}};
    * Dutch: `comp-of=`/`sup-of=` become following {{nc comp of}}/{{nc sup of}};
    * Greek: `active=`/`ta=` become a following {{nc pass of}};
    * Greek "move dot" templates: `nodot=` is removed, and a literal period is
      appended when it was not set.

  Parameters:
    page: page object; `page.title()` and `page.text` are read.
    index: page index, used only in log messages.
    parsed: parsed wikitext of the page; only `filter_templates()` is called.

  Returns:
    `(None, None)` for titles with a colon outside the recognized namespaces,
    otherwise `(text, notes)`.
  """
  pagetitle = unicode(page.title())

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")
  notes = []
  text = unicode(page.text)
  if ":" in pagetitle and not re.search(
      "^(Citations|Appendix|Reconstruction|Transwiki|Talk|Wiktionary|[A-Za-z]+ talk):",
      pagetitle):
    pagemsg("WARNING: Colon in page title and not a recognized namespace to include, skipping page")
    return None, None

  # (original template text, replacement text, change note)
  templates_to_replace = []
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in sv_verb_templates_with_plural_of and tn in all_templates:
      plural_of = getparam(t, "plural of")
      if plural_of:
        origt = unicode(t)
        rmparam(t, "plural of")
        newt = "{{sv-obs verb pl|%s}} %s" % (plural_of, unicode(t))
        templates_to_replace.append((origt, newt,
          "move plural of= in {{%s}} to {{sv-obs verb pl}} outside of template" % tn))
    if tn in sv_noun_templates_with_obsoleted_by and tn in all_templates:
      obsoleted_by = getparam(t, "obsoleted by")
      if obsoleted_by:
        origt = unicode(t)
        rmparam(t, "obsoleted by")
        newt = "{{sv-obs noun form|%s}} %s" % (obsoleted_by, unicode(t))
        # The note names the param actually moved (it used to say "plural of=",
        # copied from the verb case above).
        templates_to_replace.append((origt, newt,
          "move obsoleted by= in {{%s}} to {{sv-obs noun form}} outside of template" % tn))
    if tn in ca_templates_with_val and tn in all_templates:
      val = getparam(t, "val")
      val2 = getparam(t, "val2")
      if val:
        origt = unicode(t)
        rmparam(t, "val")
        rmparam(t, "val2")
        newt = "%s {{ca-val|%s%s}}" % (unicode(t), val, "|" + val2 if val2 else "")
        templates_to_replace.append((origt, newt,
          "move val= in {{%s}} to {{ca-val}} outside of template" % tn))
    if tn in nl_templates_with_comp_of_sup_of and tn in all_templates:
      comp_of = getparam(t, "comp-of")
      sup_of = getparam(t, "sup-of")
      if comp_of:
        comp_of = ", the {{nc comp of|nl|%s}}" % comp_of
      if sup_of:
        sup_of = ", the {{nc sup of|nl|%s}}" % sup_of
      if comp_of or sup_of:
        origt = unicode(t)
        rmparam(t, "comp-of")
        rmparam(t, "sup-of")
        newt = "%s%s%s" % (unicode(t), comp_of, sup_of)
        templates_to_replace.append((origt, newt,
          "move comp-of=/sup-of= in {{%s}} to {{nc comp of}}/{{nc sup of}} outside of template" % tn))
    if tn in el_templates_with_active and tn in all_templates:
      active = getparam(t, "active")
      ta = getparam(t, "ta")
      if active:
        origt = unicode(t)
        rmparam(t, "active")
        rmparam(t, "ta")
        newt = "%s, {{nc pass of|el|%s%s}}" % (unicode(t), active, "|t=" + ta if ta else "")
        templates_to_replace.append((origt, newt,
          "move active= in {{%s}} to {{nc pass of}} outside of template" % tn))
    if tn in el_templates_to_move_dot and tn in all_templates:
      origt = unicode(t)
      nodot = getparam(t, "nodot")
      rmparam(t, "nodot") # in case it's blank
      if nodot:
        templates_to_replace.append((origt, unicode(t),
          "remove nodot= from {{%s}}, with changed semantics" % tn))
      else:
        newt = "%s." % unicode(t)
        templates_to_replace.append((origt, newt,
          "add explicit final period to {{%s}} when nodot= not specified, due to change in semantics" % tn))

  for curr_template, repl_template, note in templates_to_replace:
    text, replaced = blib.replace_in_text(text, curr_template, repl_template, pagemsg)
    if replaced:
      notes.append(note)
  return text, notes
def process_text_on_page(index, pagetitle, text):
  """Remove categories whose full name matches `args.regex` from templates and raw links.

  Categories are looked for in topic, catlangname and categorize templates
  (names taken from the module-level lists) and in raw
  [[Category:...]]/[[category:...]]/[[CAT:...]] links. A category template
  left with no categories is deleted entirely. Runs of three or more newlines
  are condensed to two.

  Parameters:
    index: page index, used only in log messages.
    pagetitle: title of the page, used only in log messages.
    text: full wikitext of the page.

  Returns:
    None if an unrecognized language code is seen in a catlangname template or
    a removal could not be applied, otherwise `(text, notes)`.
  """
  global args

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  #if ":" in pagetitle and not re.search("^(Appendix|Reconstruction|Citations):", pagetitle):
  #  return
  origtext = text
  notes = []
  removed_cats = []

  def should_remove_cat(cat):
    # Group the user regex before anchoring so that a top-level alternation
    # ("A|B") is anchored as a whole rather than only its last branch.
    return re.match("(?:%s)$" % args.regex, cat.replace("_", " "))

  parsed = blib.parse_text(text)
  text_to_remove = []
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in topics_templates or tn in catlangname_templates or tn in categorize_templates:
      lang = getparam(t, "1").strip()
      cats = []
      for paramno in xrange(2, 30):
        cat = getparam(t, str(paramno)).strip()
        if cat:
          cats.append(cat)
      filtered_cats = []
      for cat in cats:
        # Build the full category name the way each template family does.
        if tn in topics_templates:
          full_cat = "%s:%s" % (lang, cat)
        elif tn in categorize_templates:
          full_cat = cat
        else:
          if lang not in blib.languages_byCode:
            pagemsg("WARNING: Saw unrecognized language code '%s'" % lang)
            return
          full_cat = "%s %s" % (blib.languages_byCode[lang]["canonicalName"], cat)
        if should_remove_cat(full_cat):
          if full_cat not in removed_cats:
            removed_cats.append(full_cat)
        else:
          filtered_cats.append(cat)
      if cats == filtered_cats:
        continue
      # Named params are kept as-is; numbered ones are rebuilt below.
      # (Local names avoid shadowing the project-level pname() helper.)
      non_numbered_params = []
      for param in t.params:
        param_name = unicode(param.name).strip()
        param_value = unicode(param.value).strip()
        showkey = param.showkey
        if not re.search("^[0-9]+$", param_name):
          non_numbered_params.append((param_name, param_value, showkey))
      if filtered_cats:
        origt = unicode(t)
        # Erase all params.
        del t.params[:]
        # Put back new params.
        t.add("1", lang)
        for catind, cat in enumerate(filtered_cats):
          t.add(str(catind + 2), cat)
        for param_name, param_value, showkey in non_numbered_params:
          t.add(param_name, param_value, showkey=showkey, preserve_spacing=False)
        if origt != unicode(t):
          pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      else:
        # Nothing left to categorize: drop the whole template later.
        text_to_remove.append(unicode(t))
  text = unicode(parsed)

  for m in re.finditer(r"\[\[(?:Category|category|CAT):(.*?)\]\]\n?", text):
    # Strip any sort key before matching.
    cat = re.sub(r"\|.*", "", m.group(1))
    if should_remove_cat(cat):
      text_to_remove.append(m.group(0))
      # Record the bare category name (without sort key) so that it de-duplicates
      # against the same category removed elsewhere on the page.
      if cat not in removed_cats:
        removed_cats.append(cat)

  for remove_it in text_to_remove:
    text, did_replace = blib.replace_in_text(text, remove_it, "", pagemsg,
      no_found_repl_check=True)
    if not did_replace:
      return
    pagemsg("Removed %s" % remove_it.replace("\n", r"\n"))

  text = re.sub(r"\n\n+", "\n\n", text)
  if removed_cats:
    notes.append("remove categories: %s" % ",".join(removed_cats))
  if text != origtext and not notes:
    notes.append("condense 3+ newlines")
  return text, notes