def do_canon_param(pagetitle, index, template, fromparam, toparam, paramtr,
    arabic, latin, include_tempname_in_changelog=False):
  actions = []
  tname = unicode(template.name)
  def pagemsg(text):
    msg("Page %s %s: %s.%s: %s" % (index, pagetitle, tname, fromparam, text))
  if show_template:
    pagemsg("Processing %s" % (unicode(template)))

  if include_tempname_in_changelog:
    paramtrname = "%s.%s" % (tname, paramtr)
  else:
    paramtrname = paramtr

  if latin == "-":
    pagemsg("Latin is -, taking no action")
    return False, False, []

  # Compute canonarabic and canonlatin
  match_canon = False
  canonlatin = ""
  if latin:
    try:
      canonarabic, canonlatin = ar_translit.tr_matching(arabic, latin, True,
          msgfun=pagemsg)
      match_canon = True
    except Exception as e:
      pagemsg("NOTE: Unable to vocalize %s (%s): %s: %s" % (
          arabic, latin, e, unicode(template)))
      canonlatin, canonarabic = ar_translit.canonicalize_latin_arabic(
          latin, arabic, msgfun=pagemsg)
  else:
    _, canonarabic = ar_translit.canonicalize_latin_arabic(None, arabic,
        msgfun=pagemsg)

  newlatin = canonlatin == latin and "same" or canonlatin
  newarabic = canonarabic == arabic and "same" or canonarabic
  latintrtext = (latin or canonlatin) and " (%s -> %s)" % (latin, newlatin) or ""

  try:
    translit = ar_translit.tr(canonarabic, msgfun=pagemsg)
    if not translit:
      pagemsg("NOTE: Unable to auto-translit %s (canoned from %s): %s" % (
          canonarabic, arabic, unicode(template)))
  except Exception as e:
    pagemsg("NOTE: Unable to transliterate %s (canoned from %s): %s: %s" % (
        canonarabic, arabic, e, unicode(template)))
    translit = None

  show_diff_string = False
  if canonarabic == arabic:
    pagemsg("No change in Arabic %s%s" % (arabic, latintrtext))
    canonarabic = False
  else:
    if match_canon:
      operation = "Vocalizing"
      actionop = "vocalize"
    elif latin:
      operation = "Cross-canoning"
      actionop = "cross-canon"
      show_diff_string = True
    else:
      operation = "Self-canoning"
      actionop = "self-canon"
      show_diff_string = True
    if show_diff_string:
      diffmsg = " (%s)" % diff_string(arabic, canonarabic)
    else:
      diffmsg = ""
    pagemsg("%s Arabic %s -> %s%s%s: %s" % (operation, arabic, canonarabic,
        latintrtext, diffmsg, unicode(template)))
    if fromparam == toparam:
      actions.append("%s %s=%s -> %s" % (actionop, fromparam, arabic,
          canonarabic))
    else:
      actions.append("%s %s=%s -> %s=%s" % (actionop, fromparam, arabic,
          toparam, canonarabic))
    rdcanonarabic = ar_translit.remove_diacritics(canonarabic)
    rdarabic = ar_translit.remove_diacritics(arabic)
    if rdarabic != rdcanonarabic:
      msgs = []
      if " " in rdarabic or rdarabic.startswith(" ") or rdarabic.endswith(" "):
        msgs.append("stray space")
      if re.search("[A-Za-z]", nfd_form(rdarabic)):
        msgs.append("Latin")
      if u"\u00A0" in rdarabic:
        msgs.append("NBSP")
      if re.search(u"[\u200E\u200F]", rdarabic):
        msgs.append("L2R/R2L")
      if u"ی" in rdarabic:
        msgs.append("Farsi Yeh")
      if u"ک" in rdarabic:
        msgs.append("Keheh")
      if re.search(u"[\uFB50-\uFDCF]", rdarabic):
        msgs.append("Arabic Pres-A")
      if re.search(u"[\uFDF0-\uFDFF]", rdarabic):
        msgs.append("Arabic word ligatures")
      if re.search(u"[\uFE70-\uFEFF]", rdarabic):
        msgs.append("Arabic Pres-B")
      diffmsg = diff_string(rdarabic, rdcanonarabic)
      pagemsg("NOTE: Without diacritics, old Arabic %s different from canon %s%s (%s): %s" % (
          arabic, canonarabic,
          msgs and " (in old: %s)" % ", ".join(msgs) or "",
          diffmsg, unicode(template)))

  if not latin:
    pass
  elif translit and (translit == canonlatin
      # or translit == canonlatin + "un" or
      # translit == u"ʾ" + canonlatin or
      # translit == u"ʾ" + canonlatin + "un"
      ):
    pagemsg("Removing redundant translit for %s -> %s%s" % (arabic, newarabic,
        latintrtext))
    actions.append("remove redundant %s=%s" % (paramtrname, latin))
    canonlatin = True
  else:
    if match_canon:
      operation = "Match-canoning"
      passive = "Match-canoned"
      actionop = "match-canon"
    else:
      operation = "Cross-canoning"
      passive = "Cross-canoned"
      actionop = "cross-canon"
    if translit:
      pagemsg("NOTE: %s Latin %s not same as auto-translit %s, can't remove: %s" % (
          passive, canonlatin, translit, unicode(template)))
    if canonlatin == latin:
      pagemsg("No change in Latin %s: Arabic %s -> %s (auto-translit %s)" % (
          latin, arabic, newarabic, translit))
      canonlatin = False
    else:
      pagemsg("%s Latin %s -> %s: Arabic %s -> %s (auto-translit %s): %s" % (
          operation, latin, canonlatin, arabic, newarabic, translit,
          unicode(template)))
      actions.append("%s %s=%s -> %s" % (actionop, paramtrname, latin,
          canonlatin))

  return (canonarabic, canonlatin, actions)
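
# A minimal usage sketch (not part of the original script): driving
# do_canon_param over the 1=/tr= pair of a single headword template.
# The parameter names "1" and "tr" mirror their use elsewhere in this file;
# the helper name below and the exact calling pattern are assumptions.
def canon_head_param_example(pagetitle, index, template):
  arabic = getparam(template, "1")
  latin = getparam(template, "tr")
  canonarabic, canonlatin, actions = do_canon_param(
      pagetitle, index, template, "1", "1", "tr", arabic, latin,
      include_tempname_in_changelog=True)
  # do_canon_param returns False (or empty) for a value it left alone,
  # True for a Latin translit it judged redundant, and the canonicalized
  # string otherwise.
  if canonarabic:
    addparam(template, "1", canonarabic)
  if canonlatin is True:
    template.remove("tr")
  elif canonlatin:
    addparam(template, "tr", canonlatin)
  return actions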
def create_declension(page, index, save, pos, tempname, decltempname, sgnum,
    removeparams, is_proper=False):
  pagename = page.title()
  comments = []
  def pgmsg(text):
    msg("Page %s %s: %s" % (index, pagename, text))

  # Starts with definite article al-
  def starts_with_al(text):
    return re.match(ALIF_ANY + A + "?" + L, text)

  def sub_if(fr, to, text):
    if re.search(fr, text):
      return re.sub(fr, to, text)
    else:
      return ""

  # Remove definite article al- from text
  def remove_al(text):
    return (sub_if("^" + ALIF_ANY + A + "?" + L + SK + "?(.)" + SH, r"\1", text)
        or sub_if("^" + ALIF_ANY + A + "?" + L + SK + "?", "", text)
        or text)

  # Remove definite article al- from transliterated text
  def remove_al_tr(text):
    return (sub_if(ur"^a?([sšṣtṯṭdḏḍzžẓnrḷ])-\1", r"\1", text)
        or sub_if("^a?l-", "", text)
        or text)

  # Split off interwiki links at end
  m = re.match(r"^(.*?\n+)((\[\[[a-z0-9_\-]+:[^\]]+\]\]\n*)*)$", page.text,
      re.S)
  if m:
    pagebody = m.group(1)
    pagetail = m.group(2)
  else:
    pagebody = page.text
    pagetail = ""

  # Split top-level sections (by language)
  splitsections = re.split("(^==[^=\n]+==\n)", pagebody, 0, re.M)

  # Extract off head and recombine section headers with following text
  pagehead = splitsections[0]
  sections = []
  for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:
      sections.append("")
    sections[-1] += splitsections[i]

  # Look for Arabic section
  for seci in xrange(len(sections)):
    m = re.match("^==([^=\n]+)==$", sections[seci], re.M)
    if not m:
      pgmsg("Can't find language name in text: [[%s]]" % (sections[seci]))
    elif m.group(1) == "Arabic":
      # Extract off trailing separator
      mm = re.match(r"^(.*?\n+)(--+\n*)$", sections[seci], re.S)
      if mm:
        secbody = mm.group(1)
        sectail = mm.group(2)
      else:
        secbody = sections[seci]
        sectail = ""

      # Split into subsections based on headers
      subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)

      # Go through each subsection
      for j in xrange(len(subsections)):
        notes = []
        def add_note(note):
          if note not in notes:
            notes.append(note)

        # Look for subsections matching the given POS
        if j > 0 and (j % 2) == 0 and re.match("^===+%s===+\n" % pos,
            subsections[j - 1]):
          # Call reorder_shadda here so the templates we work with have
          # shadda in correct order but we don't mess with other text to
          # avoid unnecessary saving
          parsed = blib.parse_text(reorder_shadda(subsections[j]))

          def pagemsg(text):
            pgmsg("%s: [[%s]]" % (text, subsections[j]))

          # Check for various conditions causing us to skip this entry and
          # not try to add a declension table

          # Skip declension if certain templates found in definition.
          # We don't check for {{alternative form of|...}}, because it's
          # used for e.g. different ways of spelling "camera" in Arabic,
          # some with -ā and some with -a, so we still want to create
          # declensions for those.
          altspelling_templates = [temp for temp in parsed.filter_templates()
              if temp.name in ["alternative spelling of"]]
          if len(altspelling_templates) > 0:
            pagemsg("Alternative spelling redirect found in text, skipping")
            continue
          if pos == "Adjective":
            feminine_of_templates = [temp for temp in parsed.filter_templates()
                if temp.name in ["feminine of"]]
            if len(feminine_of_templates) > 0:
              pagemsg("feminine-of template found for adjective, skipping")
              continue

          # Retrieve headword_template, make sure exactly one and it is the
          # right type
          headword_templates = [temp for temp in parsed.filter_templates()
              if temp.name in ["ar-noun", "ar-proper noun", "ar-coll-noun",
                "ar-sing-noun", "ar-noun-pl", "ar-noun-dual", "ar-adj-fem",
                "ar-adj-pl", "ar-noun-inf-cons", "ar-adj-inf-def",
                "ar-adj-dual", "ar-adj", "ar-nisba", "ar-noun-nisba",
                "ar-adj-sound", "ar-adj-in", "ar-adj-an"]]
          if len(headword_templates) == 0:
            pagemsg("WARNING: Can't find headword template in text, skipping")
            continue
          if len(headword_templates) > 1:
            pagemsg("WARNING: Found multiple headword templates in text, skipping")
            continue
          headword_template = headword_templates[0]
          if headword_template.name != tempname:
            pagemsg("Headword template should be '%s' but is '%s', skipping" % (
                tempname, headword_template.name))
            continue

          def getp(param):
            return getparam(headword_template, param)
          # NOTE: We physically add and remove parameters from the headword
          # template to get the list of parameters to use in creating the
          # declension template. These changes don't get propagated to the
          # headword template because we don't convert the parsed text back
          # to a string.
          def putp(param, value):
            addparam(headword_template, param, value)

          head = getp("1")
          orighead = head

          # Check for declension already present
          if (j + 1 < len(subsections) and
              re.match("^===+Declension===+\n", subsections[j + 1]) or
              j + 3 < len(subsections) and
              re.match("^===+Usage", subsections[j + 1]) and
              re.match("^===+Declension===+\n", subsections[j + 3])):
            pagemsg("Declension already found for head %s, skipping" % head)
            continue

          # Check for cpl
          # FIXME: Convert cpl into pl and fpl
          if getp("cpl"):
            pagemsg("WARNING: Headword template for head %s has cpl param in it, skipping" % (head))
            continue

          # Check for empty head. If w/o explicit translit, skip; else,
          # fetch head from page title.
          if not head:
            if not getp("tr"):
              pagemsg("WARNING: Headword template head is empty and without explicit translit, skipping")
              continue
            else:
              pagemsg("Headword template head is empty but has explicit translit")
              add_note("empty head, using page name")
              head = pagename
              putp("1", head)

          # Try to handle cases with a modifier; we can't handle all of them
          # yet
          headspace = False
          if ' ' in head:
            headspace = True
            words = re.split(r"\s", remove_links(head))
            head = words[0]
            if len(words) > 2:
              pagemsg("WARNING: Headword template head %s has two or more spaces in it, skipping" % orighead)
              continue
            assert(len(words) == 2)

            # Check for params we don't yet know how to handle
            must_continue = False
            for badparam in ["pl2", "pltr", "head2", "sing", "coll"]:
              if getp(badparam):
                # FIXME
                pagemsg("WARNING: Headword template head %s has space in it and param %s, skipping" % (orighead, badparam))
                must_continue = True
                break
            if must_continue:
              continue

            # Now check for various types of construction, all either
            # construct (ʾidāfa) or adjectival

            def remove_nom_gen_i3rab(word, nomgen, undia, undiatext, udia,
                udiatext):
              if word.endswith(undia):
                pagemsg("Removing %s i3rab (%s) from %s" % (nomgen, undiatext, word))
                add_note("removing %s i3rab (%s)" % (nomgen, undiatext))
                return re.sub(undia + "$", "", word)
              if word.endswith(udia):
                pagemsg("Removing %s i3rab (%s) from %s" % (nomgen, udiatext, word))
                add_note("removing %s i3rab (%s)" % (nomgen, udiatext))
                return re.sub(udia + "$", "", word)
              if re.search(DIACRITIC_ANY_BUT_SH + "$", word):
                pagemsg("WARNING: Strange diacritic at end of %s %s" % (nomgen, word))
              if word[0] == ALIF_WASLA:
                pagemsg("Changing %s alif wasla to plain alif for %s" % (nomgen, word))
                add_note("changing %s alif wasla to plain alif" % (nomgen))
                word = ALIF + word[1:]
              return word

            def remove_gen_i3rab(word):
              return remove_nom_gen_i3rab(word, "genitive", IN, "IN", I, "I")

            def remove_nom_i3rab(word):
              return remove_nom_gen_i3rab(word, "nominative", UN, "UN", U, "U")

            def remove_gen_i3rab_tr(word):
              return remove_nom_gen_i3rab(word, "genitive", "in", "in", "i", "i")

            def remove_nom_i3rab_tr(word):
              return remove_nom_gen_i3rab(word, "nominative", "un", "un", "u", "u")

            idafa = False
            word0al = starts_with_al(words[0])
            word1al = starts_with_al(words[1])
            words[0] = remove_al(words[0])
            words[1] = remove_al(words[1])
            putp("1", words[0])
            putp("mod", words[1])
            if word0al and word1al:
              pagemsg("Headword template head %s has space in it and found definite adjective construction" % (orighead))
              add_note("modifier definite adjective construction")
              putp("state", "def")
            elif word0al and not word1al:
              pagemsg("WARNING: Headword template head %s has space in it and found al-X + Y construction, can't handle, skipping" % (orighead))
              continue
            elif is_proper:
              if words[0].endswith(ALIF) and word1al:
                pagemsg("Proper noun headword template head %s has space in it and found ind-def with definite adjectival modifier" % (orighead))
                add_note("modifier proper noun + definite adjective construction")
                putp("state", "ind-def")
              elif remove_diacritics(words[0]) == u"جمهورية":
                if word1al:
                  pagemsg("Proper noun headword template head %s has space in it and found definite idafa" % (orighead))
                  add_note("modifier definite idafa construction")
                  idafa = True
                  assert sgnum == "sg"
                  idafaval = "def"
                  putp("idafa", idafaval)
                elif words[1].endswith(ALIF):
                  pagemsg("Proper noun headword template head %s has space in it and found idafa with ind-def modifier" % (orighead))
                  add_note("modifier proper-noun ind-def idafa construction")
                  assert sgnum == "sg"
                  idafaval = "ind-def"
                  putp("idafa", idafaval)
                else:
                  pagemsg("WARNING: Proper noun headword template head %s has space in it and found idafa construction we can't handle, skipping" % (orighead))
                  continue
              else:
                pagemsg("WARNING: Proper noun headword template head %s has space in it and can't determine whether idafa, skipping" % (orighead))
                continue
            elif not word0al and word1al:
              # Found an ʾidāfa construction
              pagemsg("Headword template head %s has space in it and found definite idafa" % (orighead))
              add_note("modifier definite idafa construction")
              idafa = True
              idafaval = "def-" + sgnum
              if idafaval == "def-sg":
                idafaval = "def"
              putp("idafa", idafaval)
            elif words[1].endswith(I + Y):
              pagemsg("WARNING: Headword template head %s has space in it and appears to end in badly formatted nisba, FIXME, skipping" % (orighead))
              continue
            elif words[1].endswith(I + Y + SH):
              pagemsg("Headword template head %s has space in it and found indefinite adjective nisba construction" % (orighead))
              add_note("modifier indefinite nisba adjective construction")
            elif pagename in adjectival_phrases:
              pagemsg("Headword template head %s has space in it, indefinite, and manually specified to be adjectival" % (orighead))
              add_note("modifier indefinite adjective construction")
            else:
              pagemsg("Headword template head %s has space in it, indefinite, and not specified to be adjectival, assuming idafa" % (orighead))
              add_note("modifier indefinite idafa construction")
              idafa = True
              putp("idafa", sgnum)

            # Now remove any i3rab diacritics
            putp("1", remove_nom_i3rab(getp("1")))
            if idafa:
              putp("mod", remove_gen_i3rab(getp("mod")))
            else:
              putp("mod", remove_nom_i3rab(getp("mod")))

            # Now check if the lemma is plural
            if re.match(r"\bp\b", getp("2")):
              pagemsg("Headword template head %s has space in it and is plural" % (orighead))
              add_note("plural lemma")
              if getp("tr"):
                # FIXME (doesn't occur though)
                pagemsg("WARNING: Headword template head %s has space in it and manual translit and is plural, skipping" % (orighead))
                continue
              putp("pl", getp("1"))
              putp("1", "-")
              if not idafa:
                putp("modpl", getp("mod"))
                putp("mod", "-")

            # Now check if lemma has plural specified
            elif getp("pl"):
              pls = re.split(r"\s", remove_links(getp("pl")))
              assert(len(pls) == 2)
              pls[0] = remove_al(pls[0])
              pls[1] = remove_al(pls[1])
              putp("pl", remove_nom_i3rab(pls[0]))
              if not idafa:
                putp("modpl", remove_nom_i3rab(pls[1]))
              else:
                if pls[1] != getp("mod"):
                  pagemsg("FIXME: Headword template head %s, plural modifier %s not same as singular modifier %s in idafa construction" % (orighead, pls[1], getp("mod")))

            # Now check if there's manual translit. We need to split the
            # manual translit in two and pair up manual translit with
            # corresponding Arabic words. But first remove -t indicating
            # construct state, and check to see if manual translit is
            # same as auto translit, in which case it's unnecessary.
            if getp("tr"):
              pagemsg("Headword template head %s has space in it and manual translit" % (orighead))
              trwords = re.split(r"\s", getp("tr"))
              assert(len(trwords) == 2)
              trwords[0] = remove_nom_i3rab_tr(remove_al_tr(trwords[0]))
              if idafa:
                trwords[1] = remove_gen_i3rab_tr(remove_al_tr(trwords[1]))
              else:
                trwords[1] = remove_nom_i3rab_tr(remove_al_tr(trwords[1]))
              # Remove any extraneous -t from translit, either from construct
              # state or from removal of i3rab in a feminine noun/adj.
              for i in [0, 1]:
                if words[i].endswith(TAM) and trwords[i].endswith("t"):
                  trwords[i] = trwords[i][0:-1]
                if words[i].endswith(ALIF + TAM) and not trwords[i].endswith("h"):
                  trwords[i] += "h"
              if ar_translit.tr(words[0]) != trwords[0]:
                pagemsg("Headword template head %s has space in it and manual translit %s which is different from auto-translit of %s" % (orighead, trwords[0], words[0]))
                add_note("modified head w/manual translit")
                putp("1", "%s/%s" % (getp("1"), trwords[0]))
              else:
                pagemsg("Headword template head %s has space in it and manual translit %s which is same as auto-translit of %s" % (orighead, trwords[0], words[0]))
                add_note("modified head w/ignored manual translit")
              if ar_translit.tr(words[1]) != trwords[1]:
                pagemsg("Headword template head %s has space in it and manual translit %s which is different from auto-translit of %s" % (orighead, trwords[1], words[1]))
                add_note("modifier w/manual translit")
                putp("mod", "%s/%s" % (getp("mod"), trwords[1]))
              else:
                pagemsg("Headword template head %s has space in it and manual translit %s which is same as auto-translit of %s" % (orighead, trwords[1], words[1]))
                add_note("modifier w/ignored manual translit")

          else:
            # no space in head, not dealing with a modifier

            # If has link in it, just remove it
            if '[' in head or ']' in head or '|' in head:
              pagemsg("Headword template head %s has link in it" % (head))
              add_note("removed links from head")
              head = remove_links(head)
              putp("1", head)

            # If starts with definite article, remove article from everything,
            # including transliterations, and set state=def
            if starts_with_al(head):
              pagemsg("Headword template head %s starts with definite article" % (head))
              add_note("definite lemma")
              head = remove_al(head)
              putp("1", head)
              putp("state", "def")

              # Also remove al- from remaining head and pl params
              def check_for_al(param):
                param = remove_links(param)
                value = getparam(headword_template, param)
                if value:
                  if '[' in value or ']' in value or '|' in value:
                    pagemsg("Param %s value %s has link in it" % (param, value))
                    add_note("removed links from %s" % param)
                    value = remove_links(value)
                  putp(param, remove_al(value))
              params_to_check = ["pl", "sing", "coll", "pauc", "f", "fpl"]
              for param in params_to_check:
                check_for_al(param)
              for i in xrange(2, 10):
                check_for_al("head%s" % i)
                for param in params_to_check:
                  check_for_al("%s%s" % (param, i))

              # Also remove al- from transliteration
              def check_for_al_tr(param):
                value = getparam(headword_template, param)
                if value:
                  putp(param, remove_al_tr(value))
              check_for_al("tr")
              for param in params_to_check:
                check_for_al("%str" % param)
              for i in xrange(2, 10):
                check_for_al("tr%s" % i)
                for param in params_to_check:
                  check_for_al("%s%str" % (param, i))
            elif is_proper:
              if head.endswith(ALIF):
                pagemsg(u"Headword template head %s ends in -ā" % (head))
                putp("state", "ind-def")
              else:
                pagemsg(u"WARNING: Headword template head %s is indefinite proper noun, not ending in -ā, skipping" % (head))
                continue

            if head.endswith(UN):
              pagemsg("Headword template head %s ends with explicit i3rab (UN)" % (head))
              add_note("head has explicit i3rab (UN)")
              # We don't continue here because we handle this case below
            elif head.endswith(U):
              pagemsg("Headword template head %s ends with explicit i3rab (U)" % (head))
              add_note("head has explicit i3rab (U)")
              # We don't continue here because we don't need to handle this
              # case

            # Now check if the lemma is plural
            if re.match(r"\bp\b", getp("2")):
              pagemsg("Headword template head %s is plural" % (head))
              add_note("plural lemma")
              if getp("tr"):
                # FIXME (doesn't occur though)
                pagemsg("WARNING: Headword template head %s has manual translit and is plural, skipping" % (head))
                continue
              putp("pl", getp("1"))
              putp("1", "-")

          # Now fetch the parameters from the headword template, removing
          # any that we want to remove, removing the i3rab -UN ending, and
          # adding any specified manual translit as a / annotation.

          def param_should_be_removed(param):
            name = unicode(param.name)
            if name == "sc" and unicode(param.value) == "Arab":
              return True
            if name.endswith("tr"):
              return True
            for remove in removeparams:
              if name == remove:
                return True
              if re.match("^[a-z]+$", remove) and re.match("^%s([0-9]+)?$" % remove, name):
                return True
            return False

          def remove_i3rab(param):
            text = unicode(param)
            if text.endswith(UN):
              pgmsg("Removing i3rab from %s: %s" % (text, unicode(headword_template)))
              add_note("removing i3rab")
            return re.sub(UN + "$", "", text)

          def trparam(name):
            if name == "1":
              return "tr"
            elif name.startswith("head"):
              return name.replace("head", "tr")
            else:
              return name + "tr"

          def process_param(param):
            arabic = remove_i3rab(param)
            # Value of + is used in ar-nisba, ar-noun-nisba, ar-adj-in
            # to signal the strong plural.
            if arabic.endswith("=+"):
              newarabic = re.sub(r"=\+$", "=sp", arabic)
              pgmsg("Converting %s to %s: %s" % (arabic, newarabic, unicode(headword_template)))
              arabic = newarabic
            # Value of - is used in ar-adj-in to signal an unknown
            # feminine plural.
            if arabic.endswith("=-"):
              newarabic = re.sub(r"=-$", "=?", arabic)
              pgmsg("Converting %s to %s: %s" % (arabic, newarabic, unicode(headword_template)))
              arabic = newarabic
            # Don't process translit in modifier constructions, where the
            # translit is also processed.
            if not headspace:
              tr = getparam(headword_template, trparam(unicode(param.name)))
              if tr:
                return arabic + "/" + tr
            return arabic

          params = '|'.join([process_param(param)
              for param in headword_template.params
              if not param_should_be_removed(param)])

          # For templates that automatically supply the masculine plural,
          # supply it here, too if not overridden.
          if (tempname in ["ar-nisba", "ar-noun-nisba", "ar-adj-sound",
              "ar-adj-an"] and not getp("pl")):
            params += '|pl=sp'

          # Separate off any [[Category: Foo]] declarators, insert before them
          m = re.match(r"^(.*?\n+)((\[\[[A-Za-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
              subsections[j], re.S)
          if m:
            body = m.group(1)
            tail = m.group(2)
          else:
            body = subsections[j]
            tail = ""
          # Make sure there are two trailing newlines
          if body.endswith("\n\n"):
            pass
          elif body.endswith("\n"):
            body += "\n"
          else:
            body += "\n\n"
          body += (subsections[j - 1].replace(pos, "=Declension=") +
              "{{%s|%s}}\n\n" % (decltempname, params))
          subsections[j] = body + tail

          comment = "added declension for %s %s" % (tempname,
              remove_links(orighead) or "%s/%s" % (pagename, getp("tr")))
          note = ', '.join(notes)
          if note:
            comment = "%s (%s)" % (comment, note)
          comments.append(comment)
      sections[seci] = ''.join(subsections) + sectail

  newtext = pagehead + ''.join(sections) + pagetail
  comment = '; '.join(comments)
  assert((not comment) == (newtext == page.text))
  if newtext != page.text:
    if verbose:
      msg("Replacing [[%s]] with [[%s]]" % (page.text, newtext))
    page.text = newtext
    msg("For page %s, comment = %s" % (pagename, comment))
    if save:
      page.save(comment=comment)
def process_param(pagetitle, index, template, param, paramtr,
    include_tempname_in_changelog=False):
  def pagemsg(text):
    msg("Page %s %s: %s.%s: %s" % (index, pagetitle, template.name, param, text))
  arabic = getparam(template, param)
  latin = getparam(template, paramtr)
  if include_tempname_in_changelog:
    paramtrname = "%s.%s" % (template.name, paramtr)
  else:
    paramtrname = paramtr
  if not arabic:
    return False
  if latin == "-":
    pagemsg("Translit is '-', skipping")
    return True
  if latin:
    try:
      _, canonlatin = tr_matching(arabic, latin, True, pagemsg)
      if not canonlatin:
        pagemsg("Unable to match-canonicalize %s (%s)" % (arabic, latin))
    except Exception as e:
      pagemsg("Trying to match-canonicalize %s (%s): %s" % (arabic, latin, e))
      canonlatin = None
    try:
      translit = ar_translit.tr(arabic)
      if not translit:
        pagemsg("Unable to auto-translit %s" % arabic)
    except Exception as e:
      pagemsg("Trying to transliterate %s: %s" % (arabic, e))
      translit = None
    if translit and canonlatin:
      if translit == canonlatin:
      #if (translit == canonlatin or
      #    translit == canonlatin + "un" or
      #    translit == u"ʾ" + canonlatin or
      #    translit == u"ʾ" + canonlatin + "un"):
        pagemsg("Removing redundant translit for %s (%s)" % (arabic, latin))
        oldtempl = "%s" % unicode(template)
        template.remove(paramtr)
        msg("Page %s %s: Replaced %s with %s" % (index, pagetitle, oldtempl,
            unicode(template)))
        return ["remove redundant %s=%s" % (paramtrname, latin)]
      else:
        pagemsg("Auto-translit for %s (%s) not same as manual translit %s (canonicalized %s)" % (
            arabic, translit, latin, canonlatin))
    if canonlatin:
      if latin != canonlatin:
        pagemsg("Match-canonicalizing Latin %s to %s" % (latin, canonlatin))
        oldtempl = "%s" % unicode(template)
        addparam(template, paramtr, canonlatin)
        msg("Page %s %s: Replaced %s with %s" % (index, pagetitle, oldtempl,
            unicode(template)))
        return ["match-canon %s=%s -> %s" % (paramtrname, latin, canonlatin)]
      return True
    canonlatin, _ = ar_translit.canonicalize_latin_arabic(latin, None)
    if latin != canonlatin:
      pagemsg("Self-canonicalizing Latin %s to %s" % (latin, canonlatin))
      oldtempl = "%s" % unicode(template)
      addparam(template, paramtr, canonlatin)
      msg("Page %s %s: Replaced %s with %s" % (index, pagetitle, oldtempl,
          unicode(template)))
      return ["self-canon %s=%s -> %s" % (paramtrname, latin, canonlatin)]
  return True
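
# A minimal driver sketch (an assumption, not code from the original script):
# run process_param over every Arabic headword template on a page body and
# collect the changelog actions it reports. blib.parse_text and the parameter
# names mirror usage elsewhere in this file; the wrapper name and the
# template list below are illustrative only.
def clean_page_translits_example(pagetitle, index, pagetext):
  actions = []
  parsed = blib.parse_text(pagetext)
  for template in parsed.filter_templates():
    if template.name in ["ar-noun", "ar-proper noun", "ar-adj"]:
      result = process_param(pagetitle, index, template, "1", "tr",
          include_tempname_in_changelog=True)
      # process_param returns a list of changelog strings when it modified
      # the template, and True/False otherwise.
      if isinstance(result, list):
        actions.extend(result)
  return unicode(parsed), actions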
def __init__(self, eastarabnum, english, nom, femnom=None, obl=None,
    femobl=None, hundred=None, thousand=None, ord=None, adv=None, frac=None,
    dist=None, mult=None, numadj=None):
  nom = reorder_shadda(nom)
  if not femnom:
    if nom.endswith(AH):
      femnom = re.sub(AH + "$", "", nom)
    else:
      femnom = nom
  if not obl:
    if nom.endswith(UUN):
      obl = re.sub(UUN + "$", IIN, nom)
    else:
      obl = nom
  femnom = reorder_shadda(femnom)
  if not femobl:
    if femnom.endswith(UUN):
      femobl = re.sub(UUN + "$", IIN, femnom)
    else:
      femobl = femnom
  self.eastarabnum = eastarabnum
  self.english = english
  self.nom = nom
  self.nomtr = ar_translit.tr(nom)
  self.femnom = femnom
  self.femnomtr = ar_translit.tr(femnom)
  self.obl = obl
  self.obltr = ar_translit.tr(obl)
  self.femobl = femobl
  self.femobltr = ar_translit.tr(femobl)
  self.hundred = hundred
  self.thousand = thousand
  self.thousandtr = None
  if self.thousand:
    self.thousandtr = ar_translit.tr(self.thousand)
  self.ordroot = None
  self.ordeng = None
  self.ordgloss = None
  self.cardteeneng = None
  self.ordlemma = None
  self.cardteen = None
  self.adv = adv
  self.frac = frac
  self.dist = dist
  self.mult = mult
  self.numadj = numadj
  if ord:
    if len(ord) == 6:
      (self.ordroot, self.ordeng, self.ordgloss, self.cardteeneng,
          self.ordlemma, self.cardteen) = ord
    else:
      self.ordroot, self.ordeng, self.cardteeneng = ord
      self.ordgloss = self.ordeng
      self.ordlemma = (self.ordroot[0] + AA + self.ordroot[1] + I +
          self.ordroot[2])
      self.cardteen = self.nom + A + u" عَشَرَ"
    self.ordlemmatr = ar_translit.tr(self.ordlemma)
    R1 = self.ordroot[0]
    R2 = self.ordroot[1]
    R3 = self.ordroot[2]
    self.femordlemma = R1 + AA + R2 + I + R3 + AH
    self.femordlemmatr = ar_translit.tr(self.femordlemma)
    self.ordteen = R1 + AA + R2 + I + R3 + A + u" عَشَرَ"
    self.femordteen = self.femordlemma + A + u" عَشْرَةَ"
    self.ordteeneng = ("twelfth" if self.cardteeneng == "twelve" else
        "twentieth" if self.cardteeneng == "twenty" else
        self.cardteeneng + "th")
    # one and two are totally special-cased
    if self.english != "one" and self.english != "two":
      self.frac = R1 + U + R2 + SK + R3
      self.adv = self.femnom + u" مَرَّات"
      self.mult = u"مُ" + R1 + A + R2 + SH + A + R3
      self.numadj = R1 + U + R2 + AA + R3 + IY + SH
    if not self.hundred:
      self.hundred = self.femnom + U + u"مِائَة"
    if not self.thousand:
      self.thousand = self.nom + " " + self.thousandpl
      self.thousandtr = self.nomtr + "t " + self.thousandpltr
    self.hundredtr = ar_translit.tr(self.hundred.replace(u"مِا", u"مِ"))